module RelatonIso::Scrapper
Constants
- DOMAIN
- PUBLISHERS
- STGABBR
- TYPES
Public Class Methods
Parse page. @param hit_data [Hash] @param lang [String, NilClass] @return [Hash]
# File lib/relaton_iso/scrapper.rb, line 56
#
# Download the catalogue page referenced by a search hit and build a
# bibliographic item from the data scraped off it.
#
# @param hit_data [Hash] search hit; its +:path+ and +:title+ entries are read
# @param lang [String, NilClass] forwarded to fetch_titles_abstract
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse_page(hit_data, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  # path = "/contents/data/standard#{hit_data['splitPath']}/"\
  # "#{hit_data['csnumber']}.html"
  page_path = hit_data[:path].sub("/sites/isoorg", "") + ".html"
  doc, url = get_page(page_path)

  # Fetch edition: the first run of digits in the last child of the element
  # that wraps the "Edition" label.
  edition_node = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.children&.last
  edition = edition_node&.text&.match(/\d+/)&.to_s

  titles, abstract, langs = fetch_titles_abstract(doc, lang)
  hit_title = hit_data[:title]

  RelatonIsoBib::IsoBibliographicItem.new(
    fetched: Date.today.to_s,
    docid: fetch_docid(doc, edition, langs),
    docnumber: fetch_docnumber(doc),
    edition: edition,
    language: langs.map { |entry| entry[:lang] },
    script: langs.map { |entry| script(entry[:lang]) }.uniq,
    title: titles,
    doctype: fetch_type(hit_title),
    docstatus: fetch_status(doc),
    ics: fetch_ics(doc),
    date: fetch_dates(doc, hit_title),
    contributor: fetch_contributors(hit_title),
    editorialgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(doc),
    link: fetch_link(doc, url),
    relation: fetch_relations(doc),
    place: ["Geneva"],
    structuredidentifier: fetch_structuredidentifier(doc),
  )
end
Private Class Methods
# File lib/relaton_iso/scrapper.rb, line 375
#
# Build publisher contributor entries from the organisation abbreviations
# at the start of a reference (everything before the first whitespace,
# split on "/"). Abbreviations missing from PUBLISHERS are skipped.
#
# @param ref [String] document reference, e.g. "ISO/IEC 1234:2014"
# @return [Array<Hash>] one +{ entity:, role: }+ hash per known publisher
def fetch_contributors(ref)
  ref.sub(/\s.*/, "").split("/").each_with_object([]) do |abbrev, contributors|
    publisher = PUBLISHERS[abbrev]
    next unless publisher

    # Merge into a copy: writing the abbreviation into the PUBLISHERS entry
    # itself would mutate shared constant data (and fail if it is frozen).
    entity = publisher.merge(abbreviation: abbrev)
    contributors << { entity: entity, role: [type: "publisher"] }
  end
end
Fetch copyright. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
# File lib/relaton_iso/scrapper.rb, line 416
#
# Fetch copyright.
#
# The owner name is the part of the reference before its first whitespace.
# The year is the four digits following ":" in the reference, or failing
# that the first four-digit run in the page's releaseDate element.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] a single +{ owner: [{ name: }], from: }+ entry
def fetch_copyright(doc)
  # item_ref returns nil when the heading is absent; treat that as an empty
  # reference instead of raising NoMethodError.
  ref = item_ref(doc).to_s
  owner_name = ref.match(/.*?(?=\s)/).to_s
  from = ref.match(/(?<=:)\d{4}/).to_s
  if from.empty?
    from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
  end
  [{ owner: [{ name: owner_name }], from: from }]
end
rubocop:disable Metrics/MethodLength Fetch dates @param doc [Nokogiri::HTML::Document] @param ref [String] @return [Array<Hash>]
# File lib/relaton_iso/scrapper.rb, line 352
#
# Fetch dates.
#
# Combines the year embedded in the reference (if any) with the text of the
# page's releaseDate element. When both are present and the page year is
# later, the reference year is reported as "published" and the page date as
# "updated"; otherwise a single "published" entry is returned.
#
# @param doc [Nokogiri::HTML::Document]
# @param ref [String]
# @return [Array<Hash>]
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
  ref_match = %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})}.match(ref)
  released = doc.xpath("//span[@itemprop='releaseDate']").text
  ref_year = ref_match && ref_match[:ref_date_str]

  # No year in the reference: rely on the page date alone.
  unless ref_year
    return released.empty? ? [] : [{ type: "published", on: released }]
  end

  ref_date = Date.strptime(ref_year, "%Y")
  return [{ type: "published", on: ref_year }] if released.empty?

  if Date.strptime(released, "%Y").year > ref_date.year
    [{ type: "published", on: ref_year }, { type: "updated", on: released }]
  else
    [{ type: "published", on: released }]
  end
end
Fetch docid. @param doc [Nokogiri::HTML::Document] @param edition [String] @param langs [Array<Hash>] @return [Array<RelatonBib::DocumentIdentifier>]
# File lib/relaton_iso/scrapper.rb, line 176
#
# Fetch docid: the page's reference as an "ISO" identifier followed by the
# URN built by fetch_urn as a "URN" identifier.
#
# @param doc [Nokogiri::HTML::Document]
# @param edition [String]
# @param langs [Array<Hash>]
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc, edition, langs)
  pubid = item_ref(doc)
  iso_id = RelatonBib::DocumentIdentifier.new(id: pubid, type: "ISO")
  urn = fetch_urn(doc, pubid, edition, langs)
  urn_id = RelatonBib::DocumentIdentifier.new(id: urn, type: "URN")
  [iso_id, urn_id]
end
# File lib/relaton_iso/scrapper.rb, line 209
#
# Fetch the document number: the first run of digits in the page reference.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, NilClass] nil when there is no reference or no digits
def fetch_docnumber(doc)
  ref = item_ref(doc)
  return nil if ref.nil?

  digits = ref.match(/\d+/)
  digits.nil? ? nil : digits.to_s
end
Fetch ICS. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
# File lib/relaton_iso/scrapper.rb, line 389
#
# Fetch ICS. Each matching link's text supplies a dotted code whose first
# three components become field, group and subgroup (nil when absent).
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
def fetch_ics(doc)
  anchors = doc.xpath(
    "//strong[contains(text(), 'ICS')]/../following-sibling::dd/div/a",
  )
  anchors.map do |anchor|
    field, group, subgroup = anchor.text.match(/[\d.]+/).to_s.split(".")
    { field: field, group: group, subgroup: subgroup }
  end
end
Fetch links. @param doc [Nokogiri::HTML::Document] @param url [String] @return [Array<Hash>]
# File lib/relaton_iso/scrapper.rb, line 401
#
# Fetch links. Always yields a "src" link for the page URL, followed by
# "obp", "rss" and "pub" links for whichever of those anchors the page has.
#
# @param doc [Nokogiri::HTML::Document]
# @param url [String]
# @return [Array<Hash>]
def fetch_link(doc, url)
  found = [{ type: "src", content: url }]

  preview = doc.at_css("a#obp-preview")
  found << { type: "obp", content: preview[:href] } unless preview.nil? || preview == false

  feed = doc.at("//a[contains(@href, 'rss')]")
  # The feed href is appended to DOMAIN as-is.
  found << { type: "rss", content: DOMAIN + feed[:href] } unless feed.nil? || feed == false

  public_copy = doc.at(
    "//p[contains(., 'publicly available')]/a",
    "//p[contains(., 'can be downloaded from the')]/a",
  )
  found << { type: "pub", content: public_copy[:href] } unless public_copy.nil? || public_copy == false

  found
end
Fetch relations. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
# File lib/relaton_iso/scrapper.rb, line 277
#
# Fetch relations. Every life-cycle step on the page contributes one
# relation per link it contains; the step heading decides the relation
# type. Steps headed "Now" or "Now under review" contribute nothing.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] +{ type:, bibitem: }+ hashes
def fetch_relations(doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
  steps = doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']")
  steps.flat_map do |step|
    heading = step.at("h4", "h5").text
    dates = []
    relation =
      if ["Previously", "Will be replaced by"].include?(heading)
        "obsoletes"
      elsif ["Corrigenda / Amendments", "Revised by", "Now confirmed"].include?(heading)
        # Attach the last stage date that contains "-" as the circulation date.
        stage_date = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
        dates << { type: "circulated", on: stage_date.text } if stage_date
        "updates"
      else
        heading
      end
    next [] if ["Now", "Now under review"].include?(relation)

    step.css("a").map do |anchor|
      fref = RelatonBib::FormattedRef.new(
        content: anchor.text, format: "text/plain",
      )
      bibitem = RelatonIsoBib::IsoBibliographicItem.new(
        formattedref: fref, date: dates,
      )
      { type: relation, bibitem: bibitem }
    end
  end
end
Fetch status. @param doc [Nokogiri::HTML::Document] @param status [String] @return [Hash]
# File lib/relaton_iso/scrapper.rb, line 236
#
# Fetch status. Splits the page's stage code on "." into stage and substage.
#
# @param doc [Nokogiri::HTML::Document]
# @return [RelatonBib::DocumentStatus]
def fetch_status(doc)
  code_parts = stage_code(doc).split(".")
  RelatonBib::DocumentStatus.new(
    stage: code_parts[0],
    substage: code_parts[1],
  )
end
@param doc [Nokogiri::HTML::Document]
# File lib/relaton_iso/scrapper.rb, line 214
#
# Build the structured identifier from the page reference. The project
# number is the reference up to and including its first run of digits; the
# part is the digits following a "-" right after it (empty when absent).
# Without a reference, a placeholder identifier with "?" values is returned.
#
# NOTE(review): the two branches pass different keyword sets
# (part_number/prefix/id vs part/type) — confirm against StructuredIdentifier.
#
# @param doc [Nokogiri::HTML::Document]
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc) # rubocop:disable Metrics/MethodLength
  ref = item_ref(doc)
  if ref
    parsed = ref.match(/^(.*?\d+)-?((?<=-)\d+|)/)
    RelatonIsoBib::StructuredIdentifier.new(
      project_number: parsed[1], part: parsed[2], type: "ISO",
    )
  else
    RelatonIsoBib::StructuredIdentifier.new(
      project_number: "?", part_number: "", prefix: nil, id: "?",
    )
  end
end
Fetch titles. @param doc [Nokogiri::HTML::Document] @param lang [String] @return [Array<RelatonBib::TypedTitleString>]
# File lib/relaton_iso/scrapper.rb, line 327
#
# Fetch titles. Reads the h2/h3 heading of the condensed nav block, replaces
# em dashes (U+2014) with "-" and hands the text to
# RelatonBib::TypedTitleString.from_string. An empty collection is returned
# when the heading is missing.
#
# @param doc [Nokogiri::HTML::Document]
# @param lang [String]
# @return [Array<RelatonBib::TypedTitleString>]
def fetch_title(doc, lang)
  heading = doc.at(
    "//nav[contains(@class,'heading-condensed')]/h2 | " \
    "//nav[contains(@class,'heading-condensed')]/h3",
  )
  raw = heading&.text
  content = raw&.gsub(/\u2014/, "-")

  if content
    RelatonBib::TypedTitleString.from_string(content, lang, script(lang))
  else
    RelatonBib::TypedTitleStringCollection.new
  end
end
Fetch titles and abstracts. @param doc [Nokogiri::HTML::Document] @param lang [String, NilClass] @return [Array<Array>]
# File lib/relaton_iso/scrapper.rb, line 96
#
# Fetch titles and abstracts for every available language. A language whose
# page carries the "not available" help block is skipped entirely.
#
# @param doc [Nokogiri::HTML::Document]
# @param lang [String, NilClass]
# @return [Array<Array>] +[titles, abstract, langs]+
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  titles = RelatonBib::TypedTitleStringCollection.new
  abstract = []

  langs = languages(doc, lang).select do |entry|
    # Entries without a path reuse the page we already have.
    page = entry[:path] ? get_page(entry[:path])[0] : doc
    next false if page.at("//h5[@class='help-block'][.='недоступно на русском языке']")

    code = entry[:lang]
    titles += fetch_title(page, code)

    # Fetch abstracts: paragraphs verbatim, list items prefixed with "- ".
    pieces = page.xpath(
      "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
    ).map { |node| node.name == "li" ? "- #{node.text}" : node.text }
    text = pieces.reject(&:empty?).join("\n")
    unless text.empty?
      abstract << {
        content: text, language: code, script: script(code), format: "text/plain",
      }
    end
    true
  end

  [titles, abstract, langs]
end
Fetch type. @param ref [String] @return [String]
# File lib/relaton_iso/scrapper.rb, line 308
#
# Fetch type. Looks up the type abbreviation found in the reference in
# TYPES; when that yields nothing, falls back on the publisher prefix
# (ISO or IWA). Returns nil when neither applies.
#
# @param ref [String]
# @return [String, NilClass]
def fetch_type(ref)
  parsed = %r{
    ^(?<prefix>ISO|IWA|IEC)
    (?:(/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
    (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
  }x.match(ref)
  prefix = parsed && parsed[:prefix]
  type = parsed && parsed[:type]
  # return "international-standard" if type_match.nil?
  by_prefix = {
    "ISO" => "international-standard",
    "IWA" => "international-workshop-agreement",
  }
  TYPES[type] || by_prefix[prefix]
  # rescue => _e
  #   puts 'Unknown document type: ' + title
end
@param doc [Nokogiri::HTML::Document] @param pubid [String] @param edition [String] @param langs [Array<Hash>] @return [String]
# File lib/relaton_iso/scrapper.rb, line 191
#
# Assemble the URN for a document as ":"-separated segments: publisher,
# optional type, document number, optional part, stage, optional edition,
# optional corrigendum/amendment, and the comma-joined language codes.
#
# NOTE(review): the type pattern is not anchored (its lookbehind is empty),
# so it matches the first listed abbreviation anywhere in pubid — confirm
# this is intended.
#
# @param doc [Nokogiri::HTML::Document]
# @param pubid [String]
# @param edition [String]
# @param langs [Array<Hash>]
# @return [String]
def fetch_urn(doc, pubid, edition, langs) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
  publisher = pubid.split.first.downcase.split("/").join("-")
  type_match = %r{(?<=)(?<type>DATA|GUIDE|ISP|IWA|PAS|R|TR|TS|TTA)}.match(pubid)
  _, part, _year, corr, = IsoBibliography.ref_components pubid

  segments = ["urn:iso:std:#{publisher}"]
  segments << type_match[:type].downcase if type_match
  segments << fetch_docnumber(doc).to_s
  segments << "-#{part}" if part
  segments << "stage-#{stage_code(doc)}"
  segments << "ed-#{edition}" if edition
  if corr
    # First word (lower-cased) and last word of the supplement designation.
    corr_words = corr.split
    segments << corr_words[0].downcase << corr_words[-1].to_s
  end
  segments << langs.map { |entry| entry[:lang] }.join(",")
  segments.join(":")
end
Fetch workgroup. @param doc [Nokogiri::HTML::Document] @return [Hash]
# File lib/relaton_iso/scrapper.rb, line 254
#
# Fetch workgroup. The committee link text is split on "/"; its second
# component supplies the committee type (leading capitals, "TC" when none)
# and number (first run of digits, nil when none).
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength
  wg_link = doc.css("div.entry-name.entry-block a")[0]
  # wg_url = DOMAIN + wg_link['href']
  committee = wg_link.text.split("/")[1]
  type = (committee && committee[/^[A-Z]+/]) || "TC"
  digits = committee && committee[/\d+/]

  technical_committee = {
    name: doc.css("div.entry-title")[0].text,
    identifier: wg_link.text,
    type: type,
    number: digits && digits.to_i,
  }
  {
    name: "International Organization for Standardization",
    abbreviation: "ISO",
    url: "www.iso.org",
    technical_committee: [technical_committee],
  }
end
rubocop:disable Metrics/AbcSize, Metrics/MethodLength Get page. @param path [String] page's path @return [Array<Nokogiri::HTML::Document, String>]
# File lib/relaton_iso/scrapper.rb, line 145
#
# Get page. Follows one 301 redirect, raises on 404, and re-requests the
# page up to ten times while the body contains no "<strong" markup.
#
# @param path [String] page's path, appended to DOMAIN
# @return [Array<Nokogiri::HTML::Document, String>] parsed page and its URL
# @raise [RelatonBib::RequestError] on 404 or on a network-level failure
def get_page(path)
  url = DOMAIN + path
  uri = URI(url)
  resp = Net::HTTP.get_response(uri) # .encode("UTF-8")

  status = resp.code
  if status == "301"
    # The Location header is appended to DOMAIN, like the original path.
    url = DOMAIN + resp["location"]
    uri = URI(url)
    resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
  elsif status == "404"
    raise RelatonBib::RequestError, "#{url} not found."
  end

  10.times do
    break unless resp.body !~ /<strong/

    resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
  end

  [Nokogiri::HTML(resp.body), url]
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, Errno::ETIMEDOUT
  raise RelatonBib::RequestError, "Could not access #{url}"
end
# File lib/relaton_iso/scrapper.rb, line 228
#
# Text of the h1 heading inside the condensed nav block.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, NilClass] nil when the heading is not found
def item_ref(doc)
  heading = doc.at("//nav[contains(@class, 'heading-condensed')]/h1")
  heading.text unless heading.nil?
end
Returns available languages. @param doc [Nokogiri::HTML::Document] @param lang [String, NilClass] @return [Array<Hash>]
# File lib/relaton_iso/scrapper.rb, line 131
#
# Returns available languages. English is always listed first; French is
# added when the language switcher links to a "/fr/" path and +lang+ is
# either unset or "fr".
#
# @param doc [Nokogiri::HTML::Document]
# @param lang [String, NilClass]
# @return [Array<Hash>] +{ lang: }+ entries, with +:path+ for non-English
def languages(doc, lang)
  switcher = doc.css("li#lang-switcher ul li a")
  extras = switcher.map do |anchor|
    href = anchor.attr("href")
    code = href.match(%r{^/(fr)/})
    next unless code && (!lang || code[1] == lang)

    { lang: code[1], path: href }
  end
  [{ lang: "en" }] + extras.compact
end
Return ISO script code. @param lang [String] @return [String]
# File lib/relaton_iso/scrapper.rb, line 340
#
# Return ISO script code.
#
# @param lang [String]
# @return [String, NilClass] "Latn" for "en" and "fr", otherwise nil
def script(lang)
  # when "ru" then "Cyrl"
  "Latn" if %w[en fr].include?(lang)
end
# File lib/relaton_iso/scrapper.rb, line 241
#
# Text of the stage-code span inside the active entry of the dropdown menu.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String]
def stage_code(doc)
  active_stage = doc.at(
    "//ul[@class='dropdown-menu']/li[@class='active']/a/span[@class='stage-code']",
  )
  active_stage.text
end