class RelatonCie::DataFetcher
Constants
- URL
Public Class Methods
fetch(output: "data", format: "yaml")
click to toggle source
# File lib/relaton_cie/data_fetcher.rb, line 180
#
# Run a complete crawl starting at URL and report timing on stdout.
#
# @param output [String] directory the documents are written to; it is
#   created (including missing parent directories) when absent
# @param format [String] output format handed to the instance ("xml"
#   selects XML, anything else is serialised as YAML by #write_file)
def self.fetch(output: "data", format: "yaml")
  t1 = Time.now
  puts "Started at: #{t1}"
  # mkdir_p is a no-op for an existing directory and also creates missing
  # parents, so nested paths such as "data/cie" no longer raise ENOENT.
  FileUtils.mkdir_p output
  new(output, format).fetch URL
  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
end
new(output, format)
click to toggle source
# File lib/relaton_cie/data_fetcher.rb, line 12
#
# @param output [String] directory that receives the generated files
# @param format [String] serialisation format used by #write_file
def initialize(output, format)
  @output, @format = output, format
  # One Mechanize agent is shared by every request of this fetcher.
  @agent = Mechanize.new
end
Public Instance Methods
fetch(url)
click to toggle source
# File lib/relaton_cie/data_fetcher.rb, line 165
#
# Walk the paginated listing that starts at +url+: every
# <li data-product> on a page is handed to #parse_page, then the
# "next_page" link is followed until a page has none.
#
# @param url [String] address of the first listing page
# @return [nil]
def fetch(url)
  page_url = url
  loop do
    page = time_req { @agent.get page_url }
    page.xpath("//li[@data-product]").each { |hit| parse_page hit }
    next_link = page.at '//a[@class="next_page"]'
    break unless next_link

    page_url = "https://www.techstreet.com#{next_link[:href]}"
  end
end
fetch_abstract(doc)
click to toggle source
@param doc [Mechanize::Page] @return [Array<RelatonBib::FormattedString>]
# File lib/relaton_cie/data_fetcher.rb, line 95
#
# Take the text of the first <div> whose class contains "description"
# and wrap it as an English abstract.
#
# @param doc [Mechanize::Page]
# @return [Array<RelatonBib::FormattedString>] empty when the element is
#   missing or holds only whitespace
def fetch_abstract(doc)
  description = doc.at('//div[contains(@class,"description")]')
  text = description&.text&.strip
  return [] unless text && !text.empty?

  abstract = RelatonBib::FormattedString.new(content: text, language: "en", script: "Latn")
  [abstract]
end
fetch_contributor(doc)
click to toggle source
@param doc [Mechanize::Page] @return [Array<Hash>]
# File lib/relaton_cie/data_fetcher.rb, line 105
#
# Parse the author line(s) found under <hgroup> (every <p> except the
# "pub_date" one) into person contributors and append CIE as publisher.
#
# The author text is consumed from the front: each pass matches one name,
# removes the matched text and repeats until nothing is left.
#
# @param doc [Mechanize::Page]
# @return [Array<Hash>] author entries followed by the publisher entry
# @raise [StandardError] when the remaining text cannot be parsed as a name
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
  name_pattern = /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
    (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
    (?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
    (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
    (?:(?:,\s?|\s|\.|(?<=\s))(?:and\s)?)?/x

  remaining = doc.xpath('//hgroup/p[not(@class="pub_date")]').text
  contribs = []
  until remaining.empty?
    found = name_pattern.match(remaining)
    raise StandardError, "Author name not found in \"#{remaining}\"" unless found

    # Read the captures before the matched text is cut out of the string.
    family = [found[:sname1], found[:sname2]].compact.join " "
    given = found[:fname]
    raw_initials = found[:init]
    remaining.sub! found.to_s, ""

    surname = RelatonBib::LocalizedString.new family, "en", "Latn"
    initial = (raw_initials&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |letter|
      RelatonBib::LocalizedString.new(letter.strip, "en", "Latn")
    end
    forename = []
    forename << RelatonBib::LocalizedString.new(given, "en", "Latn") if given
    fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
    contribs << { entity: RelatonBib::Person.new(name: fullname), role: [{ type: "author" }] }
  end

  publisher = RelatonBib::Organization.new(
    name: "Commission Internationale de L'Eclairage", abbreviation: "CIE", url: "cie.co.at"
  )
  contribs << { entity: publisher, role: [{ type: "publisher" }] }
end
fetch_date(doc)
click to toggle source
@param doc [Mechanize::Page] @return [Array<RelatonBib::BibliographicDate>]
# File lib/relaton_cie/data_fetcher.rb, line 55
#
# Read the <dd> that follows the "Published:" <dt>. A value that starts
# with a four-digit year not followed by "-" is kept verbatim; anything
# else is parsed as MM/DD/YYYY and re-emitted as YYYY-MM-DD.
#
# @param doc [Mechanize::Page]
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date(doc)
  published = doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]')
  published.map do |node|
    raw = node.text.strip
    on = if raw.match?(/^\d{4}(?:[^-]|$)/)
           raw
         else
           Date.strptime(raw, "%m/%d/%Y").strftime("%Y-%m-%d")
         end
    RelatonBib::BibliographicDate.new(type: "published", on: on)
  end
end
fetch_docid(hit, doc)
click to toggle source
@param hit [Nokogiri::XML::Element] @param doc [Mechanize::Page] @return [Array<RelatonBib::DocumentIdentifier>]
# File lib/relaton_cie/data_fetcher.rb, line 21
#
# Derive the document identifiers of one listing entry.
#
# The heading link text of +hit+ is cleaned (U+25B9 removed, " / "
# collapsed to "/"). A trailing ISO/IEC designation introduced by "(" or
# "/" is split off and becomes its own identifier. The CIE identifier is
# then taken, in order of preference, from the heading itself, from the
# "Product Code(s):" field of +doc+, or from the parenthesised part of the
# heading. An ISBN identifier is added when the page lists one.
#
# @param hit [Nokogiri::XML::Element] listing entry
# @param doc [Mechanize::Page] product page of the entry
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(hit, doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  listing = hit.at("h3/a").text.strip.sub(/\u25b9/, "").gsub(" / ", "/")

  joint = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)}.match(listing)
  c2 = joint && joint[:c2]
  # Keep only the part in front of the ISO/IEC designation.
  listing = listing[0...joint.begin(0)].strip if joint

  parts = /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/.match(listing)
  code1 = parts && parts[:code1]
  code2 = parts && parts[:code2]

  if code1.match?(/^CIE/)
    # Long headings fall back to the shorter parenthesised code.
    cie_id = if code1.size > 25 && code2
               "CIE #{code2.sub(/,(\sPages)?/, '')}"
             else
               code1
             end
    addendum = doc.at("//hgroup/h2")&.text&.match(/(Add)endum\s(\d+)$/)
    cie_id += " #{addendum[1]} #{addendum[2]}" if addendum
  elsif (pcode = doc.at('//dt[.="Product Code(s):"]/following-sibling::dd'))
    cie_id = "CIE #{pcode.text.strip.match(/[^,]+/)}"
  else
    num = listing.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s.gsub(/,(?=\s)/, "").gsub(/,(?=\S)/, " ")
    cie_id = "CIE #{num}"
  end

  ids = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: cie_id)]
  isbn = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
  ids << RelatonBib::DocumentIdentifier.new(type: c2.match(/\w+/).to_s, id: c2.strip) if c2
  ids << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn.text.strip) if isbn
  ids
end
fetch_edition(doc)
click to toggle source
@param doc [Mechanize::Page] @return [String, nil]
# File lib/relaton_cie/data_fetcher.rb, line 65
#
# Extract the edition number from the <dd> following the "Edition:" <dt>.
#
# The leading digits are returned when they are followed by an English
# ordinal suffix. All four suffixes are accepted, so "1st", "2nd" and
# "3rd" are recognised as well as "4th".
#
# @param doc [Mechanize::Page]
# @return [String, nil] the edition digits, or nil when the field is
#   missing or does not start with an ordinal number
def fetch_edition(doc)
  edition = doc.at('//dt[.="Edition:"]/following-sibling::dd')
  edition&.text&.match(/^\d+(?=st|nd|rd|th)/)&.to_s
end
fetch_link(url)
click to toggle source
@param url [String] @return [Array<RelatonBib::TypedUri>]
# File lib/relaton_cie/data_fetcher.rb, line 89
#
# Wrap the page address as the single "src" link of the document.
#
# @param url [String]
# @return [Array<RelatonBib::TypedUri>]
def fetch_link(url)
  source = RelatonBib::TypedUri.new(type: "src", content: url)
  [source]
end
fetch_relation(doc)
click to toggle source
@param doc [Mechanize::Page] @return [Array<Hash>]
# File lib/relaton_cie/data_fetcher.rb, line 71
#
# Build one relation per entry of the "history" section, skipping the
# entry marked "selected-product".
#
# An entry whose <li> carries an <i> with a "historical" class is reported
# as "updates"; every other entry as "updatedBy". The marker is looked up
# relative to the entry's own <li>: an XPath starting with "//" is
# evaluated against the whole document, which made every entry share the
# same type as soon as one historical marker existed anywhere on the page.
# NOTE(review): assumes the marker <i> is a direct child of the same <li>
# that holds the link — confirm against the live markup.
#
# @param doc [Mechanize::Page]
# @return [Array<Hash>] hashes with :type and :bibitem keys
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]').map do |rel|
    ref = rel.at("a")
    url = "https://www.techstreet.com#{ref[:href]}"
    title = RelatonBib::TypedTitleString.from_string ref.at('p/span[@class="title"]').text
    did = ref.at("h3").text
    docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: did)]
    on = ref.at("p/time")
    date = [RelatonBib::BibliographicDate.new(type: "published", on: on[:datetime])]
    link = [RelatonBib::TypedUri.new(type: "src", content: url)]
    bibitem = RelatonBib::BibliographicItem.new docid: docid, title: title, link: link, date: date
    type = rel.at('./i[contains(@class,"historical")]') ? "updates" : "updatedBy"
    { type: type, bibitem: bibitem }
  end
end
fetch_title(doc)
click to toggle source
@param doc [Mechanize::Page] @return [RelatonBib::TypedTitleStringCollection, Array]
# File lib/relaton_cie/data_fetcher.rb, line 46
#
# Build the title collection from the first node matching
# "//hgroup/h2" or "//hgroup/h1".
#
# @param doc [Mechanize::Page]
# @return [RelatonBib::TypedTitleStringCollection, Array] an empty Array
#   when no heading is found
def fetch_title(doc)
  heading = doc.at("//hgroup/h2", "//hgroup/h1")
  heading ? RelatonBib::TypedTitleString.from_string(heading.text.strip) : []
end
parse_page(hit)
click to toggle source
@param hit [Nokogiri::XML::Element]
# File lib/relaton_cie/data_fetcher.rb, line 148
#
# Download the product page behind one listing entry, turn it into a
# bibliographic item and write it to disk. Any StandardError is reported
# on stderr (URL, message, backtrace) and swallowed, so one broken page
# does not stop the crawl.
#
# @param hit [Nokogiri::XML::Element] listing entry
def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
  doc = time_req { @agent.get url }
  attrs = {
    type: "standard",
    docid: fetch_docid(hit, doc),
    title: fetch_title(doc),
    link: fetch_link(url),
    abstract: fetch_abstract(doc),
    date: fetch_date(doc),
    edition: fetch_edition(doc),
    contributor: fetch_contributor(doc),
    relation: fetch_relation(doc),
    language: ["en"],
    script: ["Latn"],
    doctype: "document",
  }
  write_file RelatonBib::BibliographicItem.new(**attrs)
rescue StandardError => e
  warn "Document: #{url}"
  warn e.message
  warn e.backtrace
end
time_req() { || ... }
click to toggle source
# File lib/relaton_cie/data_fetcher.rb, line 172
#
# Run the given block and make sure at least one second has passed before
# returning, sleeping for the remainder when the block was faster.
#
# Elapsed time is taken from the monotonic clock, so wall-clock changes
# during the request (NTP step, DST, manual change) cannot stretch or skip
# the pause.
#
# @yield the request to perform
# @return [Object] whatever the block returned
def time_req
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  result = yield
  remaining = 1 - (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started)
  sleep remaining if remaining.positive?
  result
end
write_file(bib)
click to toggle source
@param bib [RelatonBib::BibliographicItem]
# File lib/relaton_cie/data_fetcher.rb, line 135
#
# Serialise +bib+ into the output directory. The file name is the first
# document identifier with "/", whitespace, "-", ":" and "." replaced by
# "_" and then upcased; the extension is the configured format. No
# existence check is made, so a file with the same name is overwritten.
#
# @param bib [RelatonBib::BibliographicItem]
def write_file(bib)
  basename = bib.docidentifier[0].id.gsub(%r{[/\s\-:.]}, "_").upcase
  path = "#{@output}/#{basename}.#{@format}"
  content = if @format == "xml"
              bib.to_xml(bibdata: true)
            else
              bib.to_hash.to_yaml
            end
  File.write path, content, encoding: "UTF-8"
end