class RelatonW3c::Scrapper

Constants

DOCTYPES

Public Class Methods

parse_page(hit) click to toggle source

@param hit [Hash] @return [RelatonW3c::W3cBibliographicItem]

# File lib/relaton_w3c/scrapper.rb, line 16
# Requests hit["link"] over HTTP and builds a bibliographic item from the
# hit and, when the response code is "200", from the parsed HTML page.
#
# @param hit [Hash] read keys: "link" (required) and "keyword"
# @return [RelatonW3c::W3cBibliographicItem]
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  response = Net::HTTP.get_response(URI.parse(hit["link"]))
  # For any other response code doc stays nil and is handed to the fetch_*
  # helpers as nil.
  doc = Nokogiri::HTML(response.body) if response.code == "200"
  attributes = {
    type: "standard",
    docid: fetch_docid(hit),
    fetched: Date.today.to_s,
    language: ["en"],
    script: ["Latn"],
    title: fetch_title(hit, doc),
    abstract: fetch_abstract(doc),
    link: fetch_link(hit),
    date: fetch_date(hit, doc),
    doctype: fetch_doctype(hit, doc),
    contributor: fetch_contributor(hit, doc),
    relation: fetch_relation(doc),
    keyword: hit["keyword"],
  }
  W3cBibliographicItem.new(**attributes)
end

Private Class Methods

contrib_info(**args) click to toggle source

@param name [String] @param url [String, NilClass] @param role [Array<Hash>] @param org [Hash] @return [RelatonBib::ContributionInfo]

# File lib/relaton_w3c/scrapper.rb, line 185
# Builds a RelatonBib::ContributionInfo for a single person.
#
# @param args [Hash] reads :name, :url, :role and :org; when :org is
#   present it is splatted into RelatonBib::Organization.new and attached
#   to the person as an affiliation
# @return [RelatonBib::ContributionInfo]
def contrib_info(**args)
  full_name = RelatonBib::FullName.new(
    completename: RelatonBib::LocalizedString.new(args[:name]),
  )
  affiliations =
    if args[:org]
      organization = RelatonBib::Organization.new(**args[:org])
      [RelatonBib::Affiliation.new(organization: organization)]
    else
      []
    end
  person = RelatonBib::Person.new(
    name: full_name, url: args[:url], affiliation: affiliations,
  )
  RelatonBib::ContributionInfo.new(entity: person, role: args[:role])
end
fetch_abstract(doc) click to toggle source

@param doc [Nokogiri::HTML::Document, NilClass] @return [Array<RelatonBib::FormattedString>]

# File lib/relaton_w3c/scrapper.rb, line 73
# Extracts the abstract paragraph from a parsed page.
#
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [Array<RelatonBib::FormattedString>] one element holding the
#   abstract text, or an empty array when doc is nil or neither XPath
#   matches
def fetch_abstract(doc)
  return [] unless doc

  node = doc.at("//h2[.='Abstract']/following-sibling::p",
                "//div[@class='abstract']/p")
  # A page without an abstract yields nil here; calling #text on it would
  # raise NoMethodError.
  return [] unless node

  [RelatonBib::FormattedString.new(content: node.text, language: "en",
                                   script: "Latn")]
end
fetch_contributor(hit, doc) click to toggle source

@param hit [Hash] @param doc [Nokogiri::HTML::Document, NilClass] @return [Array<RelatonBib::ContributionInfo>]

# File lib/relaton_w3c/scrapper.rb, line 130
# Collects editors and authors of a document.
#
# With a parsed page, contributors come from the "Editors" and "Authors"
# definition lists; an author whose data-editor-id matches an already
# collected editor only gains an extra "author" role. Without a page the
# names in hit["editor"] are used, each with the "editor" role.
#
# @param hit [Hash] "editor" is read only when doc is nil; it may be absent
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [Array<RelatonBib::ContributionInfo>]
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  unless doc
    # Array() turns a missing "editor" key into an empty list instead of
    # calling #map on nil.
    names = Array(hit["editor"])
    return names.map { |ed| contrib_info(name: ed, role: [{ type: "editor" }]) }
  end

  contribs = find_contribs(doc, "Editors").map do |ed|
    parse_contrib ed, "editor"
  end.compact
  find_contribs(doc, "Authors").each do |ath|
    ed = contribs.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
    if ed
      ed[:role] << { type: "author" }
    elsif (author = parse_contrib(ath, "author"))
      # parse_contrib returns nil for entries without a link; skip those,
      # as is done for editors.
      contribs << author
    end
  end
  contribs.map { |c| contrib_info(**c) }
end
fetch_date(hit, doc) click to toggle source

@param hit [Hash] @param doc [Nokogiri::HTML::Document, NilClass] @return [Array<RelatonBib::BibliographicDate>]

# File lib/relaton_w3c/scrapper.rb, line 91
# Determines the publication date, trying in order: hit["datepub"], the
# datetime attribute of an h2/time element, fetch_date1 and fetch_date2.
#
# @param hit [Hash]
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [Array<RelatonBib::BibliographicDate>, NilClass] a single
#   "published" date, or nil when no source yields a value
def fetch_date(hit, doc) # rubocop:disable Metrics/CyclomaticComplexity
  published = hit["datepub"]
  published ||= doc&.at("//h2/time[@datetime]")&.attr(:datetime)
  published ||= fetch_date1(doc)
  published ||= fetch_date2(doc)
  return unless published

  [RelatonBib::BibliographicDate.new(type: "published", on: published)]
end
fetch_date1(doc) click to toggle source

@param doc [Nokogiri::HTML::Document, NilClass] @return [String, NilClass]

# File lib/relaton_w3c/scrapper.rb, line 99
# Reads a YYYY-MM-DD date from the content attribute of the h2 element
# marked with property="dc:issued".
#
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [String, NilClass] the matched date text, or nil when doc is nil,
#   the element or attribute is absent, or the value holds no such date
def fetch_date1(doc)
  issued = doc&.at("//h2[@property='dc:issued']")
  stamp = issued&.attr(:content)
  return if stamp.nil?

  found = stamp.match(/\d{4}-\d{2}-\d{2}/)
  found&.to_s
end
fetch_date2(doc) click to toggle source

@param doc [Nokogiri::HTML::Document, NilClass] @return [String, NilClass]

# File lib/relaton_w3c/scrapper.rb, line 106
# Reads a date embedded in the id attribute of an h2 element whose id
# contains "w3c-recommendation" (the id is expected to carry a
# DD-month-YYYY fragment).
#
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [String, NilClass] the date as formatted by Date#to_s, or nil
#   when doc is nil, no such heading exists, or its id holds no date
def fetch_date2(doc)
  heading = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
  return unless heading

  # Match against the attribute VALUE; matching the Symbol :id never finds
  # a date and ends up passing nil to Date.parse.
  stamp = heading.attr(:id).to_s[/\d{2}-\w+-\d{4}/]
  return unless stamp

  Date.parse(stamp).to_s
end
fetch_docid(hit) click to toggle source

@param hit [Hash] @return [Array<RelatonBib::DocumentIdentifier>]

# File lib/relaton_w3c/scrapper.rb, line 40
# Derives the document identifier from the last "/"-separated segment of
# hit["link"].
#
# @param hit [Hash] must contain "link"
# @return [Array<RelatonBib::DocumentIdentifier>] single identifier of
#   type "W3C"
def fetch_docid(hit)
  segments = hit["link"].split("/")
  identifier = RelatonBib::DocumentIdentifier.new(id: segments[-1], type: "W3C")
  [identifier]
end
fetch_doctype(hit, doc) click to toggle source

@param hit [Hash] @param doc [Nokogiri::HTML::Document, NilClass] @return [String]

# File lib/relaton_w3c/scrapper.rb, line 116
# Resolves the document type through the DOCTYPES table.
#
# hit["type"] wins when present. Otherwise, with a parsed page, the first
# HitCollection::TYPES entry whose value appears in an h2 element that
# contains a time[@datetime] child supplies the lookup key.
#
# @param hit [Hash]
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [String, NilClass] nil when there is neither a hit type nor a doc,
#   or when the lookup key has no DOCTYPES entry
def fetch_doctype(hit, doc)
  return DOCTYPES[hit["type"]] if hit["type"]
  return unless doc

  key, = HitCollection::TYPES.detect do |_name, label|
    doc.at("//h2[contains(., '#{label}')]/time[@datetime]")
  end
  DOCTYPES[key]
end
fetch_relation(doc) click to toggle source

@param doc [Nokogiri::HTML::Document, NilClass] @return [Array<RelatonBib::DocumentRelation>]

# File lib/relaton_w3c/scrapper.rb, line 199
# Builds an "obsoletedBy" relation to the document that
# recommendation_link(doc) points to, parsing that page via parse_page.
#
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [Array<RelatonBib::DocumentRelation>] empty when doc is nil or
#   recommendation_link returns nothing
def fetch_relation(doc)
  return [] unless doc

  link = recommendation_link(doc)
  return [] unless link

  item = parse_page({ "link" => link })
  [RelatonBib::DocumentRelation.new(type: "obsoletedBy", bibitem: item)]
end
fetch_title(hit, doc) click to toggle source

@param hit [Hash] @param doc [Nokogiri::HTML::Document, NilClass] @return [Array<RelatonBib::TypedTitleString>]

# File lib/relaton_w3c/scrapper.rb, line 48
# Collects the main title and subtitle of a document.
#
# With a parsed page, the main title is the text of the first element whose
# id contains "title" (newlines replaced by spaces) and the subtitle is the
# text of h2#subtitle or a p whose class contains "subline". When nothing
# was found on the page, hit["title"] is used as the main title.
#
# @param hit [Hash]
# @param doc [Nokogiri::HTML::Document, NilClass]
# @return [Array<RelatonBib::TypedTitleString>]
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  titles = []
  if doc
    main = doc.at("//*[contains(@id, 'title')]")&.text
    if main && !main.empty?
      titles << { content: main.gsub(/\n/, " "), type: "main" }
    end
    subtitle = doc.at(
      "//h2[@id='subtitle']|//p[contains(@class, 'subline')]",
    )&.text
    # The key must be :type; it is read back as t[:type] below.
    titles << { content: subtitle, type: "subtitle" } if subtitle
  end
  if titles.empty? && hit["title"]
    titles << { content: hit["title"], type: "main" }
  end
  titles.map do |t|
    title = RelatonBib::FormattedString.new(
      content: t[:content], language: "en", script: "Latn",
    )
    RelatonBib::TypedTitleString.new(type: t[:type], title: title)
  end
end
find_contribs(doc, type) click to toggle source

@param doc [Nokogiri::HTML::Document] @param type [String] @return [Array<Nokogiri::XML::Element>]

# File lib/relaton_w3c/scrapper.rb, line 157
# Selects the dd elements that follow a dt containing the given label and
# whose nearest preceding dt also contains that label.
#
# @param doc [Nokogiri::HTML::Document]
# @param type [String] label text looked for in the dt, e.g. "Editors"
# @return the result of doc.xpath for the constructed query
def find_contribs(doc, type)
  label = "contains(.,'#{type}')"
  query = "//dt[#{label}]/following-sibling::dd" \
          "[preceding-sibling::dt[1][#{label}]]"
  doc.xpath(query)
end
parse_contrib(element, type) click to toggle source

@param element [Nokogiri::XML::Element] @param type [String] @return [Hash]

# File lib/relaton_w3c/scrapper.rb, line 165
# Turns one contributor element into a hash describing the person.
#
# The first "a" inside the element supplies name (text) and url (href); the
# result of element.at("a[2]"), when present, supplies the organization.
#
# @param element [Nokogiri::XML::Element]
# @param type [String] role type stored in the result, e.g. "editor"
# @return [Hash, NilClass] keys :name, :url, :role, :id and optionally
#   :org; nil when the element contains no "a"
def parse_contrib(element, type) # rubocop:disable Metrics/MethodLength
  person_link = element.at("a")
  return unless person_link

  contrib = {
    name: person_link.text,
    url: person_link[:href],
    role: [{ type: type }],
    id: element["data-editor-id"],
  }
  org_link = element.at("a[2]")
  return contrib unless org_link

  contrib.merge(org: { name: org_link.text, url: org_link[:href] })
end