module ItuBib::Scrapper

Scrapper. rubocop:disable Metrics/ModuleLength

Constants

DOMAIN
TYPES

Public Class Methods

parse_page(hit_data) click to toggle source

Parse page. @param hit_data [Hash] @return [IsoBibItem::IsoBibliographicItem] rubocop:disable Metrics/AbcSize, Metrics/MethodLength

# File lib/itubib/scrapper.rb, line 52
# Download the page a search hit points to and assemble a bibliographic item
# from it.
#
# @param hit_data [Hash] search hit; +:url+ and +:code+ are read here and the
#   whole hash is handed to +fetch_titles+
# @return [IsoBibItem::IsoBibliographicItem]
def parse_page(hit_data)
  code = hit_data[:code]
  page_url = hit_data[:url]
  page = get_page page_url

  # The edition is the bold text inside the span whose id contains "Label8".
  edition = page.at("//table/tr/td/span[contains(@id, 'Label8')]/b").text

  IsoBibItem::IsoBibliographicItem.new(
    docid: fetch_docid(code),
    edition: edition,
    language: ['en'],
    script: ['Latn'],
    titles: fetch_titles(hit_data),
    type: fetch_type(page),
    docstatus: fetch_status(page),
    ics: [], # fetch_ics(doc),
    dates: fetch_dates(page),
    contributors: fetch_contributors(code),
    workgroup: fetch_workgroup(page),
    abstract: fetch_abstract(page),
    copyright: fetch_copyright(code, page),
    link: fetch_link(page, page_url),
    relations: fetch_relations(page)
  )
end

Private Class Methods

fetch_abstract(doc) click to toggle source

Fetch abstracts. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/itubib/scrapper.rb, line 83
# Fetch the abstract by following the link embedded in the page.
#
# The abstract location is read from the +onclick+ attribute of the +div+
# inside the span whose id contains "lbl_dms". That document is downloaded and
# the text of its +p.MsoNormal+ elements becomes the abstract content.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one hash with +:content+, +:language+ and +:script+,
#   or an empty array when the page has no abstract element or no usable URL
def fetch_abstract(doc)
  abstract_link = doc.at('//table/tr/td/span[contains(@id, "lbl_dms")]/div')
  return [] unless abstract_link

  # The attribute may be absent or may not contain an http(s) URL; in both
  # cases there is nothing to download, so report "no abstract" instead of
  # failing on nil or requesting an empty URI.
  url = abstract_link[:onclick].to_s[/https?[^']+/]
  return [] unless url

  d = Nokogiri::HTML(Net::HTTP.get(URI(url)))
  # Drop CRLF line breaks, collapse whitespace runs, remove non-breaking spaces.
  abstract_content = d.css('p.MsoNormal').text.gsub(/\r\n/, '')
    .gsub(/\s{2,}/, ' ').gsub(/\u00a0/, '')

  [{
    content:  abstract_content,
    language: 'en',
    script:   'Latn'
  }]
end
fetch_contributors(code) click to toggle source

Fetch contributors. @param code [String] @return [Array<Hash>]

# File lib/itubib/scrapper.rb, line 244
# Build the publisher entry for a document code.
#
# The abbreviation is the code with everything from the first
# "-<word char><whitespace>" sequence onwards removed. Name and URL are only
# known for 'ITU'; for any other abbreviation they are nil.
#
# @param code [String] document code
# @return [Array<Hash>] a single contributor hash with the 'publisher' role
def fetch_contributors(code)
  abbreviation = code.sub(/-\w\s.*/, '')
  name, url =
    if abbreviation == 'ITU'
      ['International Telecommunication Union', 'www.itu.int']
    else
      [nil, nil]
    end
  publisher = { name: name, url: url, abbreviation: abbreviation }
  [{ entity: publisher, roles: ['publisher'] }]
end
fetch_dates(doc) click to toggle source

Fetch dates @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/itubib/scrapper.rb, line 232
# Read the publication date from the span whose id contains "Label5".
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one 'published' date hash, or an empty array when the
#   span's text is empty
def fetch_dates(doc)
  published_on = doc.at("//table/tr/td/span[contains(@id, 'Label5')]").text
  return [] if published_on.empty?

  [{ type: 'published', on: published_on }]
end
fetch_docid(code) click to toggle source

Fetch docid. @param code [String] @return [Hash]

# File lib/itubib/scrapper.rb, line 130
# Split a document code into the pieces of a document identifier.
#
# The pattern only matches text that follows a whitespace character, so a
# code without any whitespace yields no match and the capture lookup below
# raises NoMethodError.
#
# @param code [String] document code
# @return [Hash] +:project_number+, +:part_number+, +:subpart_number+,
#   +:prefix+ (always nil), +:type+ (always 'ITU') and +:id+ (the code itself)
def fetch_docid(code)
  matched = code.match(/(?<=\s)(?<project>[^\s]+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/)
  project, part, subpart = %i[project part subpart].map { |group| matched[group] }

  {
    project_number: project,
    part_number: part,
    subpart_number: subpart,
    prefix: nil,
    type: 'ITU',
    id: code
  }
end
fetch_relations(doc) click to toggle source

Fetch relations. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>] rubocop:disable Metrics/MethodLength

# File lib/itubib/scrapper.rb, line 179
# Collect related documents from the table inside the element whose id
# contains "tab_sup", skipping its first two rows.
#
# For each row the lower-cased status text becomes the relation type, with
# 'in force' mapped to 'published'. The link's href has a leading "." replaced
# by '/ITU-T/recommendations' and is appended to DOMAIN.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with +:type+, +:identifier+ and +:url+
def fetch_relations(doc)
  rows = doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
  rows.map do |row|
    status = row.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
    relation_type = status == 'in force' ? 'published' : status
    link = row.at('./td/span[contains(@id, "title_e")]/nobr/a')
    target = DOMAIN + link[:href].sub(/^\./, '/ITU-T/recommendations')
    { type: relation_type, identifier: link.text, url: target }
  end
end
fetch_status(doc) click to toggle source

Fetch status. @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/itubib/scrapper.rb, line 146
# Translate the status text in the span whose id contains "Label7" into a
# status/stage/substage triple.
#
# Exactly 'In force' yields Published 60/60; any other text yields
# Withdrawal 95/99.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] +:status+, +:stage+ and +:substage+ (all strings)
def fetch_status(doc)
  label = doc.at("//table/tr/td/span[contains(@id, 'Label7')]").text
  in_force = label == 'In force'

  {
    status: in_force ? 'Published' : 'Withdrawal',
    stage: in_force ? '60' : '95',
    substage: in_force ? '60' : '99'
  }
end
fetch_titles(hit_data) click to toggle source

Fetch titles. @param hit_data [Hash] @return [Array<Hash>]

# File lib/itubib/scrapper.rb, line 202
# Split the hit's title on ' - ' into intro, main and part components.
#
# - no segments: main is the empty string
# - one segment: it is the main title
# - two segments: main + part when the second starts with "Part N:" or
#   "Partie N:", otherwise intro + main
# - three segments: intro, main, part
# - more: intro, main, and the remaining segments joined with " -- " as part
#
# @param hit_data [Hash] search hit; only +:title+ is read
# @return [Array<Hash>] a single title hash in English / Latin script
def fetch_titles(hit_data)
  segments = hit_data[:title].split(' - ')
  intro, main, part =
    case segments.size
    when 0 then [nil, "", nil]
    when 1 then [nil, segments[0], nil]
    when 2
      if segments[1] =~ /^(Part|Partie) \d+:/
        [nil, *segments]
      else
        [*segments, nil]
      end
    when 3 then segments
    else [segments[0], segments[1], segments[2..-1]&.join(" -- ")]
    end

  [{
    title_intro: intro,
    title_main:  main,
    title_part:  part,
    language:    'en',
    script:      'Latn'
  }]
end
fetch_type(doc) click to toggle source

Fetch type. @param doc [Nokogiri::HTML::Document] @return [String]

# File lib/itubib/scrapper.rb, line 195
# Report the document type.
#
# The page is not inspected; every document is reported with the same type.
#
# @param doc [Nokogiri::HTML::Document] accepted but unused
# @return [String] always 'international-standard'
def fetch_type(doc)
  'international-standard'
end
fetch_workgroup(doc) click to toggle source

Fetch workgroup. @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/itubib/scrapper.rb, line 163
# Describe the organisation and technical committee responsible for the
# document.
#
# The committee name is the text of the link inside the span whose id contains
# "Label8"; its number is the first run of digits in that name, or nil when
# the name contains no digits.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] organisation details with a nested +:technical_committee+ hash
def fetch_workgroup(doc)
  committee_name = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a').text
  committee = {
    name: committee_name,
    type: 'technicalCommittee',
    number: committee_name[/\d+/]&.to_i
  }

  {
    name: 'International Telecommunication Union',
    abbreviation: 'ITU',
    url: 'www.itu.int',
    technical_committee: committee
  }
end
get_page(url) click to toggle source

rubocop:disable Metrics/AbcSize, Metrics/MethodLength Get page. @param url [String] page's URL @return [Nokogiri::HTML::Document]

# File lib/itubib/scrapper.rb, line 116
# Download a page, following HTTP redirects, and parse it as HTML.
#
# Redirect responses (301, 302, 303, 307, 308) are followed up to a fixed
# limit so that a redirect cycle cannot make the call loop forever. The
# Location header is resolved against the URI that produced it, so relative
# redirect targets work as well as absolute ones.
#
# @param url [String] address of the page
# @return [Nokogiri::HTML::Document] parsed body of the final response
# @raise [RuntimeError] when a redirect carries no Location header or the
#   redirect limit is exceeded
def get_page(url)
  redirect_codes = %w[301 302 303 307 308]
  max_redirects = 10

  uri = URI(url)
  resp = Net::HTTP.get_response(uri)
  followed = 0
  while redirect_codes.include?(resp.code)
    location = resp['location']
    raise "HTTP #{resp.code} from #{uri} without a Location header" unless location

    followed += 1
    raise "Too many HTTP redirects (more than #{max_redirects}) for #{url}" if followed > max_redirects

    # URI#merge keeps absolute targets as they are and resolves relative ones.
    uri = uri.merge(location)
    resp = Net::HTTP.get_response(uri)
  end
  Nokogiri::HTML(resp.body)
end