module RfcBib::Scrapper

Scrapper module

Constants

ID_URI_PATTERN
RFC_URI_PATTERN

Public Class Methods

scrape_page(text) click to toggle source

@param text [String] @return [IsoBibItem::BibliographicItem, nil] nil when the code is not recognised or no reference is found

# File lib/rfcbib/scrapper.rb, line 19
# Download the reference file for a document code and build a
# bibliographic item from it.
#
# The code is turned into a file name ("RFC 8341" becomes "RFC.8341.xml"),
# substituted for "CODE" in the matching URI pattern and fetched once.
# The located <reference> node is kept in @reference for the helper methods.
#
# @param text [String] document code, optionally prefixed with "IETF "
# @return [IsoBibItem::BibliographicItem, nil] nil when the code is neither
#   an RFC nor an I-D, the response code is not "200", or the response
#   contains no <reference> element
def scrape_page(text)

  # Remove initial "IETF " string if specified
  ref = text.
    gsub(/^IETF /, "").
    sub(' ', '.') + '.xml'

  # gsub below returns a new string, so the pattern constants need no dup.
  uri = case ref
  when /^RFC/
    RFC_URI_PATTERN
  when /^I-D/
    ID_URI_PATTERN
  else
    warn "#{ref}: not recognised for RFC"
    return
  end

  uri = uri.gsub("CODE", ref)
  res = Net::HTTP.get_response(URI(uri))
  if res.code != "200"
    warn "No document found at #{uri}"
    return
  end
  # Parse the body that was just downloaded rather than requesting the
  # same URI a second time.
  doc = Nokogiri::HTML res.body
  @reference = doc.at('//reference')
  return unless @reference
  bib_item
end

Private Class Methods

add_contact(contacts, type, value) click to toggle source

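@param contacts [Array] list the contact is appended to @param type [String] allowed “phone”, “email” or “uri” @param value [Nokogiri::XML::Node, nil] node whose text is the contact value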

# File lib/rfcbib/scrapper.rb, line 148
# Append a contact built from a node's text, if the node is present.
#
# @param contacts [Array] list the new contact is appended to
# @param type [String] allowed "phone", "email" or "uri"
# @param value [#text, nil] node whose text becomes the contact value
# @return [Array, nil] the contacts list, or nil when value is missing
def add_contact(contacts, type, value)
  if value
    entry = IsoBibItem::Contact.new(type: type, value: value.text)
    contacts << entry
  end
end
address(postal) click to toggle source

@param postal [Nokogiri::XML::Document] @return [IsoBibItem::Address]

# File lib/rfcbib/scrapper.rb, line 136
# Build an address from the children of a <postal> node.
#
# Lookups use ".//" so they stay inside the given node; a leading "//" is
# resolved from the document root and returns the first match in the whole
# document, whichever node the search is called on. Elements that are
# absent yield nil (or an empty street list) instead of raising.
#
# NOTE(review): scrape_page parses with Nokogiri::HTML, which is believed to
# lower-case element names - confirm that "postalLine" can ever match.
#
# @param postal [Nokogiri::XML::Node] the <postal> node
# @return [IsoBibItem::Address]
def address(postal)
  text_of = lambda do |path|
    node = postal.at(path)
    node && node.text
  end

  IsoBibItem::Address.new(
    street: [text_of.call('.//postalLine') || text_of.call('.//street')].compact,
    city: text_of.call('.//city'),
    postcode: text_of.call('.//code'),
    country: text_of.call('.//country'),
    state: text_of.call('.//region')
  )
end
affiliation(author) click to toggle source

@param author [Nokogiri::XML::Document] @return [IsoBibItem::Affilation]

# File lib/rfcbib/scrapper.rb, line 155
# Build the affiliation of one author from its <organization> element.
#
# The lookup uses ".//" so it is scoped to the given author; a leading "//"
# is resolved from the document root and would give every author the first
# organization in the document. A missing or empty organization falls back
# to "IETF"; a missing abbrev attribute also falls back to "IETF".
#
# @param author [Nokogiri::XML::Node] the <author> node
# @return [IsoBibItem::Affilation]
def affiliation(author)
  organization = author.at('.//organization')
  name = organization ? organization.text : ''
  abbrev = organization && organization[:abbrev]
  IsoBibItem::Affilation.new IsoBibItem::Organization.new(
    name: name.empty? ? 'IETF' : name,
    abbreviation: abbrev || 'IETF'
  )
end
bib_item() click to toggle source

@return [IsoBibItem::BibliographicItem]

# File lib/rfcbib/scrapper.rb, line 53
# Assemble the bibliographic item from the current @reference node.
#
# The anchor attribute is used as the id; for the document identifier a
# space is inserted after a leading "RFC".
#
# @return [IsoBibItem::BibliographicItem]
def bib_item
  anchor = @reference[:anchor]
  attributes = {
    id: anchor,
    docid: docids(anchor.sub(/^(RFC)/, "\\1 ")),
    status: status,
    language: [language],
    link: [{ type: 'src', content: @reference[:target] }],
    titles: titles,
    contributors: contributors,
    dates: dates,
    series: series
  }
  IsoBibItem::BibliographicItem.new(**attributes)
end
contacts(addr) click to toggle source

@param addr [Nokogiri::XML::Document] @return [Array<IsoBibItem::Address, IsoBibItem::Contact>]

# File lib/rfcbib/scrapper.rb, line 123
# Collect the postal address and phone/email/uri contacts of an <address>
# node.
#
# Lookups use ".//" so they stay inside the given node; a leading "//" is
# resolved from the document root and would return the first match in the
# whole document for every author.
#
# @param addr [Nokogiri::XML::Node, nil] the <address> node, if any
# @return [Array<IsoBibItem::Address, IsoBibItem::Contact>] empty when addr
#   is nil
def contacts(addr)
  found = []
  return found unless addr
  postal = addr.at('.//postal')
  found << address(postal) if postal
  %w[phone email uri].each do |type|
    add_contact(found, type, addr.at(".//#{type}"))
  end
  found
end
contributor_role(author) click to toggle source

@param author [Nokogiri::XML::Document] @return [String]

# File lib/rfcbib/scrapper.rb, line 165
# Role of a contributor, taken from the node's role attribute.
#
# @param author [#[]] the <author> node
# @return [String] the role attribute, or "author" when it is not set
def contributor_role(author)
  role = author[:role]
  return 'author' unless role
  role
end
contributors() click to toggle source

@return [Array<Hash>]

# File lib/rfcbib/scrapper.rb, line 80
# All contributors of the reference: persons first, then organizations.
#
# @return [Array<Hash>]
def contributors
  people = persons
  orgs = organizations
  people + orgs
end
dates() click to toggle source

Extract date from reference.

@return [Array<IsoBibItem::BibliographicDate>] published date.

# File lib/rfcbib/scrapper.rb, line 179
# Extract the publication date from the reference's //front/date node.
#
# Year, month and day attributes are joined with "-" (day defaults to
# "01", nil parts are dropped), parsed and reformatted as YYYY-MM-DD.
#
# @return [Array<IsoBibItem::BibliographicDate>, nil] a single "published"
#   date, or nil when the reference has no date node
def dates
  node = @reference.at('//front/date')
  return unless node

  parts = [node[:year], month(node[:month]), node[:day] || "01"].compact
  published = Time.parse(parts.join('-')).strftime('%Y-%m-%d')
  [IsoBibItem::BibliographicDate.new(type: 'published', on: published)]
end
docids(id) click to toggle source

Extract document identifiers from reference

@return [Array<IsoBibItem::DocumentIdentifier>]

# File lib/rfcbib/scrapper.rb, line 192
# Extract document identifiers from the reference.
#
# The first identifier is always the IETF one built from id; one more is
# added for every seriesinfo node whose name attribute is "DOI".
#
# @param id [String] IETF document identifier
# @return [Array<IsoBibItem::DocumentIdentifier>]
def docids(id)
  ietf = IsoBibItem::DocumentIdentifier.new(type: "IETF", id: id)
  doi_nodes = @reference.xpath('//seriesinfo').select { |si| si[:name] == 'DOI' }
  dois = doi_nodes.map do |si|
    IsoBibItem::DocumentIdentifier.new(id: si[:value], type: si[:name])
  end
  [ietf] + dois
end
full_name(author) click to toggle source

@param author [Nokogiri::XML::Document] @return [IsoBibItem::FullName]

# File lib/rfcbib/scrapper.rb, line 107
# Build the full name of an author from its fullname, initials and surname
# attributes, each wrapped as a localized string.
#
# @param author [Nokogiri::XML::Node] the <author> node
# @return [IsoBibItem::FullName]
def full_name(author)
  complete = localized_string(author[:fullname])
  initials = localized_string(author[:initials])
  surname = localized_string(author[:surname])
  IsoBibItem::FullName.new(
    completename: complete, initials: [initials], surname: [surname]
  )
end
language() click to toggle source

@return [String]

# File lib/rfcbib/scrapper.rb, line 69
# Language of the reference, read from its lang attribute.
#
# @return [String] the lang attribute, or "en" when it is not set
def language
  lang = @reference[:lang]
  return lang if lang
  'en'
end
localized_string(content) click to toggle source

@param content [String] @return [IsoBibItem::LocalizedString]

# File lib/rfcbib/scrapper.rb, line 117
# Wrap content in a localized string tagged with the reference language.
#
# @param content [String]
# @return [IsoBibItem::LocalizedString]
def localized_string(content)
  lang = language
  IsoBibItem::LocalizedString.new(content, lang)
end
month(mo) click to toggle source
# File lib/rfcbib/scrapper.rb, line 169
# Normalise a month attribute to something usable in a date string.
#
# A missing month returns nil explicitly: Date::MONTHNAMES[0] is nil, so
# looking nil up by index would otherwise yield 0 and an invalid date.
#
# @param mo [String, nil] month as digits or as a full English month name
# @return [String, Integer, nil] the digit string unchanged, the month
#   number for a recognised name, or nil when missing or unrecognised
def month(mo)
  return if mo.nil?
  return mo if /^\d+$/.match mo
  Date::MONTHNAMES.index(mo)
end
organizations() click to toggle source

@return [Array<Hash{Symbol=>IsoBibItem::Organization,Symbol=>Array<String>}>]

# File lib/rfcbib/scrapper.rb, line 97
# Organizations contributing to the reference: one entry for every
# seriesinfo node that carries a stream attribute, always with the
# "author" role.
#
# @return [Array<Hash{Symbol=>IsoBibItem::Organization,Symbol=>Array<String>}>]
def organizations
  with_stream = @reference.xpath('//seriesinfo').select { |si| si[:stream] }
  with_stream.map do |si|
    { entity: IsoBibItem::Organization.new(name: si[:stream]), roles: ['author'] }
  end
end
persons() click to toggle source

@return [Array<Hash{Symbol=>IsoBibItem::Person,Symbol=>Array<String>}>]

# File lib/rfcbib/scrapper.rb, line 85
# Persons contributing to the reference, one per //front/author node.
#
# The address lookup uses ".//" so each author gets its own <address>; a
# leading "//" is resolved from the document root and would hand every
# author the first address in the document.
#
# @return [Array<Hash{Symbol=>IsoBibItem::Person,Symbol=>Array<String>}>]
def persons
  @reference.xpath('//front/author').map do |author|
    entity = IsoBibItem::Person.new(
      name: full_name(author),
      affiliation: [affiliation(author)],
      contacts: contacts(author.at('.//address'))
    )
    { entity: entity, roles: [contributor_role(author)] }
  end
end
series() click to toggle source

Extract series from reference

@return [Array<IsoBibItem::Series>]

# File lib/rfcbib/scrapper.rb, line 206
# Extract series entries from the reference.
#
# seriesinfo nodes named "DOI", or carrying a stream or status attribute,
# are skipped; every other node becomes a "main" series whose title is the
# node's name and whose number is its value.
#
# @return [Array<IsoBibItem::Series>]
def series
  plain = @reference.xpath('//seriesinfo').reject do |si|
    si[:name] == 'DOI' || si[:stream] || si[:status]
  end
  plain.map do |si|
    title = IsoBibItem::FormattedString.new(
      content: si[:name], language: language, script: 'Latn'
    )
    IsoBibItem::Series.new(title: title, number: si[:value], type: "main")
  end
end
status() click to toggle source

extract status

@return [IsoBibItem::DocumentStatus]

# File lib/rfcbib/scrapper.rb, line 224
# Extract the document status from the first seriesinfo node that has a
# status attribute.
#
# @return [IsoBibItem::DocumentStatus, nil] nil when no such node exists
def status
  node = @reference.at('//seriesinfo[@status]')
  if node
    label = IsoBibItem::LocalizedString.new(node[:status])
    IsoBibItem::DocumentStatus.new(label)
  end
end
titles() click to toggle source

@return [Array<Hash>]

# File lib/rfcbib/scrapper.rb, line 74
# Title of the reference, taken from //front/title.
#
# @return [Array<Hash>] a single hash with the title text, the reference
#   language and the "Latn" script
def titles
  text = @reference.at('//front/title').text
  [{ content: text, language: language, script: 'Latn' }]
end