class ArxivSync::XMLParser

Attributes

papers[RW]

Public Class Methods

new() click to toggle source
# File lib/arxivsync/parser.rb, line 31
# Creates the parser and the HTMLEntities decoder that #decode uses to
# resolve HTML entities in element text.
def initialize
  @entities = HTMLEntities.new
end

Public Instance Methods

clean(str) click to toggle source
# File lib/arxivsync/parser.rb, line 48
# Normalizes whitespace: every run of whitespace characters becomes a single
# space, then leading and trailing whitespace is removed.
# Returns a new string; +str+ itself is left untouched.
def clean(str)
  single_spaced = str.gsub(/\s+/, ' ')
  single_spaced.strip
end
decode(string) click to toggle source
# File lib/arxivsync/parser.rb, line 70
# Decodes HTML entities in +string+, then LaTeX markup everywhere except
# inside equations delimited by a pair of '$' characters.
#
# Text outside '$...$' is passed through #latex_decode; an equation
# (including both delimiters) is copied to the result verbatim. If a '$'
# is never closed, the remaining text (starting at that '$') is handed to
# #latex_decode like ordinary text.
#
# NOTE(review): an escaped dollar sign ("\$") is not special-cased and is
# treated as an equation delimiter -- confirm whether inputs contain it.
#
# Returns the decoded string.
def decode(string)
  str = @entities.decode(string)

  decoded = ''.dup
  segment = ''.dup
  in_equation = false

  # each_char replaces the deprecated block form of String#chars.
  str.each_char do |ch|
    if ch != '$'
      segment << ch
    elsif in_equation
      # Closing delimiter: emit the equation untouched.
      decoded << segment << ch
      segment = ''.dup
      in_equation = false
    else
      # Opening delimiter: flush the plain text collected so far and start
      # a new segment that begins with the '$' itself.
      decoded << latex_decode(segment)
      segment = ch
      in_equation = true
    end
  end

  # Flush whatever is left: trailing plain text or an unterminated equation.
  decoded << latex_decode(segment)
end
end_element(name) click to toggle source
# File lib/arxivsync/parser.rb, line 156
# Called when an element closes.
#
# A closing :version appends the version being built to the current paper's
# versions; a closing :metadata marks the end of a paper entry and appends
# the paper to @papers. In every case the current-element marker @el is
# cleared afterwards.
def end_element(name)
  @model.versions.push(@version) if name == :version
  @papers.push(@model) if name == :metadata # End of a paper entry

  @el = nil
end
latex_decode(str) click to toggle source

Like LaTeX.decode, but skips the punctuation decoder (LaTeX::Decode::Punctuation), which produces unwanted punctuation changes.

# File lib/arxivsync/parser.rb, line 53
# Runs the LaTeX decoding steps on a copy of +str+ and returns the result
# of LaTeX.normalize_C; +str+ itself is not modified.
#
# Steps, in order: Base.normalize, then the Maths, Accents, Diacritics and
# Symbols decoders, then Base.strip_braces, then LaTeX.normalize_C.
def latex_decode(str)
  text = str.dup

  LaTeX::Decode::Base.normalize(text)

  # Order matters. LaTeX::Decode::Punctuation is deliberately not in the list.
  [
    LaTeX::Decode::Maths,
    LaTeX::Decode::Accents,
    LaTeX::Decode::Diacritics,
    LaTeX::Decode::Symbols
  ].each { |decoder| decoder.decode!(text) }

  LaTeX::Decode::Base.strip_braces(text)

  LaTeX.normalize_C(text)
end
start_element(name, attributes=[]) click to toggle source
# File lib/arxivsync/parser.rb, line 35
# Called when an element opens. Remembers the element name in @el so that
# #text knows where to store the character data that follows.
#
# :ListRecords resets the list of parsed papers, :metadata begins a new
# Paper with an empty versions list, and :version begins a new Version.
# +attributes+ is accepted but not used.
def start_element(name, attributes = [])
  @el = name

  if name == :ListRecords
    @papers = []
  elsif name == :metadata
    @model = Paper.new
    @model.versions = []
  elsif name == :version
    @version = Version.new
  end
end
text(str) click to toggle source
# File lib/arxivsync/parser.rb, line 96
# Called with the character data of the element currently open (tracked in
# @el by #start_element). Stores the cleaned -- and, for free-text fields,
# decoded -- value on the paper (@model) or version (@version) being built.
# Text belonging to any other element is ignored.
def text(str)
  case @el
  # Necessary elements
  when :id
    @model.id = clean(str)
  when :submitter
    @model.submitter = decode(clean(str))
  when :title
    @model.title = decode(clean(str))
  when :authors
    # Keep the full decoded string and also derive the list of names.
    @model.author_str = decode(clean(str))
    @model.authors = split_author_names(@model.author_str)
  when :categories
    @model.categories = clean(str).split(/\s/)
  when :abstract
    @model.abstract = decode(clean(str))

  # Optional elements
  when :comments
    @model.comments = decode(clean(str))
  when :"msc-class"
    @model.msc_class = clean(str)
  when :"report-no"
    @model.report_no = clean(str)
  when :"journal-ref"
    @model.journal_ref = clean(str)
  when :doi
    @model.doi = clean(str)
  when :proxy
    @model.proxy = clean(str)
  when :license
    @model.license = clean(str)

  # Versions
  when :date
    @version.date = Time.parse(clean(str))
  when :size
    @version.size = clean(str)
  end
end

# Splits an author string into individual, cleaned author names.
#
# Author strings may contain strange metadata in (possibly nested)
# parentheses, so parenthesized text is removed with a depth counter rather
# than a regex. The remainder is split on ",", ":", ";", " and " and a
# standalone "the"; empty pieces are discarded.
def split_author_names(author_str)
  depth = 0
  no_parens = ''.dup

  author_str.each_char do |ch|
    case ch
    when '('
      depth += 1
    when ')'
      # Ignore an unmatched ')': letting depth go negative would silently
      # drop every character that follows it.
      depth -= 1 if depth > 0
    else
      no_parens << ch if depth.zero?
    end
  end

  # The lookbehind keeps "the" from matching at the end of a name such as
  # "Agathe"; it only splits when "the" is a word of its own.
  no_parens.split(/,|:|;|\sand\s|\s?(?<![[:alpha:]])the\s/i)
    .map { |s| clean(s) }
    .reject { |s| s.empty? }
end
private :split_author_names