class ArxivSync::XMLParser
Attributes
papers[RW]
Public Class Methods
new()
click to toggle source
# File lib/arxivsync/parser.rb, line 31
# Creates the parser and stores a fresh HTMLEntities decoder in @entities.
def initialize
  @entities = HTMLEntities.new
end
Public Instance Methods
clean(str)
click to toggle source
# File lib/arxivsync/parser.rb, line 48
# Returns a copy of +str+ in which every run of whitespace has been replaced
# by a single space and leading/trailing whitespace has been removed.
def clean(str)
  single_spaced = str.gsub(/\s+/, ' ')
  single_spaced.strip
end
decode(string)
click to toggle source
# File lib/arxivsync/parser.rb, line 70
# Decodes HTML entities in +string+ (via @entities), then runs latex_decode
# over every stretch of text that lies outside a <tt>$...$</tt> equation.
# Equations, including their "$" delimiters, are copied through unchanged.
#
# A "$" that is never closed does not start an equation: the trailing text,
# "$" included, is passed to latex_decode like ordinary text.
#
# Returns the decoded String.
def decode(string)
  str = @entities.decode(string)

  # Process latex entities -- except inside equations
  decoded = ""
  equation = false
  segment = ""

  # each_char is used because passing a block to String#chars is deprecated.
  str.each_char do |ch|
    if ch == '$'
      if equation
        # Closing delimiter: emit the equation verbatim, delimiters included.
        decoded << segment << ch
        segment = ""
      else
        # Opening delimiter: flush the text collected so far, then start
        # collecting the equation (beginning with the "$" itself).
        decoded << latex_decode(segment)
        segment = ch
      end
      equation = !equation
    else
      segment << ch
    end
  end

  # Whatever is left is plain text (or an unterminated equation).
  decoded << latex_decode(segment)
end
end_element(name)
click to toggle source
# File lib/arxivsync/parser.rb, line 156
# Handles the closing tag of the element +name+.
#
# Closing :version appends @version to @model.versions; closing :metadata
# appends @model to @papers. In every case @el is reset to nil afterwards.
def end_element(name)
  if name == :version
    @model.versions.push(@version)
  elsif name == :metadata
    # End of a paper entry
    @papers.push(@model)
  end

  @el = nil
end
latex_decode(str)
click to toggle source
Like LaTeX.decode but without the punctuation weirdness
# File lib/arxivsync/parser.rb, line 53
# Runs the LaTeX::Decode stages over a copy of +str+ (the argument itself is
# left untouched) and returns the result of LaTeX.normalize_C.
# The Punctuation stage is intentionally not applied.
def latex_decode(str)
  string = str.dup
  LaTeX::Decode::Base.normalize(string)

  # Stages are applied in this order; Punctuation stays disabled.
  stages = [
    LaTeX::Decode::Maths,
    LaTeX::Decode::Accents,
    LaTeX::Decode::Diacritics,
    #LaTeX::Decode::Punctuation,
    LaTeX::Decode::Symbols
  ]
  stages.each { |stage| stage.decode!(string) }

  LaTeX::Decode::Base.strip_braces(string)
  LaTeX.normalize_C(string)
end
start_element(name, attributes=[])
click to toggle source
# File lib/arxivsync/parser.rb, line 35
# Handles the opening tag of the element +name+ and records it in @el.
#
# :ListRecords resets @papers to an empty list, :metadata starts a new Paper
# (with an empty versions list) in @model, and :version starts a new Version
# in @version. +attributes+ is accepted but not used.
def start_element(name, attributes=[])
  @el = name

  if name == :ListRecords
    @papers = []
  elsif name == :metadata
    @model = Paper.new
    @model.versions = []
  elsif name == :version
    @version = Version.new
  end
end
text(str)
click to toggle source
# File lib/arxivsync/parser.rb, line 96
# Handles the character data +str+ of the current element (@el) by storing
# it on @model or, for :date and :size, on @version.
#
# All values are whitespace-normalised with clean; submitter, title, authors,
# abstract and comments are additionally passed through decode. Elements not
# listed below are ignored.
def text(str)
  case @el
  # Necessary elements
  when :id
    @model.id = clean(str)
  when :submitter
    @model.submitter = decode(clean(str))
  when :title
    @model.title = decode(clean(str))
  when :authors
    @model.author_str = decode(clean(str))
    @model.authors = split_authors(@model.author_str)
  when :categories
    @model.categories = clean(str).split(/\s/)
  when :abstract
    @model.abstract = decode(clean(str))
  # Optional elements
  when :comments
    @model.comments = decode(clean(str))
  when :"msc-class"
    @model.msc_class = clean(str)
  when :"report-no"
    @model.report_no = clean(str)
  when :"journal-ref"
    @model.journal_ref = clean(str)
  when :doi
    @model.doi = clean(str)
  when :proxy
    @model.proxy = clean(str)
  when :license
    @model.license = clean(str)
  # Versions
  when :date
    @version.date = Time.parse(clean(str))
  when :size
    @version.size = clean(str)
  end
end

# Splits a decoded author line into a list of individual author names.
#
# Parenthesised text (at any nesting depth) is dropped, the remainder is split
# on ",", ":", ";", " and " and the word "the", and empty pieces are discarded.
def split_authors(author_str)
  # Author strings may contain strange metadata
  # Non-regex parsing to handle nested parens
  depth = 0
  no_parens = ""
  author_str.each_char do |ch|
    case ch
    when '('
      depth += 1
    when ')'
      # A stray ")" must not push the depth below zero, otherwise every
      # author after it would be discarded.
      depth -= 1 if depth > 0
    else
      no_parens << ch if depth == 0
    end
  end

  # \bthe only matches "the" as a whole word, so names that merely end in
  # "the" (e.g. "Agathe") are not cut in two.
  no_parens.split(/,|:|;|\sand\s|\bthe\s/i)
           .map { |s| clean(s) }
           .reject { |s| s.empty? }
end
private :split_authors