class ArxivSync::XMLArchive

Public Class Methods

new(savedir, custom_params=nil) click to toggle source
# File lib/arxivsync/xmlarchive.rb, line 3
# Set up an archive rooted at +savedir+, creating the directory when it does
# not exist yet.
#
# @param savedir [String] path of the archive directory; it is expanded to an
#   absolute path and stored in @savedir
# @param custom_params [Object, nil] accepted for interface compatibility;
#   this method does not use it
def initialize(savedir, custom_params=nil)
  @savedir = File.expand_path(savedir)

  # Dir.exists? was deprecated and later removed from Ruby (3.2), so use the
  # supported Dir.exist? spelling.
  return if Dir.exist?(@savedir)

  puts "Creating new XML archive at #{@savedir}\n".light_green
  Dir.mkdir(@savedir)
end

Public Instance Methods

parse_dt(path) click to toggle source

Parse the timestamp from the path to a previously saved arxiv xml block

# File lib/arxivsync/xmlarchive.rb, line 14
# Extract the timestamp encoded in the filename of a saved arXiv XML block.
#
# The timestamp is the part of the last path component that precedes the
# first underscore. If it cannot be parsed, a diagnostic is printed and the
# process exits with status 1.
#
# @param path [String] path to a previously saved XML file
# @return [DateTime] the parsed timestamp
def parse_dt(path)
  basename = path.split('/').last
  stamp = basename.split('_').first
  DateTime.parse(stamp)
rescue ArgumentError
  puts "Failed to parse timestamp from file #{path}\n".bold.light_red
  hint = "Are you sure this is an archive directory?\n" +
         "If so, it needs to be free of strange interloping files."
  puts hint.bold.light_white
  exit 1
end
read_metadata(&b) click to toggle source

Parses each file in the archive using Ox’s SAX parser. After each file is parsed, the given block is called with the parser’s collected papers.

# File lib/arxivsync/xmlarchive.rb, line 70
# Parse every file in the archive directory with Ox's SAX parser.
#
# One XMLParser instance is shared across all files. After each file has been
# parsed, the block is called with parser.papers.
#
# @yield [papers] the value of parser.papers after each file is parsed
# @return [Array<String>] the paths that were visited
def read_metadata(&b)
  parser = XMLParser.new

  Dir.glob(File.join(@savedir, '*')).each do |path|
    # The block form of File.open closes the handle as soon as parsing
    # finishes or raises, so a large archive does not accumulate open files.
    File.open(path) { |io| Ox.sax_parse(parser, io) }
    b.call(parser.papers)
  end
end
save_response(resp) click to toggle source

Saves a timestamped OAI XML response to disk, appending the resumption token to the filename if available

# File lib/arxivsync/xmlarchive.rb, line 81
# Save a timestamped OAI XML response to disk and print a progress line.
#
# The file is written to @savedir and named "<responseDate>_<suffix>", where
# the suffix is the response's resumption token, or 'final' when there is
# none.
#
# @param resp [#doc, #resumption_token] a harvester response; resp.doc.to_s
#   must produce the XML text to save
# @return [nil]
def save_response(resp)
  content = resp.doc.to_s

  # Re-parse the serialized response to pull out the metadata we report on
  doc = Nokogiri::XML(content)

  # responseDate stamps the file and may seed the next harvest
  response_date = doc.css('responseDate').text
  # How many records we gained in this response
  num_records = doc.css('record').count

  # A response holding a complete list has no resumptionToken element, and
  # OAI-PMH makes completeListSize and cursor optional attributes, so each of
  # these may legitimately be nil.
  token_node = doc.at_css('resumptionToken')
  complete_list_size = token_node && token_node['completeListSize']
  cursor = token_node && token_node['cursor']

  # If we have a resumption token, stick that on the filename
  token = resp.resumption_token
  suffix = if token && !token.empty?
    token
  else
    'final'
  end

  # Write out the file; the block form closes it even if the write raises
  filename = "#{response_date}_#{suffix}"
  File.open(File.join(@savedir, filename), 'w') { |f| f.write(content) }

  # Only report totals when the server actually supplied them
  progress = if complete_list_size && cursor
    "Saved #{cursor.to_i + num_records} of #{complete_list_size.to_i} records to #{filename}"
  else
    "Saved #{num_records} records to #{filename}"
  end
  puts progress.light_green
end
sync(oai_params={}) click to toggle source

Download from the arXiv! This can be called in three potential states:

  • The savedir has yet to be populated with any xml, meaning we need to start a full mirror of the entire database.

  • The most recent xml file contains a resumptionToken, meaning the last harvest attempt was aborted prematurely and we need to resume.

  • The most recent xml file does not have a resumptionToken, in which case we begin a new harvest for everything since the responseDate of the last.

# File lib/arxivsync/xmlarchive.rb, line 35
# Download from the arXiv, choosing how to start from the archive's state:
#
# * No saved files: start a full harvest using +oai_params+ as given.
# * Newest file contains a resumptionToken: the previous harvest was cut
#   short, so resume with that token (it replaces all other parameters).
# * Newest file has no resumptionToken: start a new harvest from the date of
#   that file's responseDate.
#
# Every saved file's name is run through parse_dt to find the newest one.
# The caller's hash is never modified.
#
# @param oai_params [Hash] OAI request parameters passed to Downloader
# @return [Object] whatever Downloader#start returns
def sync(oai_params={})
  # Newest saved response, judged by the timestamp in its filename
  newest = Dir.glob(File.join(@savedir, '*')).max_by { |path| parse_dt(path) }

  if newest.nil?
    notice = "Commencing full arXiv download. This will take ... a while.\n" +
             "Download can be safely aborted at any point and will resume from\n" +
             "last successful response. However, resumptionTokens *will* expire\n" +
             "if you leave it in an incomplete state for long enough.\n"
    puts notice.bold.light_white
  else
    # Block form so the handle is closed once the document has been parsed
    last_response = File.open(newest) { |f| Nokogiri::XML(f) }
    last_token = last_response.css('resumptionToken').text

    if last_token.empty? # Previous sync completed successfully
      response_date = Date.parse(last_response.css('responseDate').text)
      puts "Downloading from last responseDate: #{response_date}\n".bold.light_green
      # merge builds a new hash rather than writing into the caller's
      oai_params = oai_params.merge(from: response_date)
    else # Previous sync aborted prematurely, resume
      puts "Resuming download using previous resumptionToken: #{last_token}\n".bold.light_green
      oai_params = { resumptionToken: last_token }
    end
  end

  Downloader.new(oai_params).start { |resp| save_response(resp) }
end