class ArxivSync::Downloader
Public Class Methods
new(initial_params={})
click to toggle source
# File lib/arxivsync/downloader.rb, line 3
#
# Prepares a downloader for an OAI harvest.
#
# initial_params - Hash of parameters for the first request. It is stored
#                  as-is (not copied), so the :metadataPrefix default below
#                  is written into the caller's hash.
#
# NOTE(review): when :from equals today's date the method prints a warning
# and returns early. The return value of #initialize is discarded by
# Class#new, so the caller still receives an instance, but one whose @oai
# client was never created.
def initialize(initial_params={})
  @initial_params = initial_params

  # Same-day harvests are refused: the warning explains why.
  if @initial_params[:from] == Date.today
    warning = "Last responseDate was today. arXiv lacks date granularity beyond the day level; please wait before continuing harvest."
    puts warning.light_yellow
    return false
  end

  # Only fill in a default metadataPrefix when no resumptionToken is given.
  @initial_params[:metadataPrefix] ||= 'arXivRaw' unless @initial_params[:resumptionToken]
  @last_params = nil

  # The export host can be overridden through the environment.
  host = ENV.fetch('ARXIV_EXPORT_DOMAIN', "export.arxiv.org")
  @oai = OAI::Client.new("http://#{host}/oai2")
end
Public Instance Methods
make_request(params)
click to toggle source
# File lib/arxivsync/downloader.rb, line 51
#
# Issues a single list_records call against the OAI client.
#
# params - Hash of request parameters handed to @oai.list_records.
#
# Returns whatever list_records returns. On a Faraday timeout it waits
# 20 seconds and returns the result of retry_request instead.
def make_request(params)
  announcement = "Making OAI request with params: #{params.inspect}"
  puts announcement.light_magenta

  # list_records will nuke our params, so remember a copy for retries.
  @last_params = params.clone

  begin
    @oai.list_records(params)
  rescue Faraday::Error::TimeoutError
    puts "Request timed out; retrying in 20 seconds".light_yellow
    sleep 20
    retry_request
  end
end
retry_request()
click to toggle source
# File lib/arxivsync/downloader.rb, line 47
#
# Repeats the most recent request by passing the parameters stored in
# @last_params back to make_request.
#
# Returns the result of make_request.
def retry_request
  previous_params = @last_params
  make_request(previous_params)
end
start(&b)
click to toggle source
# File lib/arxivsync/downloader.rb, line 20
#
# Runs the harvest: sends the initial request, then keeps requesting with
# each returned resumption token until a response arrives without one.
#
# b - Block called with every response that is not a rate-limit notice.
#
# Returns self.
def start(&b)
  # The constructor returns before creating the OAI client when :from is
  # today's date (it has already printed the reason). Without a client any
  # request would fail with NoMethodError on nil, so there is nothing to do.
  return self unless @oai

  # Make the initial request
  resp = make_request(@initial_params)

  # Continue to make requests until the server stops sending resumption
  # tokens. `while true` is kept rather than `loop do` so a StopIteration
  # raised by the caller's block still propagates.
  while true
    token = resp.resumption_token

    if token && !token.empty?
      # We have a resumption_token, keep going!
      sleep 20 # let's be extra cautious though
      b.call(resp)
      resp = make_request(resumptionToken: resp.resumption_token)
    elsif resp.doc.to_s.include?("Retry after 20 seconds")
      # Rate limitation: wait, then repeat the last request
      puts "Honoring 503 and sleeping for 20 seconds...".light_yellow
      sleep 20
      resp = retry_request
    else
      # No resumption_token and no retry should mean we're finished
      b.call(resp)
      puts "Finished archiving~!".bold.light_green
      break
    end
  end

  self
end