class Gaspar::Parser
Parser
This class parses a PDF into a machine-readable format.
Public Class Methods
new(source, target, options = {})
click to toggle source
# File lib/gaspar.rb, line 12
# Stores the constructor arguments on the instance and builds the extractor
# that the parsing methods delegate to.
#
# @param source  resolved to a local path when the extractor is built
# @param target  forwarded unchanged to Extractor.new
# @param options forwarded unchanged to Extractor.new; defaults to an empty Hash
def initialize(source, target, options = {})
  @source, @target, @options = source, target, options
  # Must run last: building the extractor reads @source, @target and @options.
  @extractor = extractor
end
Public Instance Methods
parse()
click to toggle source
# File lib/gaspar.rb, line 19
# Runs the extraction by delegating to the extractor built in the constructor.
#
# @return whatever the extractor's +extract+ returns
def parse
  @extractor.extract
end
parse_with_content()
click to toggle source
# File lib/gaspar.rb, line 23
# Runs the extraction and then returns the extractor's content.
#
# @return whatever the extractor's +content+ returns after +extract+ has run
def parse_with_content
  # tap runs extract first, discards its result, and hands the same
  # extractor on so that content is read afterwards.
  @extractor.tap(&:extract).content
end
Private Instance Methods
determine_source(source)
click to toggle source
# File lib/gaspar.rb, line 54
# Resolves +source+ to a local file path.
#
# An existing non-directory path is returned as-is. An http/https URL is
# handed to +download_file+ and that method's result is returned.
#
# The file check runs before any URI parsing so that a valid local path which
# is not a valid URI (for example one containing spaces) is still accepted.
# A string that cannot be parsed as a URI is reported with the same IOError
# as any other unsupported source rather than URI::InvalidURIError.
#
# @param source [String] path to a local file, or an http(s) URL
# @return +source+ itself for local files, otherwise the result of +download_file+
# @raise [IOError] if +source+ is neither an existing file nor an http(s) URL
def determine_source(source)
  return source if File.exist?(source) && !File.directory?(source)

  scheme =
    begin
      URI(source).scheme
    rescue URI::InvalidURIError
      nil
    end

  unless %w[http https].include?(scheme)
    raise IOError, "Source (#{source}) is neither a file nor an URL."
  end

  download_file(source)
end
download_file(source)
click to toggle source
# File lib/gaspar.rb, line 43
# Downloads +source+ into a randomly named PDF file under /tmp.
#
# @param source [String] URL to fetch
# @return [String] path of the file the response body was written to
def download_file(source)
  tmp_file = "/tmp/#{random_source_name}.pdf"
  # URI.encode was removed in Ruby 3.0; it delegated to the default parser's
  # #escape, which is still available and escapes the same characters.
  url = URI::DEFAULT_PARSER.escape(source)

  File.open(tmp_file, 'wb') do |saved_file|
    # Open through the parsed URI (open-uri) instead of Kernel#open, which
    # stopped accepting URLs in Ruby 3.0.
    URI.parse(url).open('rb') do |read_file|
      # Stream the body across instead of reading it fully into memory.
      IO.copy_stream(read_file, saved_file)
    end
  end

  tmp_file
end
extractor()
click to toggle source
# File lib/gaspar.rb, line 30
# Builds the Extractor for this parser's source.
#
# The source is first resolved through +determine_source+; the resolved value
# is used both to ask Reader for the page count and as the Extractor's input.
#
# @return the object created by Extractor.new
def extractor
  resolved = determine_source(@source)
  page_total = Reader.new(resolved).page_count
  Extractor.new(resolved, @target, page_total, @options)
end
random_source_name()
click to toggle source
# File lib/gaspar.rb, line 39
# Generates a random lowercase hexadecimal string.
#
# The value is a random integer in 0...16**16 rendered in base 16 without
# padding, so the result is between 1 and 16 characters long.
#
# @return [String] random hex string
def random_source_name
  # 1 << 64 equals 16**16.
  format('%x', rand(1 << 64))
end