class Gaspar::Parser

Parser: this class parses a PDF into a machine-readable format.

Public Class Methods

new(source, target, options = {}) click to toggle source
# File lib/gaspar.rb, line 12
# Stores the parser inputs and builds the extractor that will do the work.
#
# source  - value handed to #determine_source by #extractor.
# target  - value forwarded unchanged to the Extractor.
# options - optional Hash forwarded unchanged to the Extractor (default: {}).
def initialize(source, target, options = {})
  @source, @target, @options = source, target, options
  # Built last: #extractor reads the instance variables assigned above.
  @extractor = extractor
end

Public Instance Methods

parse() click to toggle source
# File lib/gaspar.rb, line 19
# Runs the extraction on the extractor built in the constructor.
#
# Returns whatever the extractor's #extract call returns.
def parse
  engine = @extractor
  engine.extract
end
parse_with_content() click to toggle source
# File lib/gaspar.rb, line 23
# Runs the extraction and then hands back the extractor's content.
#
# The result of #extract itself is discarded; only #content is returned.
def parse_with_content
  @extractor.tap { |engine| engine.extract }.content
end

Private Instance Methods

determine_source(source) click to toggle source
# File lib/gaspar.rb, line 54
# Resolves +source+ to a local file path.
#
# An existing non-directory path is returned unchanged. An http/https URL is
# handed to #download_file and the path it returns is used instead.
#
# source - path or URL string identifying the document.
#
# Returns +source+ itself for a local file, otherwise the result of
# #download_file.
# Raises IOError when +source+ is neither an existing file nor an http(s) URL.
# Raises URI::InvalidURIError when +source+ is not an existing file and cannot
# be parsed as a URI.
def determine_source(source)
  # Check the filesystem first: a valid path (e.g. one containing a space) is
  # not necessarily a valid URI, so an existing file must never reach URI().
  return source if File.exist?(source) && !File.directory?(source)

  unless %w[http https].include?(URI(source).scheme)
    raise IOError, "Source (#{source}) is neither a file nor an URL."
  end

  download_file(source)
end
download_file(source) click to toggle source
# File lib/gaspar.rb, line 43
# Downloads the document at +source+ into a randomly named .pdf file under
# /tmp and returns that file's path.
#
# source - URL string; it is escaped before being opened, so characters that
#          are not valid in a URI do not have to be pre-encoded by the caller.
#
# Returns the String path of the downloaded file.
# Re-raises any error from opening or copying the remote document, after
# removing the partially written file.
def download_file(source)
  tmp_file = "/tmp/#{random_source_name}.pdf"
  # URI::DEFAULT_PARSER.escape is what the obsolete URI.encode delegated to.
  remote = URI.parse(URI::DEFAULT_PARSER.escape(source))

  File.open(tmp_file, 'wb') do |saved_file|
    # Open through the URI object (open-uri) instead of Kernel#open, which
    # would also accept pipe commands and plain file names.
    remote.open('rb') do |read_file|
      # Stream the body rather than loading it into one String first.
      IO.copy_stream(read_file, saved_file)
    end
  end

  tmp_file
rescue StandardError
  # Do not leave an empty or truncated download behind.
  File.delete(tmp_file) if tmp_file && File.exist?(tmp_file)
  raise
end
extractor() click to toggle source
# File lib/gaspar.rb, line 30
# Builds the Extractor for this parser.
#
# The source is first resolved through #determine_source; a Reader opened on
# the resolved value supplies the page count passed to the Extractor.
#
# Returns a new Extractor.
def extractor
  resolved = determine_source(@source)
  total_pages = Reader.new(resolved).page_count

  Extractor.new(resolved, @target, total_pages, @options)
end
random_source_name() click to toggle source
# File lib/gaspar.rb, line 39
# Produces a random lowercase hexadecimal name.
#
# The value is drawn from 0...16**16, so the result has at most 16 hex
# digits and is not zero-padded.
def random_source_name
  format('%x', rand(16**16))
end