class Tsumetogi::TextExtractor

Public Class Methods

new(pdf_path, config = nil)
# File lib/tsumetogi/text_extractor.rb, line 5
# Remembers the source PDF and the configuration, then settles on the path
# the extracted text will be written to.
#
# pdf_path - path of the PDF to read.
# config   - object answering #text_path (and the settings #extract reads);
#            a fresh Tsumetogi::Config is built when none is supplied.
#
# When the configuration carries no text_path, the output lands next to the
# PDF, sharing its base name with the extension swapped for ".txt".
def initialize(pdf_path, config = nil)
  @pdf_path = pdf_path
  @config = config || Tsumetogi::Config.new

  configured_path = @config.text_path
  @text_path = configured_path || begin
    directory = File.dirname(@pdf_path)
    stem = File.basename(@pdf_path, ".*")
    "#{directory}/#{stem}.txt"
  end
end

Public Instance Methods

extract()
# File lib/tsumetogi/text_extractor.rb, line 12
# Shells out to the external `pdftotext` program to write the text of
# @pdf_path into @text_path.
#
# The resolution from the configuration is always passed with -r. The crop
# flags (-x, -y, -W, -H) are added only when at least one of the four crop
# settings is non-zero; when all are zero no crop flags are passed at all.
#
# The command line is logged at debug level before it runs. The arguments are
# handed to Kernel#system as separate words, so no shell parsing is applied
# to the paths.
#
# Returns whatever Kernel#system returns for the command.
def extract
  pdf_name = File.basename(@pdf_path)
  Tsumetogi.logger.debug "extracting text from #{pdf_name} to #{@text_path}"

  # Flag => configured value, in the order the flags appear on the command line.
  crop_settings = {
    "-x" => @config.crop_x,
    "-y" => @config.crop_y,
    "-W" => @config.crop_w,
    "-H" => @config.crop_h
  }

  crop_args =
    if crop_settings.values.all?(&:zero?)
      []
    else
      crop_settings.flat_map { |flag, value| [flag, value.to_s] }
    end

  command = ["pdftotext", "-r", @config.resolution.to_s, *crop_args, @pdf_path, @text_path]

  Tsumetogi.logger.debug command.join(" ")
  system(*command)
end