class PdfboxTextExtraction

Constants

VERSION

Public Class Methods

configure_text_extraction_params(text_stripper, options) click to toggle source

Sets params on text_stripper. @param text_stripper [PDFTextStripper]

# File lib/pdfbox_text_extraction.rb, line 81
# Applies the extraction settings found in +options+ to +text_stripper+ by
# calling the matching setter for every option that is present. Options that
# are missing leave the stripper untouched.
#
# @param text_stripper [PDFTextStripper] object that receives the setter calls
# @param options [Hash] symbol-keyed settings, see the table below
def self.configure_text_extraction_params(text_stripper, options)
  # Each row is [option key, setter name, accept_false]. Rows are applied in
  # the order listed. A value of nil always skips the setter; a value of
  # false skips it too unless accept_false is set for that row.
  settings = [
    # --- Extraction thresholds and tolerances ---

    # Character width-based tolerance used to estimate where spaces in text
    # should be added. Default: 0.30000001192092896
    [:average_char_tolerance, :setAverageCharTolerance, false],
    # Minimum whitespace, as a multiple of the max height of the current
    # characters, beyond which the current line start is considered to be a
    # paragraph start. Default: 2.5
    [:drop_threshold, :setDropThreshold, false],
    # Multiple of whitespace character widths by which the current line start
    # can be indented from the previous line start before it is considered
    # to be a paragraph start. Default: 2.0
    [:indent_threshold, :setIndentThreshold, false],
    # Space width-based tolerance used to estimate where spaces in text
    # should be added. Default: 0.5
    [:spacing_tolerance, :setSpacingTolerance, false],

    # --- Sort order ---

    # The text tokens in a PDF file may not be stored in the order in which
    # they appear visually. An explicit false must reach the setter, so only
    # nil is treated as "not given" here. Default: false
    [:sort_by_position, :setSortByPosition, true],

    # --- Separator tokens ---

    # Line separator for the output text. Default: "\n"
    [:line_separator, :setLineSeparator, false],
    # String emitted at the end of a page. Default: ""
    [:page_end, :setPageEnd, false],
    # String emitted at the beginning of a page. Default: ""
    [:page_start, :setPageStart, false],
    # String emitted at the end of a paragraph. Default: ""
    [:paragraph_end, :setParagraphEnd, false],
    # String emitted at the beginning of a paragraph. Default: ""
    [:paragraph_start, :setParagraphStart, false]
  ]

  settings.each do |key, setter, accept_false|
    value = options[key]
    given = accept_false ? !value.nil? : value
    text_stripper.public_send(setter, value) if given
  end

  nil
end
run(path_to_pdf, options={}) click to toggle source

Runs text extraction and returns the extracted text as a string. If crop area dimensions are given, text is extracted from the crop area only; otherwise all text is extracted. All crop area dimensions are in inches.

@param path_to_pdf [String] @param options [Hash, optional] @option options [Float] crop_x crop area top left corner x-coordinate @option options [Float] crop_y crop area top left corner y-coordinate @option options [Float] crop_width crop area width @option options [Float] crop_height crop area height @option options [Float] average_char_tolerance @option options [Float] drop_threshold @option options [Float] indent_threshold @option options [Float] spacing_tolerance @option options [Boolean] sort_by_position @option options [String] line_separator @option options [String] page_end @option options [String] page_start @option options [String] paragraph_end @option options [String] paragraph_start @return [String] the extracted text

# File lib/pdfbox_text_extraction.rb, line 44
# Runs text extraction on a PDF and returns the extracted text as a string.
# When crop area options are given, only text inside that area is extracted
# (page by page); otherwise the text of the whole document is extracted.
# Crop area dimensions are given in inches.
#
# @param path_to_pdf [String] path of the PDF file to read
# @param options [Hash, optional] crop area options (:crop_x, :crop_y,
#   :crop_width, :crop_height) plus any option understood by
#   configure_text_extraction_params
# @return [String] the extracted text
# @raise [ArgumentError] if only some of the four crop area options are given
def self.run(path_to_pdf, options={})
  # Validate the crop area before any resource is opened: the crop rectangle
  # needs all four values, and a partial set would otherwise only fail later
  # with an unhelpful NoMethodError on nil.
  crop_keys = [:crop_x, :crop_y, :crop_width, :crop_height]
  given_crop_keys = crop_keys.select { |key| options[key] }
  use_crop_area = !given_crop_keys.empty?
  if use_crop_area && given_crop_keys.size < crop_keys.size
    missing = (crop_keys - given_crop_keys).map { |key| key.inspect }.join(', ')
    raise ArgumentError, "Incomplete crop area, missing options: #{missing}"
  end

  # NOTE(review): File is presumably java.io.File imported elsewhere in this
  # file (PDDocument.load receives it) - confirm before changing this line.
  file = File.new(path_to_pdf)
  pd_doc = PDDocument.load(file)
  all_text = ''

  begin
    if use_crop_area
      # Crop options given, extract from crop area only.
      res = 72 # multiplier applied to the inch-based crop values
      body_text_rect = Rectangle2D::Float.new(
        (options[:crop_x] * res),
        (options[:crop_y] * res),
        (options[:crop_width] * res),
        (options[:crop_height] * res)
      )
      text_stripper = PDFTextStripperByArea.new
      text_stripper.addRegion("bodyText", body_text_rect)
      configure_text_extraction_params(text_stripper, options)

      pd_doc.getPages.each do |page|
        text_stripper.extractRegions(page)
        # Append the body text of the current page.
        all_text << text_stripper.getTextForRegion("bodyText")
      end
    else
      # No crop options given, extract all text.
      text_stripper = PDFTextStripper.new
      configure_text_extraction_params(text_stripper, options)
      all_text << text_stripper.getText(pd_doc)
    end
  ensure
    # Close the document even when extraction raises, so it is not leaked.
    pd_doc.close
  end

  all_text
end