class NewspaperWorks::TextExtraction::PageOCR

Attributes

html[RW]
path[RW]

Public Class Methods

new(path) click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 12
# Set up an OCR wrapper for a single page image.
#
# Only stores the path; every derived value starts out as nil and is
# filled in later by the methods that compute it.
#
# @param path [String] location of the source page image
def initialize(path)
  @path = path
  # Derived state, in declaration order: hOCR html, extracted words,
  # image metadata, box, and plain text.
  %i[@html @words @source_meta @box @plain].each do |ivar|
    instance_variable_set(ivar, nil)
  end
end

Public Instance Methods

alto() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 69
# Render the page's words through RenderAlto.
#
# The renderer is built from the page width and height, then handed the
# word list from #words.
#
# @return whatever RenderAlto#to_alto returns (presumably ALTO XML --
#   confirm against RenderAlto)
def alto
  NewspaperWorks::TextExtraction::RenderAlto
    .new(width, height)
    .to_alto(words)
end
height() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 65
# Height of the page image, read from the :height entry of #identify.
#
# @return the :height value of the image metadata
def height
  metadata = identify
  metadata[:height]
end
identify() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 56
# Image metadata for the page, memoized in @source_meta.
#
# ImageTool is consulted only while the cached value is nil; later calls
# return the cached object.
#
# @return the result of ImageTool#metadata for the current path
def identify
  @source_meta = NewspaperWorks::ImageTool.new(@path).metadata if @source_meta.nil?
  @source_meta
end
load_words() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 29
# Run the OCR pipeline and cache its results.
#
# Preprocesses the image, runs OCR, then reads the resulting hOCR file
# and stores its words in @words and its text in @plain.
#
# @return the text reported by the hOCR reader
def load_words
  preprocess_image
  hocr = NewspaperWorks::TextExtraction::HOCRReader.new(run_ocr)
  @words = hocr.words
  @plain = hocr.text
end
plain() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 51
# Plain text of the page, loading it through #load_words on first use.
#
# @return the cached @plain value
def plain
  return @plain unless @plain.nil?

  load_words
  @plain
end
run_ocr() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 22
# Run tesseract on the page image, asking for hOCR output.
#
# Output goes to a fresh temporary directory under the base name
# "output_html". The command is passed as an argv array, so no shell is
# involved and paths containing spaces or shell metacharacters reach
# tesseract as a single, uninterpreted argument.
#
# @return [String] the output base path plus ".hocr", where tesseract is
#   expected to have written its result
# @raise [Errno::ENOENT] if the tesseract executable cannot be found
def run_ocr
  outfile = File.join(Dir.mktmpdir, 'output_html')
  # Read and discard stdout, as the former backtick invocation did;
  # the block form also waits for the process to finish.
  IO.popen(['tesseract', path.to_s, outfile, 'hocr'], &:read)
  "#{outfile}.hocr"
end
width() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 61
# Width of the page image, read from the :width entry of #identify.
#
# @return the :width value of the image metadata
def width
  metadata = identify
  metadata[:width]
end
word_json() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 42
# Serialize the page's words through WordCoordsBuilder.
#
# The builder receives the word list together with the page width and
# height.
#
# @return the result of WordCoordsBuilder#to_json
def word_json
  NewspaperWorks::TextExtraction::WordCoordsBuilder
    .new(words, width, height)
    .to_json
end
words() click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 37
# Words extracted from the page, loading them through #load_words on
# first use.
#
# @return the cached @words value
def words
  return @words unless @words.nil?

  load_words
  @words
end

Private Instance Methods

preprocess_image() click to toggle source

Transform the image into a one-bit TIFF for OCR.

# File lib/newspaper_works/text_extraction/page_ocr.rb, line 77
# Make sure OCR runs against a monochrome image.
#
# If the image metadata already reports a 'monochrome' color, nothing
# happens. Otherwise the image is converted into a TIFF in a fresh
# temporary directory and @path is repointed at that file.
#
# @return [String, nil] the new path when a conversion happened,
#   otherwise nil
def preprocess_image
  converter = NewspaperWorks::ImageTool.new(@path)
  unless converter.metadata[:color] == 'monochrome'
    one_bit_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
    # NOTE(review): the `true` flag presumably requests monochrome
    # output -- confirm against ImageTool#convert.
    converter.convert(one_bit_path, true)
    @path = one_bit_path
  end
end