class NewspaperWorks::TextExtraction::PageOCR
Attributes
html[RW]
path[RW]
Public Class Methods
new(path)
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 12
# Remembers the source path and starts every lazily-populated field as nil.
#
# @param path the image location; it is later handed to
#   NewspaperWorks::ImageTool and to the tesseract command line
def initialize(path)
  @path = path
  # @html is reserved for the hOCR html; @words and @plain are filled in by
  # load_words, and @source_meta by identify.
  %i[@html @words @source_meta @box @plain].each do |ivar|
    instance_variable_set(ivar, nil)
  end
end
Public Instance Methods
alto()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 69
# Renders the page's words through RenderAlto, constructed with the image
# width and height.
#
# @return whatever RenderAlto#to_alto returns for the word list
def alto
  NewspaperWorks::TextExtraction::RenderAlto.new(width, height).to_alto(words)
end
height()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 65
# Image height, read from the :height entry of the metadata returned by
# identify.
def height
  image_info = identify
  image_info[:height]
end
identify()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 56
# Returns the image metadata, asking NewspaperWorks::ImageTool for it only
# while no value has been cached in @source_meta yet.
#
# @return the cached result of ImageTool#metadata (indexed elsewhere in this
#   class with :width and :height)
def identify
  if @source_meta.nil?
    @source_meta = NewspaperWorks::ImageTool.new(@path).metadata
  end
  @source_meta
end
load_words()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 29
# Runs the whole OCR pipeline: preprocesses the image, runs OCR, then reads
# the resulting hOCR file and caches both the words and the plain text.
#
# @return the plain text reported by the hOCR reader (value of the final
#   assignment)
def load_words
  preprocess_image
  hocr_file = run_ocr
  hocr = NewspaperWorks::TextExtraction::HOCRReader.new(hocr_file)
  @words = hocr.words
  @plain = hocr.text
end
plain()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 51
# Plain text of the page; triggers load_words when nothing is cached yet.
def plain
  return @plain unless @plain.nil?

  load_words
  @plain
end
run_ocr()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 22
# Runs the `tesseract` command-line tool on the image at `path`, requesting
# hOCR output into a freshly created temporary directory.
#
# The command is given to IO.popen as an argument list, so no shell is
# involved: a path containing spaces or shell metacharacters reaches
# tesseract as one literal argument instead of being split or interpreted.
# The exit status of tesseract is not checked.
#
# @return [String] the output base name ('output_html' inside the temporary
#   directory) with '.hocr' appended
def run_ocr
  outfile = File.join(Dir.mktmpdir, 'output_html')
  # Reading stdout to EOF (and discarding it) blocks until tesseract exits.
  IO.popen(['tesseract', path.to_s, outfile, 'hocr'], &:read)
  "#{outfile}.hocr"
end
width()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 61
# Image width, read from the :width entry of the metadata returned by
# identify.
def width
  image_info = identify
  image_info[:width]
end
word_json()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 42
# Serialises the page's words through WordCoordsBuilder, constructed with the
# word list and the image width and height.
#
# @return whatever WordCoordsBuilder#to_json returns
def word_json
  coords_builder = NewspaperWorks::TextExtraction::WordCoordsBuilder
  coords_builder.new(words, width, height).to_json
end
words()
click to toggle source
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 37
# Word list for the page; triggers load_words when nothing is cached yet.
def words
  return @words unless @words.nil?

  load_words
  @words
end
Private Instance Methods
preprocess_image()
click to toggle source
Transform the image into a one-bit TIFF for OCR; images whose metadata already reports them as monochrome are left untouched.
# File lib/newspaper_works/text_extraction/page_ocr.rb, line 77
# Converts the source image to an intermediate TIFF in a new temporary
# directory and repoints @path at it, unless the image metadata already
# reports its color as 'monochrome'.
#
# @return nil when no conversion happens, otherwise the intermediate path
def preprocess_image
  image_tool = NewspaperWorks::ImageTool.new(@path)
  unless image_tool.metadata[:color] == 'monochrome'
    interim_tiff = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
    # NOTE(review): the second argument presumably requests one-bit output —
    # confirm against ImageTool#convert.
    image_tool.convert(interim_tiff, true)
    @path = interim_tiff
  end
end