module RTesseract::Box
Public Class Methods
parse(content)
click to toggle source
# File lib/rtesseract/box.rb, line 16
# Runs parse_line over every line of +content+ and collects the non-nil
# results, preserving line order.
#
# @param content [String] text to scan line by line
# @return [Array] one entry per line for which parse_line returned non-nil
def parse(content)
  content.lines.each_with_object([]) do |line, words|
    info = parse_line(line)
    # parse_line signals "nothing to report for this line" with nil.
    words << info unless info.nil?
  end
end
parse_confidence(line)
click to toggle source
# File lib/rtesseract/box.rb, line 45
# Extracts the first property that follows the first ';' on the line and
# returns it split on whitespace, e.g. " x_wconf 95" => ["x_wconf", "95"].
#
# The capture stops at the next ';' as well as at a closing quote, so any
# further properties in the same title (such as "; x_fsize 12") do not end
# up as the last token, which word_info reads as the confidence value.
# Both single- and double-quoted attribute values are accepted.
#
# @param line [String] one line of hOCR markup
# @return [Array<String>] whitespace-separated tokens, or [] when the line
#   has no ';' followed by a terminator
def parse_confidence(line)
  line.match(/(?<=;)(.*?)(?=[;'"])/).to_s.split
end
parse_line(line)
click to toggle source
# File lib/rtesseract/box.rb, line 20
# Turns one line of hOCR markup into a word-info hash, or nil when the line
# is not a word element or carries no visible text.
#
# @param line [String] one line of hOCR markup
# @return [Hash, nil] the result of word_info for the line, or nil when the
#   line does not mention ocr_word/ocrx_word or its text is blank
def parse_line(line)
  return unless line.match?(/oc(rx|r)_word/)

  # Greedy capture: everything between the first '>' and the last '<'.
  inner = line[/>(.*)</, 1].to_s
  # The capture may still contain nested inline markup (e.g. <strong>/<em>
  # wrappers); drop the tags so only the recognised text remains.
  word = inner.gsub(/<[^>]*>/, '')
  return if word.strip.empty?

  word_info(word, parse_position(line), parse_confidence(line))
end
parse_position(line)
click to toggle source
# File lib/rtesseract/box.rb, line 41
# Returns the whitespace-separated tokens found between the first
# occurrence of "title" and the next ';' on the line. The first token is
# whatever immediately follows "title" (for "title='bbox 1 2 3 4; ..." that
# is "='bbox"), and the remaining tokens follow it in order.
#
# @param line [String] one line of hOCR markup
# @return [Array<String>] the tokens, or [] when the pattern does not match
def parse_position(line)
  title_segment = line.match(/(?<=title)(.*?)(?=;)/)
  # MatchData#to_s is the whole match; nil.to_s is "" and splits to [].
  title_segment.to_s.split
end
run(source, errors, options)
click to toggle source
# File lib/rtesseract/box.rb, line 8
# Builds an RTesseract::Command with hOCR output switched on and, inside the
# block passed to its #run, reads "<output_path>.hocr" and hands the file
# contents to parse.
#
# @param source  passed through to RTesseract::Command.new unchanged
# @param errors  passed through to RTesseract::Command.new unchanged
# @param options [Hash] command options; not mutated, a merged copy with
#   tessedit_create_hocr: 1 is what the command receives
# @return whatever RTesseract::Command#run returns (presumably the block's
#   parsed result; confirm against Command)
def run(source, errors, options)
  hocr_options = options.merge(tessedit_create_hocr: 1)
  command = RTesseract::Command.new(source, temp_file_path, errors, hocr_options)

  command.run { |output_path| parse(File.read("#{output_path}.hocr")) }
end
word_info(word, positions, confidence)
click to toggle source
# File lib/rtesseract/box.rb, line 30
# Assembles the hash describing one word.
#
# @param word [String] the word text, stored as given
# @param positions [Array] indices 1..4 are read as x_start, y_start, x_end
#   and y_end; index 0 is ignored
# @param confidence [Array] the last element is read as the confidence
# @return [Hash] keys :word, :confidence, :x_start, :y_start, :x_end, :y_end;
#   every value except :word is converted with to_i, so missing entries
#   become 0
def word_info(word, positions, confidence)
  info = { word: word, confidence: confidence[-1].to_i }

  # Coordinates start at index 1 of +positions+.
  %i[x_start y_start x_end y_end].each.with_index(1) do |key, index|
    info[key] = positions[index].to_i
  end

  info
end