module RTesseract::Box

Public Class Methods

parse(content)
# File lib/rtesseract/box.rb, line 16
# Converts hOCR content into a list of word entries.
#
# Every line of +content+ is handed to +parse_line+; lines for which it
# returns nil are left out of the result.
#
# @param content [String] hOCR text, one element per line
# @return [Array] the non-nil results of +parse_line+, in line order
def parse(content)
  words = []

  content.each_line do |line|
    entry = parse_line(line)
    words << entry unless entry.nil?
  end

  words
end
parse_confidence(line)
# File lib/rtesseract/box.rb, line 45
# Extracts the confidence tokens from a line.
#
# Takes the text that follows a `;` up to (not including) the next `'`
# and splits it on whitespace. +word_info+ reads the last token as the
# confidence value.
#
# @param line [String] a single line of hOCR markup
# @return [Array<String>] the tokens found, or an empty array if nothing matched
def parse_confidence(line)
  fragment = line[/(?<=;)(.*?)(?=')/]
  fragment.to_s.split
end
parse_line(line)
# File lib/rtesseract/box.rb, line 20
# Turns one hOCR line into a word entry.
#
# Only lines containing an `ocr_word` / `ocrx_word` marker are considered.
# The word is the text between the first `>` and the last `<` on the line,
# with any nested inline tags (for example `<strong>` or `<em>` wrapping the
# text) removed so that only the recognised text remains.
#
# @param line [String] a single line of hOCR markup
# @return [Hash, nil] the result of +word_info+ for the line, or nil when the
#   line is not a word element or its text is blank
def parse_line(line)
  return unless line.match?(/oc(rx|r)_word/)

  # The capture is greedy, so it spans everything inside the outer element,
  # including nested markup; strip those tags before using the text.
  inner = line.to_s[/>(.*)</, 1].to_s
  word = inner.gsub(/<[^>]*>/, '')

  return if word.strip.empty?

  word_info(word, parse_position(line), parse_confidence(line))
end
parse_position(line)
# File lib/rtesseract/box.rb, line 41
# Extracts the position tokens from a line.
#
# Takes the text that follows the literal `title` up to (not including) the
# next `;` and splits it on whitespace. The first token is whatever directly
# follows `title` (e.g. the `=` and opening quote); +word_info+ reads the
# coordinates from indexes 1 through 4.
#
# @param line [String] a single line of hOCR markup
# @return [Array<String>] the tokens found, or an empty array if nothing matched
def parse_position(line)
  segment = line[/(?<=title)(.*?)(?=;)/]
  return [] if segment.nil?

  segment.split
end
run(source, errors, options)
# File lib/rtesseract/box.rb, line 8
# Runs the Tesseract command with hOCR output enabled and parses the result.
#
# A copy of +options+ with `tessedit_create_hocr: 1` merged in is passed to
# RTesseract::Command; the caller's hash is not modified. The command yields
# an output path, and the file at that path plus `.hocr` is read and given to
# +parse+.
#
# @param source the input handed to RTesseract::Command
# @param errors the errors collector handed to RTesseract::Command
# @param options [Hash] command options
# @return whatever RTesseract::Command#run returns for the block
def run(source, errors, options)
  hocr_options = options.merge(tessedit_create_hocr: 1)
  command = RTesseract::Command.new(source, temp_file_path, errors, hocr_options)

  command.run { |output_path| parse(File.read("#{output_path}.hocr")) }
end
word_info(word, positions, confidence)
# File lib/rtesseract/box.rb, line 30
# Builds the hash describing one recognised word.
#
# Coordinates are read from indexes 1..4 of +positions+ and the confidence
# from the last element of +confidence+; each is converted with +to_i+, so a
# missing element becomes 0.
#
# @param word [String] the recognised text
# @param positions [Array<String>] tokens as returned by +parse_position+
# @param confidence [Array<String>] tokens as returned by +parse_confidence+
# @return [Hash] with keys :word, :confidence, :x_start, :y_start, :x_end, :y_end
def word_info(word, positions, confidence)
  left, top, right, bottom = positions.values_at(1, 2, 3, 4).map(&:to_i)
  score = confidence.last.to_i

  {
    word: word,
    confidence: score,
    x_start: left,
    y_start: top,
    x_end: right,
    y_end: bottom
  }
end