class HocrTurtletext::Reader
Implements pdf-reader-turtletext methods such as text_in_region, text_position and fuzzed_y, modified from the originals at github.com/tardate/pdf-reader-turtletext.
Public Class Methods
new(hocr_path, options = {})
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 6
# Remembers where the hOCR file lives and which options to use.
# Nothing is read from disk here; the path is only read when #content runs.
#
# @param hocr_path [String] path later passed to File.read by #content
# @param options [Hash] settings looked up by key, e.g. :x_whitespace_threshold
#   and :y_precision; the hash is stored as given (not copied)
def initialize(hocr_path, options = {})
  @hocr_path, @options = hocr_path, options
end
Public Instance Methods
bounding_box(&block)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 55
# Creates a HocrTurtletext::Textangle bound to this reader, handing the
# given block straight to the Textangle constructor.
#
# @return [HocrTurtletext::Textangle]
def bounding_box(&block)
  reader = self
  HocrTurtletext::Textangle.new(reader, &block)
end
content()
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 11
# Reads and parses the hOCR file, then turns its words into positioned rows.
# The file is re-read on every call.
#
# @return [Array] rows of the form [y, [[x_start, text], ...]] as produced by
#   concat_words_in_lines
def content
  document = Nokogiri::HTML(File.read(@hocr_path))
  words = extract_words_from_html(document)
  rows = fuzzed_y(to_pos_hash(words))
  concat_words_in_lines(rows)
end
text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 20
# Collects the text elements whose coordinates fall inside a rectangle.
#
# @param xmin [Numeric] lower x bound
# @param xmax [Numeric] upper x bound
# @param ymin [Numeric] lower y bound
# @param ymax [Numeric] upper y bound
# @param inclusive [Boolean] when true the bounds themselves count as inside
# @return [Array<Array>] one array of elements per row that has at least one
#   hit; [] when any bound is nil/false
def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
  return [] unless xmin && xmax && ymin && ymax

  # One predicate for both axes; only the strictness of the comparison varies.
  within = if inclusive
             ->(value, low, high) { value >= low && value <= high }
           else
             ->(value, low, high) { value > low && value < high }
           end

  content.each_with_object([]) do |(y, text_row), box|
    next unless within.call(y, ymin, ymax)

    row = text_row.select { |x, _element| within.call(x, xmin, xmax) }
                  .map { |_x, element| element }
    box << row unless row.empty?
  end
end
text_position(text)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 39
# Finds the coordinates of a piece of text in #content.
#
# Only the first row containing a hit is reported. Within that row a Regexp
# resolves to the x of the last matching entry, while any other value
# resolves to the first entry whose text equals it (Array#rassoc).
#
# @param text [String, Regexp] exact text or pattern to look for
# @return [Hash, nil] { :x => x, :y => y } for the hit, or nil when nothing matches
def text_position(text)
  by_pattern = text.is_a?(Regexp)

  hits = content.map do |y, row|
    if by_pattern
      last_match = row.select { |entry| entry[1] =~ text }.last
      x = last_match && last_match[0]
      [y, x] if x
    else
      pair = row.rassoc(text)
      [y, pair[0]] if pair
    end
  end

  first_hit = hits.compact.first
  return unless first_hit

  { :x => first_hit[1], :y => first_hit[0] }
end
Private Instance Methods
concat_words_in_lines(fuzzed_y)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 113
# Joins neighbouring words of each line into longer text runs.
#
# A word starts a new run when it is the first in its line or when the gap
# between its :x_start and the previous run's :x_end exceeds
# x_whitespace_threshold; otherwise it is appended to the previous run with a
# single space and that run's :x_end is extended.
#
# The line arrays and word hashes passed in are modified in place.
#
# @param fuzzed_y [Array] rows of the form [y, [[x, word_hash], ...]]
# @return [Array] the same rows with the second element replaced by
#   [[x_start, text], ...]
def concat_words_in_lines(fuzzed_y)
  fuzzed_y.map do |line|
    runs = line[1].each_with_object([]) do |(_x, word_hash), collected|
      previous = collected.last
      if previous.nil? || word_hash[:x_start] - previous[:x_end] > x_whitespace_threshold
        collected << word_hash
      else
        previous[:word] = "#{previous[:word]} #{word_hash[:word]}"
        previous[:x_end] = word_hash[:x_end]
      end
    end

    line[1] = runs.map { |run| [run[:x_start], run[:word]] }
    line
  end
end
extract_words_from_html(html)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 69
# Pulls every non-blank word span out of the parsed hOCR document.
#
# Spans are selected with the CSS classes ocrx_word / ocr_word; spans whose
# text is empty after stripping are skipped. Each span's title attribute is
# stripped of ';' and split on whitespace before being passed to word_info.
#
# @param html [#css] parsed document (a Nokogiri document in #content)
# @return [Array<Hash>] one word_info hash per word
def extract_words_from_html(html)
  word_nodes = html.css('span.ocrx_word, span.ocr_word')
  visible_nodes = word_nodes.reject { |node| node.text.strip.empty? }

  visible_nodes.map do |node|
    title_tokens = node.attributes['title'].value.to_s.delete(';').split(' ')
    word_info(node, title_tokens)
  end
end
fuzzed_y(input)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 94
# Merges rows whose y positions are nearly identical.
#
# Keys are visited in ascending order. A key closer than y_precision to the y
# of an already emitted row is folded into that row (which keeps its y);
# otherwise it starts a new row. Entries inside a row are ordered by their
# first element (x).
#
# @param input [Hash] { y => { x => entry } }
# @return [Array] rows of the form [y, [[x, entry], ...]]
def fuzzed_y(input)
  order_by_x = ->(entries) { entries.sort { |a, b| a.first <=> b.first } }

  input.keys.sort.each_with_object([]) do |precise_y, output|
    additions = input[precise_y].to_a
    slot = output.index { |row| (row.first - precise_y).abs < y_precision }

    if slot
      anchor_y, existing = output[slot]
      output[slot] = [anchor_y, order_by_x.call(existing + additions)]
    else
      output << [precise_y, order_by_x.call(additions)]
    end
  end
end
to_pos_hash(lines)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 83
# Indexes word hashes by their start coordinates.
#
# The previous version called `lines.sort_by { ... }` and discarded the
# result; sort_by does not modify its receiver, so that call only allocated a
# throwaway array. It has been removed. fuzzed_y sorts the keys itself, so no
# ordering is needed here.
#
# When two entries share both :y_start and :x_start, the later one in `lines`
# replaces the earlier one.
#
# @param lines [Array<Hash>] hashes carrying at least :x_start and :y_start
# @return [Hash] { y_start => { x_start => entry } }
def to_pos_hash(lines)
  lines.each_with_object({}) do |run, pos_hash|
    (pos_hash[run[:y_start]] ||= {})[run[:x_start]] = run
  end
end
word_info(word, data)
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 134
# Builds the position record for a single word.
#
# The four numbers following the 'bbox' token in `data` are taken as
# x_start, y_start, x_end and y_end. Locating the token (instead of assuming
# it is the first entry) keeps the coordinates correct when the title lists
# another property before bbox. When no 'bbox' token is present the numbers
# are read from positions 1..4, as before. Missing or non-numeric values
# become 0 (String#to_i / nil.to_i).
#
# @param word [#text] node whose text is the word itself
# @param data [Array<String>] whitespace-split tokens of the title attribute
# @return [Hash] { word:, x_start:, y_start:, x_end:, y_end: }
def word_info(word, data)
  bbox_at = data.index('bbox') || 0
  x_start, y_start, x_end, y_end = (1..4).map { |offset| data[bbox_at + offset].to_i }

  {
    word: word.text,
    x_start: x_start,
    y_start: y_start,
    x_end: x_end,
    y_end: y_end
  }
end
x_whitespace_threshold()
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 61
# Largest horizontal gap at which two words are still joined into one run
# by concat_words_in_lines.
#
# Reads @options[:x_whitespace_threshold] and falls back to 30 when the
# option is missing, nil or false. The options hash is only read, never
# written, so a caller-supplied (possibly frozen) hash is left untouched.
#
# @return [Object] the configured value, or 30
def x_whitespace_threshold
  @options[:x_whitespace_threshold] || 30
end
y_precision()
click to toggle source
# File lib/hocr_turtletext/reader.rb, line 65
# Vertical distance below which fuzzed_y treats two y positions as the same
# row.
#
# Reads @options[:y_precision] and falls back to 3 when the option is
# missing, nil or false. The options hash is only read, never written, so a
# caller-supplied (possibly frozen) hash is left untouched.
#
# @return [Object] the configured value, or 3
def y_precision
  @options[:y_precision] || 3
end