class NewspaperWorks::TextExtraction::HOCRReader::HOCRDocStream
SAX Document Stream class to gather text and word tokens from hOCR
Attributes
Public Class Methods
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 18
# Set up empty parser state before any SAX callback fires.
def initialize
  super()
  # Accumulated plain text.
  @text = ''
  # Word hashes (word + coordinates), appended as words are completed.
  @words = []
  # Page dimensions; remain nil until a `div.ocr_page` element supplies them.
  @width = @height = nil
  # Word currently being assembled across #start_element, #characters and
  # #end_element, so that its text can be paired with its coordinates.
  @current = nil
  # Class name recorded by #start_element for later use by #end_element.
  @element_class_name = nil
end
Public Instance Methods
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 111
# Callback for character data. The data is recorded only while a word with
# known coordinates is in progress; it is appended both to that word and to
# the plain text buffer.
#
# @param value [String] chunk of character data
def characters(value)
  # Nothing to attach the text to: no word started, or word has no coordinates.
  return if @current.nil? || @current[:coordinates].nil?
  # The word starts out as nil (see #start_word); treat that as empty.
  @current[:word] = (@current[:word] || '') + value
  @text += value
end
Consider element for processing?
- `div.ocr_page` — to get page width/height
- `span.ocr_line` — to help make plain text readable
- `span.ocrx_word` — for word-coordinate JSON and plain text word

@param name [String] Element name
@param class_name [String] HTML class name
@return [Boolean] true if element should be processed; otherwise false
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 56
# Decide whether an element is one of the three hOCR element/class
# combinations this handler processes.
#
# @param name [String] element name
# @param class_name [String] HTML class name
# @return [Boolean] true for `div.ocr_page`, `span.ocr_line` and
#   `span.ocrx_word`; false for anything else
def consider?(name, class_name)
  case "#{name}.#{class_name}"
  when 'div.ocr_page', 'span.ocr_line', 'span.ocrx_word' then true
  else false
  end
end
Callback for completion of parsing hOCR, used to normalize generated
text content (strip unneeded whitespace incidental to output).
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 130
# Callback for completion of parsing; normalizes the plain text buffer:
# carriage returns are removed, every line is stripped of surrounding
# whitespace, runs of line breaks collapse to one, and the buffer as a whole
# is stripped.
#
# @return [String] the normalized text (also stored in @text)
def end_document
  # String#delete returns a new string rather than mutating the receiver,
  # so its result has to be kept; otherwise "\r" characters survive.
  cleaned = @text.delete("\r")
  # Remove leading/trailing whitespace on each line.
  cleaned = cleaned.split("\n").map(&:strip).join("\n")
  # Collapse consecutive line breaks, then trim the ends of the buffer.
  @text = cleaned.squeeze("\n").strip
end
Callback for element end; at this time, flush word coordinate state
for current word, and append line endings to plain text:
@param name [String] element name.
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 123
# Callback for element end: finishes the current line or word depending on
# the class name stored in @element_class_name.
#
# NOTE(review): @element_class_name is whatever #start_element stored most
# recently, so with nested elements it may not be the class of the element
# that is actually closing — confirm this is the intended behavior.
#
# @param _name [String] element name (unused)
def end_element(_name)
  closing_class = @element_class_name
  end_line if closing_class == 'ocr_line'
  end_word if closing_class == 'ocrx_word'
end
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 89
# Finish a line in the plain text buffer: surrounding whitespace is stripped
# from the buffer and a single line break is appended.
def end_line
  @text = "#{@text.strip}\n"
end
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 83
# Finish a word: a separating space is always appended to the plain text
# buffer, and the current word hash is stored only when it is complete.
def end_word
  # Space between this word and whatever follows it.
  @text = "#{@text} "
  @words << @current if word_complete?
end
Return coordinates from `span.ocrx_word` element attribute hash
@param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
@return [Array] Array of position x, y, width, height in px.
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 38
# Return coordinates from a `span.ocrx_word` element attribute hash.
#
# The `bbox x1 y1 x2 y2` property is looked up anywhere in the `title`
# attribute, so it does not have to be the first `;`-separated property.
#
# @param attrs [Hash] element attributes; the 'title' entry is read
# @return [Array<Integer>, nil] [x, y, width, height], or nil when the
#   title is missing or carries no complete bbox
def s_coords(attrs)
  # to_s tolerates a missing title (nil) without raising.
  match = /\bbbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/.match(attrs['title'].to_s)
  return nil if match.nil?
  x1, y1, x2, y2 = match.captures.map(&:to_i)
  # bbox holds two corners; convert to origin plus size.
  [x1, y1, x2 - x1, y2 - y1]
end
Callback for element start, ignores elements except for:
- `div.ocr_page` — to get page width/height
- `span.ocr_line` — to help make plain text readable
- `span.ocrx_word` — for word-coordinate JSON and plain text word

@param name [String] element name.
@param attrs [Array] Array of key, value pair Arrays.
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 103
# Callback for element start. The element's class name is always recorded in
# @element_class_name; further handling happens only for elements accepted
# by #consider?.
#
# @param name [String] element name
# @param attrs [Array] Array of key, value pair Arrays
def start_element(name, attrs = [])
  attributes = attrs.to_h
  css_class = attributes['class']
  # Stored before the filter below, so it is updated for every element.
  @element_class_name = css_class
  return unless consider?(name, css_class)
  start_word(attributes) if css_class == 'ocrx_word'
  start_page(attributes) if css_class == 'ocr_page'
end
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 68
# Record page width and height from a `div.ocr_page` element.
#
# The `bbox` property is looked up anywhere in the `title` attribute rather
# than at a fixed position. When the title is missing or has no complete
# bbox, @width and @height are left unchanged.
#
# @param attrs [Hash] element attributes; the 'title' entry is read
def start_page(attrs)
  # Only the third and fourth bbox values are captured: they are used
  # directly as width and height, as before.
  # NOTE(review): this equals the true size only if the bbox starts at 0 0.
  match = /\bbbox\s+-?\d+\s+-?\d+\s+(-?\d+)\s+(-?\d+)/.match(attrs['title'].to_s)
  return if match.nil?
  @width = match[1].to_i
  @height = match[2].to_i
end
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 61
# Begin a new current word: its text starts as nil (filled in later by
# #characters) and its coordinates come from the element attributes.
#
# @param attrs [Hash] element attributes, passed on to #s_coords
def start_word(attrs)
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end
# File lib/newspaper_works/text_extraction/hocr_reader.rb, line 77
# Whether the current word is ready to be stored: there is a current word,
# it has non-empty text, and its coordinates have exactly four values.
#
# @return [Boolean, nil] truthy when complete; false or nil otherwise
def word_complete?
  return false if @current.nil?
  text = @current[:word]
  text && !text.empty? && @current[:coordinates].size == 4
end