class NewspaperWorks::TextExtraction::AltoReader::AltoDocStream
SAX document stream class that gathers plain text and word tokens (with coordinates) from ALTO XML.
Attributes
text[RW]
words[RW]
Public Class Methods
new(image_width = nil)
click to toggle source
Calls superclass method
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 17
# Set up empty accumulators and a neutral scaling factor.
#
# @param image_width [Integer, nil] width to scale ALTO coordinates
#   against (presumably the source image width in px); when nil,
#   compute_scaling leaves the factor at 1.0.
def initialize(image_width = nil)
  super()
  # accumulators filled in while the document is parsed:
  @words = [] # word hashes, each holding the word and its coordinates
  @text = ''  # plain text buffer
  # scaling state:
  @scaling = 1.0 # pt to px, if ALTO using points
  @image_width = image_width
end
Public Instance Methods
compute_scaling(attrs)
click to toggle source
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 40
# Derive @scaling from an element's WIDTH attribute and the image width
# given at construction.
#
# @scaling is left untouched when no image width was provided, when the
# attributes carry no usable WIDTH, or when both widths already agree.
#
# @param attrs [Array] Array of key, value pair Arrays.
# @return [Float, nil] the newly assigned scaling factor, or nil when
#   nothing was changed.
def compute_scaling(attrs)
  return if @image_width.nil?
  # find yields nil when no WIDTH attribute is present (name match is
  # case-insensitive), so a missing attribute is a quiet no-op.
  match = attrs.find { |pair| pair[0].casecmp?('WIDTH') }
  return if match.nil?
  page_width = match[1].to_i
  # A zero or negative width (e.g. a non-numeric value parsed by to_i)
  # would make @scaling unusable as the divisor in scale_value.
  return unless page_width.positive?
  return if @image_width == page_width
  @scaling = page_width / @image_width.to_f
end
end_document()
click to toggle source
Callback for completion of parsing ALTO, used to normalize generated
text content (strip unneeded whitespace incidental to output).
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 82
# Callback fired when parsing finishes; tidies the accumulated text.
#
# Every line has its surrounding whitespace removed, then the buffer as
# a whole is stripped in place.
def end_document
  # per-line cleanup first
  trimmed_lines = @text.split("\n").map { |line| line.strip }
  @text = trimmed_lines.join("\n")
  # then drop whatever whitespace remains at the buffer's edges
  @text.strip!
end
end_element(name)
click to toggle source
Callback for element end, used here to manage endings of lines and
blocks.
@param name [String] element name.
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 74
# Callback for element end: appends the separator that follows a word,
# line, or block to the text buffer. Other elements add nothing.
#
# @param name [String] element name.
def end_element(name)
  case name
  when 'String'    then @text << " "
  when 'TextBlock' then @text << "\n"
  end
  # Kept as the final statement so the method's result is the same as
  # before: the buffer for TextLine, nil for anything else.
  @text << "\n" if name == 'TextLine'
end
s_coords(attrs)
click to toggle source
Return coordinates from String element attribute hash
@param attrs [Hash] hash containing ALTO `String` element attributes. @return [Array] Array of position x, y, width, height in px.
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 32
# Build the scaled coordinate list for a String element.
#
# Each attribute is read as an integer (missing ones count as 0) and
# passed through scale_value.
#
# @param attrs [Hash] hash containing ALTO `String` element attributes.
# @return [Array] Array of position x, y, width, height after scaling.
def s_coords(attrs)
  # attribute names listed in output order: x, y, width, height
  %w[HPOS VPOS WIDTH HEIGHT].map do |key|
    raw = attrs[key] || 0
    scale_value(raw.to_i)
  end
end
scale_value(v)
click to toggle source
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 49
# Divide a value by the current scaling factor and truncate the result
# to an integer.
#
# @param v [Numeric] value to scale.
# @return [Integer] the scaled, truncated value.
def scale_value(v)
  scaled = v / @scaling
  scaled.to_i
end
start_element(name, attrs = [])
click to toggle source
Callback for element start: computes scaling from the `Page` element, gathers
text and word coordinates from `String` elements, and ignores all other elements.
@param name [String] element name. @param attrs [Array] Array of key, value pair Arrays.
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 58
# Callback for element start. A `Page` element triggers the scaling
# computation; a `String` element contributes its CONTENT to the text
# buffer and a word entry with coordinates. All other elements are
# ignored.
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  compute_scaling(attrs) if name == 'Page'
  return unless name == 'String'
  # only String elements need their attributes as a Hash
  values = attrs.to_h
  token = values['CONTENT']
  # A String element without CONTENT has no text to record; appending
  # nil to the buffer would raise TypeError, so skip it.
  return if token.nil?
  @text << token
  @words << { word: token, coordinates: s_coords(values) }
end