class NewspaperWorks::TextExtraction::AltoReader::AltoDocStream

SAX Document Stream class to gather text and word tokens from ALTO

Attributes

text[RW]
words[RW]

Public Class Methods

new(image_width = nil) click to toggle source
Calls superclass method
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 17
# Build a stream with empty text/word accumulators and a neutral scale.
#
# @param image_width [Integer, nil] optional width of the page image in
#   pixels; when nil, compute_scaling never changes the scaling factor.
def initialize(image_width = nil)
  super()
  # accumulators: list of word hashes (word + coordinates) and the
  # plain text buffer
  @words = []
  @text = ''
  # scaling state: divisor applied by scale_value, 1.0 means "unscaled"
  @scaling = 1.0
  @image_width = image_width
end

Public Instance Methods

compute_scaling(attrs) click to toggle source
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 40
# Derive the factor relating ALTO page units to image pixels from the
# WIDTH attribute of an element (called for `Page` by start_element).
#
# @scaling is left untouched when no image width was supplied, when the
# attributes carry no usable (positive) WIDTH, or when the page width
# already equals the image width.
#
# @param attrs [Array] Array of key, value pair Arrays.
# @return [Float, nil] the new scaling factor, or nil if unchanged.
def compute_scaling(attrs)
  return if @image_width.nil?
  # `find` yields nil when there is no WIDTH attribute; the name match is
  # case-insensitive
  width_attr = attrs.find { |pair| pair[0].casecmp?('WIDTH') }
  return if width_attr.nil?
  page_width = width_attr[1].to_i
  # a zero or negative width would produce a scaling factor that makes
  # the division in scale_value fail, so ignore it
  return unless page_width.positive?
  return if @image_width == page_width
  @scaling = page_width / @image_width.to_f
end
end_document() click to toggle source

Callback for completion of parsing ALTO, used to normalize generated text content (strip unneeded whitespace incidental to output).
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 82
# Callback fired once parsing is complete: tidies the accumulated plain
# text by stripping surrounding whitespace from every line and then from
# the buffer as a whole.
def end_document
  # per-line cleanup (end_element leaves a space after every word)
  cleaned_lines = @text.split("\n").map { |line| line.strip }
  @text = cleaned_lines.join("\n")
  # drop whatever whitespace is still left at the edges of the buffer
  @text.strip!
end
end_element(name) click to toggle source

Callback for element end, used here to manage endings of lines and blocks.

@param name [String] element name.

# File lib/newspaper_works/text_extraction/alto_reader.rb, line 74
# Callback for a closing tag: appends the separator that belongs after
# the element to the text buffer. A space follows each `String`, a
# newline follows each `TextBlock` and each `TextLine`; any other
# element leaves the buffer alone.
#
# @param name [String] element name.
def end_element(name)
  if name == 'String'
    @text << ' '
  elsif name == 'TextBlock'
    @text << "\n"
  end
  return unless name == 'TextLine'
  @text << "\n"
end
s_coords(attrs) click to toggle source

Return coordinates from String element attribute hash

@param attrs [Hash] hash containing ALTO `String` element attributes.

@return [Array] Array of position x, y, width, height in px (scaled).

# File lib/newspaper_works/text_extraction/alto_reader.rb, line 32
# Extract the scaled bounding box of a `String` element.
#
# Each of HPOS, VPOS, WIDTH and HEIGHT is read from the attribute hash
# (a missing attribute counts as 0), converted to an Integer and passed
# through scale_value.
#
# @param attrs [Hash] hash containing ALTO `String` element attributes.
# @return [Array] Array of position x, y, width, height.
def s_coords(attrs)
  %w[HPOS VPOS WIDTH HEIGHT].map do |key|
    raw = attrs[key] || 0
    scale_value(raw.to_i)
  end
end
scale_value(v) click to toggle source
# File lib/newspaper_works/text_extraction/alto_reader.rb, line 49
# Divide a coordinate value by the current scaling factor and truncate
# the result to an Integer.
#
# @param v [Numeric] value to scale.
# @return [Integer] scaled value.
def scale_value(v)
  scaled = v / @scaling
  scaled.to_i
end
start_element(name, attrs = []) click to toggle source

Callback for element start: computes scaling from the `Page` element, gathers text and coordinates from `String` elements, and ignores everything else.

@param name [String] element name.

@param attrs [Array] Array of key, value pair Arrays.

# File lib/newspaper_works/text_extraction/alto_reader.rb, line 58
# Callback for an opening tag.
#
# A `Page` element is handed to compute_scaling. A `String` element has
# its CONTENT appended to the text buffer and recorded, together with its
# coordinates, in the word list. Every other element is ignored.
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  compute_scaling(attrs) if name == 'Page'
  return if name != 'String'
  # only String elements need hash-style attribute access
  values = attrs.to_h
  token = values['CONTENT']
  # a String without CONTENT carries no text; appending nil to the
  # buffer would raise TypeError and abort the parse, so skip it
  return if token.nil?
  @text << token
  @words << {
    word: token,
    coordinates: s_coords(values)
  }
end