class NewspaperWorks::TextExtraction::RenderAlto

Public Class Methods

new(width, height, scaling = 1.0) click to toggle source
# File lib/newspaper_works/text_extraction/render_alto.rb, line 7
# Store the page dimensions and the coordinate scaling factor used when
# rendering.
#
# @param width page width; handed to the page markup by #to_alto
# @param height page height; handed to the page markup by #to_alto
# @param scaling multiplier applied to each word coordinate before it is
#   written out (defaults to 1.0, i.e. no scaling)
def initialize(width, height, scaling = 1.0)
  @width = width
  @height = height
  @scaling = scaling
end

Public Instance Methods

to_alto(words) click to toggle source
# File lib/newspaper_works/text_extraction/render_alto.rb, line 13
# Render a list of words as a serialized ALTO document.
#
# Every entry in +words+ becomes one String element. Its geometry comes
# from the entry's :coordinates, each value passed through scale_point
# and written as a string.
#
# @param words [#each] entries exposing :word (the text content) and
#   :coordinates, read here as [hpos, vpos, width, height]
# @return the result of calling to_xml on the page built by alto_page
def to_alto(words)
  document = alto_page(@width, @height) do |xml|
    words.each do |entry|
      # indexes: 0 => HPOS, 1 => VPOS, 2 => WIDTH, 3 => HEIGHT
      box = entry[:coordinates]
      xml.String(
        CONTENT: entry[:word],
        WIDTH: scale_point(box[2]).to_s,
        HEIGHT: scale_point(box[3]).to_s,
        HPOS: scale_point(box[0]).to_s,
        VPOS: scale_point(box[1]).to_s
      ) { xml.text '' }
    end
  end
  document.to_xml
end

Private Instance Methods

alto_blockline(xml, pxwidth, pxheight) { |xml| ... } click to toggle source

Emit the single TextBlock and TextLine that wrap the page's words, then yield the builder so the caller can add the word elements.

# File lib/newspaper_works/text_extraction/render_alto.rb, line 68
# Emit one TextBlock containing one TextLine, both positioned at 0,0 with
# the given dimensions, and yield the builder inside the line so the
# caller can add its own child elements.
#
# @param xml builder object receiving the element calls
# @param pxwidth width for both elements; converted with to_i
# @param pxheight height for both elements; converted with to_i
# @yieldparam xml the same builder that was passed in
def alto_blockline(xml, pxwidth, pxheight)
  # Block and line share identical geometry; only the block carries an ID.
  frame = {
    HEIGHT: pxheight.to_i,
    WIDTH: pxwidth.to_i,
    HPOS: '0',
    VPOS: '0'
  }
  xml.TextBlock(ID: 'ID1a', **frame) do
    xml.TextLine(**frame) { yield(xml) }
  end
end
alto_layout(xml, pxwidth, pxheight, &block) click to toggle source

Emit the Layout element for the page: a Page containing a PrintSpace, inside which the block/line wrapper is rendered.

# File lib/newspaper_works/text_extraction/render_alto.rb, line 51
# Emit the Layout > Page > PrintSpace nesting for a single page and hand
# off to alto_blockline for the content inside the print space.
#
# @param xml builder object receiving the element calls
# @param pxwidth page width; written as an integer (to_i) on Page and
#   PrintSpace, forwarded unchanged to alto_blockline
# @param pxheight page height; handled the same way as pxwidth
# @param block forwarded as-is to alto_blockline
def alto_layout(xml, pxwidth, pxheight, &block)
  height = pxheight.to_i
  width = pxwidth.to_i
  xml.Layout do
    xml.Page(ID: 'ID1', PHYSICAL_IMG_NR: '1', HEIGHT: height, WIDTH: width) do
      # The print space covers the whole page, anchored at the origin.
      xml.PrintSpace(HEIGHT: height, WIDTH: width, HPOS: '0', VPOS: '0') do
        alto_blockline(xml, pxwidth, pxheight, &block)
      end
    end
  end
end
alto_page(pxwidth, pxheight, &block) click to toggle source

Given a block that generates the word elements, wrap its output in the ALTO page/block/line structure and return the XML builder.

# File lib/newspaper_works/text_extraction/render_alto.rb, line 31
# Build the ALTO document skeleton: the root alto element (ALTO v2
# namespace), a Description declaring pixel measurement units, and the
# page layout produced by alto_layout.
#
# @param pxwidth page width, forwarded to alto_layout
# @param pxheight page height, forwarded to alto_layout
# @param block forwarded to alto_layout, which passes it down to where
#   the page content is generated
# @return [Nokogiri::XML::Builder] the populated builder (not yet
#   serialized)
def alto_page(pxwidth, pxheight, &block)
  Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
    xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
      xml.Description { xml.MeasurementUnit 'pixel' }
      alto_layout(xml, pxwidth, pxheight, &block)
    end
  end
end
scale_point(value) click to toggle source
# File lib/newspaper_works/text_extraction/render_alto.rb, line 43
# Apply the configured scaling factor to a single coordinate or dimension.
#
# ALTO 2.1 types coordinates as xsd:float rather than xsd:int, but the
# output is deliberately simplified to whole numbers, so any fractional
# part of the scaled value is dropped by to_i.
#
# @param value unscaled coordinate or dimension
# @return the value multiplied by @scaling, converted with to_i
def scale_point(value)
  scaled = value * @scaling
  scaled.to_i
end