class QndHtml2Page

Attributes

to_pages[R]

Public Class Methods

new(html, debug: false, pg_height: 770, width: '700px')
# File lib/qnd_html2page.rb, line 33
# Stores the rendering options, reads the HTML source and splits it into
# pages, which are exposed through the to_pages reader.
#
# html      - source passed to RXFHelper.read; the first item of its result
#             is what gets scanned
# debug     - when true, scan prints diagnostic output
# pg_height - page height used to decide where pages end
#             (presumably pixels -- TODO confirm)
# width     - CSS width applied to the body element before measuring
def initialize(html, debug: false, pg_height: 770, width: '700px')

  @html = html
  @height = pg_height
  @width = width
  @debug = debug

  contents = RXFHelper.read(@html)
  @to_pages = scan(contents.first)

end

Private Instance Methods

scan(obj)
# File lib/qnd_html2page.rb, line 42
# Splits an HTML document into page-sized groups of elements.
#
# Marker <span class="qndhtml2pg"> tags are inserted into the document, the
# document is rendered in a browser to obtain the y position of each marker,
# and the child elements of the body are then grouped into one <div> per
# page.
#
# obj - the HTML as a String, or a Rexle document (its xml is used)
#
# Returns an Array of Rexle::Element <div> objects, one per non-empty page.
def scan(obj)

  raw_html = obj.is_a?(Rexle) ? obj.xml : obj

  # <br/> acts as a hard page break
  html = raw_html.gsub(/<br\s*\/>/) do |x|
    '<span class="qndhtml2pg">pagebreak' + x.object_id.to_s + '</span>'
  end

  # add the span tag after almost every element in the body

  doc = Rexle.new(html)
  body = doc.root.element('body')
  body.attributes[:style] = 'width: ' + @width

  # elements which never get a marker placed after them
  ignore_list = %w(span b li tr td dt dd em strong i a)

  count = 0
  body.each_recursive do |e|

    puts ('e: ' + e.xml).debug if @debug
    next if ignore_list.include? e.name
    span = Rexle::Element.new('span').add_text(count.to_s)
    span.attributes[:class] = 'qndhtml2pg'
    e.insert_after span
    count += 1

  end

  # Fetch the y coordinate of every span tag to determine the
  # elements that can fit into each page.

  tmpfile = Tempfile.new('browser')
  # the browser is pointed at a sibling file carrying an .html extension
  html_path = tmpfile.path + '.html'
  browser = nil
  a = nil
  maxheight = nil

  begin

    File.write html_path, doc.root.xml

    browser = Ferrum::Browser.new
    browser.goto('file://' + html_path)
    span_list = browser.xpath('//span[@class="qndhtml2pg"]')

    maxheight = span_list.last.find_position.last

    # pair each marker's text with its y position; a page break marker is
    # given the y position of the last marker
    a = span_list.map do |x|
      text = x.text
      ypos = text[/^pagebreak/] ? maxheight : x.find_position.last
      [text, ypos]
    end

  ensure
    # release the browser process and the temporary files even on failure
    browser.quit if browser
    File.delete(html_path) if File.exist?(html_path)
    tmpfile.close!
  end

  # cumulative y limits, one per page: @height, 2 * @height, ...
  heights = ((maxheight) / @height).round.to_i.times\
      .inject([@height]) {|r, _| r << (r.last + @height)  }

  puts ('heights: ' + heights.inspect).debug if @debug
  height = heights.shift

  # group the markers by page; a page break marker always opens a new group
  a2 = a.inject([[]]) do |r,x|

    puts ('r: ' + r.inspect).debug if @debug
    puts ('x: ' + x.inspect).debug if @debug
    puts ('height: ' + height.inspect).debug if @debug

    if x.first[/^pagebreak/] then
      r << [x]
    else
      x.last < height ? (r.last << x) : (height = heights.shift; r << [x])
    end

    r

  end


  elements = doc.root.element('body').elements.to_a
  puts ('elements.length: ' + elements.length.inspect).debug if @debug

  puts ('a2: ' + a2.inspect).debug if @debug

  # find each last record span stop using the given id
  stops = a2.map do |x|
    elements.index {|e| e.text == x.last.first }
  end

  puts ('stops: ' + stops.inspect).debug if @debug

  elements.slice_at(*stops).map do |e_list|

    div = Rexle::Element.new 'div'
    puts 'e_list: ' + e_list.inspect if @debug

    # drop only the marker spans; && is required here because `and` binds
    # more loosely than assignment and would discard the class check
    e_list.reject! do |e|
      r = e.name == 'span' && e.attributes[:class] == 'qndhtml2pg'
      puts 'r: ' + r.inspect if @debug
      r
    end

    next if e_list.empty?

    e_list.each {|e| div.add e}

    puts 'div: ' + div.xml.inspect if @debug
    # remove any marker spans nested deeper inside the page's elements
    div.xpath('//span[@class="qndhtml2pg"]').each(&:delete)
    puts 'after div: ' + div.xml.inspect if @debug
    div
  end.compact

end