class HTML2Index

Public Class Methods

new(*argv) click to toggle source

Initializes the HTML2Index object: reads the configuration, validates the source file and starts the generation.

# File lib/html2index.rb, line 46
# Sets up logging and configuration, validates the source file and
# immediately runs the generation.
#
# argv:: the command-line arguments, handed unchanged to ArgParser.parse.
#
# Terminates the process with a failing status (exit false) when the source
# file does not pass the file checks or its detected type does not look
# like an HTML/XML document.
def initialize(*argv)
  init_logger()
  options = ArgParser.parse(argv)
  $configuration = Configuration.new(options)
  # NOTE(review): presumably makes sure the per-user configuration file
  # exists - confirm in Configuration#user_conf.
  $configuration.user_conf
  @log.level = $log_level
  @file = $configuration.source

  @log.debug("read #{@file}, write #{$configuration.target}")
  # file_check returns a message when one of the checks fails.
  msg = File_Checking::file_check(@file, :exist?, :readable?, :file?)
  if msg
    @log.error("Error: unsuitable file #{@file}: #{msg}")
    exit false
  end

  ftype = File_Checking::file_type(@file)
  @log.debug("ftype for #{@file} is #{ftype}")
  # ftype is indexed below, so it is a collection rather than a String;
  # interpolation (instead of String#<<) keeps the error message from
  # raising a TypeError on the very path that should report the problem.
  if ftype && !ftype.empty? && !ftype[0].downcase.match("(html|xml) .*document")
    @log.error("#{@file} does not look like a valid file to scan: #{ftype}")
    exit false
  end

  # Expressions that could not be looked up are collected in this file.
  @problem_log = File.join(temp_dir, PROBLEM_LOG)
  @log.debug('calling generate() ')
  generate()
end

Public Instance Methods

dict_list() click to toggle source

Creates an unordered (bulleted) HTML list of the consulted dictionaries.

# File lib/html2index.rb, line 75
# Renders the configured dictionaries as an HTML <ul> element, one <li>
# per dictionary, showing its name and a link to its URL.
#
# Returns the markup as a String terminated by "</ul>\n".
def dict_list
  markup = String.new('<ul>')
  $configuration.dicts.each do |dict|
    link = '<a href="' + dict.url + '">' + dict.url + '</a>'
    markup << '<li>' + dict.name + ' : ' + link + "</li>\n"
  end
  # The list is logged before the closing tag is appended.
  @log.debug('list is ' + markup)
  markup << "</ul>\n"
end
generate() click to toggle source

Parses the html-file and generates the glossary.

# File lib/html2index.rb, line 87
# Parses the source file into @doc and writes the generated page.
#
# Exits the process: with a success status when anything underneath
# requested an exit, with a failing status on any other exception.
# Afterwards points the user to the problem log if one was written.
def generate
  @log.info('Generating... PSE wait')
  begin
    # Block form, so the file handle is closed as soon as parsing is done.
    @doc = File.open(@file) { |source| HTML::Document.parse(source) }
    write_html
  rescue SystemExit
    # NOTE(review): every exit requested further down (including
    # `exit false`) ends up as a successful exit here - confirm intended.
    @log.info "Bye"
    exit true
  rescue Exception => ex
    # Last-resort handler of the command-line run: log and fail.
    @log.error "line #{__LINE__}: #{ex.message}"
    exit false
  end
  if File.exist?(@problem_log)
    @log.info "Some expressions caused problems and are listed in #{@problem_log}"
  end
end

Private Instance Methods

dict_definition(expression) click to toggle source

Searches for the expression in the configured online dictionaries and creates a Definition object for each dictionary that yields a definition. Returns an array of all Definition objects found for the expression.

# File lib/html2index.rb, line 110
# Looks the expression up in every configured online dictionary.
#
# For each dictionary the lookup URL is the dictionary URL followed by the
# expression with whitespace turned into underscores and apostrophes
# removed. If that request fails and the expression contains apostrophes,
# a second attempt is made with the apostrophes kept. The text of all nodes
# matched by the dictionary's XPath becomes one Definition.
#
# expression:: the term to look up (String).
#
# Returns an Array of Definition objects (possibly empty). When no page
# could be fetched from a dictionary, the expression is appended to the
# problem log.
#
# Only StandardError is rescued, so an Interrupt (Ctrl-C) during a request
# propagates to the caller instead of being logged as a failed download.
def dict_definition(expression)
  definitions = []
  $configuration.dicts.each do |d|
    page = nil
    term = expression.gsub(/\s/, '_')
    url = d.url.dup << URI.encode_www_form_component(term.gsub("'", ''))
    begin
      page = Nokogiri::HTML(URI.open(url))
    rescue StandardError => ex
      url_b = d.url.dup << URI.encode_www_form_component(term)
      @log.warn("WARNING, accessing #{url}: #{ex.message}")
      # A retry only makes sense if keeping the apostrophes changes the URL.
      if url_b != url
        @log.warn("\twill try #{url_b}")
        begin
          page = Nokogiri::HTML(URI.open(url_b, "User-Agent"=>"#{APPNAME} #{VERSION}", "From"=>"Function Test <bat.guano@don.blech.e4ward.com>"))
        rescue StandardError => ex_retry
          @log.warn("\tWARNING, accessing #{url_b}: #{ex_retry.message}")
        end
      end
    end
    if page
      nodes = page.xpath(d.xpath)
      text_array = nodes ? nodes.collect { |n| n.inner_text.to_s.strip } : []
      unless text_array.empty?
        definition = Definition.new(d.name, expression.gsub('_', ' '), text_array)
        definition.color = d.color
        definitions.push(definition)
      end
    else
      File.open(@problem_log, 'a') { |f| f.write(expression + "\n") }
    end
  end
  definitions
end
format_index(ndef) click to toggle source

Formats the definition text.

# File lib/html2index.rb, line 198
# Formats the definitions of one expression as a <dt>/<dd> fragment.
#
# ndef:: either an array-like collection of definitions (anything that
#        responds to to_ary) or a single definition. Each definition must
#        provide expression, origin and definition; the collection form
#        also uses color.
#
# For a collection, the expression of the first element becomes the <dt>
# and every element gets its own <dd> with the origin in its color.
# For a single definition one <dd> with an underlined origin is produced.
#
# Returns the fragment as a String starting with "\n<dt>".
def format_index(ndef)
  item = String.new("\n<dt>")
  if ndef.respond_to?(:to_ary)
    item << ndef[0].expression << "</dt>"
    ndef.each do |dn|
      item << "\n\t<dd>(<i style='color:#" << dn.color << ";'>" << dn.origin << "</i>): " << dn.definition.join("<br/>") << "</dd>"
    end
  else
    item << ndef.expression << "</dt>"
    # Array() accepts both a single text and a list of texts.
    item << "\n\t<dd>(<i><u>" << ndef.origin << "</u></i>): " << Array(ndef.definition).join("<br/>") << "</dd>"
  end
  item
end
index() click to toggle source

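Looks up every distinct marked expression of the document and returns a sorted Array of the formatted definition entries (HTML `<dt>`/`<dd>` fragments as produced by format_index).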

# File lib/html2index.rb, line 150
  # Collects the marked expressions of the parsed document, looks each
  # distinct one up and formats the result.
  #
  # The elements to scan are selected with an XPath built from the
  # configured tag, attribute and attribute value. The expression is the
  # element's title attribute or, if that is blank, its text; whitespace
  # runs are collapsed to one blank. Expressions differing only in case are
  # looked up once.
  #
  # Returns a sorted Array of the Strings produced by format_index, one per
  # expression with at least one definition. An Interrupt (Ctrl-C) stops
  # the scan; the entries collected so far are returned.
  def index
    def_list = []
    begin
      # Lower-cased expressions already handled; a Hash gives constant-time
      # look-ups.
      seen = {}
      tag = $configuration.html_tag
      attr = $configuration.html_attribute
      value = $configuration.attr_value
      xpath = ".//#{tag}[contains(@#{attr}, '#{value}')]"
      @log.debug("xpath is #{xpath}")
      @doc.xpath(xpath).each do |t|
        expression = t.attribute('title').to_s
        expression = t.text.to_s if expression.strip.empty?
        expression = expression.gsub(/\s+/, " ").strip
        next if expression.empty?

        key = expression.downcase
        next if seen.key?(key)

        seen[key] = true
        definition = dict_definition(expression)
        if definition && !definition.empty?
          def_list.push(format_index(definition))
        end
      end
    rescue Interrupt
      @log.warn "--------------\n\tIndex generation interrupted by user\n---------------"
    end
    # Sorted once here rather than after every insertion.
    def_list.sort
  end
temp_dir() click to toggle source

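Creates a uniquely named temporary directory on first use and returns a File handle for it; later calls return the same handle. (The earlier signature `def temp_dir options = {:remove => true}` is not used by the current implementation.)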

# File lib/html2index.rb, line 213
# Returns the working directory of this run as a File object.
#
# The directory is created below the system temp directory on the first
# call, named "HTML2Index_<epoch seconds>_<random number below 1000>".
# The result is cached in @temp_dir, so later calls return the same object.
# The directory is not removed automatically.
def temp_dir
  return @temp_dir if @temp_dir

  @log.debug('creating temp_dir')
  require 'tmpdir'
  require 'fileutils'
  dir_name = "HTML2Index_#{Time.now.to_i}_#{rand(1000)}"
  path = File.join(Dir.tmpdir, dir_name)
  @log.debug("temp-dir path is #{path}")
  Dir.mkdir(path)
  @temp_dir = File.new(path)
end
write_html() click to toggle source
# File lib/html2index.rb, line 226
def write_html()
  tempfile = nil
  out = nil
  out_file = $configuration.target
  template = Template.new
  if out_file
    if(File.exist?(out_file))
      if(File.writable?(out_file))
        puts "\nWARNING! File " << out_file << " exists!"
        print "Do you want to overwrite it? (Y/n) "
        res = wait_for_user
        puts
        unless(['y', 'Y'].include?(res.chr) )
          puts "Okay, doing nothing."
          exit false
        end
      else
        puts "ERROR! File " << out_file << " is not writable! Aborting, bye." 
        exit false
      end
    end
  else
    @log.debug('out_file is STDOUT')
  end
  begin
    html = template.to_s

    # create the definitions in index()
    def_list = "<dl>\n" << index().join("\n") << "\n</dl\n>"
    # associate content with fields
    data = {:dict_list => dict_list(), :glossary => def_list }
    placeholders = $configuration.placeholders
    fdelim = $configuration.fdelim
    placeholders.each_pair do |k, v|
      repl = fdelim.dup << v << fdelim.reverse
      @log.debug('try to replace ' << repl)
      html.gsub!(repl, data[k])
    end
    @log.debug('html is now ' << html)
    html.gsub!(/\<head\>/, "<head>\n\t#{GeneratorMeta}")
    File.write(out_file, html) if out_file
    puts html if !out_file
  rescue Exception => ex
    @log.error( "line #{__LINE__} " << ex.message << ' (' << ex.class.name << ')')
  ensure
    out.close if out && !out.closed?
    File.unlink(out) if out
  end
end