class HTML2Index
Public Class Methods
new(*argv)
click to toggle source
Initializes the HTML2Index object and starts the generation of the glossary.
# File lib/html2index.rb, line 46
# Sets up logging and configuration, validates the source file and then
# runs the whole generation by calling generate().
#
# argv:: command-line arguments, handed unchanged to ArgParser.parse.
#
# Terminates the process with a failure status (exit false) when
# File_Checking::file_check reports a problem with the source file, or when
# the detected file type does not match "(html|xml) .*document".
def initialize(*argv)
  init_logger
  # ensure the configuration-file exist.
  options = ArgParser.parse(argv)
  $configuration = Configuration.new(options)
  $configuration.user_conf
  @log.level = $log_level
  @file = $configuration.source
  @log.debug("read #{@file}, write #{$configuration.target}")

  msg = File_Checking::file_check(@file, :exist?, :readable?, :file?)
  if msg
    @log.error("Error: unsuitable file #{@file}: #{msg}")
    exit false
  end

  ftype = File_Checking::file_type(@file)
  @log.debug("ftype for #{@file} is #{ftype}")
  if ftype && !ftype.empty? && !ftype[0].downcase.match?(/(html|xml) .*document/)
    # ftype is indexed like a collection above, so it must not be appended
    # to a String with <<; that raises TypeError for an Array and the
    # message would never be logged. Array() also accepts a plain String.
    @log.error("#{@file} does not look like a valid file to scan: #{Array(ftype).join('; ')}")
    exit false
  end

  # The problem log lives in the temporary directory; it is only created
  # when a lookup fails (see dict_definition).
  @problem_log = File.join(temp_dir, PROBLEM_LOG)
  @log.debug('calling generate() ')
  generate
end
Public Instance Methods
dict_list()
click to toggle source
Creates an unordered HTML list of the consulted dictionaries.
# File lib/html2index.rb, line 75
# Renders the configured dictionaries as an HTML <ul> element.
# Each dictionary contributes one <li> holding its name and a link to its
# url. The markup built so far is written to the debug log before the
# closing tag is appended. Returns the complete markup as a String.
def dict_list
  markup = String.new('<ul>')
  $configuration.dicts.each do |dict|
    anchor = '<a href="' + dict.url + '">' + dict.url + '</a>'
    markup.concat('<li>', dict.name, ' : ', anchor, "</li>\n")
  end
  @log.debug('list is ' + markup)
  markup.concat("</ul>\n")
end
generate()
click to toggle source
Parses the html-file and generates the glossary.
# File lib/html2index.rb, line 87
# Parses the html-file and generates the glossary.
#
# Reads @file into @doc and calls write_html(). Afterwards, if the problem
# log exists, its location is reported at info level.
#
# Exits the process:
# * with the original exit status when write_html() (or anything below it)
#   called exit; previously every such exit was turned into success, which
#   hid the failure exits of write_html();
# * with a failure status on any other exception, after logging its message.
def generate()
  @log.info('Generating... PSE wait')
  begin
    # Block form closes the source file as soon as parsing is done.
    # NOTE(review): assumes HTML::Document.parse consumes the IO eagerly
    # (as Nokogiri does) - confirm for the parser in use.
    @doc = File.open(@file) { |source| HTML::Document.parse(source) }
    write_html()
  rescue SystemExit => ir
    @log.info "Bye"
    exit ir.status
  rescue Exception => ex
    @log.error "line #{__LINE__}: #{ex.message}"
    exit false
  end
  if File.exist?(@problem_log)
    @log.info "Some expressions caused problems and are listed in #{@problem_log}"
  end
end
Private Instance Methods
dict_definition(expression)
click to toggle source
Looks up the expression in each configured online dictionary and creates a Definition object for every dictionary that returns text. Returns an Array of all Definition objects found for the expression.
# File lib/html2index.rb, line 110
# Looks up +expression+ in every configured dictionary.
#
# For each dictionary the lookup URL is the dictionary url followed by the
# form-encoded expression (whitespace replaced by '_', apostrophes removed).
# If that request fails and keeping the apostrophes yields a different URL,
# a second request is made to that URL with User-Agent and From headers.
# The text of every node matched by the dictionary's xpath becomes one
# entry of a Definition. When no page could be fetched, the expression is
# appended to the problem log.
#
# expression:: the term to look up (String).
#
# Returns an Array of Definition objects (possibly empty).
#
# Only StandardError is rescued, so that an Interrupt (Ctrl-C) during a
# download reaches the Interrupt handler in index() instead of being logged
# as an ordinary access failure.
def dict_definition(expression)
  definitions = []
  $configuration.dicts.each do |d|
    page = nil
    url = nil
    begin
      url = d.url.dup << URI.encode_www_form_component(expression.gsub(/\s/, '_').gsub("'", ''))
      # Block form closes the downloaded stream once it has been parsed.
      page = URI.open(url) { |io| Nokogiri::HTML(io) }
    rescue StandardError => ex
      url_b = d.url.dup << URI.encode_www_form_component(expression.gsub(/\s/, '_'))
      @log.warn("WARNING, accessing #{url}: #{ex.message}")
      if url_b != url
        @log.warn("\twill try #{url_b}")
        begin
          page = URI.open(url_b, "User-Agent" => "#{APPNAME} #{VERSION}", "From" => "Function Test <bat.guano@don.blech.e4ward.com>") { |io| Nokogiri::HTML(io) }
        rescue StandardError => ex_b
          @log.warn("\tWARNING, accessing #{url_b}: #{ex_b.message}")
        end
      end
    end

    if page
      nodes = page.xpath(d.xpath)
      if nodes
        texts = nodes.collect { |n| n.inner_text.to_s.strip }
        unless texts.empty?
          definition = Definition.new(d.name, expression.gsub('_', ' '), texts)
          definition.color = d.color
          definitions.push(definition)
        end
      end
    else
      # Nothing could be downloaded for this dictionary: remember the term.
      File.open(@problem_log, 'a') { |f| f.write(expression + "\n") }
    end
  end
  definitions
end
format_index(ndef)
click to toggle source
Formats the definition text.
# File lib/html2index.rb, line 198
# Formats the definition text as one glossary entry: a <dt> holding the
# expression, followed by one <dd> per definition.
#
# ndef:: either an Array-like collection of definition objects (anything
#        responding to to_ary) or a single definition object. Each object
#        must respond to expression, origin and definition; objects in a
#        collection must also respond to color.
#
# Returns the markup as a String, starting with a newline.
#
# The single-object branch previously referred to an undefined local and
# emitted a broken closing tag; it now uses +ndef+ itself and closes both
# <u> and <i>.
#
# NOTE(review): the texts are inserted without HTML escaping. They
# presumably originate from downloaded dictionary pages - consider escaping.
def format_index(ndef)
  if ndef.respond_to?(:to_ary)
    entries = ndef.to_ary
    parts = ["\n<dt>#{entries[0].expression}</dt>"]
    entries.each do |dn|
      parts << "\n\t<dd>(<i style='color:##{dn.color};'>#{dn.origin}</i>): #{dn.definition.join('<br/>')}</dd>"
    end
    parts.join
  else
    # Array() lets the definition be a single String or a list of lines.
    body = Array(ndef.definition).join('<br/>')
    "\n<dt>#{ndef.expression}</dt>\n\t<dd>(<i><u>#{ndef.origin}</u></i>): #{body}</dd>"
  end
end
index()
click to toggle source
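Creates and returns a sorted Array of formatted glossary entries (HTML strings), one for each distinct expression for which at least one definition was found.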
# File lib/html2index.rb, line 150
# Collects the expressions marked in the parsed document and returns the
# formatted glossary entries.
#
# The elements are selected with an XPath built from the configured tag,
# attribute and attribute value. For each element the title attribute is
# used as the expression, or the element text when the title is blank.
# Whitespace runs are collapsed to one blank. Every expression is looked up
# once, compared case-insensitively; an expression stays marked as handled
# even when its lookup yields nothing.
#
# Returns a sorted Array of Strings as produced by format_index(). When the
# user interrupts the run, a warning is logged and the entries collected so
# far are returned.
def index()
  def_list = []
  begin
    # Hash instead of Array: constant-time "already handled?" test.
    seen = {}
    tag = $configuration.html_tag
    attr = $configuration.html_attribute
    value = $configuration.attr_value
    xpath = ".//#{tag}[contains(@#{attr}, '#{value}')]"
    @log.debug("xpath is #{xpath}")

    @doc.xpath(xpath).each do |t|
      expression = t.attribute('title').to_s
      expression = t.text.to_s if expression.strip.empty?
      expression = expression.gsub(/\s+/, ' ').strip
      next if expression.empty?

      key = expression.downcase
      next if seen.key?(key)

      # Marked before the lookup, so a failed lookup is not repeated.
      seen[key] = true
      definition = dict_definition(expression)
      def_list.push(format_index(definition)) if definition && !definition.empty?
    end
  rescue Interrupt
    @log.warn "--------------\n\tIndex generation interrupted by user\n---------------"
  end
  # Sorted once here rather than after every insertion; this also covers
  # the partial list left behind by an interrupt.
  def_list.sort!
end
temp_dir()
click to toggle source
def temp_dir
options = {:remove => true}
# File lib/html2index.rb, line 213
# Creates the temporary directory of this run on first use and returns the
# same object on every later call (memoized in @temp_dir).
#
# The directory is created with Dir.mktmpdir below Dir.tmpdir, using the
# prefix "HTML2Index_". This replaces a hand-built name made of the current
# time and rand(1000), where a name collision made Dir.mkdir raise.
#
# Returns a File object opened on the directory, as before; callers such as
# File.join accept it in place of the path.
# NOTE(review): the File handle is never closed and opening a directory
# this way is not portable - returning the path String looks preferable,
# confirm that no caller relies on the File object.
#
# The directory is not removed automatically; the at_exit hook below is
# disabled in the original source and kept here for reference only.
def temp_dir
  @temp_dir ||= begin
    @log.debug('creating temp_dir')
    require 'tmpdir'
    require 'fileutils'
    path = Dir.mktmpdir('HTML2Index_')
    @log.debug("temp-dir path is #{path}")
    # at_exit {FileUtils.rm_rf(path) if File.exists?(path)} if options[:remove]
    File.new(path)
  end
end
write_html()
click to toggle source
# File lib/html2index.rb, line 226
# Fills the template with the dictionary list and the glossary and writes
# the result to the configured target file, or to standard output when no
# target is configured.
#
# If the target exists and is writable, the user is asked for confirmation;
# any answer other than 'y' or 'Y' ends the process with a failure status.
# If the target exists but is not writable, the process ends with a failure
# status as well.
#
# Every configured placeholder is looked up in the content table
# (:dict_list, :glossary) and replaced in the template; the text searched
# for is the field delimiter, the placeholder value and the reversed field
# delimiter. GeneratorMeta is inserted right after each <head> tag.
#
# Errors raised while producing or writing the page are logged, not raised.
def write_html()
  out_file = $configuration.target
  template = Template.new

  if out_file
    if File.exist?(out_file)
      unless File.writable?(out_file)
        puts "ERROR! File #{out_file} is not writable! Aborting, bye."
        exit false
      end
      puts "\nWARNING! File #{out_file} exists!"
      print "Do you want to overwrite it? (Y/n) "
      res = wait_for_user
      puts
      unless ['y', 'Y'].include?(res.chr)
        puts "Okay, doing nothing."
        exit false
      end
    end
  else
    @log.debug('out_file is STDOUT')
  end

  begin
    html = template.to_s
    # create the definitions in index()
    # NOTE(review): "</dl\n>" looks like a typo for "</dl>\n"; left as is
    # because it is still a valid end tag.
    def_list = "<dl>\n#{index().join("\n")}\n</dl\n>"
    # associate content with fields
    data = { :dict_list => dict_list(), :glossary => def_list }
    placeholders = $configuration.placeholders
    fdelim = $configuration.fdelim
    placeholders.each_pair do |k, v|
      repl = fdelim.dup << v << fdelim.reverse
      @log.debug("try to replace #{repl}")
      # fetch: a placeholder without content is an error, as it was before.
      content = data.fetch(k)
      # Block form inserts the content literally. With a replacement
      # argument, backslash sequences such as \1 or \\ inside the glossary
      # text would be interpreted by gsub! and silently altered.
      html.gsub!(repl) { content }
    end
    @log.debug("html is now #{html}")
    html.gsub!(/<head>/) { "<head>\n\t#{GeneratorMeta}" }
    if out_file
      File.write(out_file, html)
    else
      puts html
    end
  rescue StandardError => ex
    @log.error("line #{__LINE__} #{ex.message} (#{ex.class.name})")
  end
end