class DocParser::Document
The Document class loads and parses the files. @see Parser
@see Output
Attributes
doc[R]
@return [Nokogiri::HTML::Document] a reference to the Nokogiri document
encoding[R]
@return [String] the encoding of the document
filename[R]
@return [String] the filename of the current document
html[R]
@return [String] the source of the document
results[R]
@return [Array] the results from this document
Public Class Methods
new(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
click to toggle source
# File lib/docparser/document.rb, line 24
# Builds a document and immediately reads and parses its file.
#
# @param filename [String] path of the file to load
# @param encoding [String] encoding of the file on disk
# @param parser [Object, nil] object whose +outputs+ decide how many result
#   buckets are created; may be omitted
# @param logger [Logger, nil] logger to use; a logger writing to STDERR is
#   created when none is given
def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
  @logger = logger || Logger.new(STDERR)
  # The level is set to INFO on whichever logger is used, including one
  # supplied by the caller, before the debug call below runs.
  @logger.level = Logger::INFO
  @logger.debug("Parsing #{filename}")
  @encoding = encoding
  @parser = parser
  @filename = filename
  # One empty bucket per parser output. +parser+ defaults to nil, so it is
  # guarded here instead of raising NoMethodError on nil.
  outputs = @parser && @parser.outputs
  @results = Array.new(outputs ? outputs.length : 0) { [] }
  read_file
end
Public Instance Methods
add_row(*row, output: 0)
click to toggle source
Adds a row to an output
# File lib/docparser/document.rb, line 36
# Appends one flattened row to the results of an output.
#
# @param row [Array] the cells of the row; nested arrays are flattened
# @param output [Integer, Output] index into the results, or an Output
#   instance that is looked up in the parser's outputs
# @return [Array] the result bucket the row was appended to
def add_row(*row, output: 0)
  target = output.is_a?(Output) ? @parser.outputs.index(output) : output
  cells = row.flatten
  @logger.debug("#{filename}: Adding row #{cells}")
  results[target] << cells
end
each_element(query) { |el| ... }
click to toggle source
# File lib/docparser/document.rb, line 53
# Looks up the elements matching +query+ and, when a block is given, yields
# each of them in turn.
#
# @param query [String] query forwarded to #elements
# @yield [el] each matching element
# @return the collection returned by #elements when no block is given,
#   otherwise whatever that collection's +each+ returns
def each_element(query)
  found = elements(query)
  return found unless block_given?

  found.each { |node| yield node }
end
element_content(query)
click to toggle source
Executes an XPath/CSS query and returns the content of the first matching node @return [String, nil] the content of the HTML node, or nil when nothing matches
# File lib/docparser/document.rb, line 65
# Searches the parsed document with +query+ and returns the content of the
# first match.
#
# @param query [String] query handed to the document's +search+
# @return [String, nil] content of the first match, or nil when nothing matches
def element_content(query)
  match = @doc.search(query).first
  return nil if match.nil?

  match.content
end
Also aliased as: css_content, xpath_content
elements(query)
click to toggle source
Executes an XPath/CSS query
# File lib/docparser/document.rb, line 49
# Searches the parsed document.
#
# @param query [String] query handed unchanged to the document's +search+
# @return whatever the document's +search+ returns for +query+
def elements(query)
  document = @doc
  document.search(query)
end
inspect()
click to toggle source
@!visibility private
# File lib/docparser/document.rb, line 87
# Short description of the document showing only its filename and encoding.
#
# @return [String]
def inspect
  file = @filename
  enc = @encoding
  "<Document file:'#{file}', encoding:'#{enc}'>"
end
parse!(&block)
click to toggle source
Parses the document @return [Array] containing the parse results
# File lib/docparser/document.rb, line 81
# Evaluates the given block with this document as +self+, then returns the
# document's results.
#
# @yield block executed via +instance_exec+ on the document
# @return [Array] the value of +results+ after the block has run
def parse!(&instructions)
  instance_exec(&instructions)
  results
end
regexp(regexp)
click to toggle source
Matches the HTML source using a regular expression
# File lib/docparser/document.rb, line 75
# Matches the document's HTML source against a pattern.
#
# @param regexp [Regexp] pattern passed to +match+ on the source
# @return [MatchData, nil] the match, or nil when the source does not match
def regexp(regexp)
  source = html
  source.match(regexp)
end
title()
click to toggle source
Extracts the document title @return [String] the title of the document
# File lib/docparser/document.rb, line 44
# Returns the document title, looking it up with the '//head/title' query on
# first use and caching a truthy result in @title.
#
# @return [String, nil] whatever +xpath_content+ returns for '//head/title'
def title
  return @title if @title

  @title = xpath_content('//head/title')
end
Private Instance Methods
read_file()
click to toggle source
# File lib/docparser/document.rb, line 93
# Reads @filename into @html and parses it into @doc.
#
# For 'utf-8' the file is opened with mode 'r:utf-8'; for any other encoding
# the mode is "r:<encoding>:utf-8". A warning is logged when the file is empty.
#
# The file is opened with File.open rather than Kernel#open, so the filename
# is always treated as a path and a leading "|" is never run as a command.
#
# @return the parsed document produced by +Nokogiri+
def read_file
  mode = @encoding == 'utf-8' ? 'r:utf-8' : "r:#{encoding}:utf-8"
  # The block form closes the file handle before parsing starts.
  @html = File.open(@filename, mode, &:read)
  @logger.warn "#{filename} is empty" if @html.empty?
  @doc = Nokogiri(@html)
end