class DocParser::Document

The Document class loads a single file and parses it. @see Parser @see Output

Attributes

doc[R]

@return [Nokogiri::HTML::Document] a reference to the Nokogiri document

encoding[R]

@return [String] the encoding of the document

filename[R]

@return [String] the filename of the current document

html[R]

@return [String] the source of the document

results[R]

@return [Array] the results from this document

Public Class Methods

new(filename: nil, encoding: 'utf-8', parser: nil, logger: nil) click to toggle source
# File lib/docparser/document.rb, line 24
# Creates a document and immediately reads and parses the given file.
#
# @param filename [String] path of the file to load
# @param encoding [String] encoding of the file on disk
# @param parser [#outputs, nil] the owning parser; one result set is
#   allocated per entry of its +outputs+ (none when there is no parser
#   or it has no outputs)
# @param logger [Logger, nil] logger to report to; when omitted a logger
#   writing to STDERR at INFO level is created
def initialize(filename: nil, encoding: 'utf-8', parser: nil, logger: nil)
  if logger
    # Respect the level configured by the caller; the logger is shared,
    # so changing its level here would leak out of this document.
    @logger = logger
  else
    @logger = Logger.new(STDERR)
    @logger.level = Logger::INFO
  end
  @logger.debug("Parsing #{filename}")
  @encoding = encoding
  @parser = parser
  @filename = filename
  # parser defaults to nil, so guard before asking it for its outputs.
  outputs = @parser ? @parser.outputs : nil
  @results = Array.new(outputs ? outputs.length : 0) { [] }
  read_file
end

Public Instance Methods

add_row(*row, output: 0) click to toggle source

Adds a row to an output

# File lib/docparser/document.rb, line 36
# Appends one row to a result set.
#
# The given values are flattened into a single flat array before being
# stored.
#
# @param row [Array] the cells of the row (nested arrays are flattened)
# @param output [Integer, Output] index of the result set, or an Output
#   instance that is looked up in the parser's outputs
# @return [Array] the result set the row was appended to
def add_row(*row, output: 0)
  cells = row.flatten
  # An Output instance is translated to its position in the parser's list.
  output = @parser.outputs.index(output) if output.is_a?(Output)
  @logger.debug("#{filename}: Adding row #{cells}")
  results[output] << cells
end
each_element(query) { |el| ... } click to toggle source
# File lib/docparser/document.rb, line 53
# Runs a query through #elements and optionally iterates over the matches.
#
# @param query [String] the query handed to #elements
# @yield [el] each matching element, when a block is given
# @return the matches returned by #elements (with a block, whatever
#   +each+ on those matches returns)
def each_element(query)
  matches = elements(query)
  return matches unless block_given?

  matches.each { |node| yield node }
end
Also aliased as: css, xpath
element_content(query) click to toggle source

Executes an XPath/CSS query and returns the content of the first matching node. @return [String, nil] the content of the HTML node, or nil when nothing matches

# File lib/docparser/document.rb, line 65
# Searches the document and returns the content of the first match.
#
# @param query [String] the query passed to the document's +search+
# @return [String, nil] content of the first matching node, or nil when
#   the query matches nothing
def element_content(query)
  node = @doc.search(query).first
  return nil if node.nil?

  node.content
end
Also aliased as: css_content, xpath_content
elements(query) click to toggle source

Executes an XPath/CSS query

# File lib/docparser/document.rb, line 49
# Searches the parsed document.
#
# @param query [String] the query passed to the document's +search+
# @return whatever the document's +search+ returns for the query
def elements(query)
  document = @doc
  document.search(query)
end
inspect() click to toggle source

@!visibility private

# File lib/docparser/document.rb, line 87
# Short, human-readable representation showing the filename and encoding.
#
# @return [String]
def inspect
  format("<Document file:'%s', encoding:'%s'>", @filename, @encoding)
end
parse!(&block) click to toggle source

Parses the document. @return [Array] the parse results

# File lib/docparser/document.rb, line 81
# Evaluates the given block in the context of this document and returns
# the collected results.
#
# @yield the block, executed with this document as +self+
# @return [Array] the results of this document
def parse!(&instructions)
  # instance_exec makes the document's methods callable without a receiver.
  instance_exec(&instructions)
  results
end
regexp(regexp) click to toggle source

Matches the HTML source using a regular expression

# File lib/docparser/document.rb, line 75
# Matches the raw HTML source against a pattern.
#
# @param regexp [Regexp, String] the pattern to look for
# @return [MatchData, nil] the match, or nil when the pattern is not found
def regexp(regexp)
  source = html
  source.match(regexp)
end
title() click to toggle source

Extracts the document title. @return [String, nil] the title of the document, or nil when no title element is found

# File lib/docparser/document.rb, line 44
# Returns the document title, looked up with the XPath +//head/title+.
# A found title is cached so later calls skip the lookup.
#
# @return [String, nil] the title, or nil when the lookup finds nothing
def title
  return @title if @title

  @title = xpath_content('//head/title')
end

Private Instance Methods

css(query)
Alias for: each_element
css_content(query)
Alias for: element_content
read_file() click to toggle source
# File lib/docparser/document.rb, line 93
# Reads the file into @html and parses it into @doc.
#
# The content is read in the document's encoding and, unless that already
# is 'utf-8', transcoded to UTF-8 on the way in. A warning is logged when
# the file is empty.
#
# @return the parsed document produced by Nokogiri
# @raise [SystemCallError] when the file cannot be opened
def read_file
  mode = @encoding == 'utf-8' ? 'r:utf-8' : "r:#{encoding}:utf-8"
  # File.open instead of Kernel#open: the latter treats a name starting
  # with "|" as a command to run, which must never happen for a filename.
  @html = File.open(@filename, mode, &:read)
  @logger.warn "#{filename} is empty" if @html.empty?
  @doc = Nokogiri(@html)
end
xpath(query)
Alias for: each_element
xpath_content(query)
Alias for: element_content