class DocParser::Parser
The main parser class. This is the class you'll use to create your parser. The real work happens in the Document class.
@see Document
Attributes
@!visibility private
@!visibility private
@!visibility private
@!visibility private
Public Class Methods
Creates a new Parser
instance
@param files [Array] An array containing URLs or paths to files
@param quiet [Boolean] Be quiet
@param encoding [String] The encoding to use for opening the files
@param parallel [Boolean] Use parallel processing
@param output [Output, Array] The output(s), defaults to a Screenoutput
@param range [Range] Range of files to process (nil means process all)
@param num_processes [Fixnum] Number of parallel processes
# File lib/docparser/parser.rb, line 38
# Creates a new Parser instance.
#
# @param files [Array] URLs or paths of the files to parse
# @param quiet [Boolean] when true the logger level is Logger::ERROR,
#   otherwise Logger::INFO
# @param encoding [String] the encoding stored for opening the files
# @param parallel [Boolean] when false the process count is forced to 1
# @param output [Output, Array] the output(s), handed to +initialize_outputs+
# @param range [Range] range of +files+ to process (nil means process all)
# @param num_processes [Fixnum] number of parallel processes; only used
#   when +parallel+ is true
def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
               output: nil, range: nil,
               num_processes: Parallel.processor_count + 1)
  @num_processes = parallel ? num_processes : 1
  # Array#[] returns nil when the range starts past the end of the array;
  # fall back to an empty list so later code can always treat @files as an
  # Array (e.g. @files.length in parse!).
  @files = range ? (files[range] || []) : files
  @encoding = encoding
  @logger = Logger.new(STDERR)
  @logger.level = quiet ? Logger::ERROR : Logger::INFO
  initialize_outputs output
  @logger.info "DocParser v#{VERSION} loaded"
end
Public Instance Methods
Parses the `files`
Accepts a block which is executed for each document in the Document
context where you can access the content using Nokogiri.
@see Document
# File lib/docparser/parser.rb, line 59
# Parses the files, then writes the collected rows to the outputs.
#
# Dispatches to +parallel_process+ when more than one process is configured
# and to +serial_process+ otherwise, forwarding the given block unchanged.
# Progress and the total duration are reported through the logger.
#
# @yield forwarded to the processing method for each document
# @return the return value of the final logger call
def parse!(&block)
  @logger.info "Parsing #{@files.length} files (encoding: #{@encoding})."
  # Use the monotonic clock for the duration: wall-clock time (Time.now) can
  # jump when the system clock is adjusted, producing a wrong or negative
  # elapsed time.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  if @num_processes > 1
    parallel_process(&block)
  else
    serial_process(&block)
  end
  @logger.info 'Processing finished'
  write_to_outputs
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  @logger.info format('Done processing in %.2fs.', elapsed)
end
Private Instance Methods
# File lib/docparser/parser.rb, line 78
# Normalises the +output+ argument into the @outputs array and creates one
# empty Set per output in @resultsets.
#
# @param output [Output, Array, nil] a single Output, an Array whose
#   elements are all Outputs, or a falsy value for no outputs
# @raise [ArgumentError] when +output+ is truthy but neither of the above
def initialize_outputs(output)
  # Assigned first so @outputs is an empty Array even if validation raises.
  @outputs = []
  single = output.is_a?(Output)
  list = !single && output.is_a?(Array) && output.all? { |o| o.is_a? Output }

  if single
    @outputs << output
  elsif list
    @outputs = output
  elsif output
    raise ArgumentError, 'Invalid outputs specified'
  end

  @resultsets = Array.new(@outputs.length) { Set.new }
end
# File lib/docparser/parser.rb, line 91
# Parses all files through Parallel.map and merges each returned collection
# of sets into @resultsets by position.
#
# @yield forwarded to +parse_doc+ for every file
# @return the Array produced by Parallel.map
def parallel_process(&block)
  @logger.info "Starting #{@num_processes} processes"
  # Pick the Parallel option by engine: :in_processes on the 'ruby' engine,
  # :in_threads on any other.
  mode = RUBY_ENGINE == 'ruby' ? :in_processes : :in_threads
  parsed = Parallel.map(@files, mode => @num_processes) do |file|
    # :nocov: #
    parse_doc(file, &block)
    # :nocov: #
  end
  parsed.each do |sets|
    next unless @outputs

    sets.each_with_index { |set, position| @resultsets[position].merge(set) }
  end
end
# File lib/docparser/parser.rb, line 115
# Builds a Document for one file (passing the stored encoding and this
# parser) and runs its +parse!+ with the given block.
#
# @param file the entry from the file list, passed to Document as +filename+
# @return whatever Document#parse! returns
def parse_doc(file, &block)
  Document
    .new(filename: file, encoding: @encoding, parser: self)
    .parse!(&block)
end
# File lib/docparser/parser.rb, line 107
# Parses the files one after another in the current process and merges each
# returned set into the result set at the same position.
#
# @yield forwarded to +parse_doc+ for every file
# @return the @files collection (the result of iterating it)
def serial_process(&block)
  @files.each do |path|
    sets = parse_doc(path, &block)
    sets.each_with_index do |rows, position|
      next unless @outputs

      @resultsets[position].merge(rows)
    end
  end
end
# File lib/docparser/parser.rb, line 120
# Feeds every collected row to its output and closes the output.
#
# For each output, the rows of the result set at the same index are passed
# to +add_row+ one by one; afterwards the result set slot is cleared and the
# output is closed.
#
# @return the @outputs collection (the result of iterating it)
def write_to_outputs
  @logger.info 'Writing data..'
  @outputs.each_with_index do |output, index|
    begin
      @resultsets[index].each { |row| output.add_row row }
      # Drop the reference once the rows have been handed over.
      @resultsets[index] = nil
    ensure
      # Close the output even when add_row raises, so the output is not
      # left open; the exception still propagates to the caller.
      output.close
    end
  end
end