class MiddleSquid::Indexer
Used internally to build the blacklist database.
@see CLI#index (the `middle_squid index` command)
Attributes
append[RW]
@return [Boolean]
entries[RW]
@return [Array<Symbol>]
full_index[RW]
@return [Boolean]
quiet[RW]
@return [Boolean]
Public Class Methods
new()
click to toggle source
# File lib/middle_squid/indexer.rb, line 20
# Sets up an indexer with its default options and empty bookkeeping state.
#
# Defaults: replace existing data (+append+ off), index both URLs and
# domains, index every category found (+full_index+ on), and print
# progress (+quiet+ off).
def initialize
  # user-tunable options (exposed through the attribute accessors)
  @append, @quiet = false, false
  @full_index = true
  @entries = %i[url domain]

  # category bookkeeping, filled in by #blacklists= and while indexing
  @aliases = {}
  @cats_in_use = []
  @indexed_cats = []

  # per-outcome counters, keyed by the symbols returned from #append_to
  @total = { url: 0, domain: 0, ignored: 0, duplicate: 0 }
end
Public Instance Methods
blacklists=(list)
click to toggle source
@param list [Array<BlackList>]
# File lib/middle_squid/indexer.rb, line 39
# Replaces the set of categories in use, and the alias table, with the
# ones declared by the given blacklists.
#
# Both collections are emptied in place before being refilled, so
# calling this again discards whatever an earlier call registered.
#
# @param list [Array<BlackList>] objects responding to +category+ and +aliases+
def blacklists=(list)
  @cats_in_use.clear
  @aliases.clear

  list.each do |blacklist|
    category = blacklist.category
    @cats_in_use.push category

    # every alias resolves to the blacklist's own category
    blacklist.aliases.each do |alias_name|
      @aliases.store alias_name, category
    end
  end

  # several blacklists may share one category
  @cats_in_use.uniq!
end
index(directories)
click to toggle source
@param directories [Array<String>]
# File lib/middle_squid/indexer.rb, line 55
# Indexes every blacklist found under the given directories inside a
# single database transaction.
#
# In minimal mode (+full_index+ off) with no category in use there is
# nothing to index: an error is reported and the database is left alone.
#
# Otherwise the tables are truncated first (unless +append+ is on), each
# directory is walked, summaries are printed, and the transaction is
# committed or rolled back by #commit_or_rollback. If anything raises
# while the transaction is still open, it is rolled back before the
# exception propagates.
#
# @param directories [Array<String>]
def index(directories)
  if !@full_index && @cats_in_use.empty?
    oops 'the loaded configuration does not use any blacklist'
    info 'nothing to do in minimal indexing mode'
    return
  end

  # Use the monotonic clock: wall-clock time can jump (NTP, manual
  # adjustment) and would yield a wrong or even negative duration.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  db.transaction

  truncate unless @append

  directories.each { |dir| walk_in dir }

  cats_summary
  stats
  commit_or_rollback

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  info "finished after #{elapsed} seconds"
ensure
  # only still active here if something above raised before commit/rollback
  db.rollback if db.transaction_active?
end
Private Instance Methods
append_to(category, line)
click to toggle source
# File lib/middle_squid/indexer.rb, line 158
# Inserts one blacklist line into the database under +category+.
#
# The line is cleaned up in place (the caller's string is modified):
# invalid UTF-8 bytes are dropped, surrounding whitespace is stripped and
# backslashes become forward slashes. Lines whose first character is not
# a word character are skipped (regex lists).
#
# @param category [String]
# @param line [String] raw line from a blacklist file; mutated
# @return [Symbol] +:domain+ or +:url+ when a row was inserted,
#   +:ignored+ when the line was skipped, +:duplicate+ when the insert
#   hit a constraint violation
def append_to(category, line)
  line.scrub! ''  # drop invalid UTF-8 byte sequences
  line.strip!     # drop surrounding whitespace, including the newline

  # regex lists: anything not starting with a word character is skipped
  return :ignored unless line[0] =~ /\w/

  # dirty lists sometimes use backslashes as separators
  line.tr! '\\', '/'

  parsed = MiddleSquid::URI.parse "http://#{line}"
  host = parsed.cleanhost
  path = parsed.cleanpath

  # an entry without a path is a whole domain, otherwise a specific URL
  kind = path.empty? ? :domain : :url
  return :ignored unless @entries.include? kind

  if kind == :domain
    db.execute 'INSERT INTO domains (category, host) VALUES (?, ?)',
      [category, host]
  else
    db.execute 'INSERT INTO urls (category, host, path) VALUES (?, ?, ?)',
      [category, host, path]
  end

  kind
rescue SQLite3::ConstraintException
  :duplicate
end
cats_summary()
click to toggle source
# File lib/middle_squid/indexer.rb, line 193
# Prints which categories were indexed and warns about categories that
# are in use but were not encountered while indexing.
def cats_summary
  # a category is recorded once per indexed file
  @indexed_cats.uniq!

  info
  info "indexed #{@indexed_cats.size} categorie(s): #{@indexed_cats}"

  not_found = @cats_in_use - @indexed_cats
  return if not_found.empty?

  warn "could not find #{not_found}"
end
commit_or_rollback()
click to toggle source
# File lib/middle_squid/indexer.rb, line 210
# Ends the transaction: commits when at least one domain or URL was
# counted, otherwise reports an error and rolls back.
def commit_or_rollback
  nothing_indexed = @total[:domain] <= 0 && @total[:url] <= 0

  if nothing_indexed
    oops 'nothing to commit'
    info 'reverting changes'
    db.rollback
  else
    info 'committing changes'
    db.commit
  end
end
index_file(path)
click to toggle source
# File lib/middle_squid/indexer.rb, line 121
# Indexes a single blacklist file, line by line, printing a percentage
# progress indicator.
#
# The category is the name of the file's parent directory, translated
# through the alias table when an alias is registered for it. In minimal
# mode (+full_index+ off) files whose category is not in use are skipped.
# Anything that is not a regular file is skipped as well.
#
# Each line is handed to #append_to and the counter for the returned
# outcome is incremented in +@total+.
#
# @param path [String]
def index_file(path)
  pn = Pathname.new path
  return unless pn.file?

  dirname  = pn.dirname.basename.to_s
  category = @aliases.fetch(dirname, dirname)

  return unless @full_index || @cats_in_use.include?(category)

  @indexed_cats << category

  total_size  = File.size path
  current_pos = reported = 0

  # The label is interpolated rather than used as a format string: a '%'
  # in the directory or file name would otherwise be parsed as a format
  # directive and raise ArgumentError (or garble the output).
  label  = "\rindexing #{dirname}/#{pn.basename}"
  report = ->(percent) { output "#{label} [#{percent}%]" }

  report.call 0

  File.foreach(path) do |line|
    # measured before #append_to, which strips the line in place
    current_pos += line.bytesize
    percent = (current_pos.to_f / total_size * 100).to_i

    # only redraw when the displayed integer percentage changes
    if percent != reported
      report.call percent
      reported = percent
    end

    type = append_to category, line
    @total[type] += 1
  end

  output "\n"
end
info(line = "")
click to toggle source
# File lib/middle_squid/indexer.rb, line 93
# Prints an informational line, followed by a newline, through #output
# (so it is subject to whatever filtering #output applies).
#
# A new string is built instead of appending to +line+, so the caller's
# string is left untouched and frozen strings are accepted.
#
# @param line [String] text to print; defaults to an empty line
def info(line = "")
  output "#{line}\n"
end
oops(msg)
click to toggle source
# File lib/middle_squid/indexer.rb, line 85
# Reports an error. The message is prefixed with "ERROR: " and passed to
# #output with +always: true+.
#
# @param msg [String]
def oops(msg)
  text = "ERROR: #{msg}\n"
  output text, always: true
end
output(string, always: false)
click to toggle source
# File lib/middle_squid/indexer.rb, line 81
# Writes +string+ to standard error, as is (no newline is added).
#
# Nothing is written in quiet mode unless +always+ is set.
#
# @param string [String]
# @param always [Boolean] write even when +quiet+ is on
def output(string, always: false)
  return if @quiet && !always

  $stderr.print string
end
stats()
click to toggle source
# File lib/middle_squid/indexer.rb, line 202
# Prints the per-outcome counters gathered while indexing, followed by
# an empty line.
def stats
  labels = {
    domain:    'domain(s)',
    url:       'url(s)',
    duplicate: 'duplicate(s)',
    ignored:   'ignored expression(s)',
  }

  labels.each do |counter, label|
    info "found #{@total[counter]} #{label}"
  end

  info
end
truncate()
click to toggle source
# File lib/middle_squid/indexer.rb, line 98
# Deletes every row from the +domains+ and +urls+ tables.
def truncate
  info 'truncating database'

  statements = ['DELETE FROM domains', 'DELETE FROM urls']
  statements.map { |sql| db.execute sql }.last
end
walk_in(directory)
click to toggle source
# File lib/middle_squid/indexer.rb, line 105
# Indexes every entry located exactly two levels below +directory+
# (<directory>/<subdirectory>/<entry>), in sorted order.
#
# A warning is printed, and nothing is indexed, when +directory+ is not
# a directory.
#
# @param directory [String]
def walk_in(directory)
  info "reading #{directory}"

  unless File.directory? directory
    warn "#{directory}: no such directory"
    return
  end

  pattern = File.join(directory, '*/*')

  # explicit sort keeps the indexing order stable across platforms
  Dir.glob(pattern).sort.each { |entry| index_file entry }
end
warn(msg)
click to toggle source
# File lib/middle_squid/indexer.rb, line 89
# Reports a warning. The message is prefixed with "WARNING: " and passed
# to #output with +always: true+.
#
# @param msg [String]
def warn(msg)
  text = "WARNING: #{msg}\n"
  output text, always: true
end