class MiddleSquid::Indexer

Used internally to build the blacklist database.

@see CLI#index middle_squid index command

Attributes

append[RW]

@return [Boolean]

entries[RW]

@return [Array<Symbol>]

full_index[RW]

@return [Boolean]

quiet[RW]

@return [Boolean]

Public Class Methods

new() click to toggle source
# File lib/middle_squid/indexer.rb, line 20
# Sets up an indexer with its default options and empty bookkeeping state.
#
# Defaults: +append+ is off, both URLs and domains are indexed,
# +full_index+ is on and +quiet+ is off.
def initialize
  # options
  @append, @full_index, @quiet = false, true, false
  @entries = %i[url domain]

  # alias name => category, filled in by #blacklists=
  @aliases = {}

  # categories referenced by the configuration / actually seen on disk
  @cats_in_use = []
  @indexed_cats = []

  # one counter per outcome reported by the indexing pass
  @total = {}
  %i[url domain ignored duplicate].each do |kind|
    @total[kind] = 0
  end
end

Public Instance Methods

blacklists=(list) click to toggle source

@param list [Array<BlackList>]

# File lib/middle_squid/indexer.rb, line 39
# Replaces the set of categories in use and the alias table with the ones
# declared by the given blacklists.
#
# Both collections are emptied in place before being refilled; duplicate
# categories are collapsed.
#
# @param list [Array<BlackList>] objects responding to +category+ and +aliases+
def blacklists=(list)
  @cats_in_use.clear
  @aliases.clear

  list.each do |blacklist|
    @cats_in_use.push blacklist.category

    # every alias resolves to the category of the blacklist declaring it
    blacklist.aliases.each do |alias_name|
      @aliases.store alias_name, blacklist.category
    end
  end

  @cats_in_use.uniq!
end
index(directories) click to toggle source

@param directories [Array<String>]

# File lib/middle_squid/indexer.rb, line 55
# Indexes the blacklist files found under the given directories, inside a
# single database transaction.
#
# When minimal indexing is requested (+full_index+ off) and no category is
# in use, an error is reported and nothing else happens. Otherwise the
# tables are truncated (unless +append+ is set), every directory is walked,
# a summary is printed and the transaction is committed or rolled back by
# #commit_or_rollback. A transaction still active when the method exits
# (e.g. after an exception) is rolled back.
#
# @param directories [Array<String>]
def index(directories)
  if !@full_index && @cats_in_use.empty?
    oops 'the loaded configuration does not use any blacklist'
    info 'nothing to do in minimal indexing mode'
    return
  end

  # Use the monotonic clock to measure the duration: the wall clock
  # (Time.now) can be adjusted while indexing runs, which would yield a
  # wrong or even negative elapsed time.
  started_at = Process.clock_gettime Process::CLOCK_MONOTONIC

  db.transaction

  truncate unless @append
  directories.each {|dir|
    walk_in dir
  }
  cats_summary
  stats
  commit_or_rollback

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  info "finished after #{elapsed} seconds"
ensure
  db.rollback if db.transaction_active?
end

Private Instance Methods

append_to(category, line) click to toggle source
# File lib/middle_squid/indexer.rb, line 158
# Cleans up one blacklist line and inserts it into the database as either a
# domain or a URL.
#
# The line is modified in place (scrubbed, stripped, backslashes turned
# into slashes).
#
# @param category category the expression is filed under
# @param line [String] raw line read from a blacklist file
# @return [Symbol] +:domain+ or +:url+ when a row was inserted, +:ignored+
#   when the line was skipped, +:duplicate+ when the insert violated a
#   database constraint
def append_to(category, line)
  line.scrub! ''  # drop invalid UTF-8 byte sequences
  line.strip!     # drop surrounding whitespace, including the newline

  # anything not starting with a word character (regex lists, blank lines)
  # is skipped
  return :ignored unless line =~ /\A\w/

  # dirty lists sometimes use backslashes as path separators
  line.tr! '\\', '/'

  uri = MiddleSquid::URI.parse "http://#{line}"
  host = uri.cleanhost
  path = uri.cleanpath

  # an expression without a path is a whole domain, otherwise a URL
  kind = path.empty? ? :domain : :url
  return :ignored unless @entries.include? kind

  if kind == :domain
    db.execute 'INSERT INTO domains (category, host) VALUES (?, ?)',
      [category, host]
  else
    db.execute 'INSERT INTO urls (category, host, path) VALUES (?, ?, ?)',
      [category, host, path]
  end

  kind
rescue SQLite3::ConstraintException
  :duplicate
end
cats_summary() click to toggle source
# File lib/middle_squid/indexer.rb, line 193
# Prints the categories that were indexed and warns about the categories in
# use that were not found.
#
# Removes duplicates from the list of indexed categories as a side effect.
def cats_summary
  @indexed_cats.uniq!

  info
  info "indexed #{@indexed_cats.size} categorie(s): #{@indexed_cats}"

  # categories the configuration refers to but no file was indexed for
  not_found = @cats_in_use - @indexed_cats
  return if not_found.empty?

  warn "could not find #{not_found}"
end
commit_or_rollback() click to toggle source
# File lib/middle_squid/indexer.rb, line 210
# Commits the transaction when at least one domain or URL was counted;
# otherwise reports an error and rolls the transaction back.
def commit_or_rollback
  anything_indexed = @total[:domain] > 0 || @total[:url] > 0

  unless anything_indexed
    oops 'nothing to commit'
    info 'reverting changes'
    return db.rollback
  end

  info 'committing changes'
  db.commit
end
index_file(path) click to toggle source
# File lib/middle_squid/indexer.rb, line 121
# Indexes every line of one blacklist file, printing a progress percentage
# as it goes.
#
# The category is the name of the file's parent directory, translated
# through the alias table when an alias exists. Paths that are not regular
# files are skipped, and so are files of categories not in use when
# +full_index+ is off.
#
# @param path [String] path of the blacklist file
def index_file(path)
  pn = Pathname.new path
  return unless pn.file?

  dirname = pn.dirname.basename.to_s
  category = @aliases.fetch dirname, dirname

  # minimal indexing: only the categories used by the configuration
  return unless @full_index || @cats_in_use.include?(category)

  @indexed_cats << category

  total_size = File.size path
  current_pos = percent = reported = 0

  # The progress line is interpolated rather than built as a format string:
  # directory and file names may contain '%', which would otherwise be
  # parsed as format directives and raise or print garbage.
  status = "\rindexing #{dirname}/#{pn.basename}"
  output "#{status} [#{percent}%]"

  File.foreach(path) {|line|
    current_pos += line.bytesize
    percent = (current_pos.to_f / total_size * 100).to_i

    # only redraw when the integer percentage changes
    if percent != reported
      output "#{status} [#{percent}%]"
      reported = percent
    end

    type = append_to category, line
    @total[type] += 1
  }

  output "\n"
end
info(line = "") click to toggle source
# File lib/middle_squid/indexer.rb, line 93
# Prints an informational line followed by a newline, through #output
# (so it is not printed in quiet mode).
#
# @param line [String] text to print; an empty line is printed when omitted
def info(line = "")
  # Build a new string instead of appending to the argument: `line << "\n"`
  # modified the caller's string and raised on frozen strings.
  output "#{line}\n"
end
oops(msg) click to toggle source
# File lib/middle_squid/indexer.rb, line 85
# Reports an error. The message is always printed, even in quiet mode.
#
# @param msg text of the error, printed after an "ERROR: " prefix
def oops(msg)
  text = "ERROR: #{msg}\n"
  output(text, always: true)
end
output(string, always: false) click to toggle source
# File lib/middle_squid/indexer.rb, line 81
# Writes a string to the standard error stream, without adding a newline.
#
# @param string [String] text to write
# @param always [Boolean] write even when +quiet+ is set
def output(string, always: false)
  # quiet mode silences everything that is not forced
  return if @quiet && !always

  $stderr.print string
end
stats() click to toggle source
# File lib/middle_squid/indexer.rb, line 202
# Prints the counters accumulated while indexing, one line per kind,
# followed by an empty line.
def stats
  [
    [:domain,    'domain(s)'],
    [:url,       'url(s)'],
    [:duplicate, 'duplicate(s)'],
    [:ignored,   'ignored expression(s)'],
  ].each do |kind, label|
    info "found #{@total[kind]} #{label}"
  end

  info
end
truncate() click to toggle source
# File lib/middle_squid/indexer.rb, line 98
# Deletes every row of the domains and urls tables.
def truncate
  info 'truncating database'

  statements = ['DELETE FROM domains', 'DELETE FROM urls']
  # run in order; the value of the last statement is the method's result
  statements.map {|sql| db.execute sql }.last
end
walk_in(directory) click to toggle source
# File lib/middle_squid/indexer.rb, line 105
# Indexes every entry found exactly one level below the given directory
# (+directory/category/file+), in sorted order.
#
# A warning is printed and nothing is indexed when the directory does not
# exist.
#
# @param directory [String] root of a blacklist collection
def walk_in(directory)
  info "reading #{directory}"

  if File.directory? directory
    pattern = File.join directory, '*/*'

    # sorted so the indexing order does not depend on what glob returns
    Dir.glob(pattern).sort.each do |file|
      index_file file
    end
  else
    warn "#{directory}: no such directory"
    nil
  end
end
warn(msg) click to toggle source
# File lib/middle_squid/indexer.rb, line 89
# Reports a warning. The message is always printed, even in quiet mode.
#
# Takes precedence over Kernel#warn inside the indexer.
#
# @param msg text of the warning, printed after a "WARNING: " prefix
def warn(msg)
  text = "WARNING: #{msg}\n"
  output(text, always: true)
end