class Libis::RosettaChecker::FilesToIngestCleanup

Constants

CSV_HEADER
FIND_SQL
LOG_PATTERN
MSG_CALC_FC
MSG_CHCK_DB
MSG_DEFLATE
SQL_DATA

Attributes

cfg[RW]
connection[RW]
cursor[RW]
logger[RW]
report[RW]

Public Class Methods

command() click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 21
# Name of the sub-command implemented by this class.
#
# The same string is used by #setup_logging as the logger name.
#
# @return [String] frozen command name
def self.command
  -'files2ingest'
end
new(cfg) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 37
# Builds a checker instance for the given configuration.
#
# Stores the configuration, then configures logging (#setup_logging) and
# opens the database connection and prepared cursor (#setup_db). Logging is
# set up first so that a logger exists before any database work starts.
#
# @param cfg [Object] configuration object; the methods of this class read
#   settings such as +quiet+, +log_file+, +dbuser+, +dbpass+, +dburl+,
#   +recursive+, +report+ and +report_file+ from it
def initialize(cfg)
  @cfg = cfg

  setup_logging

  setup_db
end
options_class() click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 25
# Class that describes the options accepted by this command.
#
# @return [Class] the FilesToIngestCleanupOptions constant
def self.options_class
  FilesToIngestCleanupOptions
end
run() click to toggle source
Calls superclass method
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 29
# Class-level entry point.
#
# Delegates to the superclass implementation and hands it a block; for the
# configuration object yielded to that block a new instance is created and
# run against the process arguments (ARGV).
def self.run
  super { |cfg| new(cfg).run(ARGV) }
end
short_desc() click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 17
# One-line, human-readable description of the command.
#
# @return [String] frozen description text
def self.short_desc
  -'Report on files that are/are not ingested'
end

Public Instance Methods

finalize() click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 45
# Releases the external resources held by this instance.
#
# Closes the prepared database cursor, logs off from the database connection
# and closes the CSV report if one was opened. Each resource is skipped when
# it was never created, so the method is safe to call on a partially
# initialised instance.
#
# @return [void]
def finalize
  cursor.close if cursor
  connection.logoff if connection
  # The report is opened lazily by #to_report and nothing else closes it;
  # close it here so buffered rows are flushed together with the DB cleanup.
  report.close if report && !report.closed?
end
run(argv) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 50
# Processes every directory/file argument in +argv+.
#
# Arguments are consumed from the front of +argv+ one at a time. After each
# processed argument, if anything is left, the class-level option parser is
# given the remaining arguments and logging is set up again before the next
# argument is taken.
#
# @param argv [Array<String>] arguments to process; the array is consumed
# @raise [ArgumentError] when +argv+ is empty
def run(argv)
  raise ArgumentError, 'Need to specify at least a directory/file to parse' if argv.empty?

  while (target = argv.shift)
    process_dir(target)
    unless argv.empty?
      self.class.parse_options(argv)
      setup_logging
    end
  end
end

Protected Instance Methods

check_file(info) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 252
# Looks up the file described by +info+ in the database and reports the result.
#
# The prepared cursor is executed with the file size and MD5 checksum as bind
# values. A returned row only counts as a match when the text read from its
# 'DNX' column contains a SHA1 fixity entry equal to info[:sha1]. For every
# match a copy of +info+ is extended with the SQL_DATA columns of the row and a
# :name_match flag telling whether the stored original name looks like the
# local file name; each copy is then passed to #to_report. When nothing
# matches, +info+ itself is reported with :found set to 0.
#
# Match data is written to copies only: callers reuse one +info+ hash for
# several files (e.g. all entries of a zip archive), and data written into the
# shared hash would show up in the report rows of later, unmatched files.
#
# @param info [Hash] file description; reads :size, :md5, :sha1 and :file,
#   sets :found
# @return [Integer] number of matching database rows
def check_file(info)
  logger.info MSG_CHCK_DB

  cursor.bind_param(':filesize', info[:size].to_i)
  cursor.bind_param(':checksum', info[:md5].to_s)

  cursor.exec

  info[:found] = 0
  fixity = %r{<key id="fixityType">SHA1</key><key id="fixityValue">#{info[:sha1]}</key>}
  matches = []

  while (row = cursor.fetch_hash)
    next unless row['DNX'].read =~ fixity

    match = info.dup
    SQL_DATA.each { |column| match[column.to_sym] = row[column.upcase] }
    logger.info "    found match: #{match[:ie_id]}/#{match[:rep_id]}/#{match[:fl_id]}"

    # The file name is split on common separators and the pieces are allowed to
    # be joined by anything. Each piece is escaped so that names containing
    # regexp metacharacters such as '(' or '[' neither raise nor change the
    # meaning of the pattern.
    pieces = match[:file].split(/[ #._-]/).map { |piece| Regexp.escape(piece) }
    match[:name_match] = !(match[:original_name] =~ Regexp.new(pieces.join('.*'))).nil?
    logger.info "    name matches: #{match[:original_name]}" if match[:name_match]

    matches << match
  end

  if matches.empty?
    to_report info
  else
    matches.each do |match|
      match[:found] = matches.size
      to_report match
    end
  end

  matches.size
end
checksum_file(reader, info) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 238
# Streams +reader+ to the end and records its size and checksums in +info+.
#
# The data is read in chunks of 2,048,000 bytes and fed to an MD5 and a SHA1
# digest at the same time, so the content is read only once. The reader is
# always closed, even when reading fails part-way.
#
# @param reader [#read, #close] open stream positioned at the start of the data
# @param info [Hash] receives :size (bytes read), :md5 and :sha1 (hex digests)
# @return [String] the SHA1 hex digest
def checksum_file(reader, info)
  info[:size] = 0
  md5 = Digest::MD5.new
  sha1 = Digest::SHA1.new
  begin
    while (chunk = reader.read(2_048_000))
      info[:size] += chunk.bytesize
      md5 << chunk
      sha1 << chunk
    end
  ensure
    # do not leak the handle when a read error aborts the loop
    reader.close
  end
  info[:md5] = md5.hexdigest
  info[:sha1] = sha1.hexdigest
end
process_dir(dir) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 127
# Processes one command-line argument: a directory, a list file or a file.
#
# * Directory: every entry except '.' and '..' is handled; sub-directories are
#   descended into only when the configuration asks for recursion, all other
#   entries go to #process_file.
# * '@path': +path+ is a list file with one file name per line; each name is
#   passed to #process_file with the list file's directory as search
#   directory. Empty lines are skipped, because an empty name would resolve to
#   the list file's directory itself and cause that whole directory to be
#   processed.
# * Regular file: passed to #process_file.
#
# @param dir [String] directory path, '@'-prefixed list file path or file path
# @return [Object, nil] nil when a directory or list file cannot be read;
#   otherwise the value is not meaningful
# @raise [ArgumentError] when +dir+ is none of the above
def process_dir(dir)
  if File.directory?(dir)
    unless File.readable?(dir)
      logger.error "Directory '#{dir}' cannot be read"
      return nil
    end
    logger.info "Processing dir '#{dir}'"
    Dir.entries(dir).each do |entry|
      next if %w[. ..].include?(entry)

      path = File.join(dir, entry)
      if File.directory?(path)
        process_dir(path) if @cfg.recursive
      else
        process_file(path)
      end
    end
  elsif dir[0] == '@'
    file = to_file(dir[1..-1])
    return nil unless file

    logger.info "Processing input file '#{file}'"
    # File.foreach closes the list file when done (File.open without a block does not)
    File.foreach(file) do |line|
      name = line.chomp
      next if name.empty?

      process_file(name, File.dirname(file))
    end
  elsif File.file?(dir)
    process_file(dir)
  else
    raise ArgumentError, "Argument '#{dir}' should refer to an existing and readable file or directory"
  end
end
process_file(_file, *search_dir) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 179
# Checksums a single file and checks it against the database.
#
# The name is resolved with #to_file; a directory is handed back to
# #process_dir. Handling then depends on the extension:
#
# * '.bz2' - the decompressed content is checksummed and checked
#   (parent type 'F', reported under the name without '.bz2').
# * '.zip' - the archive itself is checked first; only when it is not found in
#   the database is it unpacked and every contained file checked individually
#   (parent type 'Z').
# * anything else - the file content is checksummed and checked
#   (parent type 'D', parent is the containing directory).
#
# Errors while reading a file are logged and do not abort the run.
#
# @param _file [String] file name, absolute or relative
# @param search_dir [Array<String>] directories to look in when +_file+ is not
#   found as given
# @return [void]
def process_file(_file, *search_dir)
  file = to_file(_file, *search_dir)
  return unless file

  if File.directory?(file)
    process_dir(file)
    return
  end

  info = {
    parent_type: 'D',
    parent: File.dirname(file),
    file: File.basename(file)
  }

  logger.info "- #{file}"
  case File.extname(file)
  when '.bz2'
    begin
      logger.info MSG_DEFLATE
      info[:parent_type] = 'F'
      info[:parent] = file
      info[:file] = File.basename file, '.bz2'
      logger.info MSG_CALC_FC
      checksum_file Bzip2::FFI::Reader.open(file), info
      check_file info
    rescue StandardError
      # same best-effort handling as the other branches
      logger.error "Could not access file '#{file}'"
    end
  when '.zip'
    begin
      logger.info MSG_CALC_FC
      checksum_file File.open(file, 'rb'), info
      # Parentheses are required here: `check_file info > 0` is parsed as
      # `check_file(info > 0)`, which raises instead of comparing the count.
      if check_file(info).zero?
        logger.info '  - Unpacking'.freeze
        info[:parent_type] = 'Z'
        info[:parent] = file
        Zip::File.open(file) do |zip|
          zip.each do |entry|
            next if entry.directory?

            info[:file] = entry.name
            logger.info "- #{file}/#{entry.name}"
            logger.info MSG_CALC_FC
            checksum_file entry.get_input_stream, info
            check_file info
          end
        end
      end
    rescue Zip::ZipError
      logger.error "Could not unpack file '#{file}'"
    rescue StandardError
      # StandardError only: Interrupt and SystemExit must still stop the run
      logger.error "Could not access file '#{file}'"
    end
  else
    begin
      logger.info MSG_CALC_FC
      checksum_file File.open(file, 'rb'), info
      check_file info
    rescue StandardError
      logger.error "Could not access file '#{file}'"
    end
  end
end
setup_db() click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 121
# Opens the database connection and prepares the lookup statement.
#
# Connects through OCI8 with the user, password and URL taken from the
# configuration, parses FIND_SQL into a cursor that is reused for every
# lookup, and sets that cursor's row prefetch count to 10.
def setup_db
  credentials = [@cfg.dbuser, @cfg.dbpass, @cfg.dburl]
  @connection = OCI8.new(*credentials)
  @cursor = @connection.parse(FIND_SQL)
  @cursor.prefetch_rows = 10
end
setup_logging() click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 108
# (Re)configures logging from the current configuration.
#
# The root logger level is set to :info and the logger named after the command
# writes to stdout; the stdout appender only lets warnings and above through
# when the configuration is quiet. When a log file is configured, a file
# appender using LOG_PATTERN is added that appends to the file rather than
# truncating it. A log file name that is blank is treated as "no log file" and
# reset to nil in the configuration.
def setup_logging
  Logging.logger.root.level = :info
  @logger = Logging.logger[self.class.command]
  @logger.appenders = [Logging.appenders.stdout]
  Logging.appenders.stdout.level = @cfg.quiet ? :warn : :info

  configured = @cfg.log_file
  @cfg.log_file = nil if configured && configured.chomp.strip.empty?
  return unless @cfg.log_file

  file_appender = Logging.appenders.file(
    @cfg.log_file,
    truncate: false,
    layout: Logging.layouts.pattern(pattern: LOG_PATTERN)
  )
  @logger.add_appenders file_appender
end
to_file(file, *search_dirs) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 161
# Resolves +file+ to the path of an existing, readable file system entry.
#
# The name is tried as given first and then relative to each search directory
# in order. The first candidate that exists decides the outcome: it is
# returned when readable, otherwise an error is logged and nil is returned
# without trying further candidates. An error is also logged when no
# candidate exists.
#
# @param file [String] path as given by the user or a list file
# @param search_dirs [Array<String>] directories to try when +file+ does not
#   exist as given
# @return [String, nil] the resolved path, or nil when missing or unreadable
def to_file(file, *search_dirs)
  resolved = file if File.exist?(file)
  # lazy: candidates are only built and probed until the first hit
  resolved ||= search_dirs.lazy
                          .map { |dir| File.join(dir, file) }
                          .find { |candidate| File.exist?(candidate) }

  if resolved.nil?
    logger.error "File '#{file}' does not exist"
    return nil
  end
  return resolved if File.readable?(resolved)

  logger.error "File '#{resolved}' cannot be read"
  nil
end
to_report(info = nil) click to toggle source
# File lib/libis/rosetta_checker/files_to_ingest_cleanup.rb, line 292
# Appends one row to the CSV report, when reporting is enabled.
#
# The report file is opened on first use and CSV_HEADER is written as its
# first row. A row is built by looking up every header column, as a symbol,
# in +info+. Called without +info+, only the header is ensured.
#
# @param info [Hash, nil] values for the row, keyed by header name as symbol
# @return [Object, nil] the report object when a row was written, otherwise nil
def to_report(info = nil)
  return unless @cfg.report

  first_use = !@report
  @report = CSV.open(@cfg.report_file, 'wb') if first_use
  @report << CSV_HEADER if first_use
  return unless info

  row = CSV_HEADER.map { |column| info[column.to_sym] }
  @report << row
end