class MammothHasher

Public Class Methods

hash(filename, debug=false)
# File lib/mammoth-hasher.rb, line 4
# Computes a fast, sampling-based MD5 fingerprint of a file.
#
# Files of at most number_of_chunks * length_of_chunks bytes (400) are hashed
# in full. Larger files are fingerprinted by hashing the concatenation of
# 100 four-byte chunks: one at offset 0 and 99 at offsets drawn from a PRNG
# seeded with the file size, so a given file size always yields the same
# offsets.
#
# NOTE: for large files, bytes outside the sampled chunks do not influence
# the result; this is a quick fingerprint, not a full-content hash.
#
# @param filename [String] path to the file (expanded with File.expand_path)
# @param debug [Boolean] when truthy, prints the elapsed time to stdout
# @return [String] hexadecimal MD5 digest
# @raise [ArgumentError] if filename is nil, is not a String, or does not exist
def self.hash(filename, debug = false)
  time_start = Time.now if debug

  # validate the argument and make sure the file exists
  raise ArgumentError, "give the filename as a parameter (got nil)" if filename.nil?
  raise ArgumentError, "filename must be a string" unless filename.is_a?(String)

  filename = File.expand_path(filename)
  raise ArgumentError, "#{filename} does not exist" unless File.exist?(filename)

  # algorithm parameters
  # WARNING: if you change them, the resulting hash will be different !
  number_of_chunks = 100
  length_of_chunks = 4

  # the file size (in bytes) doubles as the PRNG seed
  filesize = File.size(filename)

  digest =
    if filesize <= number_of_chunks * length_of_chunks
      # small file: hashing the whole content is quicker than sampling it
      Digest::MD5.file(filename).hexdigest
    else
      prng = Random.new(filesize)

      # Offset 0 is always sampled because that is where the magic number
      # identifying the file type lives. The other offsets come from the PRNG;
      # rand's upper bound is exclusive, so they fall in
      # 0...(filesize - length_of_chunks).
      #
      # The offsets are deliberately used in generation order: the digest
      # depends on the concatenation order, so sorting them here would change
      # every hash this method has produced so far.
      offsets = [0] + Array.new(number_of_chunks - 1) { prng.rand(filesize - length_of_chunks) }

      # Read every chunk through a single binary-mode handle instead of
      # reopening the file once per chunk; the block form closes the handle
      # even if a read raises.
      sampled = String.new
      File.open(filename, 'rb') do |file|
        offsets.each do |offset|
          file.seek(offset)
          # read returns nil at EOF (file shrank since File.size); treat as empty
          sampled << file.read(length_of_chunks).to_s
        end
      end

      # the final hash is the hash of the concatenation of all chunks
      Digest::MD5.hexdigest(sampled)
    end

  puts "#{Time.now - time_start} seconds" if debug

  digest
end