class Idata::Detector

Constants

COMMON_DELIMITERS
DEFAULT_DELIMITER
SAMPLE_SIZE

Public Class Methods

new(file) click to toggle source
# File lib/idata/detector.rb, line 23
# Reads a sample from the head of +file+ and counts how often each of the
# COMMON_DELIMITERS occurs in it.
#
# Sets @sample (the sampled text), @sample_lines (the sample split on line
# breaks) and @candidates (delimiter => occurrence count, only for delimiters
# that occur at least once).
#
# @param file [String] path of the file to inspect
def initialize(file)
  @file = file

  # Run `head` with an argument vector instead of a shell command string, so
  # a path containing spaces or shell metacharacters is passed verbatim and
  # cannot be interpreted (or abused) as shell syntax. `--` stops option
  # parsing so a path starting with "-" is still treated as a file name.
  raw = IO.popen(['head', '-n', SAMPLE_SIZE.to_s, '--', @file.to_s], &:read)

  # Treat the bytes as binary and drop everything that has no UTF-8 mapping.
  @sample = raw.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  @sample_lines = @sample.split(/[\r\n]+/)

  counts = COMMON_DELIMITERS.map { |delim| [delim, @sample.scan(delim).count] }.to_h
  @candidates = counts.select { |_delim, count| count > 0 }
end

Public Instance Methods

find() click to toggle source
# File lib/idata/detector.rb, line 32
# Determines the delimiter to use for the sampled file.
#
# The strategies are tried in this order: find_same_occurence, find_valid,
# find_max_occurence. The first one that yields a delimiter wins; if none
# does, DEFAULT_DELIMITER is used.
#
# @return [String] the chosen delimiter
def find
  if @candidates.empty?
    # No known delimiter occurs in the sample, e.g. a file that only has a
    # single-column header.
    DEFAULT_DELIMITER
  else
    find_same_occurence || find_valid || find_max_occurence || DEFAULT_DELIMITER
  end
end
find_max_occurence() click to toggle source

Most occurrences: the candidate delimiter that appears most often in the sample.

# File lib/idata/detector.rb, line 67
# Picks the candidate delimiter with the highest occurrence count.
#
# @return [String, nil] the single most frequent delimiter; DEFAULT_DELIMITER
#   when several delimiters tie for the highest count and it is one of them;
#   nil otherwise (including when there are no candidates)
def find_max_occurence
  # Compare each count against the highest count itself. Comparing against
  # the last [delimiter, count] pair of the sorted hash can never match an
  # Integer, which would leave the selection permanently empty.
  highest = @candidates.values.max
  selected = @candidates.select { |_delim, count| count == highest }.keys

  return selected.first if selected.count == 1
  return DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)

  nil
end
find_same_occurence() click to toggle source

High confidence level: the delimiter that splits every non-empty sample row into the same number of fields.

# File lib/idata/detector.rb, line 53
# Picks the candidate delimiter for which every non-empty row of the sample
# parses into the same number of fields.
#
# @return [String, nil] the single delimiter that qualifies;
#   DEFAULT_DELIMITER when several qualify and it is one of them; nil
#   otherwise
def find_same_occurence
  selected = @candidates.keys.select do |delim|
    begin
      rows = CSV.parse(@sample, col_sep: delim).reject(&:empty?)
      rows.map(&:count).uniq.count == 1
    rescue StandardError
      # A sample that cannot be parsed with this delimiter simply disqualifies
      # it. Only StandardError is rescued so that Interrupt, SystemExit and
      # similar process-level exceptions still propagate.
      false
    end
  end

  return selected.first if selected.count == 1
  return DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)

  nil
end
find_valid() click to toggle source

Just works: the delimiter with which the sample parses as CSV without raising an error.

# File lib/idata/detector.rb, line 38
# Picks the candidate delimiter with which the sample parses as CSV without
# raising an error.
#
# @return [String, nil] the single delimiter that parses cleanly;
#   DEFAULT_DELIMITER when several do and it is one of them; nil otherwise
def find_valid
  selected = @candidates.keys.select do |delim|
    begin
      CSV.parse(@sample, col_sep: delim)
      true
    rescue StandardError
      # A parse failure disqualifies this delimiter. Only StandardError is
      # rescued so that Interrupt, SystemExit and similar process-level
      # exceptions still propagate.
      false
    end
  end

  return selected.first if selected.count == 1
  return DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)

  nil
end