class Idata::Detector
Constants
- COMMON_DELIMITERS
- DEFAULT_DELIMITER
- SAMPLE_SIZE
Public Class Methods
new(file)
click to toggle source
# File lib/idata/detector.rb, line 23
# Loads a sample from the beginning of +file+ and records, for every entry of
# COMMON_DELIMITERS, how many times it occurs in that sample.
#
# @param file [String] path of the delimited text file to inspect
def initialize(file)
  @file = file

  # Read the first SAMPLE_SIZE lines directly rather than interpolating the
  # path into a `head` shell command: a path containing spaces was misparsed
  # and shell metacharacters in it would have been executed.
  raw = begin
    File.open(@file, 'rb') { |io| io.each_line.first(SAMPLE_SIZE).join }
  rescue SystemCallError
    # The shell-based version produced an empty sample for an unreadable
    # file; keep that outcome instead of raising.
    ''
  end

  # Interpret the bytes as binary and drop everything that has no UTF-8 mapping.
  @sample = raw.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  @sample_lines = @sample.split(/[\r\n]+/)

  # Keep only the delimiters that actually occur in the sample.
  counts = COMMON_DELIMITERS.map { |delim| [delim, @sample.scan(delim).count] }.to_h
  @candidates = counts.select { |_delim, count| count > 0 }
end
Public Instance Methods
find()
click to toggle source
# File lib/idata/detector.rb, line 32
# Picks a delimiter for the sampled file.
#
# The detection strategies are tried in order and the first one that returns
# a value wins; DEFAULT_DELIMITER is used when none of them does.
#
# @return the detected delimiter, or DEFAULT_DELIMITER as a fallback
def find
  # for example, file with only one header
  return DEFAULT_DELIMITER if @candidates.empty?

  %i[find_same_occurence find_valid find_max_occurence].each do |strategy|
    detected = send(strategy)
    return detected if detected
  end

  DEFAULT_DELIMITER
end
find_max_occurence()
click to toggle source
Most occurrences: picks the candidate delimiter with the highest count in the sample.
# File lib/idata/detector.rb, line 67
# Selects the candidate delimiter(s) with the highest occurrence count.
#
# @return the single most frequent delimiter; DEFAULT_DELIMITER when several
#   delimiters tie for the highest count and it is one of them; nil otherwise
def find_max_occurence
  # Compare each count against the highest count. The previous code compared
  # the count with a [delimiter, count] pair, which never matched, so this
  # method always returned nil.
  max_count = @candidates.values.max
  selected = @candidates.select { |_delim, count| count == max_count }.keys

  return selected.first if selected.size == 1

  DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end
find_same_occurence()
click to toggle source
High confidence level: every non-empty row parses to the same number of columns.
# File lib/idata/detector.rb, line 53
# Selects the candidate delimiter(s) for which every non-empty row of the
# sample parses to the same number of columns.
#
# @return the single such delimiter; DEFAULT_DELIMITER when several qualify
#   and it is one of them; nil otherwise
def find_same_occurence
  selected = @candidates.keys.select do |delim|
    begin
      rows = CSV.parse(@sample, col_sep: delim).reject(&:empty?)
      rows.map(&:count).uniq.size == 1
    rescue StandardError
      # A sample that cannot be parsed with this delimiter disqualifies it.
      # StandardError (not Exception) so that Interrupt, SystemExit and
      # NoMemoryError are no longer swallowed.
      false
    end
  end

  return selected.first if selected.size == 1

  DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end
find_valid()
click to toggle source
Just works: the sample parses without error using this delimiter.
# File lib/idata/detector.rb, line 38
# Selects the candidate delimiter(s) with which the sample can be parsed as
# CSV without raising an error.
#
# @return the single such delimiter; DEFAULT_DELIMITER when several qualify
#   and it is one of them; nil otherwise
def find_valid
  selected = @candidates.keys.select do |delim|
    begin
      CSV.parse(@sample, col_sep: delim)
      true
    rescue StandardError
      # A parse failure disqualifies the delimiter. StandardError (not
      # Exception) so that Interrupt, SystemExit and NoMemoryError are no
      # longer swallowed.
      false
    end
  end

  return selected.first if selected.size == 1

  DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end