class DataKit::CSV::SchemaAnalyzer
Attributes
csv[RW]
keys[RW]
sampling_rate[RW]
use_type_hints[RW]
Public Class Methods
analyze(csv, options = {})
click to toggle source
# File lib/data_kit/csv/schema_analyzer.rb, line 45
# Convenience entry point: builds an analyzer for +csv+ and runs it.
#
# Only the :keys, :sampling_rate and :use_type_hints entries of +options+
# are forwarded to the constructor; any other entries are dropped.
#
# @param csv [Object] the CSV source handed to the constructor
# @param options [Hash] optional analyzer settings
# @return [Object] whatever #execute returns
def analyze(csv, options = {})
  forwarded = {
    :keys => options[:keys],
    :sampling_rate => options[:sampling_rate],
    :use_type_hints => options[:use_type_hints]
  }

  new(csv, forwarded).execute
end
new(csv, options = {})
click to toggle source
# File lib/data_kit/csv/schema_analyzer.rb, line 9
# Stores the CSV source and the analysis options on the new analyzer.
#
# @param csv [Object] the CSV source; kept untouched in @csv
# @param options [Hash] optional settings
# @option options [Array] :keys stored in @keys (defaults to [])
# @option options [Numeric] :sampling_rate stored in @sampling_rate
#   (defaults to 0.1)
# @option options [Boolean, nil] :use_type_hints enabled when absent/nil or
#   exactly +true+; every other value disables type hints
def initialize(csv, options = {})
  @csv = csv
  @keys = options[:keys] || []
  @sampling_rate = options[:sampling_rate] || 0.1

  # Type hints are on by default. The comparison against +true+ is kept
  # deliberately strict so that values such as "false" or 0 do not enable them.
  hints = options[:use_type_hints]
  @use_type_hints = hints.nil? || hints == true
end
sampling_rate(file_size)
click to toggle source
# File lib/data_kit/csv/schema_analyzer.rb, line 55
# Suggests a sampling rate for a file of the given size.
#
# Files smaller than 1 MiB (1024 * 1024) get a rate of 1.0. From 1 MiB
# upwards the rate is 500 / sqrt(file_size), rounded to four decimal places,
# so it shrinks as the file grows.
#
# @param file_size [Numeric] size of the file (compared against 1024 * 1024)
# @return [Float] the suggested sampling rate
def sampling_rate(file_size)
  return 1.0 if file_size < (1024 * 1024)

  scale_factor = 500
  (scale_factor / Math.sqrt(file_size)).round(4)
end
Public Instance Methods
execute()
click to toggle source
# File lib/data_kit/csv/schema_analyzer.rb, line 21
# Walks every row yielded by csv.each_row, counting each one and feeding a
# random sample of them into a SchemaAnalysis.
#
# A row is sampled when a fresh random number is <= sampling_rate; for a
# sampled row every value is inserted under the stringified header found at
# the same index.
#
# @return [SchemaAnalysis, nil] the populated analysis, or nil when
#   csv.each_row yields no rows
def execute
  rng = Random.new
  result = nil

  csv.each_row do |row|
    # Created lazily, on the first row that is yielded.
    result ||= SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
    result.increment_total

    next unless rng.rand <= sampling_rate

    result.increment_sample
    row.each_with_index do |value, column|
      result.insert(csv.headers[column].to_s, value)
    end
  end

  result
end