class DataKit::CSV::SchemaAnalyzer

Attributes

csv[RW]
keys[RW]
sampling_rate[RW]
use_type_hints[RW]

Public Class Methods

analyze(csv, options = {})
# File lib/data_kit/csv/schema_analyzer.rb, line 45
# Convenience entry point: constructs an analyzer for +csv+ and returns
# whatever its +execute+ call returns.
#
# Only the :keys, :sampling_rate and :use_type_hints entries of +options+
# are forwarded to +new+; an entry that is absent from +options+ is
# forwarded as nil. Any other entries are ignored.
def analyze(csv, options = {})
  forwarded = [:keys, :sampling_rate, :use_type_hints]
  settings = forwarded.each_with_object({}) do |name, picked|
    picked[name] = options[name]
  end

  new(csv, settings).execute
end
new(csv, options = {})
# File lib/data_kit/csv/schema_analyzer.rb, line 9
# Stores the CSV source and the analysis settings.
#
# Recognised +options+ entries:
#   :keys           - stored as given; [] when nil or missing.
#   :sampling_rate  - stored as given; 0.1 when nil or missing.
#   :use_type_hints - results in true when nil, missing, or exactly true;
#                     any other value (including other truthy values)
#                     results in false.
def initialize(csv, options = {})
  @csv = csv
  @keys = options[:keys] || []
  @sampling_rate = options[:sampling_rate] || 0.1

  hints = options[:use_type_hints]
  @use_type_hints = hints.nil? || hints == true
end
sampling_rate(file_size)
# File lib/data_kit/csv/schema_analyzer.rb, line 55
# Derives a sampling rate from +file_size+ (presumably a size in bytes;
# confirm against callers).
#
# Sizes below 1024 * 1024 yield 1.0. Larger sizes yield
# 500 / sqrt(file_size), rounded to four decimal places, so the rate
# shrinks as the size grows.
def sampling_rate(file_size)
  full_sample_limit = 1024 * 1024
  return 1.0 if file_size < full_sample_limit

  (500 / Math.sqrt(file_size)).round(4)
end

Public Instance Methods

execute()
# File lib/data_kit/csv/schema_analyzer.rb, line 21
# Walks every row yielded by +csv.each_row+ and feeds a random sample of
# them into a SchemaAnalysis.
#
# Every row bumps the analysis' total counter. A row is sampled when a
# fresh random number from Random#rand is <= +sampling_rate+; a sampled
# row bumps the sample counter and each of its values is inserted under
# the stringified header at the same position.
#
# Returns the SchemaAnalysis, or nil when +each_row+ yields no rows
# (the analysis is only created once the first row arrives).
def execute
  analysis = nil
  rng = Random.new

  csv.each_row do |row|
    # Created lazily on the first row; later rows reuse the same object.
    analysis ||= SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
    analysis.increment_total

    if rng.rand <= sampling_rate
      analysis.increment_sample
      row.each_with_index { |value, position| analysis.insert(csv.headers[position].to_s, value) }
    end
  end

  analysis
end