class CsvImportAnalyzer::CsvCheckBounds

Attributes

csv_column_datatypes[RW]
distinct_values[RW]
max_distinct_values[RW]
min_max_bounds[RW]
nullable[RW]
options[RW]

Public Class Methods

new(options = {}) click to toggle source
# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 11
# Builds a bounds checker from an options hash.
#
# The whole hash is kept in @options; :csv_column_datatypes and :nullable
# are additionally copied into their own instance variables. The min/max
# bounds and distinct-value collections both start out as empty hashes.
#
# options - Hash of settings (defaults to an empty hash). :nullable falls
#           back to an empty array when it is missing or nil.
def initialize(options = {})
  @options = options
  @csv_column_datatypes = options[:csv_column_datatypes]
  @nullable = options[:nullable] || []
  @distinct_values = {}
  @min_max_bounds = {}
end

Public Instance Methods

chunk_size() click to toggle source
# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 22
# Looks up the :chunk_size entry of the options hash.
# get_min_max_values passes this on to SmarterCSV as its :chunk_size setting.
def chunk_size
  options[:chunk_size]
end
delimiter() click to toggle source
# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 25
# Looks up the :delimiter entry of the options hash.
# get_min_max_values passes this on to SmarterCSV as its :col_sep setting.
def delimiter
  options[:delimiter]
end
filename() click to toggle source
# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 19
# Looks up the :filename entry of the options hash.
# get_min_max_values uses this as the path of the CSV file to read.
def filename
  options[:filename]
end
get_min_max_values() click to toggle source

Public interface for CsvCheckBounds. Processes the CSV file to find the min and max values and the distinct values for each column.

# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 34
# Public entry point: scans the CSV file and gathers per-column statistics.
#
# Every cell that is not null-like (per null_like?) is fed to
# process_min_max_for_column and process_distinct_values; a column that
# contains a null-like cell is recorded once in +nullable+.
#
# Returns a Hash {:min_max => min_max_bounds, :uniques => distinct_values}
# on success. When the filename is nil a MissingRequiredArguments instance
# is returned, and when the file does not exist a FileNotFound instance is
# returned -- in both cases the object is returned, not raised.
def get_min_max_values
  return MissingRequiredArguments.new("valid filename is required to check bounds") if filename.nil?
  return FileNotFound.new unless File.exist?(filename)

  # SmarterCSV hands the records over in chunks whose size is chosen by the
  # user, e.g. :chunk_size => 200 yields 200 rows at a time.
  csv_settings = {
    :col_sep => delimiter,
    :chunk_size => chunk_size,
    :remove_empty_values => false,
    :remove_zero_values => false
  }

  SmarterCSV.process(filename, csv_settings) do |rows|
    rows.each do |record|
      record.each do |column, cell|
        if null_like?(cell)
          nullable.push(column) unless nullable.include?(column)
        else
          process_min_max_for_column(column, cell)
          process_distinct_values(column, cell)
        end
      end
    end
  end

  {:min_max => min_max_bounds, :uniques => distinct_values}
end

Private Instance Methods

add_bounds(key, value) click to toggle source

Decides the min and max values for each key. Compares lengths if the key's column is of String type; compares the values themselves if the column is of Numeric or Datetime type.

# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 84
# Widens the recorded min/max bounds of column +key+ so they include +value+.
#
# For columns typed :string the bounds track lengths, so the length of the
# value's string form is compared. Using +to_s+ means a cell that reached us
# as a non-String (for example a number) still counts instead of failing on
# a missing #length and being silently skipped. For every other datatype the
# value itself is compared against the stored bounds.
#
# key   - column identifier used to index min_max_bounds and
#         csv_column_datatypes.
# value - the cell value to fold into the bounds.
#
# Comparison failures (mismatched types, or no bounds entry for +key+) are
# deliberately swallowed and leave the bounds as they are.
def add_bounds(key, value)
  bounds = min_max_bounds[key]
  candidate = csv_column_datatypes[key] == :string ? value.to_s.length : value
  bounds[:min] = candidate if candidate < bounds[:min]
  bounds[:max] = candidate if candidate > bounds[:max]
rescue ArgumentError, NoMethodError
  ###
  # TODO: Handle csv parse conversions of datatypes
  ###
end
process_distinct_values(key, value) click to toggle source

Collects the distinct values seen for each column, limited by the configured maximum number of distinct values.

# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 103
def process_distinct_values(key, value)
  if distinct_values[key].nil?
    distinct_values[key] = [value]
  else
    if distinct_values[key].size > max_distinct_values
    else
      distinct_values[key].push(value) unless distinct_values[key].include?(value)
    end
  end
end
process_min_max_for_column(key, value) click to toggle source

If the key's column is of String type, the min and max lengths of its values are tracked. Any other datatype has a min and max range of the values themselves.

# File lib/csv-import-analyzer/analyzer/csv_check_bounds.rb, line 68
# Makes sure column +key+ has a bounds entry, then folds +value+ into it via
# add_bounds.
#
# The first value seen for a column seeds both :min and :max. Columns typed
# :string track lengths, so the seed is the length of the value's string
# form; +to_s+ keeps a non-String cell (for example a number) from raising
# NoMethodError on #length. Every other datatype is seeded with the value
# itself.
#
# key   - column identifier used to index min_max_bounds and
#         csv_column_datatypes.
# value - the cell value being processed.
def process_min_max_for_column(key, value)
  if min_max_bounds[key].nil?
    seed = csv_column_datatypes[key] == :string ? value.to_s.length : value
    min_max_bounds[key] = {:min => seed, :max => seed}
  end
  add_bounds(key, value)
end