class CsvMigration

Description: Parse and test data from a csv file.

Public Class Methods

new(file_name:, file_path: '', delimiter: ';', cli: false) click to toggle source

@param [String] file_name with extension (my_file.csv) @param [String] file_path path to file. By default current folder @param [String] delimiter for parsing, by default = ';' @param [String] cli mode. If true then class will use only for command line. By default false

# File lib/csv_migration.rb, line 9
def initialize(file_name:, file_path: '', delimiter: ';', cli: false)
  # File name for parsing in csv format
  @file_name_csv = file_name
  @file_path = file_path
  @delimiter = delimiter
  @cli = cli

  @file_name = @file_name_csv.split('.csv').first

  # File for export correct data from the base file
  @correct_file_data_csv = File.expand_path("#{@file_path}v_parser_correct_#{@file_name}.csv")
  @errors_log = File.expand_path("#{@file_path}v_parser_errors_#{@file_name}.log")
  @duplicates_log = File.expand_path("#{@file_path}v_parser_duplicates_#{@file_name}.log")
  @not_saved_file_data_errors = File.expand_path("#{@file_path}v_parser_not_saved_#{@file_name}.log")

  # Parsing file
  @file_for_parsing = File.expand_path(@file_path + @file_name_csv)

  # Remove old files
  remove_old_files

  # Count rows in the file without header
  @count_file_lines = `wc -l #{@file_for_parsing}`.split[0].to_i - 1

  @line_num = 0
  @counter_good_records = 0
  @counter_duplicates = 0

  # Raw data from a file without header
  @file_raw_data = []

  # Data after parsing
  @parsed_data = []

  # Header fields from csv file
  @parsing_file_header = []

  # Error statuses
  @errors = {}

  # Errors data
  @errors_data = {}

  # Duplicates records
  @duplicates = {}

  # Errors creating records from the file
  @not_saved_records = []

  # Relation of header name from the file with a specific field name of a table
  #
  # Key: column name in the csv file
  # Value:
  #         field: a field name of a table in a DB (symbol)
  #         require: a field should not be empty (true/false)
  #         replace: need to use @replace_dict ( @replace_dict = { 'what need replace' => 'replace to this' } ) (true/false)
  #         prefix: need to add value as a prefix from a field header name (header name from CSV file) (string)
  #         validate: callback method which necessary call for validating a specific format (symbol)
  #         is_empty: array with fields where need to search data if a value is empty (field name from CSV file header) (array of strings)
  #         default: a value which need set by default in any case (any type)
  #         callback: callback method which necessary call for creating a specific format (symbol)
  @ref_csv_head_from_file = {}

  # Dictionary with fields names from the @ref_csv_head_from_file where need to search duplicates
  @duplicates_dict = %i[]

  # Dictionary for replace a key word to a value word: 'hallo' => 'Hello'
  @replace_dict = {}
end

Public Instance Methods

call() click to toggle source

Start parsing

# File lib/csv_migration.rb, line 80
def call
  puts "Start parse file #{@file_for_parsing}"

  # Read line from csv file
  File.foreach(@file_for_parsing) do |line|
    data = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').chomp.split(@delimiter).map(&:strip)

    if @line_num.zero?
      @parsing_file_header = data.map(&:downcase)
      @line_num += 1
      next
    end

    @file_raw_data << line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').chomp

    check = check_require_fields(data)

    unless check[:status]
      @line_num += 1
      puts "Incorrect data! Required field: #{check[:error]} is empty!"
      next
    end

    records = find_data_from_csv(data, @ref_csv_head_from_file)

    @parsed_data << { id: @line_num - 1 }.merge(records)

    puts "Parse left #{@count_file_lines - @line_num} lines"
    @line_num += 1
    @counter_good_records += 1
  end

  duplicates_id_list = check_duplicates
  remove_duplicates(duplicates_id_list) if duplicates_id_list.any?

  save_errors

  create_file_without_errors

  double_duplicates = @counter_good_records + @errors.values.sum + @counter_duplicates - @line_num - 1

  puts
  puts "Testing data was finished. All records in the file (without header): #{@line_num - 1}"
  puts "Good records: #{@counter_good_records}"
  puts "Bad records: #{@errors.values.sum}"
  puts "Duplicate records: #{@counter_duplicates}"
  puts "Duplicates more than one field: #{double_duplicates}" if double_duplicates.positive?
  puts "Successfully parsed records: #{@parsed_data.size}"

  error_actions if !@errors.values.sum.zero? || !@counter_duplicates.zero?

  create_data_to_db

  save_record_errors_to_file if @not_saved_records.any?
end
remove_old_files() click to toggle source

Remove old files

# File lib/csv_migration.rb, line 137
def remove_old_files
  File.delete(@errors_log) if File.exist?(@errors_log)
  File.delete(@duplicates_log) if File.exist?(@duplicates_log)
  File.delete(@correct_file_data_csv) if File.exist?(@correct_file_data_csv)
  File.delete(@not_saved_file_data_errors) if File.exist?(@not_saved_file_data_errors)
end

Private Instance Methods

add_record_to_db(_record) click to toggle source

Create new record in the DB

@param [Hash] _record in specific hash format

# File lib/csv_migration.rb, line 457
def add_record_to_db(_record)
  raise 'You should make realization callback method add_record_to_db(record)'

  # Search data of model if is necessary
  # user = User.find_by(email: record[:email].downcase)
  #
  # if user.nil?
  #   save_error(record, "User has not been found in the DB by email")
  #   next
  # end
end
blank?(var) click to toggle source

Checking variable on is nil or on is empty

@param [Any] var variable for check

# File lib/csv_migration.rb, line 149
def blank?(var)
  var.nil? || var.empty?
end
check_duplicates() click to toggle source

Search all duplicate records and saving it to a log file

# File lib/csv_migration.rb, line 325
def check_duplicates
  return [] if @parsed_data.size.zero? || @duplicates_dict.size.zero?

  id_list = []

  puts 'Start search duplicates...'

  @parsed_data.each do |row|
    id = row[:id]
    line = row.clone

    @duplicates_dict.each do |duplicate|
      next unless present?(line[duplicate])

      unless @duplicates.key?(line[duplicate])
        @duplicates = @duplicates.deep_merge(line[duplicate] => { id: [], field: duplicate, value: line[duplicate], data: [] })
      end

      @duplicates[line[duplicate]][:id] << id
      @duplicates[line[duplicate]][:data] << @file_raw_data[id]

      puts "Check line ##{id}"
    end
  end

  @duplicates = @duplicates.select { |_k, v| v[:data].size > 1 && (v[:value] != 'NULL' || blank?(v[:value])) }

  if @duplicates.any?
    file_duplicate = File.open(@duplicates_log, 'w')
    file_duplicate.puts @parsing_file_header.join(';')

    @duplicates.each do |_key, value|
      @counter_duplicates += value[:data].size

      file_duplicate.puts
      file_duplicate.puts "Duplicate field: #{value[:field]}, value: #{value[:value]}"
      file_duplicate.puts
      value[:data].each do |record|
        file_duplicate.puts record
      end

      id_list << value[:id]
    end

    file_duplicate.close
  end

  id_list.flatten.uniq
end
check_field(data, key, value) click to toggle source

Checking specific field which should be present

@param [Array] data list of values from a file @param [String] key header field name in a file @param [Object] value hash data from dict

# File lib/csv_migration.rb, line 208
def check_field(data, key, value)
  if @parsing_file_header.find_index(key).nil?
    if @cli
      puts "Please, correct settings in the @ref_csv_head_from_file hash. Key #{key} has not been found in the header of #{@file_name_csv} file!"
      exit
    else
      raise ArgumentError, "Please, correct settings in the @ref_csv_head_from_file hash. Key #{key} has not been found in the header of #{@file_name_csv} file!"
    end
  end

  if value[:require] && blank?(data[@parsing_file_header.find_index(key)])
    check = check_is_empty_field(key, data, value)

    return true if check && validate_field(data_value: check, value: value)

    @errors[value[:field]] = @errors[value[:field]].nil? ? 1 : @errors[value[:field]] + 1
    @errors_data[value[:field]] = [] unless present?(@errors_data[value[:field]])
    @errors_data[value[:field]] << [data.join(';')]

    return false
  end

  unless validate_field(data_value: data[@parsing_file_header.find_index(key)], value: value)
    @errors[value[:field]] = @errors[value[:field]].nil? ? 1 : @errors[value[:field]] + 1
    @errors_data[value[:field]] = [] unless present?(@errors_data[value[:field]])
    @errors_data[value[:field]] << [data.join(';')]

    return false
  end

  true
end
check_is_empty_field(key, data, field) click to toggle source

Check all fields on is empty

@param [String] key searched key @param [Array] data list with data from file @param [Hash] field Hash data from dict

# File lib/csv_migration.rb, line 259
def check_is_empty_field(key, data, field)
  return false unless present?(field[:is_empty])

  find_value_in_other_fields(key, data, field)
end
check_require_fields(data) click to toggle source

Check fields which should be present

@param [Array] data list of values

# File lib/csv_migration.rb, line 195
def check_require_fields(data)
  @ref_csv_head_from_file.each do |key, value|
    return { error: value[:field], status: false } unless check_field(data, key.downcase, value)
  end

  { status: true }
end
create_data_to_db() click to toggle source

Create new data in the DB This method get @parsed_data and call in loop create_data_to_db method

# File lib/csv_migration.rb, line 437
def create_data_to_db
  @parsed_data.each do |record|
    add_record_to_db(record)
  end

  show_finished_test
end
create_file_without_errors() click to toggle source

Create new csv export file without errors and duplicates

# File lib/csv_migration.rb, line 410
def create_file_without_errors
  file_export = File.open(@correct_file_data_csv, 'w')
  file_export.puts @parsing_file_header.join(';')

  @parsed_data.each do |value|
    file_export.puts @file_raw_data[value[:id]]
  end

  file_export.close
end
email_lowercase(value:, header_name:, prev_value:, field_data:) click to toggle source

Callback for lowercase data

@param [String] value Data from the CSV file after all manipulation (replace, prefix, etc) @param [String] header_name Header name a the CSV file @param [String] prev_value Data from the CSV file before all manipulation (replace, prefix, etc) @param [Hash] field_data Settings for a specific field from @ref_csv_head_from_file

# File lib/csv_migration.rb, line 177
def email_lowercase(value:, header_name:, prev_value:, field_data:)
  value.downcase
end
email_validate(value) click to toggle source

Callback for validating specific format data

@param [String] value data for validate

# File lib/csv_migration.rb, line 184
def email_validate(value)
  check = value.match?(/\A[\w+\-.]+@[a-z\d\-.]+\.[a-z]+\z/i)

  puts "Email Error #{value}" unless check

  check
end
error_actions() click to toggle source

Question action before saving data if exist errors

# File lib/csv_migration.rb, line 161
def error_actions
  return unless @cli

  print 'This file has errors. Do you want to save data without errors Y/n: '
  respond = STDIN.gets.chomp

  error_actions unless respond.casecmp('y').zero? || respond.casecmp('n').zero? || blank?(respond)
  exit if respond.casecmp('n').zero?
end
find_data_from_csv(data, object_dict) click to toggle source

Find data from a CSV file

@param [Array] data from file (read one line) @param [Hash] object_dict hash dict for creating data in specific format

# File lib/csv_migration.rb, line 286
def find_data_from_csv(data, object_dict)
  new_data = {}
  object_dict.each do |key, value|
    field_name = value[:field]

    prev_field_data = present?(new_data[field_name]) ? new_data[field_name] : nil

    new_data[field_name] = find_value_in_other_fields(key.downcase, data, value) if value[:require]
    new_data[field_name] = data[@parsing_file_header.find_index(key.downcase)]&.strip unless value[:require]

    if value[:prefix]
      prefix_value = data[@parsing_file_header.find_index(value[:prefix])]&.strip
      new_data[field_name] = prefix_value + ' ' + new_data[field_name] unless blank?(prefix_value)
    end

    new_data[field_name] = value[:default] if value[:default]
    new_data[field_name] = value[:set_is_empty] if blank?(new_data[field_name]) && value.key?(:set_is_empty)
    new_data[field_name] = replace_by_dict(new_data[field_name]) if value[:replace]

    if value[:callback] && respond_to?(value[:callback], true)
      new_data[field_name] = method(value[:callback].to_sym)
                               .call(value: new_data[field_name], header_name: key.downcase, prev_value: prev_field_data, field_data: value)
    end
  end

  new_data
end
find_value_in_other_fields(key, data, field) click to toggle source

Find value in other fields which was set for search

@param [String] key searched key @param [Array] data list with data from file @param [Hash] field Hash data from dict

# File lib/csv_migration.rb, line 270
def find_value_in_other_fields(key, data, field)
  return data[@parsing_file_header.find_index(key)] unless blank?(data[@parsing_file_header.find_index(key)])

  return false unless field[:is_empty]

  field[:is_empty].each do |value|
    return data[@parsing_file_header.find_index(value.downcase)] unless blank?(data[@parsing_file_header.find_index(value.downcase)])
  end

  false
end
present?(var) click to toggle source

Checking variable on if exist

@param [Any] var variable for check

# File lib/csv_migration.rb, line 156
def present?(var)
  !blank?(var)
end
remove_duplicates(id_list) click to toggle source

Remove duplicate records from parsed data

@param [Array] id_list list duplicates

# File lib/csv_migration.rb, line 378
def remove_duplicates(id_list)
  @counter_good_records -= id_list.size

  @parsed_data = @parsed_data.reject { |value| id_list.include?(value[:id]) }
end
replace_by_dict(string) click to toggle source

Replace text by dict @replace_dict

# File lib/csv_migration.rb, line 315
def replace_by_dict(string)
  @replace_dict.each do |key, value|
    next if blank?(string)
    return value if key.casecmp(string).zero?
  end

  string
end
save_error(record, error_text) click to toggle source

Add found error to errors data

@param [Hash] record parsed data @param [String] error_text message for error

# File lib/csv_migration.rb, line 425
def save_error(record, error_text)
  @not_saved_records << {
    raw: @file_raw_data[record[:id]],
    data: record,
    error: error_text
  }

  puts error_text
end
save_errors() click to toggle source

Save errors data to a log file

# File lib/csv_migration.rb, line 385
def save_errors
  errors = lambda do |errors_data|
    file_error = File.open(@errors_log, 'w')
    file_error.puts @parsing_file_header.join(';') unless errors_data.size.zero?

    errors_data.each do |key, value|
      file_error.puts
      file_error.puts ' ' * 10 + "#{key.capitalize}:"
      file_error.puts
      value.each do |data|
        file_error.puts data
      end
    end

    file_error.close
  end

  errors.call(@errors_data) if @errors_data.any?

  puts
  puts "Errors: #{@errors}" if @errors.any?
  puts
end
save_record_errors_to_file() click to toggle source

Save all records with errors to a file

# File lib/csv_migration.rb, line 470
def save_record_errors_to_file
  errors = lambda do |errors_data|
    file_error = File.open(@not_saved_file_data_errors, 'w')

    errors_data.each do |value|
      file_error.puts value
    end

    file_error.close
  end

  errors.call(@not_saved_records) if @not_saved_records.any?
end
show_finished_test() click to toggle source

Show text in the console after migration

# File lib/csv_migration.rb, line 446
def show_finished_test
  puts
  puts 'Migration was finished.'
  puts "Total records for insert: #{@parsed_data.size}"
  puts "Saved records: #{@parsed_data.size - @not_saved_records.size}"
  puts "Not saved records: #{@not_saved_records.size}. See log with errors: #{@not_saved_file_data_errors}"
end
validate_field(data_value:, value:) click to toggle source

Validate field if exist validation callback

@param [String] data_value value from file @param [Hash] value hash data from dict

# File lib/csv_migration.rb, line 245
def validate_field(data_value:, value:)

  return true unless present?(value[:validate])

  return method(value[:validate].to_sym).call(data_value) if respond_to?(value[:validate], true)

  true
end