class Parser::Normalization

Attributes

csv_file[RW]
normalized_data[RW]
scrubbed_csv[RW]

Public Class Methods

ascii() click to toggle source
# File lib/truss_parser/parser.rb, line 15
# Produces the "Truss Parser" banner as ASCII art.
#
# The rendering is delegated to the `artii` command-line tool (shipped with
# the artii gem): the command is run in a subshell and everything it writes
# to standard output is returned as a String.
def self.ascii
  %x(artii Truss Parser)
end
new(argv, normalized_data="normalized_data.csv", scrubbed_csv="scrubbed-sample.csv") click to toggle source
# File lib/truss_parser/parser.rb, line 9
# Stores the file names this parser works with.
#
# @param argv [Array<String>] command-line arguments; only the first element
#   is read and kept as the input CSV path (+@csv_file+)
# @param normalized_data [String] name of the file that receives the
#   normalized output
# @param scrubbed_csv [String] name of the intermediate file that holds the
#   encoding-scrubbed copy of the input
def initialize(argv, normalized_data = 'normalized_data.csv', scrubbed_csv = 'scrubbed-sample.csv')
  input_path = argv[0]
  @csv_file = input_path
  @normalized_data, @scrubbed_csv = normalized_data, scrubbed_csv
end
welcome() click to toggle source
# File lib/truss_parser/parser.rb, line 21
# Prints the introductory banner that explains what the parser does.
#
# The text is written with +puts+, so it goes to standard output. It tells
# the user that normalized CSV is emitted on stdout and in
# `normalized_data.csv`, and that unparseable rows are reported on stderr and
# dropped.
#
# The continuation lines of the literal keep their leading spaces on purpose:
# they are part of the printed message layout.
def self.welcome
  puts 'Hello! Welcome to the Truss Parser that normalizes CSV data.
        A normalized CSV formatted file will be outputted on `stdout`.
        You will also see the normalized output as a CSV in `normalized_data.csv`.
        A warning will be piped to `stderr` if there is any unparseable data,
        and its corresponding row will be dropped from your output.'
end

Public Instance Methods

calculate_duration(row, key) click to toggle source
# File lib/truss_parser/parser.rb, line 140
# Converts the colon-separated duration stored under +key+ into seconds and
# writes the result back into the row.
#
# The value is split on ':' and folded left to right, multiplying the running
# total by 60 before adding each field (so "1:02:03" is 1 h, 2 min, 3 s).
# Fields are read with +to_f+, which means non-numeric text counts as 0.0
# instead of raising.
#
# @param row [#[], #[]=] record holding the duration (a CSV row or a Hash)
# @param key [Object] field name of the duration inside +row+
# @return [Numeric] the number of seconds, which is also stored in row[key]
def calculate_duration(row, key)
  total_seconds = 0
  row[key].split(':').each do |field|
    total_seconds = total_seconds * 60 + field.to_f
  end
  row[key] = total_seconds
end
calculate_total_duration(row, foo_duration_seconds, bar_duration_seconds) click to toggle source
# File lib/truss_parser/parser.rb, line 136
# Stores the sum of the two durations in the row's 'totalduration' field.
#
# @param row [#[]=] record that receives the total
# @param foo_duration_seconds [Numeric] first duration, in seconds
# @param bar_duration_seconds [Numeric] second duration, in seconds
# @return [Numeric] the sum that was written to row['totalduration']
def calculate_total_duration(row, foo_duration_seconds, bar_duration_seconds)
  combined_seconds = foo_duration_seconds + bar_duration_seconds
  row['totalduration'] = combined_seconds
end
cleaned_file() click to toggle source
# File lib/truss_parser/parser.rb, line 92
# Loads the input CSV and parses it after repairing its encoding.
#
# The whole file named by +csv_file+ is read into memory, invalid byte
# sequences are replaced via String#scrub, and the repaired text is handed to
# CSV.parse.
#
# @return [Array<Array>] the parsed rows, header row included
def cleaned_file
  raw_text = File.read(csv_file.to_s)
  CSV.parse(raw_text.scrub)
end
drop_unparseable_time(table) click to toggle source
# File lib/truss_parser/parser.rb, line 147
# Removes every row whose durations or timestamp cannot be parsed.
#
# Each row is run through calculate_duration (for :fooduration and
# :barduration, which rewrites those fields in seconds) and format_timestamp.
# When one of them raises ArgumentError a warning describing the row is
# written to STDERR and the row is removed from the table.
#
# Rows are removed with a single +delete_if+ pass. Deleting rows while
# iterating with +each_with_index+ shifts the remaining rows down, so the row
# right after a dropped one would never be checked and the reported indexes
# would drift.
#
# @param table [CSV::Table] table in (or switchable to) row mode; modified
#   in place
# @return [CSV::Table] the same table, without the unparseable rows
def drop_unparseable_time(table)
  # Position of the row in the table as it was passed in; used in the warning.
  index = -1
  table.by_row!.delete_if do |row|
    index += 1
    begin
      calculate_duration(row, :fooduration)
      calculate_duration(row, :barduration)
      format_timestamp(row[:timestamp])
      false
    rescue ArgumentError => e
      STDERR.puts "Warning: Row #{index} will be deleted due to #{e.message}. Did you mistype a timestamp?
                  The following row of data will be dropped:
                  '#{row}' "
      true
    end
  end
end
format_datetime(timestamp) click to toggle source
# File lib/truss_parser/parser.rb, line 132
# Renders a time value as text by delegating to its +iso8601+ method.
#
# @param timestamp [#iso8601] any object that responds to +iso8601+
# @return whatever +timestamp.iso8601+ returns
def format_datetime(timestamp)
  timestamp.iso8601
end
format_timestamp(timestamp) click to toggle source
# File lib/truss_parser/parser.rb, line 119
# Parses a "month/day/2-digit-year hour:minute:second AM/PM" timestamp.
#
# The literal zone name " PST" is appended before parsing, so every input is
# read with that fixed zone regardless of the date.
#
# @param timestamp [#to_s] text such as "4/1/11 11:00:00 AM"
# @return [DateTime] the parsed moment
# @raise [ArgumentError] raised by DateTime.strptime when the text does not
#   match the expected layout
def format_timestamp(timestamp)
  layout = '%m/%d/%y %l:%M:%S %p %Z'
  zoned_text = "#{timestamp} PST"
  DateTime.strptime(zoned_text, layout)
end
generate_scrubbed_csv(arrays) click to toggle source
# File lib/truss_parser/parser.rb, line 79
# Writes the given rows to the scrubbed CSV file, replacing its contents.
#
# The file named by +scrubbed_csv+ is opened in "w+" mode and one CSV line is
# appended to it per element of +arrays+.
#
# @param arrays [Array<Array>] rows to write, each an array of field values
# @return [Array] the value of the +map+ over +arrays+ (one entry per row
#   written)
def generate_scrubbed_csv(arrays)
  destination = scrubbed_csv.to_s
  CSV.open(destination, 'w+') do |writer|
    arrays.map { |fields| writer << fields }
  end
end
normalize() click to toggle source
# File lib/truss_parser/parser.rb, line 47
# Normalizes every row of the scrubbed CSV and appends it to the normalized
# output file, then echoes that file to standard output.
#
# Per row: the timestamp is converted from Pacific to Eastern time, the zip
# code is left-padded to five digits, the full name is upcased, the two
# durations are converted to seconds and summed into 'totalduration'.
#
# The output file is opened once in append mode for the whole run (instead of
# once per row) and the echo uses File.foreach, which closes the file when it
# is done. Opening the output up front also means it exists even when the
# input has no data rows, so the echo step cannot fail on a missing file.
#
# The echo is skipped when the GEM_ENV environment variable is 'TEST'.
#
# @return [nil]
def normalize
  CSV.open(normalized_data.to_s, 'a') do |output|
    CSV.foreach(scrubbed_csv.to_s, headers: true, encoding: 'utf-8') do |row|
      # convert Pacific time to Eastern time && format timestamps in iso8601
      update_timezone(row, 'Pacific Time (US & Canada)', 'Eastern Time (US & Canada)')

      # any zip codes with less than 5 digits get 0's prepended until they
      # are 5 digits long (the string held by the row is changed in place)
      validate_zipcode(row['zip'], row)

      # uppercase all names (also changes the row's string in place)
      upcase_fullname(row['fullname'])

      # NOTE: validate_address and unicode_notes_validation return a new
      # string and their result is not stored, so these two calls do not
      # alter the row. The input was already scrubbed before this step.
      validate_address(row['address'])

      foo_duration_seconds = calculate_duration(row, 'fooduration')
      bar_duration_seconds = calculate_duration(row, 'barduration')
      calculate_total_duration(row, foo_duration_seconds, bar_duration_seconds)

      unicode_notes_validation(row['notes'])

      output << row.fields
    end
  end

  return if ENV['GEM_ENV'] == 'TEST'

  File.foreach(normalized_data.to_s) { |line| puts line }
end
scrub() click to toggle source
# File lib/truss_parser/parser.rb, line 29
# Produces the scrubbed CSV: a copy of the input with broken bytes replaced
# and rows with unparseable times removed.
#
# Steps:
# 1. validate the file-name arguments;
# 2. read the input, repair its encoding and write it to the scrubbed file;
# 3. reload that file as a table and drop rows whose times cannot be parsed;
# 4. write the surviving rows back to the scrubbed file.
#
# The table is loaded with +converters: nil+. CSV.table converts numeric
# looking fields by default, which would rewrite values before they are
# written back (for example a zip code such as "01234" would be read as an
# octal integer).
#
# @return the result of the final generate_scrubbed_csv call
def scrub
  # make sure each file name is a single argument
  validate_args

  # read in the CSV with broken unicode, scrub the broken bytes and
  # generate a new CSV without them
  generate_scrubbed_csv(cleaned_file)

  table = CSV.table(scrubbed_csv.to_s, converters: nil)

  # drop rows with unparseable DateTimes
  drop_unparseable_time(table)

  # Skip rows that ended up without any fields. +reject+ (not +reject!+ on a
  # temporary array) so that the filtered list is what actually gets written.
  remaining_rows = table.to_a.reject(&:empty?)
  generate_scrubbed_csv(remaining_rows)
end
truncate() click to toggle source
# File lib/truss_parser/parser.rb, line 75
# Empties the normalized output file by truncating it to zero bytes.
#
# The file named by +normalized_data+ must already exist; File.truncate
# raises otherwise.
#
# @return [Integer] the value returned by File.truncate
def truncate
  target = normalized_data.to_s
  File.truncate(target, 0)
end
unicode_notes_validation(notes) click to toggle source
# File lib/truss_parser/parser.rb, line 96
# Returns a UTF-8 copy of the notes text with unconvertible content replaced.
#
# The text is transcoded to UTF-16 and back; invalid byte sequences and
# characters undefined in the target encoding are swapped for the Unicode
# replacement character during the first conversion. +nil+ is treated as an
# empty string. The argument itself is not modified.
#
# @param notes [String, nil] free-text notes
# @return [String] the cleaned text, encoded as UTF-8
def unicode_notes_validation(notes)
  text = notes.nil? ? '' : notes
  as_utf16 = text.encode('UTF-16', undef: :replace, invalid: :replace, replace: '�')
  as_utf16.encode('UTF-8')
end
upcase_fullname(fullname) click to toggle source
# File lib/truss_parser/parser.rb, line 101
# Upcases a name in place and returns it.
#
# The string passed in is modified (callers rely on this to update the row
# that owns the string). The name is always returned: String#upcase! yields
# nil when the text is already upper case, so its result must not be used as
# the return value.
#
# @param fullname [String, nil] the name to upcase; +nil+ is tolerated
# @return [String] the upcased name, or an empty string for +nil+
def upcase_fullname(fullname)
  return '' if fullname.nil?

  fullname.upcase!
  fullname
end
update_timezone(row, beginning_tz, result_tz) click to toggle source
# File lib/truss_parser/parser.rb, line 123
# Rewrites the row's timestamp from one time zone into another, formatted by
# format_datetime.
#
# The text is parsed with format_timestamp, then its wall-clock fields are
# interpreted in +beginning_tz+ and converted to +result_tz+. Both zone
# arguments are honoured: the previous implementation parsed with a fixed
# "PST" zone and always converted to the literal 'EST', so +result_tz+ was
# ignored and timestamps during daylight saving time got a standard-time
# offset.
#
# Side effect: Time.zone is set to +beginning_tz+.
#
# @param row [#[], #[]=] record holding the timestamp under 'timestamp'
#   (or :timestamp when the string key is absent)
# @param beginning_tz [String] zone name the stored timestamp is expressed in
# @param result_tz [String] zone name to convert the timestamp to
# @return the formatted timestamp that was written to row['timestamp']
# @raise [ArgumentError] from format_timestamp when the text cannot be parsed
def update_timezone(row, beginning_tz, result_tz)
  Time.zone = beginning_tz

  raw_timestamp = row['timestamp'].nil? ? row[:timestamp] : row['timestamp']
  parsed = format_timestamp(raw_timestamp)

  # Only the wall-clock fields of the parse are used; the zone (and therefore
  # the daylight-saving offset valid on that date) comes from beginning_tz.
  source_time = Time.zone.local(parsed.year, parsed.month, parsed.day,
                                parsed.hour, parsed.min, parsed.sec)

  row['timestamp'] = format_datetime(source_time.in_time_zone(result_tz))
end
validate_address(address) click to toggle source
# File lib/truss_parser/parser.rb, line 164
# Returns a UTF-8 copy of the address with invalid byte sequences replaced.
#
# The text is transcoded to UTF-16 and back; invalid bytes are swapped for
# the Unicode replacement character during the first conversion. +nil+ is
# treated as an empty string. The argument itself is not modified.
#
# @param address [String, nil] the address text
# @return [String] the cleaned text, encoded as UTF-8
def validate_address(address)
  text = address.nil? ? '' : address
  as_utf16 = text.encode('UTF-16', invalid: :replace, replace: '�')
  as_utf16.encode('UTF-8')
end
validate_args() click to toggle source
# File lib/truss_parser/parser.rb, line 85
# Checks that each of the three file names is exactly one whitespace-free
# token, and aborts the program otherwise.
#
# A missing value (+nil+, e.g. when the script is started without an
# argument) counts as invalid and produces the same warning instead of a
# NoMethodError. On failure the warning goes to STDERR and the process exits
# with status 1 so that shells and scripts can detect the error.
#
# @return [nil] when all arguments are valid
# @raise [SystemExit] when any argument is missing or contains whitespace
def validate_args
  file_names = [csv_file, scrubbed_csv, normalized_data]
  return if file_names.all? { |file_name| file_name.to_s.split.length == 1 }

  STDERR.puts "Warning: You did not input parameters correctly. Please try again with one command-line argument."
  exit 1
end
validate_zipcode(zipcode, row) click to toggle source
# File lib/truss_parser/parser.rb, line 106
# Normalizes a zip code in place: strips non-digit characters and left-pads
# the result with zeros to five characters.
#
# The string passed in is modified (callers rely on this to update the row
# that owns the string). When non-digit characters are found a warning that
# includes the row is written to STDERR before they are removed.
#
# Values that are longer than five characters after cleaning are left as
# they are; padding only ever adds characters, so looping "until the length
# is 5" would never finish for them. A +nil+ zip code is ignored.
#
# @param zipcode [String, nil] the zip code to normalize
# @param row [Object] the record the zip code belongs to; only used in the
#   warning text
# @return [String, nil] the normalized zip code (the same object that was
#   passed in), or nil when no zip code was given
def validate_zipcode(zipcode, row)
  return if zipcode.nil?

  if zipcode =~ /\D/
    STDERR.puts "Warning: You have some characters that are not numbers in
    your zipcode '#{zipcode}' in row: '#{row}'.
    For the time being, 0's will replace the non-number characters. "
    # remove non number characters in zipcode string
    zipcode.gsub!(/\D/, '')
  end

  missing_digits = 5 - zipcode.length
  zipcode.prepend('0' * missing_digits) if missing_digits > 0
  zipcode
end