class Parser::Normalization
Attributes
csv_file[RW]
normalized_data[RW]
scrubbed_csv[RW]
Public Class Methods
ascii()
click to toggle source
# File lib/truss_parser/parser.rb, line 15
# Shells out to the `artii` command-line tool (the artii gem creates ASCII
# text) and returns whatever that command prints for "Truss Parser".
def self.ascii
  %x(artii Truss Parser)
end
new(argv, normalized_data="normalized_data.csv", scrubbed_csv="scrubbed-sample.csv")
click to toggle source
# File lib/truss_parser/parser.rb, line 9
# Remembers the input path and the two working-file paths.
#
# argv            - argument list; only its first element is read and kept
#                   as the input CSV path.
# normalized_data - path stored as the normalized output file.
# scrubbed_csv    - path stored as the intermediate scrubbed file.
def initialize(argv, normalized_data = "normalized_data.csv", scrubbed_csv = "scrubbed-sample.csv")
  input_path = argv[0]

  @csv_file = input_path
  @normalized_data = normalized_data
  @scrubbed_csv = scrubbed_csv
end
welcome()
click to toggle source
# File lib/truss_parser/parser.rb, line 21
# Prints the one-paragraph welcome/usage banner to standard output.
def self.welcome
  # Adjacent literals are concatenated by the parser, so this is a single
  # string; the split is only to keep source lines readable.
  puts 'Hello! Welcome to the Truss Parser that normalizes CSV data. ' \
       'A normalized CSV formatted file will be outputted on `stdout`. ' \
       'You will also see the normalized output as a CSV in `normalized_data.csv`. ' \
       'A warning will be piped to `stderr` if there is any unparseable data, ' \
       'and its corresponding row will be dropped from your output.'
end
Public Instance Methods
calculate_duration(row, key)
click to toggle source
# File lib/truss_parser/parser.rb, line 140
# Converts the colon-separated duration stored at row[key] (for example
# "1:23:32.123") into a number of seconds, writes that number back into
# row[key], and returns it.
#
# row - any object supporting [] and []= for the given key.
# key - the column to convert.
#
# Raises ArgumentError when the cell is nil/empty or when any component is
# not a valid number. The previous implementation used String#to_f, which
# silently maps garbage to 0.0, so the `rescue ArgumentError` in
# drop_unparseable_time could never trigger for a malformed duration.
def calculate_duration(row, key)
  # to_s lets cells that are already numeric be handled like their string
  # form instead of failing on a missing #split.
  components = row[key].to_s.split(':')
  raise ArgumentError, "unparseable duration '#{row[key]}'" if components.empty?

  # Each step shifts the running total up one base-60 place, then adds the
  # next component; Float() raises ArgumentError on non-numeric text.
  duration_seconds = components.reduce(0) { |total, part| total * 60 + Float(part) }
  row[key] = duration_seconds
end
calculate_total_duration(row, foo_duration_seconds, bar_duration_seconds)
click to toggle source
# File lib/truss_parser/parser.rb, line 136
# Adds the two durations, stores the sum under row['totalduration'], and
# returns the sum.
def calculate_total_duration(row, foo_duration_seconds, bar_duration_seconds)
  combined = foo_duration_seconds + bar_duration_seconds
  row['totalduration'] = combined
end
cleaned_file()
click to toggle source
# File lib/truss_parser/parser.rb, line 92
# Reads the file named by csv_file, scrubs invalid byte sequences from the
# text (String#scrub), and returns the result of CSV.parse on it.
def cleaned_file
  raw_text = File.read("#{csv_file}")
  scrubbed_text = raw_text.scrub
  CSV.parse(scrubbed_text)
end
drop_unparseable_time(table)
click to toggle source
# File lib/truss_parser/parser.rb, line 147
# Validates every row of `table` and removes the rows whose durations or
# timestamp cannot be parsed, printing one warning per dropped row to STDERR.
#
# For each row it calls calculate_duration on :fooduration and :barduration
# (which rewrites those cells in place) and format_timestamp on :timestamp.
# A row is dropped when any of those calls raises ArgumentError.
#
# Returns the table.
#
# The earlier version deleted rows from the table while still iterating it,
# so the row immediately after every dropped row was skipped and never
# validated. Bad rows are now collected first and removed in one pass after
# iteration has finished.
def drop_unparseable_time(table)
  table.by_row!
  unparseable = []

  table.each_with_index do |row, i|
    begin
      calculate_duration(row, :fooduration)
      calculate_duration(row, :barduration)
      format_timestamp(row[:timestamp])
    rescue ArgumentError => e
      STDERR.puts "Warning: Row #{i} will be deleted due to #{e.message}. Did you mistype a timestamp? The following row of data will be dropped: '#{row}' "
      unparseable << row
    end
  end

  # Compare by identity so that only the exact row objects that failed are
  # removed.
  table.delete_if { |row| unparseable.any? { |bad| bad.equal?(row) } }
  table
end
format_datetime(timestamp)
click to toggle source
# File lib/truss_parser/parser.rb, line 132
# Returns whatever the given object's #iso8601 produces.
def format_datetime(timestamp)
  iso_text = timestamp.iso8601
  iso_text
end
format_timestamp(timestamp)
click to toggle source
# File lib/truss_parser/parser.rb, line 119
# Parses a "month/day/two-digit-year hour:minute:second AM|PM" timestamp
# into a DateTime, with the zone abbreviation "PST" appended before parsing.
#
# NOTE(review): the zone is always the literal "PST"; whether inputs that
# fall in daylight-saving time should be read differently cannot be told
# from this method alone - confirm against the data's origin.
def format_timestamp(timestamp)
  pattern = '%m/%d/%y %l:%M:%S %p %Z'
  with_zone = "#{timestamp} PST"
  DateTime.strptime(with_zone, pattern)
end
generate_scrubbed_csv(arrays)
click to toggle source
# File lib/truss_parser/parser.rb, line 79
# Opens the file named by scrubbed_csv in "w+" mode and appends every
# element of `arrays` to it as one CSV row.
def generate_scrubbed_csv(arrays)
  destination = "#{scrubbed_csv}"
  CSV.open(destination, "w+") do |writer|
    arrays.map { |fields| writer << fields }
  end
end
normalize()
click to toggle source
# File lib/truss_parser/parser.rb, line 47
# Reads the scrubbed CSV row by row (first line treated as headers), applies
# each per-column normalization step to the row, appends the row's fields to
# the normalized output file, and finally echoes that file line by line to
# standard output unless ENV['GEM_ENV'] is 'TEST'.
#
# Changes from the earlier version:
# * the output file is opened once for the whole run instead of being
#   reopened in append mode for every row;
# * the final read goes through File.foreach, which closes the file when
#   enumeration ends (the bare File.open left the handle open).
def normalize
  CSV.open("#{normalized_data}", 'a') do |output|
    CSV.foreach("#{scrubbed_csv}", headers: true, encoding: "utf-8") do |row|
      # convert PST to EST && format timestamps in iso8601
      update_timezone(row, 'Pacific Time (US & Canada)', 'Eastern Time (US & Canada)')

      # any zip codes with less than 5 digits, prepend 0's to them until they are 5 digits long
      validate_zipcode(row['zip'], row)

      # uppercase all names
      upcase_fullname(row['fullname'])

      # pass address column as is, validate everything is valid unicode
      # else, replace with Unicode Replacement Character
      # NOTE(review): the return value is not written back to the row here;
      # confirm whether that is intended.
      validate_address(row['address'])

      foo_duration_seconds = calculate_duration(row, 'fooduration')
      bar_duration_seconds = calculate_duration(row, 'barduration')
      calculate_total_duration(row, foo_duration_seconds, bar_duration_seconds)

      # NOTE(review): as with the address, the result is discarded here.
      unicode_notes_validation(row['notes'])

      output << row.fields
    end
  end

  File.foreach("#{normalized_data}").map { |line| puts line if ENV['GEM_ENV'] != 'TEST' }
end
scrub()
click to toggle source
# File lib/truss_parser/parser.rb, line 29
# Produces the scrubbed CSV file:
#   1. validates the configured paths (validate_args),
#   2. reads the input with broken bytes scrubbed (cleaned_file) and writes
#      it out (generate_scrubbed_csv),
#   3. reloads it as a table, drops rows with unparseable durations or
#      timestamps (drop_unparseable_time),
#   4. rewrites the scrubbed file from the remaining non-empty rows.
#
# The earlier version called reject! on a throwaway `table.to_a` and then
# wrote a fresh, unfiltered `table.to_a`, so its empty-row filter had no
# effect. The filtered array is now the one that gets written.
def scrub
  # make sure the configured paths are each a single token
  validate_args

  # read in the CSV, scrubbing broken bytes, and write it back out
  generate_scrubbed_csv(cleaned_file)

  table = CSV.table("#{scrubbed_csv}")

  # drop rows with unparseable DateTimes
  drop_unparseable_time(table)

  # table.to_a yields plain arrays, for which empty? is the relevant check
  remaining_rows = table.to_a.reject(&:empty?)
  generate_scrubbed_csv(remaining_rows)
end
truncate()
click to toggle source
# File lib/truss_parser/parser.rb, line 75
# Truncates the file named by normalized_data to zero bytes.
def truncate
  target = "#{normalized_data}"
  File.truncate(target, 0)
end
unicode_notes_validation(notes)
click to toggle source
# File lib/truss_parser/parser.rb, line 96
# Returns a UTF-8 copy of `notes` obtained by round-tripping through UTF-16,
# with invalid and undefined sequences replaced by '�'. A nil argument is
# treated as the empty string. The argument itself is not modified.
def unicode_notes_validation(notes)
  text = notes.nil? ? '' : notes
  as_utf16 = text.encode('UTF-16', invalid: :replace, undef: :replace, replace: '�')
  as_utf16.encode('UTF-8')
end
upcase_fullname(fullname)
click to toggle source
# File lib/truss_parser/parser.rb, line 101
# Upcases `fullname` in place (nil is treated as an empty string).
# Returns the upcased string, or nil when upcase! made no change.
def upcase_fullname(fullname)
  name = fullname.nil? ? '' : fullname
  name.upcase!
end
update_timezone(row, beginning_tz, result_tz)
click to toggle source
# File lib/truss_parser/parser.rb, line 123
# Replaces row['timestamp'] with an ISO 8601 string: the timestamp is parsed
# by format_timestamp, converted with in_time_zone('EST'), and formatted by
# format_datetime. The source value is row['timestamp'], falling back to
# row[:timestamp] when the string key yields nil.
#
# Also assigns Time.zone = beginning_tz as a side effect.
#
# NOTE(review): result_tz is accepted but never read; the target zone is the
# literal 'EST'. Confirm the intended behaviour before wiring it in.
def update_timezone(row, beginning_tz, result_tz)
  Time.zone = beginning_tz

  raw_value = row['timestamp'].nil? ? row[:timestamp] : row['timestamp']
  converted = format_timestamp(raw_value).in_time_zone('EST')

  row['timestamp'] = format_datetime(converted)
end
validate_address(address)
click to toggle source
# File lib/truss_parser/parser.rb, line 164
# Returns a UTF-8 copy of `address` obtained by round-tripping through
# UTF-16, with invalid byte sequences replaced by '�'. A nil argument is
# treated as the empty string. The argument itself is not modified.
def validate_address(address)
  text = address.nil? ? '' : address
  as_utf16 = text.encode('UTF-16', invalid: :replace, replace: '�')
  as_utf16.encode('UTF-8')
end
validate_args()
click to toggle source
# File lib/truss_parser/parser.rb, line 85
# Checks that csv_file, scrubbed_csv and normalized_data each consist of
# exactly one whitespace-separated token. If any does not, prints a warning
# to STDERR and exits the process. Returns nil when everything is fine.
#
# Values are passed through to_s first: csv_file comes from argv[0] and is
# nil when no argument was supplied, which previously raised NoMethodError
# on nil.split instead of printing the warning.
def validate_args
  paths = [csv_file, scrubbed_csv, normalized_data]
  return if paths.all? { |path| path.to_s.split.length == 1 }

  STDERR.puts "Warning: You did not input parameters correctly. Please try again with one command-line argument."
  exit
end
validate_zipcode(zipcode, row)
click to toggle source
# File lib/truss_parser/parser.rb, line 106
# Normalizes `zipcode` IN PLACE: non-digit characters are stripped (with a
# warning on STDERR that includes `row`), and the result is left-padded
# with '0' up to five characters.
#
# zipcode - a mutable String, or nil (left untouched).
# row     - only interpolated into the warning message.
#
# Returns the (mutated) zipcode, or nil when zipcode is nil.
#
# The earlier loop, `until zipcode.length == 5`, never terminated for values
# longer than five digits because prepending only makes them longer. Values
# that already have five or more digits are now left at their length.
def validate_zipcode(zipcode, row)
  # Nothing to mutate; the earlier version raised NoMethodError here.
  return if zipcode.nil?

  if /\D/ =~ zipcode
    STDERR.puts "Warning: You have some characters that are not numbers in your zipcode '#{zipcode}' in row: '#{row}'. For the time being, 0's will replace the non-number characters. "
    # remove non number characters in zipcode string
    zipcode.gsub!(/\D/, '')
  end

  missing = 5 - zipcode.length
  zipcode.prepend('0' * missing) if missing.positive?
  zipcode
end