module KualiCo::ETL

KualiCo extract, transform and load methods.

Public Class Methods

encode(str, opt={ encoding: "UTF-8" } ) click to toggle source

Encodes the input String and replaces invalid or undefined characters. @param [String] str the String to be encoded and invalid characters replaced with valid characters. @option opt [String] :encoding the character encoding to use. @return [String] the result of encoding the String and replacing invalid characters with valid characters. @see String#encode

# File lib/kuali_toolbox/etl.rb, line 108
def self.encode(str, opt={ encoding: "UTF-8" } )
  opt[:encoding] = "UTF-8" if opt[:encoding].nil?
  str.encode( opt[:encoding], :invalid => :replace,
              :undef => :replace, :replace => "" )
end
error(e) click to toggle source

Prepares an Exception for consistent error handling. @param [String, Exception] e the error to handle @return [Exception] an Exception with a message formatted with $INPUT_LINE_NUMBER. @raise [ArgumentError] if an invalid argument is passed.

# File lib/kuali_toolbox/etl.rb, line 30
def self.error(e)
  if e.kind_of? String
    # default to TextParseError
    return TextParseError.new "ERROR: Line #{$INPUT_LINE_NUMBER}: #{e}"
  end
  if e.kind_of? Exception
    return e.exception "ERROR: Line #{$INPUT_LINE_NUMBER}: #{e}"
  end
  raise ArgumentError, "Unsupported error type: #{e.class}"
end
mutate_sql_stmt!(insert_str, column_name, values_str, value) click to toggle source

Mutates two sides of a SQL insert statement: insert_str and values_str with column_name and value respectively. Proper SQL value quoting will be performed based on object type. @param [String] insert_str the left side of the insert statement (i.e. columns) @param [String] column_name the column name to append to insert_str. @param [String] values_str the right side of the insert statement (i.e. values) @param [Object] value the value to append to values_str. Must respond to to_s. @return [void]

# File lib/kuali_toolbox/etl.rb, line 133
def self.mutate_sql_stmt!(insert_str, column_name, values_str, value)
  insert_str.concat "#{column_name.upcase},"
  # TODO what are all of the valid types that should not be quoted?
  if value.kind_of? Integer
    values_str.concat "#{value},"
  else
    values_str.concat "'#{value}',"
  end
  return nil
end
parse_boolean(str, opt={}) click to toggle source

Matches the input against a set of well known boolean patterns. @param [String] str String to be matched against well known boolean patterns. @option opt [Boolean] :default the default return value if str is empty. @return [Boolean] the result of matching the str input against well known boolean patterns. @raise [TextParseError] if none of the known boolean patterns could be matched.

# File lib/kuali_toolbox/etl.rb, line 88
def self.parse_boolean(str, opt={})
  return true  if str == true
  return false if str == false
  b = parse_string str, opt
  return true  if b =~ /^(active|a|true|t|yes|y|1)$/i
  return false if b =~ /^(inactive|i|false|f|no|n|0)$/i
  if b.empty? && !opt[:default].nil?
    return opt[:default]
  end
  if b.empty?
    return nil
  end
  raise KualiCo::ETL::error TextParseError.new "invalid value for Boolean: '#{str}'"
end
parse_csv_command_line_options( executable, args, opt={ csv_options: { headers: :first_row, header_converters: :symbol, skip_blanks: true, col_sep: ",", quote_char: '"' } } ) click to toggle source

Parse common command line options for CSV –> SQL transformations. @param [String] executable the name of the script from which we are executing. See example. @param [Array<String>] args the command line args. @option opt [String] :csv_filename the input file from which the CSV will be read.

Defaults to the first element of args Array.

@option opt [String] :sql_filename the output file to which the SQL will be written. @option opt [Hash] :csv_options the options that will be used by the CSV parser. @return [Hash] a Hash containing the parsed command line results. @example The most common usage:

opt = KualiCo::ETL.parse_csv_command_line_options (File.basename $0), ARGF.argv
# File lib/kuali_toolbox/etl.rb, line 354
def self.parse_csv_command_line_options(
    executable, args, opt={ csv_options: { headers: :first_row,
                                           header_converters: :symbol,
                                           skip_blanks: true,
                                           col_sep: ",",
                                           quote_char: '"'
                                           }
                            } )
  optparse = OptionParser.new do |opts|
    opts.banner = "Usage: #{executable} [options] csv_file"
    opts.on( '-o' ,'--output SQL_FILE_OUTPUT', 'The file the SQL data will be writen to... (defaults to <csv_file>.sql)') do |f|
      opt[:sql_filename] = f
    end
    opts.on( '-s' ,'--separator SEPARATOR_CHARACTER', 'The character that separates each column of the CSV file.') do |s|
      opt[:csv_options][:col_sep] = s
    end
    opts.on( '-q' ,'--quote QUOTE_CHARACTER', 'The character used to quote fields.') do |q|
      opt[:csv_options][:quote_char] = q
    end
    opts.on( '-h', '--help', 'Display this screen' ) do
      puts opts
      exit 1
    end

    opt[:csv_filename] = args[0] unless opt[:csv_filename]
    if opt[:csv_filename].nil? || opt[:csv_filename].empty?
      puts opts
      exit 1
    end
  end
  optparse.parse!

  # construct a sensible default ouptput filename
  unless opt[:sql_filename]
    file_extension = File.extname opt[:csv_filename]
    dir_name = File.dirname opt[:csv_filename]
    base_name = File.basename opt[:csv_filename], file_extension
    opt[:sql_filename] = "#{dir_name}/#{base_name}.sql"
  end

  return opt
end
parse_date(str, opt={ valid_values: /^$|(\d{4}\-\d{2}\-\d{2}){1}/ }) click to toggle source

Parse a SQL date from a String. @param [String] str the String to be parsed. @param [Hash] opt options Hash will be passed through to parse_string. @return [String] the parsed date. nil or empty inputs will return '' by default. @see parse_string

# File lib/kuali_toolbox/etl.rb, line 257
def self.parse_date(str, opt={ valid_values: /^$|(\d{4}\-\d{2}\-\d{2}){1}/ })
  opt[:valid_values] = /^$|(\d{4}\-\d{2}\-\d{2}){1}/ if opt[:valid_values].nil?
  return parse_string str, opt
end
parse_date!(row, insert_str, values_str, opt={}) click to toggle source

Helper method for parse_date which finds the value by column :name and mutates the SQL statement accordingly. @param [CSV::Row] row the CSV Row being parsed @param [String] insert_str the left side of the insert statement (i.e. columns) @param [String] values_str the right side of the insert statement (i.e. values) @param [Hash] opt options Hash will be passed through to parse_date. @option opt [String] :name the name of the field being parsed. Required. @return [void] @raise [ArgumentError] :name is required. @see parse_date @see mutate_sql_stmt!

# File lib/kuali_toolbox/etl.rb, line 272
def self.parse_date!(row, insert_str, values_str, opt={})
  raise ArgumentError, "opt[:name] is required!" unless opt[:name]
  date = parse_date( row[ to_symbol( opt[:name] ) ], opt )
  mutate_sql_stmt! insert_str, opt[:name], values_str, date
end
parse_datetime(str, opt={ valid_values: /^$|(\d{4}\-\d{2}\-\d{2}){1}\s(\d{2}:\d{2}:\d{2})?/ }) click to toggle source

Parse a SQL datetime from a String. @param [String] str the String to be parsed. @param [Hash] opt options Hash will be passed through to parse_string. @return [String] the parsed datetime. nil or empty inputs will return '' by default. @see parse_string

# File lib/kuali_toolbox/etl.rb, line 283
def self.parse_datetime(str, opt={ valid_values: /^$|(\d{4}\-\d{2}\-\d{2}){1}\s(\d{2}:\d{2}:\d{2})?/ })
  opt[:valid_values] = /^$|(\d{4}\-\d{2}\-\d{2}){1}\s(\d{2}:\d{2}:\d{2})?/ if opt[:valid_values].nil?
  return parse_string str, opt
end
parse_datetime!(row, insert_str, values_str, opt={}) click to toggle source

Helper method for parse_datetime which finds the value by column :name and mutates the SQL statement accordingly. @param [CSV::Row] row the CSV Row being parsed @param [String] insert_str the left side of the insert statement (i.e. columns) @param [String] values_str the right side of the insert statement (i.e. values) @param [Hash] opt options Hash will be passed through to parse_datetime. @option opt [String] :name the name of the field being parsed. Required. @return [void] @raise [ArgumentError] :name is required. @see parse_datetime @see mutate_sql_stmt!

# File lib/kuali_toolbox/etl.rb, line 298
def self.parse_datetime!(row, insert_str, values_str, opt={})
  raise ArgumentError, "opt[:name] is required!" unless opt[:name]
  datetime = parse_datetime( row[ to_symbol( opt[:name] ) ], opt )
  mutate_sql_stmt! insert_str, opt[:name], values_str, datetime
end
parse_flag(str, opt={ length: 1, upcase: true }) click to toggle source

Useful for parsing “flag” like values; i.e. usually single characters. @param [String] str the String to be parsed. @param [Hash] opt options Hash will be passed through to parse_string. @option opt [Integer] :length the maximum supported length of the field. @option opt [Boolean] :upcase if true upcase the results. @return [String] the parsed “flag”. @see parse_string

# File lib/kuali_toolbox/etl.rb, line 336
def self.parse_flag(str, opt={ length: 1, upcase: true })
  opt[:length] = 1 if opt[:length].nil?
  opt[:upcase] = true if opt[:upcase].nil?
  retval = parse_string str, opt
  retval = retval.upcase if opt[:upcase] == true
  return retval
end
parse_float(str, opt={}) click to toggle source

Parse a Float from a String. @note Note the behavioral difference versus to_f. @param [String] str the String to be parsed. @param [Hash] opt options Hash will be passed through to parse_string. @return [Float, nil] the parsed Float. nil or empty inputs will return nil by default. @example Unlike to_f, nil or empty inputs will return nil by default

nil == parse_float(nil) && nil == parse_float('') && 0.0 != parse_float(nil)

@see parse_string

# File lib/kuali_toolbox/etl.rb, line 243
def self.parse_float(str, opt={})
  s = parse_string str, opt
  if s.empty?
    return nil;
  else
    return s.to_f
  end
end
parse_integer(str, opt={}) click to toggle source

Parse an Integer from a String. @note Note the behavioral difference versus to_i. @param [String] str the String to be parsed. @param [Hash] opt options Hash will be passed through to parse_string. @return [Integer, nil] the parsed Integer. nil or empty inputs will return nil by default. @example Unlike to_i, nil or empty inputs will return nil by default

nil == parse_integer(nil) && nil == parse_integer('') && 0 != parse_integer(nil)

@see parse_string

# File lib/kuali_toolbox/etl.rb, line 210
def self.parse_integer(str, opt={})
  s = parse_string str, opt
  if s.empty?
    return nil;
  else
    return s.to_i
  end
end
parse_integer!(row, insert_str, values_str, opt={}) click to toggle source

Helper method which finds the value by column :name and mutates the SQL statement accordingly. @param [CSV::Row] row the CSV Row being parsed @param [String] insert_str the left side of the insert statement (i.e. columns) @param [String] values_str the right side of the insert statement (i.e. values) @param [Hash] opt options Hash will be passed through to parse_integer. @option opt [String] :name the name of the field being parsed. Required. @return [void] @raise [ArgumentError] :name is required. @see parse_integer @see mutate_sql_stmt!

# File lib/kuali_toolbox/etl.rb, line 229
def self.parse_integer!(row, insert_str, values_str, opt={})
  raise ArgumentError, "opt[:name] is required!" unless opt[:name]
  i = parse_integer( row[ to_symbol( opt[:name] ) ], opt )
  mutate_sql_stmt! insert_str, opt[:name], values_str, i
end
parse_string(str, opt={ strict: true, required: false }) click to toggle source

Parses a string using common parsing behavior with options. This method forms the foundation of all the other parsing methods. @param [String] str the String to be parsed. @option opt [String, to_s] :default the default return value if str is empty. Must respond to to_s @option opt [Integer] :length raise a TextParseError if str.length > :length. @option opt [Boolean] :truncate string if longer than length. @option opt [String] :name the name of the field being parsed. Used only for error handling. @option opt [Boolean] :required raise a TextParseError if str is empty. @option opt [Boolean] :strict strict length checking will produce errors instead of warnings. @option opt [Array<Object>, Regexp] :valid_values all of the possible valid values. @return [String] the parsed results. nil or empty inputs will return the empty String by default(i.e. ''). @raise [TextParseError] if the field is :required and found to be empty. @raise [TextParseError] if str.length > :length && :strict @raise [TextParseError] if str does not match :valid_values @example nil or empty inputs will return the empty String by default

'' == parse_string(nil) && '' == parse_string('')

@see valid_value

# File lib/kuali_toolbox/etl.rb, line 161
def self.parse_string(str, opt={ strict: true, required: false })
  opt[:strict] = true if opt[:strict].nil?
  retval = encode str.to_s.strip
  if opt[:required] && retval.empty?
    raise KualiCo::ETL::error TextParseError.new "Required data element '#{opt[:name]}' not found: '#{str}'"
  end
  if opt[:default] && retval.empty?
    retval = opt[:default].to_s
  elsif retval.empty?
    return retval
  end
  if opt[:length] && retval.length > opt[:length].to_i
    detail = "#{opt[:name]}.length > #{opt[:length]}: '#{str}'-->'#{str[0..(opt[:length] - 1)]}'"
    if opt[:strict]
      raise KualiCo::ETL::error TextParseError.new "Data exceeds maximum field length: #{detail}"
    end
    puts KualiCo::ETL::warning "Data will be truncated: #{detail}"
    retval = retval[0..(opt[:length]-1)];
  end
  if opt[:valid_values] && ! valid_value(retval, opt[:valid_values], opt)
    raise KualiCo::ETL::error TextParseError.new "Illegal #{opt[:name]}: value '#{str}' not found in: #{opt[:valid_values]}"
  end
  return retval
end
parse_string!(row, insert_str, values_str, opt={}) click to toggle source

Helper method which finds the value by column :name and mutates the SQL statement accordingly. @param [CSV::Row] row the CSV Row being parsed @param [String] insert_str the left side of the insert statement (i.e. columns) @param [String] values_str the right side of the insert statement (i.e. values) @param [Hash] opt options Hash will be passed through to parse_string. @option opt [String] :name the name of the field being parsed. Required. @return [void] @raise [ArgumentError] :name is required. @see parse_string @see mutate_sql_stmt!

# File lib/kuali_toolbox/etl.rb, line 196
def self.parse_string!(row, insert_str, values_str, opt={})
  raise ArgumentError, "opt[:name] is required!" unless opt[:name]
  str = parse_string( row[ to_symbol( opt[:name] ) ], opt )
  mutate_sql_stmt! insert_str, opt[:name], values_str, str
end
parse_timestamp(str, opt={ valid_values: /^$|(\d{4}\-\d{2}\-\d{2}){1}\s(\d{2}:\d{2}:\d{2})?/ }) click to toggle source

Parse a SQL timestamp from a String. @param [String] str the String to be parsed. @param [Hash] opt options Hash will be passed through to parse_string. @return [String] the parsed timestamp. nil or empty inputs will return '' by default. @see parse_string

# File lib/kuali_toolbox/etl.rb, line 309
def self.parse_timestamp(str, opt={ valid_values: /^$|(\d{4}\-\d{2}\-\d{2}){1}\s(\d{2}:\d{2}:\d{2})?/ })
  return parse_datetime str, opt
end
parse_timestamp!(row, insert_str, values_str, opt={}) click to toggle source

Helper method for parse_timestamp which finds the value by column :name and mutates the SQL statement accordingly. @param [CSV::Row] row the CSV Row being parsed @param [String] insert_str the left side of the insert statement (i.e. columns) @param [String] values_str the right side of the insert statement (i.e. values) @param [Hash] opt options Hash will be passed through to parse_timestamp. @option opt [String] :name the name of the field being parsed. Required. @return [void] @raise [ArgumentError] :name is required. @see parse_timestamp @see mutate_sql_stmt!

# File lib/kuali_toolbox/etl.rb, line 323
def self.parse_timestamp!(row, insert_str, values_str, opt={})
  raise ArgumentError, "opt[:name] is required!" unless opt[:name]
  timestamp = parse_datetime( row[ to_symbol( opt[:name] ) ], opt )
  mutate_sql_stmt! insert_str, opt[:name], values_str, timestamp
end
to_symbol(str) click to toggle source

Matches the MRI CSV specification: The header String is downcased, spaces are replaced with underscores, non-word characters are dropped, and finally to_sym() is called. @param [String] str the String to be symbolized. @return [Symbol] String is downcased, spaces are replaced with underscores,

non-word characters are dropped

@raise [ArgumentError] if str is nil or empty.

# File lib/kuali_toolbox/etl.rb, line 121
def self.to_symbol(str)
  raise ArgumentError, "Illegal symbol name: '#{str}'" if str.nil? || str.empty?
  encode( str.downcase.gsub(/\s+/, "_").gsub(/\W+/, "") ).to_sym
end
valid_value(subject, valid_values, opt={}) click to toggle source

Tests whether the subject matches one of the valid values. @param [String, match] subject used for validity checking. @param [Array<Object>, Regexp] valid_values all of the possible valid values. @option opt [Boolean] :case_sensitive performs case sensitive matching @return [Boolean] true if the subject matches valid_values.

FYI valid_values must respond to #casecmp.

@raise [ArgumentError] if valid_values is nil or empty. @raise [ArgumentError] case sensitive matching only works for objects

that respond to #casecmp; primarily String objects.
# File lib/kuali_toolbox/etl.rb, line 65
def self.valid_value(subject, valid_values, opt={})
  raise ArgumentError, "valid_values must not be nil!" if valid_values.nil?
  if valid_values.kind_of? Regexp
    return true if subject =~ valid_values
  end
  if valid_values.kind_of? Array
    raise ArgumentError, "valid_values must have at least one element!" unless valid_values.length > 0
    if opt[:case_sensitive] == false # case insensitive comparison requested
      raise ArgumentError, "Object must respond to #casecmp" unless subject.respond_to? 'casecmp'
      valid_values.each do |valid_value|
        return true if valid_value.casecmp(subject) == 0
      end
    end
    return true if valid_values.include? subject # default to == equality
  end
  return false
end
warning(e) click to toggle source

Prepares an Exception for consistent warning handling. @param [String, Exception] e the warning to handle @return [Exception] an Exception with a message formatted with $INPUT_LINE_NUMBER. @raise [ArgumentError] if an invalid argument is passed.

# File lib/kuali_toolbox/etl.rb, line 45
def self.warning(e)
  if e.kind_of? String
    # default to TextParseError
    return TextParseError.new "WARN:  Line #{$INPUT_LINE_NUMBER}: #{e}"
  end
  if e.kind_of? Exception
    return e.exception "WARN:  Line #{$INPUT_LINE_NUMBER}: #{e}"
  end
  raise ArgumentError, "Unsupported error type: #{e.class}"
end