class MBPSOTeamFormation::MVH

Missing Values Handler

Public Instance Methods

fill_missing_values(table, tolerate_missing_values) click to toggle source

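Replaces missing values in place: non-numeric attributes (gender, ethnicity) receive the most frequent value of their column, while missing grades are filled with values spread around the mean according to the standard deviation, so that the original grade distribution is approximately preserved.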

# File lib/MBPSO_Team_Formation/mvh.rb, line 50
# Fills the missing (nil) Gender, Ethnicity and Grade entries of +table+
# in place.
#
# Missing genders and ethnicities are replaced by the most frequent
# non-missing value of their column. Missing grades are drawn from a coarse
# seven-bucket distribution centred on the mean of the known grades, with
# buckets one, two and three standard deviations away on each side, so the
# filled values roughly follow the original distribution.
#
# table:: indexed by column name (table['Grade'] yields the column values)
#         and by row number (table[i]['Grade'] = v writes a cell);
#         presumably a CSV::Table -- confirm against the caller.
# tolerate_missing_values:: when falsy, missing values are treated as an
#                           error instead of being filled.
#
# Returns [most_frequent_gender, most_frequent_ethnicity, mean_grade, stdev].
# When the table has no missing values a fifth element, +true+, is appended.
#
# Raises ArgumentError if values are missing and tolerate_missing_values
# is falsy.
def fill_missing_values(table, tolerate_missing_values)
  # false when nothing is missing, otherwise three arrays of row indexes
  # (gender, ethnicity, grade).
  missing_values = check_missing_values(table)

  mean_grade, stdev = calculate_stdev(table['Grade'])

  # Most frequent non-missing value of a column. nil entries are excluded so
  # that a column dominated by missing values is not "filled" with nil.
  # Ties resolve to the value that appears first in the column.
  most_frequent = lambda do |values|
    present = values.compact
    frequencies = present.each_with_object(Hash.new(0)) { |v, h| h[v] += 1 }
    present.max_by { |v| frequencies[v] }
  end

  most_frequent_gender = most_frequent.call(table['Gender'])
  most_frequent_ethnicity = most_frequent.call(table['Ethnicity'])

  return [most_frequent_gender, most_frequent_ethnicity, mean_grade, stdev, true] unless missing_values

  # Notifying the user for the missing values and proceeding
  # according to the tolerance parameter
  unless tolerate_missing_values
    raise ArgumentError, 'Missing values are present in the data set'
  end

  warn('WARNING! There are missing values in the data set,'\
       ' which will be automatically handled.')

  # Replacing missing gender values with the most frequent gender in the data set
  missing_values[0].each { |x| table[x]['Gender'] = most_frequent_gender }

  # Replacing missing ethnicity values with the most frequent ethnicity in the data set
  missing_values[1].each { |x| table[x]['Ethnicity'] = most_frequent_ethnicity }

  # Replacing missing grade values according to the mean and standard
  # deviation of the data to keep the original distribution.
  # (rand * 100).round yields an integer in 0..100; the buckets below cover
  # that whole range. The outer buckets are clamped to the 0..100 grade scale.
  missing_values[2].each do |x|
    table[x]['Grade'] =
      case (rand * 100).round
      when 0..1   then [(mean_grade - 3 * stdev).round, 0].max
      when 2..9   then [(mean_grade - 2 * stdev).round, 0].max
      when 10..33 then (mean_grade - stdev).round
      when 34..66 then mean_grade.round
      when 67..91 then (mean_grade + stdev).round
      when 92..99 then [(mean_grade + 2 * stdev).round, 100].min
      when 100    then [(mean_grade + 3 * stdev).round, 100].min # upper tail: mean PLUS 3 stdev
      end
  end

  # Returning the already calculated statistical parameters to be
  # used for finding students with close to median attributes if necessary
  [most_frequent_gender, most_frequent_ethnicity, mean_grade, stdev]
end

Private Instance Methods

calculate_stdev(data) click to toggle source

Calculates the mean and the population standard deviation of the non-missing values in data (each converted to an integer); the results are used when replacing missing grades.

# File lib/MBPSO_Team_Formation/mvh.rb, line 37
# Computes the mean and the population standard deviation of +data+.
#
# nil entries are dropped and every remaining entry is converted with to_i
# before any arithmetic is done.
#
# Returns a two-element array: [mean, standard_deviation].
def calculate_stdev(data)
  values = data.compact.map(&:to_i)
  count = values.size
  mean = values.sum.to_f / count

  # Accumulate squared deviations from the mean, starting from integer 0.
  squared_deviations = values.reduce(0) { |acc, value| acc + (value - mean) ** 2 }

  [mean, Math.sqrt(squared_deviations / count)]
end
check_missing_values(table) click to toggle source

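Checks the data set for missing (nil) values in the Gender, Ethnicity and Grade columns. Returns an array of three arrays holding the row indexes of the missing entries for each of those columns, in that order, or false when nothing is missing.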

# File lib/MBPSO_Team_Formation/mvh.rb, line 8
# Locates the missing (nil) entries of the Gender, Ethnicity and Grade
# columns of +table+.
#
# Returns false when no column has a missing entry. Otherwise returns an
# array of three arrays holding, in that order, the indexes of the missing
# genders, ethnicities and grades.
def check_missing_values(table)
  missing_indexes = %w[Gender Ethnicity Grade].map do |attribute|
    column = table[attribute]
    (0...column.length).select { |index| column[index].nil? }
  end

  return false if missing_indexes.all?(&:empty?)

  missing_indexes
end