class Parser

Constants

AMERICAN_DATE_FORMAT
EUROPEAN_DATE_FORMAT

Public Class Methods

parse(input)
# File lib/pseudo_date/parser.rb, line 7
# Parses a loosely formatted date string into its year, month and day parts.
#
# The input is first offered to parse_with_poro_date; when that returns nil the
# slower parse_string fallback is used. The parts are then normalised:
#   * missing (nil) parts become "0000" for the year and "00" for month/day
#   * single character parts are left-padded with a zero
#   * two character years are expanded to 19xx or 20xx
#
# The caller's string is never modified, so frozen strings are accepted.
#
# @param input [#to_s] the raw date text; surrounding whitespace and "~" are ignored
# @return [Hash] a hash with :year, :month and :day keys holding string values
def self.parse(input)
  # Minor pre cleanup, done on a copy rather than in place (strip!/gsub! would
  # mutate the argument and raise on a frozen string).
  cleaned = input.to_s.strip.delete('~')

  date = parse_with_poro_date(cleaned)
  year, month, day = date ? [date.year, date.month, date.day] : parse_string(cleaned)
  date_hash = { :year => year, :month => month, :day => day }

  # Post parsing cleanup: fill in placeholders, trim, and pad single digits.
  date_hash.each do |key, value|
    text =
      if value.nil?
        key == :year ? '0000' : '00'
      else
        value.to_s.strip
      end
    date_hash[key] = text.length == 1 ? "0#{text}" : text
  end

  # Two character years: anything greater than the current two digit year is
  # assumed to belong to the 1900s.
  if date_hash[:year].length == 2
    century = date_hash[:year].to_i > Date.today.year % 100 ? '19' : '20'
    date_hash[:year] = "#{century}#{date_hash[:year]}"
  end

  # Attempt to correct some known OCR issues: a year containing "00" gets its
  # first character replaced with "2".
  # NOTE(review): this also rewrites a genuine "1900" to "2900" - confirm that is intended.
  if date_hash[:year].include?('00') && date_hash[:year] != '0000'
    date_hash[:year] = "2#{date_hash[:year].slice(1..3)}"
  end

  # The hash always carries its three keys, so it is returned unconditionally.
  date_hash
end

Private Class Methods

parse_string(input)
# File lib/pseudo_date/parser.rb, line 68
# Fallback parser that picks a date string apart by hand.
#
# Recognised shapes, checked in this order:
#   * "02/25/2008" or "02/2008"   - slash separated, month first
#   * "20080225"                  - eight characters accepted by is_numeric?
#                                   (a helper defined outside this block)
#   * "1985-09-25" / "02-25-2008" - hyphen separated with a four character year
#   * "2004"                      - any four character input is taken as the year
#   * "85"                        - two character year, expanded to 19xx or 20xx
#   * "Jun 23, 2004"              - anything else containing a word character is
#                                   handed to Date.parse
#
# @param input [String] the cleaned-up date string
# @return [Array] [year, month, day]; parts that could not be determined keep
#   the placeholders "0000" / "00", and year is nil when a hyphenated input has
#   no four character part
def self.parse_string(input)
  day, month, year = '00', '00', '0000'

  if input.include?('/') # 02/25/2008
    pieces = input.split('/')
    # Slash dates are read month first. (An earlier Date.parse(self) attempt here
    # was handed the receiver instead of the string, so it always raised and this
    # positional split was what actually ran.)
    case pieces.length
    when 3 then month, day, year = pieces
    when 2 then month, year = pieces
    end
  elsif input.length == 8 && is_numeric?(input) # 20080225
    year, month, day = input.slice(0..3), input.slice(4..5), input.slice(6..7)
  elsif input.include?('-') # 1985-09-25 or 02-25-2008
    pieces = input.split('-')
    year = pieces.find { |piece| piece.length == 4 }
    if year && pieces.length == 3
      if pieces.first == year
        # Year first means yyyy-mm-dd: the month is the middle part.
        month, day = pieces[1], pieces.last
      else
        month, day = pieces.first, pieces[1]
      end
      # Tolerate transposed day/month when the "month" cannot be a month.
      month, day = day, month if month.to_i > 12 && month.to_i > day.to_i
    end
  elsif input.length == 4 # 2004
    year = input
  elsif input.length == 2 # 85
    year = input.to_i > Date.today.year % 100 ? "19#{input}" : "20#{input}"
  elsif input.match(/\w/) # Jun 23, 2004
    begin
      parsed = Date.parse(input)
      year, month, day = parsed.year.to_s, parsed.month.to_s, parsed.day.to_s
    rescue StandardError
      # Best effort only: unparseable text keeps the zero placeholders.
    end
  end

  [year, month, day]
end
parse_with_poro_date(string)
# File lib/pseudo_date/parser.rb, line 52
# Fast path: tries to read a complete, three-part date with Date.strptime.
#
# The separator selects the format: a string containing a hyphen is matched
# against EUROPEAN_DATE_FORMAT, otherwise one containing a slash is matched
# against AMERICAN_DATE_FORMAT.
#
# @param string [String] the cleaned-up date string
# @return [Date, nil] the parsed date, or nil when the string has fewer than
#   three parts or does not match the selected format
def self.parse_with_poro_date(string)
  # Not enough parts for a full date, so leave it to the slower fallback.
  # (The check used to be inverted, which sent only incomplete dates to strptime.)
  return nil if string.split(%r{[/-]}).length < 3

  case string
  when /-/ # Europeans generally use hyphens to separate date pieces
    Date.strptime(string, EUROPEAN_DATE_FORMAT)
  when %r{/} # Americans usually use a / to separate date pieces
    Date.strptime(string, AMERICAN_DATE_FORMAT)
  end
rescue StandardError
  nil # We don't actually care why Date is complaining. We'll fall back to slower parsing later.
end