module SoundX

Constants

VERSION

Public Class Methods

encode(p1) click to toggle source
/*
 * SoundX.encode(str) -> String
 *
 * Builds a 4-character Soundex-style code for +str+: the upper-cased first
 * byte followed by up to three code characters looked up in the file-level
 * `mapping` table, right-padded with '0' for short names.
 *
 * Takes exactly one argument, which must be a String (a TypeError is raised
 * by Check_Type otherwise). Raises ArgumentError when a byte above 127 is
 * encountered while scanning; scanning stops once three codes are stored, so
 * bytes after that point are not inspected.
 *
 * An empty string yields an empty result, because the first byte of the
 * output is then the NUL terminator.
 *
 * NOTE(review): `mapping` is defined outside this block. The code below
 * assumes it is indexable by any ASCII byte, that 0xFE marks characters to
 * ignore, and that '0' marks vowel-like characters -- confirm against the
 * table definition.
 */
static VALUE
rb_soundx(int argc, VALUE* argv, VALUE self)
{
  VALUE input;
  unsigned char* src;
  unsigned char written = 0; // Codes stored after the first letter; never exceeds 3.
  size_t srclen;
  size_t i; // for looping the src
  unsigned char current;
  unsigned char match;
  unsigned char next;
  int last_code;

  /*
   Destination is padded with up to 3 trailing zeros
   for short names.
  */
  char dest[5] = {'0', '0', '0', '0', '\0'};

  rb_scan_args(argc, argv, "1", &input);

  Check_Type(input, T_STRING);

  srclen = RSTRING_LEN(input);
  src = (unsigned char *) StringValueCStr(input);

  if (src[0] > 127) {
    rb_raise(rb_eArgError, "non-ASCII character found");
  }

  dest[0] = toupper(src[0]);

  for (i = 1; i < srclen && written < 3; i++) {
    // rb_raise does not return, so no further control flow is needed here.
    if (src[i] > 127) {
      rb_raise(rb_eArgError, "non-ASCII character found");
    }

    current = tolower(src[i]);
    match = mapping[current];

    // Characters the table marks as ignorable contribute nothing.
    if (0xFE == match) {
      continue;
    }

    // A character whose code equals the code of the character directly
    // before it is treated as part of the same letter. This single test
    // covers repeated letters (Gutierrez), different letters sharing a code
    // (Jackson), and the second character duplicating the first letter's code.
    // src[i-1] has always been range-checked by the time we get here.
    if (match == mapping[tolower(src[i-1])]) {
      continue;
    }

    // If "H" or "W" separates two consonants that have the same soundex code,
    // the consonant to the right is not coded.
    // NOTE(review): the comparison is against the last *stored* code, so a
    // vowel sitting between that consonant and the H/W does not cancel the
    // rule -- confirm whether that is intended.
    if (i + 1 < srclen && (current == 'h' || current == 'w')) {
      next = src[i + 1];

      // Validate the lookahead byte before using it as a table index; it may
      // be skipped below and would otherwise never be checked.
      if (next > 127) {
        rb_raise(rb_eArgError, "non-ASCII character found");
      }

      // Before any code has been stored, dest[0] holds the first letter
      // itself rather than its code, so look the code up explicitly.
      last_code = (written > 0) ? dest[written] : mapping[tolower(src[0])];

      if (last_code == mapping[tolower(next)]) {
        i = i + 1; // Skip over the next consonant
        continue;
      }
    }

    // We landed on a vowel-like character,
    // so skip to the next char
    if ('0' == match) {
      continue;
    }

    written++;
    dest[written] = match;
  }

  return rb_str_new_cstr(dest);
}