fix soundex algorithm

leobm · Mar 8, 2024 · 86079c0 · 86079c0
1 parent 0bbb85b
commit 86079c0
Show file tree

Hide file tree

Showing 7 changed files with 198 additions and 64 deletions.
diff --git a/gleam.toml b/gleam.toml
@@ -1,5 +1,5 @@
 name = "phonetic_gleam"
-version = "0.1.0"
+version = "0.1.1"
 
 # Fill out these fields if you intend to generate HTML documentation or publish
 # your project to the Hex package manager.

diff --git a/src/phonetic_gleam/cologne.gleam b/src/phonetic_gleam/cologne.gleam
@@ -1,6 +1,7 @@
 import gleam/int
 import gleam/string
 import gleam/list
+import phonetic_gleam/utils.{then_or_else}
 
 // https://en.wikipedia.org/wiki/Cologne_phonetics
 
@@ -28,18 +29,6 @@ fn prepare_word(word) {
   |> string.replace(each: "ß", with: "S")
 }
 
-fn cleanup_adjacent_codes(codes: List(Int)) {
-  codes
-  |> list.fold([], fn(acc, code) {
-    let last_code = case list.first(acc) {
-      Ok(c) -> c
-      Error(Nil) -> -1
-    }
-    { code == -1 || last_code == code }
-    |> then_or_else(acc, [code, ..acc])
-  })
-}
-
 fn remove_zeros(codes) {
   // delete all '0' characters, except at the beginning.
   codes
@@ -55,16 +44,9 @@ fn join_codes(codes: List(Int)) -> String {
   |> string.join("")
 }
 
-fn then_or_else(is, then, or_else) {
-  case is {
-    True -> then
-    False -> or_else
-  }
-}
-
 fn tr(word, recent_char, codes) -> List(Int) {
   case word {
-    "" -> codes
+    "" -> list.reverse(codes)
     _ -> {
       let assert [a, b, t] = first_second_rest(word)
       let is_before = fn(c) { c == b }
@@ -111,7 +93,8 @@ pub fn encode(word) -> String {
   word
   |> prepare_word
   |> tr("", [])
-  |> cleanup_adjacent_codes
+  |> utils.remove_value(-1)
+  |> utils.remove_adjacent_dups
   |> remove_zeros
   |> list.reverse
   |> join_codes

diff --git a/src/phonetic_gleam/nysiis.gleam b/src/phonetic_gleam/nysiis.gleam
@@ -1,6 +1,5 @@
 import gleam/string
-import gleam/list
-import gleam/set
+import phonetic_gleam/utils.{then_or_else}
 
 // https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System
 
@@ -71,26 +70,6 @@ fn drop_last_chars(word: String) -> String {
   }
 }
 
-fn remove_duplicates(word: String) -> String {
-  string.to_graphemes(word)
-  |> list.fold_right([], fn(acc, code) {
-    let last_code = case list.first(acc) {
-      Ok(c) -> c
-      Error(Nil) -> ""
-    }
-    { code == "" || last_code == code }
-    |> then_or_else(acc, [code, ..acc])
-  })
-  |> string.join("")
-}
-
-fn then_or_else(is, then, or_else) {
-  case is {
-    True -> then
-    False -> or_else
-  }
-}
-
 fn first_char(word) {
   case string.pop_grapheme(word) {
     Ok(#(a, _)) -> a
@@ -103,15 +82,17 @@ fn is_vowel(c: String) {
 }
 
+/// Uppercases `word` and keeps only the graphemes "A" through "Z".
 fn prepare_word(word: String) -> String {
-  let allowed_chars =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    |> string.to_graphemes
-    |> set.from_list
+  word
+  |> string.uppercase
+  |> utils.remove_not_allowed_chars("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+}
 
-  string.uppercase(word)
+/// Post-processes an encoded string: splits it into graphemes, collapses
+/// runs of equal adjacent graphemes, joins the result back into a string
+/// and finally passes it through `drop_last_chars`.
+fn cleanup(codes) {
+  codes
   |> string.to_graphemes
-  |> list.filter(fn(c) { set.contains(allowed_chars, c) })
+  |> utils.remove_adjacent_dups
   |> string.join("")
+  |> drop_last_chars
 }
 
 pub fn encode(word) -> String {
@@ -125,6 +106,5 @@ pub fn encode(word) -> String {
     |> string.to_graphemes
     |> tr("", "")
   }
-  |> remove_duplicates
-  |> drop_last_chars
+  |> cleanup
 }
diff --git a/src/phonetic_gleam/soundex.gleam b/src/phonetic_gleam/soundex.gleam
@@ -1,32 +1,53 @@
 import gleam/list
 import gleam/string
+import phonetic_gleam/utils
+
+/// Maps a single uppercase letter to its Soundex digit (as a string).
+/// Every grapheme not listed below (vowels, "H", "W", "Y" and anything
+/// else) maps to the empty string.
+fn tr_char(b) {
+  case b {
+    "B" | "F" | "P" | "V" -> "1"
+    "C" | "G" | "J" | "K" | "Q" | "S" | "X" | "Z" -> "2"
+    "D" | "T" -> "3"
+    "L" -> "4"
+    "M" | "N" -> "5"
+    "R" -> "6"
+    _ -> ""
+  }
+}
 
 fn tr(chars, acc) {
   case chars {
     [] -> list.reverse(acc)
-    [a, ..xs] if acc == [] -> tr(xs, [a, ..acc])
-    [a, ..xs] ->
-      case a {
-        "B" | "F" | "P" | "V" -> tr(xs, ["1", ..acc])
-        "C" | "G" | "J" | "K" | "Q" | "S" | "X" | "Z" -> tr(xs, ["2", ..acc])
-        "D" | "T" -> tr(xs, ["3", ..acc])
-        "L" -> tr(xs, ["4", ..acc])
-        "M" | "N" -> tr(xs, ["5", ..acc])
-        "R" -> tr(xs, ["6", ..acc])
-        _ -> tr(xs, acc)
+    [a, b, ..xs] if acc == [] -> {
+      // first character code equal with second?
+      case tr_char(a) == tr_char(b) {
+        True -> tr(xs, [a, ..acc])
+        False -> tr([b, ..xs], [a, ..acc])
       }
+    }
+    [a, ..xs] if acc == [] -> tr(xs, [a, ..acc])
+    [a, ..xs] -> tr(xs, [tr_char(a), ..acc])
   }
 }
 
+/// Turns a list of codes into the final 4-character Soundex string.
+/// Adjacent duplicates are collapsed *before* the empty-string codes are
+/// removed, so two equal codes separated by an uncoded letter are both
+/// kept. The result is cut to at most 4 entries, joined, and right-padded
+/// with "0" up to a length of 4.
+fn cleanup(codes) {
+  codes
+  |> utils.remove_adjacent_dups
+  |> utils.remove_value("")
+  |> list.take(4)
+  |> string.join("")
+  |> string.pad_right(to: 4, with: "0")
+}
+
+/// Normalizes the input for encoding: uppercases `word`, removes every
+/// grapheme that is not one of "A".."Z", and returns the remaining
+/// graphemes as a list.
 fn prepare_word(word) {
   word
   |> string.uppercase
+  |> utils.remove_not_allowed_chars("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
   |> string.to_graphemes
 }
 
+/// Returns the Soundex code of `word`. The word is normalized with
+/// `prepare_word`, translated with `tr`, and finished by `cleanup`, which
+/// yields the first letter followed by digits, padded with "0" to four
+/// characters (e.g. "Lee" -> "L000").
 pub fn encode(word) {
   word
   |> prepare_word
   |> tr([])
-  |> string.join("")
+  |> cleanup
 }
diff --git a/src/phonetic_gleam/utils.gleam b/src/phonetic_gleam/utils.gleam
@@ -0,0 +1,34 @@
+import gleam/list
+import gleam/string
+import gleam/set
+
+/// Returns `xs` without the elements that are equal to `value`.
+pub fn remove_value(xs: List(a), value: a) {
+  list.filter(xs, fn(x) { x != value })
+}
+
+/// Returns `word` with every grapheme removed that does not occur in
+/// `allowed_chars`. The comparison is exact, so callers that want a
+/// case-insensitive filter have to normalize the case of `word` first.
+pub fn remove_not_allowed_chars(word: String, allowed_chars: String) {
+  // Build the lookup set once, then filter the word against it.
+  let allowed_chars_set =
+    allowed_chars
+    |> string.to_graphemes
+    |> set.from_list
+  word
+  |> string.to_graphemes
+  |> list.filter(fn(c) { set.contains(allowed_chars_set, c) })
+  |> string.join("")
+}
+
+/// Collapses every run of equal adjacent elements into a single element,
+/// e.g. `[1, 1, 2, 1, 1]` becomes `[1, 2, 1]`. Equal elements that are not
+/// adjacent are kept.
+pub fn remove_adjacent_dups(xs: List(a)) -> List(a) {
+  // Folding from the right means the head of `acc` is the element that
+  // directly follows `x` in the input; `x` is dropped when it equals it.
+  list.fold_right(xs, [], fn(acc, x) {
+    case list.first(acc) {
+      Ok(r) if r == x -> acc
+      _ -> [x, ..acc]
+    }
+  })
+}
+
+/// Returns `then` when `is` is `True`, otherwise `or_else`. Both values
+/// are ordinary arguments, so both are evaluated before the call.
+pub fn then_or_else(is: Bool, then: a, or_else: a) {
+  case is {
+    True -> then
+    False -> or_else
+  }
+}
diff --git a/test/soundex_test.gleam b/test/soundex_test.gleam
@@ -1,6 +1,103 @@
 import gleeunit/should
 import phonetic_gleam/soundex
 
+// Test cases adapted from the Soundex tests of the Apache Commons Codec project.
+pub fn codes_basic_test() {
+  soundex.encode("testing")
+  |> should.equal("T235")
+  soundex.encode("The")
+  |> should.equal("T000")
+  soundex.encode("quick")
+  |> should.equal("Q200")
+  soundex.encode("brown")
+  |> should.equal("B650")
+  soundex.encode("fox")
+  |> should.equal("F200")
+  soundex.encode("jumped")
+  |> should.equal("J513")
+  soundex.encode("over")
+  |> should.equal("O160")
+  soundex.encode("the")
+  |> should.equal("T000")
+  soundex.encode("lazy")
+  |> should.equal("L200")
+  soundex.encode("dogs")
+  |> should.equal("D200")
+}
+
+// Test cases adapted from the Soundex tests of the Apache Commons Codec project.
+// Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
+pub fn codes_bradandkathy_examples_test() {
+  soundex.encode("Allricht")
+  |> should.equal("A462")
+  soundex.encode("Eberhard")
+  |> should.equal("E166")
+  soundex.encode("Engebrethson")
+  |> should.equal("E521")
+  soundex.encode("Heimbach")
+  |> should.equal("H512")
+  soundex.encode("Hanselmann")
+  |> should.equal("H524")
+  soundex.encode("Hildebrand")
+  |> should.equal("H431")
+  soundex.encode("Kavanagh")
+  |> should.equal("K152")
+  soundex.encode("Lind")
+  |> should.equal("L530")
+  soundex.encode("Lukaschowsky")
+  |> should.equal("L222")
+  soundex.encode("McDonnell")
+  |> should.equal("M235")
+  soundex.encode("McGee")
+  |> should.equal("M200")
+  soundex.encode("Opnian")
+  |> should.equal("O155")
+  soundex.encode("Oppenheimer")
+  |> should.equal("O155")
+  soundex.encode("Riedemanas")
+  |> should.equal("R355")
+  soundex.encode("Zita")
+  |> should.equal("Z300")
+  soundex.encode("Zitzmeinn")
+  |> should.equal("Z325")
+}
+
+// Expected Soundex codes for the surname examples from
+// http://www.archives.gov/research_room/genealogy/census/soundex.html
+pub fn codes_census_examples_test() {
+  soundex.encode("Washington")
+  |> should.equal("W252")
+  soundex.encode("Lee")
+  |> should.equal("L000")
+  soundex.encode("Gutierrez")
+  |> should.equal("G362")
+  soundex.encode("Pfister")
+  |> should.equal("P236")
+  soundex.encode("Jackson")
+  |> should.equal("J250")
+  soundex.encode("Tymczak")
+  |> should.equal("T522")
+}
+
+// Expected Soundex codes for the examples from
+// http://www.myatt.demon.co.uk/sxalg.htm
+pub fn codes_myatt_examples_test() {
+  soundex.encode("HOLMES")
+  |> should.equal("H452")
+  soundex.encode("ADOMOMI")
+  |> should.equal("A355")
+  soundex.encode("VONDERLEHR")
+  |> should.equal("V536")
+  soundex.encode("BALL")
+  |> should.equal("B400")
+  soundex.encode("SHAW")
+  |> should.equal("S000")
+  soundex.encode("JACKSON")
+  |> should.equal("J250")
+  soundex.encode("SCANLON")
+  |> should.equal("S545")
+  soundex.encode("SAINTJOHN")
+  |> should.equal("S532")
+}
+
 pub fn codes_for_known_words_test() {
   soundex.encode("Britney")
   |> should.equal("B635")
@@ -9,5 +106,7 @@ pub fn codes_for_known_words_test() {
   soundex.encode("Spears")
   |> should.equal("S162")
   soundex.encode("Superzicke")
-  |> should.equal("S16222")
+  |> should.equal("S162")
+  soundex.encode("'OBrien")
+  |> should.equal("O165")
 }
diff --git a/test/utils_test.gleam b/test/utils_test.gleam
@@ -0,0 +1,17 @@
+import gleeunit/should
+import phonetic_gleam/utils
+
+// Checks `utils.remove_adjacent_dups` on empty, single-element, duplicate-free
+// and run-containing lists. The name must end in `_test`: gleeunit only runs
+// public functions with that suffix, so the previous name was never executed.
+pub fn remove_adjacent_dups_test() {
+  utils.remove_adjacent_dups([])
+  |> should.equal([])
+  utils.remove_adjacent_dups([1])
+  |> should.equal([1])
+  utils.remove_adjacent_dups([1, 1])
+  |> should.equal([1])
+  utils.remove_adjacent_dups([1, 2, 3])
+  |> should.equal([1, 2, 3])
+  utils.remove_adjacent_dups([1, 2, 2, 3, 3, 4])
+  |> should.equal([1, 2, 3, 4])
+  utils.remove_adjacent_dups([1, 1, 2, 1, 1])
+  |> should.equal([1, 2, 1])
+}