Skip to content

Commit

Permalink
fix soundex algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
leobm committed Mar 8, 2024
1 parent 0bbb85b commit 86079c0
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 64 deletions.
2 changes: 1 addition & 1 deletion gleam.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = "phonetic_gleam"
version = "0.1.0"
version = "0.1.1"

# Fill out these fields if you intend to generate HTML documentation or publish
# your project to the Hex package manager.
Expand Down
25 changes: 4 additions & 21 deletions src/phonetic_gleam/cologne.gleam
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import gleam/int
import gleam/string
import gleam/list
import phonetic_gleam/utils.{then_or_else}

// https://en.wikipedia.org/wiki/Cologne_phonetics

Expand Down Expand Up @@ -28,18 +29,6 @@ fn prepare_word(word) {
|> string.replace(each: "ß", with: "S")
}

// Drops every `-1` entry and collapses runs of identical neighbouring codes.
// The fold prepends while walking left to right, so the surviving codes are
// returned in reverse input order.
fn cleanup_adjacent_codes(codes: List(Int)) {
  list.fold(codes, [], fn(kept, current) {
    case kept {
      // `-1` entries are never kept.
      _ if current == -1 -> kept
      // Same as the most recently kept code: skip the repeat.
      [previous, ..] if previous == current -> kept
      _ -> [current, ..kept]
    }
  })
}

fn remove_zeros(codes) {
// delete all '0' characters, except at the beginning.
codes
Expand All @@ -55,16 +44,9 @@ fn join_codes(codes: List(Int)) -> String {
|> string.join("")
}

// Picks one of two already-evaluated values: `then` when `is` is `True`,
// `or_else` when it is `False`.
fn then_or_else(is, then, or_else) {
  case is {
    False -> or_else
    True -> then
  }
}

fn tr(word, recent_char, codes) -> List(Int) {
case word {
"" -> codes
"" -> list.reverse(codes)
_ -> {
let assert [a, b, t] = first_second_rest(word)
let is_before = fn(c) { c == b }
Expand Down Expand Up @@ -111,7 +93,8 @@ pub fn encode(word) -> String {
word
|> prepare_word
|> tr("", [])
|> cleanup_adjacent_codes
|> utils.remove_value(-1)
|> utils.remove_adjacent_dups
|> remove_zeros
|> list.reverse
|> join_codes
Expand Down
40 changes: 10 additions & 30 deletions src/phonetic_gleam/nysiis.gleam
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import gleam/string
import gleam/list
import gleam/set
import phonetic_gleam/utils.{then_or_else}

// https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System

Expand Down Expand Up @@ -71,26 +70,6 @@ fn drop_last_chars(word: String) -> String {
}
}

// Collapses each run of identical adjacent graphemes in `word` into a single
// occurrence and returns the rebuilt string.
fn remove_duplicates(word: String) -> String {
  let graphemes = string.to_graphemes(word)
  let deduped =
    list.fold_right(graphemes, [], fn(kept, grapheme) {
      case kept {
        // Empty graphemes are never kept.
        _ if grapheme == "" -> kept
        // Equal to the grapheme that follows it: skip the repeat.
        [next, ..] if next == grapheme -> kept
        _ -> [grapheme, ..kept]
      }
    })
  string.join(deduped, "")
}

// Picks one of two already-evaluated values: `then` when `is` is `True`,
// `or_else` when it is `False`.
fn then_or_else(is, then, or_else) {
  case is {
    False -> or_else
    True -> then
  }
}

fn first_char(word) {
case string.pop_grapheme(word) {
Ok(#(a, _)) -> a
Expand All @@ -103,15 +82,17 @@ fn is_vowel(c: String) {
}

// Normalises the input word: uppercases it and keeps only the letters A-Z.
// NOTE(review): this span is a rendered diff, so lines of the previous and the
// new implementation are interleaved below (the set-based filter and the
// `utils.remove_not_allowed_chars` call both appear) — confirm against the
// actual file.
fn prepare_word(word: String) -> String {
let allowed_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|> string.to_graphemes
|> set.from_list
word
|> string.uppercase
|> utils.remove_not_allowed_chars("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
}

string.uppercase(word)
// Post-processing of the encoded string: splits it into graphemes, removes
// adjacent duplicates, joins the result and finally applies `drop_last_chars`.
// NOTE(review): the `list.filter` line below presumably belongs to the old
// `prepare_word` body shown by the diff, not to `cleanup` — verify.
fn cleanup(codes) {
codes
|> string.to_graphemes
|> list.filter(fn(c) { set.contains(allowed_chars, c) })
|> utils.remove_adjacent_dups
|> string.join("")
|> drop_last_chars
}

pub fn encode(word) -> String {
Expand All @@ -125,6 +106,5 @@ pub fn encode(word) -> String {
|> string.to_graphemes
|> tr("", "")
}
|> remove_duplicates
|> drop_last_chars
|> cleanup
}
43 changes: 32 additions & 11 deletions src/phonetic_gleam/soundex.gleam
Original file line number Diff line number Diff line change
@@ -1,32 +1,53 @@
import gleam/list
import gleam/string
import phonetic_gleam/utils

// Maps a single uppercase grapheme to its Soundex digit (as a string).
// Every grapheme without a digit below (vowels, "H", "W", "Y", lowercase
// letters, anything else) yields the empty string.
fn tr_char(b) {
  case b {
    "R" -> "6"
    "N" | "M" -> "5"
    "L" -> "4"
    "T" | "D" -> "3"
    "Z" | "X" | "S" | "Q" | "K" | "J" | "G" | "C" -> "2"
    "V" | "P" | "F" | "B" -> "1"
    _ -> ""
  }
}

// Recursive translation step of the Soundex encoder: consumes `chars` one
// grapheme at a time, prepends results to `acc`, and reverses `acc` once the
// input is exhausted.
// NOTE(review): this span is a rendered diff, so arms of the previous
// implementation (the inline `case a` digit table) and of the new one (the
// `tr_char`-based arms) are interleaved below — confirm against the actual file.
fn tr(chars, acc) {
case chars {
[] -> list.reverse(acc)
[a, ..xs] if acc == [] -> tr(xs, [a, ..acc])
[a, ..xs] ->
case a {
"B" | "F" | "P" | "V" -> tr(xs, ["1", ..acc])
"C" | "G" | "J" | "K" | "Q" | "S" | "X" | "Z" -> tr(xs, ["2", ..acc])
"D" | "T" -> tr(xs, ["3", ..acc])
"L" -> tr(xs, ["4", ..acc])
"M" | "N" -> tr(xs, ["5", ..acc])
"R" -> tr(xs, ["6", ..acc])
_ -> tr(xs, acc)
[a, b, ..xs] if acc == [] -> {
// Does the first character share its code with the second one?
case tr_char(a) == tr_char(b) {
// Same code: keep the leading letter `a` as-is and drop `b`.
True -> tr(xs, [a, ..acc])
// Different code: keep `a` as-is and continue with `b` still in the input.
False -> tr([b, ..xs], [a, ..acc])
}
}
// Single remaining character with an empty accumulator: keep the letter itself.
[a, ..xs] if acc == [] -> tr(xs, [a, ..acc])
// Any later character is replaced by its `tr_char` code (possibly "").
[a, ..xs] -> tr(xs, [tr_char(a), ..acc])
}
}

// Turns the raw list produced by `tr` into the final code string:
// adjacent duplicates are collapsed first, then empty entries are removed,
// at most four entries are kept, and the joined string is right-padded
// with "0" to a length of four.
fn cleanup(codes) {
  let deduped = utils.remove_adjacent_dups(codes)
  let non_empty = utils.remove_value(deduped, "")
  let first_four = list.take(non_empty, 4)
  string.pad_right(string.join(first_four, ""), to: 4, with: "0")
}

// Normalises `word` for encoding: uppercases it, strips everything that is not
// one of the letters A-Z, and returns the remaining graphemes as a list.
fn prepare_word(word) {
  let upper = string.uppercase(word)
  let letters_only =
    utils.remove_not_allowed_chars(upper, "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  string.to_graphemes(letters_only)
}

// Public entry point of the Soundex encoder: pipes `word` through
// `prepare_word` and `tr` before producing the final string.
// NOTE(review): this span is a rendered diff — both `string.join("")` and
// `cleanup` appear below; presumably only one of them is in the real file.
pub fn encode(word) {
word
|> prepare_word
|> tr([])
|> string.join("")
|> cleanup
}
34 changes: 34 additions & 0 deletions src/phonetic_gleam/utils.gleam
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import gleam/list
import gleam/string
import gleam/set

// Returns `xs` without any element equal to `value`; the order of the
// remaining elements is unchanged.
pub fn remove_value(xs: List(a), value: a) {
  let is_other = fn(candidate) { candidate != value }
  list.filter(xs, is_other)
}

// Keeps only those graphemes of `word` that also occur as a grapheme in
// `allowed_chars` and joins them back into a string.
pub fn remove_not_allowed_chars(word: String, allowed_chars: String) {
  // A set gives a direct membership test per grapheme.
  let allowed = set.from_list(string.to_graphemes(allowed_chars))
  let kept = list.filter(string.to_graphemes(word), set.contains(allowed, _))
  string.join(kept, "")
}

// Collapses every run of equal neighbouring elements into a single element.
// The list is folded from the right, so each element is compared with the
// element that follows it in the already-processed tail.
pub fn remove_adjacent_dups(xs: List(a)) -> List(a) {
  let keep_unless_repeated = fn(kept, item) {
    case kept {
      [next, ..] if next == item -> kept
      _ -> [item, ..kept]
    }
  }
  list.fold_right(xs, [], keep_unless_repeated)
}

// Picks one of two already-evaluated values: `then` when `is` is `True`,
// `or_else` when it is `False`.
pub fn then_or_else(is: Bool, then: a, or_else: a) {
  case is {
    False -> or_else
    True -> then
  }
}
101 changes: 100 additions & 1 deletion test/soundex_test.gleam
Original file line number Diff line number Diff line change
@@ -1,6 +1,103 @@
import gleeunit/should
import phonetic_gleam/soundex

// Based on the tests from the Apache Commons Codec project.
// Encodes a handful of short English words and checks each expected code.
pub fn codes_basic_test() {
  should.equal(soundex.encode("testing"), "T235")
  should.equal(soundex.encode("The"), "T000")
  should.equal(soundex.encode("quick"), "Q200")
  should.equal(soundex.encode("brown"), "B650")
  should.equal(soundex.encode("fox"), "F200")
  should.equal(soundex.encode("jumped"), "J513")
  should.equal(soundex.encode("over"), "O160")
  should.equal(soundex.encode("the"), "T000")
  should.equal(soundex.encode("lazy"), "L200")
  should.equal(soundex.encode("dogs"), "D200")
}

// Based on the tests from the Apache Commons Codec project.
// Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
// Checks the expected code for each surname in this example set.
pub fn codes_bradandkathy_examples_test() {
  should.equal(soundex.encode("Allricht"), "A462")
  should.equal(soundex.encode("Eberhard"), "E166")
  should.equal(soundex.encode("Engebrethson"), "E521")
  should.equal(soundex.encode("Heimbach"), "H512")
  should.equal(soundex.encode("Hanselmann"), "H524")
  should.equal(soundex.encode("Hildebrand"), "H431")
  should.equal(soundex.encode("Kavanagh"), "K152")
  should.equal(soundex.encode("Lind"), "L530")
  should.equal(soundex.encode("Lukaschowsky"), "L222")
  should.equal(soundex.encode("McDonnell"), "M235")
  should.equal(soundex.encode("McGee"), "M200")
  should.equal(soundex.encode("Opnian"), "O155")
  should.equal(soundex.encode("Oppenheimer"), "O155")
  should.equal(soundex.encode("Riedemanas"), "R355")
  should.equal(soundex.encode("Zita"), "Z300")
  should.equal(soundex.encode("Zitzmeinn"), "Z325")
}

// Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
// Checks the expected code for each surname in this example set.
pub fn codes_census_examples_test() {
  should.equal(soundex.encode("Washington"), "W252")
  should.equal(soundex.encode("Lee"), "L000")
  should.equal(soundex.encode("Gutierrez"), "G362")
  should.equal(soundex.encode("Pfister"), "P236")
  should.equal(soundex.encode("Jackson"), "J250")
  should.equal(soundex.encode("Tymczak"), "T522")
}

// Examples from http://www.myatt.demon.co.uk/sxalg.htm
// Checks the expected code for each all-uppercase name in this example set.
pub fn codes_myatt_examples_test() {
  should.equal(soundex.encode("HOLMES"), "H452")
  should.equal(soundex.encode("ADOMOMI"), "A355")
  should.equal(soundex.encode("VONDERLEHR"), "V536")
  should.equal(soundex.encode("BALL"), "B400")
  should.equal(soundex.encode("SHAW"), "S000")
  should.equal(soundex.encode("JACKSON"), "J250")
  should.equal(soundex.encode("SCANLON"), "S545")
  should.equal(soundex.encode("SAINTJOHN"), "S532")
}

pub fn codes_for_known_words_test() {
soundex.encode("Britney")
|> should.equal("B635")
Expand All @@ -9,5 +106,7 @@ pub fn codes_for_known_words_test() {
soundex.encode("Spears")
|> should.equal("S162")
soundex.encode("Superzicke")
|> should.equal("S16222")
|> should.equal("S162")
soundex.encode("'OBrien")
|> should.equal("O165")
}
17 changes: 17 additions & 0 deletions test/utils_test.gleam
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import gleeunit/should
import phonetic_gleam/utils

// Exercises `utils.remove_adjacent_dups` on empty, single-element,
// duplicate-free and duplicate-containing lists.
// The `_test` suffix matters: gleeunit only discovers and runs public
// functions whose name ends in `_test`, so the previous name
// (`remove_adjacents_dup`) was never executed.
pub fn remove_adjacent_dups_test() {
  utils.remove_adjacent_dups([])
  |> should.equal([])
  utils.remove_adjacent_dups([1])
  |> should.equal([1])
  utils.remove_adjacent_dups([1, 1])
  |> should.equal([1])
  utils.remove_adjacent_dups([1, 2, 3])
  |> should.equal([1, 2, 3])
  utils.remove_adjacent_dups([1, 2, 2, 3, 3, 4])
  |> should.equal([1, 2, 3, 4])
  // Equal values separated by a different value are not adjacent and must both survive.
  utils.remove_adjacent_dups([1, 1, 2, 1, 1])
  |> should.equal([1, 2, 1])
}

0 comments on commit 86079c0

Please sign in to comment.