When trying to link new records to an existing database, the Soundex algorithm is often used to convert English names to a phonetic code to avoid duplicates that could arise from misspelling a given sound. Unfortunately, Soundex doesn’t work too well with German names. For this purpose, the Kölner Phonetik is a tried and tested method. There are implementations in C#, but I found them less than intuitive. So here’s my humble attempt to change that.
using System.Text;
using System.Collections.Generic;
using System.Linq;
namespace Phonetic
{
/// <summary>
/// Implements the conversion of words to phonetic Codes by application of Cologne phonetics rules.
/// </summary>
/// <remarks>
/// Cologne phonetics is supposed to yield better results than Soundex regarding German words.
/// Contrary to Soundex, the length of the phonetic Code is not limited.
/// </remarks>
public class ColognePhonetic
{
    /// <summary>
    /// One row of the Cologne phonetics table: a letter, the code it produces and the
    /// optional constraints on its neighbouring characters.
    /// Straight-forward translation of https://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik to C#.
    /// </summary>
    public class Rule
    {
        /// <summary>
        /// Stands in for a missing neighbour, i.e. the position before the first or after
        /// the last character of the input. Needed to express 'initial'.
        /// </summary>
        public const char EmptyChar = '\0';

        // Built completely inside the type initializer before it becomes visible, so no
        // caller can ever observe a partially filled table.
        private static readonly Dictionary<char, Rule[]> _rules = BuildRules();

        /// <summary>
        /// All rules, grouped by the (upper-case) letter they apply to. The rules of one
        /// letter are listed in evaluation order; the first one that applies wins.
        /// </summary>
        public static Dictionary<char, Rule[]> All => _rules;

        private readonly char _letter;

        // Neighbour constraints; null means "no constraint".
        private char[] _notPrevious;
        private char[] _notNext;
        private char[] _previous;
        private char[] _next;

        /// <summary>The code emitted when this rule applies (two characters for 'X').</summary>
        public string Code { get; private set; }

        /// <summary>
        /// Creates an unconstrained rule mapping <paramref name="letter"/> to <paramref name="Code"/>.
        /// </summary>
        /// <param name="letter">The upper-case letter the rule is responsible for.</param>
        /// <param name="Code">The code to emit for that letter.</param>
        public Rule(char letter, string Code)
        {
            _letter = letter;
            this.Code = Code;
        }

        /// <summary>
        /// Tells whether this rule matches the character <paramref name="curr"/> in the
        /// context of its neighbours.
        /// </summary>
        /// <param name="prev">The preceding character, or <see cref="EmptyChar"/> at the start.</param>
        /// <param name="curr">The character to encode.</param>
        /// <param name="next">The following character, or <see cref="EmptyChar"/> at the end.</param>
        /// <returns><c>true</c> if the letter matches and every neighbour constraint is met.</returns>
        public bool Applies(char prev, char curr, char next)
        {
            return curr == _letter
                && IsAllowed(_previous, prev)
                && IsNotExcluded(_notPrevious, prev)
                && IsAllowed(_next, next)
                && IsNotExcluded(_notNext, next);
        }

        // A null whitelist places no restriction on the neighbour.
        private static bool IsAllowed(char[] whitelist, char c) => whitelist == null || whitelist.Contains(c);

        // A null blacklist excludes nothing.
        private static bool IsNotExcluded(char[] blacklist, char c) => blacklist == null || !blacklist.Contains(c);

        // Creates the complete rule table. The order of the AddRule calls is significant
        // for letters with several rules ('C', 'X', 'P', 'D', 'T'): first match wins.
        private static Dictionary<char, Rule[]> BuildRules()
        {
            var table = new Dictionary<char, List<Rule>>();

            // 'C' counts as initial at the very start of the input or right after a blank.
            string wordStart = EmptyChar + " ";

            AddRule(table, "AEIJOUYÖÜÄ", Code: "0");
            AddRule(table, "B", Code: "1");
            AddRule(table, "P", NotNext: "H", Code: "1");
            AddRule(table, "DT", NotNext: "CSZ", Code: "2");
            AddRule(table, "FVW", Code: "3");
            AddRule(table, "P", Next: "H", Code: "3");
            AddRule(table, "GKQ", Code: "4");
            AddRule(table, "C", Previous: wordStart, Next: "AHKLOQRUX", Code: "4");
            AddRule(table, "C", Next: "AHKOQUX", NotPrevious: "SZ", Code: "4");
            AddRule(table, "X", NotPrevious: "CKQ", Code: "48");
            AddRule(table, "L", Code: "5");
            AddRule(table, "MN", Code: "6");
            AddRule(table, "R", Code: "7");
            AddRule(table, "SZß", Code: "8");
            AddRule(table, "C", Previous: "SZ", Code: "8");
            AddRule(table, "C", Previous: wordStart, NotNext: "AHKLOQRUX", Code: "8");
            AddRule(table, "C", NotNext: "AHKOQUX", Code: "8");
            AddRule(table, "DT", Next: "CSZ", Code: "8");
            AddRule(table, "X", Previous: "CKQ", Code: "8");

            return table.ToDictionary(entry => entry.Key, entry => entry.Value.ToArray());
        }

        // Appends one rule per character of Letters to the table, all sharing the same
        // code and neighbour constraints.
        private static void AddRule(Dictionary<char, List<Rule>> table, string Letters, string Code, string NotPrevious = null, string Previous = null, string NotNext = null, string Next = null)
        {
            foreach (char letter in Letters)
            {
                List<Rule> bucket;
                if (!table.TryGetValue(letter, out bucket))
                {
                    bucket = new List<Rule>();
                    table[letter] = bucket;
                }

                bucket.Add(new Rule(letter, Code)
                {
                    _notPrevious = NotPrevious?.ToCharArray(),
                    _notNext = NotNext?.ToCharArray(),
                    _previous = Previous?.ToCharArray(),
                    _next = Next?.ToCharArray()
                });
            }
        }
    }

    // The finished phonetic code, computed once in the constructor.
    private readonly string _code;

    /// <summary>
    /// Computes the Cologne phonetics code of <paramref name="s"/>. Case is ignored;
    /// characters without a rule (e.g. 'H', digits, hyphens) produce no code.
    /// </summary>
    /// <param name="s">The word or name to encode.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
    public ColognePhonetic(string s)
    {
        if (s == null)
        {
            throw new System.ArgumentNullException(nameof(s));
        }

        // The rule table is keyed by upper-case letters ('ß' has no upper-case form and stays as is).
        string raw = Encode(s.ToUpperInvariant());
        _code = DiscardZeroes(RemoveMultiples(raw));
    }

    /// <summary>
    /// Step 1: replaces every character by the code of the first rule that applies to it.
    /// </summary>
    private static string Encode(string word)
    {
        // 'X' may expand to two code characters, hence twice the input length.
        var raw = new StringBuilder(word.Length * 2);

        for (int pos = 0; pos < word.Length; pos++)
        {
            Rule[] candidates;
            if (!Rule.All.TryGetValue(word[pos], out candidates))
            {
                continue;
            }

            char prev = pos > 0 ? word[pos - 1] : Rule.EmptyChar;
            char next = pos + 1 < word.Length ? word[pos + 1] : Rule.EmptyChar;

            foreach (var rule in candidates)
            {
                if (rule.Applies(prev, word[pos], next))
                {
                    raw.Append(rule.Code);
                    break;
                }
            }
        }

        return raw.ToString();
    }

    /// <summary>
    /// Step 2: collapses every run of identical neighbouring code chars into a single one.
    /// </summary>
    private static string RemoveMultiples(string digits)
    {
        var collapsed = new StringBuilder(digits.Length);

        foreach (char digit in digits)
        {
            if (collapsed.Length == 0 || collapsed[collapsed.Length - 1] != digit)
            {
                collapsed.Append(digit);
            }
        }

        return collapsed.ToString();
    }

    /// <summary>
    /// Step 3: removes all '0' code chars except at the beginning.
    /// </summary>
    private static string DiscardZeroes(string digits)
    {
        var kept = new StringBuilder(digits.Length);

        for (int i = 0; i < digits.Length; i++)
        {
            if (i == 0 || digits[i] != '0')
            {
                kept.Append(digits[i]);
            }
        }

        return kept.ToString();
    }

    /// <summary>Returns the phonetic code computed for the input word.</summary>
    public override string ToString() => _code;

    /// <summary>Small demo: prints the code of the example name used by the Wikipedia article.</summary>
    public static void Main(string[] args)
    {
        System.Console.WriteLine(new ColognePhonetic("Müller-Lüdenscheidt"));
    }
}
}
Posted in
programming
2016-05-12 20:51 UTC