001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import java.util.Locale; 021 022import org.apache.commons.codec.EncoderException; 023import org.apache.commons.codec.StringEncoder; 024 025/** 026 * Encodes a string into a Cologne Phonetic value. 027 * <p> 028 * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Kölner Phonetik</a> (Cologne 029 * Phonetic) algorithm issued by Hans Joachim Postel in 1969. 030 * </p> 031 * <p> 032 * The <i>Kölner Phonetik</i> is a phonetic algorithm which is optimized for the German language. It is related to 033 * the well-known soundex algorithm. 034 * </p> 035 * 036 * <h2>Algorithm</h2> 037 * 038 * <ul> 039 * 040 * <li> 041 * <h3>Step 1:</h3> 042 * After preprocessing (conversion to upper case, transcription of <a 043 * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the 044 * letters of the supplied text are replaced by their phonetic code according to the following table. 045 * <table border="1"> 046 * <caption style="caption-side: bottom"><small><i>(Source: <a 047 * href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes">Wikipedia (de): Kölner Phonetik -- 048 * Buchstabencodes</a>)</i></small></caption> <tbody> 049 * <tr> 050 * <th>Letter</th> 051 * <th>Context</th> 052 * <th>Code</th> 053 * </tr> 054 * <tr> 055 * <td>A, E, I, J, O, U, Y</td> 056 * <td></td> 057 * <td>0</td> 058 * </tr> 059 * <tr> 060 * 061 * <td>H</td> 062 * <td></td> 063 * <td>-</td> 064 * </tr> 065 * <tr> 066 * <td>B</td> 067 * <td></td> 068 * <td rowspan="2">1</td> 069 * </tr> 070 * <tr> 071 * <td>P</td> 072 * <td>not before H</td> 073 * 074 * </tr> 075 * <tr> 076 * <td>D, T</td> 077 * <td>not before C, S, Z</td> 078 * <td>2</td> 079 * </tr> 080 * <tr> 081 * <td>F, V, W</td> 082 * <td></td> 083 * <td rowspan="2">3</td> 084 * </tr> 085 * <tr> 086 * 087 * <td>P</td> 088 * <td>before H</td> 089 * </tr> 090 * <tr> 091 * <td>G, K, Q</td> 092 * <td></td> 093 * <td rowspan="3">4</td> 094 * </tr> 095 * <tr> 096 * <td rowspan="2">C</td> 097 * <td>at onset before A, H, K, L, O, Q, R, U, X</td> 098 * 099 * </tr> 100 * <tr> 101 * <td>before A, H, K, O, Q, U, X except after S, Z</td> 102 * </tr> 103 * <tr> 104 * <td>X</td> 105 * <td>not after C, K, Q</td> 106 * <td>48</td> 107 * </tr> 108 * <tr> 109 * <td>L</td> 110 * <td></td> 111 * 112 * <td>5</td> 113 * </tr> 114 * <tr> 115 * <td>M, N</td> 116 * <td></td> 117 * <td>6</td> 118 * </tr> 119 * <tr> 120 * <td>R</td> 121 * <td></td> 122 * <td>7</td> 123 * </tr> 124 * 125 * <tr> 126 * <td>S, Z</td> 127 * <td></td> 128 * <td rowspan="6">8</td> 129 * </tr> 130 * <tr> 131 * <td rowspan="3">C</td> 132 * <td>after S, Z</td> 133 * </tr> 134 * <tr> 135 * <td>at onset except before A, H, K, L, O, Q, R, U, X</td> 136 * </tr> 137 * 138 * <tr> 139 * <td>not before A, H, K, O, Q, U, X</td> 140 * </tr> 141 * <tr> 142 * <td>D, T</td> 143 * <td>before C, S, Z</td> 144 * </tr> 145 * <tr> 146 * <td>X</td> 147 * <td>after C, K, Q</td> 148 * </tr> 149 * </tbody> 150 * </table> 151 * 152 * <h4>Example:</h4> 153 * 154 * {@code "M}ü{@code ller-L}ü<code>denscheidt" 155 * => "MULLERLUDENSCHEIDT" => "6005507500206880022"</code> 156 * 157 * </li> 158 * 159 * <li> 160 * <h3>Step 2:</h3> 161 * Collapse of all multiple consecutive code digits. 162 * <h4>Example:</h4> 163 * {@code "6005507500206880022" => "6050750206802"}</li> 164 * 165 * <li> 166 * <h3>Step 3:</h3> 167 * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur 168 * if they occur after removing the "0" digits. 169 * 170 * <h4>Example:</h4> 171 * {@code "6050750206802" => "65752682"}</li> 172 * 173 * </ul> 174 * 175 * <p> 176 * This class is thread-safe. 177 * </p> 178 * 179 * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a> 180 * @since 1.5 181 */ 182public class ColognePhonetic implements StringEncoder { 183 184 // Predefined char arrays for better performance and less GC load 185 private static final char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; 186 private static final char[] CSZ = new char[] { 'C', 'S', 'Z' }; 187 private static final char[] FPVW = new char[] { 'F', 'P', 'V', 'W' }; 188 private static final char[] GKQ = new char[] { 'G', 'K', 'Q' }; 189 private static final char[] CKQ = new char[] { 'C', 'K', 'Q' }; 190 private static final char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' }; 191 private static final char[] SZ = new char[] { 'S', 'Z' }; 192 private static final char[] AHKOQUX = new char[] { 'A', 'H', 'K', 'O', 'Q', 'U', 'X' }; 193 private static final char[] DTX = new char[] { 'D', 'T', 'X' }; 194 195 private static final char CHAR_IGNORE = '-'; // is this character to be ignored? 196 197 /** 198 * This class is not thread-safe; the field {@link #length} is mutable. 199 * However, it is not shared between threads, as it is constructed on demand 200 * by the method {@link ColognePhonetic#colognePhonetic(String)} 201 */ 202 private abstract class CologneBuffer { 203 204 protected final char[] data; 205 206 protected int length = 0; 207 208 public CologneBuffer(final char[] data) { 209 this.data = data; 210 this.length = data.length; 211 } 212 213 public CologneBuffer(final int buffSize) { 214 this.data = new char[buffSize]; 215 this.length = 0; 216 } 217 218 protected abstract char[] copyData(int start, final int length); 219 220 public int length() { 221 return length; 222 } 223 224 @Override 225 public String toString() { 226 return new String(copyData(0, length)); 227 } 228 } 229 230 private class CologneOutputBuffer extends CologneBuffer { 231 232 private char lastCode; 233 234 public CologneOutputBuffer(final int buffSize) { 235 super(buffSize); 236 lastCode = '/'; // impossible value 237 } 238 239 /** 240 * Stores the next code in the output buffer, keeping track of the previous code. 241 * '0' is only stored if it is the first entry. 242 * Ignored chars are never stored. 243 * If the code is the same as the last code (whether stored or not) it is not stored. 244 * 245 * @param code the code to store. 246 */ 247 public void put(final char code) { 248 if (code != CHAR_IGNORE && lastCode != code && (code != '0' || length == 0)) { 249 data[length] = code; 250 length++; 251 } 252 lastCode = code; 253 } 254 255 @Override 256 protected char[] copyData(final int start, final int length) { 257 final char[] newData = new char[length]; 258 System.arraycopy(data, start, newData, 0, length); 259 return newData; 260 } 261 } 262 263 private class CologneInputBuffer extends CologneBuffer { 264 265 public CologneInputBuffer(final char[] data) { 266 super(data); 267 } 268 269 @Override 270 protected char[] copyData(final int start, final int length) { 271 final char[] newData = new char[length]; 272 System.arraycopy(data, data.length - this.length + start, newData, 0, length); 273 return newData; 274 } 275 276 public char getNextChar() { 277 return data[getNextPos()]; 278 } 279 280 protected int getNextPos() { 281 return data.length - length; 282 } 283 284 public char removeNext() { 285 final char ch = getNextChar(); 286 length--; 287 return ch; 288 } 289 } 290 291 /* 292 * Returns whether the array contains the key, or not. 293 */ 294 private static boolean arrayContains(final char[] arr, final char key) { 295 for (final char element : arr) { 296 if (element == key) { 297 return true; 298 } 299 } 300 return false; 301 } 302 303 /** 304 * <p> 305 * Implements the <i>Kölner Phonetik</i> algorithm. 306 * </p> 307 * <p> 308 * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass. 309 * </p> 310 * 311 * @param text The source text to encode 312 * @return the corresponding encoding according to the <i>Kölner Phonetik</i> algorithm 313 */ 314 public String colognePhonetic(final String text) { 315 if (text == null) { 316 return null; 317 } 318 319 final CologneInputBuffer input = new CologneInputBuffer(preprocess(text)); 320 final CologneOutputBuffer output = new CologneOutputBuffer(input.length() * 2); 321 322 char nextChar; 323 324 char lastChar = CHAR_IGNORE; 325 char chr; 326 327 while (input.length() > 0) { 328 chr = input.removeNext(); 329 330 if (input.length() > 0) { 331 nextChar = input.getNextChar(); 332 } else { 333 nextChar = CHAR_IGNORE; 334 } 335 336 if (chr < 'A' || chr > 'Z') { 337 continue; // ignore unwanted characters 338 } 339 340 if (arrayContains(AEIJOUY, chr)) { 341 output.put('0'); 342 } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) { 343 output.put('1'); 344 } else if ((chr == 'D' || chr == 'T') && !arrayContains(CSZ, nextChar)) { 345 output.put('2'); 346 } else if (arrayContains(FPVW, chr)) { 347 output.put('3'); 348 } else if (arrayContains(GKQ, chr)) { 349 output.put('4'); 350 } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) { 351 output.put('4'); 352 output.put('8'); 353 } else if (chr == 'S' || chr == 'Z') { 354 output.put('8'); 355 } else if (chr == 'C') { 356 if (output.length() == 0) { 357 if (arrayContains(AHKLOQRUX, nextChar)) { 358 output.put('4'); 359 } else { 360 output.put('8'); 361 } 362 } else { 363 if (arrayContains(SZ, lastChar) || !arrayContains(AHKOQUX, nextChar)) { 364 output.put('8'); 365 } else { 366 output.put('4'); 367 } 368 } 369 } else if (arrayContains(DTX, chr)) { 370 output.put('8'); 371 } else if (chr == 'R') { 372 output.put('7'); 373 } else if (chr == 'L') { 374 output.put('5'); 375 } else if (chr == 'M' || chr == 'N') { 376 output.put('6'); 377 } else if (chr == 'H') { 378 output.put(CHAR_IGNORE); // needed by put 379 } else { 380 // ignored; should not happen 381 } 382 383 lastChar = chr; 384 } 385 return output.toString(); 386 } 387 388 @Override 389 public Object encode(final Object object) throws EncoderException { 390 if (!(object instanceof String)) { 391 throw new EncoderException("This method's parameter was expected to be of the type " + 392 String.class.getName() + 393 ". But actually it was of the type " + 394 object.getClass().getName() + 395 "."); 396 } 397 return encode((String) object); 398 } 399 400 @Override 401 public String encode(final String text) { 402 return colognePhonetic(text); 403 } 404 405 /** 406 * Compares the first encoded string to the second encoded string. 407 * 408 * @param text1 source text to encode before testing for equality. 409 * @param text2 source text to encode before testing for equality. 410 * @return {@code true} if the encoding the first string equals the encoding of the second string, {@code false} 411 * otherwise 412 */ 413 public boolean isEncodeEqual(final String text1, final String text2) { 414 return colognePhonetic(text1).equals(colognePhonetic(text2)); 415 } 416 417 /** 418 * Converts the string to upper case and replaces Germanic umlaut characters 419 * The following characters are mapped: 420 * <ul> 421 * <li>capital A, umlaut mark</li> 422 * <li>capital U, umlaut mark</li> 423 * <li>capital O, umlaut mark</li> 424 * <li>small sharp s, German</li> 425 * </ul> 426 */ 427 private char[] preprocess(final String text) { 428 // This converts German small sharp s (Eszett) to SS 429 final char[] chrs = text.toUpperCase(Locale.GERMAN).toCharArray(); 430 431 for (int index = 0; index < chrs.length; index++) { 432 switch (chrs[index]) { 433 case '\u00C4': // capital A, umlaut mark 434 chrs[index] = 'A'; 435 break; 436 case '\u00DC': // capital U, umlaut mark 437 chrs[index] = 'U'; 438 break; 439 case '\u00D6': // capital O, umlaut mark 440 chrs[index] = 'O'; 441 break; 442 default: 443 break; 444 } 445 } 446 return chrs; 447 } 448}