001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a 025 * general purpose scheme to find word with similar phonemes. 026 * 027 * <p>This class is thread-safe. 028 * Although not strictly immutable, the mutable fields are not actually used.</p> 029 */ 030public class Soundex implements StringEncoder { 031 032 /** 033 * The marker character used to indicate a silent (ignored) character. 034 * These are ignored except when they appear as the first character. 035 * <p> 036 * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism 037 * because changing it might break existing code. Mappings that don't contain 038 * a silent marker code are treated as though H and W are silent. 039 * <p> 040 * To override this, use the {@link #Soundex(String, boolean)} constructor. 041 * @since 1.11 042 */ 043 public static final char SILENT_MARKER = '-'; 044 045 /** 046 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position 047 * means do not encode, but treat as a separator when it occurs between consonants with the same code. 048 * <p> 049 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick 050 * up the value for the constant values page.) 051 * <p> 052 * <b>Note that letters H and W are treated specially.</b> 053 * They are ignored (after the first letter) and don't act as separators 054 * between consonants with the same code. 055 */ 056 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 057 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 058 059 /** 060 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position 061 * means do not encode. 062 * 063 * @see Soundex#Soundex(char[]) 064 */ 065 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 066 067 /** 068 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. 069 * This treats H and W as silent letters. 070 * Apart from when they appear as the first letter, they are ignored. 071 * They don't act as separators between duplicate codes. 072 * 073 * @see #US_ENGLISH_MAPPING_STRING 074 */ 075 public static final Soundex US_ENGLISH = new Soundex(); 076 077 /** 078 * An instance of Soundex using the Simplified Soundex mapping, as described here: 079 * http://west-penwith.org.uk/misc/soundex.htm 080 * <p> 081 * This treats H and W the same as vowels (AEIOUY). 082 * Such letters aren't encoded (after the first), but they do 083 * act as separators when dropping duplicate codes. 084 * The mapping is otherwise the same as for {@link #US_ENGLISH} 085 * <p> 086 * @since 1.11 087 */ 088 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false); 089 090 /** 091 * An instance of Soundex using the mapping as per the Genealogy site: 092 * http://www.genealogy.com/articles/research/00000060.html 093 * <p> 094 * This treats vowels (AEIOUY), H and W as silent letters. 095 * Such letters are ignored (after the first) and do not 096 * act as separators when dropping duplicate codes. 097 * <p> 098 * The codes for consonants are otherwise the same as for 099 * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED} 100 * 101 * @since 1.11 102 */ 103 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2"); 104 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 105 106 /** 107 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 108 * 109 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 110 */ 111 @Deprecated 112 private int maxLength = 4; 113 114 /** 115 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 116 * letter is mapped. This implementation contains a default map for US_ENGLISH 117 */ 118 private final char[] soundexMapping; 119 120 /** 121 * Should H and W be treated specially? 122 * <p> 123 * In versions of the code prior to 1.11, 124 * the code always treated H and W as silent (ignored) letters. 125 * If this field is false, H and W are no longer special-cased. 126 */ 127 private final boolean specialCaseHW; 128 129 /** 130 * Creates an instance using US_ENGLISH_MAPPING 131 * 132 * @see Soundex#Soundex(char[]) 133 * @see Soundex#US_ENGLISH_MAPPING_STRING 134 */ 135 public Soundex() { 136 this.soundexMapping = US_ENGLISH_MAPPING; 137 this.specialCaseHW = true; 138 } 139 140 /** 141 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized 142 * mapping for a non-Western character set. 143 * 144 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 145 * letter is mapped. This implementation contains a default map for US_ENGLISH 146 * <p> 147 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 148 * 149 * @param mapping 150 * Mapping array to use when finding the corresponding code for a given character 151 */ 152 public Soundex(final char[] mapping) { 153 this.soundexMapping = new char[mapping.length]; 154 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); 155 this.specialCaseHW = !hasMarker(this.soundexMapping); 156 } 157 158 private boolean hasMarker(final char[] mapping) { 159 for(final char ch : mapping) { 160 if (ch == SILENT_MARKER) { 161 return true; 162 } 163 } 164 return false; 165 } 166 167 /** 168 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 169 * and/or possibly provide an internationalized mapping for a non-Western character set. 170 * <p> 171 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 172 * 173 * @param mapping 174 * Mapping string to use when finding the corresponding code for a given character 175 * @since 1.4 176 */ 177 public Soundex(final String mapping) { 178 this.soundexMapping = mapping.toCharArray(); 179 this.specialCaseHW = !hasMarker(this.soundexMapping); 180 } 181 182 /** 183 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 184 * and/or possibly provide an internationalized mapping for a non-Western character set. 185 * 186 * @param mapping 187 * Mapping string to use when finding the corresponding code for a given character 188 * @param specialCaseHW if true, then 189 * @since 1.11 190 */ 191 public Soundex(final String mapping, final boolean specialCaseHW) { 192 this.soundexMapping = mapping.toCharArray(); 193 this.specialCaseHW = specialCaseHW; 194 } 195 196 /** 197 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This 198 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or 199 * identical values. 200 * 201 * @param s1 202 * A String that will be encoded and compared. 203 * @param s2 204 * A String that will be encoded and compared. 205 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 206 * 207 * @see SoundexUtils#difference(StringEncoder,String,String) 208 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS 209 * T-SQL DIFFERENCE </a> 210 * 211 * @throws EncoderException 212 * if an error occurs encoding one of the strings 213 * @since 1.3 214 */ 215 public int difference(final String s1, final String s2) throws EncoderException { 216 return SoundexUtils.difference(this, s1, s2); 217 } 218 219 /** 220 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of 221 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. 222 * 223 * @param obj 224 * Object to encode 225 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String 226 * supplied. 227 * @throws EncoderException 228 * if the parameter supplied is not of type java.lang.String 229 * @throws IllegalArgumentException 230 * if a character is not mapped 231 */ 232 @Override 233 public Object encode(final Object obj) throws EncoderException { 234 if (!(obj instanceof String)) { 235 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 236 } 237 return soundex((String) obj); 238 } 239 240 /** 241 * Encodes a String using the soundex algorithm. 242 * 243 * @param str 244 * A String object to encode 245 * @return A Soundex code corresponding to the String supplied 246 * @throws IllegalArgumentException 247 * if a character is not mapped 248 */ 249 @Override 250 public String encode(final String str) { 251 return soundex(str); 252 } 253 254 /** 255 * Returns the maxLength. Standard Soundex 256 * 257 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 258 * @return int 259 */ 260 @Deprecated 261 public int getMaxLength() { 262 return this.maxLength; 263 } 264 265 /** 266 * Maps the given upper-case character to its Soundex code. 267 * 268 * @param ch 269 * An upper-case character. 270 * @return A Soundex code. 271 * @throws IllegalArgumentException 272 * Thrown if {@code ch} is not mapped. 273 */ 274 private char map(final char ch) { 275 final int index = ch - 'A'; 276 if (index < 0 || index >= this.soundexMapping.length) { 277 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")"); 278 } 279 return this.soundexMapping[index]; 280 } 281 282 /** 283 * Sets the maxLength. 284 * 285 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 286 * @param maxLength 287 * The maxLength to set 288 */ 289 @Deprecated 290 public void setMaxLength(final int maxLength) { 291 this.maxLength = maxLength; 292 } 293 294 /** 295 * Retrieves the Soundex code for a given String object. 296 * 297 * @param str 298 * String to encode using the Soundex algorithm 299 * @return A soundex code for the String supplied 300 * @throws IllegalArgumentException 301 * if a character is not mapped 302 */ 303 public String soundex(String str) { 304 if (str == null) { 305 return null; 306 } 307 str = SoundexUtils.clean(str); 308 if (str.length() == 0) { 309 return str; 310 } 311 final char out[] = {'0', '0', '0', '0'}; 312 int count = 0; 313 final char first = str.charAt(0); 314 out[count++] = first; 315 char lastDigit = map(first); // previous digit 316 for(int i = 1; i < str.length() && count < out.length ; i++) { 317 final char ch = str.charAt(i); 318 if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely 319 continue; 320 } 321 final char digit = map(ch); 322 if (digit == SILENT_MARKER) { 323 continue; 324 } 325 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats 326 out[count++] = digit; 327 } 328 lastDigit = digit; 329 } 330 return new String(out); 331 } 332 333}