001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025 * general purpose scheme to find word with similar phonemes.
026 *
027 * <p>This class is thread-safe.
028 * Although not strictly immutable, the mutable fields are not actually used.</p>
029 */
030public class Soundex implements StringEncoder {
031
032    /**
033     * The marker character used to indicate a silent (ignored) character.
034     * These are ignored except when they appear as the first character.
035     * <p>
036     * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
037     * because changing it might break existing code. Mappings that don't contain
038     * a silent marker code are treated as though H and W are silent.
039     * <p>
040     * To override this, use the {@link #Soundex(String, boolean)} constructor.
041     * @since 1.11
042     */
043    public static final char SILENT_MARKER = '-';
044
045    /**
046     * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
047     * means do not encode, but treat as a separator when it occurs between consonants with the same code.
048     * <p>
049     * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
050     * up the value for the constant values page.)
051     * <p>
052     * <b>Note that letters H and W are treated specially.</b>
053     * They are ignored (after the first letter) and don't act as separators
054     * between consonants with the same code.
055     */
056    //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
057    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
058
059    /**
060     * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
061     * means do not encode.
062     *
063     * @see Soundex#Soundex(char[])
064     */
065    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
066
067    /**
068     * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
069     * This treats H and W as silent letters.
070     * Apart from when they appear as the first letter, they are ignored.
071     * They don't act as separators between duplicate codes.
072     *
073     * @see #US_ENGLISH_MAPPING_STRING
074     */
075    public static final Soundex US_ENGLISH = new Soundex();
076
077    /**
078     * An instance of Soundex using the Simplified Soundex mapping, as described here:
079     * http://west-penwith.org.uk/misc/soundex.htm
080     * <p>
081     * This treats H and W the same as vowels (AEIOUY).
082     * Such letters aren't encoded (after the first), but they do
083     * act as separators when dropping duplicate codes.
084     * The mapping is otherwise the same as for {@link #US_ENGLISH}
085     * <p>
086     * @since 1.11
087     */
088    public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
089
090    /**
091     * An instance of Soundex using the mapping as per the Genealogy site:
092     * http://www.genealogy.com/articles/research/00000060.html
093     * <p>
094     * This treats vowels (AEIOUY), H and W as silent letters.
095     * Such letters are ignored (after the first) and do not
096     * act as separators when dropping duplicate codes.
097     * <p>
098     * The codes for consonants are otherwise the same as for
099     * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
100     *
101     * @since 1.11
102     */
103    public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
104    //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
105
106    /**
107     * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
108     *
109     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
110     */
111    @Deprecated
112    private int maxLength = 4;
113
114    /**
115     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
116     * letter is mapped. This implementation contains a default map for US_ENGLISH
117     */
118    private final char[] soundexMapping;
119
120    /**
121     * Should H and W be treated specially?
122     * <p>
123     * In versions of the code prior to 1.11,
124     * the code always treated H and W as silent (ignored) letters.
125     * If this field is false, H and W are no longer special-cased.
126     */
127    private final boolean specialCaseHW;
128
129    /**
130     * Creates an instance using US_ENGLISH_MAPPING
131     *
132     * @see Soundex#Soundex(char[])
133     * @see Soundex#US_ENGLISH_MAPPING_STRING
134     */
135    public Soundex() {
136        this.soundexMapping = US_ENGLISH_MAPPING;
137        this.specialCaseHW = true;
138    }
139
140    /**
141     * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
142     * mapping for a non-Western character set.
143     *
144     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
145     * letter is mapped. This implementation contains a default map for US_ENGLISH
146     * <p>
147     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
148     *
149     * @param mapping
150     *                  Mapping array to use when finding the corresponding code for a given character
151     */
152    public Soundex(final char[] mapping) {
153        this.soundexMapping = new char[mapping.length];
154        System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
155        this.specialCaseHW = !hasMarker(this.soundexMapping);
156    }
157
158    private boolean hasMarker(final char[] mapping) {
159        for(final char ch : mapping) {
160            if (ch == SILENT_MARKER) {
161                return true;
162            }
163        }
164        return false;
165    }
166
167    /**
168     * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
169     * and/or possibly provide an internationalized mapping for a non-Western character set.
170     * <p>
171     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
172     *
173     * @param mapping
174     *            Mapping string to use when finding the corresponding code for a given character
175     * @since 1.4
176     */
177    public Soundex(final String mapping) {
178        this.soundexMapping = mapping.toCharArray();
179        this.specialCaseHW = !hasMarker(this.soundexMapping);
180    }
181
182    /**
183     * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
184     * and/or possibly provide an internationalized mapping for a non-Western character set.
185     *
186     * @param mapping
187     *            Mapping string to use when finding the corresponding code for a given character
188     * @param specialCaseHW if true, then
189     * @since 1.11
190     */
191    public Soundex(final String mapping, final boolean specialCaseHW) {
192        this.soundexMapping = mapping.toCharArray();
193        this.specialCaseHW = specialCaseHW;
194    }
195
196    /**
197     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
198     * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
199     * identical values.
200     *
201     * @param s1
202     *                  A String that will be encoded and compared.
203     * @param s2
204     *                  A String that will be encoded and compared.
205     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
206     *
207     * @see SoundexUtils#difference(StringEncoder,String,String)
208     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
209     *          T-SQL DIFFERENCE </a>
210     *
211     * @throws EncoderException
212     *                  if an error occurs encoding one of the strings
213     * @since 1.3
214     */
215    public int difference(final String s1, final String s2) throws EncoderException {
216        return SoundexUtils.difference(this, s1, s2);
217    }
218
219    /**
220     * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
221     * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
222     *
223     * @param obj
224     *                  Object to encode
225     * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
226     *             supplied.
227     * @throws EncoderException
228     *                  if the parameter supplied is not of type java.lang.String
229     * @throws IllegalArgumentException
230     *                  if a character is not mapped
231     */
232    @Override
233    public Object encode(final Object obj) throws EncoderException {
234        if (!(obj instanceof String)) {
235            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
236        }
237        return soundex((String) obj);
238    }
239
240    /**
241     * Encodes a String using the soundex algorithm.
242     *
243     * @param str
244     *                  A String object to encode
245     * @return A Soundex code corresponding to the String supplied
246     * @throws IllegalArgumentException
247     *                  if a character is not mapped
248     */
249    @Override
250    public String encode(final String str) {
251        return soundex(str);
252    }
253
254    /**
255     * Returns the maxLength. Standard Soundex
256     *
257     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
258     * @return int
259     */
260    @Deprecated
261    public int getMaxLength() {
262        return this.maxLength;
263    }
264
265    /**
266     * Maps the given upper-case character to its Soundex code.
267     *
268     * @param ch
269     *                  An upper-case character.
270     * @return A Soundex code.
271     * @throws IllegalArgumentException
272     *                  Thrown if {@code ch} is not mapped.
273     */
274    private char map(final char ch) {
275        final int index = ch - 'A';
276        if (index < 0 || index >= this.soundexMapping.length) {
277            throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
278        }
279        return this.soundexMapping[index];
280    }
281
282    /**
283     * Sets the maxLength.
284     *
285     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
286     * @param maxLength
287     *                  The maxLength to set
288     */
289    @Deprecated
290    public void setMaxLength(final int maxLength) {
291        this.maxLength = maxLength;
292    }
293
294    /**
295     * Retrieves the Soundex code for a given String object.
296     *
297     * @param str
298     *                  String to encode using the Soundex algorithm
299     * @return A soundex code for the String supplied
300     * @throws IllegalArgumentException
301     *                  if a character is not mapped
302     */
303    public String soundex(String str) {
304        if (str == null) {
305            return null;
306        }
307        str = SoundexUtils.clean(str);
308        if (str.length() == 0) {
309            return str;
310        }
311        final char out[] = {'0', '0', '0', '0'};
312        int count = 0;
313        final char first = str.charAt(0);
314        out[count++] = first;
315        char lastDigit = map(first); // previous digit
316        for(int i = 1; i < str.length() && count < out.length ; i++) {
317            final char ch = str.charAt(i);
318            if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
319                continue;
320            }
321            final char digit = map(ch);
322            if (digit == SILENT_MARKER) {
323                continue;
324            }
325            if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
326                out[count++] = digit;
327            }
328            lastDigit = digit;
329        }
330        return new String(out);
331    }
332
333}