001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import org.apache.commons.io.ByteOrderMark;
027import org.apache.commons.io.IOUtils;
028
029/**
030 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
031 *
032 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
033 * first byte in the stream.
034 *
035 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
036 * <ul>
037 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
038 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
039 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
040 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
041 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
042 * </ul>
043 *
044 *
045 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2>
046 *
047 * <pre>
048 * BOMInputStream bomIn = new BOMInputStream(in);
049 * if (bomIn.hasBOM()) {
050 *     // has a UTF-8 BOM
051 * }
052 * </pre>
053 *
054 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2>
055 *
056 * <pre>
057 * boolean include = true;
058 * BOMInputStream bomIn = new BOMInputStream(in, include);
059 * if (bomIn.hasBOM()) {
060 *     // has a UTF-8 BOM
061 * }
062 * </pre>
063 *
064 * <h2>Example 3 - Detect Multiple BOMs</h2>
065 *
066 * <pre>
067 * BOMInputStream bomIn = new BOMInputStream(in,
068 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
069 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
070 *   );
071 * if (bomIn.hasBOM() == false) {
072 *     // No BOM found
073 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
074 *     // has a UTF-16LE BOM
075 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
076 *     // has a UTF-16BE BOM
077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
078 *     // has a UTF-32LE BOM
079 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
080 *     // has a UTF-32BE BOM
081 * }
082 * </pre>
083 *
084 * @see org.apache.commons.io.ByteOrderMark
085 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
086 * @since 2.0
087 */
088public class BOMInputStream extends ProxyInputStream {
089    private final boolean include;
090    /**
091     * BOMs are sorted from longest to shortest.
092     */
093    private final List<ByteOrderMark> boms;
094    private ByteOrderMark byteOrderMark;
095    private int[] firstBytes;
096    private int fbLength;
097    private int fbIndex;
098    private int markFbIndex;
099    private boolean markedAtStart;
100
101    /**
102     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
103     *
104     * @param delegate
105     *            the InputStream to delegate to
106     */
107    public BOMInputStream(final InputStream delegate) {
108        this(delegate, false, ByteOrderMark.UTF_8);
109    }
110
111    /**
112     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
113     *
114     * @param delegate
115     *            the InputStream to delegate to
116     * @param include
117     *            true to include the UTF-8 BOM or false to exclude it
118     */
119    public BOMInputStream(final InputStream delegate, final boolean include) {
120        this(delegate, include, ByteOrderMark.UTF_8);
121    }
122
123    /**
124     * Constructs a new BOM InputStream that excludes the specified BOMs.
125     *
126     * @param delegate
127     *            the InputStream to delegate to
128     * @param boms
129     *            The BOMs to detect and exclude
130     */
131    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
132        this(delegate, false, boms);
133    }
134
135    /**
136     * Compares ByteOrderMark objects in descending length order.
137     */
138    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = (bom1, bom2) -> {
139        final int len1 = bom1.length();
140        final int len2 = bom2.length();
141        return Integer.compare(len2, len1);
142    };
143
144    /**
145     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
146     *
147     * @param delegate
148     *            the InputStream to delegate to
149     * @param include
150     *            true to include the specified BOMs or false to exclude them
151     * @param boms
152     *            The BOMs to detect and optionally exclude
153     */
154    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
155        super(delegate);
156        if (IOUtils.length(boms) == 0) {
157            throw new IllegalArgumentException("No BOMs specified");
158        }
159        this.include = include;
160        final List<ByteOrderMark> list = Arrays.asList(boms);
161        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
162        list.sort(ByteOrderMarkLengthComparator);
163        this.boms = list;
164
165    }
166
167    /**
168     * Indicates whether the stream contains one of the specified BOMs.
169     *
170     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
171     * @throws IOException
172     *             if an error reading the first bytes of the stream occurs
173     */
174    public boolean hasBOM() throws IOException {
175        return getBOM() != null;
176    }
177
178    /**
179     * Indicates whether the stream contains the specified BOM.
180     *
181     * @param bom
182     *            The BOM to check for
183     * @return true if the stream has the specified BOM, otherwise false if it does not
184     * @throws IllegalArgumentException
185     *             if the BOM is not one the stream is configured to detect
186     * @throws IOException
187     *             if an error reading the first bytes of the stream occurs
188     */
189    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
190        if (!boms.contains(bom)) {
191            throw new IllegalArgumentException("Stream not configure to detect " + bom);
192        }
193        getBOM();
194        return byteOrderMark != null && byteOrderMark.equals(bom);
195    }
196
197    /**
198     * Return the BOM (Byte Order Mark).
199     *
200     * @return The BOM or null if none
201     * @throws IOException
202     *             if an error reading the first bytes of the stream occurs
203     */
204    public ByteOrderMark getBOM() throws IOException {
205        if (firstBytes == null) {
206            fbLength = 0;
207            // BOMs are sorted from longest to shortest
208            final int maxBomSize = boms.get(0).length();
209            firstBytes = new int[maxBomSize];
210            // Read first maxBomSize bytes
211            for (int i = 0; i < firstBytes.length; i++) {
212                firstBytes[i] = in.read();
213                fbLength++;
214                if (firstBytes[i] < 0) {
215                    break;
216                }
217            }
218            // match BOM in firstBytes
219            byteOrderMark = find();
220            if (byteOrderMark != null) {
221                if (!include) {
222                    if (byteOrderMark.length() < firstBytes.length) {
223                        fbIndex = byteOrderMark.length();
224                    } else {
225                        fbLength = 0;
226                    }
227                }
228            }
229        }
230        return byteOrderMark;
231    }
232
233    /**
234     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
235     *
236     * @return The BOM charset Name or null if no BOM found
237     * @throws IOException
238     *             if an error reading the first bytes of the stream occurs
239     *
240     */
241    public String getBOMCharsetName() throws IOException {
242        getBOM();
243        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
244    }
245
246    /**
247     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
248     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
249     * processed already.
250     *
251     * @return the byte read (excluding BOM) or -1 if the end of stream
252     * @throws IOException
253     *             if an I/O error occurs
254     */
255    private int readFirstBytes() throws IOException {
256        getBOM();
257        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
258    }
259
260    /**
261     * Find a BOM with the specified bytes.
262     *
263     * @return The matched BOM or null if none matched
264     */
265    private ByteOrderMark find() {
266        for (final ByteOrderMark bom : boms) {
267            if (matches(bom)) {
268                return bom;
269            }
270        }
271        return null;
272    }
273
274    /**
275     * Check if the bytes match a BOM.
276     *
277     * @param bom
278     *            The BOM
279     * @return true if the bytes match the bom, otherwise false
280     */
281    private boolean matches(final ByteOrderMark bom) {
282        // if (bom.length() != fbLength) {
283        // return false;
284        // }
285        // firstBytes may be bigger than the BOM bytes
286        for (int i = 0; i < bom.length(); i++) {
287            if (bom.get(i) != firstBytes[i]) {
288                return false;
289            }
290        }
291        return true;
292    }
293
294    // ----------------------------------------------------------------------------
295    // Implementation of InputStream
296    // ----------------------------------------------------------------------------
297
298    /**
299     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
300     *
301     * @return the byte read (excluding BOM) or -1 if the end of stream
302     * @throws IOException
303     *             if an I/O error occurs
304     */
305    @Override
306    public int read() throws IOException {
307        final int b = readFirstBytes();
308        return b >= 0 ? b : in.read();
309    }
310
311    /**
312     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
313     *
314     * @param buf
315     *            the buffer to read the bytes into
316     * @param off
317     *            The start offset
318     * @param len
319     *            The number of bytes to read (excluding BOM)
320     * @return the number of bytes read or -1 if the end of stream
321     * @throws IOException
322     *             if an I/O error occurs
323     */
324    @Override
325    public int read(final byte[] buf, int off, int len) throws IOException {
326        int firstCount = 0;
327        int b = 0;
328        while (len > 0 && b >= 0) {
329            b = readFirstBytes();
330            if (b >= 0) {
331                buf[off++] = (byte) (b & 0xFF);
332                len--;
333                firstCount++;
334            }
335        }
336        final int secondCount = in.read(buf, off, len);
337        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
338    }
339
340    /**
341     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
342     *
343     * @param buf
344     *            the buffer to read the bytes into
345     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
346     * @throws IOException
347     *             if an I/O error occurs
348     */
349    @Override
350    public int read(final byte[] buf) throws IOException {
351        return read(buf, 0, buf.length);
352    }
353
354    /**
355     * Invokes the delegate's <code>mark(int)</code> method.
356     *
357     * @param readlimit
358     *            read ahead limit
359     */
360    @Override
361    public synchronized void mark(final int readlimit) {
362        markFbIndex = fbIndex;
363        markedAtStart = firstBytes == null;
364        in.mark(readlimit);
365    }
366
367    /**
368     * Invokes the delegate's <code>reset()</code> method.
369     *
370     * @throws IOException
371     *             if an I/O error occurs
372     */
373    @Override
374    public synchronized void reset() throws IOException {
375        fbIndex = markFbIndex;
376        if (markedAtStart) {
377            firstBytes = null;
378        }
379
380        in.reset();
381    }
382
383    /**
384     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionally skipping BOM.
385     *
386     * @param n
387     *            the number of bytes to skip
388     * @return the number of bytes to skipped or -1 if the end of stream
389     * @throws IOException
390     *             if an I/O error occurs
391     */
392    @Override
393    public long skip(final long n) throws IOException {
394        int skipped = 0;
395        while ((n > skipped) && (readFirstBytes() >= 0)) {
396            skipped++;
397        }
398        return in.skip(n - skipped) + skipped;
399    }
400}