/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 *
 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = new BOMInputStream(in, include);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detect Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in,
 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
 *   );
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> BYTE_ORDER_MARK_LENGTH_COMPARATOR =
            Comparator.comparingInt(ByteOrderMark::length).reversed();

    /** Whether a detected BOM is returned to the reader ({@code true}) or skipped ({@code false}). */
    private final boolean include;

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    /** The BOM matched by {@link #getBOM()}, or {@code null} if none matched (or detection has not run yet). */
    private ByteOrderMark byteOrderMark;

    /** Look-ahead buffer holding the first bytes read from the delegate; {@code null} until detection has run. */
    private int[] firstBytes;

    /** Number of slots of {@link #firstBytes} that may still be served to the reader. */
    private int fbLength;

    /** Index of the next slot of {@link #firstBytes} to serve to the reader. */
    private int fbIndex;

    /** Value of {@link #fbIndex} captured by the last call to {@link #mark(int)}. */
    private int markFbIndex;

    /** Whether the last call to {@link #mark(int)} happened before BOM detection had run. */
    private boolean markedAtStart;

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     */
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, ByteOrderMark.UTF_8);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     */
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, ByteOrderMark.UTF_8);
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     */
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if no BOMs are specified
     */
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (IOUtils.length(boms) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        // Arrays.asList is a write-through view of its array, so sort a private copy
        // rather than silently reordering the array the caller handed in.
        final List<ByteOrderMark> list = Arrays.asList(boms.clone());
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(BYTE_ORDER_MARK_LENGTH_COMPARATOR);
        this.boms = list;
    }

    /**
     * Indicates whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Indicates whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configure to detect " + bom);
        }
        getBOM();
        return byteOrderMark != null && byteOrderMark.equals(bom);
    }

    /**
     * Return the BOM (Byte Order Mark).
     * <p>
     * The first call reads up to the length of the longest configured BOM from the delegate and buffers those bytes;
     * later calls return the cached result.
     * </p>
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes. A negative read result (end of stream) is stored and counted too:
            // it keeps a longer BOM from matching against the zero-filled, never-read tail of the buffer.
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    // Bytes buffered beyond the BOM are real content: start serving from just after the BOM.
                    fbIndex = byteOrderMark.length();
                } else {
                    // The BOM fills the whole buffer: nothing buffered is left to serve.
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the buffered first bytes are exhausted
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Find the first configured BOM (longest first) that matches the buffered first bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        for (final ByteOrderMark bom : boms) {
            if (matches(bom)) {
                return bom;
            }
        }
        return null;
    }

    /**
     * Check if the buffered first bytes start with a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // firstBytes may be bigger than the BOM bytes, so only the BOM's own length is compared
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    // ----------------------------------------------------------------------------
    // Implementation of InputStream
    // ----------------------------------------------------------------------------

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        // Drain any buffered first bytes before touching the delegate.
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        final int secondCount = in.read(buf, off, len);
        if (secondCount < 0) {
            // The delegate is exhausted: report buffered bytes if any were delivered, otherwise end of stream.
            return firstCount > 0 ? firstCount : EOF;
        }
        return firstCount + secondCount;
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method, also remembering the position within the buffered first
     * bytes and whether BOM detection had already run.
     *
     * @param readlimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readlimit) {
        markFbIndex = fbIndex;
        markedAtStart = firstBytes == null;
        in.mark(readlimit);
    }

    /**
     * Invokes the delegate's {@code reset()} method, also restoring the position within the buffered first bytes.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // The mark was set before detection ran, so discard the buffer and detect again on the next read.
            firstBytes = null;
        }

        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of buffered first bytes skipped plus the value returned by the delegate's
     *         {@code skip(long)} for the remainder
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        // Buffered first bytes are consumed one at a time; the delegate skips whatever remains.
        int skipped = 0;
        while ((n > skipped) && (readFirstBytes() >= 0)) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}