/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = BOMInputStream.builder()
 *     .setInputStream(in)
 *     .setInclude(include)
 *     .get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detecting Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder()
 *   .setInputStream(in)
 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
 *   .get();
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see Builder
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    // @formatter:off
    /**
     * Builds a new {@link BOMInputStream}.
     *
     * <h2>Using NIO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setPath(Paths.get("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     * <h2>Using IO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setFile(new File("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 2.12.0
     */
    // @formatter:on
    public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {

        /** The BOMs detected when none are set explicitly: UTF-8 only. */
        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

        /**
         * For test access.
         *
         * @return the default byte order mark
         */
        static ByteOrderMark getDefaultByteOrderMark() {
            return DEFAULT[0];
        }

        /** The BOMs to detect; never {@code null}, see {@link #setByteOrderMarks(ByteOrderMark...)}. */
        private ByteOrderMark[] byteOrderMarks = DEFAULT;

        /** Whether a detected BOM is returned to the reader ({@code true}) or skipped ({@code false}). */
        private boolean include;

        /**
         * Builds a new {@link BOMInputStream}.
         * <p>
         * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
         * </p>
         * <p>
         * This builder uses the following aspects:
         * </p>
         * <ul>
         * <li>{@link #getInputStream()}</li>
         * <li>{@code OpenOption[]}</li>
         * <li>include</li>
         * <li>byteOrderMarks</li>
         * </ul>
         *
         * @return a new instance.
         * @throws IllegalArgumentException if no byte order marks are configured.
         * @throws IllegalStateException if the {@code origin} is {@code null}.
         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
         * @throws IOException if an I/O error occurs.
         * @see #getInputStream()
         */
        @Override
        public BOMInputStream get() throws IOException {
            return new BOMInputStream(this);
        }

        /**
         * Sets the ByteOrderMarks to detect and optionally exclude.
         * <p>
         * The default is {@link ByteOrderMark#UTF_8}. The given array is copied; passing {@code null} restores the
         * default.
         * </p>
         *
         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
         * @return {@code this} instance.
         */
        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
            return this;
        }

        /**
         * Sets whether to include the detected BOM (true) or to exclude it (false).
         * <p>
         * The default is false.
         * </p>
         *
         * @param include true to include the BOM or false to exclude it.
         * @return {@code this} instance.
         */
        public Builder setInclude(final boolean include) {
            this.include = include;
            return this;
        }

    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> LENGTH_DESCENDING = Comparator.comparingInt(ByteOrderMark::length).reversed();

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Validates the given BOMs and returns them as a list sorted from longest to shortest.
     * <p>
     * The input array is copied before sorting: {@link Arrays#asList(Object...)} is a write-through view, so sorting
     * it directly would reorder the caller's array and keep an alias to it.
     * </p>
     *
     * @param byteOrderMarks the BOMs to detect
     * @return a list, backed by a private copy, sorted by descending BOM length
     * @throws IllegalArgumentException if {@code byteOrderMarks} is {@code null} or empty
     */
    private static List<ByteOrderMark> toSortedList(final ByteOrderMark[] byteOrderMarks) {
        if (IOUtils.length(byteOrderMarks) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        final List<ByteOrderMark> list = Arrays.asList(byteOrderMarks.clone());
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(LENGTH_DESCENDING);
        return list;
    }

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    /** The detected BOM, or {@code null} if none matched or detection has not run yet. */
    private ByteOrderMark byteOrderMark;

    /** Index of the next buffered first byte to hand out. */
    private int fbIndex;

    /** Number of valid entries in {@link #firstBytes} that may still be handed out. */
    private int fbLength;

    /** The first bytes read from the delegate for BOM detection; {@code null} until detection has run. */
    private int[] firstBytes;

    /** Whether a detected BOM is returned to the reader instead of being skipped. */
    private final boolean include;

    /** Whether {@link #mark(int)} was called before BOM detection had run. */
    private boolean markedAtStart;

    /** The value of {@link #fbIndex} captured by {@link #mark(int)}. */
    private int markFbIndex;

    /**
     * Constructs a new instance from the given builder.
     *
     * @param builder the builder supplying the input, the include flag, and the BOMs to detect
     * @throws IllegalArgumentException if the builder has no BOMs configured
     * @throws IOException if an I/O error occurs
     */
    private BOMInputStream(final Builder builder) throws IOException {
        super(builder);
        this.include = builder.include;
        this.boms = toSortedList(builder.byteOrderMarks);
    }

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     * <p>
     * The {@code boms} array is copied; it is neither reordered nor retained.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if {@code boms} is {@code null} or empty
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        this.include = include;
        this.boms = toSortedList(boms);
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Finds the first configured BOM, longest first, that matches the buffered first bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        for (final ByteOrderMark bom : boms) {
            if (matches(bom)) {
                return bom;
            }
        }
        return null;
    }

    /**
     * Gets the BOM (Byte Order Mark).
     * <p>
     * On the first call this reads up to as many bytes as the longest configured BOM from the delegate and buffers
     * them; later calls return the cached result.
     * </p>
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                // NOTE(review): the hook receives the byte value here, but a byte count in
                // read(byte[], int, int) - confirm which one afterRead(int) expects.
                afterRead(firstBytes[i]);
                // The EOF marker is counted too, so it is later handed out by readFirstBytes().
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    // A shorter BOM matched: skip it but keep the bytes read beyond it.
                    fbIndex = byteOrderMark.length();
                } else {
                    // The BOM fills the whole buffer: nothing buffered is left to hand out.
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * Tests whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Tests whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configured to detect " + bom);
        }
        return Objects.equals(getBOM(), bom);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method, also remembering the position within the buffered first
     * bytes so that {@link #reset()} can restore it.
     *
     * @param readLimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readLimit) {
        markFbIndex = fbIndex;
        // If detection has not run yet, reset() must discard the buffer so detection runs again.
        markedAtStart = firstBytes == null;
        in.mark(readLimit);
    }

    /**
     * Checks if the buffered first bytes start with the given BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // firstBytes may be bigger than the BOM bytes
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        checkOpen();
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     * <p>
     * Buffered first bytes are copied into {@code buf} first; the remainder of the request is passed to the
     * delegate.
     * </p>
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        // NOTE(review): unlike read(), this method does not call checkOpen() - confirm whether that is intended.
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        final int secondCount = in.read(buf, off, len);
        afterRead(secondCount);
        // Report EOF only when neither the buffer nor the delegate produced any bytes.
        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the next buffered first byte (excluding a skipped BOM), or -1 if the buffered bytes are exhausted or
     *         the end of stream was reached while buffering
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Invokes the delegate's {@code reset()} method, also restoring the position within the buffered first bytes
     * captured by {@link #mark(int)}.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // The mark was set before detection ran: drop the buffer so the first bytes are read again.
            firstBytes = null;
        }
        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     * <p>
     * Buffered first bytes are consumed first; the remaining count is passed to the delegate.
     * </p>
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of buffered first bytes consumed plus the value returned by the delegate's
     *         {@code skip(long)}
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        int skipped = 0;
        while (n > skipped && readFirstBytes() >= 0) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}