/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = BOMInputStream.builder()
 *     .setInputStream(in)
 *     .setInclude(include)
 *     .get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detecting Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder()
 *   .setInputStream(in)
 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
 *   .get();
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see Builder
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    // @formatter:off
    /**
     * Builds a new {@link BOMInputStream}.
     *
     * <h2>Using NIO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setPath(Paths.get("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     * <h2>Using IO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setFile(new File("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 2.12.0
     */
    // @formatter:on
    public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {

        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

        /**
         * For test access.
         *
         * @return the default byte order mark
         */
        static ByteOrderMark getDefaultByteOrderMark() {
            return DEFAULT[0];
        }

        /** The BOMs to detect; never {@code null}, see {@link #setByteOrderMarks(ByteOrderMark...)}. */
        private ByteOrderMark[] byteOrderMarks = DEFAULT;

        /** Whether a detected BOM is returned to the reader ({@code true}) or skipped ({@code false}). */
        private boolean include;

        /**
         * Constructs a new builder of {@link BOMInputStream}.
         */
        public Builder() {
            // empty
        }

        /**
         * Builds a new {@link BOMInputStream}.
         * <p>
         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
         * </p>
         * <p>
         * This builder uses the following aspects:
         * </p>
         * <ul>
         * <li>{@link #getInputStream()}</li>
         * <li>include</li>
         * <li>byteOrderMarks</li>
         * </ul>
         *
         * @return a new instance.
         * @throws IllegalStateException         if the {@code origin} is {@code null}.
         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
         * @see #getInputStream()
         * @see #getUnchecked()
         */
        @Override
        public BOMInputStream get() throws IOException {
            return new BOMInputStream(this);
        }

        /**
         * Sets the ByteOrderMarks to detect and optionally exclude.
         * <p>
         * The default is {@link ByteOrderMark#UTF_8}. The given array is copied; passing {@code null} restores the default.
         * </p>
         *
         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
         * @return {@code this} instance.
         */
        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
            return this;
        }

        /**
         * Sets whether to include a detected BOM in the bytes returned by the stream (true) or to exclude it (false).
         * <p>
         * The default is false.
         * </p>
         *
         * @param include true to include the detected BOM or false to exclude it.
         * @return {@code this} instance.
         */
        public Builder setInclude(final boolean include) {
            this.include = include;
            return this;
        }

    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Copies the given BOMs into a list sorted from longest to shortest.
     * <p>
     * The copy matters: {@link Arrays#asList(Object...)} writes through to its backing array, so sorting a view of the
     * caller's (or the builder's) array would reorder that array as a side effect.
     * </p>
     *
     * @param boms the BOMs to copy; must not be {@code null}.
     * @return a fixed-size list backed by a private copy of {@code boms}, longest BOM first.
     */
    private static List<ByteOrderMark> sortedCopy(final ByteOrderMark[] boms) {
        final List<ByteOrderMark> list = Arrays.asList(boms.clone());
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(ByteOrderMarkLengthComparator);
        return list;
    }

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> bomList;

    /** The BOM matched by {@link #readBom()}, or {@code null} if none matched or detection has not run yet. */
    private ByteOrderMark byteOrderMark;

    /** Index in {@link #firstBytes} of the next buffered byte to hand out. */
    private int fbIndex;

    /** Number of entries of {@link #firstBytes} that were filled by {@link #readBom()}. */
    private int fbLength;

    /** The first bytes read from the delegate for BOM detection; {@code null} until detection has run. */
    private int[] firstBytes;

    /** Whether a detected BOM is returned to the reader ({@code true}) or skipped ({@code false}). */
    private final boolean include;

    /** Whether the last {@link #mark(int)} call happened before the first bytes were read. */
    private boolean markedAtStart;

    /** Value of {@link #fbIndex} at the last {@link #mark(int)} call. */
    private int markFbIndex;

    /**
     * Constructs a new instance from a builder.
     *
     * @param builder the builder supplying the delegate stream, the BOMs to detect and the include flag.
     * @throws IOException              if an I/O error occurs while the superclass is constructed from the builder.
     * @throws IllegalArgumentException if the builder has no BOMs to detect.
     */
    private BOMInputStream(final Builder builder) throws IOException {
        super(builder);
        if (IOUtils.length(builder.byteOrderMarks) == 0) {
            throw new IllegalArgumentException("No ByteOrderMark specified.");
        }
        this.include = builder.include;
        // Sort a private copy so the builder's array is neither reordered nor shared between built streams.
        this.bomList = sortedCopy(builder.byteOrderMarks);
    }

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     * <p>
     * The given array is copied; it is not modified by this class.
     * </p>
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @throws IllegalArgumentException
     *             if {@code boms} is {@code null} or empty
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (IOUtils.length(boms) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        // Sort a private copy so the caller's array is not reordered as a side effect.
        this.bomList = sortedCopy(boms);
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
     *
     * @return The matched BOM or null if none matched.
     */
    private ByteOrderMark find() {
        return bomList.stream().filter(this::matches).findFirst().orElse(null);
    }

    /**
     * Gets the ByteOrderMark (Byte Order Mark).
     * <p>
     * The first call reads the leading bytes of the delegate; later calls return the cached result.
     * </p>
     *
     * @return The BOM or null if none matched.
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs.
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            byteOrderMark = readBom();
        }
        return byteOrderMark;
    }

    /**
     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * Tests whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Tests whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!bomList.contains(bom)) {
            throw new IllegalArgumentException("Stream not configured to detect " + bom);
        }
        return Objects.equals(getBOM(), bom);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method, also remembering the position within the buffered first bytes.
     *
     * @param readLimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readLimit) {
        markFbIndex = fbIndex;
        // If nothing has been read yet, reset() must trigger BOM detection again.
        markedAtStart = firstBytes == null;
        in.mark(readLimit);
    }

    /**
     * Checks if the bytes match a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        return bom.matches(firstBytes);
    }

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        checkOpen();
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        // First drain any buffered first bytes that are still pending.
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        // Then fill the remainder of the request from the delegate.
        final int secondCount = in.read(buf, off, len);
        afterRead(secondCount);
        if (secondCount >= 0) {
            return firstCount + secondCount;
        }
        // The delegate is at end of stream: report EOF only if no buffered byte was delivered either.
        return firstCount > 0 ? firstCount : EOF;
    }

    /**
     * Reads up to as many bytes as the longest configured BOM from the delegate into {@code firstBytes} and matches
     * them against the configured BOMs.
     * <p>
     * If a BOM matches and it must be excluded, the buffered bytes belonging to it are marked as consumed.
     * </p>
     *
     * @return The matched BOM or null if none matched.
     * @throws IOException
     *             if an I/O error occurs reading from the delegate.
     */
    private ByteOrderMark readBom() throws IOException {
        fbLength = 0;
        // BOMs are sorted from longest to shortest
        final int maxBomSize = bomList.get(0).length();
        firstBytes = new int[maxBomSize];
        // Read first maxBomSize bytes
        for (int i = 0; i < firstBytes.length; i++) {
            firstBytes[i] = in.read();
            afterRead(firstBytes[i]);
            // The end-of-stream marker is counted too; it is later handed out as EOF by readFirstBytes().
            fbLength++;
            if (firstBytes[i] < 0) {
                break;
            }
        }
        // match BOM in firstBytes
        final ByteOrderMark bom = find();
        if (bom != null && !include) {
            if (bom.length() < firstBytes.length) {
                // Skip only the BOM; the bytes read beyond it are still data.
                fbIndex = bom.length();
            } else {
                // The BOM fills the whole buffer, so nothing buffered remains to hand out.
                fbLength = 0;
            }
        }
        return bom;
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Invokes the delegate's {@code reset()} method, also restoring the position within the buffered first bytes.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            // The mark was set before detection ran, so the first bytes must be read again.
            firstBytes = null;
        }
        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of buffered first bytes skipped plus the value returned by the delegate's {@code skip(long)}
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        // Buffered first bytes count towards the number of bytes skipped.
        int skipped = 0;
        while (n > skipped && readFirstBytes() >= 0) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}