001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024import java.util.Comparator;
025import java.util.List;
026import java.util.Objects;
027
028import org.apache.commons.io.ByteOrderMark;
029import org.apache.commons.io.IOUtils;
030
031/**
032 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
033 * <p>
034 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
035 * first byte in the stream.
036 * </p>
037 * <p>
038 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
039 * </p>
040 * <ul>
041 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
042 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
043 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
044 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
045 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
046 * </ul>
047 * <p>
048 * To build an instance, use {@link Builder}.
049 * </p>
050 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
051 *
052 * <pre>
053 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
054 * if (bomIn.hasBOM()) {
055 *     // has a UTF-8 BOM
056 * }
057 * </pre>
058 *
059 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
060 *
061 * <pre>
062 * boolean include = true;
063 * BOMInputStream bomIn = BOMInputStream.builder()
064 *     .setInputStream(in)
065 *     .setInclude(include)
066 *     .get();
067 * if (bomIn.hasBOM()) {
068 *     // has a UTF-8 BOM
069 * }
070 * </pre>
071 *
072 * <h2>Example 3 - Detecting Multiple BOMs</h2>
073 *
074 * <pre>
075 * BOMInputStream bomIn = BOMInputStream.builder()
076 *   .setInputStream(in)
077 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
078 *   .get();
079 * if (bomIn.hasBOM() == false) {
080 *     // No BOM found
081 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
082 *     // has a UTF-16LE BOM
083 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
084 *     // has a UTF-16BE BOM
085 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
086 *     // has a UTF-32LE BOM
087 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
088 *     // has a UTF-32BE BOM
089 * }
090 * </pre>
091 * <p>
092 * To build an instance, use {@link Builder}.
093 * </p>
094 *
095 * @see Builder
096 * @see org.apache.commons.io.ByteOrderMark
097 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
098 * @since 2.0
099 */
100public class BOMInputStream extends ProxyInputStream {
101
102    // @formatter:off
103    /**
104     * Builds a new {@link BOMInputStream}.
105     *
106     * <h2>Using NIO</h2>
107     * <pre>{@code
108     * BOMInputStream s = BOMInputStream.builder()
109     *   .setPath(Paths.get("MyFile.xml"))
110     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
111     *   .setInclude(false)
112     *   .get();}
113     * </pre>
114     * <h2>Using IO</h2>
115     * <pre>{@code
116     * BOMInputStream s = BOMInputStream.builder()
117     *   .setFile(new File("MyFile.xml"))
118     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
119     *   .setInclude(false)
120     *   .get();}
121     * </pre>
122     *
123     * @see #get()
124     * @since 2.12.0
125     */
126    // @formatter:on
127    public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
128
129        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
130
131        /**
132         * For test access.
133         *
134         * @return the default byte order mark
135         */
136        static ByteOrderMark getDefaultByteOrderMark() {
137            return DEFAULT[0];
138        }
139
140        private ByteOrderMark[] byteOrderMarks = DEFAULT;
141
142        private boolean include;
143
144        /**
145         * Constructs a new builder of {@link BOMInputStream}.
146         */
147        public Builder() {
148            // empty
149        }
150
151        /**
152         * Builds a new {@link BOMInputStream}.
153         * <p>
154         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
155         * </p>
156         * <p>
157         * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
158         * </p>
159         * <p>
160         * This builder uses the following aspects:
161         * </p>
162         * <ul>
163         * <li>{@link #getInputStream()}</li>
164         * <li>include}</li>
165         * <li>byteOrderMarks</li>
166         * </ul>
167         *
168         * @return a new instance.
169         * @throws IllegalStateException         if the {@code origin} is {@code null}.
170         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
171         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
172         * @see #getInputStream()
173         * @see #getUnchecked()
174         */
175        @Override
176        public BOMInputStream get() throws IOException {
177            return new BOMInputStream(this);
178        }
179
180        /**
181         * Sets the ByteOrderMarks to detect and optionally exclude.
182         * <p>
183         * The default is {@link ByteOrderMark#UTF_8}.
184         * </p>
185         *
186         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
187         * @return {@code this} instance.
188         */
189        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
190            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
191            return this;
192        }
193
194        /**
195         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
196         * <p>
197         * The default is false.
198         * </p>
199         *
200         * @param include true to include the UTF-8 BOM or false to exclude it. return this;
201         * @return {@code this} instance.
202         */
203        public Builder setInclude(final boolean include) {
204            this.include = include;
205            return this;
206        }
207
208    }
209
210    /**
211     * Compares ByteOrderMark objects in descending length order.
212     */
213    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
214
215    /**
216     * Constructs a new {@link Builder}.
217     *
218     * @return a new {@link Builder}.
219     * @since 2.12.0
220     */
221    public static Builder builder() {
222        return new Builder();
223    }
224
225    /**
226     * BOMs are sorted from longest to shortest.
227     */
228    private final List<ByteOrderMark> bomList;
229
230    private ByteOrderMark byteOrderMark;
231    private int fbIndex;
232    private int fbLength;
233    private int[] firstBytes;
234    private final boolean include;
235    private boolean markedAtStart;
236    private int markFbIndex;
237
238    private BOMInputStream(final Builder builder) throws IOException {
239        super(builder);
240        if (IOUtils.length(builder.byteOrderMarks) == 0) {
241            throw new IllegalArgumentException("No ByteOrderMark specified.");
242        }
243        this.include = builder.include;
244        final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks);
245        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
246        list.sort(ByteOrderMarkLengthComparator);
247        this.bomList = list;
248    }
249
250    /**
251     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
252     *
253     * @param delegate
254     *            the InputStream to delegate to
255     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
256     */
257    @Deprecated
258    public BOMInputStream(final InputStream delegate) {
259        this(delegate, false, Builder.DEFAULT);
260    }
261
262    /**
263     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
264     *
265     * @param delegate
266     *            the InputStream to delegate to
267     * @param include
268     *            true to include the UTF-8 BOM or false to exclude it
269     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
270     */
271    @Deprecated
272    public BOMInputStream(final InputStream delegate, final boolean include) {
273        this(delegate, include, Builder.DEFAULT);
274    }
275
276    /**
277     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
278     *
279     * @param delegate
280     *            the InputStream to delegate to
281     * @param include
282     *            true to include the specified BOMs or false to exclude them
283     * @param boms
284     *            The BOMs to detect and optionally exclude
285     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
286     */
287    @Deprecated
288    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
289        super(delegate);
290        if (IOUtils.length(boms) == 0) {
291            throw new IllegalArgumentException("No BOMs specified");
292        }
293        this.include = include;
294        final List<ByteOrderMark> list = Arrays.asList(boms);
295        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
296        list.sort(ByteOrderMarkLengthComparator);
297        this.bomList = list;
298    }
299
300    /**
301     * Constructs a new BOM InputStream that excludes the specified BOMs.
302     *
303     * @param delegate
304     *            the InputStream to delegate to
305     * @param boms
306     *            The BOMs to detect and exclude
307     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
308     */
309    @Deprecated
310    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
311        this(delegate, false, boms);
312    }
313
314    /**
315     * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
316     *
317     * @return The matched BOM or null if none matched.
318     */
319    private ByteOrderMark find() {
320        return bomList.stream().filter(this::matches).findFirst().orElse(null);
321    }
322
323    /**
324     * Gets the ByteOrderMark (Byte Order Mark).
325     *
326     * @return The BOM or null if none matched.
327     * @throws IOException
328     *             if an error reading the first bytes of the stream occurs.
329     */
330    public ByteOrderMark getBOM() throws IOException {
331        if (firstBytes == null) {
332            byteOrderMark = readBom();
333        }
334        return byteOrderMark;
335    }
336
337    /**
338     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
339     *
340     * @return The BOM charset Name or null if no BOM found
341     * @throws IOException
342     *             if an error reading the first bytes of the stream occurs
343     */
344    public String getBOMCharsetName() throws IOException {
345        getBOM();
346        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
347    }
348
349    /**
350     * Tests whether the stream contains one of the specified BOMs.
351     *
352     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
353     * @throws IOException
354     *             if an error reading the first bytes of the stream occurs
355     */
356    public boolean hasBOM() throws IOException {
357        return getBOM() != null;
358    }
359
360    /**
361     * Tests whether the stream contains the specified BOM.
362     *
363     * @param bom
364     *            The BOM to check for
365     * @return true if the stream has the specified BOM, otherwise false if it does not
366     * @throws IllegalArgumentException
367     *             if the BOM is not one the stream is configured to detect
368     * @throws IOException
369     *             if an error reading the first bytes of the stream occurs
370     */
371    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
372        if (!bomList.contains(bom)) {
373            throw new IllegalArgumentException("Stream not configured to detect " + bom);
374        }
375        return Objects.equals(getBOM(), bom);
376    }
377
378    /**
379     * Invokes the delegate's {@code mark(int)} method.
380     *
381     * @param readLimit
382     *            read ahead limit
383     */
384    @Override
385    public synchronized void mark(final int readLimit) {
386        markFbIndex = fbIndex;
387        markedAtStart = firstBytes == null;
388        in.mark(readLimit);
389    }
390
391    /**
392     * Checks if the bytes match a BOM.
393     *
394     * @param bom
395     *            The BOM
396     * @return true if the bytes match the bom, otherwise false
397     */
398    private boolean matches(final ByteOrderMark bom) {
399        return bom.matches(firstBytes);
400    }
401
402    /**
403     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
404     *
405     * @return the byte read (excluding BOM) or -1 if the end of stream
406     * @throws IOException
407     *             if an I/O error occurs
408     */
409    @Override
410    public int read() throws IOException {
411        checkOpen();
412        final int b = readFirstBytes();
413        return b >= 0 ? b : in.read();
414    }
415
416    /**
417     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
418     *
419     * @param buf
420     *            the buffer to read the bytes into
421     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
422     * @throws IOException
423     *             if an I/O error occurs
424     */
425    @Override
426    public int read(final byte[] buf) throws IOException {
427        return read(buf, 0, buf.length);
428    }
429
430    /**
431     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
432     *
433     * @param buf
434     *            the buffer to read the bytes into
435     * @param off
436     *            The start offset
437     * @param len
438     *            The number of bytes to read (excluding BOM)
439     * @return the number of bytes read or -1 if the end of stream
440     * @throws IOException
441     *             if an I/O error occurs
442     */
443    @Override
444    public int read(final byte[] buf, int off, int len) throws IOException {
445        int firstCount = 0;
446        int b = 0;
447        while (len > 0 && b >= 0) {
448            b = readFirstBytes();
449            if (b >= 0) {
450                buf[off++] = (byte) (b & 0xFF);
451                len--;
452                firstCount++;
453            }
454        }
455        final int secondCount = in.read(buf, off, len);
456        afterRead(secondCount);
457        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
458    }
459
460    private ByteOrderMark readBom() throws IOException {
461        fbLength = 0;
462        // BOMs are sorted from longest to shortest
463        final int maxBomSize = bomList.get(0).length();
464        firstBytes = new int[maxBomSize];
465        // Read first maxBomSize bytes
466        for (int i = 0; i < firstBytes.length; i++) {
467            firstBytes[i] = in.read();
468            afterRead(firstBytes[i]);
469            fbLength++;
470            if (firstBytes[i] < 0) {
471                break;
472            }
473        }
474        // match BOM in firstBytes
475        final ByteOrderMark bom = find();
476        if (bom != null && !include) {
477            if (bom.length() < firstBytes.length) {
478                fbIndex = bom.length();
479            } else {
480                fbLength = 0;
481            }
482        }
483        return bom;
484    }
485
486    /**
487     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
488     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
489     * processed already.
490     *
491     * @return the byte read (excluding BOM) or -1 if the end of stream
492     * @throws IOException
493     *             if an I/O error occurs
494     */
495    private int readFirstBytes() throws IOException {
496        getBOM();
497        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
498    }
499
500    /**
501     * Invokes the delegate's {@code reset()} method.
502     *
503     * @throws IOException
504     *             if an I/O error occurs
505     */
506    @Override
507    public synchronized void reset() throws IOException {
508        fbIndex = markFbIndex;
509        if (markedAtStart) {
510            firstBytes = null;
511        }
512        in.reset();
513    }
514
515    /**
516     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
517     *
518     * @param n
519     *            the number of bytes to skip
520     * @return the number of bytes to skipped or -1 if the end of stream
521     * @throws IOException
522     *             if an I/O error occurs
523     */
524    @Override
525    public long skip(final long n) throws IOException {
526        int skipped = 0;
527        while (n > skipped && readFirstBytes() >= 0) {
528            skipped++;
529        }
530        return in.skip(n - skipped) + skipped;
531    }
532}