BOMInputStream.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;

/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 * <p>
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 * </p>
 * <p>
 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
 * </p>
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
 * </ul>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = BOMInputStream.builder()
 *     .setInputStream(in)
 *     .setInclude(include)
 *     .get();
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h2>Example 3 - Detecting Multiple BOMs</h2>
 *
 * <pre>
 * BOMInputStream bomIn = BOMInputStream.builder()
 *   .setInputStream(in)
 *   .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)
 *   .get();
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 *
 * @see Builder
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    // @formatter:off
    /**
     * Builds a new {@link BOMInputStream}.
     *
     * <h2>Using NIO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setPath(Paths.get("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     * <h2>Using IO</h2>
     * <pre>{@code
     * BOMInputStream s = BOMInputStream.builder()
     *   .setFile(new File("MyFile.xml"))
     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
     *   .setInclude(false)
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 2.12.0
     */
    // @formatter:on
    public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {

        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };

        /**
         * For test access.
         *
         * @return the default byte order mark
         */
        static ByteOrderMark getDefaultByteOrderMark() {
            return DEFAULT[0];
        }

        private ByteOrderMark[] byteOrderMarks = DEFAULT;

        private boolean include;

        /**
         * Builds a new {@link BOMInputStream}.
         * <p>
         * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
         * </p>
         * <p>
         * This builder use the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
         * </p>
         * <p>
         * This builder use the following aspects:
         * </p>
         * <ul>
         * <li>{@link #getInputStream()}</li>
         * <li>include}</li>
         * <li>byteOrderMarks</li>
         * </ul>
         *
         * @return a new instance.
         * @throws IllegalStateException         if the {@code origin} is {@code null}.
         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
         * @throws IOException                   if an I/O error occurs.
         * @see #getInputStream()
         */
        @Override
        public BOMInputStream get() throws IOException {
            return new BOMInputStream(this);
        }

        /**
         * Sets the ByteOrderMarks to detect and optionally exclude.
         * <p>
         * The default is {@link ByteOrderMark#UTF_8}.
         * </p>
         *
         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
         * @return {@code this} instance.
         */
        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
            return this;
        }

        /**
         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
         * <p>
         * The default is false.
         * </p>
         *
         * @param include true to include the UTF-8 BOM or false to exclude it. return this;
         * @return {@code this} instance.
         */
        public Builder setInclude(final boolean include) {
            this.include = include;
            return this;
        }

    }

    /**
     * Compares ByteOrderMark objects in descending length order.
     */
    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> boms;

    private ByteOrderMark byteOrderMark;
    private int fbIndex;
    private int fbLength;
    private int[] firstBytes;
    private final boolean include;
    private boolean markedAtStart;
    private int markFbIndex;

    private BOMInputStream(final Builder builder) throws IOException {
        super(builder);
        if (IOUtils.length(builder.byteOrderMarks) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = builder.include;
        final List<ByteOrderMark> list = Arrays.asList(builder.byteOrderMarks);
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(ByteOrderMarkLengthComparator);
        this.boms = list;
    }

    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) {
        this(delegate, false, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the UTF-8 BOM or false to exclude it
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) {
        this(delegate, include, Builder.DEFAULT);
    }

    /**
     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param include
     *            true to include the specified BOMs or false to exclude them
     * @param boms
     *            The BOMs to detect and optionally exclude
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
        super(delegate);
        if (IOUtils.length(boms) == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        final List<ByteOrderMark> list = Arrays.asList(boms);
        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
        list.sort(ByteOrderMarkLengthComparator);
        this.boms = list;
    }

    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     *
     * @param delegate
     *            the InputStream to delegate to
     * @param boms
     *            The BOMs to detect and exclude
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Find a BOM with the specified bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        return boms.stream().filter(this::matches).findFirst().orElse(null);
    }

    /**
     * Gets the BOM (Byte Order Mark).
     *
     * @return The BOM or null if none
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            fbLength = 0;
            // BOMs are sorted from longest to shortest
            final int maxBomSize = boms.get(0).length();
            firstBytes = new int[maxBomSize];
            // Read first maxBomSize bytes
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                afterRead(firstBytes[i]);
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }
            }
            // match BOM in firstBytes
            byteOrderMark = find();
            if (byteOrderMark != null && !include) {
                if (byteOrderMark.length() < firstBytes.length) {
                    fbIndex = byteOrderMark.length();
                } else {
                    fbLength = 0;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
    }

    /**
     * Tests whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }

    /**
     * Tests whether the stream contains the specified BOM.
     *
     * @param bom
     *            The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false if it does not
     * @throws IllegalArgumentException
     *             if the BOM is not one the stream is configured to detect
     * @throws IOException
     *             if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configured to detect " + bom);
        }
        return Objects.equals(getBOM(), bom);
    }

    /**
     * Invokes the delegate's {@code mark(int)} method.
     *
     * @param readLimit
     *            read ahead limit
     */
    @Override
    public synchronized void mark(final int readLimit) {
        markFbIndex = fbIndex;
        markedAtStart = firstBytes == null;
        in.mark(readLimit);
    }

    /**
     * Checks if the bytes match a BOM.
     *
     * @param bom
     *            The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(final ByteOrderMark bom) {
        // if (bom.length() != fbLength) {
        // return false;
        // }
        // firstBytes may be bigger than the BOM bytes
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        checkOpen();
        final int b = readFirstBytes();
        return b >= 0 ? b : in.read();
    }

    /**
     * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM.
     *
     * @param buf
     *            the buffer to read the bytes into
     * @param off
     *            The start offset
     * @param len
     *            The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        int firstCount = 0;
        int b = 0;
        while (len > 0 && b >= 0) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        final int secondCount = in.read(buf, off, len);
        afterRead(secondCount);
        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
    }

    /**
     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
     * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been
     * processed already.
     *
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF;
    }

    /**
     * Invokes the delegate's {@code reset()} method.
     *
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            firstBytes = null;
        }
        in.reset();
    }

    /**
     * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM.
     *
     * @param n
     *            the number of bytes to skip
     * @return the number of bytes to skipped or -1 if the end of stream
     * @throws IOException
     *             if an I/O error occurs
     */
    @Override
    public long skip(final long n) throws IOException {
        int skipped = 0;
        while (n > skipped && readFirstBytes() >= 0) {
            skipped++;
        }
        return in.skip(n - skipped) + skipped;
    }
}