// ByteOrderMark.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io;

import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Objects;

/**
 * Byte Order Mark (BOM) representation. See {@link org.apache.commons.io.input.BOMInputStream}.
 * <p>
 * We define the following BOM constants:
 * </p>
 * <ul>
 * <li>{@link #UTF_16BE}</li>
 * <li>{@link #UTF_16LE}</li>
 * <li>{@link #UTF_32BE}</li>
 * <li>{@link #UTF_32LE}</li>
 * <li>{@link #UTF_8}</li>
 * </ul>
 * <p>
 * Instances are immutable: the byte values are copied on construction and only copies are handed out.
 * Equality and hash code are based solely on the byte values; the charset name does not take part.
 * </p>
 * <h2>Deprecating Serialization</h2>
 * <p>
 * <em>Serialization is deprecated and will be removed in 3.0.</em>
 * </p>
 *
 * @see org.apache.commons.io.input.BOMInputStream
 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia: Byte Order Mark</a>
 * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing">W3C: Autodetection of Character Encodings
 *      (Non-Normative)</a>
 * @since 2.0
 */
public class ByteOrderMark implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * UTF-8 BOM.
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xEF 0xBB 0xBF
     * </pre>
     */
    public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8.name(), 0xEF, 0xBB, 0xBF);

    /**
     * UTF-16BE BOM (Big-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xFE 0xFF
     * </pre>
     */
    public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE.name(), 0xFE, 0xFF);

    /**
     * UTF-16LE BOM (Little-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xFF 0xFE
     * </pre>
     */
    public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE.name(), 0xFF, 0xFE);

    /**
     * UTF-32BE BOM (Big-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0x00 0x00 0xFE 0xFF
     * </pre>
     *
     * @since 2.2
     */
    public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);

    /**
     * UTF-32LE BOM (Little-Endian).
     * <p>
     * This BOM is:
     * </p>
     * <pre>
     * 0xFF 0xFE 0x00 0x00
     * </pre>
     *
     * @since 2.2
     */
    public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);

    /**
     * Unicode BOM character; external form depends on the encoding.
     *
     * @see <a href="https://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
     * @since 2.5
     */
    public static final char UTF_BOM = '\uFEFF';

    /**
     * Charset name.
     */
    private final String charsetName;

    /**
     * Bytes, stored as {@code int} values exactly as passed to the constructor.
     */
    private final int[] bytes;

    /**
     * Constructs a new instance.
     * <p>
     * The given array is copied, so later changes to the caller's array do not affect this instance.
     * </p>
     *
     * @param charsetName The name of the charset the BOM represents
     * @param bytes The BOM's bytes
     * @throws NullPointerException if the charsetName or the bytes are {@code null}
     * @throws IllegalArgumentException if the charsetName is zero length
     * @throws IllegalArgumentException if the bytes are zero length
     */
    public ByteOrderMark(final String charsetName, final int... bytes) {
        Objects.requireNonNull(charsetName, "charsetName");
        Objects.requireNonNull(bytes, "bytes");
        if (charsetName.isEmpty()) {
            throw new IllegalArgumentException("No charsetName specified");
        }
        if (bytes.length == 0) {
            throw new IllegalArgumentException("No bytes specified");
        }
        this.charsetName = charsetName;
        // Defensive copy: keeps this instance immutable.
        this.bytes = bytes.clone();
    }

    /**
     * Indicates if this instance's bytes equals another.
     * <p>
     * Only the byte values are compared; the charset name is not part of the comparison. Any
     * {@link ByteOrderMark} (including a subclass instance) with the same bytes in the same order is
     * considered equal.
     * </p>
     *
     * @param obj The object to compare to
     * @return true if the bom's bytes are equal, otherwise
     * false
     */
    @Override
    public boolean equals(final Object obj) {
        if (!(obj instanceof ByteOrderMark)) {
            return false;
        }
        final ByteOrderMark bom = (ByteOrderMark) obj;
        if (bytes.length != bom.length()) {
            return false;
        }
        for (int i = 0; i < bytes.length; i++) {
            if (bytes[i] != bom.get(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the byte at the specified position.
     *
     * @param pos The position
     * @return The specified byte
     * @throws ArrayIndexOutOfBoundsException if {@code pos} is negative or not less than {@link #length()}
     */
    public int get(final int pos) {
        return bytes[pos];
    }

    /**
     * Gets a copy of the BOM's bytes.
     * <p>
     * Each stored {@code int} value is narrowed to a {@code byte}.
     * </p>
     *
     * @return a copy of the BOM's bytes
     */
    public byte[] getBytes() {
        final byte[] copy = IOUtils.byteArray(bytes.length);
        for (int i = 0; i < bytes.length; i++) {
            copy[i] = (byte) bytes[i];
        }
        return copy;
    }

    /**
     * Gets the name of the {@link java.nio.charset.Charset} the BOM represents.
     *
     * @return the character set name
     */
    public String getCharsetName() {
        return charsetName;
    }

    /**
     * Computes the hash code for this BOM.
     * <p>
     * The hash code is derived from the byte values only, consistent with {@link #equals(Object)}.
     * </p>
     *
     * @return the hash code for this BOM.
     * @see Object#hashCode()
     */
    @Override
    public int hashCode() {
        // Seed with ByteOrderMark.class rather than getClass(): equals() accepts any ByteOrderMark
        // subclass with the same bytes, so equal objects must not hash differently because of their
        // runtime class.
        int hashCode = ByteOrderMark.class.hashCode();
        for (final int b : bytes) {
            hashCode += b;
        }
        return hashCode;
    }

    /**
     * Gets the length of the BOM's bytes.
     *
     * @return the length of the BOM's bytes
     */
    public int length() {
        return bytes.length;
    }

    /**
     * Converts this instance to a String representation of the BOM.
     * <p>
     * The format is the simple class name followed by the charset name and the comma-separated bytes in
     * upper-case hexadecimal (no zero padding), for example {@code ByteOrderMark[UTF-8: 0xEF,0xBB,0xBF]}.
     * </p>
     *
     * @return a String representation of the BOM
     */
    @Override
    public String toString() {
        final StringBuilder builder = new StringBuilder();
        builder.append(getClass().getSimpleName());
        builder.append('[');
        builder.append(charsetName);
        builder.append(": ");
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                builder.append(",");
            }
            builder.append("0x");
            // Mask to the low 8 bits so the printed value matches what getBytes() returns.
            builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
        }
        builder.append(']');
        return builder.toString();
    }

}