CharSequenceInputStream.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Objects;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractStreamBuilder;
import org.apache.commons.io.charset.CharsetEncoders;
import org.apache.commons.io.function.Uncheck;

/**
 * Implements an {@link InputStream} to read bytes from String, StringBuffer, StringBuilder or CharBuffer,
 * encoded using the specified Charset. The Charset defaults to Charset.defaultCharset().
 * <p>
 * <strong>Note:</strong> Supports {@link #mark(int)} and {@link #reset()}.
 * </p>
 * <p>
 * To build an instance, use {@link Builder}.
 * </p>
 *
 * @see Builder
 * @since 2.2
 */
public class CharSequenceInputStream extends InputStream {

    //@formatter:off
    /**
     * Builds a new {@link CharSequenceInputStream}.
     *
     * <p>
     * For example:
     * </p>
     * <h2>Using a Charset</h2>
     * <pre>{@code
     * CharSequenceInputStream s = CharSequenceInputStream.builder()
     *   .setBufferSize(8192)
     *   .setCharSequence("String")
     *   .setCharset(Charset.defaultCharset())
     *   .get();}
     * </pre>
     * <h2>Using a CharsetEncoder</h2>
     * <pre>{@code
     * CharSequenceInputStream s = CharSequenceInputStream.builder()
     *   .setBufferSize(8192)
     *   .setCharSequence("String")
     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder()
     *     .onMalformedInput(CodingErrorAction.REPLACE)
     *     .onUnmappableCharacter(CodingErrorAction.REPLACE))
     *   .get();}
     * </pre>
     *
     * @see #get()
     * @since 2.13.0
     */
    //@formatter:on
    public static class Builder extends AbstractStreamBuilder<CharSequenceInputStream, Builder> {

        private CharsetEncoder charsetEncoder = newEncoder(getCharset());

        /**
         * Builds a new {@link CharSequenceInputStream}.
         * <p>
         * You must set input that supports {@link #getCharSequence()}, otherwise, this method throws an exception.
         * </p>
         * <p>
         * This builder use the following aspects:
         * </p>
         * <ul>
         * <li>{@link #getCharSequence()}</li>
         * <li>{@link #getBufferSize()}</li>
         * <li>{@link CharsetEncoder}</li>
         * </ul>
         *
         * @return a new instance.
         * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
         */
        @Override
        public CharSequenceInputStream get() {
            return Uncheck.get(() -> new CharSequenceInputStream(getCharSequence(), getBufferSize(), charsetEncoder));
        }

        CharsetEncoder getCharsetEncoder() {
            return charsetEncoder;
        }

        @Override
        public Builder setCharset(final Charset charset) {
            super.setCharset(charset);
            charsetEncoder = newEncoder(getCharset());
            return this;
        }

        /**
         * Sets the charset encoder. Assumes that the caller has configured the encoder.
         *
         * @param newEncoder the charset encoder.
         * @return {@code this} instance.
         * @since 2.13.0
         */
        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
            super.setCharset(charsetEncoder.charset());
            return this;
        }

    }

    private static final int NO_MARK = -1;

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    private static CharsetEncoder newEncoder(final Charset charset) {
        // @formatter:off
        return Charsets.toCharset(charset).newEncoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        // @formatter:on
    }

    private final ByteBuffer bBuf;
    private int bBufMark; // position in bBuf
    private final CharBuffer cBuf;
    private int cBufMark; // position in cBuf
    private final CharsetEncoder charsetEncoder;

    /**
     * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
     *
     * @param cs the input character sequence.
     * @param charset the character set name to use.
     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public CharSequenceInputStream(final CharSequence cs, final Charset charset) {
        this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new instance.
     *
     * @param cs the input character sequence.
     * @param charset the character set name to use, null maps to the default Charset.
     * @param bufferSize the buffer size to use.
     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public CharSequenceInputStream(final CharSequence cs, final Charset charset, final int bufferSize) {
        // @formatter:off
        this(cs, bufferSize, newEncoder(charset));
        // @formatter:on
    }

    private CharSequenceInputStream(final CharSequence cs, final int bufferSize, final CharsetEncoder charsetEncoder) {
        this.charsetEncoder = charsetEncoder;
        // Ensure that buffer is long enough to hold a complete character
        this.bBuf = ByteBuffer.allocate(ReaderInputStream.checkMinBufferSize(charsetEncoder, bufferSize));
        this.bBuf.flip();
        this.cBuf = CharBuffer.wrap(cs);
        this.cBufMark = NO_MARK;
        this.bBufMark = NO_MARK;
        try {
            fillBuffer();
        } catch (final CharacterCodingException ex) {
            // Reset everything without filling the buffer
            // so the same exception can be thrown again later.
            this.bBuf.clear();
            this.bBuf.flip();
            this.cBuf.rewind();
        }
    }

    /**
     * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
     *
     * @param cs the input character sequence.
     * @param charset the character set name to use.
     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public CharSequenceInputStream(final CharSequence cs, final String charset) {
        this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new instance.
     *
     * @param cs the input character sequence.
     * @param charset the character set name to use, null maps to the default Charset.
     * @param bufferSize the buffer size to use.
     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public CharSequenceInputStream(final CharSequence cs, final String charset, final int bufferSize) {
        this(cs, Charsets.toCharset(charset), bufferSize);
    }

    /**
     * Gets a lower bound on the number of bytes remaining in the byte stream.
     *
     * @return the count of bytes that can be read without blocking (or returning EOF).
     * @throws IOException if an error occurs (probably not possible).
     */
    @Override
    public int available() throws IOException {
        return this.bBuf.remaining();
    }

    @Override
    public void close() throws IOException {
        bBuf.position(bBuf.limit());
    }

    /**
     * Fills the byte output buffer from the input char buffer.
     *
     * @throws CharacterCodingException
     *             an error encoding data.
     */
    private void fillBuffer() throws CharacterCodingException {
        this.bBuf.compact();
        final CoderResult result = this.charsetEncoder.encode(this.cBuf, this.bBuf, true);
        if (result.isError()) {
            result.throwException();
        }
        this.bBuf.flip();
    }

    /**
     * Gets the CharsetEncoder.
     *
     * @return the CharsetEncoder.
     */
    CharsetEncoder getCharsetEncoder() {
        return charsetEncoder;
    }

    /**
     * {@inheritDoc}
     * @param readLimit max read limit (ignored).
     */
    @Override
    public synchronized void mark(final int readLimit) {
        this.cBufMark = this.cBuf.position();
        this.bBufMark = this.bBuf.position();
        this.cBuf.mark();
        this.bBuf.mark();
        // It would be nice to be able to use mark & reset on the cBuf and bBuf;
        // however the bBuf is re-used so that won't work
    }

    @Override
    public boolean markSupported() {
        return true;
    }

    @Override
    public int read() throws IOException {
        for (;;) {
            if (this.bBuf.hasRemaining()) {
                return this.bBuf.get() & 0xFF;
            }
            fillBuffer();
            if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
                return EOF;
            }
        }
    }

    @Override
    public int read(final byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    @Override
    public int read(final byte[] array, int off, int len) throws IOException {
        Objects.requireNonNull(array, "array");
        if (len < 0 || off + len > array.length) {
            throw new IndexOutOfBoundsException("Array Size=" + array.length + ", offset=" + off + ", length=" + len);
        }
        if (len == 0) {
            return 0; // must return 0 for zero length read
        }
        if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
            return EOF;
        }
        int bytesRead = 0;
        while (len > 0) {
            if (this.bBuf.hasRemaining()) {
                final int chunk = Math.min(this.bBuf.remaining(), len);
                this.bBuf.get(array, off, chunk);
                off += chunk;
                len -= chunk;
                bytesRead += chunk;
            } else {
                fillBuffer();
                if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
                    break;
                }
            }
        }
        return bytesRead == 0 && !this.cBuf.hasRemaining() ? EOF : bytesRead;
    }

    @Override
    public synchronized void reset() throws IOException {
        //
        // This is not the most efficient implementation, as it re-encodes from the beginning.
        //
        // Since the bBuf is re-used, in general it's necessary to re-encode the data.
        //
        // It should be possible to apply some optimizations however:
        // + use mark/reset on the cBuf and bBuf. This would only work if the buffer had not been (re)filled since
        // the mark. The code would have to catch InvalidMarkException - does not seem possible to check if mark is
        // valid otherwise. + Try saving the state of the cBuf before each fillBuffer; it might be possible to
        // restart from there.
        //
        if (this.cBufMark != NO_MARK) {
            // if cBuf is at 0, we have not started reading anything, so skip re-encoding
            if (this.cBuf.position() != 0) {
                this.charsetEncoder.reset();
                this.cBuf.rewind();
                this.bBuf.rewind();
                this.bBuf.limit(0); // rewind does not clear the buffer
                while (this.cBuf.position() < this.cBufMark) {
                    this.bBuf.rewind(); // empty the buffer (we only refill when empty during normal processing)
                    this.bBuf.limit(0);
                    fillBuffer();
                }
            }
            if (this.cBuf.position() != this.cBufMark) {
                throw new IllegalStateException("Unexpected CharBuffer position: actual=" + cBuf.position() + " " +
                        "expected=" + this.cBufMark);
            }
            this.bBuf.position(this.bBufMark);
            this.cBufMark = NO_MARK;
            this.bBufMark = NO_MARK;
        }
        mark(0);
    }

    @Override
    public long skip(long n) throws IOException {
        //
        // This could be made more efficient by using position to skip within the current buffer.
        //
        long skipped = 0;
        while (n > 0 && available() > 0) {
            this.read();
            n--;
            skipped++;
        }
        return skipped;
    }

}