001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.io.input;
019
020import static org.apache.commons.io.IOUtils.EOF;
021
022import java.io.IOException;
023import java.io.InputStream;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.CharacterCodingException;
027import java.nio.charset.Charset;
028import java.nio.charset.CharsetEncoder;
029import java.nio.charset.CoderResult;
030import java.nio.charset.CodingErrorAction;
031import java.util.Objects;
032
033import org.apache.commons.io.Charsets;
034import org.apache.commons.io.IOUtils;
035import org.apache.commons.io.build.AbstractStreamBuilder;
036import org.apache.commons.io.charset.CharsetEncoders;
037import org.apache.commons.io.function.Uncheck;
038
039/**
040 * Implements an {@link InputStream} to read bytes from String, StringBuffer, StringBuilder or CharBuffer,
041 * encoded using the specified Charset. The Charset defaults to Charset.defaultCharset().
042 * <p>
043 * <strong>Note:</strong> Supports {@link #mark(int)} and {@link #reset()}.
044 * </p>
045 * <p>
046 * To build an instance, use {@link Builder}.
047 * </p>
048 *
049 * @see Builder
050 * @since 2.2
051 */
052public class CharSequenceInputStream extends InputStream {
053
054    //@formatter:off
055    /**
056     * Builds a new {@link CharSequenceInputStream}.
057     *
058     * <p>
059     * For example:
060     * </p>
061     * <h2>Using a Charset</h2>
062     * <pre>{@code
063     * CharSequenceInputStream s = CharSequenceInputStream.builder()
064     *   .setBufferSize(8192)
065     *   .setCharSequence("String")
066     *   .setCharset(Charset.defaultCharset())
067     *   .get();}
068     * </pre>
069     * <h2>Using a CharsetEncoder</h2>
070     * <pre>{@code
071     * CharSequenceInputStream s = CharSequenceInputStream.builder()
072     *   .setBufferSize(8192)
073     *   .setCharSequence("String")
074     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder()
075     *     .onMalformedInput(CodingErrorAction.REPLACE)
076     *     .onUnmappableCharacter(CodingErrorAction.REPLACE))
077     *   .get();}
078     * </pre>
079     *
080     * @see #get()
081     * @since 2.13.0
082     */
083    //@formatter:on
084    public static class Builder extends AbstractStreamBuilder<CharSequenceInputStream, Builder> {
085
086        private CharsetEncoder charsetEncoder = newEncoder(getCharset());
087
088        /**
089         * Builds a new {@link CharSequenceInputStream}.
090         * <p>
091         * You must set input that supports {@link #getCharSequence()}, otherwise, this method throws an exception.
092         * </p>
093         * <p>
094         * This builder use the following aspects:
095         * </p>
096         * <ul>
097         * <li>{@link #getCharSequence()}</li>
098         * <li>{@link #getBufferSize()}</li>
099         * <li>{@link CharsetEncoder}</li>
100         * </ul>
101         *
102         * @return a new instance.
103         * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
104         */
105        @Override
106        public CharSequenceInputStream get() {
107            return Uncheck.get(() -> new CharSequenceInputStream(getCharSequence(), getBufferSize(), charsetEncoder));
108        }
109
110        CharsetEncoder getCharsetEncoder() {
111            return charsetEncoder;
112        }
113
114        @Override
115        public Builder setCharset(final Charset charset) {
116            super.setCharset(charset);
117            charsetEncoder = newEncoder(getCharset());
118            return this;
119        }
120
121        /**
122         * Sets the charset encoder. Assumes that the caller has configured the encoder.
123         *
124         * @param newEncoder the charset encoder.
125         * @return {@code this} instance.
126         * @since 2.13.0
127         */
128        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
129            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
130            super.setCharset(charsetEncoder.charset());
131            return this;
132        }
133
134    }
135
136    private static final int NO_MARK = -1;
137
138    /**
139     * Constructs a new {@link Builder}.
140     *
141     * @return a new {@link Builder}.
142     * @since 2.12.0
143     */
144    public static Builder builder() {
145        return new Builder();
146    }
147
148    private static CharsetEncoder newEncoder(final Charset charset) {
149        // @formatter:off
150        return Charsets.toCharset(charset).newEncoder()
151                .onMalformedInput(CodingErrorAction.REPLACE)
152                .onUnmappableCharacter(CodingErrorAction.REPLACE);
153        // @formatter:on
154    }
155
156    private final ByteBuffer bBuf;
157    private int bBufMark; // position in bBuf
158    private final CharBuffer cBuf;
159    private int cBufMark; // position in cBuf
160    private final CharsetEncoder charsetEncoder;
161
162    /**
163     * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
164     *
165     * @param cs the input character sequence.
166     * @param charset the character set name to use.
167     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
168     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
169     */
170    @Deprecated
171    public CharSequenceInputStream(final CharSequence cs, final Charset charset) {
172        this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
173    }
174
175    /**
176     * Constructs a new instance.
177     *
178     * @param cs the input character sequence.
179     * @param charset the character set name to use, null maps to the default Charset.
180     * @param bufferSize the buffer size to use.
181     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
182     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
183     */
184    @Deprecated
185    public CharSequenceInputStream(final CharSequence cs, final Charset charset, final int bufferSize) {
186        // @formatter:off
187        this(cs, bufferSize, newEncoder(charset));
188        // @formatter:on
189    }
190
191    private CharSequenceInputStream(final CharSequence cs, final int bufferSize, final CharsetEncoder charsetEncoder) {
192        this.charsetEncoder = charsetEncoder;
193        // Ensure that buffer is long enough to hold a complete character
194        this.bBuf = ByteBuffer.allocate(ReaderInputStream.checkMinBufferSize(charsetEncoder, bufferSize));
195        this.bBuf.flip();
196        this.cBuf = CharBuffer.wrap(cs);
197        this.cBufMark = NO_MARK;
198        this.bBufMark = NO_MARK;
199        try {
200            fillBuffer();
201        } catch (final CharacterCodingException ex) {
202            // Reset everything without filling the buffer
203            // so the same exception can be thrown again later.
204            this.bBuf.clear();
205            this.bBuf.flip();
206            this.cBuf.rewind();
207        }
208    }
209
210    /**
211     * Constructs a new instance with a buffer size of {@link IOUtils#DEFAULT_BUFFER_SIZE}.
212     *
213     * @param cs the input character sequence.
214     * @param charset the character set name to use.
215     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
216     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
217     */
218    @Deprecated
219    public CharSequenceInputStream(final CharSequence cs, final String charset) {
220        this(cs, charset, IOUtils.DEFAULT_BUFFER_SIZE);
221    }
222
223    /**
224     * Constructs a new instance.
225     *
226     * @param cs the input character sequence.
227     * @param charset the character set name to use, null maps to the default Charset.
228     * @param bufferSize the buffer size to use.
229     * @throws IllegalArgumentException if the buffer is not large enough to hold a complete character.
230     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
231     */
232    @Deprecated
233    public CharSequenceInputStream(final CharSequence cs, final String charset, final int bufferSize) {
234        this(cs, Charsets.toCharset(charset), bufferSize);
235    }
236
237    /**
238     * Gets a lower bound on the number of bytes remaining in the byte stream.
239     *
240     * @return the count of bytes that can be read without blocking (or returning EOF).
241     * @throws IOException if an error occurs (probably not possible).
242     */
243    @Override
244    public int available() throws IOException {
245        return this.bBuf.remaining();
246    }
247
248    @Override
249    public void close() throws IOException {
250        bBuf.position(bBuf.limit());
251    }
252
253    /**
254     * Fills the byte output buffer from the input char buffer.
255     *
256     * @throws CharacterCodingException
257     *             an error encoding data.
258     */
259    private void fillBuffer() throws CharacterCodingException {
260        this.bBuf.compact();
261        final CoderResult result = this.charsetEncoder.encode(this.cBuf, this.bBuf, true);
262        if (result.isError()) {
263            result.throwException();
264        }
265        this.bBuf.flip();
266    }
267
268    /**
269     * Gets the CharsetEncoder.
270     *
271     * @return the CharsetEncoder.
272     */
273    CharsetEncoder getCharsetEncoder() {
274        return charsetEncoder;
275    }
276
277    /**
278     * {@inheritDoc}
279     * @param readLimit max read limit (ignored).
280     */
281    @Override
282    public synchronized void mark(final int readLimit) {
283        this.cBufMark = this.cBuf.position();
284        this.bBufMark = this.bBuf.position();
285        this.cBuf.mark();
286        this.bBuf.mark();
287        // It would be nice to be able to use mark & reset on the cBuf and bBuf;
288        // however the bBuf is re-used so that won't work
289    }
290
291    @Override
292    public boolean markSupported() {
293        return true;
294    }
295
296    @Override
297    public int read() throws IOException {
298        for (;;) {
299            if (this.bBuf.hasRemaining()) {
300                return this.bBuf.get() & 0xFF;
301            }
302            fillBuffer();
303            if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
304                return EOF;
305            }
306        }
307    }
308
309    @Override
310    public int read(final byte[] b) throws IOException {
311        return read(b, 0, b.length);
312    }
313
314    @Override
315    public int read(final byte[] array, int off, int len) throws IOException {
316        Objects.requireNonNull(array, "array");
317        if (len < 0 || off + len > array.length) {
318            throw new IndexOutOfBoundsException("Array Size=" + array.length + ", offset=" + off + ", length=" + len);
319        }
320        if (len == 0) {
321            return 0; // must return 0 for zero length read
322        }
323        if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
324            return EOF;
325        }
326        int bytesRead = 0;
327        while (len > 0) {
328            if (this.bBuf.hasRemaining()) {
329                final int chunk = Math.min(this.bBuf.remaining(), len);
330                this.bBuf.get(array, off, chunk);
331                off += chunk;
332                len -= chunk;
333                bytesRead += chunk;
334            } else {
335                fillBuffer();
336                if (!this.bBuf.hasRemaining() && !this.cBuf.hasRemaining()) {
337                    break;
338                }
339            }
340        }
341        return bytesRead == 0 && !this.cBuf.hasRemaining() ? EOF : bytesRead;
342    }
343
344    @Override
345    public synchronized void reset() throws IOException {
346        //
347        // This is not the most efficient implementation, as it re-encodes from the beginning.
348        //
349        // Since the bBuf is re-used, in general it's necessary to re-encode the data.
350        //
351        // It should be possible to apply some optimizations however:
352        // + use mark/reset on the cBuf and bBuf. This would only work if the buffer had not been (re)filled since
353        // the mark. The code would have to catch InvalidMarkException - does not seem possible to check if mark is
354        // valid otherwise. + Try saving the state of the cBuf before each fillBuffer; it might be possible to
355        // restart from there.
356        //
357        if (this.cBufMark != NO_MARK) {
358            // if cBuf is at 0, we have not started reading anything, so skip re-encoding
359            if (this.cBuf.position() != 0) {
360                this.charsetEncoder.reset();
361                this.cBuf.rewind();
362                this.bBuf.rewind();
363                this.bBuf.limit(0); // rewind does not clear the buffer
364                while (this.cBuf.position() < this.cBufMark) {
365                    this.bBuf.rewind(); // empty the buffer (we only refill when empty during normal processing)
366                    this.bBuf.limit(0);
367                    fillBuffer();
368                }
369            }
370            if (this.cBuf.position() != this.cBufMark) {
371                throw new IllegalStateException("Unexpected CharBuffer position: actual=" + cBuf.position() + " " +
372                        "expected=" + this.cBufMark);
373            }
374            this.bBuf.position(this.bBufMark);
375            this.cBufMark = NO_MARK;
376            this.bBufMark = NO_MARK;
377        }
378        mark(0);
379    }
380
381    @Override
382    public long skip(long n) throws IOException {
383        //
384        // This could be made more efficient by using position to skip within the current buffer.
385        //
386        long skipped = 0;
387        while (n > 0 && available() > 0) {
388            this.read();
389            n--;
390            skipped++;
391        }
392        return skipped;
393    }
394
395}