001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.BufferedReader;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.OutputStreamWriter;
026import java.io.Reader;
027import java.nio.ByteBuffer;
028import java.nio.CharBuffer;
029import java.nio.charset.Charset;
030import java.nio.charset.CharsetEncoder;
031import java.nio.charset.CoderResult;
032import java.nio.charset.CodingErrorAction;
033import java.util.Objects;
034
035import org.apache.commons.io.Charsets;
036import org.apache.commons.io.IOUtils;
037import org.apache.commons.io.build.AbstractStreamBuilder;
038import org.apache.commons.io.charset.CharsetEncoders;
039
040/**
041 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
042 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
043 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
044 * <p>
045 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
046 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
047 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
048 * {@link BufferedReader}.
049 * </p>
050 * <p>
 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2}
 * would return the same byte sequence as reading from {@code inputStream} (provided that the initial byte sequence is legal with respect to the charset
 * encoding):
053 * </p>
054 * <p>
055 * To build an instance, use {@link Builder}.
056 * </p>
057 * <pre>
058 * InputStream inputStream = ...
059 * Charset cs = ...
060 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
061 * ReaderInputStream in2 = ReaderInputStream.builder()
062 *   .setReader(reader)
063 *   .setCharset(cs)
064 *   .get();
065 * </pre>
066 * <p>
067 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes
068 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
069 * pulls it from the underlying stream.
070 * </p>
071 * <p>
072 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
073 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
074 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
075 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
076 * </p>
077 * <p>
 * The {@link #available()} method of this class returns the number of bytes that are already encoded and buffered, which may be 0. The methods
 * {@link #mark(int)} and {@link #reset()} are not supported.
079 * </p>
080 * <p>
081 * Instances of {@link ReaderInputStream} are not thread safe.
082 * </p>
083 *
084 * @see Builder
085 * @see org.apache.commons.io.output.WriterOutputStream
086 * @since 2.0
087 */
088public class ReaderInputStream extends AbstractInputStream {
089
090    // @formatter:off
091    /**
092     * Builds a new {@link ReaderInputStream}.
093     *
094     * <p>
095     * For example:
096     * </p>
097     * <pre>{@code
098     * ReaderInputStream s = ReaderInputStream.builder()
099     *   .setPath(path)
100     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
101     *   .get();}
102     * </pre>
103     *
104     * @see #get()
105     * @since 2.12.0
106     */
107    // @formatter:on
108    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
109
110        private CharsetEncoder charsetEncoder = newEncoder(getCharset());
111
112        /**
113         * Builds a new {@link ReaderInputStream}.
114         *
115         * <p>
116         * You must set input that supports {@link #getReader()}, otherwise, this method throws an exception.
117         * </p>
118         * <p>
119         * This builder use the following aspects:
120         * </p>
121         * <ul>
122         * <li>{@link #getReader()}</li>
123         * <li>{@link #getBufferSize()}</li>
124         * <li>{@link #getCharset()}</li>
125         * <li>{@link CharsetEncoder}</li>
126         * </ul>
127         *
128         * @return a new instance.
129         * @throws UnsupportedOperationException if the origin cannot provide a Reader.
130         * @throws IllegalStateException if the {@code origin} is {@code null}.
131         * @see #getReader()
132         * @see CharsetEncoder
133         * @see #getBufferSize()
134         */
135        @SuppressWarnings("resource")
136        @Override
137        public ReaderInputStream get() throws IOException {
138            return new ReaderInputStream(getReader(), charsetEncoder, getBufferSize());
139        }
140
141        CharsetEncoder getCharsetEncoder() {
142            return charsetEncoder;
143        }
144
145        @Override
146        public Builder setCharset(final Charset charset) {
147            super.setCharset(charset);
148            charsetEncoder = newEncoder(getCharset());
149            return this;
150        }
151
152        /**
153         * Sets the charset encoder. Assumes that the caller has configured the encoder.
154         *
155         * @param newEncoder the charset encoder, null resets to a default encoder.
156         * @return {@code this} instance.
157         */
158        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
159            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
160            super.setCharset(charsetEncoder.charset());
161            return this;
162        }
163
164    }
165
166    /**
167     * Constructs a new {@link Builder}.
168     *
169     * @return a new {@link Builder}.
170     * @since 2.12.0
171     */
172    public static Builder builder() {
173        return new Builder();
174    }
175
176    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
177        final float minRequired = minBufferSize(charsetEncoder);
178        if (bufferSize < minRequired) {
179            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
180                    charsetEncoder.charset().displayName()));
181        }
182        return bufferSize;
183    }
184
185    static float minBufferSize(final CharsetEncoder charsetEncoder) {
186        return charsetEncoder.maxBytesPerChar() * 2;
187    }
188
189    private static CharsetEncoder newEncoder(final Charset charset) {
190        // @formatter:off
191        return Charsets.toCharset(charset).newEncoder()
192                .onMalformedInput(CodingErrorAction.REPLACE)
193                .onUnmappableCharacter(CodingErrorAction.REPLACE);
194        // @formatter:on
195    }
196
197    private final Reader reader;
198
199    private final CharsetEncoder charsetEncoder;
200
201    /**
     * CharBuffer used as input for the encoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
203     */
204    private final CharBuffer encoderIn;
205    /**
     * ByteBuffer used as output for the encoder. This buffer can be small as it is only used to transfer data from the encoder to the buffer provided by the
     * caller.
208     */
209    private final ByteBuffer encoderOut;
210
211    private CoderResult lastCoderResult;
212
213    private boolean endOfInput;
214
215    /**
216     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
217     * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
218     *
219     * @param reader the target {@link Reader}
220     * @deprecated Use {@link ReaderInputStream#builder()} instead
221     */
222    @Deprecated
223    public ReaderInputStream(final Reader reader) {
224        this(reader, Charset.defaultCharset());
225    }
226
227    /**
228     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
229     *
230     * <p>
231     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
232     * </p>
233     *
234     * @param reader  the target {@link Reader}
235     * @param charset the charset encoding
236     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
237     */
238    @Deprecated
239    public ReaderInputStream(final Reader reader, final Charset charset) {
240        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
241    }
242
243    /**
244     * Constructs a new {@link ReaderInputStream}.
245     *
246     * <p>
247     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
248     * </p>
249     *
250     * @param reader     the target {@link Reader}.
251     * @param charset    the charset encoding.
252     * @param bufferSize the size of the input buffer in number of characters.
253     * @deprecated Use {@link ReaderInputStream#builder()} instead
254     */
255    @Deprecated
256    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
257        // @formatter:off
258        this(reader,
259            Charsets.toCharset(charset).newEncoder()
260                    .onMalformedInput(CodingErrorAction.REPLACE)
261                    .onUnmappableCharacter(CodingErrorAction.REPLACE),
262             bufferSize);
263        // @formatter:on
264    }
265
266    /**
267     * Constructs a new {@link ReaderInputStream}.
268     *
269     * <p>
270     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
271     * an encoder which had already been in use.
272     * </p>
273     *
274     * @param reader         the target {@link Reader}
275     * @param charsetEncoder the charset encoder
276     * @since 2.1
277     * @deprecated Use {@link ReaderInputStream#builder()} instead
278     */
279    @Deprecated
280    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
281        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
282    }
283
284    /**
285     * Constructs a new {@link ReaderInputStream}.
286     *
287     * <p>
288     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
289     * an encoder which had already been in use.
290     * </p>
291     *
292     * @param reader         the target {@link Reader}
293     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
294     * @param bufferSize     the size of the input buffer in number of characters
295     * @since 2.1
296     * @deprecated Use {@link ReaderInputStream#builder()} instead
297     */
298    @Deprecated
299    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
300        this.reader = reader;
301        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
302        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
303        this.encoderIn.flip();
304        this.encoderOut = ByteBuffer.allocate(128);
305        this.encoderOut.flip();
306    }
307
308    /**
309     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
310     *
311     * <p>
312     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
313     * </p>
314     *
315     * @param reader      the target {@link Reader}
316     * @param charsetName the name of the charset encoding
317     * @deprecated Use {@link ReaderInputStream#builder()} instead
318     */
319    @Deprecated
320    public ReaderInputStream(final Reader reader, final String charsetName) {
321        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
322    }
323
324    /**
325     * Constructs a new {@link ReaderInputStream}.
326     *
327     * <p>
328     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
329     * </p>
330     *
331     * @param reader      the target {@link Reader}
332     * @param charsetName the name of the charset encoding, null maps to the default Charset.
333     * @param bufferSize  the size of the input buffer in number of characters
334     * @deprecated Use {@link ReaderInputStream#builder()} instead
335     */
336    @Deprecated
337    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
338        this(reader, Charsets.toCharset(charsetName), bufferSize);
339    }
340
341    @Override
342    public int available() throws IOException {
343        if (encoderOut.hasRemaining()) {
344            return encoderOut.remaining();
345        }
346        return 0;
347    }
348
349    /**
350     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
351     *
352     * @throws IOException if an I/O error occurs.
353     */
354    @Override
355    public void close() throws IOException {
356        reader.close();
357        super.close();
358    }
359
360    /**
361     * Fills the internal char buffer from the reader.
362     *
363     * @throws IOException If an I/O error occurs
364     */
365    private void fillBuffer() throws IOException {
366        if (endOfInput) {
367            return;
368        }
369        if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
370            encoderIn.compact();
371            final int position = encoderIn.position();
372            // We don't use Reader#read(CharBuffer) here because it is more efficient
373            // to write directly to the underlying char array (the default implementation
374            // copies data to a temporary char array).
375            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
376            if (c == EOF) {
377                endOfInput = true;
378            } else {
379                encoderIn.position(position + c);
380            }
381            encoderIn.flip();
382        }
383        encoderOut.compact();
384        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
385        if (endOfInput) {
386            lastCoderResult = charsetEncoder.flush(encoderOut);
387        }
388        if (lastCoderResult.isError()) {
389            lastCoderResult.throwException();
390        }
391        encoderOut.flip();
392    }
393
394    /**
395     * Gets the CharsetEncoder.
396     *
397     * @return the CharsetEncoder.
398     */
399    CharsetEncoder getCharsetEncoder() {
400        return charsetEncoder;
401    }
402
403    /**
404     * Reads a single byte.
405     *
406     * @return either the byte read or {@code -1} if the end of the stream has been reached
407     * @throws IOException if an I/O error occurs.
408     */
409    @Override
410    public int read() throws IOException {
411        checkOpen();
412        for (;;) {
413            if (encoderOut.hasRemaining()) {
414                return encoderOut.get() & 0xFF;
415            }
416            fillBuffer();
417            if (endOfInput && !encoderOut.hasRemaining()) {
418                return EOF;
419            }
420        }
421    }
422
423    /**
424     * Reads the specified number of bytes into an array.
425     *
426     * @param b the byte array to read into
427     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
428     * @throws IOException if an I/O error occurs.
429     */
430    @Override
431    public int read(final byte[] b) throws IOException {
432        return read(b, 0, b.length);
433    }
434
435    /**
436     * Reads the specified number of bytes into an array.
437     *
438     * @param array the byte array to read into
439     * @param off   the offset to start reading bytes into
440     * @param len   the number of bytes to read
441     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
442     * @throws IOException if an I/O error occurs.
443     */
444    @Override
445    public int read(final byte[] array, int off, int len) throws IOException {
446        Objects.requireNonNull(array, "array");
447        if (len < 0 || off < 0 || off + len > array.length) {
448            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
449        }
450        int read = 0;
451        if (len == 0) {
452            return 0; // Always return 0 if len == 0
453        }
454        while (len > 0) {
455            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
456                final int c = Math.min(encoderOut.remaining(), len);
457                encoderOut.get(array, off, c);
458                off += c;
459                len -= c;
460                read += c;
461            } else if (endOfInput) { // Already reach EOF in the last read
462                break;
463            } else { // Read again
464                fillBuffer();
465            }
466        }
467        return read == 0 && endOfInput ? EOF : read;
468    }
469}