001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import static org.apache.commons.io.IOUtils.EOF; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.OutputStreamWriter; 026import java.io.Reader; 027import java.nio.ByteBuffer; 028import java.nio.CharBuffer; 029import java.nio.charset.Charset; 030import java.nio.charset.CharsetEncoder; 031import java.nio.charset.CoderResult; 032import java.nio.charset.CodingErrorAction; 033import java.util.Objects; 034 035import org.apache.commons.io.Charsets; 036import org.apache.commons.io.IOUtils; 037import org.apache.commons.io.build.AbstractStreamBuilder; 038import org.apache.commons.io.charset.CharsetEncoders; 039 040/** 041 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. 042 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In 043 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. 044 * <p> 045 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the 046 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the 047 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a 048 * {@link BufferedReader}. 049 * </p> 050 * <p> 051 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2} 052 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): 053 * </p> 054 * <p> 055 * To build an instance, use {@link Builder}. 056 * </p> 057 * <pre> 058 * InputStream inputStream = ... 059 * Charset cs = ... 060 * InputStreamReader reader = new InputStreamReader(inputStream, cs); 061 * ReaderInputStream in2 = ReaderInputStream.builder() 062 * .setReader(reader) 063 * .setCharset(cs) 064 * .get(); 065 * </pre> 066 * <p> 067 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes 068 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} 069 * pulls it from the underlying stream. 070 * </p> 071 * <p> 072 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in 073 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way 074 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when 075 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. 076 * </p> 077 * <p> 078 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. 079 * </p> 080 * <p> 081 * Instances of {@link ReaderInputStream} are not thread safe. 082 * </p> 083 * 084 * @see Builder 085 * @see org.apache.commons.io.output.WriterOutputStream 086 * @since 2.0 087 */ 088public class ReaderInputStream extends AbstractInputStream { 089 090 // @formatter:off 091 /** 092 * Builds a new {@link ReaderInputStream}. 093 * 094 * <p> 095 * For example: 096 * </p> 097 * <pre>{@code 098 * ReaderInputStream s = ReaderInputStream.builder() 099 * .setPath(path) 100 * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) 101 * .get();} 102 * </pre> 103 * 104 * @see #get() 105 * @since 2.12.0 106 */ 107 // @formatter:on 108 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { 109 110 private CharsetEncoder charsetEncoder = newEncoder(getCharset()); 111 112 /** 113 * Builds a new {@link ReaderInputStream}. 114 * 115 * <p> 116 * You must set input that supports {@link #getReader()}, otherwise, this method throws an exception. 117 * </p> 118 * <p> 119 * This builder use the following aspects: 120 * </p> 121 * <ul> 122 * <li>{@link #getReader()}</li> 123 * <li>{@link #getBufferSize()}</li> 124 * <li>{@link #getCharset()}</li> 125 * <li>{@link CharsetEncoder}</li> 126 * </ul> 127 * 128 * @return a new instance. 129 * @throws UnsupportedOperationException if the origin cannot provide a Reader. 130 * @throws IllegalStateException if the {@code origin} is {@code null}. 131 * @see #getReader() 132 * @see CharsetEncoder 133 * @see #getBufferSize() 134 */ 135 @SuppressWarnings("resource") 136 @Override 137 public ReaderInputStream get() throws IOException { 138 return new ReaderInputStream(getReader(), charsetEncoder, getBufferSize()); 139 } 140 141 CharsetEncoder getCharsetEncoder() { 142 return charsetEncoder; 143 } 144 145 @Override 146 public Builder setCharset(final Charset charset) { 147 super.setCharset(charset); 148 charsetEncoder = newEncoder(getCharset()); 149 return this; 150 } 151 152 /** 153 * Sets the charset encoder. Assumes that the caller has configured the encoder. 154 * 155 * @param newEncoder the charset encoder, null resets to a default encoder. 156 * @return {@code this} instance. 157 */ 158 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) { 159 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault())); 160 super.setCharset(charsetEncoder.charset()); 161 return this; 162 } 163 164 } 165 166 /** 167 * Constructs a new {@link Builder}. 168 * 169 * @return a new {@link Builder}. 170 * @since 2.12.0 171 */ 172 public static Builder builder() { 173 return new Builder(); 174 } 175 176 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { 177 final float minRequired = minBufferSize(charsetEncoder); 178 if (bufferSize < minRequired) { 179 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, 180 charsetEncoder.charset().displayName())); 181 } 182 return bufferSize; 183 } 184 185 static float minBufferSize(final CharsetEncoder charsetEncoder) { 186 return charsetEncoder.maxBytesPerChar() * 2; 187 } 188 189 private static CharsetEncoder newEncoder(final Charset charset) { 190 // @formatter:off 191 return Charsets.toCharset(charset).newEncoder() 192 .onMalformedInput(CodingErrorAction.REPLACE) 193 .onUnmappableCharacter(CodingErrorAction.REPLACE); 194 // @formatter:on 195 } 196 197 private final Reader reader; 198 199 private final CharsetEncoder charsetEncoder; 200 201 /** 202 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. 203 */ 204 private final CharBuffer encoderIn; 205 /** 206 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the 207 * caller. 208 */ 209 private final ByteBuffer encoderOut; 210 211 private CoderResult lastCoderResult; 212 213 private boolean endOfInput; 214 215 /** 216 * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of 217 * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 218 * 219 * @param reader the target {@link Reader} 220 * @deprecated Use {@link ReaderInputStream#builder()} instead 221 */ 222 @Deprecated 223 public ReaderInputStream(final Reader reader) { 224 this(reader, Charset.defaultCharset()); 225 } 226 227 /** 228 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 229 * 230 * <p> 231 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 232 * </p> 233 * 234 * @param reader the target {@link Reader} 235 * @param charset the charset encoding 236 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. 237 */ 238 @Deprecated 239 public ReaderInputStream(final Reader reader, final Charset charset) { 240 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); 241 } 242 243 /** 244 * Constructs a new {@link ReaderInputStream}. 245 * 246 * <p> 247 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 248 * </p> 249 * 250 * @param reader the target {@link Reader}. 251 * @param charset the charset encoding. 252 * @param bufferSize the size of the input buffer in number of characters. 253 * @deprecated Use {@link ReaderInputStream#builder()} instead 254 */ 255 @Deprecated 256 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { 257 // @formatter:off 258 this(reader, 259 Charsets.toCharset(charset).newEncoder() 260 .onMalformedInput(CodingErrorAction.REPLACE) 261 .onUnmappableCharacter(CodingErrorAction.REPLACE), 262 bufferSize); 263 // @formatter:on 264 } 265 266 /** 267 * Constructs a new {@link ReaderInputStream}. 268 * 269 * <p> 270 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 271 * an encoder which had already been in use. 272 * </p> 273 * 274 * @param reader the target {@link Reader} 275 * @param charsetEncoder the charset encoder 276 * @since 2.1 277 * @deprecated Use {@link ReaderInputStream#builder()} instead 278 */ 279 @Deprecated 280 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { 281 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); 282 } 283 284 /** 285 * Constructs a new {@link ReaderInputStream}. 286 * 287 * <p> 288 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 289 * an encoder which had already been in use. 290 * </p> 291 * 292 * @param reader the target {@link Reader} 293 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. 294 * @param bufferSize the size of the input buffer in number of characters 295 * @since 2.1 296 * @deprecated Use {@link ReaderInputStream#builder()} instead 297 */ 298 @Deprecated 299 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { 300 this.reader = reader; 301 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); 302 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); 303 this.encoderIn.flip(); 304 this.encoderOut = ByteBuffer.allocate(128); 305 this.encoderOut.flip(); 306 } 307 308 /** 309 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 310 * 311 * <p> 312 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 313 * </p> 314 * 315 * @param reader the target {@link Reader} 316 * @param charsetName the name of the charset encoding 317 * @deprecated Use {@link ReaderInputStream#builder()} instead 318 */ 319 @Deprecated 320 public ReaderInputStream(final Reader reader, final String charsetName) { 321 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); 322 } 323 324 /** 325 * Constructs a new {@link ReaderInputStream}. 326 * 327 * <p> 328 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 329 * </p> 330 * 331 * @param reader the target {@link Reader} 332 * @param charsetName the name of the charset encoding, null maps to the default Charset. 333 * @param bufferSize the size of the input buffer in number of characters 334 * @deprecated Use {@link ReaderInputStream#builder()} instead 335 */ 336 @Deprecated 337 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { 338 this(reader, Charsets.toCharset(charsetName), bufferSize); 339 } 340 341 @Override 342 public int available() throws IOException { 343 if (encoderOut.hasRemaining()) { 344 return encoderOut.remaining(); 345 } 346 return 0; 347 } 348 349 /** 350 * Closes the stream. This method will cause the underlying {@link Reader} to be closed. 351 * 352 * @throws IOException if an I/O error occurs. 353 */ 354 @Override 355 public void close() throws IOException { 356 reader.close(); 357 super.close(); 358 } 359 360 /** 361 * Fills the internal char buffer from the reader. 362 * 363 * @throws IOException If an I/O error occurs 364 */ 365 private void fillBuffer() throws IOException { 366 if (endOfInput) { 367 return; 368 } 369 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { 370 encoderIn.compact(); 371 final int position = encoderIn.position(); 372 // We don't use Reader#read(CharBuffer) here because it is more efficient 373 // to write directly to the underlying char array (the default implementation 374 // copies data to a temporary char array). 375 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); 376 if (c == EOF) { 377 endOfInput = true; 378 } else { 379 encoderIn.position(position + c); 380 } 381 encoderIn.flip(); 382 } 383 encoderOut.compact(); 384 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); 385 if (endOfInput) { 386 lastCoderResult = charsetEncoder.flush(encoderOut); 387 } 388 if (lastCoderResult.isError()) { 389 lastCoderResult.throwException(); 390 } 391 encoderOut.flip(); 392 } 393 394 /** 395 * Gets the CharsetEncoder. 396 * 397 * @return the CharsetEncoder. 398 */ 399 CharsetEncoder getCharsetEncoder() { 400 return charsetEncoder; 401 } 402 403 /** 404 * Reads a single byte. 405 * 406 * @return either the byte read or {@code -1} if the end of the stream has been reached 407 * @throws IOException if an I/O error occurs. 408 */ 409 @Override 410 public int read() throws IOException { 411 checkOpen(); 412 for (;;) { 413 if (encoderOut.hasRemaining()) { 414 return encoderOut.get() & 0xFF; 415 } 416 fillBuffer(); 417 if (endOfInput && !encoderOut.hasRemaining()) { 418 return EOF; 419 } 420 } 421 } 422 423 /** 424 * Reads the specified number of bytes into an array. 425 * 426 * @param b the byte array to read into 427 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 428 * @throws IOException if an I/O error occurs. 429 */ 430 @Override 431 public int read(final byte[] b) throws IOException { 432 return read(b, 0, b.length); 433 } 434 435 /** 436 * Reads the specified number of bytes into an array. 437 * 438 * @param array the byte array to read into 439 * @param off the offset to start reading bytes into 440 * @param len the number of bytes to read 441 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 442 * @throws IOException if an I/O error occurs. 443 */ 444 @Override 445 public int read(final byte[] array, int off, int len) throws IOException { 446 Objects.requireNonNull(array, "array"); 447 if (len < 0 || off < 0 || off + len > array.length) { 448 throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len); 449 } 450 int read = 0; 451 if (len == 0) { 452 return 0; // Always return 0 if len == 0 453 } 454 while (len > 0) { 455 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied 456 final int c = Math.min(encoderOut.remaining(), len); 457 encoderOut.get(array, off, c); 458 off += c; 459 len -= c; 460 read += c; 461 } else if (endOfInput) { // Already reach EOF in the last read 462 break; 463 } else { // Read again 464 fillBuffer(); 465 } 466 } 467 return read == 0 && endOfInput ? EOF : read; 468 } 469}