001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import static org.apache.commons.io.IOUtils.EOF; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.OutputStreamWriter; 026import java.io.Reader; 027import java.nio.ByteBuffer; 028import java.nio.CharBuffer; 029import java.nio.charset.Charset; 030import java.nio.charset.CharsetEncoder; 031import java.nio.charset.CoderResult; 032import java.nio.charset.CodingErrorAction; 033import java.util.Objects; 034 035import org.apache.commons.io.Charsets; 036import org.apache.commons.io.IOUtils; 037import org.apache.commons.io.build.AbstractStreamBuilder; 038import org.apache.commons.io.charset.CharsetEncoders; 039 040/** 041 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. 042 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In 043 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. 044 * <p> 045 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the 046 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the 047 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a 048 * {@link BufferedReader}. 049 * </p> 050 * <p> 051 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2} 052 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): 053 * </p> 054 * <p> 055 * To build an instance, use {@link Builder}. 056 * </p> 057 * <pre> 058 * InputStream inputStream = ... 059 * Charset cs = ... 060 * InputStreamReader reader = new InputStreamReader(inputStream, cs); 061 * ReaderInputStream in2 = ReaderInputStream.builder() 062 * .setReader(reader) 063 * .setCharset(cs) 064 * .get(); 065 * </pre> 066 * <p> 067 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes 068 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} 069 * pulls it from the underlying stream. 070 * </p> 071 * <p> 072 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in 073 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way 074 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when 075 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. 076 * </p> 077 * <p> 078 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. 079 * </p> 080 * <p> 081 * Instances of {@link ReaderInputStream} are not thread safe. 082 * </p> 083 * 084 * @see Builder 085 * @see org.apache.commons.io.output.WriterOutputStream 086 * @since 2.0 087 */ 088public class ReaderInputStream extends AbstractInputStream { 089 090 // @formatter:off 091 /** 092 * Builds a new {@link ReaderInputStream}. 093 * 094 * <p> 095 * For example: 096 * </p> 097 * <pre>{@code 098 * ReaderInputStream s = ReaderInputStream.builder() 099 * .setPath(path) 100 * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) 101 * .get();} 102 * </pre> 103 * 104 * @see #get() 105 * @since 2.12.0 106 */ 107 // @formatter:on 108 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { 109 110 private CharsetEncoder charsetEncoder = newEncoder(getCharset()); 111 112 /** 113 * Constructs a new builder of {@link ReaderInputStream}. 114 */ 115 public Builder() { 116 // empty 117 } 118 119 /** 120 * Builds a new {@link ReaderInputStream}. 121 * 122 * <p> 123 * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception. 124 * </p> 125 * <p> 126 * This builder uses the following aspects: 127 * </p> 128 * <ul> 129 * <li>{@link #getReader()} gets the target aspect.</li> 130 * <li>{@link #getBufferSize()}</li> 131 * <li>{@link #getCharset()}</li> 132 * <li>{@link CharsetEncoder}</li> 133 * </ul> 134 * 135 * @return a new instance. 136 * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}. 137 * @throws IllegalStateException if the {@code origin} is {@code null}. 138 * @throws IOException if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}. 139 * @see #getReader() 140 * @see CharsetEncoder 141 * @see #getBufferSize() 142 * @see #getUnchecked() 143 */ 144 @Override 145 public ReaderInputStream get() throws IOException { 146 return new ReaderInputStream(getReader(), charsetEncoder, getBufferSize()); 147 } 148 149 CharsetEncoder getCharsetEncoder() { 150 return charsetEncoder; 151 } 152 153 @Override 154 public Builder setCharset(final Charset charset) { 155 super.setCharset(charset); 156 charsetEncoder = newEncoder(getCharset()); 157 return this; 158 } 159 160 /** 161 * Sets the charset encoder. Assumes that the caller has configured the encoder. 162 * 163 * @param newEncoder the charset encoder, null resets to a default encoder. 164 * @return {@code this} instance. 165 */ 166 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) { 167 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault())); 168 super.setCharset(charsetEncoder.charset()); 169 return this; 170 } 171 172 } 173 174 /** 175 * Constructs a new {@link Builder}. 176 * 177 * @return a new {@link Builder}. 178 * @since 2.12.0 179 */ 180 public static Builder builder() { 181 return new Builder(); 182 } 183 184 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { 185 final float minRequired = minBufferSize(charsetEncoder); 186 if (bufferSize < minRequired) { 187 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, 188 charsetEncoder.charset().displayName())); 189 } 190 return bufferSize; 191 } 192 193 static float minBufferSize(final CharsetEncoder charsetEncoder) { 194 return charsetEncoder.maxBytesPerChar() * 2; 195 } 196 197 private static CharsetEncoder newEncoder(final Charset charset) { 198 // @formatter:off 199 return Charsets.toCharset(charset).newEncoder() 200 .onMalformedInput(CodingErrorAction.REPLACE) 201 .onUnmappableCharacter(CodingErrorAction.REPLACE); 202 // @formatter:on 203 } 204 205 private final Reader reader; 206 207 private final CharsetEncoder charsetEncoder; 208 209 /** 210 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. 211 */ 212 private final CharBuffer encoderIn; 213 /** 214 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the 215 * caller. 216 */ 217 private final ByteBuffer encoderOut; 218 219 private CoderResult lastCoderResult; 220 221 private boolean endOfInput; 222 223 /** 224 * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@link Charset#defaultCharset() default charset} with a default input buffer 225 * size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 226 * 227 * @param reader the target {@link Reader} 228 * @deprecated Use {@link ReaderInputStream#builder()} instead 229 */ 230 @Deprecated 231 public ReaderInputStream(final Reader reader) { 232 this(reader, Charset.defaultCharset()); 233 } 234 235 /** 236 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 237 * 238 * <p> 239 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 240 * </p> 241 * 242 * @param reader the target {@link Reader} 243 * @param charset the charset encoding 244 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. 245 */ 246 @Deprecated 247 public ReaderInputStream(final Reader reader, final Charset charset) { 248 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); 249 } 250 251 /** 252 * Constructs a new {@link ReaderInputStream}. 253 * 254 * <p> 255 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 256 * </p> 257 * 258 * @param reader the target {@link Reader}. 259 * @param charset the charset encoding. 260 * @param bufferSize the size of the input buffer in number of characters. 261 * @deprecated Use {@link ReaderInputStream#builder()} instead 262 */ 263 @Deprecated 264 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { 265 // @formatter:off 266 this(reader, 267 Charsets.toCharset(charset).newEncoder() 268 .onMalformedInput(CodingErrorAction.REPLACE) 269 .onUnmappableCharacter(CodingErrorAction.REPLACE), 270 bufferSize); 271 // @formatter:on 272 } 273 274 /** 275 * Constructs a new {@link ReaderInputStream}. 276 * 277 * <p> 278 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 279 * an encoder which had already been in use. 280 * </p> 281 * 282 * @param reader the target {@link Reader} 283 * @param charsetEncoder the charset encoder 284 * @since 2.1 285 * @deprecated Use {@link ReaderInputStream#builder()} instead 286 */ 287 @Deprecated 288 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { 289 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); 290 } 291 292 /** 293 * Constructs a new {@link ReaderInputStream}. 294 * 295 * <p> 296 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 297 * an encoder which had already been in use. 298 * </p> 299 * 300 * @param reader the target {@link Reader} 301 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. 302 * @param bufferSize the size of the input buffer in number of characters 303 * @since 2.1 304 * @deprecated Use {@link ReaderInputStream#builder()} instead 305 */ 306 @Deprecated 307 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { 308 this.reader = reader; 309 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); 310 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); 311 this.encoderIn.flip(); 312 this.encoderOut = ByteBuffer.allocate(128); 313 this.encoderOut.flip(); 314 } 315 316 /** 317 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 318 * 319 * <p> 320 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 321 * </p> 322 * 323 * @param reader the target {@link Reader} 324 * @param charsetName the name of the charset encoding 325 * @deprecated Use {@link ReaderInputStream#builder()} instead 326 */ 327 @Deprecated 328 public ReaderInputStream(final Reader reader, final String charsetName) { 329 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); 330 } 331 332 /** 333 * Constructs a new {@link ReaderInputStream}. 334 * 335 * <p> 336 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 337 * </p> 338 * 339 * @param reader the target {@link Reader} 340 * @param charsetName the name of the charset encoding, null maps to the default Charset. 341 * @param bufferSize the size of the input buffer in number of characters 342 * @deprecated Use {@link ReaderInputStream#builder()} instead 343 */ 344 @Deprecated 345 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { 346 this(reader, Charsets.toCharset(charsetName), bufferSize); 347 } 348 349 @Override 350 public int available() throws IOException { 351 if (encoderOut.hasRemaining()) { 352 return encoderOut.remaining(); 353 } 354 return 0; 355 } 356 357 /** 358 * Closes the stream. This method will cause the underlying {@link Reader} to be closed. 359 * 360 * @throws IOException if an I/O error occurs. 361 */ 362 @Override 363 public void close() throws IOException { 364 reader.close(); 365 super.close(); 366 } 367 368 /** 369 * Fills the internal char buffer from the reader. 370 * 371 * @throws IOException If an I/O error occurs 372 */ 373 private void fillBuffer() throws IOException { 374 if (endOfInput) { 375 return; 376 } 377 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { 378 encoderIn.compact(); 379 final int position = encoderIn.position(); 380 // We don't use Reader#read(CharBuffer) here because it is more efficient 381 // to write directly to the underlying char array (the default implementation 382 // copies data to a temporary char array). 383 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); 384 if (c == EOF) { 385 endOfInput = true; 386 } else { 387 encoderIn.position(position + c); 388 } 389 encoderIn.flip(); 390 } 391 encoderOut.compact(); 392 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); 393 if (endOfInput) { 394 lastCoderResult = charsetEncoder.flush(encoderOut); 395 } 396 if (lastCoderResult.isError()) { 397 lastCoderResult.throwException(); 398 } 399 encoderOut.flip(); 400 } 401 402 /** 403 * Gets the CharsetEncoder. 404 * 405 * @return the CharsetEncoder. 406 */ 407 CharsetEncoder getCharsetEncoder() { 408 return charsetEncoder; 409 } 410 411 /** 412 * Reads a single byte. 413 * 414 * @return either the byte read or {@code -1} if the end of the stream has been reached 415 * @throws IOException if an I/O error occurs. 416 */ 417 @Override 418 public int read() throws IOException { 419 checkOpen(); 420 for (;;) { 421 if (encoderOut.hasRemaining()) { 422 return encoderOut.get() & 0xFF; 423 } 424 fillBuffer(); 425 if (endOfInput && !encoderOut.hasRemaining()) { 426 return EOF; 427 } 428 } 429 } 430 431 /** 432 * Reads the specified number of bytes into an array. 433 * 434 * @param b the byte array to read into 435 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 436 * @throws IOException if an I/O error occurs. 437 */ 438 @Override 439 public int read(final byte[] b) throws IOException { 440 return read(b, 0, b.length); 441 } 442 443 /** 444 * Reads the specified number of bytes into an array. 445 * 446 * @param array the byte array to read into 447 * @param off the offset to start reading bytes into 448 * @param len the number of bytes to read 449 * @return the number of bytes read or {@code -1} if the end of the stream has been reached 450 * @throws IOException if an I/O error occurs. 451 */ 452 @Override 453 public int read(final byte[] array, int off, int len) throws IOException { 454 Objects.requireNonNull(array, "array"); 455 if (len < 0 || off < 0 || off + len > array.length) { 456 throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len); 457 } 458 int read = 0; 459 if (len == 0) { 460 return 0; // Always return 0 if len == 0 461 } 462 while (len > 0) { 463 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied 464 final int c = Math.min(encoderOut.remaining(), len); 465 encoderOut.get(array, off, c); 466 off += c; 467 len -= c; 468 read += c; 469 } else if (endOfInput) { // Already reach EOF in the last read 470 break; 471 } else { // Read again 472 fillBuffer(); 473 } 474 } 475 return read == 0 && endOfInput ? EOF : read; 476 } 477}