001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.Reader; 026import java.io.StringReader; 027import java.net.HttpURLConnection; 028import java.net.URL; 029import java.net.URLConnection; 030import java.nio.charset.Charset; 031import java.nio.charset.StandardCharsets; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.text.MessageFormat; 035import java.util.Locale; 036import java.util.Objects; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import org.apache.commons.io.ByteOrderMark; 041import org.apache.commons.io.Charsets; 042import org.apache.commons.io.IOUtils; 043import org.apache.commons.io.build.AbstractStreamBuilder; 044import org.apache.commons.io.function.IOConsumer; 045import org.apache.commons.io.output.XmlStreamWriter; 046 047/** 048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream. 
049 * <p> 050 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream. 051 * </p> 052 * <p> 053 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100% 054 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers). 055 * </p> 056 * <p> 057 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors. 058 * </p> 059 * <p> 060 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML 061 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types"> 062 * Determining the character encoding of a feed</a>. 063 * </p> 064 * <p> 065 * To build an instance, use {@link Builder}. 066 * </p> 067 * <p> 068 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0. 069 * </p> 070 * 071 * @see Builder 072 * @see org.apache.commons.io.output.XmlStreamWriter 073 * @since 2.0 074 */ 075public class XmlStreamReader extends Reader { 076 077 // @formatter:off 078 /** 079 * Builds a new {@link XmlStreamReader}. 080 * 081 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 082 * <p> 083 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 084 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 
085 * </p> 086 * <p> 087 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 088 * </p> 089 * <p> 090 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 091 * </p> 092 * <p> 093 * Else if the XML prolog had a charset encoding that encoding is used. 094 * </p> 095 * <p> 096 * Else if the content type had a charset encoding that encoding is used. 097 * </p> 098 * <p> 099 * Else 'UTF-8' is used. 100 * </p> 101 * <p> 102 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 103 * </p> 104 * <p> 105 * For example: 106 * </p> 107 * 108 * <pre>{@code 109 * XmlStreamReader r = XmlStreamReader.builder() 110 * .setPath(path) 111 * .setCharset(StandardCharsets.UTF_8) 112 * .get(); 113 * } 114 * </pre> 115 * 116 * @see #get() 117 * @since 2.12.0 118 */ 119 // @formatter:on 120 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> { 121 122 private boolean nullCharset = true; 123 private boolean lenient = true; 124 private String httpContentType; 125 126 /** 127 * Builds a new {@link XmlStreamReader}. 128 * <p> 129 * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception. 130 * </p> 131 * <p> 132 * This builder uses the following aspects: 133 * </p> 134 * <ul> 135 * <li>{@link #getInputStream()}</li> 136 * <li>{@link #getCharset()}</li> 137 * <li>lenient</li> 138 * <li>httpContentType</li> 139 * </ul> 140 * 141 * @return a new instance. 142 * @throws IllegalStateException if the {@code origin} is {@code null}. 143 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}. 144 * @throws IOException if an I/O error occurs. 145 * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification. 
146 * @see #getInputStream() 147 */ 148 @SuppressWarnings("resource") 149 @Override 150 public XmlStreamReader get() throws IOException { 151 final String defaultEncoding = nullCharset ? null : getCharset().name(); 152 // @formatter:off 153 return httpContentType == null 154 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding) 155 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding); 156 // @formatter:on 157 } 158 /** Records whether a {@code null} charset was requested so {@link #get()} can pass a {@code null} default encoding. */ 159 @Override 160 public Builder setCharset(final Charset charset) { 161 nullCharset = charset == null; 162 return super.setCharset(charset); 163 } 164 /** Same {@code null} bookkeeping as {@link #setCharset(Charset)}, for a charset given by name. */ 165 @Override 166 public Builder setCharset(final String charset) { 167 nullCharset = charset == null; 168 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault())); 169 } 170 171 /** 172 * Sets the HTTP content type. 173 * 174 * @param httpContentType the HTTP content type. 175 * @return {@code this} instance. 176 */ 177 public Builder setHttpContentType(final String httpContentType) { 178 this.httpContentType = httpContentType; 179 return this; 180 } 181 182 /** 183 * Sets the lenient toggle. 184 * 185 * @param lenient the lenient toggle. 186 * @return {@code this} instance. 
187 */ 188 public Builder setLenient(final boolean lenient) { 189 this.lenient = lenient; 190 return this; 191 } 192 193 } 194 195 private static final String UTF_8 = StandardCharsets.UTF_8.name(); 196 197 private static final String US_ASCII = StandardCharsets.US_ASCII.name(); 198 199 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name(); 200 201 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name(); 202 203 private static final String UTF_32BE = "UTF-32BE"; 204 205 private static final String UTF_32LE = "UTF-32LE"; 206 207 private static final String UTF_16 = StandardCharsets.UTF_16.name(); 208 209 private static final String UTF_32 = "UTF-32"; 210 211 private static final String EBCDIC = "CP1047"; 212 213 private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, 214 ByteOrderMark.UTF_32LE }; 215 216 /** Leading bytes of an XML declaration ({@code <?xm}, or just {@code <?} for UTF-16) as encoded in each candidate charset. UTF_16LE and UTF_32LE have the same two starting BOM bytes. */ 217 private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 218 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 219 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D), 220 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00), 221 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) }; 222 223 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 224 225 /** 226 * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>. 
227 * <p> 228 * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification. 
229 * </p> 230 * <p> 231 * Note the documented pattern is: 232 * </p> 233 * <pre> 234 * EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 235 * </pre> 236 * <p> 237 * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and 238 * {@code 'ebcdic-de-273+euro'}. 239 * </p> 240 */ 241 public static final Pattern ENCODING_PATTERN = Pattern.compile( 242 // @formatter:off 243 "^<\\?xml\\s+" 244 + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??" 245 + "encoding\\s*=\\s*" 246 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted 247 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted 248 Pattern.MULTILINE); 249 // @formatter:on 250 251 private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 252 253 private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 254 255 private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null"; 256 257 private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 258 259 private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME"; 260 261 /** 262 * Constructs a new {@link Builder}. 263 * 264 * @return a new {@link Builder}. 265 * @since 2.12.0 266 */ 267 public static Builder builder() { 268 return new Builder(); 269 } 270 271 /** 272 * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}. 
273 * 274 * @param httpContentType the HTTP content type 275 * @return The content type encoding (upcased) 276 */ 277 static String getContentTypeEncoding(final String httpContentType) { 278 String encoding = null; 279 if (httpContentType != null) { 280 final int i = httpContentType.indexOf(";"); 281 if (i > -1) { 282 final String postMime = httpContentType.substring(i + 1); 283 final Matcher m = CHARSET_PATTERN.matcher(postMime); 284 encoding = m.find() ? m.group(1) : null; 285 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null; 286 } 287 } 288 return encoding; 289 } 290 291 /** 292 * Gets the MIME type or {@code null} if httpContentType is {@code null}. 293 * 294 * @param httpContentType the HTTP content type 295 * @return The mime content type 296 */ 297 static String getContentTypeMime(final String httpContentType) { 298 String mime = null; 299 if (httpContentType != null) { 300 final int i = httpContentType.indexOf(";"); 301 mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType; 302 mime = mime.trim(); 303 } 304 return mime; 305 } 306 307 /** 308 * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none. 309 * 310 * @param inputStream InputStream to create the reader from. 311 * @param guessedEnc guessed encoding 312 * @return the encoding declared in the <?xml encoding=...?> 313 * @throws IOException thrown if there is a problem reading the stream. 
314 */ 315 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException { 316 String encoding = null; 317 if (guessedEnc != null) { 318 final byte[] bytes = IOUtils.byteArray(); 319 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE); 320 int offset = 0; 321 int max = IOUtils.DEFAULT_BUFFER_SIZE; 322 int c = inputStream.read(bytes, offset, max); 323 int firstGT = -1; 324 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) 325 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) { 326 offset += c; 327 max -= c; 328 c = inputStream.read(bytes, offset, max); 329 xmlProlog = new String(bytes, 0, offset, guessedEnc); 330 firstGT = xmlProlog.indexOf('>'); 331 } 332 if (firstGT == -1) { 333 if (c == -1) { 334 throw new IOException("Unexpected end of XML stream"); 335 } 336 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes"); 337 } 338 final int bytesRead = offset; 339 if (bytesRead > 0) { 340 inputStream.reset(); 341 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1))); 342 final StringBuilder prolog = new StringBuilder(); 343 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' ')); 344 final Matcher m = ENCODING_PATTERN.matcher(prolog); 345 if (m.find()) { 346 encoding = m.group(1).toUpperCase(Locale.ROOT); 347 encoding = encoding.substring(1, encoding.length() - 1); 348 } 349 } 350 } 351 return encoding; 352 } 353 354 /** 355 * Tests if the MIME type belongs to the APPLICATION XML family. 
356 * 357 * @param mime The mime type 358 * @return true if the mime type belongs to the APPLICATION XML family, otherwise false 359 */ 360 static boolean isAppXml(final String mime) { 361 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity") 362 || mime.startsWith("application/") && mime.endsWith("+xml")); 363 } 364 365 /** 366 * Tests if the MIME type belongs to the TEXT XML family. 367 * 368 * @param mime The mime type 369 * @return true if the mime type belongs to the TEXT XML family, otherwise false 370 */ 371 static boolean isTextXml(final String mime) { 372 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml")); 373 } 374 375 private final Reader reader; 376 377 private final String encoding; 378 379 private final String defaultEncoding; 380 381 /** 382 * Constructs a Reader for a File. 383 * <p> 384 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 385 * </p> 386 * <p> 387 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 388 * </p> 389 * 390 * @param file File to create a Reader from. 391 * @throws NullPointerException if the input is {@code null}. 392 * @throws IOException thrown if there is a problem reading the file. 393 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 394 */ 395 @Deprecated 396 public XmlStreamReader(final File file) throws IOException { 397 this(Objects.requireNonNull(file, "file").toPath()); 398 } 399 400 /** 401 * Constructs a Reader for a raw InputStream. 402 * <p> 403 * It follows the same logic used for files. 404 * </p> 405 * <p> 406 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 
407 * </p> 408 * 409 * @param inputStream InputStream to create a Reader from. 410 * @throws NullPointerException if the input stream is {@code null}. 411 * @throws IOException thrown if there is a problem reading the stream. 412 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 413 */ 414 @Deprecated 415 public XmlStreamReader(final InputStream inputStream) throws IOException { 416 this(inputStream, true); 417 } 418 419 /** 420 * Constructs a Reader for a raw InputStream. 421 * <p> 422 * It follows the same logic used for files. 423 * </p> 424 * <p> 425 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 426 * </p> 427 * <p> 428 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 429 * </p> 430 * <p> 431 * Else if the XML prolog had a charset encoding that encoding is used. 432 * </p> 433 * <p> 434 * Else if the content type had a charset encoding that encoding is used. 435 * </p> 436 * <p> 437 * Else 'UTF-8' is used. 438 * </p> 439 * <p> 440 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 441 * </p> 442 * 443 * @param inputStream InputStream to create a Reader from. 444 * @param lenient indicates if the charset encoding detection should be relaxed. 445 * @throws NullPointerException if the input stream is {@code null}. 446 * @throws IOException thrown if there is a problem reading the stream. 447 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 448 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 449 */ 450 @Deprecated 451 public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException { 452 this(inputStream, lenient, null); 453 } 454 455 /** 456 * Constructs a Reader for a raw InputStream. 457 * <p> 458 * It follows the same logic used for files. 
459 * </p> 460 * <p> 461 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 462 * </p> 463 * <p> 464 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 465 * </p> 466 * <p> 467 * Else if the XML prolog had a charset encoding that encoding is used. 468 * </p> 469 * <p> 470 * Else if the content type had a charset encoding that encoding is used. 471 * </p> 472 * <p> 473 * Else 'UTF-8' is used. 474 * </p> 475 * <p> 476 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 477 * </p> 478 * 479 * @param inputStream InputStream to create a Reader from. 480 * @param lenient indicates if the charset encoding detection should be relaxed. 481 * @param defaultEncoding The default encoding, may be {@code null}. 482 * @throws NullPointerException if the input stream is {@code null}. 483 * @throws IOException thrown if there is a problem reading the stream. 484 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 485 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 486 */ 487 @Deprecated 488 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 489 public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException { 490 this.defaultEncoding = defaultEncoding; 491 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE), 492 false, BOMS); 493 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 494 this.encoding = processHttpStream(bom, pis, lenient); 495 this.reader = new InputStreamReader(pis, encoding); 496 } 497 498 /** 499 * Constructs a Reader using an InputStream and the associated content-type header. 
500 * <p> 501 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 502 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 503 * </p> 504 * <p> 505 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 506 * </p> 507 * 508 * @param inputStream InputStream to create the reader from. 509 * @param httpContentType content-type header to use for the resolution of the charset encoding. 510 * @throws NullPointerException if the input stream is {@code null}. 511 * @throws IOException thrown if there is a problem reading the stream. 512 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 513 */ 514 @Deprecated 515 public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException { 516 this(inputStream, httpContentType, true); 517 } 518 519 /** 520 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 521 * <p> 522 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 523 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 524 * </p> 525 * <p> 526 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 527 * </p> 528 * <p> 529 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 530 * </p> 531 * <p> 532 * Else if the XML prolog had a charset encoding that encoding is used. 533 * </p> 534 * <p> 535 * Else if the content type had a charset encoding that encoding is used. 536 * </p> 537 * <p> 538 * Else 'UTF-8' is used. 
539 * </p> 540 * <p> 541 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 542 * </p> 543 * 544 * @param inputStream InputStream to create the reader from. 545 * @param httpContentType content-type header to use for the resolution of the charset encoding. 546 * @param lenient indicates if the charset encoding detection should be relaxed. 547 * @throws NullPointerException if the input stream is {@code null}. 548 * @throws IOException thrown if there is a problem reading the stream. 549 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 550 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 551 */ 552 @Deprecated 553 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException { 554 this(inputStream, httpContentType, lenient, null); 555 } 556 557 /** 558 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 559 * <p> 560 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 561 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 562 * </p> 563 * <p> 564 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 565 * </p> 566 * <p> 567 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 568 * </p> 569 * <p> 570 * Else if the XML prolog had a charset encoding that encoding is used. 571 * </p> 572 * <p> 573 * Else if the content type had a charset encoding that encoding is used. 574 * </p> 575 * <p> 576 * Else 'UTF-8' is used. 
577 * </p> 578 * <p> 579 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 580 * </p> 581 * 582 * @param inputStream InputStream to create the reader from. 583 * @param httpContentType content-type header to use for the resolution of the charset encoding. 584 * @param lenient indicates if the charset encoding detection should be relaxed. 585 * @param defaultEncoding The default encoding, may be {@code null}. 586 * @throws NullPointerException if the input stream is {@code null}. 587 * @throws IOException thrown if there is a problem reading the stream. 588 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 589 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 590 */ 591 @Deprecated 592 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 593 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding) 594 throws IOException { 595 this.defaultEncoding = defaultEncoding; 596 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE), 597 false, BOMS); 598 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 599 this.encoding = processHttpStream(bom, pis, lenient, httpContentType); 600 this.reader = new InputStreamReader(pis, encoding); 601 } 602 603 /** 604 * Constructs a Reader for a File. 605 * <p> 606 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 607 * </p> 608 * <p> 609 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 610 * </p> 611 * 612 * @param file Path of the file to create a Reader from. 613 * @throws NullPointerException if the input is {@code null}. 
614 * @throws IOException thrown if there is a problem reading the file. 615 * @since 2.11.0 616 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 617 */ 618 @Deprecated 619 @SuppressWarnings("resource") // InputStream is managed through another reader in this instance. 620 public XmlStreamReader(final Path file) throws IOException { 621 this(Files.newInputStream(Objects.requireNonNull(file, "file"))); 622 } 623 624 /** 625 * Constructs a Reader using the InputStream of a URL. 626 * <p> 627 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files. 628 * </p> 629 * <p> 630 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type. 631 * </p> 632 * <p> 633 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 634 * </p> 635 * 636 * @param url URL to create a Reader from. 637 * @throws NullPointerException if the input is {@code null}. 638 * @throws IOException thrown if there is a problem reading the stream of the URL. 639 */ 640 public XmlStreamReader(final URL url) throws IOException { 641 this(Objects.requireNonNull(url, "url").openConnection(), null); 642 } 643 644 /** 645 * Constructs a Reader using the InputStream of a URLConnection. 646 * <p> 647 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files. 648 * </p> 649 * <p> 650 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with 651 * content-type. 652 * </p> 653 * <p> 654 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 655 * </p> 656 * 657 * @param urlConnection URLConnection to create a Reader from. 
658 * @param defaultEncoding The default encoding 659 * @throws NullPointerException if the input is {@code null}. 660 * @throws IOException thrown if there is a problem reading the stream of the URLConnection. 661 */ 662 public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException { 663 Objects.requireNonNull(urlConnection, "urlConnection"); 664 this.defaultEncoding = defaultEncoding; 665 final boolean lenient = true; 666 final String contentType = urlConnection.getContentType(); 667 final InputStream inputStream = urlConnection.getInputStream(); 668 @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance 669 // @formatter:off 670 final BOMInputStream bomInput = BOMInputStream.builder() 671 .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE)) 672 .setInclude(false) 673 .setByteOrderMarks(BOMS) 674 .get(); 675 @SuppressWarnings("resource") 676 final BOMInputStream piInput = BOMInputStream.builder() 677 .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE)) 678 .setInclude(true) 679 .setByteOrderMarks(XML_GUESS_BYTES) 680 .get(); 681 // @formatter:on 682 if (urlConnection instanceof HttpURLConnection || contentType != null) { 683 this.encoding = processHttpStream(bomInput, piInput, lenient, contentType); 684 } else { 685 this.encoding = processHttpStream(bomInput, piInput, lenient); 686 } 687 this.reader = new InputStreamReader(piInput, encoding); 688 } 689 690 /** 691 * Calculates the HTTP encoding. 692 * @param bomEnc BOM encoding 693 * @param xmlGuessEnc XML Guess encoding 694 * @param xmlEnc XML encoding 695 * @param lenient indicates if the charset encoding detection should be relaxed. 696 * @param httpContentType The HTTP content type 697 * 698 * @return the HTTP encoding 699 * @throws IOException thrown if there is a problem reading the stream. 
700 */ 701 String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType) 702 throws IOException { 703 704 // Lenient and has XML encoding 705 if (lenient && xmlEnc != null) { 706 return xmlEnc; 707 } 708 709 // Determine mime/encoding content types from HTTP Content Type 710 final String cTMime = getContentTypeMime(httpContentType); 711 final String cTEnc = getContentTypeEncoding(httpContentType); 712 final boolean appXml = isAppXml(cTMime); 713 final boolean textXml = isTextXml(cTMime); 714 715 // Mime type NOT "application/xml" or "text/xml" 716 if (!appXml && !textXml) { 717 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 718 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 719 } 720 721 // No content type encoding 722 if (cTEnc == null) { 723 if (appXml) { 724 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 725 } 726 return defaultEncoding == null ? 
US_ASCII : defaultEncoding; 727 } 728 729 // UTF-16BE or UTF-16LE content type encoding 730 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 731 if (bomEnc != null) { 732 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 733 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 734 } 735 return cTEnc; 736 } 737 738 // UTF-16 content type encoding 739 if (cTEnc.equals(UTF_16)) { 740 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 741 return bomEnc; 742 } 743 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 744 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 745 } 746 747 // UTF-32BE or UTF-132E content type encoding 748 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { 749 if (bomEnc != null) { 750 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 751 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 752 } 753 return cTEnc; 754 } 755 756 // UTF-32 content type encoding 757 if (cTEnc.equals(UTF_32)) { 758 if (bomEnc != null && bomEnc.startsWith(UTF_32)) { 759 return bomEnc; 760 } 761 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 762 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 763 } 764 765 return cTEnc; 766 } 767 768 /** 769 * Calculate the raw encoding. 770 * 771 * @param bomEnc BOM encoding 772 * @param xmlGuessEnc XML Guess encoding 773 * @param xmlEnc XML encoding 774 * @return the raw encoding 775 * @throws IOException thrown if there is a problem reading the stream. 
776 */ 777 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException { 778 779 // BOM is Null 780 if (bomEnc == null) { 781 if (xmlGuessEnc == null || xmlEnc == null) { 782 return defaultEncoding == null ? UTF_8 : defaultEncoding; 783 } 784 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 785 return xmlGuessEnc; 786 } 787 return xmlEnc; 788 } 789 790 // BOM is UTF-8 791 if (bomEnc.equals(UTF_8)) { 792 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 793 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 794 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 795 } 796 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 797 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 798 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 799 } 800 return bomEnc; 801 } 802 803 // BOM is UTF-16BE or UTF-16LE 804 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 805 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 806 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 807 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 808 } 809 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 810 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 811 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 812 } 813 return bomEnc; 814 } 815 816 // BOM is UTF-32BE or UTF-32LE 817 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) { 818 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 819 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 820 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 821 } 822 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) { 823 final String msg = 
MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 824 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 825 } 826 return bomEnc; 827 } 828 829 // BOM is something else 830 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc); 831 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 832 } 833 834 /** 835 * Closes the XmlStreamReader stream. 836 * 837 * @throws IOException thrown if there was a problem closing the stream. 838 */ 839 @Override 840 public void close() throws IOException { 841 reader.close(); 842 } 843 844 /** 845 * Does lenient detection. 846 * 847 * @param httpContentType content-type header to use for the resolution of the charset encoding. 848 * @param ex The thrown exception 849 * @return the encoding 850 * @throws IOException thrown if there is a problem reading the stream. 851 */ 852 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException { 853 if (httpContentType != null && httpContentType.startsWith("text/html")) { 854 httpContentType = httpContentType.substring("text/html".length()); 855 httpContentType = "text/xml" + httpContentType; 856 try { 857 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType); 858 } catch (final XmlStreamReaderException ex2) { 859 ex = ex2; 860 } 861 } 862 String encoding = ex.getXmlEncoding(); 863 if (encoding == null) { 864 encoding = ex.getContentTypeEncoding(); 865 } 866 if (encoding == null) { 867 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 868 } 869 return encoding; 870 } 871 872 /** 873 * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate. 874 * <p> 875 * If it is {@code null} the content-type based rules are used. 876 * </p> 877 * 878 * @return the default encoding to use. 
879 */ 880 public String getDefaultEncoding() { 881 return defaultEncoding; 882 } 883 884 /** 885 * Gets the charset encoding of the XmlStreamReader. 886 * 887 * @return charset encoding. 888 */ 889 public String getEncoding() { 890 return encoding; 891 } 892 893 /** 894 * Process the raw stream. 895 * 896 * @param bomInput BOMInputStream to detect byte order marks 897 * @param piInput BOMInputStream to guess XML encoding 898 * @param lenient indicates if the charset encoding detection should be relaxed. 899 * @return the encoding to be used 900 * @throws IOException thrown if there is a problem reading the stream. 901 */ 902 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException { 903 final String bomEnc = bomInput.getBOMCharsetName(); 904 final String xmlGuessEnc = piInput.getBOMCharsetName(); 905 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc); 906 try { 907 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 908 } catch (final XmlStreamReaderException ex) { 909 if (lenient) { 910 return doLenientDetection(null, ex); 911 } 912 throw ex; 913 } 914 } 915 916 /** 917 * Processes an HTTP stream. 918 * 919 * @param bomInput BOMInputStream to detect byte order marks 920 * @param piInput BOMInputStream to guess XML encoding 921 * @param lenient indicates if the charset encoding detection should be relaxed. 922 * @param httpContentType The HTTP content type 923 * @return the encoding to be used 924 * @throws IOException thrown if there is a problem reading the stream. 
*/
    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
            throws IOException {
        final String bomCharset = bomInput.getBOMCharsetName();
        final String guessedCharset = piInput.getBOMCharsetName();
        final String prologCharset = getXmlProlog(piInput, guessedCharset);
        try {
            return calculateHttpEncoding(bomCharset, guessedCharset, prologCharset, lenient, httpContentType);
        } catch (final XmlStreamReaderException detectionFailure) {
            // Strict mode propagates the failure; lenient mode retries with the relaxed rules.
            if (!lenient) {
                throw detectionFailure;
            }
            return doLenientDetection(httpContentType, detectionFailure);
        }
    }

    /**
     * Invokes the underlying reader's {@code read(char[], int, int)} method.
     *
     * @param buf the buffer to read the characters into
     * @param offset The start offset
     * @param len The maximum number of characters to read
     * @return the number of characters read or -1 if the end of stream
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final char[] buf, final int offset, final int len) throws IOException {
        final int charsRead = this.reader.read(buf, offset, len);
        return charsRead;
    }

}