CSVParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.csv;
import static org.apache.commons.csv.Token.Type.TOKEN;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.io.function.Uncheck;
/**
* Parses CSV files according to the specified format.
*
* Because CSV appears in many different dialects, the parser supports many formats by allowing the
* specification of a {@link CSVFormat}.
*
* The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
*
* <h2>Creating instances</h2>
* <p>
* There are several static factory methods that can be used to create instances for various types of resources:
* </p>
* <ul>
* <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
* <li>{@link #parse(String, CSVFormat)}</li>
* <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
* </ul>
* <p>
* Alternatively, parsers can also be created by passing a {@link Reader} directly to one of the constructors.
*
* For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
* </p>
* <pre>
* for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
* ...
* }
* </pre>
*
* <h2>Parsing record wise</h2>
* <p>
* To parse a CSV input from a file, you write:
* </p>
*
* <pre>{@code
* File csvData = new File("/path/to/csv");
* CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
* for (CSVRecord csvRecord : parser) {
* ...
* }}
* </pre>
*
* <p>
* This will read and parse the contents of the file using the
* <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
* </p>
*
* <p>
* To parse CSV input in a format like Excel, you write:
* </p>
*
* <pre>
* CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
* for (CSVRecord csvRecord : parser) {
* ...
* }
* </pre>
*
* <p>
* If the predefined formats don't match the format at hand, custom formats can be defined. More information about
* customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
* </p>
*
* <h2>Parsing into memory</h2>
* <p>
* If parsing record-wise is not desired, the contents of the input can be read completely into memory.
* </p>
*
* <pre>{@code
* Reader in = new StringReader("a;b\nc;d");
* CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
* List<CSVRecord> list = parser.getRecords();
* }</pre>
*
* <p>
* There are two constraints that have to be kept in mind:
* </p>
*
* <ol>
* <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
* the input, those records will not end up in the in-memory representation of your CSV data.</li>
* <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
* parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
* </ol>
*
* <h2>Notes</h2>
* <p>
* The internal parser state is completely covered by the format and the reader state.
* </p>
*
* @see <a href="package-summary.html">package documentation for more details</a>
*/
public final class CSVParser implements Iterable<CSVRecord>, Closeable {
/**
 * Iterator over the records of the enclosing parser. Records are pulled lazily, one at a time, from
 * {@link CSVParser#nextRecord()}.
 */
final class CSVRecordIterator implements Iterator<CSVRecord> {

    /** Record fetched by {@link #hasNext()} that {@link #next()} has not handed out yet; {@code null} if none. */
    private CSVRecord current;

    /**
     * Reads the next record from the enclosing parser through {@code Uncheck.get}, so that no checked
     * {@link IOException} has to be declared here.
     *
     * @return the next record, or {@code null} if the parser has none left.
     */
    private CSVRecord getNextRecord() {
        return Uncheck.get(CSVParser.this::nextRecord);
    }

    @Override
    public boolean hasNext() {
        if (isClosed()) {
            return false;
        }
        if (current != null) {
            // A record is already buffered from an earlier call.
            return true;
        }
        current = getNextRecord();
        return current != null;
    }

    @Override
    public CSVRecord next() {
        if (isClosed()) {
            throw new NoSuchElementException("CSVParser has been closed");
        }
        final CSVRecord buffered = current;
        current = null;
        if (buffered != null) {
            return buffered;
        }
        // hasNext() was not called beforehand, so nothing is buffered: read directly.
        final CSVRecord fetched = getNextRecord();
        if (fetched == null) {
            throw new NoSuchElementException("No more CSV records available");
        }
        return fetched;
    }

    /**
     * Removal is not supported.
     *
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
 * Holder for the two views of the header: lookup by name and names in column order.
 */
private static final class Headers {

    /** Maps a header name to its 0-based column position. */
    final Map<String, Integer> headerMap;

    /** Header names listed in column order. */
    final List<String> headerNames;

    /**
     * Stores the given references as they are; no copies are made.
     *
     * @param nameToIndex  header name to 0-based column position.
     * @param orderedNames header names in column order.
     */
    Headers(final Map<String, Integer> nameToIndex, final List<String> orderedNames) {
        headerNames = orderedNames;
        headerMap = nameToIndex;
    }
}
/**
 * Creates a parser that reads CSV data from the given {@link File}.
 * <p>
 * The file is converted with {@link File#toPath()} and passed on to
 * {@link #parse(Path, Charset, CSVFormat)}.
 * </p>
 *
 * @param file
 *            a CSV file. Must not be null.
 * @param charset
 *            The Charset to decode the given file.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @return a new parser
 * @throws NullPointerException
 *             If {@code file} is null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If an I/O error occurs
 * @throws CSVException Thrown on invalid input.
 */
public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
    final Path path = Objects.requireNonNull(file, "file").toPath();
    return parse(path, charset, format);
}
/**
 * Creates a CSV parser that decodes the given {@link InputStream} with the given charset.
 *
 * <p>
 * If you do not read all records from the given {@code inputStream}, you should call {@link #close()} on the
 * parser, unless you close the {@code inputStream}.
 * </p>
 *
 * @param inputStream
 *            an InputStream containing CSV-formatted input. Must not be null.
 * @param charset
 *            The Charset used to decode the stream.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @return a new CSVParser reading the decoded stream with the given format.
 * @throws NullPointerException
 *             If {@code inputStream} or {@code format} is null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 * @throws CSVException Thrown on invalid input.
 * @since 1.5
 */
@SuppressWarnings("resource")
public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
        throws IOException {
    Objects.requireNonNull(inputStream, "inputStream");
    Objects.requireNonNull(format, "format");
    // Wrap the bytes in a decoding reader and hand over to the Reader-based factory.
    final Reader decodingReader = new InputStreamReader(inputStream, charset);
    return parse(decodingReader, format);
}
/**
 * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
 *
 * <p>
 * The file is opened with {@code Files.newInputStream(path)}. If creating the parser fails after the file has been
 * opened (for example because the header cannot be read or is rejected), the stream is closed here before the
 * exception is propagated, so that no file handle is left open. A failure to close is attached to the propagated
 * exception as a suppressed exception.
 * </p>
 *
 * @param path
 *            a CSV file. Must not be null.
 * @param charset
 *            The Charset to decode the given file.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @return a new parser
 * @throws NullPointerException
 *             If {@code path} or {@code format} is null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If an I/O error occurs
 * @throws CSVException Thrown on invalid input.
 * @since 1.5
 */
@SuppressWarnings("resource")
public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
    Objects.requireNonNull(path, "path");
    Objects.requireNonNull(format, "format");
    final InputStream inputStream = Files.newInputStream(path);
    try {
        return parse(inputStream, charset, format);
    } catch (final IOException | RuntimeException e) {
        // No parser was returned, so nobody else can close the stream opened above.
        try {
            inputStream.close();
        } catch (final IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/**
 * Creates a CSV parser that reads from the given {@link Reader} using the given {@link CSVFormat}.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @return a new CSVParser configured with the given reader and format.
 * @throws NullPointerException
 *             If {@code reader} or {@code format} is null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 * @throws CSVException Thrown on invalid input.
 * @since 1.5
 */
public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
    final CSVParser parser = new CSVParser(reader, format);
    return parser;
}
/**
 * Creates a parser that reads CSV data from the given {@link String}.
 *
 * @param string
 *            a CSV string. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @return a new parser
 * @throws NullPointerException
 *             If {@code string} or {@code format} is null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If an I/O error occurs
 * @throws CSVException Thrown on invalid input.
 */
public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
    // The string is checked first, then the format, before anything is constructed.
    final String text = Objects.requireNonNull(string, "string");
    final CSVFormat csvFormat = Objects.requireNonNull(format, "format");
    final Reader source = new StringReader(text);
    return new CSVParser(source, csvFormat);
}
/**
 * Creates and returns a parser for the given URL, which the caller MUST close.
 *
 * <p>
 * The resource is opened with {@link URL#openStream()}. If creating the parser fails after the stream has been
 * opened (for example because the header cannot be read or is rejected), the stream is closed here before the
 * exception is propagated. A failure to close is attached to the propagated exception as a suppressed exception.
 * </p>
 * <p>
 * If you do not read all records, you should still call {@link #close()} on the returned parser.
 * </p>
 *
 * @param url
 *            a URL. Must not be null.
 * @param charset
 *            the charset for the resource. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @return a new parser
 * @throws NullPointerException
 *             If {@code url}, {@code charset} or {@code format} is null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent.
 * @throws IOException
 *             If an I/O error occurs
 * @throws CSVException Thrown on invalid input.
 */
@SuppressWarnings("resource")
public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
    Objects.requireNonNull(url, "url");
    Objects.requireNonNull(charset, "charset");
    Objects.requireNonNull(format, "format");
    final InputStream inputStream = url.openStream();
    try {
        return new CSVParser(new InputStreamReader(inputStream, charset), format);
    } catch (final IOException | RuntimeException e) {
        // No parser was returned, so nobody else can close the stream opened above.
        try {
            inputStream.close();
        } catch (final IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/** Comment attached to the header record; assigned in createHeaders(), otherwise null. */
private String headerComment;
/** Comment text found before EOF with no record after it; assigned in nextRecord(), otherwise null. */
private String trailerComment;
/** Copy of the format passed to the constructor (made with {@code format.copy()}). */
private final CSVFormat format;
/** Header map and header names built once by createHeaders() during construction. */
private final Headers headers;
/** Token source wrapping the reader passed to the constructor. */
private final Lexer lexer;
/** The single iterator instance handed out by iterator(). */
private final CSVRecordIterator csvRecordIterator;
/** A record buffer for nextRecord(). Cleared at the start of each call, grows as necessary and is reused. */
private final List<String> recordList = new ArrayList<>();
/**
* Number of the most recently created record. nextRecord() increments this value before assigning it to the
* record it creates.
*/
private long recordNumber;
/**
* Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
* with {@link #recordNumber}.
*/
private final long characterOffset;
/** Token instance that nextRecord() resets and refills for every token instead of allocating a new one. */
private final Token reusableToken = new Token();
/**
* Constructs a new instance using the given {@link CSVFormat}.
*
* <p>
* Delegates to {@link #CSVParser(Reader, CSVFormat, long, long)} with a character offset of 0 and a first record
* number of 1.
* </p>
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @throws NullPointerException
* If either reader or format is null.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @throws CSVException Thrown on invalid input.
*/
public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
this(reader, format, 0, 1);
}
/**
* Constructs a new instance using the given {@link CSVFormat}, a character offset and a starting record number.
*
* <p>
* The header is resolved during construction, which may already read the first record from the reader.
* </p>
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign
* @throws NullPointerException
* If either the reader or format is null.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent, or a header name is missing or duplicated in a way the
* format does not allow.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @throws CSVException Thrown on invalid input.
* @since 1.1
*/
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.csvRecordIterator = new CSVRecordIterator();
// createHeaders() may call nextRecord(), which increments the recordNumber field; that increment is discarded
// by the assignment below, so a record consumed for the header does not count.
// NOTE(review): characterOffset is also assigned only after this call, so a record read here is positioned
// without the offset - confirm this is intended.
this.headers = createHeaders();
this.characterOffset = characterOffset;
// Stored one lower than requested because nextRecord() increments the field before using it.
this.recordNumber = recordNumber - 1;
}
/**
 * Appends the content of the current token to the record buffer.
 * <p>
 * The content is trimmed by the format and passed through {@link #handleNull(String)}. An empty value that
 * ends a record is dropped when the format uses a trailing delimiter.
 * </p>
 *
 * @param lastRecord whether the token is the last value of the record.
 */
private void addRecordValue(final boolean lastRecord) {
    final String value = format.trim(reusableToken.content.toString());
    final boolean emptyTrailingValue = lastRecord && value.isEmpty() && format.getTrailingDelimiter();
    if (!emptyTrailingValue) {
        recordList.add(handleNull(value));
    }
}
/**
 * Closes this parser by closing its lexer.
 *
 * @throws IOException
 *             If an I/O error occurs
 */
@Override
public void close() throws IOException {
    this.lexer.close();
}
/**
 * Creates an empty map suitable for header lookups.
 *
 * @return a {@link TreeMap} ordered by {@link String#CASE_INSENSITIVE_ORDER} when the format ignores header case,
 *         otherwise a {@link LinkedHashMap}.
 */
private Map<String, Integer> createEmptyHeaderMap() {
    if (format.getIgnoreHeaderCase()) {
        return new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    }
    return new LinkedHashMap<>();
}
/**
 * Creates the name to index mapping if the format defines a header.
 * <p>
 * When the format's header array is empty, the names are taken from the first record of the input. When the format
 * supplies names and asks to skip the header record, the first record is consumed only for its comment.
 * </p>
 *
 * @return a {@code Headers} instance, never null. Its map is null when the format has no header, and its name list
 *         is empty when no header name was collected.
 * @throws IllegalArgumentException if a header name is missing or duplicated and the format does not allow it.
 * @throws IOException if there is a problem reading the header or skipping the first record
 * @throws CSVException Thrown on invalid input.
 */
private Headers createHeaders() throws IOException {
    Map<String, Integer> hdrMap = null;
    List<String> headerNames = null;
    final String[] formatHeader = format.getHeader();
    if (formatHeader != null) {
        hdrMap = createEmptyHeaderMap();
        String[] headerRecord = null;
        if (formatHeader.length == 0) {
            // read the header from the first line of the file
            final CSVRecord nextRecord = nextRecord();
            if (nextRecord != null) {
                headerRecord = nextRecord.values();
                headerComment = nextRecord.getComment();
            }
        } else {
            if (format.getSkipHeaderRecord()) {
                // The names come from the format; the first input record only contributes its comment.
                final CSVRecord nextRecord = nextRecord();
                if (nextRecord != null) {
                    headerComment = nextRecord.getComment();
                }
            }
            headerRecord = formatHeader;
        }
        // build the name to index mappings
        if (headerRecord != null) {
            // These format settings are the same for every column, so they are read once up front.
            final boolean missingNamesAllowed = format.getAllowMissingColumnNames();
            final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
            final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
            final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
            // Track an occurrence of a null, empty or blank header.
            boolean observedMissing = false;
            for (int i = 0; i < headerRecord.length; i++) {
                final String header = headerRecord[i];
                final boolean blankHeader = CSVFormat.isBlank(header);
                if (blankHeader && !missingNamesAllowed) {
                    throw new IllegalArgumentException(
                            "A header name is missing in " + Arrays.toString(headerRecord));
                }
                // Blank names are compared against each other; other names are looked up in the map.
                final boolean containsHeader = blankHeader ? observedMissing : hdrMap.containsKey(header);
                if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
                    throw new IllegalArgumentException(
                            String.format(
                                    "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
                                    header, Arrays.toString(headerRecord)));
                }
                observedMissing |= blankHeader;
                if (header != null) {
                    hdrMap.put(header, Integer.valueOf(i)); // N.B. Explicit (un)boxing is intentional
                    if (headerNames == null) {
                        headerNames = new ArrayList<>(headerRecord.length);
                    }
                    headerNames.add(header);
                }
            }
        }
    }
    // Make header names Collection immutable
    return new Headers(hdrMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
}
/**
 * Gets the line number the lexer currently reports for the input.
 *
 * <p>
 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
 * the record number.
 * </p>
 *
 * @return current line number
 */
public long getCurrentLineNumber() {
    final long lineNumber = lexer.getCurrentLineNumber();
    return lineNumber;
}
/**
 * Gets the first end-of-line string encountered, as reported by the lexer.
 *
 * @return the first end-of-line string
 * @since 1.5
 */
public String getFirstEndOfLine() {
    final String firstEol = lexer.getFirstEol();
    return firstEol;
}
/**
 * Gets the header comment, if any.
 * The header comment appears before the header record.
 *
 * @return the header comment for this stream, or null if no comment is available.
 * @since 1.10.0
 */
public String getHeaderComment() {
    return this.headerComment;
}
/**
 * Gets a copy of the header map as defined in the CSVFormat's header.
 * <p>
 * The map keys are column names. The map values are 0-based indices.
 * </p>
 * <p>
 * Note: The map can only provide a one-to-one mapping when the format did not
 * contain null or duplicate column names.
 * </p>
 *
 * @return a new map holding the same entries as the parser's header map, or null if the parser has no header map.
 */
public Map<String, Integer> getHeaderMap() {
    final Map<String, Integer> source = headers.headerMap;
    if (source == null) {
        return null;
    }
    // Use the same kind of map as the parser itself so that key lookup behaves identically.
    final Map<String, Integer> copy = createEmptyHeaderMap();
    copy.putAll(source);
    return copy;
}
/**
 * Gets the header map held by this parser itself, without copying it.
 *
 * @return the underlying header map; may be null.
 */
Map<String, Integer> getHeaderMapRaw() {
    final Map<String, Integer> raw = headers.headerMap;
    return raw;
}
/**
 * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
 * <p>
 * Note: The list provides strings that can be used as keys in the header map.
 * Null column names present in the input format are not part of the list.
 * </p>
 *
 * @return read-only list of header names that iterates in column order.
 * @see #getHeaderMap()
 * @since 1.7
 */
public List<String> getHeaderNames() {
    final List<String> names = headers.headerNames;
    return Collections.unmodifiableList(names);
}
/**
 * Gets the current record number in the input stream, that is the number given to the most recently parsed record.
 *
 * <p>
 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
 * the line number.
 * </p>
 *
 * @return current record number
 */
public long getRecordNumber() {
    return this.recordNumber;
}
/**
 * Parses the remaining CSV input according to the given format and collects it into a list of
 * {@link CSVRecord CSVRecords}.
 *
 * <p>
 * The returned content starts at the current parse-position in the stream.
 * </p>
 *
 * @return list of {@link CSVRecord CSVRecords}, may be empty
 * @throws UncheckedIOException
 *             on parse error or input read-failure
 */
public List<CSVRecord> getRecords() {
    final Stream<CSVRecord> remaining = stream();
    return remaining.collect(Collectors.toList());
}
/**
 * Gets the trailer comment, if any.
 * Trailer comments are located between the last record and EOF.
 *
 * @return the trailer comment for this stream, or null if no comment is available.
 * @since 1.10.0
 */
public String getTrailerComment() {
    return this.trailerComment;
}
/**
 * Decides whether a cell value is reported as null.
 * <p>
 * "Strict" below means the quote mode is ALL_NON_NULL or NON_NUMERIC, where quoting distinguishes a real value
 * from a null or absent one.
 * </p>
 *
 * @param input
 *            the cell data to be processed further
 * @return null if input is parsed as null, or input itself if the input isn't parsed as null
 */
private String handleNull(final String input) {
    final boolean quoted = reusableToken.isQuoted;
    final String nullMarker = format.getNullString();
    final boolean strict = isStrictQuoteMode();
    if (input.equals(nullMarker)) {
        // The value matches the null string: only a quoted value in strict mode stays a real value.
        if (strict && quoted) {
            return input;
        }
        return null;
    }
    // No null string configured: in strict mode an unquoted empty value (,,) is absent, while "" is kept.
    final boolean absentValue = strict && nullMarker == null && input.isEmpty() && !quoted;
    if (absentValue) {
        return null;
    }
    return input;
}
/**
 * Checks whether there is a header comment.
 * The header comment appears before the header record.
 * Note that if the parser's format has been given an explicit header
 * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
 * and the header record is not being skipped
 * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
 * will be associated with the first record, not the header.
 *
 * @return true if this parser has seen a header comment, false otherwise
 * @since 1.10.0
 */
public boolean hasHeaderComment() {
    final boolean absent = headerComment == null;
    return !absent;
}
/**
 * Checks whether there is a trailer comment.
 * Trailer comments are located between the last record and EOF.
 * The trailer comments will only be available after the parser has
 * finished processing this stream.
 *
 * @return true if this parser has seen a trailer comment, false otherwise
 * @since 1.10.0
 */
public boolean hasTrailerComment() {
    final boolean absent = trailerComment == null;
    return !absent;
}
/**
 * Tests whether this parser is closed, as reported by its lexer.
 *
 * @return whether this parser is closed.
 */
public boolean isClosed() {
    final boolean closed = lexer.isClosed();
    return closed;
}
/**
 * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
 *
 * @return true if the format's {@link QuoteMode} is one of those two modes.
 */
private boolean isStrictQuoteMode() {
    final QuoteMode mode = format.getQuoteMode();
    return mode == QuoteMode.ALL_NON_NULL || mode == QuoteMode.NON_NUMERIC;
}
/**
 * Returns the record iterator.
 *
 * <p>
 * Every call returns the same iterator instance, so all iterators obtained from one parser share a single
 * position in the input.
 * </p>
 * <p>
 * An {@link IOException} caught during the iteration is re-thrown as an
 * {@link UncheckedIOException}.
 * </p>
 * <p>
 * If the parser is closed, the iterator will not yield any more records.
 * A call to {@link Iterator#hasNext()} will return {@code false} and
 * a call to {@link Iterator#next()} will throw a
 * {@link NoSuchElementException}.
 * </p>
 * <p>
 * If it is necessary to construct an iterator which is usable after the
 * parser is closed, one option is to extract all records as a list with
 * {@link #getRecords()}, and return an iterator to that list.
 * </p>
 *
 * @return the record iterator of this parser.
 */
@Override
public Iterator<CSVRecord> iterator() {
    return csvRecordIterator;
}
/**
* Parses the next record from the current point in the stream.
*
* <p>
* Tokens are read from the lexer until one ends the record or the input. Comment tokens met on the way are joined
* with line feeds and attached to the record; if the input ends without a record following them, they become the
* trailer comment instead.
* </p>
*
* @return the next record, or {@code null} if no values were read before the end of the stream
* @throws IOException on parse error or input read-failure
* @throws CSVException Thrown on invalid input.
*/
CSVRecord nextRecord() throws IOException {
CSVRecord result = null;
// The value buffer is shared between calls, so start from an empty one.
recordList.clear();
// Collects comment text; stays null while no comment token has been seen.
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
switch (reusableToken.type) {
case TOKEN:
// A value inside the record; the loop continues because the type is TOKEN.
addRecordValue(false);
break;
case EORECORD:
// Last value of the record; the loop ends.
addRecordValue(true);
break;
case EOF:
if (reusableToken.isReady) {
// The input ended with a pending value: treat it as the last value of the record.
addRecordValue(true);
} else if (sb != null) {
// Only comments were read before EOF: keep them as the trailer comment.
trailerComment = sb.toString();
}
break;
case INVALID:
throw new IOException("(line " + getCurrentLineNumber() + ") invalid parse sequence");
case COMMENT: // Collected so it can be attached to the record or become the trailer comment
if (sb == null) { // first comment for this record
sb = new StringBuilder();
} else {
sb.append(Constants.LF);
}
sb.append(reusableToken.content);
// Overwrite the type so that the loop condition below holds and another token is read.
reusableToken.type = TOKEN; // Read another token
break;
default:
throw new IllegalStateException("Unexpected Token type: " + reusableToken.type);
}
} while (reusableToken.type == TOKEN);
if (!recordList.isEmpty()) {
// Count the record first; the incremented value is the number given to the new record.
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
}
return result;
}
/**
 * Returns a sequential, ordered {@code Stream} over the records produced by {@link #iterator()}.
 * <p>
 * If the parser is closed, the stream will not produce any more values.
 * See the comments in {@link #iterator()}.
 * </p>
 * @return a sequential {@code Stream} with this parser's records as its source.
 * @since 1.9.0
 */
public Stream<CSVRecord> stream() {
    final Spliterator<CSVRecord> records = Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED);
    return StreamSupport.stream(records, false);
}
}