Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019
020package org.apache.commons.csv;
021
022import static org.apache.commons.csv.Token.Type.TOKEN;
023
024import java.io.Closeable;
025import java.io.File;
026import java.io.IOException;
027import java.io.InputStream;
028import java.io.InputStreamReader;
029import java.io.Reader;
030import java.io.StringReader;
031import java.io.UncheckedIOException;
032import java.net.URL;
033import java.nio.charset.Charset;
034import java.nio.file.Files;
035import java.nio.file.Path;
036import java.util.ArrayList;
037import java.util.Arrays;
038import java.util.Collections;
039import java.util.Iterator;
040import java.util.LinkedHashMap;
041import java.util.List;
042import java.util.Map;
043import java.util.NoSuchElementException;
044import java.util.Objects;
045import java.util.Spliterator;
046import java.util.Spliterators;
047import java.util.TreeMap;
048import java.util.stream.Collectors;
049import java.util.stream.Stream;
050import java.util.stream.StreamSupport;
051
052import org.apache.commons.io.build.AbstractStreamBuilder;
053import org.apache.commons.io.function.Uncheck;
054
055/**
056 * Parses CSV files according to the specified format.
057 *
058 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
059 * specification of a {@link CSVFormat}.
060 *
061 * The parser works record-wise. It is not possible to go back, once a record has been parsed from the input stream.
062 *
063 * <h2>Creating instances</h2>
064 * <p>
065 * There are several static factory methods that can be used to create instances for various types of resources:
066 * </p>
067 * <ul>
068 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
069 *     <li>{@link #parse(String, CSVFormat)}</li>
070 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
071 * </ul>
072 * <p>
073 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor.
074 *
075 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
076 * </p>
077 * <pre>
078 * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
079 *     ...
080 * }
081 * </pre>
082 *
083 * <h2>Parsing record wise</h2>
084 * <p>
085 * To parse a CSV input from a file, you write:
086 * </p>
087 *
088 * <pre>{@code
089 * File csvData = new File("/path/to/csv");
090 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
091 * for (CSVRecord csvRecord : parser) {
092 *     ...
093 * }}
094 * </pre>
095 *
096 * <p>
097 * This will read and parse the contents of the file using the
098 * <a href="https://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
099 * </p>
100 *
101 * <p>
102 * To parse CSV input in a format like Excel, you write:
103 * </p>
104 *
105 * <pre>
106 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
107 * for (CSVRecord csvRecord : parser) {
108 *     ...
109 * }
110 * </pre>
111 *
112 * <p>
113 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
114 * customizing CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
115 * </p>
116 *
117 * <h2>Parsing into memory</h2>
118 * <p>
119 * If parsing record-wise is not desired, the contents of the input can be read completely into memory.
120 * </p>
121 *
122 * <pre>{@code
123 * Reader in = new StringReader("a;b\nc;d");
124 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
125 * List<CSVRecord> list = parser.getRecords();
126 * }</pre>
127 *
128 * <p>
129 * There are two constraints that have to be kept in mind:
130 * </p>
131 *
132 * <ol>
133 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
134 *     the input, those records will not end up in the in-memory representation of your CSV data.</li>
135 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
136 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
137 * </ol>
138 *
139 * <h2>Notes</h2>
140 * <p>
141 * The internal parser state is completely covered by the format and the reader state.
142 * </p>
143 *
144 * @see <a href="package-summary.html">package documentation for more details</a>
145 */
146public final class CSVParser implements Iterable<CSVRecord>, Closeable {
147
148    /**
149     * Builds a new {@link CSVParser}.
150     *
151     * @since 1.13.0
152     */
153    public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
154
155        private CSVFormat format;
156        private long characterOffset;
157        private long recordNumber = 1;
158        private boolean trackBytes;
159
160        /**
161         * Constructs a new instance.
162         */
163        protected Builder() {
164            // empty
165        }
166
167        @SuppressWarnings("resource")
168        @Override
169        public CSVParser get() throws IOException {
170            return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), trackBytes);
171        }
172
173        /**
174         * Sets the lexer offset when the parser does not start parsing at the beginning of the source.
175         *
176         * @param characterOffset the lexer offset.
177         * @return this instance.
178         */
179        public Builder setCharacterOffset(final long characterOffset) {
180            this.characterOffset = characterOffset;
181            return asThis();
182        }
183
184        /**
185         * Sets the CSV format. A copy of the given format is kept.
186         *
187         * @param format the CSV format, null is equivalent to {@link CSVFormat#DEFAULT}.
188         * @return this instance.
189         */
190        public Builder setFormat(final CSVFormat format) {
191            this.format = CSVFormat.copy(format);
192            return asThis();
193        }
194
195        /**
196         * Sets the next record number to assign, defaults to {@code 1}.
197         *
198         * @param recordNumber the next record number to assign.
199         * @return this instance.
200         */
201        public Builder setRecordNumber(final long recordNumber) {
202            this.recordNumber = recordNumber;
203            return asThis();
204        }
205
206        /**
207         * Sets whether to enable byte tracking for the parser.
208         *
209         * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
210         * @return this instance.
211         * @since 1.13.0
212         */
213        public Builder setTrackBytes(final boolean trackBytes) {
214            this.trackBytes = trackBytes;
215            return asThis();
216        }
217
218    }
219
220    final class CSVRecordIterator implements Iterator<CSVRecord> {
221        private CSVRecord current;
222
223        /**
224         * Gets the next record.
225         *
226         * @throws IOException  on parse error or input read-failure
227         * @throws CSVException on invalid input.
228         * @return the next record.
229         */
230        private CSVRecord getNextRecord() {
231            return Uncheck.get(CSVParser.this::nextRecord);
232        }
233
234        @Override
235        public boolean hasNext() {
236            if (isClosed()) {
237                return false;
238            }
239            if (current == null) {
240                current = getNextRecord();
241            }
242
243            return current != null;
244        }
245
246        @Override
247        public CSVRecord next() {
248            if (isClosed()) {
249                throw new NoSuchElementException("CSVParser has been closed");
250            }
251            CSVRecord next = current;
252            current = null;
253
254            if (next == null) {
255                // hasNext() wasn't called before
256                next = getNextRecord();
257                if (next == null) {
258                    throw new NoSuchElementException("No more CSV records available");
259                }
260            }
261
262            return next;
263        }
264
265        @Override
266        public void remove() {
267            throw new UnsupportedOperationException();
268        }
269    }
270    /**
271     * Header information based on name and position.
272     */
273    private static final class Headers {
274
275        /**
276         * Header column positions (0-based)
277         */
278        final Map<String, Integer> headerMap;
279
280        /**
281         * Header names in column order
282         */
283        final List<String> headerNames;
284
285        Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
286            this.headerMap = headerMap;
287            this.headerNames = headerNames;
288        }
289    }
290
291    /**
292     * Creates a new builder.
293     *
294     * @return a new builder.
295     * @since 1.13.0
296     */
297    public static Builder builder() {
298        return new Builder();
299    }
300
301    /**
302     * Creates a parser for the given {@link File}.
303     *
304     * @param file
305     *            a CSV file. Must not be null.
306     * @param charset
307     *            The Charset to decode the given file.
308     * @param format
309     *            the CSVFormat used for CSV parsing. Must not be null.
310     * @return a new parser
311     * @throws IllegalArgumentException
312     *             If the parameters of the format are inconsistent or if either file or format are null.
313     * @throws IOException
314     *             If an I/O error occurs
315     * @throws CSVException Thrown on invalid input.
316     */
317    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
318        Objects.requireNonNull(file, "file");
319        return parse(file.toPath(), charset, format);
320    }
321
322    /**
323     * Creates a CSV parser using the given {@link CSVFormat}.
324     *
325     * <p>
326     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
327     * unless you close the {@code reader}.
328     * </p>
329     *
330     * @param inputStream
331     *            an InputStream containing CSV-formatted input. Must not be null.
332     * @param charset
333     *            The Charset to decode the given file.
334     * @param format
335     *            the CSVFormat used for CSV parsing. Must not be null.
336     * @return a new CSVParser configured with the given reader and format.
337     * @throws IllegalArgumentException
338     *             If the parameters of the format are inconsistent or if either reader or format are null.
339     * @throws IOException
340     *             If there is a problem reading the header or skipping the first record
341     * @throws CSVException Thrown on invalid input.
342     * @since 1.5
343     */
344    @SuppressWarnings("resource")
345    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
346            throws IOException {
347        Objects.requireNonNull(inputStream, "inputStream");
348        Objects.requireNonNull(format, "format");
349        return parse(new InputStreamReader(inputStream, charset), format);
350    }
351
352    /**
353     * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
354     *
355     * @param path
356     *            a CSV file. Must not be null.
357     * @param charset
358     *            The Charset to decode the given file.
359     * @param format
360     *            the CSVFormat used for CSV parsing. Must not be null.
361     * @return a new parser
362     * @throws IllegalArgumentException
363     *             If the parameters of the format are inconsistent or if either file or format are null.
364     * @throws IOException
365     *             If an I/O error occurs
366     * @throws CSVException Thrown on invalid input.
367     * @since 1.5
368     */
369    @SuppressWarnings("resource")
370    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
371        Objects.requireNonNull(path, "path");
372        Objects.requireNonNull(format, "format");
373        return parse(Files.newInputStream(path), charset, format);
374    }
375
376    /**
377     * Creates a CSV parser using the given {@link CSVFormat}
378     *
379     * <p>
380     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
381     * unless you close the {@code reader}.
382     * </p>
383     *
384     * @param reader
385     *            a Reader containing CSV-formatted input. Must not be null.
386     * @param format
387     *            the CSVFormat used for CSV parsing. Must not be null.
388     * @return a new CSVParser configured with the given reader and format.
389     * @throws IllegalArgumentException
390     *             If the parameters of the format are inconsistent or if either reader or format are null.
391     * @throws IOException
392     *             If there is a problem reading the header or skipping the first record
393     * @throws CSVException Thrown on invalid input.
394     * @since 1.5
395     */
396    public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
397        return builder().setReader(reader).setFormat(format).get();
398    }
399
400    /**
401     * Creates a parser for the given {@link String}.
402     *
403     * @param string
404     *            a CSV string. Must not be null.
405     * @param format
406     *            the CSVFormat used for CSV parsing. Must not be null.
407     * @return a new parser
408     * @throws IllegalArgumentException
409     *             If the parameters of the format are inconsistent or if either string or format are null.
410     * @throws IOException
411     *             If an I/O error occurs
412     * @throws CSVException Thrown on invalid input.
413     */
414    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
415        Objects.requireNonNull(string, "string");
416        Objects.requireNonNull(format, "format");
417        return parse(new StringReader(string), format);
418    }
419
420    /**
421     * Creates and returns a parser for the given URL, which the caller MUST close.
422     *
423     * <p>
424     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
425     * you close the {@code url}.
426     * </p>
427     *
428     * @param url
429     *            a URL. Must not be null.
430     * @param charset
431     *            the charset for the resource. Must not be null.
432     * @param format
433     *            the CSVFormat used for CSV parsing. Must not be null.
434     * @return a new parser
435     * @throws IllegalArgumentException
436     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
437     * @throws IOException
438     *             If an I/O error occurs
439     * @throws CSVException Thrown on invalid input.
440     */
441    @SuppressWarnings("resource")
442    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
443        Objects.requireNonNull(url, "url");
444        return parse(url.openStream(), charset, format);
445    }
446
    /** Comment of the header record, captured while the headers are created; {@code null} when none was seen. */
    private String headerComment;

    /** Trailer comment returned by {@link #getTrailerComment()}; {@code null} until one is recorded. */
    private String trailerComment;

    /** Format this parser works with; a copy of the format handed to the constructor. */
    private final CSVFormat format;

    /** Header map and header names, computed once by {@link #createHeaders()} during construction. */
    private final Headers headers;

    /** Lexer wrapping the input; also supplies the current line number, the first end-of-line and the closed state. */
    private final Lexer lexer;

    /** Record iterator instance created once in the constructor. */
    private final CSVRecordIterator csvRecordIterator;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> recordList = new ArrayList<>();

    /**
     * The next record number to assign.
     */
    private long recordNumber;

    /**
     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
     * with {@link #recordNumber}.
     */
    private final long characterOffset;

    /** Token instance whose content and quoted flag are read when a value is added to the record buffer. */
    private final Token reusableToken = new Token();
474
475    /**
476     * Constructs a new instance using the given {@link CSVFormat}
477     *
478     * <p>
479     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
480     * unless you close the {@code reader}.
481     * </p>
482     *
483     * @param reader
484     *            a Reader containing CSV-formatted input. Must not be null.
485     * @param format
486     *            the CSVFormat used for CSV parsing. Must not be null.
487     * @throws IllegalArgumentException
488     *             If the parameters of the format are inconsistent or if either reader or format are null.
489     * @throws IOException
490     *             If there is a problem reading the header or skipping the first record
491     * @throws CSVException Thrown on invalid input.
492     * @deprecated Will be removed in the next major version, use {@link Builder#get()}.
493     */
494    @Deprecated
495    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
496        this(reader, format, 0, 1);
497    }
498
499    /**
500     * Constructs a new instance using the given {@link CSVFormat}
501     *
502     * <p>
503     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
504     * unless you close the {@code reader}.
505     * </p>
506     *
507     * @param reader
508     *            a Reader containing CSV-formatted input. Must not be null.
509     * @param format
510     *            the CSVFormat used for CSV parsing. Must not be null.
511     * @param characterOffset
512     *            Lexer offset when the parser does not start parsing at the beginning of the source.
513     * @param recordNumber
514     *            The next record number to assign.
515     * @throws IllegalArgumentException
516     *             If the parameters of the format are inconsistent or if either the reader or format is null.
517     * @throws IOException
518     *             if there is a problem reading the header or skipping the first record
519     * @throws CSVException on invalid input.
520     * @since 1.1
521     * @deprecated Will be private in the next major version, use {@link Builder#get()}.
522     */
523    @Deprecated
524    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
525        throws IOException {
526            this(reader, format, characterOffset, recordNumber, null, false);
527        }
528
    /**
     * Constructs a new instance using the given {@link CSVFormat}. All other constructors and the {@link Builder}
     * delegate to this one.
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @param characterOffset
     *            Lexer offset when the parser does not start parsing at the beginning of the source.
     * @param recordNumber
     *            The next record number to assign.
     * @param charset
     *            The character encoding to be used for the reader when {@code trackBytes} is true.
     * @param trackBytes
     *           {@code true} to enable byte tracking for the parser; {@code false} to disable it.
     * @throws NullPointerException
     *             If either the reader or format is null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record.
     * @throws CSVException Thrown on invalid input.
     */
    private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
        final Charset charset, final boolean trackBytes)
        throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(format, "format");
        // The parser keeps its own copy of the format; the lexer is given the caller's instance.
        this.format = format.copy();
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, trackBytes));
        this.csvRecordIterator = new CSVRecordIterator();
        // NOTE(review): createHeaders() may consume a record through nextRecord() while characterOffset and
        // recordNumber still hold their default values; the two assignments below deliberately come afterwards.
        // Do not reorder without checking nextRecord().
        this.headers = createHeaders();
        this.characterOffset = characterOffset;
        // Stored one below the requested number; presumably nextRecord() increments before assigning — confirm.
        this.recordNumber = recordNumber - 1;
    }
567
568    private void addRecordValue(final boolean lastRecord) {
569        final String input = format.trim(reusableToken.content.toString());
570        if (lastRecord && input.isEmpty() && format.getTrailingDelimiter()) {
571            return;
572        }
573        recordList.add(handleNull(input));
574    }
575
576    /**
577     * Closes resources.
578     *
579     * @throws IOException
580     *             If an I/O error occurs
581     */
582    @Override
583    public void close() throws IOException {
584        lexer.close();
585    }
586
587    private Map<String, Integer> createEmptyHeaderMap() {
588        return format.getIgnoreHeaderCase() ?
589                new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
590                new LinkedHashMap<>();
591    }
592
593    /**
594     * Creates the name to index mapping if the format defines a header.
595     *
596     * @return null if the format has no header.
597     * @throws IOException if there is a problem reading the header or skipping the first record
598     * @throws CSVException on invalid input.
599     */
600    private Headers createHeaders() throws IOException {
601        Map<String, Integer> hdrMap = null;
602        List<String> headerNames = null;
603        final String[] formatHeader = format.getHeader();
604        if (formatHeader != null) {
605            hdrMap = createEmptyHeaderMap();
606            String[] headerRecord = null;
607            if (formatHeader.length == 0) {
608                // read the header from the first line of the file
609                final CSVRecord nextRecord = nextRecord();
610                if (nextRecord != null) {
611                    headerRecord = nextRecord.values();
612                    headerComment = nextRecord.getComment();
613                }
614            } else {
615                if (format.getSkipHeaderRecord()) {
616                    final CSVRecord nextRecord = nextRecord();
617                    if (nextRecord != null) {
618                        headerComment = nextRecord.getComment();
619                    }
620                }
621                headerRecord = formatHeader;
622            }
623
624            // build the name to index mappings
625            if (headerRecord != null) {
626                // Track an occurrence of a null, empty or blank header.
627                boolean observedMissing = false;
628                for (int i = 0; i < headerRecord.length; i++) {
629                    final String header = headerRecord[i];
630                    final boolean blankHeader = CSVFormat.isBlank(header);
631                    if (blankHeader && !format.getAllowMissingColumnNames()) {
632                        throw new IllegalArgumentException(
633                            "A header name is missing in " + Arrays.toString(headerRecord));
634                    }
635
636                    final boolean containsHeader = blankHeader ? observedMissing : hdrMap.containsKey(header);
637                    final DuplicateHeaderMode headerMode = format.getDuplicateHeaderMode();
638                    final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
639                    final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
640
641                    if (containsHeader && !duplicatesAllowed && !(blankHeader && emptyDuplicatesAllowed)) {
642                        throw new IllegalArgumentException(
643                            String.format(
644                                "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
645                                header, Arrays.toString(headerRecord)));
646                    }
647                    observedMissing |= blankHeader;
648                    if (header != null) {
649                        hdrMap.put(header, Integer.valueOf(i)); // N.B. Explicit (un)boxing is intentional
650                        if (headerNames == null) {
651                            headerNames = new ArrayList<>(headerRecord.length);
652                        }
653                        headerNames.add(header);
654                    }
655                }
656            }
657        }
658        // Make header names Collection immutable
659        return new Headers(hdrMap, headerNames == null ? Collections.emptyList() : Collections.unmodifiableList(headerNames));
660    }
661
662    /**
663     * Gets the current line number in the input stream.
664     *
665     * <p>
666     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
667     * the record number.
668     * </p>
669     *
670     * @return current line number
671     */
672    public long getCurrentLineNumber() {
673        return lexer.getCurrentLineNumber();
674    }
675
676    /**
677     * Gets the first end-of-line string encountered.
678     *
679     * @return the first end-of-line string
680     * @since 1.5
681     */
682    public String getFirstEndOfLine() {
683        return lexer.getFirstEol();
684    }
685
686    /**
687     * Gets the header comment, if any.
688     * The header comment appears before the header record.
689     *
690     * @return the header comment for this stream, or null if no comment is available.
691     * @since 1.10.0
692     */
693    public String getHeaderComment() {
694        return headerComment;
695    }
696
697    /**
698     * Gets a copy of the header map as defined in the CSVFormat's header.
699     * <p>
700     * The map keys are column names. The map values are 0-based indices.
701     * </p>
702     * <p>
703     * Note: The map can only provide a one-to-one mapping when the format did not
704     * contain null or duplicate column names.
705     * </p>
706     *
707     * @return a copy of the header map.
708     */
709    public Map<String, Integer> getHeaderMap() {
710        if (headers.headerMap == null) {
711            return null;
712        }
713        final Map<String, Integer> map = createEmptyHeaderMap();
714        map.putAll(headers.headerMap);
715        return map;
716    }
717
718    /**
719     * Gets the underlying header map.
720     *
721     * @return the underlying header map.
722     */
723    Map<String, Integer> getHeaderMapRaw() {
724        return headers.headerMap;
725    }
726
727    /**
728     * Gets a read-only list of header names that iterates in column order as defined in the CSVFormat's header.
729     * <p>
730     * Note: The list provides strings that can be used as keys in the header map.
731     * The list will not contain null column names if they were present in the input
732     * format.
733     * </p>
734     *
735     * @return read-only list of header names that iterates in column order.
736     * @see #getHeaderMap()
737     * @since 1.7
738     */
739    public List<String> getHeaderNames() {
740        return Collections.unmodifiableList(headers.headerNames);
741    }
742
743    /**
744     * Gets the current record number in the input stream.
745     *
746     * <p>
747     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
748     * the line number.
749     * </p>
750     *
751     * @return current record number
752     */
753    public long getRecordNumber() {
754        return recordNumber;
755    }
756
757    /**
758     * Parses the CSV input according to the given format and returns the content as a list of
759     * {@link CSVRecord CSVRecords}.
760     *
761     * <p>
762     * The returned content starts at the current parse-position in the stream.
763     * </p>
764     *
765     * @return list of {@link CSVRecord CSVRecords}, may be empty
766     * @throws UncheckedIOException
767     *             on parse error or input read-failure
768     */
769    public List<CSVRecord> getRecords() {
770        return stream().collect(Collectors.toList());
771    }
772
773    /**
774     * Gets the trailer comment, if any.
775     * Trailer comments are located between the last record and EOF
776     *
777     * @return the trailer comment for this stream, or null if no comment is available.
778     * @since 1.10.0
779     */
780    public String getTrailerComment() {
781        return trailerComment;
782    }
783
784    /**
785     * Handles whether the input is parsed as null
786     *
787     * @param input
788     *           the cell data to further processed
789     * @return null if input is parsed as null, or input itself if the input isn't parsed as null
790     */
791    private String handleNull(final String input) {
792        final boolean isQuoted = reusableToken.isQuoted;
793        final String nullString = format.getNullString();
794        final boolean strictQuoteMode = isStrictQuoteMode();
795        if (input.equals(nullString)) {
796            // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
797            return strictQuoteMode && isQuoted ? input : null;
798        }
799        // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
800        return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
801    }
802
803    /**
804     * Checks whether there is a header comment.
805     * The header comment appears before the header record.
806     * Note that if the parser's format has been given an explicit header
807     * (with {@link CSVFormat.Builder#setHeader(String... )} or another overload)
808     * and the header record is not being skipped
809     * ({@link CSVFormat.Builder#setSkipHeaderRecord} is false) then any initial comments
810     * will be associated with the first record, not the header.
811     *
812     * @return true if this parser has seen a header comment, false otherwise
813     * @since 1.10.0
814     */
815    public boolean hasHeaderComment() {
816        return headerComment != null;
817    }
818
819    /**
820     * Checks whether there is a trailer comment.
821     * Trailer comments are located between the last record and EOF.
822     * The trailer comments will only be available after the parser has
823     * finished processing this stream.
824     *
825     * @return true if this parser has seen a trailer comment, false otherwise
826     * @since 1.10.0
827     */
828    public boolean hasTrailerComment() {
829        return trailerComment != null;
830    }
831
832    /**
833     * Tests whether this parser is closed.
834     *
835     * @return whether this parser is closed.
836     */
837    public boolean isClosed() {
838        return lexer.isClosed();
839    }
840
841    /**
842     * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
843     *
844     * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
845     *         {@link QuoteMode#NON_NUMERIC}.
846     */
847    private boolean isStrictQuoteMode() {
848        return format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
849               format.getQuoteMode() == QuoteMode.NON_NUMERIC;
850    }
851
852    /**
853     * Returns the record iterator.
854     *
855     * <p>
856     * An {@link IOException} caught during the iteration is re-thrown as an
857     * {@link IllegalStateException}.
858     * </p>
859     * <p>
860     * If the parser is closed, the iterator will not yield any more records.
861     * A call to {@link Iterator#hasNext()} will return {@code false} and
862     * a call to {@link Iterator#next()} will throw a
863     * {@link NoSuchElementException}.
864     * </p>
865     * <p>
866     * If it is necessary to construct an iterator which is usable after the
867     * parser is closed, one option is to extract all records as a list with
868     * {@link #getRecords()}, and return an iterator to that list.
869     * </p>
870     */
871    @Override
872    public Iterator<CSVRecord> iterator() {
873        return csvRecordIterator;
874    }
875
876    /**
877     * Parses the next record from the current point in the stream.
878     *
879     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
880     * @throws IOException  on parse error or input read-failure
881     * @throws CSVException on invalid input.
882     */
883    CSVRecord nextRecord() throws IOException {
884        CSVRecord result = null;
885        recordList.clear();
886        StringBuilder sb = null;
887        final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
888        final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
889        do {
890            reusableToken.reset();
891            lexer.nextToken(reusableToken);
892            switch (reusableToken.type) {
893            case TOKEN:
894                addRecordValue(false);
895                break;
896            case EORECORD:
897                addRecordValue(true);
898                break;
899            case EOF:
900                if (reusableToken.isReady) {
901                    addRecordValue(true);
902                } else if (sb != null) {
903                    trailerComment = sb.toString();
904                }
905                break;
906            case INVALID:
907                throw new CSVException("(line %,d) invalid parse sequence", getCurrentLineNumber());
908            case COMMENT: // Ignored currently
909                if (sb == null) { // first comment for this record
910                    sb = new StringBuilder();
911                } else {
912                    sb.append(Constants.LF);
913                }
914                sb.append(reusableToken.content);
915                reusableToken.type = TOKEN; // Read another token
916                break;
917            default:
918                throw new CSVException("Unexpected Token type: %s", reusableToken.type);
919            }
920        } while (reusableToken.type == TOKEN);
921
922        if (!recordList.isEmpty()) {
923            recordNumber++;
924            final String comment = Objects.toString(sb, null);
925            result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
926                recordNumber, startCharPosition, startBytePosition);
927        }
928        return result;
929    }
930
931    /**
932     * Returns a sequential {@code Stream} with this collection as its source.
933     * <p>
934     * If the parser is closed, the stream will not produce any more values.
935     * See the comments in {@link #iterator()}.
936     * </p>
937     * @return a sequential {@code Stream} with this collection as its source.
938     * @since 1.9.0
939     */
940    public Stream<CSVRecord> stream() {
941        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
942    }
943
944}