View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.io.IOUtils.EOF;
21  
22  import java.io.Closeable;
23  import java.io.IOException;
24  
25  import org.apache.commons.io.IOUtils;
26  
27  /**
28   * Lexical analyzer.
29   */
30  final class Lexer implements Closeable {
31  
32      private static final String CR_STRING = Character.toString(Constants.CR); // recorded as firstEol for a lone CR
33      private static final String LF_STRING = Character.toString(Constants.LF); // recorded as firstEol for a lone LF
34  
        /** The delimiter characters taken from the format; {@code delimiter[0]} is what a delimiter match starts on. */
35      private final char[] delimiter;
        /** Scratch look-ahead buffer used by {@code isDelimiter}; sized for the delimiter characters after the first. */
36      private final char[] delimiterBuf;
        /** Scratch look-ahead buffer used by {@code isEscapeDelimiter}; sized {@code 2 * delimiter.length - 1}. */
37      private final char[] escapeDelimiterBuf;
        /** The escape character, or {@code Constants.UNDEFINED} when the format has none. */
38      private final int escape;
        /** The quote (encapsulator) character, or {@code Constants.UNDEFINED} when the format has none. */
39      private final int quoteChar;
        /** The comment marker, or {@code Constants.UNDEFINED} when the format has none. */
40      private final int commentStart;
        /** When set, whitespace before a token is skipped and trailing whitespace of a simple token is trimmed. */
41      private final boolean ignoreSurroundingSpaces;
        /** When set, {@code nextToken} skips empty lines. */
42      private final boolean ignoreEmptyLines;
        /** When set, EOF inside an encapsulated token yields an EOF token instead of a {@code CSVException}. */
43      private final boolean lenientEof;
        /** When set, characters between a closing quote and the next delimiter are appended to the token instead of rejected. */
44      private final boolean trailingData;
45  
46      /** The buffered reader. */
47      private final ExtendedBufferedReader reader;
        /** The first line terminator seen by {@code readEndOfLine}; {@code null} until one has been read. */
48      private String firstEol;
49  
        /** Result of the most recent {@code isDelimiter} call; lets {@code nextToken} emit a last token when a delimiter is directly followed by EOF. */
50      private boolean isLastTokenDelimiter;
51  
52      Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
53          this.reader = reader;
54          this.delimiter = format.getDelimiterCharArray();
55          this.escape = nullToDisabled(format.getEscapeCharacter());
56          this.quoteChar = nullToDisabled(format.getQuoteCharacter());
57          this.commentStart = nullToDisabled(format.getCommentMarker());
58          this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
59          this.ignoreEmptyLines = format.getIgnoreEmptyLines();
60          this.lenientEof = format.getLenientEof();
61          this.trailingData = format.getTrailingData();
62          this.delimiterBuf = new char[delimiter.length - 1];
63          this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
64      }
65  
66      /**
67       * Appends the character(s) denoted by the escape sequence at the current read position to the token's content.
68       *
69       * @param token the token whose content receives the unescaped text
70       * @throws IOException  on stream access error
71       * @throws CSVException Thrown on invalid input.
72       */
73      private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
74          if (isEscapeDelimiter()) {
                // an escaped delimiter is kept as literal content
75              token.content.append(delimiter);
76              return;
77          }
78          final int resolved = readEscape();
79          if (resolved != EOF) {
80              token.content.append((char) resolved);
81              return;
82          }
            // unexpected char after escape: keep the escape character and the character that followed it
83          token.content.append((char) escape).append((char) reader.getLastChar());
84      }
85  
86      /**
87       * Closes the underlying reader.
88       *
89       * @throws IOException
90       *             If an I/O error occurs
91       */
92      @Override
93      public void close() throws IOException {
94          this.reader.close();
95      }
96  
97      /**
98       * Gets the position reported by the underlying reader.
99       *
100      * @return the current character position
101      */
102     long getCharacterPosition() {
103         return this.reader.getPosition();
104     }
105 
106     /**
107      * Gets the line number reported by the underlying reader.
108      *
109      * @return the current line number
110      */
111     long getCurrentLineNumber() {
112         return this.reader.getLineNumber();
113     }
114 
115     String getFirstEol() {
            // stays null until readEndOfLine has seen a line terminator
116         return this.firstEol;
117     }
118 
119     boolean isClosed() {
            // the lexer holds no closed state of its own; ask the underlying reader
120         return this.reader.isClosed();
121     }
122 
123     boolean isCommentStart(final int ch) {
            // commentStart is Constants.UNDEFINED when the format has no comment marker
124         return commentStart == ch;
125     }
126 
127     /**
128      * Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#peek(char[])}.
         * <p>
         * A single-character delimiter is decided on {@code ch} alone. For a longer delimiter the characters after
         * {@code ch} are inspected by look-ahead and, on a match, consumed from the reader. The outcome is also recorded
         * in {@code isLastTokenDelimiter}.
         * </p>
129      *
130      * @param ch
131      *             the current character.
132      * @return true if the next characters constitute a delimiter.
133      * @throws IOException If an I/O error occurs.
134      */
135     boolean isDelimiter(final int ch) throws IOException {
136         isLastTokenDelimiter = false;
137         if (ch != delimiter[0]) {
138             return false;
139         }
140         if (delimiter.length == 1) {
141             isLastTokenDelimiter = true;
142             return true;
143         }
            // Clear the scratch buffer before looking ahead. The buffer is reused between calls and the result of peek
            // is not checked, so if fewer than delimiterBuf.length characters remain in the input (presumably peek then
            // fills only part of the buffer), characters left from an earlier call could complete a false match.
            java.util.Arrays.fill(delimiterBuf, '\0');
144         reader.peek(delimiterBuf);
145         for (int i = 0; i < delimiterBuf.length; i++) {
146             if (delimiterBuf[i] != delimiter[i + 1]) {
147                 return false;
148             }
149         }
            // Matched: consume the rest of the delimiter that was only peeked at so far.
150         final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
151         isLastTokenDelimiter = count != EOF;
152         return isLastTokenDelimiter;
153     }
154 
155     /**
156      * Tests whether the given character is the end-of-file marker.
157      *
         * @param ch the character to test
158      * @return true if the given character indicates the end of the file.
159      */
160     boolean isEndOfFile(final int ch) {
161         return EOF == ch;
162     }
163 
164     /**
165      * Tests whether the given character equals the configured escape character.
166      *
         * @param ch the character to test
167      * @return true if the given character is the escape character.
168      */
169     boolean isEscape(final int ch) {
170         return escape == ch;
171     }
172 
173     /**
174      * Tests if the next characters constitute an escape delimiter through {@link ExtendedBufferedReader#peek(char[])}.
175      * <p>
176      * For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "![!|!]".
         * The leading escape character is not inspected here: only the characters after it are peeked at, and they are
         * consumed from the reader when they match.
         * </p>
177      *
178      * @return true if the next characters constitute an escape delimiter.
179      * @throws IOException If an I/O error occurs.
180      */
181     boolean isEscapeDelimiter() throws IOException {
            // Clear the scratch buffer before looking ahead. The buffer is reused between calls and the result of peek
            // is not checked, so if fewer than escapeDelimiterBuf.length characters remain in the input (presumably peek
            // then fills only part of the buffer), characters left from an earlier call could complete a false match.
            java.util.Arrays.fill(escapeDelimiterBuf, '\0');
182         reader.peek(escapeDelimiterBuf);
183         if (escapeDelimiterBuf[0] != delimiter[0]) {
184             return false;
185         }
            // Every delimiter character after the first must be preceded by the escape character.
186         for (int i = 1; i < delimiter.length; i++) {
187             if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
188                 return false;
189             }
190         }
            // Matched: consume the characters that were only peeked at so far.
191         final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
192         return count != EOF;
193     }
194 
195     private boolean isMetaChar(final int ch) {
            // a meta character is the escape character, the quote character or the comment marker
196         return isEscape(ch) || isQuoteChar(ch) || isCommentStart(ch);
197     }
198 
199     boolean isQuoteChar(final int ch) {
            // quoteChar is Constants.UNDEFINED when the format has no quote character
200         return quoteChar == ch;
201     }
202 
203     /**
204      * Tests whether the given character marks the start of a line: it is a CR, an LF, or the undefined value.
205      *
206      * @param ch the character to check
207      * @return true if the character is at the start of a line.
208      */
209     boolean isStartOfLine(final int ch) {
210         return ch == Constants.UNDEFINED || ch == Constants.CR || ch == Constants.LF;
211     }
212 
213     /**
214      * Returns the next token.
215      * <p>
216      * A token corresponds to a term, a record change or an end-of-file indicator.
217      * </p>
         * <p>
         * Empty lines are skipped first when the format ignores them. A comment marker at the start of a line turns the
         * rest of that line into a COMMENT token. Otherwise the token is either filled in directly (empty TOKEN, EORECORD
         * or EOF) or handed to {@code parseEncapsulatedToken} / {@code parseSimpleToken}.
         * </p>
218      *
219      * @param token an existing Token object to reuse. The caller is responsible for initializing the Token; the main
         *              loop only runs while its type is {@code Token.Type.INVALID}.
220      * @return the next token found.
221      * @throws IOException  on stream access error.
222      * @throws CSVException Thrown on invalid input.
223      */
224     Token nextToken(final Token token) throws IOException {
225         // Get the last read char (required for empty line detection)
226         int lastChar = reader.getLastChar();
227         // read the next char and set eol
228         int c = reader.read();
229         // Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - they are equivalent here.
230         boolean eol = readEndOfLine(c);
231         // empty line detection: eol AND (last char was EOL or beginning)
232         if (ignoreEmptyLines) {
233             while (eol && isStartOfLine(lastChar)) {
234                 // Go one char ahead ...
235                 lastChar = c;
236                 c = reader.read();
237                 eol = readEndOfLine(c);
238                 // reached the end of the file without any content (empty line at the end)
239                 if (isEndOfFile(c)) {
240                     token.type = Token.Type.EOF;
241                     // don't set token.isReady here because no content
242                     return token;
243                 }
244             }
245         }
246         // EOF if the previous read already hit EOF, or if we are at EOF now and the last token did not end on a
            // delimiter. After a delimiter we fall through instead, so the loop below emits a final, ready EOF token.
247         if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
248             token.type = Token.Type.EOF;
249             // don't set token.isReady here because no content
250             return token;
251         }
            // A comment marker at the start of a line: the rest of the line, trimmed, becomes a COMMENT token.
252         if (isStartOfLine(lastChar) && isCommentStart(c)) {
253             final String line = reader.readLine();
254             if (line == null) {
255                 token.type = Token.Type.EOF;
256                 // don't set token.isReady here because no content
257                 return token;
258             }
259             final String comment = line.trim();
260             token.content.append(comment);
261             token.type = Token.Type.COMMENT;
262             return token;
263         }
264         // Important: make sure a new char gets consumed in each iteration
265         while (token.type == Token.Type.INVALID) {
266             // ignore whitespaces at beginning of a token
                // (stop at a delimiter or an end of line even if that character is itself whitespace)
267             if (ignoreSurroundingSpaces) {
268                 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
269                     c = reader.read();
270                     eol = readEndOfLine(c);
271                 }
272             }
273             // ok, start of token reached: encapsulated, or token
274             if (isDelimiter(c)) {
275                 // empty token return TOKEN("")
276                 token.type = Token.Type.TOKEN;
277             } else if (eol) {
278                 // empty token return EORECORD("")
279                 // noop: token.content.append("");
280                 token.type = Token.Type.EORECORD;
281             } else if (isQuoteChar(c)) {
282                 // consume encapsulated token
283                 parseEncapsulatedToken(token);
284             } else if (isEndOfFile(c)) {
285                 // end of file return EOF()
286                 // noop: token.content.append("");
287                 token.type = Token.Type.EOF;
288                 token.isReady = true; // there is data at EOF
289             } else {
290                 // next token must be a simple token
291                 // add removed blanks when not ignoring whitespace chars...
292                 parseSimpleToken(token, c);
293             }
294         }
295         return token;
296     }
297 
298     private int nullToDisabled(final Character c) {
            // a missing character is represented by the UNDEFINED marker
            if (c == null) {
                return Constants.UNDEFINED;
            }
299         return c.charValue(); // Explicit unboxing
300     }
301 
302     /**
303      * Parses an encapsulated token.
304      * <p>
305      * Encapsulated tokens are surrounded by the given encapsulating string. The encapsulator itself might be included
306      * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespace after the closing
307      * encapsulator is ignored. The token is finished when one of the following conditions becomes true:
308      * </p>
309      * <ul>
310      * <li>An unescaped encapsulator has been reached and is followed by optional whitespace then:
311      * <ul>
312      * <li>delimiter (TOKEN)</li>
313      * <li>end of line (EORECORD)</li>
314      * </ul>
         * </li>
315      * <li>end of stream has been reached (EOF)</li> </ul>
         * <p>
         * When trailing data is allowed, every character between the closing encapsulator and the next delimiter or end
         * of line is appended to the token instead of being skipped or rejected.
         * </p>
316      *
317      * @param token
318      *            the current token
319      * @return a valid token object
320      * @throws IOException
321      *             on stream access error
322      * @throws CSVException Thrown on invalid input: EOF before the closing encapsulator (unless lenient EOF is enabled)
         *             or a non-whitespace character between the closing encapsulator and the delimiter or EOL (unless
323      *             trailing data is allowed).
324      */
325     private Token parseEncapsulatedToken(final Token token) throws IOException {
326         token.isQuoted = true;
327         // Save current line number in case needed for the EOF error message
328         final long startLineNumber = getCurrentLineNumber();
329         int c;
330         while (true) {
331             c = reader.read();
332 
333             if (isQuoteChar(c)) {
334                 if (isQuoteChar(reader.peek())) {
335                     // doubled encapsulator -> add single encapsulator to token
336                     c = reader.read();
337                     token.content.append((char) c);
338                 } else {
339                     // token finish mark (encapsulator) reached: ignore whitespace till delimiter
340                     while (true) {
341                         c = reader.read();
342                         if (isDelimiter(c)) {
343                             token.type = Token.Type.TOKEN;
344                             return token;
345                         }
346                         if (isEndOfFile(c)) {
347                             token.type = Token.Type.EOF;
348                             token.isReady = true; // There is data at EOF
349                             return token;
350                         }
351                         if (readEndOfLine(c)) {
352                             token.type = Token.Type.EORECORD;
353                             return token;
354                         }
                            // neither delimiter, EOF nor EOL: keep it as trailing data, skip it if whitespace, else fail
355                         if (trailingData) {
356                             token.content.append((char) c);
357                         } else if (!Character.isWhitespace((char) c)) {
358                             // error invalid char between token and next delimiter
359                             throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
360                                     getCurrentLineNumber(), getCharacterPosition());
361                         }
362                     }
363                 }
364             } else if (isEscape(c)) {
                    // escape sequence inside the quotes: append what it stands for
365                 appendNextEscapedCharacterToToken(token);
366             } else if (isEndOfFile(c)) {
                    // lenient mode accepts the unterminated token as the last one
367                 if (lenientEof) {
368                     token.type = Token.Type.EOF;
369                     token.isReady = true; // There is data at EOF
370                     return token;
371                 }
372                 // error condition (end of file before end of token)
373                 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
374             } else {
375                 // consume character
376                 token.content.append((char) c);
377             }
378         }
379     }
380 
381     /**
382      * Parses a simple token, that is, a token that is not surrounded by encapsulators.
383      * <p>
384      * A simple token might contain escaped delimiters (as \, or \;). The token is finished when one of the following
385      * conditions becomes true:
386      * </p>
387      * <ul>
388      * <li>The end of line has been reached (EORECORD)</li>
389      * <li>The end of stream has been reached (EOF)</li>
390      * <li>An unescaped delimiter has been reached (TOKEN)</li>
391      * </ul>
392      * Trailing whitespace is removed from the token when surrounding spaces are ignored.
393      *
394      * @param token the current token
395      * @param ch    the current character
396      * @return the filled token
397      * @throws IOException  on stream access error
         * @throws CSVException Thrown on invalid input.
398      */
399     private Token parseSimpleToken(final Token token, int ch) throws IOException {
400         Token.Type terminator = null;
401         int current = ch;
402         do {
403             if (readEndOfLine(current)) {
404                 terminator = Token.Type.EORECORD;
405             } else if (isEndOfFile(current)) {
406                 token.isReady = true; // There is data at EOF
407                 terminator = Token.Type.EOF;
408             } else if (isDelimiter(current)) {
409                 terminator = Token.Type.TOKEN;
410             } else {
411                 // ordinary content: resolve an escape sequence or copy the character, then move on
412                 if (isEscape(current)) {
413                     appendNextEscapedCharacterToToken(token);
414                 } else {
415                     token.content.append((char) current);
416                 }
417                 current = reader.read();
418             }
419         } while (terminator == null);
420         token.type = terminator;
421 
422         if (ignoreSurroundingSpaces) {
423             trimTrailingSpaces(token.content);
424         }
425         return token;
426     }
430 
431     /**
432      * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
433      *
434      * @return true if the given or next character is a line-terminator
435      */
436     boolean readEndOfLine(int ch) throws IOException {
437         // check if we have \r\n...
438         if (ch == Constants.CR && reader.peek() == Constants.LF) {
439             // note: does not change ch outside of this method!
440             ch = reader.read();
441             // Save the EOL state
442             if (firstEol == null) {
443                 this.firstEol = Constants.CRLF;
444             }
445         }
446         // save EOL state here.
447         if (firstEol == null) {
448             if (ch == Constants.LF) {
449                 this.firstEol = LF_STRING;
450             } else if (ch == Constants.CR) {
451                 this.firstEol = CR_STRING;
452             }
453         }
454 
455         return ch == Constants.LF || ch == Constants.CR;
456     }
457 
458     // TODO escape handling needs more work
459     /**
460      * Handle an escape sequence. The current character must be the escape character. This method reads the character that
461      * follows it; on return that character is the last one read from the reader (see {@link ExtendedBufferedReader#getLastChar()}).
         * <p>
         * The letters r, n, t, b and f map to CR, LF, TAB, BACKSPACE and FF. A literal CR, LF, FF, TAB or BACKSPACE, and
         * the escape, quote and comment characters, map to themselves.
         * </p>
462      *
463      * @return the unescaped character (as an int) or {@link IOUtils#EOF} if char following the escape is invalid.
464      * @throws IOException  if there is a problem reading the stream
465      * @throws CSVException Thrown when the end of the stream is reached right after the escape character.
466      */
467     int readEscape() throws IOException {
468         // the escape char has just been read (normally a backslash)
469         final int ch = reader.read();
470         switch (ch) {
471         case 'r':
472             return Constants.CR;
473         case 'n':
474             return Constants.LF;
475         case 't':
476             return Constants.TAB;
477         case 'b':
478             return Constants.BACKSPACE;
479         case 'f':
480             return Constants.FF;
481         case Constants.CR:
482         case Constants.LF:
483         case Constants.FF: // TODO is this correct?
484         case Constants.TAB: // TODO is this correct? Do tabs need to be escaped?
485         case Constants.BACKSPACE: // TODO is this correct?
486             return ch;
487         case EOF:
488             throw new CSVException("EOF while processing escape sequence");
489         default:
490             // Now check for meta-characters
491             if (isMetaChar(ch)) {
492                 return ch;
493             }
494             // indicate unexpected char - available from reader.getLastChar()
495             return EOF;
496         }
497     }
498 
499     void trimTrailingSpaces(final StringBuilder buffer) {
500         int length = buffer.length();
501         while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
502             length--;
503         }
504         if (length != buffer.length()) {
505             buffer.setLength(length);
506         }
507     }
508 }