View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.csv;
19  
20  import static org.apache.commons.csv.Constants.BACKSPACE;
21  import static org.apache.commons.csv.Constants.CR;
22  import static org.apache.commons.csv.Constants.FF;
23  import static org.apache.commons.csv.Constants.LF;
24  import static org.apache.commons.csv.Constants.TAB;
25  import static org.apache.commons.csv.Token.Type.COMMENT;
26  import static org.apache.commons.csv.Token.Type.EOF;
27  import static org.apache.commons.csv.Token.Type.EORECORD;
28  import static org.apache.commons.csv.Token.Type.TOKEN;
29  import static org.apache.commons.csv.TokenMatchers.hasContent;
30  import static org.apache.commons.csv.TokenMatchers.matches;
31  import static org.hamcrest.MatcherAssert.assertThat;
32  import static org.junit.jupiter.api.Assertions.assertEquals;
33  import static org.junit.jupiter.api.Assertions.assertFalse;
34  import static org.junit.jupiter.api.Assertions.assertThrows;
35  import static org.junit.jupiter.api.Assertions.assertTrue;
36  
37  import java.io.IOException;
38  import java.io.StringReader;
39  
40  import org.junit.jupiter.api.BeforeEach;
41  import org.junit.jupiter.api.Test;
42  
43  /**
44   */
45  public class LexerTest {
46  
47      private CSVFormat formatWithEscaping;
48  
49      @SuppressWarnings("resource")
50      private Lexer createLexer(final String input, final CSVFormat format) {
51          return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
52      }
53  
54      @BeforeEach
55      public void setUp() {
56          formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
57      }
58  
59      // simple token with escaping enabled
60      @Test
61      public void testBackslashWithEscaping() throws IOException {
62          /*
63           * file: a,\,,b \,,
64           */
65          final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
66          final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
67          assertTrue(format.isEscapeCharacterSet());
68          try (final Lexer parser = createLexer(code, format)) {
69              assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
70              assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
71              assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
72              assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
73              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
74              assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
75              assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
76          }
77      }
78  
79      // simple token with escaping not enabled
80      @Test
81      public void testBackslashWithoutEscaping() throws IOException {
82          /*
83           * file: a,\,,b \,,
84           */
85          final String code = "a,\\,,b\\\n\\,,";
86          final CSVFormat format = CSVFormat.DEFAULT;
87          assertFalse(format.isEscapeCharacterSet());
88          try (final Lexer parser = createLexer(code, format)) {
89              assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
90              // an unquoted single backslash is not an escape char
91              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
92              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
93              assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
94              // an unquoted single backslash is not an escape char
95              assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
96              assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
97              assertThat(parser.nextToken(new Token()), matches(EOF, ""));
98          }
99      }
100 
101     @Test
102     public void testBackspace() throws Exception {
103         try (final Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
104             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
105         }
106     }
107 
108     @Test
109     public void testComments() throws IOException {
110         final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
111                 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
112         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
113         try (final Lexer parser = createLexer(code, format)) {
114             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
115             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
116             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
117             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
118             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
119             assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
120             assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
121             assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
122             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
123             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
124             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
125             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
126             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
127             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
128         }
129     }
130 
131     @Test
132     public void testCommentsAndEmptyLines() throws IOException {
133         final String code = "1,2,3,\n" + // 1
134                 "\n" + // 1b
135                 "\n" + // 1c
136                 "a,b x,c#no-comment\n" + // 2
137                 "#foo\n" + // 3
138                 "\n" + // 4
139                 "\n" + // 4b
140                 "d,e,#no-comment\n" + // 5
141                 "\n" + // 5b
142                 "\n" + // 5c
143                 "# penultimate comment\n" + // 6
144                 "\n" + // 6b
145                 "\n" + // 6c
146                 "# Final comment\n"; // 7
147         final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
148         assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
149 
150         try (final Lexer parser = createLexer(code, format)) {
151             assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
152             assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
153             assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
154             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1
155             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1b
156             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 1c
157             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
158             assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
159             assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment")); // 2
160             assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo")); // 3
161             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4
162             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 4b
163             assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
164             assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
165             assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment")); // 5
166             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5b
167             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 5c
168             assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment")); // 6
169             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6b
170             assertThat(parser.nextToken(new Token()), matches(EORECORD, "")); // 6c
171             assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment")); // 7
172             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
173             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
174         }
175     }
176 
177     @Test
178     public void testCR() throws Exception {
179         try (final Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
180             assertThat(lexer.nextToken(new Token()), hasContent("character"));
181             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
182         }
183     }
184 
185     // From CSV-1
186     @Test
187     public void testDelimiterIsWhitespace() throws IOException {
188         final String code = "one\ttwo\t\tfour \t five\t six";
189         try (final Lexer parser = createLexer(code, CSVFormat.TDF)) {
190             assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
191             assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
192             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
193             assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
194             assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
195             assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
196         }
197     }
198 
199     @Test
200     public void testEOFWithoutClosingQuote() throws Exception {
201         final String code = "a,\"b";
202         try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).build())) {
203             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
204             assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
205         }
206         try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).build())) {
207             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
208             assertThrows(IOException.class, () -> parser.nextToken(new Token()));
209         }
210     }
211 
212     @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
213     public void testEscapedBackspace() throws Exception {
214         try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
215             assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
216         }
217     }
218 
219     @Test
220     public void testEscapedCharacter() throws Exception {
221         try (final Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
222             assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
223         }
224     }
225 
226     @Test
227     public void testEscapedControlCharacter() throws Exception {
228         // we are explicitly using an escape different from \ here
229         try (final Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
230             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
231         }
232     }
233 
234     @Test
235     public void testEscapedControlCharacter2() throws Exception {
236         try (final Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
237             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
238         }
239     }
240 
241     @Test
242     public void testEscapedCR() throws Exception {
243         try (final Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
244             assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
245         }
246     }
247 
248     @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
249     public void testEscapedFF() throws Exception {
250         try (final Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
251             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
252         }
253     }
254 
255     @Test
256     public void testEscapedLF() throws Exception {
257         try (final Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
258             assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
259         }
260     }
261 
262     @Test
263     public void testEscapedMySqlNullValue() throws Exception {
264         // MySQL uses \N to symbolize null values. We have to restore this
265         try (final Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
266             assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
267         }
268     }
269 
270     @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
271     public void testEscapedTab() throws Exception {
272         try (final Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
273             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
274         }
275 
276     }
277 
278     @Test
279     public void testEscapingAtEOF() throws Exception {
280         final String code = "escaping at EOF is evil\\";
281         try (final Lexer lexer = createLexer(code, formatWithEscaping)) {
282             assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
283         }
284     }
285 
286     @Test
287     public void testFF() throws Exception {
288         try (final Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
289             assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
290         }
291     }
292 
293     @Test
294     public void testIgnoreEmptyLines() throws IOException {
295         final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
296                 "\n" + "last, line \n" + "\n" + "\n" + "\n";
297         final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
298         try (final Lexer parser = createLexer(code, format)) {
299             assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
300             assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
301             assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
302             assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
303             assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
304             assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
305             assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
306             assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
307             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
308             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
309         }
310     }
311 
312     @Test
313     public void testIsMetaCharCommentStart() throws IOException {
314         try (final Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
315             final int ch = lexer.readEscape();
316             assertEquals('#', ch);
317         }
318     }
319 
320     @Test
321     public void testLF() throws Exception {
322         try (final Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
323             assertThat(lexer.nextToken(new Token()), hasContent("character"));
324             assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
325         }
326     }
327 
328     // encapsulator tokenizer (single line)
329     @Test
330     public void testNextToken4() throws IOException {
331         /*
332          * file: a,"foo",b a, " foo",b a,"foo " ,b // whitespace after closing encapsulator a, " foo " ,b
333          */
334         final String code = "a,\"foo\",b\na,   \" foo\",b\na,\"foo \"  ,b\na,  \" foo \"  ,b";
335         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
336             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
337             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
338             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
339             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
340             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
341             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
342             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
343             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
344             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
345             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
346             assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
347             // assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
348             assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
349         }
350     }
351 
352     // encapsulator tokenizer (multi line, delimiter in string)
353     @Test
354     public void testNextToken5() throws IOException {
355         final String code = "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
356         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
357             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
358             assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
359             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
360             assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n  baar ,,,"));
361             assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
362         }
363     }
364 
365     // change delimiters, comment, encapsulater
366     @Test
367     public void testNextToken6() throws IOException {
368         /*
369          * file: a;'b and \' more ' !comment;;;; ;;
370          */
371         final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
372         final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
373         try (final Lexer parser = createLexer(code, format)) {
374             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
375             assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
376         }
377     }
378 
379     @Test
380     public void testReadEscapeBackspace() throws IOException {
381         try (final Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
382             final int ch = lexer.readEscape();
383             assertEquals(BACKSPACE, ch);
384         }
385     }
386 
387     @Test
388     public void testReadEscapeFF() throws IOException {
389         try (final Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
390             final int ch = lexer.readEscape();
391             assertEquals(FF, ch);
392         }
393     }
394 
395     @Test
396     public void testReadEscapeTab() throws IOException {
397         try (final Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
398             final int ch = lexer.readEscape();
399             assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
400             assertEquals(TAB, ch);
401         }
402     }
403 
404     @Test
405     public void testSurroundingSpacesAreDeleted() throws IOException {
406         final String code = "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,,";
407         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
408             assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
409             assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
410             assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
411             assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
412             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
413             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
414             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
415         }
416     }
417 
418     @Test
419     public void testSurroundingTabsAreDeleted() throws IOException {
420         final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
421         try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
422             assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
423             assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
424             assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
425             assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
426             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
427             assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
428             assertThat(parser.nextToken(new Token()), matches(EOF, ""));
429         }
430     }
431 
432     @Test
433     public void testTab() throws Exception {
434         try (final Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
435             assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
436         }
437     }
438 
439     @Test
440     public void testTrailingTextAfterQuote() throws Exception {
441         final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
442         try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).build())) {
443             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
444             assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
445             assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
446         }
447         try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).build())) {
448             assertThrows(IOException.class, () -> parser.nextToken(new Token()));
449         }
450     }
451 
452     @Test
453     public void testTrimTrailingSpacesZeroLength() throws Exception {
454         final StringBuilder buffer = new StringBuilder("");
455         try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
456             lexer.trimTrailingSpaces(buffer);
457             assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
458         }
459     }
460 }