1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.csv;
19
20 import static org.apache.commons.csv.Constants.BACKSPACE;
21 import static org.apache.commons.csv.Constants.CR;
22 import static org.apache.commons.csv.Constants.FF;
23 import static org.apache.commons.csv.Constants.LF;
24 import static org.apache.commons.csv.Constants.TAB;
25 import static org.apache.commons.csv.Token.Type.COMMENT;
26 import static org.apache.commons.csv.Token.Type.EOF;
27 import static org.apache.commons.csv.Token.Type.EORECORD;
28 import static org.apache.commons.csv.Token.Type.TOKEN;
29 import static org.apache.commons.csv.TokenMatchers.hasContent;
30 import static org.apache.commons.csv.TokenMatchers.matches;
31 import static org.hamcrest.MatcherAssert.assertThat;
32 import static org.junit.jupiter.api.Assertions.assertEquals;
33 import static org.junit.jupiter.api.Assertions.assertFalse;
34 import static org.junit.jupiter.api.Assertions.assertThrows;
35 import static org.junit.jupiter.api.Assertions.assertTrue;
36
37 import java.io.IOException;
38 import java.io.StringReader;
39
40 import org.junit.jupiter.api.BeforeEach;
41 import org.junit.jupiter.api.Test;
42
43
44
45 public class LexerTest {
46
47 private CSVFormat formatWithEscaping;
48
49 @SuppressWarnings("resource")
50 private Lexer createLexer(final String input, final CSVFormat format) {
51 return new Lexer(format, new ExtendedBufferedReader(new StringReader(input)));
52 }
53
54 @BeforeEach
55 public void setUp() {
56 formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
57 }
58
59
60 @Test
61 public void testBackslashWithEscaping() throws IOException {
62
63
64
65 final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
66 final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
67 assertTrue(format.isEscapeCharacterSet());
68 try (final Lexer parser = createLexer(code, format)) {
69 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
70 assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
71 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
72 assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
73 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\nc"));
74 assertThat(parser.nextToken(new Token()), matches(EORECORD, "d\r"));
75 assertThat(parser.nextToken(new Token()), matches(EOF, "e"));
76 }
77 }
78
79
80 @Test
81 public void testBackslashWithoutEscaping() throws IOException {
82
83
84
85 final String code = "a,\\,,b\\\n\\,,";
86 final CSVFormat format = CSVFormat.DEFAULT;
87 assertFalse(format.isEscapeCharacterSet());
88 try (final Lexer parser = createLexer(code, format)) {
89 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
90
91 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
92 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
93 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b\\"));
94
95 assertThat(parser.nextToken(new Token()), matches(TOKEN, "\\"));
96 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
97 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
98 }
99 }
100
101 @Test
102 public void testBackspace() throws Exception {
103 try (final Lexer lexer = createLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping)) {
104 assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
105 }
106 }
107
108 @Test
109 public void testComments() throws IOException {
110 final String code = "first,line,\n" + "second,line,tokenWith#no-comment\n" + "# comment line \n" +
111 "third,line,#no-comment\n" + "# penultimate comment\n" + "# Final comment\n";
112 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#');
113 try (final Lexer parser = createLexer(code, format)) {
114 assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
115 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
116 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
117 assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
118 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
119 assertThat(parser.nextToken(new Token()), matches(EORECORD, "tokenWith#no-comment"));
120 assertThat(parser.nextToken(new Token()), matches(COMMENT, "comment line"));
121 assertThat(parser.nextToken(new Token()), matches(TOKEN, "third"));
122 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
123 assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
124 assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
125 assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
126 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
127 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
128 }
129 }
130
131 @Test
132 public void testCommentsAndEmptyLines() throws IOException {
133 final String code = "1,2,3,\n" +
134 "\n" +
135 "\n" +
136 "a,b x,c#no-comment\n" +
137 "#foo\n" +
138 "\n" +
139 "\n" +
140 "d,e,#no-comment\n" +
141 "\n" +
142 "\n" +
143 "# penultimate comment\n" +
144 "\n" +
145 "\n" +
146 "# Final comment\n";
147 final CSVFormat format = CSVFormat.DEFAULT.withCommentMarker('#').withIgnoreEmptyLines(false);
148 assertFalse(format.getIgnoreEmptyLines(), "Should not ignore empty lines");
149
150 try (final Lexer parser = createLexer(code, format)) {
151 assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
152 assertThat(parser.nextToken(new Token()), matches(TOKEN, "2"));
153 assertThat(parser.nextToken(new Token()), matches(TOKEN, "3"));
154 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
155 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
156 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
157 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
158 assertThat(parser.nextToken(new Token()), matches(TOKEN, "b x"));
159 assertThat(parser.nextToken(new Token()), matches(EORECORD, "c#no-comment"));
160 assertThat(parser.nextToken(new Token()), matches(COMMENT, "foo"));
161 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
162 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
163 assertThat(parser.nextToken(new Token()), matches(TOKEN, "d"));
164 assertThat(parser.nextToken(new Token()), matches(TOKEN, "e"));
165 assertThat(parser.nextToken(new Token()), matches(EORECORD, "#no-comment"));
166 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
167 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
168 assertThat(parser.nextToken(new Token()), matches(COMMENT, "penultimate comment"));
169 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
170 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
171 assertThat(parser.nextToken(new Token()), matches(COMMENT, "Final comment"));
172 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
173 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
174 }
175 }
176
177 @Test
178 public void testCR() throws Exception {
179 try (final Lexer lexer = createLexer("character" + CR + "NotEscaped", formatWithEscaping)) {
180 assertThat(lexer.nextToken(new Token()), hasContent("character"));
181 assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
182 }
183 }
184
185
186 @Test
187 public void testDelimiterIsWhitespace() throws IOException {
188 final String code = "one\ttwo\t\tfour \t five\t six";
189 try (final Lexer parser = createLexer(code, CSVFormat.TDF)) {
190 assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
191 assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
192 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
193 assertThat(parser.nextToken(new Token()), matches(TOKEN, "four"));
194 assertThat(parser.nextToken(new Token()), matches(TOKEN, "five"));
195 assertThat(parser.nextToken(new Token()), matches(EOF, "six"));
196 }
197 }
198
199 @Test
200 public void testEOFWithoutClosingQuote() throws Exception {
201 final String code = "a,\"b";
202 try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).build())) {
203 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
204 assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
205 }
206 try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).build())) {
207 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
208 assertThrows(IOException.class, () -> parser.nextToken(new Token()));
209 }
210 }
211
212 @Test
213 public void testEscapedBackspace() throws Exception {
214 try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
215 assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
216 }
217 }
218
219 @Test
220 public void testEscapedCharacter() throws Exception {
221 try (final Lexer lexer = createLexer("character\\aEscaped", formatWithEscaping)) {
222 assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
223 }
224 }
225
226 @Test
227 public void testEscapedControlCharacter() throws Exception {
228
229 try (final Lexer lexer = createLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'))) {
230 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
231 }
232 }
233
234 @Test
235 public void testEscapedControlCharacter2() throws Exception {
236 try (final Lexer lexer = createLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'))) {
237 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
238 }
239 }
240
241 @Test
242 public void testEscapedCR() throws Exception {
243 try (final Lexer lexer = createLexer("character\\" + CR + "Escaped", formatWithEscaping)) {
244 assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
245 }
246 }
247
248 @Test
249 public void testEscapedFF() throws Exception {
250 try (final Lexer lexer = createLexer("character\\" + FF + "Escaped", formatWithEscaping)) {
251 assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
252 }
253 }
254
255 @Test
256 public void testEscapedLF() throws Exception {
257 try (final Lexer lexer = createLexer("character\\" + LF + "Escaped", formatWithEscaping)) {
258 assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
259 }
260 }
261
262 @Test
263 public void testEscapedMySqlNullValue() throws Exception {
264
265 try (final Lexer lexer = createLexer("character\\NEscaped", formatWithEscaping)) {
266 assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
267 }
268 }
269
270 @Test
271 public void testEscapedTab() throws Exception {
272 try (final Lexer lexer = createLexer("character\\" + TAB + "Escaped", formatWithEscaping)) {
273 assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
274 }
275
276 }
277
278 @Test
279 public void testEscapingAtEOF() throws Exception {
280 final String code = "escaping at EOF is evil\\";
281 try (final Lexer lexer = createLexer(code, formatWithEscaping)) {
282 assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
283 }
284 }
285
286 @Test
287 public void testFF() throws Exception {
288 try (final Lexer lexer = createLexer("character" + FF + "NotEscaped", formatWithEscaping)) {
289 assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
290 }
291 }
292
293 @Test
294 public void testIgnoreEmptyLines() throws IOException {
295 final String code = "first,line,\n" + "\n" + "\n" + "second,line\n" + "\n" + "\n" + "third line \n" + "\n" +
296 "\n" + "last, line \n" + "\n" + "\n" + "\n";
297 final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines();
298 try (final Lexer parser = createLexer(code, format)) {
299 assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
300 assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
301 assertThat(parser.nextToken(new Token()), matches(EORECORD, ""));
302 assertThat(parser.nextToken(new Token()), matches(TOKEN, "second"));
303 assertThat(parser.nextToken(new Token()), matches(EORECORD, "line"));
304 assertThat(parser.nextToken(new Token()), matches(EORECORD, "third line "));
305 assertThat(parser.nextToken(new Token()), matches(TOKEN, "last"));
306 assertThat(parser.nextToken(new Token()), matches(EORECORD, " line "));
307 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
308 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
309 }
310 }
311
312 @Test
313 public void testIsMetaCharCommentStart() throws IOException {
314 try (final Lexer lexer = createLexer("#", CSVFormat.DEFAULT.withCommentMarker('#'))) {
315 final int ch = lexer.readEscape();
316 assertEquals('#', ch);
317 }
318 }
319
320 @Test
321 public void testLF() throws Exception {
322 try (final Lexer lexer = createLexer("character" + LF + "NotEscaped", formatWithEscaping)) {
323 assertThat(lexer.nextToken(new Token()), hasContent("character"));
324 assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
325 }
326 }
327
328
329 @Test
330 public void testNextToken4() throws IOException {
331
332
333
334 final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
335 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
336 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
337 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
338 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
339 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
340 assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo"));
341 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
342 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
343 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo "));
344 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
345 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
346 assertThat(parser.nextToken(new Token()), matches(TOKEN, " foo "));
347
348 assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
349 }
350 }
351
352
353 @Test
354 public void testNextToken5() throws IOException {
355 final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
356 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
357 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
358 assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
359 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
360 assertThat(parser.nextToken(new Token()), matches(EORECORD, "foo\n baar ,,,"));
361 assertThat(parser.nextToken(new Token()), matches(EOF, "\n\t \n"));
362 }
363 }
364
365
366 @Test
367 public void testNextToken6() throws IOException {
368
369
370
371 final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
372 final CSVFormat format = CSVFormat.DEFAULT.withQuote('\'').withCommentMarker('!').withDelimiter(';');
373 try (final Lexer parser = createLexer(code, format)) {
374 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
375 assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
376 }
377 }
378
379 @Test
380 public void testReadEscapeBackspace() throws IOException {
381 try (final Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
382 final int ch = lexer.readEscape();
383 assertEquals(BACKSPACE, ch);
384 }
385 }
386
387 @Test
388 public void testReadEscapeFF() throws IOException {
389 try (final Lexer lexer = createLexer("f", CSVFormat.DEFAULT.withEscape('\f'))) {
390 final int ch = lexer.readEscape();
391 assertEquals(FF, ch);
392 }
393 }
394
395 @Test
396 public void testReadEscapeTab() throws IOException {
397 try (final Lexer lexer = createLexer("t", CSVFormat.DEFAULT.withEscape('\t'))) {
398 final int ch = lexer.readEscape();
399 assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
400 assertEquals(TAB, ch);
401 }
402 }
403
404 @Test
405 public void testSurroundingSpacesAreDeleted() throws IOException {
406 final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
407 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
408 assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
409 assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
410 assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
411 assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingSpaces"));
412 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
413 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
414 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
415 }
416 }
417
418 @Test
419 public void testSurroundingTabsAreDeleted() throws IOException {
420 final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
421 try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces())) {
422 assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
423 assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
424 assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
425 assertThat(parser.nextToken(new Token()), matches(TOKEN, "surroundingTabs"));
426 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
427 assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
428 assertThat(parser.nextToken(new Token()), matches(EOF, ""));
429 }
430 }
431
432 @Test
433 public void testTab() throws Exception {
434 try (final Lexer lexer = createLexer("character" + TAB + "NotEscaped", formatWithEscaping)) {
435 assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
436 }
437 }
438
439 @Test
440 public void testTrailingTextAfterQuote() throws Exception {
441 final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
442 try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).build())) {
443 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
444 assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
445 assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
446 }
447 try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).build())) {
448 assertThrows(IOException.class, () -> parser.nextToken(new Token()));
449 }
450 }
451
452 @Test
453 public void testTrimTrailingSpacesZeroLength() throws Exception {
454 final StringBuilder buffer = new StringBuilder("");
455 try (Lexer lexer = createLexer(buffer.toString(), CSVFormat.DEFAULT)) {
456 lexer.trimTrailingSpaces(buffer);
457 assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
458 }
459 }
460 }