1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.csv;
19
import static org.apache.commons.io.IOUtils.EOF;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;

import org.apache.commons.io.IOUtils;
26
27
28
29
30 final class Lexer implements Closeable {
31
32 private static final String CR_STRING = Character.toString(Constants.CR);
33 private static final String LF_STRING = Character.toString(Constants.LF);
34
35 private final char[] delimiter;
36 private final char[] delimiterBuf;
37 private final char[] escapeDelimiterBuf;
38 private final int escape;
39 private final int quoteChar;
40 private final int commentStart;
41 private final boolean ignoreSurroundingSpaces;
42 private final boolean ignoreEmptyLines;
43 private final boolean lenientEof;
44 private final boolean trailingData;
45
46
47 private final ExtendedBufferedReader reader;
48 private String firstEol;
49
50 private boolean isLastTokenDelimiter;
51
52 Lexer(final CSVFormat format, final ExtendedBufferedReader reader) {
53 this.reader = reader;
54 this.delimiter = format.getDelimiterCharArray();
55 this.escape = nullToDisabled(format.getEscapeCharacter());
56 this.quoteChar = nullToDisabled(format.getQuoteCharacter());
57 this.commentStart = nullToDisabled(format.getCommentMarker());
58 this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
59 this.ignoreEmptyLines = format.getIgnoreEmptyLines();
60 this.lenientEof = format.getLenientEof();
61 this.trailingData = format.getTrailingData();
62 this.delimiterBuf = new char[delimiter.length - 1];
63 this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
64 }
65
66
67
68
69
70
71
72
73 private void appendNextEscapedCharacterToToken(final Token token) throws IOException {
74 if (isEscapeDelimiter()) {
75 token.content.append(delimiter);
76 } else {
77 final int unescaped = readEscape();
78 if (unescaped == EOF) {
79 token.content.append((char) escape).append((char) reader.getLastChar());
80 } else {
81 token.content.append((char) unescaped);
82 }
83 }
84 }
85
86
87
88
89
90
91
92 @Override
93 public void close() throws IOException {
94 reader.close();
95 }
96
97
98
99
100
101
102 long getCharacterPosition() {
103 return reader.getPosition();
104 }
105
106
107
108
109
110
111 long getCurrentLineNumber() {
112 return reader.getLineNumber();
113 }
114
115 String getFirstEol() {
116 return firstEol;
117 }
118
119 boolean isClosed() {
120 return reader.isClosed();
121 }
122
123 boolean isCommentStart(final int ch) {
124 return ch == commentStart;
125 }
126
127
128
129
130
131
132
133
134
135 boolean isDelimiter(final int ch) throws IOException {
136 isLastTokenDelimiter = false;
137 if (ch != delimiter[0]) {
138 return false;
139 }
140 if (delimiter.length == 1) {
141 isLastTokenDelimiter = true;
142 return true;
143 }
144 reader.peek(delimiterBuf);
145 for (int i = 0; i < delimiterBuf.length; i++) {
146 if (delimiterBuf[i] != delimiter[i + 1]) {
147 return false;
148 }
149 }
150 final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
151 isLastTokenDelimiter = count != EOF;
152 return isLastTokenDelimiter;
153 }
154
155
156
157
158
159
160 boolean isEndOfFile(final int ch) {
161 return ch == EOF;
162 }
163
164
165
166
167
168
169 boolean isEscape(final int ch) {
170 return ch == escape;
171 }
172
173
174
175
176
177
178
179
180
181 boolean isEscapeDelimiter() throws IOException {
182 reader.peek(escapeDelimiterBuf);
183 if (escapeDelimiterBuf[0] != delimiter[0]) {
184 return false;
185 }
186 for (int i = 1; i < delimiter.length; i++) {
187 if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
188 return false;
189 }
190 }
191 final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
192 return count != EOF;
193 }
194
195 private boolean isMetaChar(final int ch) {
196 return ch == escape || ch == quoteChar || ch == commentStart;
197 }
198
199 boolean isQuoteChar(final int ch) {
200 return ch == quoteChar;
201 }
202
203
204
205
206
207
208
209 boolean isStartOfLine(final int ch) {
210 return ch == Constants.LF || ch == Constants.CR || ch == Constants.UNDEFINED;
211 }
212
213
214
215
216
217
218
219
220
221
222
223
224 Token nextToken(final Token token) throws IOException {
225
226 int lastChar = reader.getLastChar();
227
228 int c = reader.read();
229
230 boolean eol = readEndOfLine(c);
231
232 if (ignoreEmptyLines) {
233 while (eol && isStartOfLine(lastChar)) {
234
235 lastChar = c;
236 c = reader.read();
237 eol = readEndOfLine(c);
238
239 if (isEndOfFile(c)) {
240 token.type = Token.Type.EOF;
241
242 return token;
243 }
244 }
245 }
246
247 if (isEndOfFile(lastChar) || !isLastTokenDelimiter && isEndOfFile(c)) {
248 token.type = Token.Type.EOF;
249
250 return token;
251 }
252 if (isStartOfLine(lastChar) && isCommentStart(c)) {
253 final String line = reader.readLine();
254 if (line == null) {
255 token.type = Token.Type.EOF;
256
257 return token;
258 }
259 final String comment = line.trim();
260 token.content.append(comment);
261 token.type = Token.Type.COMMENT;
262 return token;
263 }
264
265 while (token.type == Token.Type.INVALID) {
266
267 if (ignoreSurroundingSpaces) {
268 while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
269 c = reader.read();
270 eol = readEndOfLine(c);
271 }
272 }
273
274 if (isDelimiter(c)) {
275
276 token.type = Token.Type.TOKEN;
277 } else if (eol) {
278
279
280 token.type = Token.Type.EORECORD;
281 } else if (isQuoteChar(c)) {
282
283 parseEncapsulatedToken(token);
284 } else if (isEndOfFile(c)) {
285
286
287 token.type = Token.Type.EOF;
288 token.isReady = true;
289 } else {
290
291
292 parseSimpleToken(token, c);
293 }
294 }
295 return token;
296 }
297
298 private int nullToDisabled(final Character c) {
299 return c == null ? Constants.UNDEFINED : c.charValue();
300 }
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325 private Token parseEncapsulatedToken(final Token token) throws IOException {
326 token.isQuoted = true;
327
328 final long startLineNumber = getCurrentLineNumber();
329 int c;
330 while (true) {
331 c = reader.read();
332
333 if (isQuoteChar(c)) {
334 if (isQuoteChar(reader.peek())) {
335
336 c = reader.read();
337 token.content.append((char) c);
338 } else {
339
340 while (true) {
341 c = reader.read();
342 if (isDelimiter(c)) {
343 token.type = Token.Type.TOKEN;
344 return token;
345 }
346 if (isEndOfFile(c)) {
347 token.type = Token.Type.EOF;
348 token.isReady = true;
349 return token;
350 }
351 if (readEndOfLine(c)) {
352 token.type = Token.Type.EORECORD;
353 return token;
354 }
355 if (trailingData) {
356 token.content.append((char) c);
357 } else if (!Character.isWhitespace((char) c)) {
358
359 throw new CSVException("Invalid character between encapsulated token and delimiter at line: %,d, position: %,d",
360 getCurrentLineNumber(), getCharacterPosition());
361 }
362 }
363 }
364 } else if (isEscape(c)) {
365 appendNextEscapedCharacterToToken(token);
366 } else if (isEndOfFile(c)) {
367 if (lenientEof) {
368 token.type = Token.Type.EOF;
369 token.isReady = true;
370 return token;
371 }
372
373 throw new CSVException("(startline %,d) EOF reached before encapsulated token finished", startLineNumber);
374 } else {
375
376 token.content.append((char) c);
377 }
378 }
379 }
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399 private Token parseSimpleToken(final Token token, int ch) throws IOException {
400
401 while (true) {
402 if (readEndOfLine(ch)) {
403 token.type = Token.Type.EORECORD;
404 break;
405 }
406 if (isEndOfFile(ch)) {
407 token.type = Token.Type.EOF;
408 token.isReady = true;
409 break;
410 }
411 if (isDelimiter(ch)) {
412 token.type = Token.Type.TOKEN;
413 break;
414 }
415
416 if (isEscape(ch)) {
417 appendNextEscapedCharacterToToken(token);
418 } else {
419 token.content.append((char) ch);
420 }
421 ch = reader.read();
422 }
423
424 if (ignoreSurroundingSpaces) {
425 trimTrailingSpaces(token.content);
426 }
427
428 return token;
429 }
430
431
432
433
434
435
436 boolean readEndOfLine(int ch) throws IOException {
437
438 if (ch == Constants.CR && reader.peek() == Constants.LF) {
439
440 ch = reader.read();
441
442 if (firstEol == null) {
443 this.firstEol = Constants.CRLF;
444 }
445 }
446
447 if (firstEol == null) {
448 if (ch == Constants.LF) {
449 this.firstEol = LF_STRING;
450 } else if (ch == Constants.CR) {
451 this.firstEol = CR_STRING;
452 }
453 }
454
455 return ch == Constants.LF || ch == Constants.CR;
456 }
457
458
459
460
461
462
463
464
465
466
467 int readEscape() throws IOException {
468
469 final int ch = reader.read();
470 switch (ch) {
471 case 'r':
472 return Constants.CR;
473 case 'n':
474 return Constants.LF;
475 case 't':
476 return Constants.TAB;
477 case 'b':
478 return Constants.BACKSPACE;
479 case 'f':
480 return Constants.FF;
481 case Constants.CR:
482 case Constants.LF:
483 case Constants.FF:
484 case Constants.TAB:
485 case Constants.BACKSPACE:
486 return ch;
487 case EOF:
488 throw new CSVException("EOF while processing escape sequence");
489 default:
490
491 if (isMetaChar(ch)) {
492 return ch;
493 }
494
495 return EOF;
496 }
497 }
498
499 void trimTrailingSpaces(final StringBuilder buffer) {
500 int length = buffer.length();
501 while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
502 length--;
503 }
504 if (length != buffer.length()) {
505 buffer.setLength(length);
506 }
507 }
508 }