From afbf34ad92cbf741b9dfad84699e890d8695a6f9 Mon Sep 17 00:00:00 2001 From: Naveed Khan Date: Thu, 25 Jun 2026 23:12:11 +0530 Subject: [PATCH 1/2] evaluate isDelimiter once in nextToken whitespace skip --- .../java/org/apache/commons/csv/Lexer.java | 11 +++++++++-- .../org/apache/commons/csv/LexerTest.java | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 93a584663..fe964480a 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -277,15 +277,22 @@ Token nextToken(final Token token) throws IOException { } // Important: make sure a new char gets consumed in each iteration while (token.type == Token.Type.INVALID) { + // isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must + // only be evaluated once per character. Remember a match found while skipping whitespace below. + boolean delimiter = false; // ignore whitespaces at beginning of a token if (ignoreSurroundingSpaces) { - while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) { + while (Character.isWhitespace((char) c) && !eol) { + if (isDelimiter(c)) { + delimiter = true; + break; + } c = reader.read(); eol = readEndOfLine(c); } } // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { + if (delimiter || isDelimiter(c)) { // empty token return TOKEN("") token.type = Token.Type.TOKEN; } else if (eol) { diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index db1ab3a6d..445f710a1 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -447,6 +447,25 @@ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { } } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the side-effecting {@link Lexer#isDelimiter(int)} must only be evaluated once per character, otherwise the + * delimiter is consumed in the whitespace-skip loop and the empty field at the boundary is dropped. + */ + @Test + void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (Lexer lexer = createLexer(" |a", format)) { + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "a", lexer); + } + try (Lexer lexer = createLexer("a | |b", format)) { + assertNextToken(TOKEN, "a", lexer); + assertNextToken(TOKEN, "", lexer); + assertNextToken(EOF, "b", lexer); + } + } + @Test void testReadEscapeBackspace() throws IOException { try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) { From c9362f76baa32534e82334a27220b698db49788c Mon Sep 17 00:00:00 2001 From: Naveed Khan Date: Fri, 26 Jun 2026 02:45:35 +0530 Subject: [PATCH 2/2] add public-api test for whitespace-prefixed multi-char delimiter exercises the empty-field-dropped regression through CSVParser, not just the lexer. --- .../org/apache/commons/csv/CSVParserTest.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 051548757..565e132eb 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -1758,6 +1758,26 @@ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException { } } + /** + * With {@code ignoreSurroundingSpaces} enabled and a multi-character delimiter whose first character is whitespace, + * the empty field at the delimiter boundary must survive. The delimiter look-ahead is consumed while skipping + * leading whitespace, so re-evaluating it would drop the empty field and merge the following field's value. + */ + @Test + void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException { + final CSVFormat format = CSVFormat.DEFAULT.builder().setDelimiter(" |").setIgnoreSurroundingSpaces(true).get(); + try (CSVParser parser = CSVParser.parse(" |a", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "", "a" }, records.get(0)); + } + try (CSVParser parser = CSVParser.parse("a | |b", format)) { + final List records = parser.getRecords(); + assertEquals(1, records.size()); + assertValuesEquals(new String[] { "a", "", "b" }, records.get(0)); + } + } + @Test void testProvidedHeader() throws Exception { final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z");