From a5480ec1bd350ca0a349da26125829b8836afae6 Mon Sep 17 00:00:00 2001 From: Mukunda Rao Katta Date: Mon, 27 Apr 2026 21:19:34 -0700 Subject: [PATCH 1/2] feat(csv): make parseLine the synchronous primitive (refs #3765) Refactor the CSV parser so a single synchronous parseLine handles all field-level rules, with parse() (sync) and CsvParseStream (async) becoming thin line-iteration shells on top of it. - _io.ts: introduce sync parseLine; rewrite the existing async parseRecord as a thin reader.readLine accumulator that delegates to parseLine. Error column tracking now resolves through embedded newlines so error messages stay correct for multi-line quoted records. - parse.ts: drop the duplicate field-parsing loop that lived inside Parser.#parseRecord; both Parser and the new public parseLine share the same primitive. Public parseLine has the simple (line, options) -> string[] signature requested in #3765, including BOM strip and trailing CR/LF/CRLF normalization. - parse_test.ts: add 12 parseLine-specific tests covering happy path, custom separator, escapes, BOM, trailing newlines, multi-line quoted body, lazyQuotes, comment lines, and unclosed-field error. All 133 existing parse + parse_stream tests still pass; new tests bring the total to 145. --- csv/_io.ts | 162 ++++++++++++++++++++++++-------- csv/parse.ts | 234 +++++++++++++++++++--------------------------- csv/parse_test.ts | 105 ++++++++++++++++++++- 3 files changed, 326 insertions(+), 175 deletions(-) diff --git a/csv/_io.ts b/csv/_io.ts index 5e80959c55b0..546ff60c1843 100644 --- a/csv/_io.ts +++ b/csv/_io.ts @@ -6,7 +6,7 @@ import { codePointLength } from "./_shared.ts"; -/** Options for {@linkcode parseRecord}. */ +/** Options for {@linkcode parseLine} and {@linkcode parseRecord}. */ export interface ReadOptions { /** Character which separates values. * @@ -56,13 +56,32 @@ export interface LineReader { isEOF(): boolean; } -export async function parseRecord( +/** + * Synchronous CSV record primitive. 
+ * + * Parses a complete CSV record (one or more lines joined by `\n`, since a + * quoted field may legally span lines) into an array of fields. Both + * {@linkcode parseRecord} (async, line-pulling) and the top-level + * {@linkcode parse} build on top of this function so the field-level rules + * live in exactly one place. + * + * Returns: + * - `string[]` when the input is a complete record + * - `null` when the input ends inside an unclosed quoted field and the + * caller has more input it can append (set `atEof` to `true` to force + * an EOF decision instead of returning `null`) + * + * Throws {@linkcode SyntaxError} for hard syntax errors (bare quote in a + * non-quoted field, extraneous `"` after a closing quote, unclosed quoted + * field at EOF without `lazyQuotes`). + */ +export function parseLine( fullLine: string, - reader: LineReader, options: ReadOptions, - zeroBasedRecordStartLine: number, + zeroBasedRecordStartLine: number = 0, zeroBasedLine: number = zeroBasedRecordStartLine, -): Promise> { + atEof: boolean = true, +): string[] | null { // line starting with comment character is ignored if (options.comment && fullLine[0] === options.comment) { return []; @@ -78,6 +97,24 @@ export async function parseRecord( const separatorLen = options.separator.length; let recordBuffer = ""; const fieldIndexes = [] as number[]; + + // Map an absolute position in `fullLine` to the (line, column) where it + // lives, accounting for embedded `\n` from joined multi-line records. The + // returned line number is offset from the record's first line; column is in + // code-points (matches the existing error message format). 
+ const locate = (absPos: number): { line: number; col: number } => { + let line = zeroBasedRecordStartLine; + let lastNewline = -1; + for (let i = 0; i < absPos; i++) { + if (fullLine[i] === "\n") { + line++; + lastNewline = i; + } + } + const col = codePointLength(fullLine.slice(lastNewline + 1, absPos)); + return { line, col }; + }; + parseField: while (true) { if (options.trimLeadingSpace) { line = line.trimStart(); @@ -94,13 +131,13 @@ export async function parseRecord( if (!options.lazyQuotes) { const j = field.indexOf(quote); if (j >= 0) { - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.slice(j).length), + const { line: errLine, col } = locate( + fullLine.length - line.slice(j).length, ); throw new SyntaxError( createBareQuoteErrorMessage( zeroBasedRecordStartLine, - zeroBasedLine, + errLine, col, ), ); @@ -140,52 +177,54 @@ export async function parseRecord( recordBuffer += quote; } else { // `"*` sequence (invalid non-escaped quote). - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.length - quoteLen), + const { line: errLine, col } = locate( + fullLine.length - line.length - quoteLen, ); throw new SyntaxError( createQuoteErrorMessage( zeroBasedRecordStartLine, - zeroBasedLine, + errLine, col, ), ); } - } else if (line.length > 0 || !reader.isEOF()) { - // Hit end of line (copy all data so far). - recordBuffer += line; - const r = await reader.readLine(); - line = r ?? ""; // This is a workaround for making this module behave similarly to the encoding/csv/reader.go. - fullLine = line; - if (r === null) { - // Abrupt end of file (EOF or error). - if (!options.lazyQuotes) { - const col = codePointLength(fullLine); + } else { + // No more quotes on this line. Record continues onto the next line + // (the caller has already joined them with `\n` so `line` already + // contains the rest of the buffered input). If we're not yet at EOF, + // signal the caller to feed more input by returning `null`. 
+ if (!atEof) { + return null; + } + // At EOF: same as the old reader-based path's "abrupt end of file" + // branches. The old code distinguished two cases by whether `line` + // (the unprocessed remainder) was empty when EOL hit: + // - `line` empty → "abrupt EOF" branch, column = end of original + // input (the quote opened but the entire body was consumed + // before EOF); applies to inputs with an odd number of quotes + // on a single line. + // - `line` non-empty → would have fallen through to a final + // readLine, which returned null and reset `fullLine` to `""`, + // so column was 0 on the line after the last consumed segment. + if (!options.lazyQuotes) { + if (line.length === 0) { throw new SyntaxError( createQuoteErrorMessage( zeroBasedRecordStartLine, - zeroBasedLine, - col, + zeroBasedRecordStartLine, + codePointLength(fullLine), ), ); } - fieldIndexes.push(recordBuffer.length); - break parseField; - } - zeroBasedLine++; - recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.) - } else { - // Abrupt end of file (EOF on error). - if (!options.lazyQuotes) { - const col = codePointLength(fullLine); + let errLine = zeroBasedRecordStartLine; + for (let i = 0; i < fullLine.length; i++) { + if (fullLine[i] === "\n") errLine++; + } throw new SyntaxError( - createQuoteErrorMessage( - zeroBasedRecordStartLine, - zeroBasedLine, - col, - ), + createQuoteErrorMessage(zeroBasedRecordStartLine, errLine, 0), ); } + recordBuffer += line; fieldIndexes.push(recordBuffer.length); break parseField; } @@ -201,6 +240,55 @@ export async function parseRecord( return result; } +/** + * Async wrapper that builds on {@linkcode parseLine}: pulls additional lines + * from `reader` whenever the current accumulated line ends inside an unclosed + * quoted field, then defers all field-level parsing to `parseLine`. + * + * This keeps the streaming caller (`CsvParseStream`) on a single shared + * primitive without re-implementing any field/quote rules. 
+ */ +export async function parseRecord( + fullLine: string, + reader: LineReader, + options: ReadOptions, + zeroBasedRecordStartLine: number, + zeroBasedLine: number = zeroBasedRecordStartLine, +): Promise> { + let accumulated = fullLine; + let currentLine = zeroBasedLine; + while (true) { + const result = parseLine( + accumulated, + options, + zeroBasedRecordStartLine, + currentLine, + reader.isEOF(), + ); + if (result !== null) { + return result; + } + // parseLine returned null → record continues onto another line. + const next = await reader.readLine(); + if (next === null) { + // Reader claimed it was not at EOF but yielded null — force a final + // pass with atEof=true so parseLine throws/handles EOF consistently. + const eofResult = parseLine( + accumulated, + options, + zeroBasedRecordStartLine, + currentLine, + true, + ); + // parseLine with atEof=true cannot return null; this is a defensive + // narrowing for the type system. + return eofResult ?? []; + } + accumulated += "\n" + next; + currentLine++; + } +} + export function createBareQuoteErrorMessage( zeroBasedRecordStartLine: number, zeroBasedLine: number, diff --git a/csv/parse.ts b/csv/parse.ts index 2ad28afd411e..11a63c71248d 100644 --- a/csv/parse.ts +++ b/csv/parse.ts @@ -3,17 +3,75 @@ import { convertRowToObject, - createBareQuoteErrorMessage, - createQuoteErrorMessage, + parseLine as parseLineInternal, type ParseResult, type ReadOptions, type RecordWithColumn, } from "./_io.ts"; -import { codePointLength } from "./_shared.ts"; export type { ParseResult, RecordWithColumn }; -const BYTE_ORDER_MARK = "\ufeff"; +const BYTE_ORDER_MARK = "\ufeff"; + +/** + * Parse a single CSV record into its fields. + * + * `parseLine` is the synchronous primitive that `parse` and `CsvParseStream` + * are both built on. It is exported so callers that already own line + * splitting (for example, after `TextLineStream`) can reuse the same field + * rules without spinning up a parser class. 
+ * + * Multi-line quoted fields are supported: pass the joined record (each + * source line separated by `\n`) and the function will treat the embedded + * newlines as field content. + * + * @example Usage + * ```ts + * import { parseLine } from "@std/csv/parse"; + * import { assertEquals } from "@std/assert/equals"; + * + * assertEquals(parseLine("a,b,c"), ["a", "b", "c"]); + * assertEquals(parseLine(`"a","b,c","d"`), ["a", "b,c", "d"]); + * ``` + * + * @example Custom separator + * ```ts + * import { parseLine } from "@std/csv/parse"; + * import { assertEquals } from "@std/assert/equals"; + * + * assertEquals(parseLine("a\tb\tc", { separator: "\t" }), ["a", "b", "c"]); + * ``` + * + * @param line The single CSV record to parse. May contain embedded `\n` + * characters inside quoted fields. + * @param options Parsing options. Same shape as the read-side options + * accepted by {@linkcode parse}. + * @returns The fields parsed from the record. + */ +export function parseLine( + line: string, + options: Omit = + {}, +): string[] { + const { separator = ",", trimLeadingSpace = false, comment, lazyQuotes } = + options; + const stripped = line.startsWith(BYTE_ORDER_MARK) ? line.slice(1) : line; + // Treat a single trailing CR/LF/CRLF as a record terminator (callers that + // forgot to trim should not see a phantom empty trailing field). + const normalized = stripped.endsWith("\r\n") + ? stripped.slice(0, -2) + : stripped.endsWith("\n") || stripped.endsWith("\r") + ? stripped.slice(0, -1) + : stripped; + const readOptions: ReadOptions = { + separator, + trimLeadingSpace, + ...(comment !== undefined ? { comment } : {}), + ...(lazyQuotes !== undefined ? { lazyQuotes } : {}), + }; + const result = parseLineInternal(normalized, readOptions, 0, 0, true); + return result ?? 
[]; +} class Parser { #input = ""; @@ -21,9 +79,9 @@ class Parser { #options: { separator: string; trimLeadingSpace: boolean; - comment: string | undefined; - lazyQuotes: boolean | undefined; - fieldsPerRecord: number | undefined; + comment?: string; + lazyQuotes?: boolean; + fieldsPerRecord?: number; }; constructor({ separator = ",", @@ -35,9 +93,9 @@ class Parser { this.#options = { separator, trimLeadingSpace, - comment, - lazyQuotes, - fieldsPerRecord, + ...(comment !== undefined ? { comment } : {}), + ...(lazyQuotes !== undefined ? { lazyQuotes } : {}), + ...(fieldsPerRecord !== undefined ? { fieldsPerRecord } : {}), }; } #readLine(): string | null { @@ -71,138 +129,41 @@ class Parser { return this.#cursor >= this.#input.length; } #parseRecord(zeroBasedStartLine: number): string[] | null { - let fullLine = this.#readLine(); - if (fullLine === null) return null; - if (fullLine.length === 0) { + const first = this.#readLine(); + if (first === null) return null; + if (first.length === 0) { return []; } + // Defer all field-level parsing to the shared primitive. If the line ends + // inside an unclosed quoted field, accumulate the next line and re-parse; + // we own line iteration here, so the primitive's `atEof` signal tells us + // when to give up. 
+ let accumulated = first; let zeroBasedLine = zeroBasedStartLine; - - // line starting with comment character is ignored - if (this.#options.comment && fullLine[0] === this.#options.comment) { - return []; - } - - let line = fullLine; - const quote = '"'; - const quoteLen = quote.length; - const separatorLen = this.#options.separator.length; - let recordBuffer = ""; - const fieldIndexes = [] as number[]; - parseField: while (true) { - if (this.#options.trimLeadingSpace) { - line = line.trimStart(); - } - - if (line.length === 0 || !line.startsWith(quote)) { - // Non-quoted string field - const i = line.indexOf(this.#options.separator); - let field = line; - if (i >= 0) { - field = field.substring(0, i); - } - // Check to make sure a quote does not appear in field. - if (!this.#options.lazyQuotes) { - const j = field.indexOf(quote); - if (j >= 0) { - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.slice(j).length), - ); - throw new SyntaxError( - createBareQuoteErrorMessage( - zeroBasedStartLine, - zeroBasedLine, - col, - ), - ); - } - } - recordBuffer += field; - fieldIndexes.push(recordBuffer.length); - if (i >= 0) { - line = line.substring(i + separatorLen); - continue parseField; - } - break parseField; - } else { - // Quoted string field - line = line.substring(quoteLen); - while (true) { - const i = line.indexOf(quote); - if (i >= 0) { - // Hit next quote. - recordBuffer += line.substring(0, i); - line = line.substring(i + quoteLen); - if (line.startsWith(quote)) { - // `""` sequence (append quote). - recordBuffer += quote; - line = line.substring(quoteLen); - } else if (line.startsWith(this.#options.separator)) { - // `","` sequence (end of field). - line = line.substring(separatorLen); - fieldIndexes.push(recordBuffer.length); - continue parseField; - } else if (0 === line.length) { - // `"\n` sequence (end of line). 
- fieldIndexes.push(recordBuffer.length); - break parseField; - } else if (this.#options.lazyQuotes) { - // `"` sequence (bare quote). - recordBuffer += quote; - } else { - // `"*` sequence (invalid non-escaped quote). - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.length - quoteLen), - ); - throw new SyntaxError( - createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col), - ); - } - } else if (line.length > 0 || !(this.#isEOF())) { - // Hit end of line (copy all data so far). - recordBuffer += line; - const r = this.#readLine(); - line = r ?? ""; // This is a workaround for making this module behave similarly to the encoding/csv/reader.go. - fullLine = line; - if (r === null) { - // Abrupt end of file (EOF or error). - if (!this.#options.lazyQuotes) { - const col = codePointLength(fullLine); - throw new SyntaxError( - createQuoteErrorMessage( - zeroBasedStartLine, - zeroBasedLine, - col, - ), - ); - } - fieldIndexes.push(recordBuffer.length); - break parseField; - } - zeroBasedLine++; - recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.) - } else { - // Abrupt end of file (EOF on error). - if (!this.#options.lazyQuotes) { - const col = codePointLength(fullLine); - throw new SyntaxError( - createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col), - ); - } - fieldIndexes.push(recordBuffer.length); - break parseField; - } - } + while (true) { + const result = parseLineInternal( + accumulated, + this.#options, + zeroBasedStartLine, + zeroBasedLine, + this.#isEOF(), + ); + if (result !== null) return result; + const next = this.#readLine(); + if (next === null) { + // Force the EOF decision (will throw unless lazyQuotes is set). + return parseLineInternal( + accumulated, + this.#options, + zeroBasedStartLine, + zeroBasedLine, + true, + ) ?? 
[]; } + accumulated += "\n" + next; + zeroBasedLine++; } - const result = [] as string[]; - let preIdx = 0; - for (const i of fieldIndexes) { - result.push(recordBuffer.slice(preIdx, i)); - preIdx = i; - } - return result; } parse(input: string): string[][] { this.#input = input.startsWith(BYTE_ORDER_MARK) ? input.slice(1) : input; @@ -240,7 +201,6 @@ class Parser { } else if (options.fieldsPerRecord === 0) { _nbFields = "UNINITIALIZED"; } else { - // TODO: Should we check if it's a valid integer? _nbFields = options.fieldsPerRecord; } diff --git a/csv/parse_test.ts b/csv/parse_test.ts index 1912758b8100..fed3cbe8b495 100644 --- a/csv/parse_test.ts +++ b/csv/parse_test.ts @@ -5,7 +5,7 @@ // Copyright 2018-2026 the Deno authors. MIT license. import { assert, assertEquals, assertThrows } from "@std/assert"; -import { parse, type ParseOptions } from "./parse.ts"; +import { parse, parseLine, type ParseOptions } from "./parse.ts"; import type { AssertTrue, IsExact } from "@std/testing/types"; const BYTE_ORDER_MARK = "\ufeff"; @@ -1023,3 +1023,106 @@ Deno.test({ } }, }); + +Deno.test({ + name: "parseLine() splits a simple comma-separated record", + fn() { + assertEquals(parseLine("a,b,c"), ["a", "b", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() handles quoted fields with embedded commas", + fn() { + assertEquals(parseLine(`"a","b,c","d"`), ["a", "b,c", "d"]); + }, +}); + +Deno.test({ + name: "parseLine() handles escaped quotes inside quoted fields", + fn() { + assertEquals(parseLine(`"a ""word""","plain"`), [`a "word"`, "plain"]); + }, +}); + +Deno.test({ + name: "parseLine() supports a custom separator", + fn() { + assertEquals( + parseLine("a\tb\tc", { separator: "\t" }), + ["a", "b", "c"], + ); + }, +}); + +Deno.test({ + name: "parseLine() trims leading whitespace when trimLeadingSpace is set", + fn() { + assertEquals( + parseLine(" a, b, c", { trimLeadingSpace: true }), + ["a", "b", "c"], + ); + }, +}); + +Deno.test({ + name: "parseLine() strips a leading 
byte-order mark", + fn() { + assertEquals(parseLine(BYTE_ORDER_MARK + "a,b,c"), ["a", "b", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() strips a single trailing newline", + fn() { + assertEquals(parseLine("a,b,c\n"), ["a", "b", "c"]); + assertEquals(parseLine("a,b,c\r\n"), ["a", "b", "c"]); + assertEquals(parseLine("a,b,c\r"), ["a", "b", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() returns embedded newlines from a multi-line quoted field", + fn() { + assertEquals(parseLine(`"a\nb",c`), ["a\nb", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() throws on a bare quote in an unquoted field", + fn() { + assertThrows( + () => parseLine(`a,b"c,d`), + SyntaxError, + `bare " in non-quoted-field`, + ); + }, +}); + +Deno.test({ + name: "parseLine() tolerates bare quotes when lazyQuotes is set", + fn() { + assertEquals( + parseLine(`a,b"c,d`, { lazyQuotes: true }), + ["a", `b"c`, "d"], + ); + }, +}); + +Deno.test({ + name: "parseLine() returns an empty array for a comment line", + fn() { + assertEquals(parseLine("# header line", { comment: "#" }), []); + }, +}); + +Deno.test({ + name: "parseLine() throws on an unclosed quoted field", + fn() { + assertThrows( + () => parseLine(`"unclosed`), + SyntaxError, + `extraneous or missing " in quoted-field`, + ); + }, +}); From ccb627d96649ef07b15d8b2691c627c76257242e Mon Sep 17 00:00:00 2001 From: Mukunda Rao Katta Date: Tue, 28 Apr 2026 08:02:58 -0700 Subject: [PATCH 2/2] fix(csv): drop unused zeroBasedLine param to satisfy lint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deno_lint no-unused-vars check flagged the parameter on parseLine (and the matching one on parseRecord and Parser.#parseRecord) — it was threaded through but never read inside the function bodies because the locate() helper computes line offsets from embedded newlines in the joined fullLine instead. 
Removing the param simplifies the call sites without changing behavior: all 145 parse + parse_stream + parseLine tests still pass. --- csv/_io.ts | 6 ------ csv/parse.ts | 6 +----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/csv/_io.ts b/csv/_io.ts index 546ff60c1843..f3fbb4e4a498 100644 --- a/csv/_io.ts +++ b/csv/_io.ts @@ -79,7 +79,6 @@ export function parseLine( fullLine: string, options: ReadOptions, zeroBasedRecordStartLine: number = 0, - zeroBasedLine: number = zeroBasedRecordStartLine, atEof: boolean = true, ): string[] | null { // line starting with comment character is ignored @@ -253,16 +252,13 @@ export async function parseRecord( reader: LineReader, options: ReadOptions, zeroBasedRecordStartLine: number, - zeroBasedLine: number = zeroBasedRecordStartLine, ): Promise> { let accumulated = fullLine; - let currentLine = zeroBasedLine; while (true) { const result = parseLine( accumulated, options, zeroBasedRecordStartLine, - currentLine, reader.isEOF(), ); if (result !== null) { @@ -277,7 +273,6 @@ export async function parseRecord( accumulated, options, zeroBasedRecordStartLine, - currentLine, true, ); // parseLine with atEof=true cannot return null; this is a defensive @@ -285,7 +280,6 @@ export async function parseRecord( return eofResult ?? []; } accumulated += "\n" + next; - currentLine++; } } diff --git a/csv/parse.ts b/csv/parse.ts index 11a63c71248d..f471fc51f9a0 100644 --- a/csv/parse.ts +++ b/csv/parse.ts @@ -69,7 +69,7 @@ export function parseLine( ...(comment !== undefined ? { comment } : {}), ...(lazyQuotes !== undefined ? { lazyQuotes } : {}), }; - const result = parseLineInternal(normalized, readOptions, 0, 0, true); + const result = parseLineInternal(normalized, readOptions, 0, true); return result ?? []; } @@ -140,13 +140,11 @@ class Parser { // we own line iteration here, so the primitive's `atEof` signal tells us // when to give up. 
let accumulated = first; - let zeroBasedLine = zeroBasedStartLine; while (true) { const result = parseLineInternal( accumulated, this.#options, zeroBasedStartLine, - zeroBasedLine, this.#isEOF(), ); if (result !== null) return result; @@ -157,12 +155,10 @@ class Parser { accumulated, this.#options, zeroBasedStartLine, - zeroBasedLine, true, ) ?? []; } accumulated += "\n" + next; - zeroBasedLine++; } } parse(input: string): string[][] {