From 01bd254c4e983203477e56ed9be098f938c44f2c Mon Sep 17 00:00:00 2001 From: Akshat Shenoi Date: Thu, 4 Jun 2026 20:46:50 +0000 Subject: [PATCH] [SPARK-57135][SQL] Support reading CSV files inside tar archives Adds support for reading CSV files packaged in tar archives (.tar, .tar.gz, .tgz) by streaming each archive entry through the CSV parser without unpacking to disk. Gated behind spark.sql.files.archive.reader.enabled (default false). --- .../sql/catalyst/FileSourceOptions.scala | 8 + .../apache/spark/sql/internal/SQLConf.scala | 10 + sql/core/pom.xml | 4 + .../execution/datasources/ArchiveReader.scala | 239 ++++++++++++++++ .../datasources/csv/CSVDataSource.scala | 93 ++++++- .../datasources/csv/CSVFileFormat.scala | 35 ++- .../datasources/ArchiveReadSuiteBase.scala | 234 ++++++++++++++++ .../datasources/ArchiveReaderSuite.scala | 259 ++++++++++++++++++ .../datasources/CSVArchiveReadBase.scala | 78 ++++++ .../CSVHeaderArchiveReadBase.scala | 52 ++++ .../CSVHeaderTarArchiveReadSuite.scala | 28 ++ .../CSVHeaderlessArchiveReadBase.scala | 43 +++ .../CSVHeaderlessTarArchiveReadSuite.scala | 28 ++ .../datasources/TarArchiveReadBase.scala | 62 +++++ 14 files changed, 1157 insertions(+), 16 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala create mode 100644 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala index 22f84a1cad63d..0747a8045e7d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala @@ -45,6 +45,14 @@ class FileSourceOptions( val ignoreMissingFiles: Boolean = parameters.get(IGNORE_MISSING_FILES).map(_.toBoolean) .getOrElse(SQLConf.get.ignoreMissingFiles) + + /** + * Whether the data source may read tar archives (.tar/.tar.gz/.tgz) by streaming their entries. + * Gated by [[SQLConf.ARCHIVE_FORMAT_READER_ENABLED]] and resolved at construction (on the driver, + * where SQLConf is instantiated) so the value is stable once the options are serialized to + * executors. Only the CSV data source currently honors this. 
+   */
+  val archiveFormatEnabled: Boolean = SQLConf.get.getConf(SQLConf.ARCHIVE_FORMAT_READER_ENABLED)
 }
 
 object FileSourceOptions {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 0aed28e92558f..7971795c29bab 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2695,6 +2695,16 @@ object SQLConf {
     .bytesConf(ByteUnit.BYTE)
     .createWithDefaultString("128MB") // parquet.block.size
 
+  val ARCHIVE_FORMAT_READER_ENABLED = buildConf("spark.sql.files.archive.reader.enabled")
+    .doc("When true, the CSV data source can read tar archives (.tar, .tar.gz, .tgz): each " +
+      "archive is read as a single split and its entries are streamed through the CSV parser " +
+      "(never unpacked to disk), as if the entries were separate CSV files. Only the CSV data " +
+      "source supports reading archives.")
+    .version("5.0.0")
+    .withBindingPolicy(ConfigBindingPolicy.SESSION)
+    .booleanConf
+    .createWithDefault(false)
+
   val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes")
     .internal()
     .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" +
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 1261200a9173c..e6673c9069f42 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -293,6 +293,10 @@
       <artifactId>bcprov-jdk18on</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.bouncycastle</groupId>
       <artifactId>bcpkix-jdk18on</artifactId>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala
new file mode 100644
index 0000000000000..e9737c87e1a2c
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{Closeable, InputStream}
+import java.util.Locale
+import java.util.zip.GZIPInputStream
+
+import scala.util.control.NonFatal
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream}
+import org.apache.commons.io.ByteOrderMark
+import org.apache.commons.io.input.{BOMInputStream, CloseShieldInputStream}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.util.LineReader
+
+import org.apache.spark.TaskContext
+import org.apache.spark.util.HadoopFSUtils
+
+/**
+ * Streaming reader for a single archive file. The archive is opened once and decompressed/unpacked
+ * as a stream -- entries are never materialized to local disk. [[readEntries]] hands each entry's
+ * bytes to a caller-supplied parse function as a bounded [[InputStream]] and concatenates the
+ * per-entry results into a single iterator, advancing to the next entry only once the current one
+ * is fully consumed. At most one entry is in flight at a time, so memory stays bounded regardless
+ * of archive size.
+ *
+ * This is format-agnostic: a data source whose per-file reader can consume an `InputStream` wires
+ * up archive support by calling [[readEntries]] from its read/inference paths and supplying a
+ * `parseEntry` that turns one entry stream into rows (or tokens). Formats that need random access
+ * within a file (e.g. Parquet/ORC footers) cannot use this streaming path.
+ *
+ * A concrete subclass implements [[readEntries]] for a specific archive format. Obtain the reader
+ * for a path via `ArchiveReader(path)`, which selects the implementation by file extension; new
+ * archive formats are added by writing another subclass rather than modifying existing ones.
+ */
+abstract class ArchiveReader(path: Path) {
+
+  /**
+   * Streams the archive entry by entry, applying `parseEntry` to each non-skipped entry's
+   * `(name, stream)` and concatenating the results into a single iterator. The next entry is opened
+   * only once the current entry's iterator is exhausted, so nothing is buffered to disk and at most
+   * one entry's bytes are read at a time. The archive stream is closed when the returned iterator
+   * is exhausted, when [[Closeable.close]] is called on it, and (defensively) on task completion.
+   */
+  def readEntries[T](
+      conf: Configuration)(
+      parseEntry: (String, InputStream) => Iterator[T]): Iterator[T]
+}
+
+object ArchiveReader {
+
+  /**
+   * Whether `path` names an archive this reader can stream. Dispatched purely on the file
+   * extension -- `.tar`, `.tar.gz`, or `.tgz` -- since the bytes are not inspected here.
+   */
+  def isArchivePath(path: Path): Boolean = {
+    val name = path.getName.toLowerCase(Locale.ROOT)
+    name.endsWith(".tar") || name.endsWith(".tar.gz") || name.endsWith(".tgz")
+  }
+
+  /**
+   * Returns the [[ArchiveReader]] implementation for `path`, selected by its file extension. Only
+   * paths for which [[isArchivePath]] is true are supported; new archive formats add a case here.
+   */
+  def apply(path: Path): ArchiveReader = new TarArchiveReader(path)
+
+  /**
+   * Splits one already-decompressed archive entry's bytes into lines. The reusable, format-agnostic
+   * line source for archive entries; the entry stream is not closed here (the reader owns the
+   * underlying stream).
+   *
+   * @param in bytes of one archive entry.
+   * @param lineSeparatorInRead the explicit read line separator, or `None` to detect line breaks.
+   * @return an iterator over the entry's lines as [[Text]], without the trailing separator. The
+   *         same [[Text]] instance is reused for every element, so decode or copy each line
+   *         before calling `hasNext`/`next` again.
+   */
+  def lineIterator(in: InputStream, lineSeparatorInRead: Option[Array[Byte]]): Iterator[Text] = {
+    // A leading byte-order mark is stripped (LineReader does not strip it on its own) so the lines
+    // match the non-archive read path.
+    val bomInputStream = BOMInputStream.builder()
+      .setInputStream(in)
+      .setByteOrderMarks(
+        ByteOrderMark.UTF_8,
+        ByteOrderMark.UTF_16LE,
+        ByteOrderMark.UTF_16BE,
+        ByteOrderMark.UTF_32LE,
+        ByteOrderMark.UTF_32BE)
+      .setInclude(false)
+      .get()
+    val reader = lineSeparatorInRead match {
+      case Some(sep) => new LineReader(bomInputStream, sep)
+      case _ => new LineReader(bomInputStream)
+    }
+    new Iterator[Text] {
+      private val text = new Text()
+      private var finished = false
+      private var hasValue = false
+
+      override def hasNext: Boolean = {
+        if (!finished && !hasValue) {
+          finished = reader.readLine(text) == 0
+          hasValue = !finished
+        }
+        !finished
+      }
+
+      override def next(): Text = {
+        if (!hasNext) throw new NoSuchElementException
+        hasValue = false
+        text
+      }
+    }
+  }
+}
+
+/**
+ * [[ArchiveReader]] for tar archives: plain `.tar`, gzipped `.tar.gz`, and `.tgz`.
+ *
+ * Gzip handling: Hadoop's `CompressionCodecFactory` matches the trailing `.gz` extension and
+ * auto-decompresses `.tar.gz` via `CodecStreams`, so we just wrap that stream in
+ * `TarArchiveInputStream`. `.tgz` is not a registered Hadoop codec extension, so the gzip layer is
+ * unwrapped explicitly here.
+ */
+class TarArchiveReader(path: Path) extends ArchiveReader(path) {
+
+  // Paths Hadoop's codec factory won't auto-decompress: we apply the gzip layer here.
+  private def needsExplicitGunzip: Boolean =
+    path.getName.toLowerCase(Locale.ROOT).endsWith(".tgz")
+
+  /**
+   * Whether an entry is not a real data file and must be skipped: a directory, or a name Spark's
+   * own file listing would filter out. Reusing [[HadoopFSUtils.shouldFilterOutPathName]] (the
+   * `InMemoryFileIndex` filter) keeps archive reads in parity with reading the same entries as
+   * loose files: `.`-prefixed sidecars (macOS `._x`, `.DS_Store`) and `_`-prefixed markers
+   * (`_SUCCESS`, `_committed_*`) are skipped, while data files are kept.
+   */
+  private def shouldSkipEntry(entry: TarArchiveEntry): Boolean = {
+    entry.isDirectory || {
+      val name = entry.getName
+      HadoopFSUtils.shouldFilterOutPathName(name.substring(name.lastIndexOf('/') + 1))
+    }
+  }
+
+  /**
+   * Opens the archive as a tar stream, transparently decompressing `.tar.gz` / `.tgz`. If wrapping
+   * the file stream fails (e.g. `GZIPInputStream` rejects a corrupt gzip header while reading it
+   * in its constructor), the file stream is closed before the error propagates so it is not leaked.
+   */
+  private def openTarStream(conf: Configuration): TarArchiveInputStream = {
+    val base = CodecStreams.createInputStreamWithCloseResource(conf, path)
+    try {
+      val tarBytes = if (needsExplicitGunzip) new GZIPInputStream(base) else base
+      new TarArchiveInputStream(tarBytes)
+    } catch {
+      case NonFatal(e) =>
+        try base.close() catch { case NonFatal(s) => e.addSuppressed(s) }
+        throw e
+    }
+  }
+
+  /**
+   * Wraps the shared tar stream as a view over exactly the current entry's bytes
+   * (`TarArchiveInputStream.read` returns -1 at the entry boundary). [[CloseShieldInputStream]]
+   * ignores `close()`, so a parser closing its input does not close the underlying archive; any
+   * unread remainder of an entry is skipped by `getNextEntry()` when advancing.
+   */
+  private def entryStream(tar: TarArchiveInputStream): InputStream =
+    CloseShieldInputStream.wrap(tar)
+
+  override def readEntries[T](
+      conf: Configuration)(
+      parseEntry: (String, InputStream) => Iterator[T]): Iterator[T] = {
+    val tar = openTarStream(conf)
+    var closed = false
+
+    def cleanup(): Unit = {
+      if (!closed) {
+        closed = true
+        try tar.close() catch { case NonFatal(_) => }
+      }
+    }
+
+    Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => cleanup()))
+
+    new Iterator[T] with Closeable {
+      private var currentIter: Iterator[T] = Iterator.empty
+      private var done = false
+
+      // Closes the current entry's iterator if it holds resources, then drops the reference so an
+      // already-released iterator is never closed a second time.
+      private def releaseCurrent(): Unit = {
+        currentIter match {
+          case c: Closeable => try c.close() catch { case NonFatal(_) => }
+          case _ =>
+        }
+        currentIter = Iterator.empty
+      }
+
+      // Move to the next entry whose iterator has elements (releasing each exhausted entry's
+      // reader and skipping any unread bytes), or mark the stream done once entries run out.
+      // Advancing here -- driven by `hasNext` -- rather than eagerly after producing a row in
+      // `next` is essential for parsers that reuse a single mutable row and look ahead on
+      // `hasNext`: probing the current entry right after returning a row would overwrite that row's
+      // contents before the caller has copied it.
+      private def advance(): Unit = {
+        while (!done && !currentIter.hasNext) {
+          releaseCurrent()
+          var entry = tar.getNextEntry
+          while (entry != null && shouldSkipEntry(entry)) entry = tar.getNextEntry
+          if (entry == null) {
+            done = true
+            cleanup()
+          } else {
+            currentIter = parseEntry(entry.getName, entryStream(tar))
+          }
+        }
+      }
+
+      // Open the first entry eagerly so construction reflects the archive's first entry. If that
+      // fails (e.g. on a corrupt archive) the caller never receives this iterator and so cannot
+      // close it, so release the entry iterator and the archive stream before rethrowing.
+      try advance() catch {
+        case NonFatal(e) =>
+          close()
+          throw e
+      }
+
+      override def hasNext: Boolean = {
+        advance()
+        !done && currentIter.hasNext
+      }
+
+      override def next(): T = {
+        if (!hasNext) throw new NoSuchElementException
+        currentIter.next()
+      }
+
+      override def close(): Unit = {
+        done = true
+        // Release the in-flight entry's iterator too (as `advance` does for exhausted entries)
+        // before closing the archive stream underneath it.
+        releaseCurrent()
+        cleanup()
+      }
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 596edc8beaa34..76a34630a4d5d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.csv
 
-import java.io.{FileNotFoundException, IOException}
+import java.io.{FileNotFoundException, InputStream, IOException}
 import java.nio.charset.{Charset, StandardCharsets}
 
 import scala.util.control.NonFatal
@@ -71,7 +71,14 @@ abstract class CSVDataSource extends Serializable {
     parsedOptions.singleVariantColumn match {
       case Some(columnName) => Some(StructType(Array(StructField(columnName, VariantType))))
       case None =>
-        if (inputPaths.nonEmpty) {
+        if (parsedOptions.archiveFormatEnabled &&
+            inputPaths.exists(f => ArchiveReader.isArchivePath(f.getPath))) {
+          // Schema inference is not yet supported for tar archives. Returning None makes Spark
+          // raise its standard "Unable to infer schema ... It must be specified manually" error
+          // (UNABLE_TO_INFER_SCHEMA), so reading an archive requires an explicit `.schema(...)`.
+          // Inferring a schema by streaming archive entries is planned as a follow-up.
+          None
+        } else if (inputPaths.nonEmpty) {
           Some(infer(sparkSession, inputPaths, parsedOptions))
         } else {
           None
@@ -83,6 +90,46 @@ abstract class CSVDataSource extends Serializable {
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
       parsedOptions: CSVOptions): StructType
+
+  /**
+   * Streams a tar archive (`.tar`/`.tar.gz`/`.tgz`) entry by entry through the CSV parser without
+   * unpacking it to disk. The whole archive is a single split (see `CSVFileFormat.isSplitable`); a
+   * fresh header checker and parser are built per entry so each entry is parsed exactly like a
+   * standalone CSV file -- its header, if any, validated and dropped independently. The
+   * mode-specific implementation turns one entry into rows via `parseStream` / `parseIterator`.
+   *
+   * @param getParser builds a fresh [[UnivocityParser]].
+   * @param getHeaderChecker builds a fresh [[CSVHeaderChecker]] for `(isStartOfFile, source)`.
+   */
+  def readArchive(
+      conf: Configuration,
+      file: PartitionedFile,
+      getParser: () => UnivocityParser,
+      getHeaderChecker: (Boolean, String) => CSVHeaderChecker,
+      requiredSchema: StructType): Iterator[InternalRow]
+
+  /**
+   * Shared driver used by the [[readArchive]] implementations: streams each non-skipped entry's
+   * `(parser, headerChecker, stream)` -- a fresh parser/header checker per entry -- through
+   * `parseEntry`. The header checker `source` (`CSV archive entry: <archive>!/<entry>`) names
+   * the entry in error messages.
+   */
+  protected def streamArchiveEntries(
+      conf: Configuration,
+      file: PartitionedFile,
+      getParser: () => UnivocityParser,
+      getHeaderChecker: (Boolean, String) => CSVHeaderChecker)(
+      parseEntry: (UnivocityParser, CSVHeaderChecker, InputStream) => Iterator[InternalRow])
+    : Iterator[InternalRow] = {
+    ArchiveReader(file.toPath).readEntries(conf) { (entryName, in) =>
+      val headerChecker =
+        getHeaderChecker(true, s"CSV archive entry: ${file.urlEncodedPath}!/$entryName")
+      val parser = getParser()
+      headerChecker.setHeaderForSingleVariantColumn =
+        CSVDataSource.setHeaderForSingleVariantColumn(conf, file, parser)
+      parseEntry(parser, headerChecker, in)
+    }
+  }
 }
 
 object CSVDataSource extends Logging {
@@ -144,6 +191,36 @@ object TextInputCSVDataSource extends CSVDataSource {
     UnivocityParser.parseIterator(lines, parser, headerChecker, requiredSchema)
   }
 
+  override def readArchive(
+      conf: Configuration,
+      file: PartitionedFile,
+      getParser: () => UnivocityParser,
+      getHeaderChecker: (Boolean, String) => CSVHeaderChecker,
+      requiredSchema: StructType): Iterator[InternalRow] =
+    // Stream each tar entry through the line-based parser, treating the entry exactly like a
+    // standalone CSV file (a fresh parser/header checker is built per entry).
+    streamArchiveEntries(conf, file, getParser, getHeaderChecker) { (parser, headerChecker, in) =>
+      UnivocityParser.parseIterator(
+        entryLines(in, parser.options), parser, headerChecker, requiredSchema)
+    }
+
+  /**
+   * Decodes one archive entry's bytes into the same CSV line strings the non-archive [[readFile]]
+   * path feeds to the parser: [[ArchiveReader.lineIterator]] splits the entry into lines (honoring
+   * a custom line separator) and each line is decoded with the configured charset. Like `readFile`,
+   * the decoded lines are fed to `UnivocityParser.parseIterator` without a re-appended terminator.
+   *
+   * @param in bytes of one already-decompressed archive entry; not closed here (the archive owns
+   *           the underlying stream).
+   * @param options CSV options supplying the read line separator and charset.
+   * @return an iterator over the entry's lines.
+   */
+  private def entryLines(in: InputStream, options: CSVOptions): Iterator[String] = {
+    ArchiveReader.lineIterator(in, options.lineSeparatorInRead).map { line =>
+      new String(line.getBytes, 0, line.getLength, options.charset)
+    }
+  }
+
   override def infer(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
@@ -227,6 +304,18 @@ object MultiLineCSVDataSource extends CSVDataSource with Logging {
       requiredSchema)
   }
 
+  override def readArchive(
+      conf: Configuration,
+      file: PartitionedFile,
+      getParser: () => UnivocityParser,
+      getHeaderChecker: (Boolean, String) => CSVHeaderChecker,
+      requiredSchema: StructType): Iterator[InternalRow] =
+    // Stream each tar entry whole through the multi-line parser (a fresh parser/header checker is
+    // built per entry).
+    streamArchiveEntries(conf, file, getParser, getHeaderChecker) { (parser, headerChecker, in) =>
+      UnivocityParser.parseStream(in, parser, headerChecker, requiredSchema)
+    }
+
   override def infer(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
index 77a0c53ae4699..ab570de8d998f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
@@ -44,6 +44,11 @@ case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister {
       options: Map[String, String],
       path: Path): Boolean = {
     val parsedOptions = getCsvOptions(sparkSession, options)
+    // A tar archive is decompressed/unpacked as a sequential stream, so it must be read as a
+    // single split rather than carved into byte ranges.
+    if (parsedOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(path)) {
+      return false
+    }
     CSVDataSource(parsedOptions).isSplitable && super.isSplitable(sparkSession, options, path)
   }
 
@@ -119,24 +124,28 @@ case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister {
         dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
       val actualRequiredSchema = StructType(
         requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
-      val parser = new UnivocityParser(
-        actualDataSchema,
-        actualRequiredSchema,
-        parsedOptions,
-        actualFilters)
       // Use column pruning when specified by Catalyst, except when one or more columns have
       // existence default value(s), since in that case we instruct the CSV parser to disable column
       // pruning and instead read each entire row in order to correctly assign the default value(s).
       val schema = if (isColumnPruningEnabled) actualRequiredSchema else actualDataSchema
-      val isStartOfFile = file.start == 0
-      val headerChecker = new CSVHeaderChecker(
-        schema, parsedOptions, source = s"CSV file: ${file.urlEncodedPath}", isStartOfFile)
-      CSVDataSource(parsedOptions).readFile(
-        conf,
-        file,
-        parser,
-        headerChecker,
-        requiredSchema)
+
+      // Parsers and header checkers are created through these factories, not once up front:
+      // the archive branch hands them to `readArchive`, the plain-file branch calls each once.
+      def newParser(): UnivocityParser =
+        new UnivocityParser(actualDataSchema, actualRequiredSchema, parsedOptions, actualFilters)
+      def getHeaderChecker(isStartOfFile: Boolean, source: String): CSVHeaderChecker =
+        new CSVHeaderChecker(schema, parsedOptions, source, isStartOfFile)
+
+      // A tar archive (always a single split, see `isSplitable`) is streamed entry by entry when
+      // archive reads are enabled; otherwise the file is parsed directly.
+      if (parsedOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(file.toPath)) {
+        CSVDataSource(parsedOptions).readArchive(
+          conf, file, () => newParser(), getHeaderChecker, requiredSchema)
+      } else {
+        val parser = newParser()
+        val headerChecker = getHeaderChecker(file.start == 0, s"CSV file: ${file.urlEncodedPath}")
+        CSVDataSource(parsedOptions).readFile(conf, file, parser, headerChecker, requiredSchema)
+      }
     }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala
new file mode 100644
index 0000000000000..9d0300ec41177
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.File +import java.nio.file.Files + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.sql.{DataFrame, QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.Utils + +/** + * Format- and archive-agnostic end-to-end tests for reading archives of data files through the + * streaming [[ArchiveReader]] path. Entries are streamed (never unpacked to disk), and the central + * contract verified throughout is parity with reading the same files from a directory. + * + * A concrete suite binds the abstract hooks below by mixing in a file-format trait (e.g. + * [[org.apache.spark.sql.execution.datasources.CSVArchiveReadBase]]) and an archive-format trait + * (e.g. [[TarArchiveReadBase]]), so the same tests run for every (file format, archive format) + * pair we support. 
New formats are added by writing the per-format trait once, not by duplicating + * these tests: + * {{{ + * class CSVHeaderTarArchiveReadSuite + * extends ArchiveReadSuiteBase with CSVHeaderArchiveReadBase with TarArchiveReadBase + * }}} + */ +trait ArchiveReadSuiteBase extends QueryTest with SharedSparkSession { + + override def sparkConf: SparkConf = + super.sparkConf.set(SQLConf.ARCHIVE_FORMAT_READER_ENABLED.key, "true") + + import testImplicits._ + + // ----- file-format hooks (bound by e.g. CSVArchiveReadBase) ---------------- + + /** The `DataFrameReader`/`DataFrameWriter` format name, e.g. "csv". */ + protected def format: String + + /** Extension of a single data file of [[format]] inside an archive, e.g. "csv". */ + protected def fileExtension: String + + /** Read options applied to every read (e.g. CSV `header`). */ + protected def readOptions: Map[String, String] + + /** Schema used to read the sample data produced by [[sampleDf]]. */ + protected def readSchema: String + + /** Encodes `df` as the bytes of a single data file of [[format]], honoring `writeOptions`. */ + protected def encodeFile(df: DataFrame, writeOptions: Map[String, String]): Array[Byte] + + /** Encodes `df` as a single data file using only the format's default write options. */ + protected final def encodeFile(df: DataFrame): Array[Byte] = encodeFile(df, Map.empty) + + // ----- archive-format hooks (bound by e.g. TarArchiveReadBase) ------------- + + /** Archive extensions to exercise, e.g. Seq("tar", "tar.gz", "tgz"). The head is the default. */ + protected def archiveExtensions: Seq[String] + + /** Writes `entries` (name -> bytes) into the archive at `dest`; compression follows the ext. */ + protected def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit + + /** Writes bytes that are not a readable archive at `dest` (of [[corruptArchiveExtension]]). 
*/ + protected def writeCorruptArchive(dest: File): Unit + + /** An archive extension whose reader fails on corrupt bytes (used by the corrupt-file tests). */ + protected def corruptArchiveExtension: String + + // ----- helpers ------------------------------------------------------------- + + /** Sample two-column data; the column names line up with [[readSchema]]. */ + protected def sampleDf(rows: (Int, String)*): DataFrame = rows.toDF("id", "name") + + /** Entry file name for the i-th data file in an archive. */ + protected def entryName(i: Int): String = s"part-$i.$fileExtension" + + /** Provides an archive-extensioned path inside a fresh temp dir to `f`. */ + protected def withArchiveFile( + extension: String = archiveExtensions.head)(f: File => Unit): Unit = { + val dir = Utils.createTempDir(namePrefix = "archive-test") + try f(new File(dir, s"archive.$extension")) finally Utils.deleteRecursively(dir) + } + + /** Reads `path` with the format, [[readOptions]] (plus `extraOptions`), and `schema`. */ + protected def read( + path: String, + extraOptions: Map[String, String] = Map.empty, + schema: String = readSchema): DataFrame = + spark.read.format(format).options(readOptions ++ extraOptions).schema(schema).load(path) + + /** + * Writes `entries` both into an archive and as loose files in a directory, then asserts the + * archive read produces exactly the same rows as the directory read. 
+   */
+  protected def assertArchiveMatchesDir(
+      entries: Seq[(String, Array[Byte])],
+      extraOptions: Map[String, String] = Map.empty,
+      schema: String = readSchema): Unit = {
+    withArchiveFile() { packed =>
+      writeArchive(packed, entries)
+      val actual = read(packed.getCanonicalPath, extraOptions, schema)
+      withTempDir { plain =>
+        entries.foreach { case (entry, raw) => Files.write(new File(plain, entry).toPath, raw) }
+        checkAnswer(actual, read(plain.getCanonicalPath, extraOptions, schema).collect().toSeq)
+      }
+    }
+  }
+
+  // ----- tests ---------------------------------------------------------------
+
+  test("read an archive of multiple entries matches the union of the inputs") {
+    archiveExtensions.foreach { ext =>
+      withArchiveFile(ext) { packed =>
+        val frames = Seq(
+          sampleDf((1, "Alice"), (2, "Bob")),
+          sampleDf((3, "Carol")),
+          sampleDf((4, "Dan"), (5, "Eve")))
+        writeArchive(
+          packed, frames.zipWithIndex.map { case (df, idx) => entryName(idx) -> encodeFile(df) })
+        checkAnswer(read(packed.getCanonicalPath), frames.reduce(_ union _))
+      }
+    }
+  }
+
+  test("archive entries parse like a directory of the same files") {
+    val dfs = Seq(sampleDf((1, "Alice"), (2, "Bob")), sampleDf((3, "Carol")))
+    assertArchiveMatchesDir(dfs.zipWithIndex.map { case (df, n) => entryName(n) -> encodeFile(df) })
+  }
+
+  test("column pruning selects a subset of columns") {
+    withArchiveFile() { packed =>
+      val people = sampleDf((1, "Alice"), (2, "Bob"))
+      writeArchive(packed, Seq(entryName(0) -> encodeFile(people)))
+      checkAnswer(read(packed.getCanonicalPath).select("name"), Seq(Row("Alice"), Row("Bob")))
+    }
+  }
+
+  test("multiple entries and multiple loose files under a partitioned dir, plus an empty archive") {
+    withTempDir { root =>
+      val partDir = new File(root, "dt=2024-01-01")
+      assert(partDir.mkdirs())
+
+      val packedDfs = Seq(sampleDf((1, "in-archive-a")), sampleDf((2, "in-archive-b")))
+      val looseDfs = Seq(sampleDf((3, "loose-a")), sampleDf((4, "loose-b")))
+      val ext = archiveExtensions.head
+
+      writeArchive(
+        new File(partDir, s"data.$ext"),
+        packedDfs.zipWithIndex.map { case (df, n) => entryName(n) -> encodeFile(df) })
+      // The empty archive sharing the partition directory must not add any rows.
+      writeArchive(new File(partDir, s"empty.$ext"), Seq.empty)
+      looseDfs.zipWithIndex.foreach { case (df, i) =>
+        Files.write(new File(partDir, s"loose-$i.$fileExtension").toPath, encodeFile(df))
+      }
+
+      val wanted = (packedDfs ++ looseDfs).reduce(_ union _)
+      checkAnswer(read(root.getCanonicalPath).select("id", "name"), wanted)
+    }
+  }
+
+  test("a directory of only empty archives yields no rows") {
+    withTempDir { tmp =>
+      archiveExtensions.foreach { ext =>
+        writeArchive(new File(tmp, s"empty-${ext.replace('.', '_')}.$ext"), Seq.empty)
+      }
+      checkAnswer(read(tmp.getCanonicalPath), Seq.empty[Row])
+    }
+  }
+
+  test("an empty archive yields no rows") {
+    withArchiveFile() { packed =>
+      writeArchive(packed, Seq.empty)
+      checkAnswer(read(packed.getCanonicalPath), Seq.empty[Row])
+    }
+  }
+
+  test("an archive always yields a single partition regardless of size") {
+    withArchiveFile() { packed =>
+      val big = sampleDf((1 to 1000).map(i => (i, s"value-$i")): _*)
+      writeArchive(packed, (0 until 4).map(n => entryName(n) -> encodeFile(big)))
+      withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> "1024") {
+        val readDf = read(packed.getCanonicalPath)
+        assert(readDf.rdd.getNumPartitions == 1,
+          s"archive should be a single partition; got ${readDf.rdd.getNumPartitions}")
+        assert(readDf.count() == 4L * big.count())
+      }
+    }
+  }
+
+  Seq(true, false).foreach { ignoreCorrupt =>
+    test(s"ignoreCorruptFiles=$ignoreCorrupt controls whether a corrupt archive is skipped") {
+      withArchiveFile(corruptArchiveExtension) { broken =>
+        writeCorruptArchive(broken)
+        withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> ignoreCorrupt.toString) {
+          if (ignoreCorrupt) {
+            checkAnswer(read(broken.getCanonicalPath), Seq.empty[Row])
+          } else {
+            intercept[SparkException](read(broken.getCanonicalPath).collect())
+          }
+        }
+      }
+    }
+  }
+
+  test("a corrupt archive among good ones is skipped whole, not per entry (ignoreCorruptFiles)") {
+    withTempDir { tmp =>
+      val intact = sampleDf((1, "Alice"), (2, "Bob"))
+      writeArchive(new File(tmp, s"good.${archiveExtensions.head}"),
+        Seq(entryName(0) -> encodeFile(intact)))
+      writeCorruptArchive(new File(tmp, s"bad.$corruptArchiveExtension"))
+      // Corrupt handling works per archive because a tar is a single non-splittable unit: the
+      // broken archive is dropped as a whole, and the intact archive still contributes its rows.
+      withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") {
+        checkAnswer(read(tmp.getCanonicalPath), intact)
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala
new file mode 100644
index 0000000000000..48482aa1f5b3e
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{ByteArrayOutputStream, Closeable, File, FileOutputStream, InputStream, OutputStream}
+import java.nio.charset.StandardCharsets
+import java.util.Properties
+import java.util.zip.GZIPOutputStream
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.{SparkFunSuite, TaskContext, TaskContextImpl}
+
+/**
+ * Unit tests for the streaming [[ArchiveReader]] core: `isArchivePath` dispatch and `readEntries`
+ * (entry ordering, gzip handling, dir/dotfile skipping, lazy advance, the non-closing entry
+ * stream, and cleanup). Tars live in a temp dir but are never unpacked -- entries are streamed.
+ */
+class ArchiveReaderSuite extends SparkFunSuite {
+
+  private case class Entry(name: String, data: Array[Byte], isDir: Boolean = false)
+
+  private def writeTar(file: File, entries: Seq[Entry]): Unit =
+    writeTarTo(new FileOutputStream(file), entries)
+
+  /** Same as `writeTar` but gzip-compressed, for the `.tar.gz` / `.tgz` archive paths. */
+  private def writeTarGz(file: File, entries: Seq[Entry]): Unit =
+    writeTarTo(new GZIPOutputStream(new FileOutputStream(file)), entries)
+
+  private def writeTarTo(rawOut: OutputStream, entries: Seq[Entry]): Unit = {
+    val tarOut = new TarArchiveOutputStream(rawOut)
+    try {
+      entries.foreach { spec =>
+        // A trailing slash in the name is how TarArchiveEntry recognizes a directory.
+        val path = if (spec.isDir && !spec.name.endsWith("/")) spec.name + "/" else spec.name
+        val tarEntry = new TarArchiveEntry(path)
+        if (!spec.isDir) tarEntry.setSize(spec.data.length.toLong)
+        tarOut.putArchiveEntry(tarEntry)
+        if (!spec.isDir) tarOut.write(spec.data)
+        tarOut.closeArchiveEntry()
+      }
+      tarOut.finish()
+    } finally tarOut.close()
+  }
+
+  private def textEntry(name: String, body: String): Entry =
+    Entry(name, body.getBytes(StandardCharsets.UTF_8))
+
+  private def readAll(in: InputStream): Array[Byte] = {
+    val sink = new ByteArrayOutputStream()
+    val chunk = new Array[Byte](4096)
+    // Keep copying until read() reports a negative count, i.e. end of stream.
+    Iterator.continually(in.read(chunk))
+      .takeWhile(_ >= 0)
+      .foreach(count => sink.write(chunk, 0, count))
+    sink.toByteArray
+  }
+
+  /** Runs `ArchiveReader.readEntries` to exhaustion, returning `(name, decodedText)` per entry. */
+  private def collect(file: File): Seq[(String, String)] =
+    ArchiveReader(new Path(file.toURI)).readEntries(new Configuration()) { (entry, stream) =>
+      Iterator.single((entry, new String(readAll(stream), StandardCharsets.UTF_8)))
+    }.toList
+
+  // ----- isArchivePath ------------------------------------------------------
+
+  test("isArchivePath: positive cases") {
+    Seq(
+      "foo.tar", "FOO.TAR", "/a/b/c/x.tar", "weird.TaR",
+      "foo.tar.gz", "FOO.TAR.GZ", "mixed.Tar.Gz", "/a/b/c/x.tar.gz",
+      "foo.tgz", "FOO.TGZ", "/a/b/c/x.tgz"
+    ).foreach { p =>
+      assert(ArchiveReader.isArchivePath(new Path(p)), s"expected archive match for $p")
+    }
+  }
+
+  test("isArchivePath: negative cases") {
+    Seq("foo.csv", "foo.gz", "foo", "dir/", "foo.tarball", "data.zip",
+      "foo.tar.bz2", "foo.targz").foreach { p =>
+      assert(!ArchiveReader.isArchivePath(new Path(p)), s"expected non-match for $p")
+    }
+  }
+
+  // ----- readEntries --------------------------------------------------------
+
+  test("readEntries: empty tar yields empty iterator") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "empty.tar")
+      writeTar(tar, Seq.empty)
+      assert(collect(tar) == Nil)
+    }
+  }
+
+  test("readEntries: single entry exposes its name and bytes") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "single.tar")
+      writeTar(tar, Seq(textEntry("only.csv", "hello\n")))
+      assert(collect(tar) == Seq("only.csv" -> "hello\n"))
+    }
+  }
+
+  test("readEntries: multiple entries chained in tar order") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "multi.tar")
+      writeTar(tar, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"), textEntry("c.csv", "c")))
+      assert(collect(tar) == Seq("a.csv" -> "a", "b.csv" -> "b", "c.csv" -> "c"))
+    }
+  }
+
+  test("readEntries: gzipped tar (.tar.gz) via Hadoop codec factory") {
+    withTempDir { tmp =>
+      val tarGz = new File(tmp, "data.tar.gz")
+      writeTarGz(tarGz, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+      assert(collect(tarGz) == Seq("a.csv" -> "a", "b.csv" -> "b"))
+    }
+  }
+
+  test("readEntries: gzipped tar (.tgz) via explicit GZIPInputStream wrap") {
+    withTempDir { tmp =>
+      val tgz = new File(tmp, "data.tgz")
+      writeTarGz(tgz, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+      assert(collect(tgz) == Seq("a.csv" -> "a", "b.csv" -> "b"))
+    }
+  }
+
+  test("readEntries: directory entries are skipped") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "dirs.tar")
+      writeTar(tar, Seq(
+        Entry("subdir", Array.emptyByteArray, isDir = true),
+        textEntry("subdir/data.csv", "x")))
+      assert(collect(tar) == Seq("subdir/data.csv" -> "x"))
+    }
+  }
+
+  test("readEntries: dotfile entries (e.g. macOS ._foo) are skipped") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "dots.tar")
+      writeTar(tar, Seq(
+        textEntry("._real.csv", "junk"), // AppleDouble sidecar as produced on macOS
+        textEntry(".hidden", "ignored"), // plain dotfile at the archive root
+        textEntry("real.csv", "kept"),
+        textEntry("nested/._sidecar", "junk2"))) // dotfile below a subdirectory
+      assert(collect(tar) == Seq("real.csv" -> "kept"))
+    }
+  }
+
+  test("readEntries: advances lazily, one entry at a time") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "lazy.tar")
+      writeTar(tar, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"), textEntry("c.csv", "c")))
+
+      val parsed = ArrayBuffer[String]()
+      // Each parseEntry call records its entry and yields one element without touching the
+      // stream, so every output element corresponds to exactly one opened entry.
+      val it = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) { (entry, _) =>
+        parsed += entry
+        Iterator.single(entry)
+      }
+
+      // Only the first entry is parsed at construction; the remaining ones are opened on
+      // demand as iteration moves past each entry boundary (never all upfront).
+      assert(parsed.toList == List("a.csv"))
+      assert(it.hasNext)
+      assert(it.next() == "a.csv")
+      // Handing out entry 0's element does not advance: entry 1 opens only on the next request.
+      assert(parsed.toList == List("a.csv"))
+      assert(it.next() == "b.csv")
+      assert(parsed.toList == List("a.csv", "b.csv"))
+      assert(it.next() == "c.csv")
+      assert(parsed.toList == List("a.csv", "b.csv", "c.csv"))
+      assert(!it.hasNext)
+      assert(parsed.size == 3)
+    }
+  }
+
+  test("readEntries: a parseEntry that closes its stream still advances to the next entry") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "close.tar")
+      writeTar(tar, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+
+      val bodies = ArrayBuffer[String]()
+      val it = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) { (entry, in) =>
+        val text = new String(readAll(in), StandardCharsets.UTF_8)
+        in.close() // closing the entry stream must leave the underlying archive open
+        bodies += text
+        Iterator.single(entry)
+      }
+      assert(it.toList == List("a.csv", "b.csv"))
+      assert(bodies.toList == List("a", "b"))
+    }
+  }
+
+  test("readEntries: close() is safe, idempotent, and stops iteration") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "closeable.tar")
+      writeTar(tar, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+
+      val it = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) { (entry, _) =>
+        Iterator.single(entry)
+      }
+      assert(it.hasNext)
+      val closeable = it.asInstanceOf[Closeable]
+      closeable.close()
+      closeable.close() // a second close must be a harmless no-op
+      assert(!it.hasNext)
+    }
+  }
+
+  test("readEntries: TaskContext completion cleans up without error") {
+    withTempDir { tmp =>
+      val tar = new File(tmp, "ctx.tar")
+      writeTar(tar, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+
+      val taskCtx = new TaskContextImpl(
+        stageId = 0,
+        stageAttemptNumber = 0,
+        partitionId = 0,
+        taskAttemptId = 1L,
+        attemptNumber = 0,
+        numPartitions = 0,
+        taskMemoryManager = null,
+        localProperties = new Properties,
+        metricsSystem = null,
+        cpus = 1)
+      TaskContext.setTaskContext(taskCtx)
+      try {
+        val it = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) { (name, _) =>
+          Iterator.single(name)
+        }
+        assert(it.hasNext)
+        it.next() // consume one of the two entries, leaving the iterator mid-archive
+        // Complete the task while the iterator is neither exhausted nor explicitly closed.
+        taskCtx.markTaskCompleted(None)
+      } finally {
+        TaskContext.unset()
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala
new file mode 100644
index 0000000000000..93dc8db5750f0
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.util.Utils
+
+/**
+ * Binds [[ArchiveReadSuiteBase]]'s file-format hooks to CSV. The header-mode-specific tests live in
+ * the [[CSVHeaderArchiveReadBase]] and [[CSVHeaderlessArchiveReadBase]] sub-traits, so the shared
+ * archive tests from [[ArchiveReadSuiteBase]] run for both modes.
+ */
+trait CSVArchiveReadBase extends ArchiveReadSuiteBase {
+
+  /** True when the archived CSV files are both written and read with a header row. */
+  protected def header: Boolean
+
+  override protected def format: String = "csv"
+
+  override protected def fileExtension: String = "csv"
+
+  override protected def readOptions: Map[String, String] = Map("header" -> header.toString)
+
+  override protected def readSchema: String = "id INT, name STRING"
+
+  override protected def encodeFile(
+      df: DataFrame,
+      writeOptions: Map[String, String]): Array[Byte] = {
+    val outDir = Utils.createTempDir(namePrefix = "archive-test-encode")
+    try {
+      df.coalesce(1).write.format("csv")
+        .options(Map("header" -> header.toString) ++ writeOptions)
+        .mode("overwrite").save(outDir.getCanonicalPath)
+      val parts = outDir.listFiles().filter { file =>
+        val n = file.getName
+        file.isFile && !(n.startsWith("_") || n.startsWith(".") || n.endsWith(".crc"))
+      }
+      assert(parts.length == 1,
+        s"expected exactly one data file, got: ${parts.map(_.getName).toList}")
+      Files.readAllBytes(parts.head.toPath)
+    } finally Utils.deleteRecursively(outDir)
+  }
+
+  /** UTF-8 bytes of a literal CSV document, for tests that must control the exact row layout. */
+  protected def csvBytes(s: String): Array[Byte] = s.getBytes(StandardCharsets.UTF_8)
+
+  test("CSV: reading an archive without a schema fails (inference not yet supported)") {
+    // Archives do not support schema inference yet (a follow-up), so callers must pass a schema;
+    // trying to infer one surfaces Spark's standard UNABLE_TO_INFER_SCHEMA error.
+    withArchiveFile() { packed =>
+      writeArchive(packed, Seq(entryName(0) -> encodeFile(sampleDf((1, "Alice"), (2, "Bob")))))
+      val e = intercept[AnalysisException] {
+        spark.read.format(format).options(readOptions).load(packed.getCanonicalPath)
+      }
+      assert(e.getCondition == "UNABLE_TO_INFER_SCHEMA",
+        s"expected UNABLE_TO_INFER_SCHEMA, got ${e.getCondition}: ${e.getMessage}")
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala
new file mode 100644
index 0000000000000..32ad6e818076f
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * [[CSVArchiveReadBase]] reading CSV files that carry a header row, plus header-specific archive
+ * tests (mismatched headers, and delimiter/multiline cases whose first row is a header).
+ */
+trait CSVHeaderArchiveReadBase extends CSVArchiveReadBase {
+
+  import testImplicits._
+
+  override protected def header: Boolean = true
+
+  test("CSV: entries with mismatched headers behave like standalone files") {
+    val matching = entryName(0) -> encodeFile(sampleDf((1, "Alice"), (2, "Bob")))
+    // The second entry names its second column "nickname", so the schema's "name" column is
+    // absent from that entry.
+    val renamed = entryName(1) -> encodeFile(Seq((3, "Carol")).toDF("id", "nickname"))
+    assertArchiveMatchesDir(Seq(matching, renamed))
+  }
+
+  test("CSV: custom delimiter matches a directory read") {
+    // Header row and data rows are all ';'-separated.
+    val semicolonCsv = Seq("a.csv" -> csvBytes("id;name\n1;Alice\n2;Bob\n"))
+    assertArchiveMatchesDir(semicolonCsv, extraOptions = Map("delimiter" -> ";"))
+  }
+
+  test("CSV: multiline quoted fields with embedded newlines match a directory read") {
+    // Quoted `note` values span several physical lines inside each entry.
+    val entries = Seq(
+      "a.csv" -> csvBytes("id,note\n1,\"line1\nline2\"\n2,\"plain\"\n"),
+      "b.csv" -> csvBytes("id,note\n3,\"a\nb\nc\"\n"))
+    assertArchiveMatchesDir(
+      entries, extraOptions = Map("multiLine" -> "true"), schema = "id INT, note STRING")
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala
new file mode 100644
index 0000000000000..7ac2ad4084aa0
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Concrete suite for header-carrying CSV files packed in tar archives (`.tar`/`.tar.gz`/`.tgz`).
+ * It runs the shared archive tests from [[ArchiveReadSuiteBase]] and the header-mode CSV tests
+ * from [[CSVHeaderArchiveReadBase]] against tar containers written by [[TarArchiveReadBase]].
+ */
+class CSVHeaderTarArchiveReadSuite
+  extends ArchiveReadSuiteBase
+  with CSVHeaderArchiveReadBase
+  with TarArchiveReadBase
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala
new file mode 100644
index 0000000000000..ba4d7f63bb464
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Headerless flavor of [[CSVArchiveReadBase]]: columns are matched by position. Adds headerless
+ * delimiter and multiline archive tests; the common headerless read paths are covered by the
+ * shared tests inherited from [[ArchiveReadSuiteBase]].
+ */
+trait CSVHeaderlessArchiveReadBase extends CSVArchiveReadBase {
+
+  override protected def header: Boolean = false
+
+  test("CSV: headerless custom delimiter matches a directory read") {
+    // Both entries are ';'-separated and start directly with data rows.
+    val entries = Seq("a.csv" -> csvBytes("1;Alice\n2;Bob\n"), "b.csv" -> csvBytes("3;Carol\n"))
+    assertArchiveMatchesDir(entries, extraOptions = Map("delimiter" -> ";"))
+  }
+
+  test("CSV: headerless multiline quoted fields with embedded newlines match a directory read") {
+    // Quoted second-column values span several physical lines inside each entry.
+    val entries = Seq(
+      "a.csv" -> csvBytes("1,\"line1\nline2\"\n2,\"plain\"\n"),
+      "b.csv" -> csvBytes("3,\"a\nb\nc\"\n"))
+    assertArchiveMatchesDir(
+      entries, extraOptions = Map("multiLine" -> "true"), schema = "id INT, note STRING")
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala
new file mode 100644
index 0000000000000..28316c97ca8c9
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Concrete suite for headerless CSV files packed in tar archives (`.tar`/`.tar.gz`/`.tgz`).
+ * It runs the shared archive tests from [[ArchiveReadSuiteBase]] and the headerless CSV tests
+ * from [[CSVHeaderlessArchiveReadBase]] against tar containers written by [[TarArchiveReadBase]].
+ */
+class CSVHeaderlessTarArchiveReadSuite
+  extends ArchiveReadSuiteBase
+  with CSVHeaderlessArchiveReadBase
+  with TarArchiveReadBase
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala
new file mode 100644
index 0000000000000..60a73b0891bae
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{File, FileOutputStream, OutputStream}
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+import java.util.Locale
+import java.util.zip.GZIPOutputStream
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream}
+
+/**
+ * Tar-container binding for [[ArchiveReadSuiteBase]]'s archive-format hooks: plain `.tar` plus
+ * gzipped `.tar.gz` and `.tgz`. Independent of the file format -- a concrete
+ * `TarArchiveReadSuite` mixes it in next to the file-format trait.
+ */
+trait TarArchiveReadBase extends ArchiveReadSuiteBase {
+
+  override protected def archiveExtensions: Seq[String] = Seq("tar", "tar.gz", "tgz")
+
+  override protected def corruptArchiveExtension: String = "tar.gz"
+
+  override protected def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit = {
+    // Gzip is chosen from the (case-insensitive) file name: anything ending in .gz or .tgz.
+    val lowered = dest.getName.toLowerCase(Locale.ROOT)
+    val gzipped = lowered.endsWith(".gz") || lowered.endsWith(".tgz")
+    val fileOut = new FileOutputStream(dest)
+    val sink: OutputStream = if (gzipped) new GZIPOutputStream(fileOut) else fileOut
+    val tarOut = new TarArchiveOutputStream(sink)
+    try {
+      entries.foreach { case (path, payload) =>
+        val tarEntry = new TarArchiveEntry(path)
+        tarEntry.setSize(payload.length.toLong)
+        tarOut.putArchiveEntry(tarEntry)
+        tarOut.write(payload)
+        tarOut.closeArchiveEntry()
+      }
+      tarOut.finish()
+    } finally tarOut.close()
+  }
+
+  override protected def writeCorruptArchive(dest: File): Unit = {
+    val garbage = "this is not a valid gzip-compressed tar archive"
+    Files.write(dest.toPath, garbage.getBytes(StandardCharsets.UTF_8))
+  }
+}