diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala
index 22f84a1cad63d..0747a8045e7d2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala
@@ -45,6 +45,14 @@ class FileSourceOptions(
val ignoreMissingFiles: Boolean = parameters.get(IGNORE_MISSING_FILES).map(_.toBoolean)
.getOrElse(SQLConf.get.ignoreMissingFiles)
+
+ /**
+ * Whether the data source may read tar archives (.tar/.tar.gz/.tgz) by streaming their entries.
+ * Gated by [[SQLConf.ARCHIVE_FORMAT_READER_ENABLED]] and resolved at construction (on the driver,
+ * where SQLConf is instantiated) so the value is stable once the options are serialized to
+ * executors. Only the CSV data source currently honors this.
+ *
+ * Unlike [[ignoreMissingFiles]], the value comes from the session conf alone: `parameters` is
+ * not consulted, so it cannot be overridden by a per-read option.
+ */
+ val archiveFormatEnabled: Boolean = SQLConf.get.getConf(SQLConf.ARCHIVE_FORMAT_READER_ENABLED)
}
object FileSourceOptions {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 0aed28e92558f..7971795c29bab 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2695,6 +2695,16 @@ object SQLConf {
.bytesConf(ByteUnit.BYTE)
.createWithDefaultString("128MB") // parquet.block.size
+ // Feature flag for streaming tar archives through the CSV reader. It is a boolean conf that
+ // defaults to false and is bound with the SESSION policy.
+ val ARCHIVE_FORMAT_READER_ENABLED = buildConf("spark.sql.files.archive.reader.enabled")
+ .doc("When true, the CSV data source can read tar archives (.tar, .tar.gz, .tgz): each " +
+ "archive is read as a single split and its entries are streamed through the CSV parser " +
+ "(never unpacked to disk), as if the entries were separate CSV files. Only the CSV data " +
+ "source supports reading archives.")
+ .version("5.0.0")
+ .withBindingPolicy(ConfigBindingPolicy.SESSION)
+ .booleanConf
+ .createWithDefault(false)
+
val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes")
.internal()
.doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" +
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 1261200a9173c..e6673c9069f42 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -293,6 +293,10 @@
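       <artifactId>bcprov-jdk18on</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.bouncycastle</groupId>
       <artifactId>bcpkix-jdk18on</artifactId>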
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala
new file mode 100644
index 0000000000000..e9737c87e1a2c
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{Closeable, InputStream}
+import java.util.Locale
+import java.util.zip.GZIPInputStream
+
+import scala.util.control.NonFatal
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream}
+import org.apache.commons.io.ByteOrderMark
+import org.apache.commons.io.input.{BOMInputStream, CloseShieldInputStream}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.util.LineReader
+
+import org.apache.spark.TaskContext
+import org.apache.spark.util.HadoopFSUtils
+
+/**
+ * Streaming reader for a single archive file. The archive is opened once and decompressed/unpacked
+ * as a stream -- entries are never materialized to local disk. [[readEntries]] hands each entry's
+ * bytes to a caller-supplied parse function as a bounded [[InputStream]] and concatenates the
+ * per-entry results into a single iterator, advancing to the next entry only once the current one
+ * is fully consumed. At most one entry is in flight at a time, so memory stays bounded regardless
+ * of archive size.
+ *
+ * This is format-agnostic: a data source whose per-file reader can consume an `InputStream` wires
+ * up archive support by calling [[readEntries]] from its read/inference paths and supplying a
+ * `parseEntry` that turns one entry stream into rows (or tokens). Formats that need random access
+ * within a file (e.g. Parquet/ORC footers) cannot use this streaming path.
+ *
+ * A concrete subclass implements [[readEntries]] for a specific archive format. Obtain the reader
+ * for a path via `ArchiveReader(path)`, which selects the implementation by file extension; new
+ * archive formats are added by writing another subclass rather than modifying existing ones.
+ *
+ * @param path location of the archive file that this reader streams.
+ */
+abstract class ArchiveReader(path: Path) {
+
+ /**
+ * Streams the archive entry by entry, applying `parseEntry` to each non-skipped entry's
+ * `(name, stream)` and concatenating the results into a single iterator. The next entry is opened
+ * only once the current entry's iterator is exhausted, so nothing is buffered to disk and at most
+ * one entry's bytes are read at a time. The archive stream is closed when the returned iterator
+ * is exhausted, when [[Closeable.close]] is called on it, and (defensively) on task completion.
+ *
+ * @param conf Hadoop configuration used to open the archive.
+ * @param parseEntry turns one entry, given as `(entry name, entry bytes)`, into an iterator of
+ * results. The stream covers only that entry and is valid only until the next
+ * entry is opened, so the returned iterator must be consumed before then.
+ * @tparam T element type produced by `parseEntry`.
+ * @return the results of every non-skipped entry, concatenated in archive order.
+ */
+ def readEntries[T](
+ conf: Configuration)(
+ parseEntry: (String, InputStream) => Iterator[T]): Iterator[T]
+}
+
+object ArchiveReader {
+
+  /**
+   * Tells whether `path` looks like an archive this reader can stream. Only the file name is
+   * examined (case-insensitively) for a `.tar`, `.tar.gz` or `.tgz` suffix; the file's bytes are
+   * never read.
+   */
+  def isArchivePath(path: Path): Boolean = {
+    val lowered = path.getName.toLowerCase(Locale.ROOT)
+    Seq(".tar", ".tar.gz", ".tgz").exists(suffix => lowered.endsWith(suffix))
+  }
+
+  /**
+   * Builds the [[ArchiveReader]] for `path`. Callers are expected to pass only paths accepted by
+   * [[isArchivePath]]; tar is the single supported family today, so a tar reader is returned
+   * unconditionally. Further archive formats would be dispatched from here.
+   */
+  def apply(path: Path): ArchiveReader = {
+    new TarArchiveReader(path)
+  }
+
+  /**
+   * Turns the bytes of one already-decompressed archive entry into a sequence of lines. This is
+   * the shared line source for archive entries. The supplied stream is left open, because the
+   * archive reader owns the underlying stream.
+   *
+   * The returned iterator hands out the same [[Text]] instance for every line, overwriting it on
+   * each read, so a caller must copy or decode a line before asking for the next one.
+   *
+   * @param in bytes of one archive entry.
+   * @param lineSeparatorInRead the explicit read line separator, or `None` to detect line breaks.
+   * @return an iterator over the entry's lines as [[Text]], without the trailing separator.
+   */
+  def lineIterator(in: InputStream, lineSeparatorInRead: Option[Array[Byte]]): Iterator[Text] = {
+    // LineReader keeps a leading byte-order mark in the first line, so drop it up front to
+    // stay aligned with the non-archive read path.
+    val withoutBom = BOMInputStream.builder()
+      .setInputStream(in)
+      .setByteOrderMarks(
+        ByteOrderMark.UTF_8,
+        ByteOrderMark.UTF_16LE,
+        ByteOrderMark.UTF_16BE,
+        ByteOrderMark.UTF_32LE,
+        ByteOrderMark.UTF_32BE)
+      .setInclude(false)
+      .get()
+    val lines = lineSeparatorInRead
+      .map(separator => new LineReader(withoutBom, separator))
+      .getOrElse(new LineReader(withoutBom))
+
+    new Iterator[Text] {
+      // Reused for every line; `pending` says it holds a line that was read but not yet returned.
+      private val buffer = new Text()
+      private var pending = false
+      private var exhausted = false
+
+      override def hasNext: Boolean = {
+        if (!pending && !exhausted) {
+          // readLine reports the number of bytes consumed; zero means end of the entry.
+          if (lines.readLine(buffer) == 0) {
+            exhausted = true
+          } else {
+            pending = true
+          }
+        }
+        !exhausted
+      }
+
+      override def next(): Text = {
+        if (!hasNext) throw new NoSuchElementException
+        pending = false
+        buffer
+      }
+    }
+  }
+}
+
+/**
+ * [[ArchiveReader]] for tar archives: plain `.tar`, gzipped `.tar.gz`, and `.tgz`.
+ *
+ * Gzip handling: Hadoop's `CompressionCodecFactory` matches the trailing `.gz` extension and
+ * auto-decompresses `.tar.gz` via `CodecStreams`, so we just wrap that stream in
+ * `TarArchiveInputStream`. `.tgz` is not a registered Hadoop codec extension, so the gzip layer is
+ * unwrapped explicitly here.
+ *
+ * Resource handling: the underlying file stream is released as soon as opening or advancing the
+ * archive fails, not only on exhaustion, `close()` or task completion. This matters where no
+ * [[TaskContext]] exists (e.g. on the driver), because nothing else would close it there.
+ *
+ * @param path location of the tar archive to stream.
+ */
+class TarArchiveReader(path: Path) extends ArchiveReader(path) {
+
+  // Paths Hadoop's codec factory won't auto-decompress: we apply the gzip layer here.
+  private def needsExplicitGunzip: Boolean =
+    path.getName.toLowerCase(Locale.ROOT).endsWith(".tgz")
+
+  /**
+   * Whether an entry is not a real data file and must be skipped: a directory, or a name Spark's
+   * own file listing would filter out. Reusing [[HadoopFSUtils.shouldFilterOutPathName]] (the
+   * `InMemoryFileIndex` filter) keeps archive reads in parity with reading the same entries as
+   * loose files: `.`-prefixed sidecars (macOS `._x`, `.DS_Store`) and `_`-prefixed markers
+   * (`_SUCCESS`, `_committed_*`) are skipped, while data files are kept. Only the last path
+   * segment of the entry name is tested.
+   */
+  private def shouldSkipEntry(entry: TarArchiveEntry): Boolean = {
+    def basename: String = {
+      val name = entry.getName
+      name.substring(name.lastIndexOf('/') + 1)
+    }
+    entry.isDirectory || HadoopFSUtils.shouldFilterOutPathName(basename)
+  }
+
+  /**
+   * Opens the archive as a tar stream, transparently decompressing `.tar.gz` / `.tgz`.
+   *
+   * `GZIPInputStream` reads and validates the gzip header in its constructor, so a corrupt or
+   * truncated `.tgz` fails right here. The already-opened file stream is closed before the
+   * failure is rethrown; otherwise it would leak, since no wrapper owning it was ever returned.
+   */
+  private def openTarStream(conf: Configuration): TarArchiveInputStream = {
+    val base = CodecStreams.createInputStreamWithCloseResource(conf, path)
+    try {
+      val tarBytes = if (needsExplicitGunzip) new GZIPInputStream(base) else base
+      new TarArchiveInputStream(tarBytes)
+    } catch {
+      case NonFatal(e) =>
+        try base.close() catch { case NonFatal(closeError) => e.addSuppressed(closeError) }
+        throw e
+    }
+  }
+
+  /**
+   * Wraps the shared tar stream as a view over exactly the current entry's bytes
+   * (`TarArchiveInputStream.read` returns -1 at the entry boundary). [[CloseShieldInputStream]]
+   * ignores `close()`, so a parser closing its input does not close the underlying archive; any
+   * unread remainder of an entry is skipped by `getNextEntry()` when advancing.
+   */
+  private def entryStream(tar: TarArchiveInputStream): InputStream =
+    CloseShieldInputStream.wrap(tar)
+
+  /**
+   * Streams the tar archive entry by entry through `parseEntry`; see
+   * [[ArchiveReader.readEntries]] for the contract.
+   *
+   * The first entry is opened eagerly, so a corrupt archive may fail in this call rather than on
+   * the first `hasNext`. The archive stream is closed on exhaustion, on `close()` of the returned
+   * iterator, on task completion, and whenever advancing through the archive throws.
+   *
+   * @param conf Hadoop configuration used to open the archive.
+   * @param parseEntry turns one entry's `(name, stream)` into an iterator of results.
+   * @tparam T element type produced by `parseEntry`.
+   * @return the concatenated results of all non-skipped entries; it also implements
+   *         [[Closeable]].
+   */
+  override def readEntries[T](
+      conf: Configuration)(
+      parseEntry: (String, InputStream) => Iterator[T]): Iterator[T] = {
+    val tar = openTarStream(conf)
+    var closed = false
+
+    // Idempotent and best-effort: a failure while closing must not mask the read outcome.
+    def cleanup(): Unit = {
+      if (!closed) {
+        closed = true
+        try tar.close() catch { case NonFatal(_) => }
+      }
+    }
+
+    Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => cleanup()))
+
+    new Iterator[T] with Closeable {
+      private var currentIter: Iterator[T] = Iterator.empty
+      private var done = false
+
+      // Releases the current entry's iterator when the parser returned a closeable one.
+      private def closeCurrent(): Unit = currentIter match {
+        case c: Closeable => try c.close() catch { case NonFatal(_) => }
+        case _ =>
+      }
+
+      // Next entry that holds data, or None at the end of the archive. `getNextEntry` returns
+      // null once the entries run out and skips any unread bytes of the previous entry.
+      private def nextDataEntry(): Option[TarArchiveEntry] =
+        Iterator.continually(tar.getNextEntry)
+          .takeWhile(_ != null)
+          .find(entry => !shouldSkipEntry(entry))
+
+      // Move to the next entry whose iterator has elements (releasing each exhausted entry's
+      // reader and skipping any unread bytes), or mark the stream done once entries run out.
+      // Advancing here -- driven by `hasNext` -- rather than eagerly after producing a row in
+      // `next` is essential for parsers that reuse a single mutable row and look ahead on
+      // `hasNext`: probing the current entry right after returning a row would overwrite that
+      // row's contents before the caller has copied it.
+      private def advance(): Unit = {
+        try {
+          while (!done && !currentIter.hasNext) {
+            closeCurrent()
+            nextDataEntry() match {
+              case Some(entry) =>
+                currentIter = parseEntry(entry.getName, entryStream(tar))
+              case None =>
+                done = true
+                cleanup()
+            }
+          }
+        } catch {
+          case NonFatal(e) =>
+            // Release the archive right away: without a TaskContext there is no completion
+            // listener to do it. `done` is deliberately left untouched so that a later call
+            // keeps failing instead of silently reporting a truncated archive as exhausted.
+            cleanup()
+            throw e
+        }
+      }
+
+      // Open the first entry eagerly so construction reflects the archive's first entry.
+      advance()
+
+      override def hasNext: Boolean = {
+        advance()
+        !done && currentIter.hasNext
+      }
+
+      override def next(): T = {
+        if (!hasNext) throw new NoSuchElementException
+        currentIter.next()
+      }
+
+      override def close(): Unit = {
+        done = true
+        // Release the in-flight entry's reader as well, exactly as `advance` does for
+        // exhausted entries.
+        closeCurrent()
+        currentIter = Iterator.empty
+        cleanup()
+      }
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 596edc8beaa34..76a34630a4d5d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -17,7 +17,7 @@
package org.apache.spark.sql.execution.datasources.csv
-import java.io.{FileNotFoundException, IOException}
+import java.io.{FileNotFoundException, InputStream, IOException}
import java.nio.charset.{Charset, StandardCharsets}
import scala.util.control.NonFatal
@@ -71,7 +71,14 @@ abstract class CSVDataSource extends Serializable {
parsedOptions.singleVariantColumn match {
case Some(columnName) => Some(StructType(Array(StructField(columnName, VariantType))))
case None =>
- if (inputPaths.nonEmpty) {
+ if (parsedOptions.archiveFormatEnabled &&
+ inputPaths.exists(f => ArchiveReader.isArchivePath(f.getPath))) {
+ // Schema inference is not yet supported for tar archives. Returning None makes Spark
+ // raise its standard "Unable to infer schema ... It must be specified manually" error
+ // (UNABLE_TO_INFER_SCHEMA), so reading an archive requires an explicit `.schema(...)`.
+ // Inferring a schema by streaming archive entries is planned as a follow-up.
+ None
+ } else if (inputPaths.nonEmpty) {
Some(infer(sparkSession, inputPaths, parsedOptions))
} else {
None
@@ -83,6 +90,46 @@ abstract class CSVDataSource extends Serializable {
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
parsedOptions: CSVOptions): StructType
+
+ /**
+ * Streams a tar archive (`.tar`/`.tar.gz`/`.tgz`) entry by entry through the CSV parser without
+ * unpacking it to disk. The whole archive is a single split (see `CSVFileFormat.isSplitable`); a
+ * fresh header checker and parser are built per entry so each entry is parsed exactly like a
+ * standalone CSV file -- its header, if any, validated and dropped independently. The
+ * mode-specific implementation turns one entry into rows via `parseStream` / `parseIterator`.
+ *
+ * @param conf Hadoop configuration used to open the archive.
+ * @param file the archive to read.
+ * @param getParser builds a fresh [[UnivocityParser]].
+ * @param getHeaderChecker builds a fresh [[CSVHeaderChecker]] for `(isStartOfFile, source)`.
+ * @param requiredSchema schema handed to the parser entry point for the produced rows.
+ * @return the rows of every non-skipped entry, in archive order.
+ */
+ def readArchive(
+ conf: Configuration,
+ file: PartitionedFile,
+ getParser: () => UnivocityParser,
+ getHeaderChecker: (Boolean, String) => CSVHeaderChecker,
+ requiredSchema: StructType): Iterator[InternalRow]
+
+ /**
+  * Common plumbing behind the [[readArchive]] implementations. For every non-skipped archive
+  * entry it builds a brand-new header checker and parser, then passes them with the entry's
+  * byte stream to `parseEntry`, concatenating the rows of all entries.
+  *
+  * The header checker's `source` has the form
+  * `CSV archive entry: <url-encoded archive path>!/<entry name>`, so error messages identify
+  * the offending entry. Each entry is treated as the start of a file.
+  *
+  * @param conf Hadoop configuration used to open the archive.
+  * @param file the archive to read.
+  * @param getParser builds a fresh [[UnivocityParser]].
+  * @param getHeaderChecker builds a fresh [[CSVHeaderChecker]] for `(isStartOfFile, source)`.
+  * @param parseEntry turns one entry's `(parser, headerChecker, stream)` into rows.
+  * @return the rows of all entries, in archive order.
+  */
+ protected def streamArchiveEntries(
+     conf: Configuration,
+     file: PartitionedFile,
+     getParser: () => UnivocityParser,
+     getHeaderChecker: (Boolean, String) => CSVHeaderChecker)(
+     parseEntry: (UnivocityParser, CSVHeaderChecker, InputStream) => Iterator[InternalRow])
+   : Iterator[InternalRow] = {
+   val archive = ArchiveReader(file.toPath)
+   archive.readEntries(conf) { (entryName, entryBytes) =>
+     val source = s"CSV archive entry: ${file.urlEncodedPath}!/$entryName"
+     // `true`: every entry begins at its own start of file, so its header is checked.
+     val checker = getHeaderChecker(true, source)
+     val entryParser = getParser()
+     checker.setHeaderForSingleVariantColumn =
+       CSVDataSource.setHeaderForSingleVariantColumn(conf, file, entryParser)
+     parseEntry(entryParser, checker, entryBytes)
+   }
+ }
}
object CSVDataSource extends Logging {
@@ -144,6 +191,36 @@ object TextInputCSVDataSource extends CSVDataSource {
UnivocityParser.parseIterator(lines, parser, headerChecker, requiredSchema)
}
+ /**
+  * Line-based archive read: each tar entry is split into lines and parsed exactly like a
+  * standalone CSV file, with a parser and header checker of its own.
+  */
+ override def readArchive(
+     conf: Configuration,
+     file: PartitionedFile,
+     getParser: () => UnivocityParser,
+     getHeaderChecker: (Boolean, String) => CSVHeaderChecker,
+     requiredSchema: StructType): Iterator[InternalRow] = {
+   streamArchiveEntries(conf, file, getParser, getHeaderChecker) {
+     (entryParser, entryHeaderChecker, entryBytes) =>
+       val lines = entryLines(entryBytes, entryParser.options)
+       UnivocityParser.parseIterator(lines, entryParser, entryHeaderChecker, requiredSchema)
+   }
+ }
+
+ /**
+  * Produces, for a single archive entry, the same CSV line strings that the non-archive
+  * [[readFile]] path gives the parser. [[ArchiveReader.lineIterator]] cuts the entry into lines
+  * (respecting a custom line separator when one is set), and every line is decoded with the
+  * configured charset straight away, before the next line is read. As in `readFile`, no line
+  * terminator is appended again before the lines reach `UnivocityParser.parseIterator`.
+  *
+  * @param in bytes of one already-decompressed archive entry; left open, because the archive
+  *           owns the underlying stream.
+  * @param options CSV options supplying the read line separator and charset.
+  * @return an iterator over the entry's lines.
+  */
+ private def entryLines(in: InputStream, options: CSVOptions): Iterator[String] = {
+   val rawLines = ArchiveReader.lineIterator(in, options.lineSeparatorInRead)
+   for (raw <- rawLines)
+     yield new String(raw.getBytes, 0, raw.getLength, options.charset)
+ }
+
override def infer(
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
@@ -227,6 +304,18 @@ object MultiLineCSVDataSource extends CSVDataSource with Logging {
requiredSchema)
}
+ /**
+  * Multi-line archive read: the whole byte stream of each tar entry goes to the stream-based
+  * parser, with a parser and header checker created afresh for that entry.
+  */
+ override def readArchive(
+     conf: Configuration,
+     file: PartitionedFile,
+     getParser: () => UnivocityParser,
+     getHeaderChecker: (Boolean, String) => CSVHeaderChecker,
+     requiredSchema: StructType): Iterator[InternalRow] = {
+   streamArchiveEntries(conf, file, getParser, getHeaderChecker) {
+     (entryParser, entryHeaderChecker, entryBytes) =>
+       UnivocityParser.parseStream(entryBytes, entryParser, entryHeaderChecker, requiredSchema)
+   }
+ }
+
override def infer(
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
index 77a0c53ae4699..ab570de8d998f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
@@ -44,6 +44,11 @@ case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister {
options: Map[String, String],
path: Path): Boolean = {
val parsedOptions = getCsvOptions(sparkSession, options)
+ // A tar archive is decompressed/unpacked as a sequential stream, so it must be read as a
+ // single split rather than carved into byte ranges.
+ if (parsedOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(path)) {
+ return false
+ }
CSVDataSource(parsedOptions).isSplitable && super.isSplitable(sparkSession, options, path)
}
@@ -119,24 +124,26 @@ case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister {
dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
val actualRequiredSchema = StructType(
requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
- val parser = new UnivocityParser(
- actualDataSchema,
- actualRequiredSchema,
- parsedOptions,
- actualFilters)
// Use column pruning when specified by Catalyst, except when one or more columns have
// existence default value(s), since in that case we instruct the CSV parser to disable column
// pruning and instead read each entire row in order to correctly assign the default value(s).
val schema = if (isColumnPruningEnabled) actualRequiredSchema else actualDataSchema
- val isStartOfFile = file.start == 0
- val headerChecker = new CSVHeaderChecker(
- schema, parsedOptions, source = s"CSV file: ${file.urlEncodedPath}", isStartOfFile)
- CSVDataSource(parsedOptions).readFile(
- conf,
- file,
- parser,
- headerChecker,
- requiredSchema)
+
+ def newParser(): UnivocityParser =
+ new UnivocityParser(actualDataSchema, actualRequiredSchema, parsedOptions, actualFilters)
+ def getHeaderChecker(isStartOfFile: Boolean, source: String): CSVHeaderChecker =
+ new CSVHeaderChecker(schema, parsedOptions, source, isStartOfFile)
+
+ // A tar archive (always a single split, see `isSplitable`) is streamed entry by entry when
+ // archive reads are enabled; otherwise the file is parsed directly.
+ if (parsedOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(file.toPath)) {
+ CSVDataSource(parsedOptions).readArchive(
+ conf, file, () => newParser(), getHeaderChecker, requiredSchema)
+ } else {
+ val parser = newParser()
+ val headerChecker = getHeaderChecker(file.start == 0, s"CSV file: ${file.urlEncodedPath}")
+ CSVDataSource(parsedOptions).readFile(conf, file, parser, headerChecker, requiredSchema)
+ }
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala
new file mode 100644
index 0000000000000..9d0300ec41177
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.File
+import java.nio.file.Files
+
+import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.sql.{DataFrame, QueryTest, Row}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.util.Utils
+
+/**
+ * Format- and archive-agnostic end-to-end tests for reading archives of data files through the
+ * streaming [[ArchiveReader]] path. Entries are streamed (never unpacked to disk), and the central
+ * contract verified throughout is parity with reading the same files from a directory.
+ *
+ * A concrete suite binds the abstract hooks below by mixing in a file-format trait (e.g.
+ * [[org.apache.spark.sql.execution.datasources.CSVArchiveReadBase]]) and an archive-format trait
+ * (e.g. [[TarArchiveReadBase]]), so the same tests run for every (file format, archive format)
+ * pair we support. New formats are added by writing the per-format trait once, not by duplicating
+ * these tests:
+ * {{{
+ * class CSVHeaderTarArchiveReadSuite
+ * extends ArchiveReadSuiteBase with CSVHeaderArchiveReadBase with TarArchiveReadBase
+ * }}}
+ */
+trait ArchiveReadSuiteBase extends QueryTest with SharedSparkSession {
+
+ // Archive reading is switched on for the whole suite through the session conf.
+ override def sparkConf: SparkConf =
+ super.sparkConf.set(SQLConf.ARCHIVE_FORMAT_READER_ENABLED.key, "true")
+
+ import testImplicits._
+
+ // ----- file-format hooks (bound by e.g. CSVArchiveReadBase) ----------------
+
+ /** The `DataFrameReader`/`DataFrameWriter` format name, e.g. "csv". */
+ protected def format: String
+
+ /** Extension of a single data file of [[format]] inside an archive, e.g. "csv". */
+ protected def fileExtension: String
+
+ /** Read options applied to every read (e.g. CSV `header`). */
+ protected def readOptions: Map[String, String]
+
+ /** Schema used to read the sample data produced by [[sampleDf]]. */
+ protected def readSchema: String
+
+ /** Encodes `df` as the bytes of a single data file of [[format]], honoring `writeOptions`. */
+ protected def encodeFile(df: DataFrame, writeOptions: Map[String, String]): Array[Byte]
+
+ /** Encodes `df` as a single data file using only the format's default write options. */
+ protected final def encodeFile(df: DataFrame): Array[Byte] = encodeFile(df, Map.empty)
+
+ // ----- archive-format hooks (bound by e.g. TarArchiveReadBase) -------------
+
+ /** Archive extensions to exercise, e.g. Seq("tar", "tar.gz", "tgz"). The head is the default. */
+ protected def archiveExtensions: Seq[String]
+
+ /** Writes `entries` (name -> bytes) into the archive at `dest`; compression follows the ext. */
+ protected def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit
+
+ /** Writes bytes that are not a readable archive at `dest` (of [[corruptArchiveExtension]]). */
+ protected def writeCorruptArchive(dest: File): Unit
+
+ /** An archive extension whose reader fails on corrupt bytes (used by the corrupt-file tests). */
+ protected def corruptArchiveExtension: String
+
+ // ----- helpers -------------------------------------------------------------
+
+ /** Sample two-column data; the column names line up with [[readSchema]]. */
+ protected def sampleDf(rows: (Int, String)*): DataFrame = rows.toDF("id", "name")
+
+ /** Entry file name for the i-th data file in an archive. */
+ protected def entryName(i: Int): String = s"part-$i.$fileExtension"
+
+ /** Provides an archive-extensioned path inside a fresh temp dir to `f`. */
+ protected def withArchiveFile(
+ extension: String = archiveExtensions.head)(f: File => Unit): Unit = {
+ val dir = Utils.createTempDir(namePrefix = "archive-test")
+ // The temp dir is removed even when `f` throws.
+ try f(new File(dir, s"archive.$extension")) finally Utils.deleteRecursively(dir)
+ }
+
+ /** Reads `path` with the format, [[readOptions]] (plus `extraOptions`), and `schema`. */
+ protected def read(
+ path: String,
+ extraOptions: Map[String, String] = Map.empty,
+ schema: String = readSchema): DataFrame =
+ spark.read.format(format).options(readOptions ++ extraOptions).schema(schema).load(path)
+
+ /**
+ * Writes `entries` both into an archive and as loose files in a directory, then asserts the
+ * archive read produces exactly the same rows as the directory read.
+ */
+ protected def assertArchiveMatchesDir(
+ entries: Seq[(String, Array[Byte])],
+ extraOptions: Map[String, String] = Map.empty,
+ schema: String = readSchema): Unit = {
+ withArchiveFile() { archive =>
+ writeArchive(archive, entries)
+ val fromArchive = read(archive.getCanonicalPath, extraOptions, schema)
+ withTempDir { dir =>
+ // The expected rows come from reading the very same bytes as loose files.
+ entries.foreach { case (name, b) => Files.write(new File(dir, name).toPath, b) }
+ checkAnswer(fromArchive, read(dir.getCanonicalPath, extraOptions, schema).collect().toSeq)
+ }
+ }
+ }
+
+ // ----- tests ---------------------------------------------------------------
+
+ test("read an archive of multiple entries matches the union of the inputs") {
+ // Run once for each archive extension the bound archive format declares.
+ archiveExtensions.foreach { ext =>
+ withArchiveFile(ext) { archive =>
+ val parts = Seq(
+ sampleDf((1, "Alice"), (2, "Bob")),
+ sampleDf((3, "Carol")),
+ sampleDf((4, "Dan"), (5, "Eve")))
+ writeArchive(
+ archive, parts.zipWithIndex.map { case (p, i) => entryName(i) -> encodeFile(p) })
+ checkAnswer(read(archive.getCanonicalPath), parts.reduce(_ union _))
+ }
+ }
+ }
+
+ test("archive entries parse like a directory of the same files") {
+ val parts = Seq(sampleDf((1, "Alice"), (2, "Bob")), sampleDf((3, "Carol")))
+ assertArchiveMatchesDir(parts.zipWithIndex.map { case (p, i) => entryName(i) -> encodeFile(p) })
+ }
+
+ test("column pruning selects a subset of columns") {
+ withArchiveFile() { archive =>
+ val data = sampleDf((1, "Alice"), (2, "Bob"))
+ writeArchive(archive, Seq(entryName(0) -> encodeFile(data)))
+ checkAnswer(read(archive.getCanonicalPath).select("name"), Seq(Row("Alice"), Row("Bob")))
+ }
+ }
+
+ test("multiple entries and multiple loose files under a partitioned dir, plus an empty archive") {
+ withTempDir { rootDir =>
+ val partitionDir = new File(rootDir, "dt=2024-01-01")
+ assert(partitionDir.mkdirs())
+
+ val inArchive = Seq(sampleDf((1, "in-archive-a")), sampleDf((2, "in-archive-b")))
+ val loose = Seq(sampleDf((3, "loose-a")), sampleDf((4, "loose-b")))
+ val ext = archiveExtensions.head
+
+ writeArchive(
+ new File(partitionDir, s"data.$ext"),
+ inArchive.zipWithIndex.map { case (p, i) => entryName(i) -> encodeFile(p) })
+ // An empty archive in the same directory must contribute no rows.
+ writeArchive(new File(partitionDir, s"empty.$ext"), Seq.empty)
+ loose.zipWithIndex.foreach { case (p, i) =>
+ Files.write(new File(partitionDir, s"loose-$i.$fileExtension").toPath, encodeFile(p))
+ }
+
+ // Only the data columns are compared, which leaves the `dt` directory value out.
+ val expected = (inArchive ++ loose).reduce(_ union _)
+ checkAnswer(read(rootDir.getCanonicalPath).select("id", "name"), expected)
+ }
+ }
+
+ test("a directory of only empty archives yields no rows") {
+ withTempDir { dir =>
+ archiveExtensions.foreach { ext =>
+ // Dots in the extension become underscores in the base name, giving one distinct file
+ // per extension.
+ writeArchive(new File(dir, s"empty-${ext.replace('.', '_')}.$ext"), Seq.empty)
+ }
+ checkAnswer(read(dir.getCanonicalPath), Seq.empty[Row])
+ }
+ }
+
+ test("an empty archive yields no rows") {
+ withArchiveFile() { archive =>
+ writeArchive(archive, Seq.empty)
+ checkAnswer(read(archive.getCanonicalPath), Seq.empty[Row])
+ }
+ }
+
+ test("an archive always yields a single partition regardless of size") {
+ withArchiveFile() { archive =>
+ val big = sampleDf((1 to 1000).map(i => (i, s"value-$i")): _*)
+ writeArchive(archive, (0 until 4).map(i => entryName(i) -> encodeFile(big)))
+ // The max partition size is tiny (1024 bytes), intended to be far below the archive's size,
+ // so a single partition should only be possible if the archive is not split.
+ withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> "1024") {
+ val readDf = read(archive.getCanonicalPath)
+ assert(readDf.rdd.getNumPartitions == 1,
+ s"archive should be a single partition; got ${readDf.rdd.getNumPartitions}")
+ assert(readDf.count() == 4L * big.count())
+ }
+ }
+ }
+
+ Seq(true, false).foreach { ignoreCorrupt =>
+ test(s"ignoreCorruptFiles=$ignoreCorrupt controls whether a corrupt archive is skipped") {
+ withArchiveFile(corruptArchiveExtension) { archive =>
+ writeCorruptArchive(archive)
+ withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> ignoreCorrupt.toString) {
+ // When ignored, the unreadable archive contributes no rows; otherwise collecting the
+ // read must fail with a SparkException.
+ if (ignoreCorrupt) {
+ checkAnswer(read(archive.getCanonicalPath), Seq.empty[Row])
+ } else {
+ intercept[SparkException](read(archive.getCanonicalPath).collect())
+ }
+ }
+ }
+ }
+ }
+
+ test("a corrupt archive among good ones is skipped whole, not per entry (ignoreCorruptFiles)") {
+ withTempDir { dir =>
+ val good = sampleDf((1, "Alice"), (2, "Bob"))
+ writeArchive(new File(dir, s"good.${archiveExtensions.head}"),
+ Seq(entryName(0) -> encodeFile(good)))
+ writeCorruptArchive(new File(dir, s"bad.$corruptArchiveExtension"))
+ // A tar is one non-splittable unit, so corrupt handling is archive-granular: the corrupt
+ // archive is skipped in its entirety while the good archive's rows are still returned.
+ withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") {
+ checkAnswer(read(dir.getCanonicalPath), good)
+ }
+ }
+ }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala
new file mode 100644
index 0000000000000..48482aa1f5b3e
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{ByteArrayOutputStream, Closeable, File, FileOutputStream, InputStream, OutputStream}
+import java.nio.charset.StandardCharsets
+import java.util.Properties
+import java.util.zip.GZIPOutputStream
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.{SparkFunSuite, TaskContext, TaskContextImpl}
+
+/**
+ * Unit tests for the streaming [[ArchiveReader]] core: `isArchivePath` dispatch and `readEntries`
+ * (entry ordering, gzip handling, dir/dotfile skipping, lazy advance, the non-closing entry
+ * stream, and cleanup). The tests write small fixture archives into a temp dir, but archive
+ * entries are always consumed as streams rather than being unpacked to local disk.
+ */
+class ArchiveReaderSuite extends SparkFunSuite {
+
+ /**
+  * One archive member handed to the tar writers of this suite: the entry `name`, its payload
+  * `data`, and `isDir` to mark a directory entry.
+  */
+ private case class Entry(
+     name: String,
+     data: Array[Byte],
+     isDir: Boolean = false)
+
+ /** Writes `entries` into `file` as a plain (uncompressed) tar archive. */
+ private def writeTar(file: File, entries: Seq[Entry]): Unit = {
+   val fileOut = new FileOutputStream(file)
+   writeTarTo(fileOut, entries)
+ }
+
+ /** Writes `entries` into `file` as a gzip-compressed tar; backs the `.tar.gz` / `.tgz` tests. */
+ private def writeTarGz(file: File, entries: Seq[Entry]): Unit = {
+   val gzipOut = new GZIPOutputStream(new FileOutputStream(file))
+   writeTarTo(gzipOut, entries)
+ }
+
+ /**
+  * Serializes `entries`, in order, as a tar stream onto `rawOut`. The tar stream (and therefore
+  * `rawOut`) is closed before returning, even when writing fails.
+  */
+ private def writeTarTo(rawOut: OutputStream, entries: Seq[Entry]): Unit = {
+   val tarOut = new TarArchiveOutputStream(rawOut)
+   try {
+     for (entry <- entries) {
+       val isFile = !entry.isDir
+       // A trailing slash in the name is what marks a TarArchiveEntry as a directory, so append
+       // one to directory entries that lack it.
+       val entryName =
+         if (isFile || entry.name.endsWith("/")) entry.name else entry.name + "/"
+       val header = new TarArchiveEntry(entryName)
+       // Only regular files carry a size and a payload.
+       if (isFile) header.setSize(entry.data.length.toLong)
+       tarOut.putArchiveEntry(header)
+       if (isFile) tarOut.write(entry.data)
+       tarOut.closeArchiveEntry()
+     }
+     tarOut.finish()
+   } finally {
+     tarOut.close()
+   }
+ }
+
+ /** Builds a regular-file [[Entry]] named `name` whose payload is `body` encoded as UTF-8. */
+ private def textEntry(name: String, body: String): Entry = {
+   val payload = body.getBytes(StandardCharsets.UTF_8)
+   Entry(name, payload)
+ }
+
+ /** Reads `in` until end-of-stream and returns every byte read; `in` is not closed. */
+ private def readAll(in: InputStream): Array[Byte] = {
+   val sink = new ByteArrayOutputStream()
+   val chunk = new Array[Byte](4096)
+   // `read` reports end-of-stream with a negative count; every other count is copied to the sink.
+   Iterator.continually(in.read(chunk))
+     .takeWhile(_ >= 0)
+     .foreach(count => sink.write(chunk, 0, count))
+   sink.toByteArray
+ }
+
+ /**
+  * Reads `file` through `ArchiveReader.readEntries` and returns one `(name, decodedText)` pair
+  * per entry the reader yields, with each entry body decoded as UTF-8.
+  */
+ private def collect(file: File): Seq[(String, String)] = {
+   val reader = ArchiveReader(new Path(file.toURI))
+   val decoded = reader.readEntries(new Configuration()) { (entryName, stream) =>
+     val text = new String(readAll(stream), StandardCharsets.UTF_8)
+     Iterator.single((entryName, text))
+   }
+   decoded.toList
+ }
+
+ // ----- isArchivePath ------------------------------------------------------
+
+ test("isArchivePath: positive cases") {
+   // Each supported suffix, in mixed case and with and without a parent path.
+   val plainTar = Seq("foo.tar", "FOO.TAR", "/a/b/c/x.tar", "weird.TaR")
+   val tarGz = Seq("foo.tar.gz", "FOO.TAR.GZ", "mixed.Tar.Gz", "/a/b/c/x.tar.gz")
+   val tgz = Seq("foo.tgz", "FOO.TGZ", "/a/b/c/x.tgz")
+   for (p <- plainTar ++ tarGz ++ tgz) {
+     assert(ArchiveReader.isArchivePath(new Path(p)), s"expected archive match for $p")
+   }
+ }
+
+ test("isArchivePath: negative cases") {
+   // Non-archive names, unsupported containers, and near-miss suffixes.
+   val nonArchives = Seq(
+     "foo.csv", "foo.gz", "foo", "dir/", "foo.tarball", "data.zip", "foo.tar.bz2", "foo.targz")
+   for (p <- nonArchives) {
+     assert(!ArchiveReader.isArchivePath(new Path(p)), s"expected non-match for $p")
+   }
+ }
+
+ // ----- readEntries --------------------------------------------------------
+
+ test("readEntries: empty tar yields empty iterator") {
+   withTempDir { tmp =>
+     // A tar written with no entries at all.
+     val emptyTar = new File(tmp, "empty.tar")
+     writeTar(emptyTar, Nil)
+     val entries = collect(emptyTar)
+     assert(entries.isEmpty)
+   }
+ }
+
+ test("readEntries: single entry exposes its name and bytes") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "single.tar")
+     val onlyEntry = textEntry("only.csv", "hello\n")
+     writeTar(archive, Seq(onlyEntry))
+     // Both the entry name and its exact body must round-trip.
+     val expected = Seq("only.csv" -> "hello\n")
+     assert(collect(archive) == expected)
+   }
+ }
+
+ test("readEntries: multiple entries chained in tar order") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "multi.tar")
+     val entries = Seq("a", "b", "c").map(id => textEntry(s"$id.csv", id))
+     writeTar(archive, entries)
+     // Entries must come back in the order they were written.
+     val expected = Seq("a.csv" -> "a", "b.csv" -> "b", "c.csv" -> "c")
+     assert(collect(archive) == expected)
+   }
+ }
+
+ test("readEntries: gzipped tar (.tar.gz) via Hadoop codec factory") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "data.tar.gz")
+     val entries = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
+     writeTarGz(archive, entries)
+     val expected = Seq("a.csv" -> "a", "b.csv" -> "b")
+     assert(collect(archive) == expected)
+   }
+ }
+
+ test("readEntries: gzipped tar (.tgz) via explicit GZIPInputStream wrap") {
+   withTempDir { tmp =>
+     // Same gzip payload as the `.tar.gz` case, only the file suffix differs.
+     val archive = new File(tmp, "data.tgz")
+     val entries = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
+     writeTarGz(archive, entries)
+     val expected = Seq("a.csv" -> "a", "b.csv" -> "b")
+     assert(collect(archive) == expected)
+   }
+ }
+
+ test("readEntries: directory entries are skipped") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "dirs.tar")
+     val dirEntry = Entry("subdir", Array.emptyByteArray, isDir = true)
+     val fileEntry = textEntry("subdir/data.csv", "x")
+     writeTar(archive, Seq(dirEntry, fileEntry))
+     // Only the regular file under the directory is surfaced.
+     assert(collect(archive) == Seq("subdir/data.csv" -> "x"))
+   }
+ }
+
+ test("readEntries: dotfile entries (e.g. macOS ._foo) are skipped") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "dots.tar")
+     val appleDouble = textEntry("._real.csv", "junk") // macOS AppleDouble sidecar
+     val bareDotfile = textEntry(".hidden", "ignored")
+     val regular = textEntry("real.csv", "kept")
+     val nestedDotfile = textEntry("nested/._sidecar", "junk2") // dotfile inside a subdirectory
+     writeTar(archive, Seq(appleDouble, bareDotfile, regular, nestedDotfile))
+     // Only the single non-dotfile entry survives.
+     assert(collect(archive) == Seq("real.csv" -> "kept"))
+   }
+ }
+
+ test("readEntries: advances lazily, one entry at a time") {
+   withTempDir { tmp =>
+     val names = List("a.csv", "b.csv", "c.csv")
+     val archive = new File(tmp, "lazy.tar")
+     writeTar(
+       archive,
+       Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"), textEntry("c.csv", "c")))
+
+     val opened = ArrayBuffer.empty[String]
+     // The callback records the entry name and returns one element without reading the stream,
+     // so each consumed element corresponds to exactly one opened entry.
+     val reader = ArchiveReader(new Path(archive.toURI))
+     val it = reader.readEntries(new Configuration()) { (entryName, _) =>
+       opened += entryName
+       Iterator.single(entryName)
+     }
+
+     // Building the iterator opens the first entry only; nothing else is opened upfront.
+     assert(opened.toList == names.take(1))
+     assert(it.hasNext)
+     // After consuming element i, exactly entries 0..i have been opened: the next entry is
+     // opened only when iteration moves past the one currently in flight.
+     names.zipWithIndex.foreach { case (expected, idx) =>
+       assert(it.next() == expected)
+       assert(opened.toList == names.take(idx + 1))
+     }
+     assert(!it.hasNext)
+     assert(opened.size == 3)
+   }
+ }
+
+ test("readEntries: a parseEntry that closes its stream still advances to the next entry") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "close.tar")
+     writeTar(archive, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+
+     val bodies = ArrayBuffer.empty[String]
+     val reader = ArchiveReader(new Path(archive.toURI))
+     val names = reader.readEntries(new Configuration()) { (entryName, stream) =>
+       val text = new String(readAll(stream), StandardCharsets.UTF_8)
+       // Closing the per-entry stream must leave the underlying archive open.
+       stream.close()
+       bodies += text
+       Iterator.single(entryName)
+     }
+     // Both entries are still delivered, with their full bodies, despite the close() calls.
+     assert(names.toList == List("a.csv", "b.csv"))
+     assert(bodies.toList == List("a", "b"))
+   }
+ }
+
+ test("readEntries: close() is safe, idempotent, and stops iteration") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "closeable.tar")
+     writeTar(archive, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+
+     val reader = ArchiveReader(new Path(archive.toURI))
+     val it = reader.readEntries(new Configuration())((entryName, _) => Iterator.single(entryName))
+     assert(it.hasNext)
+     // The returned iterator is expected to be Closeable as well.
+     val closeable = it.asInstanceOf[Closeable]
+     closeable.close()
+     closeable.close() // a second close must be a no-op
+     assert(!it.hasNext)
+   }
+ }
+
+ test("readEntries: TaskContext completion cleans up without error") {
+   withTempDir { tmp =>
+     val archive = new File(tmp, "ctx.tar")
+     writeTar(archive, Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b")))
+
+     // A minimal task context installed for the current thread for the duration of the read.
+     val taskContext = new TaskContextImpl(
+       stageId = 0,
+       stageAttemptNumber = 0,
+       partitionId = 0,
+       taskAttemptId = 1L,
+       attemptNumber = 0,
+       numPartitions = 0,
+       taskMemoryManager = null,
+       localProperties = new Properties(),
+       metricsSystem = null,
+       cpus = 1)
+     TaskContext.setTaskContext(taskContext)
+     try {
+       val reader = ArchiveReader(new Path(archive.toURI))
+       val it = reader.readEntries(new Configuration()) { (entryName, _) =>
+         Iterator.single(entryName)
+       }
+       assert(it.hasNext)
+       it.next() // open the archive and register the completion listener
+       // Complete the task while the iterator is neither exhausted nor closed; this must not
+       // throw.
+       taskContext.markTaskCompleted(None)
+     } finally {
+       TaskContext.unset()
+     }
+   }
+ }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala
new file mode 100644
index 0000000000000..93dc8db5750f0
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVArchiveReadBase.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.util.Utils
+
+/**
+ * CSV binding of the file-format hooks declared by [[ArchiveReadSuiteBase]]. Tests that depend on
+ * the header mode are defined in the [[CSVHeaderArchiveReadBase]] and
+ * [[CSVHeaderlessArchiveReadBase]] sub-traits, so the shared archive tests from
+ * [[ArchiveReadSuiteBase]] run for both modes.
+ */
+trait CSVArchiveReadBase extends ArchiveReadSuiteBase {
+
+  /** True when the archived CSV files are written, and read back, with a header row. */
+  protected def header: Boolean
+
+  override protected def format: String = "csv"
+
+  override protected def fileExtension: String = "csv"
+
+  override protected def readOptions: Map[String, String] = Map("header" -> header.toString)
+
+  override protected def readSchema: String = "id INT, name STRING"
+
+  /**
+   * Encodes `df` as the bytes of a single CSV file: the frame is coalesced to one partition and
+   * written to a scratch directory with the suite's header mode plus `writeOptions`, the one
+   * resulting data file is read back, and the scratch directory is removed.
+   */
+  override protected def encodeFile(
+      df: DataFrame,
+      writeOptions: Map[String, String]): Array[Byte] = {
+    val scratchDir = Utils.createTempDir(namePrefix = "archive-test-encode")
+    try {
+      val csvOptions = Map("header" -> header.toString) ++ writeOptions
+      val writer = df.coalesce(1).write.format("csv").options(csvOptions).mode("overwrite")
+      writer.save(scratchDir.getCanonicalPath)
+      // Keep regular files only, dropping "_"-prefixed, "."-prefixed and ".crc" files.
+      val dataFiles = scratchDir.listFiles().filter { candidate =>
+        val fileName = candidate.getName
+        val isSidecar =
+          fileName.startsWith("_") || fileName.startsWith(".") || fileName.endsWith(".crc")
+        candidate.isFile && !isSidecar
+      }
+      assert(dataFiles.length == 1,
+        s"expected exactly one data file, got: ${dataFiles.map(_.getName).toList}")
+      Files.readAllBytes(dataFiles.head.toPath)
+    } finally {
+      Utils.deleteRecursively(scratchDir)
+    }
+  }
+
+  /** UTF-8 bytes of a literal CSV document, for tests that must control the exact row layout. */
+  protected def csvBytes(s: String): Array[Byte] = {
+    val utf8 = StandardCharsets.UTF_8
+    s.getBytes(utf8)
+  }
+
+  test("CSV: reading an archive without a schema fails (inference not yet supported)") {
+    // Archives have no schema inference yet (planned as a follow-up), so a read without an
+    // explicit schema must fail with Spark's standard UNABLE_TO_INFER_SCHEMA condition.
+    withArchiveFile() { archive =>
+      val entry = entryName(0) -> encodeFile(sampleDf((1, "Alice"), (2, "Bob")))
+      writeArchive(archive, Seq(entry))
+      val error = intercept[AnalysisException] {
+        spark.read.format(format).options(readOptions).load(archive.getCanonicalPath)
+      }
+      assert(error.getCondition == "UNABLE_TO_INFER_SCHEMA",
+        s"expected UNABLE_TO_INFER_SCHEMA, got ${error.getCondition}: ${error.getMessage}")
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala
new file mode 100644
index 0000000000000..32ad6e818076f
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderArchiveReadBase.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Header-row flavour of [[CSVArchiveReadBase]]: fixes `header` to true and adds the archive tests
+ * that only make sense with a header (mismatched headers, and delimiter/multiline inputs whose
+ * first row is a header).
+ */
+trait CSVHeaderArchiveReadBase extends CSVArchiveReadBase {
+
+  import testImplicits._
+
+  override protected def header: Boolean = true
+
+  test("CSV: entries with mismatched headers behave like standalone files") {
+    val matching = entryName(0) -> encodeFile(sampleDf((1, "Alice"), (2, "Bob")))
+    // The second entry names its second column "nickname", so the schema's "name" column does not
+    // appear in its header.
+    val mismatched = entryName(1) -> encodeFile(Seq((3, "Carol")).toDF("id", "nickname"))
+    assertArchiveMatchesDir(Seq(matching, mismatched))
+  }
+
+  test("CSV: custom delimiter matches a directory read") {
+    val entries = Seq("a.csv" -> csvBytes("id;name\n1;Alice\n2;Bob\n"))
+    assertArchiveMatchesDir(entries, extraOptions = Map("delimiter" -> ";"))
+  }
+
+  test("CSV: multiline quoted fields with embedded newlines match a directory read") {
+    // Quoted fields span several physical lines in both entries.
+    val first = "a.csv" -> csvBytes("id,note\n1,\"line1\nline2\"\n2,\"plain\"\n")
+    val second = "b.csv" -> csvBytes("id,note\n3,\"a\nb\nc\"\n")
+    assertArchiveMatchesDir(
+      Seq(first, second),
+      extraOptions = Map("multiLine" -> "true"),
+      schema = "id INT, note STRING")
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala
new file mode 100644
index 0000000000000..7ac2ad4084aa0
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderTarArchiveReadSuite.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Concrete suite for header-carrying CSV files packed in tar archives (`.tar`/`.tar.gz`/`.tgz`).
+ * It has no tests of its own: it combines the shared tests of [[ArchiveReadSuiteBase]], the
+ * header-mode CSV tests of [[CSVHeaderArchiveReadBase]], and the tar container hooks of
+ * [[TarArchiveReadBase]].
+ */
+class CSVHeaderTarArchiveReadSuite extends ArchiveReadSuiteBase
+  with CSVHeaderArchiveReadBase with TarArchiveReadBase
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala
new file mode 100644
index 0000000000000..ba4d7f63bb464
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessArchiveReadBase.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Headerless flavour of [[CSVArchiveReadBase]]: fixes `header` to false, so columns are
+ * positional, and adds headerless delimiter/multiline archive tests. The common headerless read
+ * paths are exercised by the shared tests in [[ArchiveReadSuiteBase]].
+ */
+trait CSVHeaderlessArchiveReadBase extends CSVArchiveReadBase {
+
+  override protected def header: Boolean = false
+
+  test("CSV: headerless custom delimiter matches a directory read") {
+    val first = "a.csv" -> csvBytes("1;Alice\n2;Bob\n")
+    val second = "b.csv" -> csvBytes("3;Carol\n")
+    assertArchiveMatchesDir(Seq(first, second), extraOptions = Map("delimiter" -> ";"))
+  }
+
+  test("CSV: headerless multiline quoted fields with embedded newlines match a directory read") {
+    // Quoted fields span several physical lines in both entries; there is no header row.
+    val first = "a.csv" -> csvBytes("1,\"line1\nline2\"\n2,\"plain\"\n")
+    val second = "b.csv" -> csvBytes("3,\"a\nb\nc\"\n")
+    assertArchiveMatchesDir(
+      Seq(first, second),
+      extraOptions = Map("multiLine" -> "true"),
+      schema = "id INT, note STRING")
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala
new file mode 100644
index 0000000000000..28316c97ca8c9
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/CSVHeaderlessTarArchiveReadSuite.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+/**
+ * Concrete suite for headerless CSV files packed in tar archives (`.tar`/`.tar.gz`/`.tgz`). It
+ * has no tests of its own: it combines the shared tests of [[ArchiveReadSuiteBase]], the
+ * headerless CSV tests of [[CSVHeaderlessArchiveReadBase]], and the tar container hooks of
+ * [[TarArchiveReadBase]].
+ */
+class CSVHeaderlessTarArchiveReadSuite extends ArchiveReadSuiteBase
+  with CSVHeaderlessArchiveReadBase with TarArchiveReadBase
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala
new file mode 100644
index 0000000000000..60a73b0891bae
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/TarArchiveReadBase.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{File, FileOutputStream, OutputStream}
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+import java.util.Locale
+import java.util.zip.GZIPOutputStream
+
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream}
+
+/**
+ * Tar binding of the archive-format hooks declared by [[ArchiveReadSuiteBase]], covering plain
+ * `.tar`, gzipped `.tar.gz`, and `.tgz`. It is independent of the file format: a
+ * `TarArchiveReadSuite` mixes it in next to the file-format trait.
+ */
+trait TarArchiveReadBase extends ArchiveReadSuiteBase {
+
+  override protected def archiveExtensions: Seq[String] = Seq("tar", "tar.gz", "tgz")
+
+  override protected def corruptArchiveExtension: String = "tar.gz"
+
+  /**
+   * Writes `entries` (name -> bytes), in order, into a tar archive at `dest`. The output is
+   * gzip-compressed when the lower-cased file name ends in `.gz` or `.tgz`.
+   */
+  override protected def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit = {
+    val lowerName = dest.getName.toLowerCase(Locale.ROOT)
+    val gzipped = Seq(".gz", ".tgz").exists(suffix => lowerName.endsWith(suffix))
+    val fileOut = new FileOutputStream(dest)
+    val rawOut: OutputStream = if (gzipped) new GZIPOutputStream(fileOut) else fileOut
+    val tarOut = new TarArchiveOutputStream(rawOut)
+    try {
+      for ((entryName, bytes) <- entries) {
+        val header = new TarArchiveEntry(entryName)
+        header.setSize(bytes.length.toLong)
+        tarOut.putArchiveEntry(header)
+        tarOut.write(bytes)
+        tarOut.closeArchiveEntry()
+      }
+      tarOut.finish()
+    } finally {
+      tarOut.close()
+    }
+  }
+
+  /** Writes plain text, which is neither gzip nor tar data, to `dest` as a corrupt archive. */
+  override protected def writeCorruptArchive(dest: File): Unit = {
+    val garbage = "this is not a valid gzip-compressed tar archive".getBytes(StandardCharsets.UTF_8)
+    Files.write(dest.toPath, garbage)
+  }
+}