diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala index 22f84a1cad63d..0747a8045e7d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala @@ -45,6 +45,14 @@ class FileSourceOptions( val ignoreMissingFiles: Boolean = parameters.get(IGNORE_MISSING_FILES).map(_.toBoolean) .getOrElse(SQLConf.get.ignoreMissingFiles) + + /** + * Whether the data source may read tar archives (.tar/.tar.gz/.tgz) by streaming their entries. + * Gated by [[SQLConf.ARCHIVE_FORMAT_READER_ENABLED]] and resolved at construction (on the driver, + * where SQLConf is instantiated) so the value is stable once the options are serialized to + * executors. Only the CSV data source currently honors this. + */ + val archiveFormatEnabled: Boolean = SQLConf.get.getConf(SQLConf.ARCHIVE_FORMAT_READER_ENABLED) } object FileSourceOptions { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0aed28e92558f..7971795c29bab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2695,6 +2695,16 @@ object SQLConf { .bytesConf(ByteUnit.BYTE) .createWithDefaultString("128MB") // parquet.block.size + val ARCHIVE_FORMAT_READER_ENABLED = buildConf("spark.sql.files.archive.reader.enabled") + .doc("When true, the CSV data source can read tar archives (.tar, .tar.gz, .tgz): each " + + "archive is read as a single split and its entries are streamed through the CSV parser " + + "(never unpacked to disk), as if the entries were separate CSV files. 
Only the CSV data " + + "source supports reading archives.") + .version("5.0.0") + .withBindingPolicy(ConfigBindingPolicy.SESSION) + .booleanConf + .createWithDefault(false) + val FILES_OPEN_COST_IN_BYTES = buildConf("spark.sql.files.openCostInBytes") .internal() .doc("The estimated cost to open a file, measured by the number of bytes could be scanned in" + diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 1261200a9173c..e6673c9069f42 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -293,6 +293,10 @@ <artifactId>bcprov-jdk18on</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + </dependency> <dependency> <groupId>org.bouncycastle</groupId> <artifactId>bcpkix-jdk18on</artifactId> diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala new file mode 100644 index 0000000000000..e9737c87e1a2c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ArchiveReader.scala @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import java.io.{Closeable, InputStream} +import java.util.Locale +import java.util.zip.GZIPInputStream + +import scala.util.control.NonFatal + +import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream} +import org.apache.commons.io.ByteOrderMark +import org.apache.commons.io.input.{BOMInputStream, CloseShieldInputStream} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.Text +import org.apache.hadoop.util.LineReader + +import org.apache.spark.TaskContext +import org.apache.spark.util.HadoopFSUtils + +/** + * Streaming reader for a single archive file. The archive is opened once and decompressed/unpacked + * as a stream -- entries are never materialized to local disk. [[readEntries]] hands each entry's + * bytes to a caller-supplied parse function as a bounded [[InputStream]] and concatenates the + * per-entry results into a single iterator, advancing to the next entry only once the current one + * is fully consumed. At most one entry is in flight at a time, so memory stays bounded regardless + * of archive size. + * + * This is format-agnostic: a data source whose per-file reader can consume an `InputStream` wires + * up archive support by calling [[readEntries]] from its read/inference paths and supplying a + * `parseEntry` that turns one entry stream into rows (or tokens). Formats that need random access + * within a file (e.g. Parquet/ORC footers) cannot use this streaming path. + * + * A concrete subclass implements [[readEntries]] for a specific archive format. Obtain the reader + * for a path via `ArchiveReader(path)`, which selects the implementation by file extension; new + * archive formats are added by writing another subclass rather than modifying existing ones. 
+ */ +abstract class ArchiveReader(path: Path) { + + /** + * Streams the archive entry by entry, applying `parseEntry` to each non-skipped entry's + * `(name, stream)` and concatenating the results into a single iterator. The next entry is opened + * only once the current entry's iterator is exhausted, so nothing is buffered to disk and at most + * one entry's bytes are read at a time. The archive stream is closed when the returned iterator + * is exhausted, when [[Closeable.close]] is called on it, and (defensively) on task completion. + */ + def readEntries[T]( + conf: Configuration)( + parseEntry: (String, InputStream) => Iterator[T]): Iterator[T] +} + +object ArchiveReader { + + /** + * Whether `path` names an archive this reader can stream. Dispatched purely on the file + * extension -- `.tar`, `.tar.gz`, or `.tgz` -- since the bytes are not inspected here. + */ + def isArchivePath(path: Path): Boolean = { + val name = path.getName.toLowerCase(Locale.ROOT) + name.endsWith(".tar") || name.endsWith(".tar.gz") || name.endsWith(".tgz") + } + + /** + * Returns the [[ArchiveReader]] implementation for `path`, selected by its file extension. Only + * paths for which [[isArchivePath]] is true are supported; new archive formats add a case here. + */ + def apply(path: Path): ArchiveReader = new TarArchiveReader(path) + + /** + * Splits one already-decompressed archive entry's bytes into lines. The reusable, format-agnostic + * line source for archive entries; the entry stream is not closed here (the reader owns the + * underlying stream). + * + * @param in bytes of one archive entry. + * @param lineSeparatorInRead the explicit read line separator, or `None` to detect line breaks. + * @return an iterator over the entry's lines as [[Text]], without the trailing separator. 
+ */ + def lineIterator(in: InputStream, lineSeparatorInRead: Option[Array[Byte]]): Iterator[Text] = { + // A leading byte-order mark is stripped (LineReader does not strip it on its own) so the lines + // match the non-archive read path. + val bomInputStream = BOMInputStream.builder() + .setInputStream(in) + .setByteOrderMarks( + ByteOrderMark.UTF_8, + ByteOrderMark.UTF_16LE, + ByteOrderMark.UTF_16BE, + ByteOrderMark.UTF_32LE, + ByteOrderMark.UTF_32BE) + .setInclude(false) + .get() + val reader = lineSeparatorInRead match { + case Some(sep) => new LineReader(bomInputStream, sep) + case _ => new LineReader(bomInputStream) + } + new Iterator[Text] { + private val text = new Text() + private var finished = false + private var hasValue = false + + override def hasNext: Boolean = { + if (!finished && !hasValue) { + finished = reader.readLine(text) == 0 + hasValue = !finished + } + !finished + } + + override def next(): Text = { + if (!hasNext) throw new NoSuchElementException + hasValue = false + text + } + } + } +} + +/** + * [[ArchiveReader]] for tar archives: plain `.tar`, gzipped `.tar.gz`, and `.tgz`. + * + * Gzip handling: Hadoop's `CompressionCodecFactory` matches the trailing `.gz` extension and + * auto-decompresses `.tar.gz` via `CodecStreams`, so we just wrap that stream in + * `TarArchiveInputStream`. `.tgz` is not a registered Hadoop codec extension, so the gzip layer is + * unwrapped explicitly here. + */ +class TarArchiveReader(path: Path) extends ArchiveReader(path) { + + // Paths Hadoop's codec factory won't auto-decompress: we apply the gzip layer here. + private def needsExplicitGunzip: Boolean = + path.getName.toLowerCase(Locale.ROOT).endsWith(".tgz") + + /** + * Whether an entry is not a real data file and must be skipped: a directory, or a name Spark's + * own file listing would filter out. 
Reusing [[HadoopFSUtils.shouldFilterOutPathName]] (the + * `InMemoryFileIndex` filter) keeps archive reads in parity with reading the same entries as + * loose files: `.`-prefixed sidecars (macOS `._x`, `.DS_Store`) and `_`-prefixed markers + * (`_SUCCESS`, `_committed_*`) are skipped, while data files are kept. + */ + private def shouldSkipEntry(entry: TarArchiveEntry): Boolean = { + if (entry.isDirectory) return true + val name = entry.getName + val basename = name.substring(name.lastIndexOf('/') + 1) + HadoopFSUtils.shouldFilterOutPathName(basename) + } + + /** + * Opens the archive as a tar stream, transparently decompressing `.tar.gz` / `.tgz`. + * + * `GZIPInputStream` reads the gzip header in its constructor and throws on corrupt or truncated + * bytes. When wrapping fails, the already-opened base stream is closed before the error is + * rethrown so the underlying file handle is not leaked; a failure to close is attached to the + * original error as a suppressed exception. + */ + private def openTarStream(conf: Configuration): TarArchiveInputStream = { + val base = CodecStreams.createInputStreamWithCloseResource(conf, path) + try { + val tarBytes = if (needsExplicitGunzip) new GZIPInputStream(base) else base + new TarArchiveInputStream(tarBytes) + } catch { + case NonFatal(e) => + try base.close() catch { case NonFatal(suppressed) => e.addSuppressed(suppressed) } + throw e + } + } + + /** + * Wraps the shared tar stream as a view over exactly the current entry's bytes + * (`TarArchiveInputStream.read` returns -1 at the entry boundary). [[CloseShieldInputStream]] + * ignores `close()`, so a parser closing its input does not close the underlying archive; any + * unread remainder of an entry is skipped by `getNextEntry()` when advancing. 
+ */ + private def entryStream(tar: TarArchiveInputStream): InputStream = + CloseShieldInputStream.wrap(tar) + + override def readEntries[T]( + conf: Configuration)( + parseEntry: (String, InputStream) => Iterator[T]): Iterator[T] = { + val tar = openTarStream(conf) + var closed = false + + def cleanup(): Unit = { + if (!closed) { + closed = true + try tar.close() catch { case NonFatal(_) => } + } + } + + Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => cleanup())) + + new Iterator[T] with Closeable { + private var currentIter: Iterator[T] = Iterator.empty + private var done = false + + // Move to the next entry whose iterator has elements (releasing each exhausted entry's + // reader and skipping any unread bytes), or mark the stream done once entries run out. + // Advancing here -- driven by `hasNext` -- rather than eagerly after producing a row in + // `next` is essential for parsers that reuse a single mutable row and look ahead on + // `hasNext`: probing the current entry right after returning a row would overwrite that row's + // contents before the caller has copied it. + // + // Any non-fatal failure while advancing (from `getNextEntry`, from `parseEntry`, or from + // probing the current entry's iterator) closes the archive stream and marks this iterator + // done before the error propagates. The task-completion listener above is only registered + // when a `TaskContext` is active, so without this the stream would stay open after a failed + // read outside a task. + private def advance(): Unit = { + try { + while (!done && !currentIter.hasNext) { + currentIter match { + case c: Closeable => try c.close() catch { case NonFatal(_) => } + case _ => + } + var entry = tar.getNextEntry + while (entry != null && shouldSkipEntry(entry)) entry = tar.getNextEntry + if (entry == null) { + done = true + cleanup() + } else { + currentIter = parseEntry(entry.getName, entryStream(tar)) + } + } + } catch { + case NonFatal(e) => + done = true + currentIter = Iterator.empty + cleanup() + throw e + } + } + + // Open the first entry eagerly so construction reflects the archive's first entry. 
+ advance() + + override def hasNext: Boolean = { + advance() + !done && currentIter.hasNext + } + + override def next(): T = { + if (!hasNext) throw new NoSuchElementException + currentIter.next() + } + + override def close(): Unit = { + done = true + currentIter = Iterator.empty + cleanup() + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 596edc8beaa34..76a34630a4d5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.csv -import java.io.{FileNotFoundException, IOException} +import java.io.{FileNotFoundException, InputStream, IOException} import java.nio.charset.{Charset, StandardCharsets} import scala.util.control.NonFatal @@ -71,7 +71,14 @@ abstract class CSVDataSource extends Serializable { parsedOptions.singleVariantColumn match { case Some(columnName) => Some(StructType(Array(StructField(columnName, VariantType)))) case None => - if (inputPaths.nonEmpty) { + if (parsedOptions.archiveFormatEnabled && + inputPaths.exists(f => ArchiveReader.isArchivePath(f.getPath))) { + // Schema inference is not yet supported for tar archives. Returning None makes Spark + // raise its standard "Unable to infer schema ... It must be specified manually" error + // (UNABLE_TO_INFER_SCHEMA), so reading an archive requires an explicit `.schema(...)`. + // Inferring a schema by streaming archive entries is planned as a follow-up. 
+ None + } else if (inputPaths.nonEmpty) { Some(infer(sparkSession, inputPaths, parsedOptions)) } else { None @@ -83,6 +90,46 @@ abstract class CSVDataSource extends Serializable { sparkSession: SparkSession, inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType + + /** + * Streams a tar archive (`.tar`/`.tar.gz`/`.tgz`) entry by entry through the CSV parser without + * unpacking it to disk. The whole archive is a single split (see `CSVFileFormat.isSplitable`); a + * fresh header checker and parser are built per entry so each entry is parsed exactly like a + * standalone CSV file -- its header, if any, validated and dropped independently. The + * mode-specific implementation turns one entry into rows via `parseStream` / `parseIterator`. + * + * @param getParser builds a fresh [[UnivocityParser]]. + * @param getHeaderChecker builds a fresh [[CSVHeaderChecker]] for `(isStartOfFile, source)`. + */ + def readArchive( + conf: Configuration, + file: PartitionedFile, + getParser: () => UnivocityParser, + getHeaderChecker: (Boolean, String) => CSVHeaderChecker, + requiredSchema: StructType): Iterator[InternalRow] + + /** + * Shared driver used by the [[readArchive]] implementations: streams each non-skipped entry's + * `(parser, headerChecker, stream)` -- a fresh parser/header checker per entry -- through + * `parseEntry`. The header checker `source` (`CSV archive entry: <archivePath>!/<entryName>`) names + * the entry in error messages. 
+ */ + protected def streamArchiveEntries( + conf: Configuration, + file: PartitionedFile, + getParser: () => UnivocityParser, + getHeaderChecker: (Boolean, String) => CSVHeaderChecker)( + parseEntry: (UnivocityParser, CSVHeaderChecker, InputStream) => Iterator[InternalRow]) + : Iterator[InternalRow] = { + ArchiveReader(file.toPath).readEntries(conf) { (entryName, in) => + val headerChecker = + getHeaderChecker(true, s"CSV archive entry: ${file.urlEncodedPath}!/$entryName") + val parser = getParser() + headerChecker.setHeaderForSingleVariantColumn = + CSVDataSource.setHeaderForSingleVariantColumn(conf, file, parser) + parseEntry(parser, headerChecker, in) + } + } } object CSVDataSource extends Logging { @@ -144,6 +191,36 @@ object TextInputCSVDataSource extends CSVDataSource { UnivocityParser.parseIterator(lines, parser, headerChecker, requiredSchema) } + override def readArchive( + conf: Configuration, + file: PartitionedFile, + getParser: () => UnivocityParser, + getHeaderChecker: (Boolean, String) => CSVHeaderChecker, + requiredSchema: StructType): Iterator[InternalRow] = + // Stream each tar entry through the line-based parser, treating the entry exactly like a + // standalone CSV file (a fresh parser/header checker is built per entry). + streamArchiveEntries(conf, file, getParser, getHeaderChecker) { (parser, headerChecker, in) => + UnivocityParser.parseIterator( + entryLines(in, parser.options), parser, headerChecker, requiredSchema) + } + + /** + * Decodes one archive entry's bytes into the same CSV line strings the non-archive [[readFile]] + * path feeds to the parser: [[ArchiveReader.lineIterator]] splits the entry into lines (honoring + * a custom line separator) and each line is decoded with the configured charset. Like `readFile`, + * the decoded lines are fed to `UnivocityParser.parseIterator` without a re-appended terminator. 
+ * + * @param in bytes of one already-decompressed archive entry; not closed here (the archive owns + * the underlying stream). + * @param options CSV options supplying the read line separator and charset. + * @return an iterator over the entry's lines. + */ + private def entryLines(in: InputStream, options: CSVOptions): Iterator[String] = { + ArchiveReader.lineIterator(in, options.lineSeparatorInRead).map { line => + new String(line.getBytes, 0, line.getLength, options.charset) + } + } + override def infer( sparkSession: SparkSession, inputPaths: Seq[FileStatus], @@ -227,6 +304,18 @@ object MultiLineCSVDataSource extends CSVDataSource with Logging { requiredSchema) } + override def readArchive( + conf: Configuration, + file: PartitionedFile, + getParser: () => UnivocityParser, + getHeaderChecker: (Boolean, String) => CSVHeaderChecker, + requiredSchema: StructType): Iterator[InternalRow] = + // Stream each tar entry whole through the multi-line parser (a fresh parser/header checker is + // built per entry). 
+ streamArchiveEntries(conf, file, getParser, getHeaderChecker) { (parser, headerChecker, in) => + UnivocityParser.parseStream(in, parser, headerChecker, requiredSchema) + } + override def infer( sparkSession: SparkSession, inputPaths: Seq[FileStatus], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 77a0c53ae4699..ab570de8d998f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -44,6 +44,11 @@ case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister { options: Map[String, String], path: Path): Boolean = { val parsedOptions = getCsvOptions(sparkSession, options) + // A tar archive is decompressed/unpacked as a sequential stream, so it must be read as a + // single split rather than carved into byte ranges. + if (parsedOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(path)) { + return false + } CSVDataSource(parsedOptions).isSplitable && super.isSplitable(sparkSession, options, path) } @@ -119,24 +124,26 @@ case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister { dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) val actualRequiredSchema = StructType( requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) - val parser = new UnivocityParser( - actualDataSchema, - actualRequiredSchema, - parsedOptions, - actualFilters) // Use column pruning when specified by Catalyst, except when one or more columns have // existence default value(s), since in that case we instruct the CSV parser to disable column // pruning and instead read each entire row in order to correctly assign the default value(s). 
val schema = if (isColumnPruningEnabled) actualRequiredSchema else actualDataSchema - val isStartOfFile = file.start == 0 - val headerChecker = new CSVHeaderChecker( - schema, parsedOptions, source = s"CSV file: ${file.urlEncodedPath}", isStartOfFile) - CSVDataSource(parsedOptions).readFile( - conf, - file, - parser, - headerChecker, - requiredSchema) + + def newParser(): UnivocityParser = + new UnivocityParser(actualDataSchema, actualRequiredSchema, parsedOptions, actualFilters) + def getHeaderChecker(isStartOfFile: Boolean, source: String): CSVHeaderChecker = + new CSVHeaderChecker(schema, parsedOptions, source, isStartOfFile) + + // A tar archive (always a single split, see `isSplitable`) is streamed entry by entry when + // archive reads are enabled; otherwise the file is parsed directly. + if (parsedOptions.archiveFormatEnabled && ArchiveReader.isArchivePath(file.toPath)) { + CSVDataSource(parsedOptions).readArchive( + conf, file, () => newParser(), getHeaderChecker, requiredSchema) + } else { + val parser = newParser() + val headerChecker = getHeaderChecker(file.start == 0, s"CSV file: ${file.urlEncodedPath}") + CSVDataSource(parsedOptions).readFile(conf, file, parser, headerChecker, requiredSchema) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala new file mode 100644 index 0000000000000..9d0300ec41177 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReadSuiteBase.scala @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.File +import java.nio.file.Files + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.sql.{DataFrame, QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.Utils + +/** + * Format- and archive-agnostic end-to-end tests for reading archives of data files through the + * streaming [[ArchiveReader]] path. Entries are streamed (never unpacked to disk), and the central + * contract verified throughout is parity with reading the same files from a directory. + * + * A concrete suite binds the abstract hooks below by mixing in a file-format trait (e.g. + * [[org.apache.spark.sql.execution.datasources.CSVArchiveReadBase]]) and an archive-format trait + * (e.g. [[TarArchiveReadBase]]), so the same tests run for every (file format, archive format) + * pair we support. 
New formats are added by writing the per-format trait once, not by duplicating + * these tests: + * {{{ + * class CSVHeaderTarArchiveReadSuite + * extends ArchiveReadSuiteBase with CSVHeaderArchiveReadBase with TarArchiveReadBase + * }}} + */ +trait ArchiveReadSuiteBase extends QueryTest with SharedSparkSession { + + override def sparkConf: SparkConf = + super.sparkConf.set(SQLConf.ARCHIVE_FORMAT_READER_ENABLED.key, "true") + + import testImplicits._ + + // ----- file-format hooks (bound by e.g. CSVArchiveReadBase) ---------------- + + /** The `DataFrameReader`/`DataFrameWriter` format name, e.g. "csv". */ + protected def format: String + + /** Extension of a single data file of [[format]] inside an archive, e.g. "csv". */ + protected def fileExtension: String + + /** Read options applied to every read (e.g. CSV `header`). */ + protected def readOptions: Map[String, String] + + /** Schema used to read the sample data produced by [[sampleDf]]. */ + protected def readSchema: String + + /** Encodes `df` as the bytes of a single data file of [[format]], honoring `writeOptions`. */ + protected def encodeFile(df: DataFrame, writeOptions: Map[String, String]): Array[Byte] + + /** Encodes `df` as a single data file using only the format's default write options. */ + protected final def encodeFile(df: DataFrame): Array[Byte] = encodeFile(df, Map.empty) + + // ----- archive-format hooks (bound by e.g. TarArchiveReadBase) ------------- + + /** Archive extensions to exercise, e.g. Seq("tar", "tar.gz", "tgz"). The head is the default. */ + protected def archiveExtensions: Seq[String] + + /** Writes `entries` (name -> bytes) into the archive at `dest`; compression follows the ext. */ + protected def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit + + /** Writes bytes that are not a readable archive at `dest` (of [[corruptArchiveExtension]]). 
*/ + protected def writeCorruptArchive(dest: File): Unit + + /** An archive extension whose reader fails on corrupt bytes (used by the corrupt-file tests). */ + protected def corruptArchiveExtension: String + + // ----- helpers ------------------------------------------------------------- + + /** Sample two-column data; the column names line up with [[readSchema]]. */ + protected def sampleDf(rows: (Int, String)*): DataFrame = rows.toDF("id", "name") + + /** Entry file name for the i-th data file in an archive. */ + protected def entryName(i: Int): String = s"part-$i.$fileExtension" + + /** Provides an archive-extensioned path inside a fresh temp dir to `f`. */ + protected def withArchiveFile( + extension: String = archiveExtensions.head)(f: File => Unit): Unit = { + val dir = Utils.createTempDir(namePrefix = "archive-test") + try f(new File(dir, s"archive.$extension")) finally Utils.deleteRecursively(dir) + } + + /** Reads `path` with the format, [[readOptions]] (plus `extraOptions`), and `schema`. */ + protected def read( + path: String, + extraOptions: Map[String, String] = Map.empty, + schema: String = readSchema): DataFrame = + spark.read.format(format).options(readOptions ++ extraOptions).schema(schema).load(path) + + /** + * Writes `entries` both into an archive and as loose files in a directory, then asserts the + * archive read produces exactly the same rows as the directory read. 
+ */ + protected def assertArchiveMatchesDir( + entries: Seq[(String, Array[Byte])], + extraOptions: Map[String, String] = Map.empty, + schema: String = readSchema): Unit = { + withArchiveFile() { archive => + writeArchive(archive, entries) + val fromArchive = read(archive.getCanonicalPath, extraOptions, schema) + withTempDir { dir => + entries.foreach { case (name, b) => Files.write(new File(dir, name).toPath, b) } + checkAnswer(fromArchive, read(dir.getCanonicalPath, extraOptions, schema).collect().toSeq) + } + } + } + + // ----- tests --------------------------------------------------------------- + + test("read an archive of multiple entries matches the union of the inputs") { + archiveExtensions.foreach { ext => + withArchiveFile(ext) { archive => + val parts = Seq( + sampleDf((1, "Alice"), (2, "Bob")), + sampleDf((3, "Carol")), + sampleDf((4, "Dan"), (5, "Eve"))) + writeArchive( + archive, parts.zipWithIndex.map { case (p, i) => entryName(i) -> encodeFile(p) }) + checkAnswer(read(archive.getCanonicalPath), parts.reduce(_ union _)) + } + } + } + + test("archive entries parse like a directory of the same files") { + val parts = Seq(sampleDf((1, "Alice"), (2, "Bob")), sampleDf((3, "Carol"))) + assertArchiveMatchesDir(parts.zipWithIndex.map { case (p, i) => entryName(i) -> encodeFile(p) }) + } + + test("column pruning selects a subset of columns") { + withArchiveFile() { archive => + val data = sampleDf((1, "Alice"), (2, "Bob")) + writeArchive(archive, Seq(entryName(0) -> encodeFile(data))) + checkAnswer(read(archive.getCanonicalPath).select("name"), Seq(Row("Alice"), Row("Bob"))) + } + } + + test("multiple entries and multiple loose files under a partitioned dir, plus an empty archive") { + withTempDir { rootDir => + val partitionDir = new File(rootDir, "dt=2024-01-01") + assert(partitionDir.mkdirs()) + + val inArchive = Seq(sampleDf((1, "in-archive-a")), sampleDf((2, "in-archive-b"))) + val loose = Seq(sampleDf((3, "loose-a")), sampleDf((4, "loose-b"))) + val 
ext = archiveExtensions.head + + writeArchive( + new File(partitionDir, s"data.$ext"), + inArchive.zipWithIndex.map { case (p, i) => entryName(i) -> encodeFile(p) }) + // An empty archive in the same directory must contribute no rows. + writeArchive(new File(partitionDir, s"empty.$ext"), Seq.empty) + loose.zipWithIndex.foreach { case (p, i) => + Files.write(new File(partitionDir, s"loose-$i.$fileExtension").toPath, encodeFile(p)) + } + + val expected = (inArchive ++ loose).reduce(_ union _) + checkAnswer(read(rootDir.getCanonicalPath).select("id", "name"), expected) + } + } + + test("a directory of only empty archives yields no rows") { + withTempDir { dir => + archiveExtensions.foreach { ext => + writeArchive(new File(dir, s"empty-${ext.replace('.', '_')}.$ext"), Seq.empty) + } + checkAnswer(read(dir.getCanonicalPath), Seq.empty[Row]) + } + } + + test("an empty archive yields no rows") { + withArchiveFile() { archive => + writeArchive(archive, Seq.empty) + checkAnswer(read(archive.getCanonicalPath), Seq.empty[Row]) + } + } + + test("an archive always yields a single partition regardless of size") { + withArchiveFile() { archive => + val big = sampleDf((1 to 1000).map(i => (i, s"value-$i")): _*) + writeArchive(archive, (0 until 4).map(i => entryName(i) -> encodeFile(big))) + withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> "1024") { + val readDf = read(archive.getCanonicalPath) + assert(readDf.rdd.getNumPartitions == 1, + s"archive should be a single partition; got ${readDf.rdd.getNumPartitions}") + assert(readDf.count() == 4L * big.count()) + } + } + } + + Seq(true, false).foreach { ignoreCorrupt => + test(s"ignoreCorruptFiles=$ignoreCorrupt controls whether a corrupt archive is skipped") { + withArchiveFile(corruptArchiveExtension) { archive => + writeCorruptArchive(archive) + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> ignoreCorrupt.toString) { + if (ignoreCorrupt) { + checkAnswer(read(archive.getCanonicalPath), Seq.empty[Row]) + } else { + 
intercept[SparkException](read(archive.getCanonicalPath).collect()) + } + } + } + } + } + + test("a corrupt archive among good ones is skipped whole, not per entry (ignoreCorruptFiles)") { + withTempDir { dir => + val good = sampleDf((1, "Alice"), (2, "Bob")) + writeArchive(new File(dir, s"good.${archiveExtensions.head}"), + Seq(entryName(0) -> encodeFile(good))) + writeCorruptArchive(new File(dir, s"bad.$corruptArchiveExtension")) + // A tar is one non-splittable unit, so corrupt handling is archive-granular: the corrupt + // archive is skipped in its entirety while the good archive's rows are still returned. + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { + checkAnswer(read(dir.getCanonicalPath), good) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala new file mode 100644 index 0000000000000..48482aa1f5b3e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ArchiveReaderSuite.scala @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Exercises the streaming [[ArchiveReader]] on its own. Two areas are covered: the
 * `isArchivePath` extension check, and `readEntries` -- entry order, both gzip spellings,
 * skipping of directory and dot-prefixed entries, on-demand advancing between entries, an entry
 * stream that the callback may close, and cleanup. Fixture tars are written into a temp dir;
 * entry contents are only ever read back through the streams that `readEntries` hands out.
 */
class ArchiveReaderSuite extends SparkFunSuite {

  /** A tar member used to build fixtures. `data` is not written when `isDir` is set. */
  private case class Entry(name: String, data: Array[Byte], isDir: Boolean = false)

  /** Writes `entries` to `file` as an uncompressed tar. */
  private def writeTar(file: File, entries: Seq[Entry]): Unit = {
    val sink = new FileOutputStream(file)
    writeTarTo(sink, entries)
  }

  /** Writes `entries` to `file` as a gzip-compressed tar (fixture for `.tar.gz` and `.tgz`). */
  private def writeTarGz(file: File, entries: Seq[Entry]): Unit = {
    val sink = new GZIPOutputStream(new FileOutputStream(file))
    writeTarTo(sink, entries)
  }

  /** Serializes `entries`, in order, as a tar onto `rawOut`; the tar stream is always closed. */
  private def writeTarTo(rawOut: OutputStream, entries: Seq[Entry]): Unit = {
    val tar = new TarArchiveOutputStream(rawOut)
    try {
      for (entry <- entries) {
        if (entry.isDir) {
          // A trailing slash is what makes TarArchiveEntry treat the name as a directory.
          val dirName = if (entry.name.endsWith("/")) entry.name else entry.name + "/"
          tar.putArchiveEntry(new TarArchiveEntry(dirName))
        } else {
          val fileEntry = new TarArchiveEntry(entry.name)
          fileEntry.setSize(entry.data.length.toLong)
          tar.putArchiveEntry(fileEntry)
          tar.write(entry.data)
        }
        tar.closeArchiveEntry()
      }
      tar.finish()
    } finally {
      tar.close()
    }
  }

  /** Builds a regular-file entry whose content is `body` encoded as UTF-8. */
  private def textEntry(name: String, body: String): Entry =
    Entry(name = name, data = body.getBytes(StandardCharsets.UTF_8))

  /** Reads `in` until end of stream and returns everything read. Does not close `in`. */
  private def readAll(in: InputStream): Array[Byte] = {
    val sink = new ByteArrayOutputStream()
    val chunk = new Array[Byte](4096)
    Iterator.continually(in.read(chunk))
      .takeWhile(_ >= 0)
      .foreach(count => sink.write(chunk, 0, count))
    sink.toByteArray
  }

  /** Runs `ArchiveReader.readEntries` over `file`, returning `(name, UTF-8 text)` per entry. */
  private def collect(file: File): Seq[(String, String)] = {
    val reader = ArchiveReader(new Path(file.toURI))
    val decoded = reader.readEntries(new Configuration()) { (name, in) =>
      val text = new String(readAll(in), StandardCharsets.UTF_8)
      Iterator.single((name, text))
    }
    decoded.toList
  }

  /**
   * Creates `fileName` inside a fresh temp dir, fills it with `entries` (gzip-compressed when
   * `gzip` is set), and hands the resulting file to `check`.
   */
  private def withTarFixture(fileName: String, entries: Seq[Entry], gzip: Boolean = false)(
      check: File => Unit): Unit = {
    withTempDir { dir =>
      val archive = new File(dir, fileName)
      if (gzip) writeTarGz(archive, entries) else writeTar(archive, entries)
      check(archive)
    }
  }

  // ===== isArchivePath =====

  test("isArchivePath: positive cases") {
    val plainTar = Seq("foo.tar", "FOO.TAR", "/a/b/c/x.tar", "weird.TaR")
    val tarGz = Seq("foo.tar.gz", "FOO.TAR.GZ", "mixed.Tar.Gz", "/a/b/c/x.tar.gz")
    val tgz = Seq("foo.tgz", "FOO.TGZ", "/a/b/c/x.tgz")
    for (p <- plainTar ++ tarGz ++ tgz) {
      assert(ArchiveReader.isArchivePath(new Path(p)), s"expected archive match for $p")
    }
  }

  test("isArchivePath: negative cases") {
    val nonArchives = Seq("foo.csv", "foo.gz", "foo", "dir/", "foo.tarball", "data.zip",
      "foo.tar.bz2", "foo.targz")
    for (p <- nonArchives) {
      assert(!ArchiveReader.isArchivePath(new Path(p)), s"expected non-match for $p")
    }
  }

  // ===== readEntries =====

  test("readEntries: empty tar yields empty iterator") {
    withTarFixture("empty.tar", Seq.empty) { tar =>
      assert(collect(tar).isEmpty)
    }
  }

  test("readEntries: single entry exposes its name and bytes") {
    withTarFixture("single.tar", Seq(textEntry("only.csv", "hello\n"))) { tar =>
      assert(collect(tar) == Seq("only.csv" -> "hello\n"))
    }
  }

  test("readEntries: multiple entries chained in tar order") {
    val members = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"), textEntry("c.csv", "c"))
    withTarFixture("multi.tar", members) { tar =>
      assert(collect(tar) == Seq("a.csv" -> "a", "b.csv" -> "b", "c.csv" -> "c"))
    }
  }

  test("readEntries: gzipped tar (.tar.gz) via Hadoop codec factory") {
    val members = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
    withTarFixture("data.tar.gz", members, gzip = true) { tarGz =>
      assert(collect(tarGz) == Seq("a.csv" -> "a", "b.csv" -> "b"))
    }
  }

  test("readEntries: gzipped tar (.tgz) via explicit GZIPInputStream wrap") {
    val members = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
    withTarFixture("data.tgz", members, gzip = true) { tgz =>
      assert(collect(tgz) == Seq("a.csv" -> "a", "b.csv" -> "b"))
    }
  }

  test("readEntries: directory entries are skipped") {
    val members = Seq(
      Entry("subdir", Array.emptyByteArray, isDir = true),
      textEntry("subdir/data.csv", "x"))
    withTarFixture("dirs.tar", members) { tar =>
      assert(collect(tar) == Seq("subdir/data.csv" -> "x"))
    }
  }

  test("readEntries: dotfile entries (e.g. macOS ._foo) are skipped") {
    val appleDoubleSidecar = textEntry("._real.csv", "junk")
    val bareDotfile = textEntry(".hidden", "ignored")
    val regular = textEntry("real.csv", "kept")
    val nestedDotfile = textEntry("nested/._sidecar", "junk2")
    val members = Seq(appleDoubleSidecar, bareDotfile, regular, nestedDotfile)
    withTarFixture("dots.tar", members) { tar =>
      assert(collect(tar) == Seq("real.csv" -> "kept"))
    }
  }

  test("readEntries: advances lazily, one entry at a time") {
    val members = Seq("a", "b", "c").map(stem => textEntry(s"$stem.csv", stem))
    withTarFixture("lazy.tar", members) { tar =>
      val opened = ArrayBuffer.empty[String]
      def openedSoFar: List[String] = opened.toList

      // The callback never reads the stream and emits exactly one element per entry, so the
      // contents of `opened` show precisely when the reader moved on to another entry.
      val names = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) {
        (name, _) =>
          opened += name
          Iterator.single(name)
      }

      // Only the first entry has been opened once the iterator exists.
      assert(openedSoFar == List("a.csv"))
      assert(names.hasNext)
      assert(names.next() == "a.csv")
      // Nothing has asked for a second element yet, so the second entry is still unopened.
      assert(openedSoFar == List("a.csv"))
      assert(names.next() == "b.csv")
      assert(openedSoFar == List("a.csv", "b.csv"))
      assert(names.next() == "c.csv")
      assert(openedSoFar == List("a.csv", "b.csv", "c.csv"))
      assert(!names.hasNext)
      assert(opened.size == 3)
    }
  }

  test("readEntries: a parseEntry that closes its stream still advances to the next entry") {
    val members = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
    withTarFixture("close.tar", members) { tar =>
      val bodies = ArrayBuffer.empty[String]
      val names = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) {
        (name, in) =>
          val text = new String(readAll(in), StandardCharsets.UTF_8)
          // Closing the per-entry stream must leave the archive itself readable.
          in.close()
          bodies += text
          Iterator.single(name)
      }
      assert(names.toList == List("a.csv", "b.csv"))
      assert(bodies.toList == List("a", "b"))
    }
  }

  test("readEntries: close() is safe, idempotent, and stops iteration") {
    val members = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
    withTarFixture("closeable.tar", members) { tar =>
      val names = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) {
        (name, _) => Iterator.single(name)
      }
      assert(names.hasNext)
      names.asInstanceOf[Closeable].close()
      // A second close must be a no-op.
      names.asInstanceOf[Closeable].close()
      assert(!names.hasNext)
    }
  }

  test("readEntries: TaskContext completion cleans up without error") {
    val members = Seq(textEntry("a.csv", "a"), textEntry("b.csv", "b"))
    withTarFixture("ctx.tar", members) { tar =>
      val taskContext = new TaskContextImpl(
        stageId = 0,
        stageAttemptNumber = 0,
        partitionId = 0,
        taskAttemptId = 1L,
        attemptNumber = 0,
        numPartitions = 0,
        taskMemoryManager = null,
        localProperties = new Properties,
        metricsSystem = null,
        cpus = 1)
      TaskContext.setTaskContext(taskContext)
      try {
        val names = ArchiveReader(new Path(tar.toURI)).readEntries(new Configuration()) {
          (name, _) => Iterator.single(name)
        }
        assert(names.hasNext)
        // Consume one element so the archive is in use. Presumably the reader has registered
        // its task-completion listener by this point -- confirm against ArchiveReader.
        names.next()
        // Complete the task while the iterator is neither exhausted nor closed.
        taskContext.markTaskCompleted(None)
      } finally {
        TaskContext.unset()
      }
    }
  }
}
/**
 * CSV implementation of the file-format hooks declared by [[ArchiveReadSuiteBase]]. Whether the
 * files carry a header row is left abstract (`header`); the [[CSVHeaderArchiveReadBase]] and
 * [[CSVHeaderlessArchiveReadBase]] sub-traits fix it and add their mode-specific tests, so the
 * shared archive tests run once per mode.
 */
trait CSVArchiveReadBase extends ArchiveReadSuiteBase {

  /** `true` when CSV files are both written and read with a header row. */
  protected def header: Boolean

  override protected def format: String = "csv"

  override protected def fileExtension: String = "csv"

  override protected def readOptions: Map[String, String] = Map("header" -> header.toString)

  override protected def readSchema: String = "id INT, name STRING"

  /**
   * Serializes `df` to the bytes of a single CSV file by writing it, coalesced to one partition,
   * into a scratch directory and reading the lone data file back. `writeOptions` are applied on
   * top of the `header` option. The scratch directory is always removed.
   */
  override protected def encodeFile(
      df: DataFrame,
      writeOptions: Map[String, String]): Array[Byte] = {
    val scratch = Utils.createTempDir(namePrefix = "archive-test-encode")
    try {
      val csvOptions = Map("header" -> header.toString) ++ writeOptions
      df.coalesce(1)
        .write
        .format("csv")
        .options(csvOptions)
        .mode("overwrite")
        .save(scratch.getCanonicalPath)

      // Keep regular files only, dropping "_"-prefixed, "."-prefixed and ".crc" companions.
      val dataFiles = scratch.listFiles().filter { candidate =>
        val fileName = candidate.getName
        val isCompanion =
          fileName.startsWith("_") || fileName.startsWith(".") || fileName.endsWith(".crc")
        candidate.isFile && !isCompanion
      }
      assert(dataFiles.length == 1,
        s"expected exactly one data file, got: ${dataFiles.map(_.getName).toList}")
      Files.readAllBytes(dataFiles.head.toPath)
    } finally {
      Utils.deleteRecursively(scratch)
    }
  }

  /** UTF-8 bytes of a hand-written CSV document, for tests that need an exact row layout. */
  protected def csvBytes(s: String): Array[Byte] = s.getBytes(StandardCharsets.UTF_8)

  test("CSV: reading an archive without a schema fails (inference not yet supported)") {
    // No schema is supplied below, so the read has to infer one; for archives that is expected
    // to be rejected with the UNABLE_TO_INFER_SCHEMA condition.
    withArchiveFile() { archive =>
      val name = entryName(0)
      val payload = encodeFile(sampleDf((1, "Alice"), (2, "Bob")))
      writeArchive(archive, Seq(name -> payload))

      val error = intercept[AnalysisException] {
        spark.read.format(format).options(readOptions).load(archive.getCanonicalPath)
      }
      assert(error.getCondition == "UNABLE_TO_INFER_SCHEMA",
        s"expected UNABLE_TO_INFER_SCHEMA, got ${error.getCondition}: ${error.getMessage}")
    }
  }
}
/**
 * Header-mode flavour of [[CSVArchiveReadBase]]: `header` is fixed to `true`. Adds the tests
 * that depend on a header row -- entries whose headers disagree, and delimiter / multi-line
 * inputs whose first line is a header.
 */
trait CSVHeaderArchiveReadBase extends CSVArchiveReadBase {

  import testImplicits._

  override protected def header: Boolean = true

  test("CSV: entries with mismatched headers behave like standalone files") {
    val firstName = entryName(0)
    val matching = encodeFile(sampleDf((1, "Alice"), (2, "Bob")))
    val secondName = entryName(1)
    // This entry's second column is headed "nickname", so the schema's "name" column is absent.
    val mismatched = encodeFile(Seq((3, "Carol")).toDF("id", "nickname"))

    assertArchiveMatchesDir(Seq(firstName -> matching, secondName -> mismatched))
  }

  test("CSV: custom delimiter matches a directory read") {
    val semicolonSeparated = csvBytes("id;name\n1;Alice\n2;Bob\n")

    assertArchiveMatchesDir(
      Seq("a.csv" -> semicolonSeparated),
      extraOptions = Map("delimiter" -> ";"))
  }

  test("CSV: multiline quoted fields with embedded newlines match a directory read") {
    val first = csvBytes("id,note\n1,\"line1\nline2\"\n2,\"plain\"\n")
    val second = csvBytes("id,note\n3,\"a\nb\nc\"\n")

    assertArchiveMatchesDir(
      Seq("a.csv" -> first, "b.csv" -> second),
      extraOptions = Map("multiLine" -> "true"),
      schema = "id INT, note STRING")
  }
}
/**
 * Concrete suite for CSV files with a header row packed in tar archives (`.tar`, `.tar.gz`,
 * `.tgz`). It declares no tests of its own: the shared archive tests come from
 * [[ArchiveReadSuiteBase]], the header-mode CSV tests and CSV file-format hooks from
 * [[CSVHeaderArchiveReadBase]], and the tar container hooks from [[TarArchiveReadBase]].
 */
class CSVHeaderTarArchiveReadSuite
  extends ArchiveReadSuiteBase
  with CSVHeaderArchiveReadBase
  with TarArchiveReadBase
/**
 * Headerless flavour of [[CSVArchiveReadBase]]: `header` is fixed to `false`, so the inputs
 * below consist of data rows only. Adds headerless delimiter and multi-line tests on top of the
 * shared archive tests inherited from [[ArchiveReadSuiteBase]].
 */
trait CSVHeaderlessArchiveReadBase extends CSVArchiveReadBase {

  override protected def header: Boolean = false

  test("CSV: headerless custom delimiter matches a directory read") {
    val first = csvBytes("1;Alice\n2;Bob\n")
    val second = csvBytes("3;Carol\n")

    assertArchiveMatchesDir(
      Seq("a.csv" -> first, "b.csv" -> second),
      extraOptions = Map("delimiter" -> ";"))
  }

  test("CSV: headerless multiline quoted fields with embedded newlines match a directory read") {
    val first = csvBytes("1,\"line1\nline2\"\n2,\"plain\"\n")
    val second = csvBytes("3,\"a\nb\nc\"\n")

    assertArchiveMatchesDir(
      Seq("a.csv" -> first, "b.csv" -> second),
      extraOptions = Map("multiLine" -> "true"),
      schema = "id INT, note STRING")
  }
}
/**
 * Concrete suite for headerless CSV files packed in tar archives (`.tar`, `.tar.gz`, `.tgz`).
 * It declares no tests of its own: the shared archive tests come from [[ArchiveReadSuiteBase]],
 * the headerless CSV tests and CSV file-format hooks from [[CSVHeaderlessArchiveReadBase]], and
 * the tar container hooks from [[TarArchiveReadBase]].
 */
class CSVHeaderlessTarArchiveReadSuite
  extends ArchiveReadSuiteBase
  with CSVHeaderlessArchiveReadBase
  with TarArchiveReadBase
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.{File, FileOutputStream, OutputStream} +import java.nio.charset.StandardCharsets +import java.nio.file.Files +import java.util.Locale +import java.util.zip.GZIPOutputStream + +import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream} + +/** + * Binds [[ArchiveReadSuiteBase]]'s archive-format hooks to tar containers: plain `.tar`, gzipped + * `.tar.gz`, and `.tgz`. Reusable across file formats -- a `TarArchiveReadSuite` mixes this + * in alongside the file-format trait. 
/**
 * Tar implementation of the archive-format hooks declared by [[ArchiveReadSuiteBase]]. Archives
 * are produced as plain `.tar`, or gzip-compressed as `.tar.gz` / `.tgz`. Nothing here depends
 * on the format of the packed files, so the trait can be mixed in next to any file-format trait.
 */
trait TarArchiveReadBase extends ArchiveReadSuiteBase {

  override protected def archiveExtensions: Seq[String] = Seq("tar", "tar.gz", "tgz")

  override protected def corruptArchiveExtension: String = "tar.gz"

  /**
   * Packs `entries` (entry name -> raw bytes) into a tar at `dest`, in the given order. The
   * output is gzip-compressed when the lower-cased file name ends in `.gz` or `.tgz`.
   */
  override protected def writeArchive(dest: File, entries: Seq[(String, Array[Byte])]): Unit = {
    val lowerName = dest.getName.toLowerCase(Locale.ROOT)
    val gzipped = Seq(".gz", ".tgz").exists(suffix => lowerName.endsWith(suffix))
    val fileOut = new FileOutputStream(dest)
    val sink: OutputStream = if (gzipped) new GZIPOutputStream(fileOut) else fileOut

    val tar = new TarArchiveOutputStream(sink)
    try {
      for ((memberName, payload) <- entries) {
        val member = new TarArchiveEntry(memberName)
        member.setSize(payload.length.toLong)
        tar.putArchiveEntry(member)
        tar.write(payload)
        tar.closeArchiveEntry()
      }
      tar.finish()
    } finally {
      tar.close()
    }
  }

  /** Writes plain text to `dest`: bytes that are neither gzip data nor a tar. */
  override protected def writeCorruptArchive(dest: File): Unit = {
    val notAnArchive = "this is not a valid gzip-compressed tar archive"
    Files.write(dest.toPath, notAnArchive.getBytes(StandardCharsets.UTF_8))
  }
}