diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c72fb73a0..26da861c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,13 +31,23 @@ jobs: matrix: os: [ubuntu-22.04] scala: [2.13, 2.12] - java: [temurin@8] - project: [root-spark33, root-spark34, root-spark35] + java: [temurin@8, temurin@17] + project: [root-spark33, root-spark34, root-spark35, root-spark40] exclude: - scala: 2.13 project: root-spark33 - scala: 2.13 project: root-spark34 + - scala: 2.12 + project: root-spark40 + - java: temurin@17 + project: root-spark33 + - java: temurin@17 + project: root-spark34 + - java: temurin@17 + project: root-spark35 + - java: temurin@8 + project: root-spark40 runs-on: ${{ matrix.os }} timeout-minutes: 60 steps: @@ -62,6 +72,19 @@ jobs: if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update + - name: Setup Java (temurin@17) + id: setup-java-temurin-17 + if: matrix.java == 'temurin@17' + uses: actions/setup-java@v5 + with: + distribution: temurin + java-version: 17 + cache: sbt + + - name: sbt update + if: matrix.java == 'temurin@17' && steps.setup-java-temurin-17.outputs.cache-hit == 'false' + run: sbt +update + - name: Check that workflows are up to date run: sbt githubWorkflowCheck @@ -115,6 +138,19 @@ jobs: if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update + - name: Setup Java (temurin@17) + id: setup-java-temurin-17 + if: matrix.java == 'temurin@17' + uses: actions/setup-java@v5 + with: + distribution: temurin + java-version: 17 + cache: sbt + + - name: sbt update + if: matrix.java == 'temurin@17' && steps.setup-java-temurin-17.outputs.cache-hit == 'false' + run: sbt +update + - name: Import signing key if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE == '' env: @@ -169,10 +205,23 @@ jobs: if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update + - name: Setup Java (temurin@17) + id: setup-java-temurin-17 + if: matrix.java == 'temurin@17' + uses: actions/setup-java@v5 + with: + distribution: temurin + java-version: 17 + cache: sbt + + - name: sbt update + if: matrix.java == 'temurin@17' && steps.setup-java-temurin-17.outputs.cache-hit == 'false' + run: sbt +update + - name: Submit Dependencies uses: scalacenter/sbt-dependency-submission@v2 with: - modules-ignore: root-spark33_2.13 root-spark33_2.12 docs_2.13 docs_2.12 root-spark34_2.13 root-spark34_2.12 root-spark35_2.13 root-spark35_2.12 + modules-ignore: root-spark33_2.13 root-spark33_2.12 docs_2.13 docs_2.12 root-spark34_2.13 root-spark34_2.12 root-spark35_2.13 root-spark35_2.12 root-spark40_2.13 configs-ignore: test scala-tool scala-doc-tool test-internal site: @@ -180,7 +229,7 @@ jobs: strategy: matrix: os: [ubuntu-22.04] - java: [temurin@11] + java: [temurin@17] runs-on: ${{ matrix.os }} steps: - name: Checkout current branch (full) @@ -204,17 +253,17 @@ jobs: if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update - - name: Setup Java (temurin@11) - id: setup-java-temurin-11 - if: matrix.java == 'temurin@11' + - name: Setup Java (temurin@17) + id: setup-java-temurin-17 + if: matrix.java == 'temurin@17' uses: actions/setup-java@v5 with: distribution: temurin - java-version: 11 + java-version: 17 cache: sbt - name: sbt update - if: matrix.java == 'temurin@11' && steps.setup-java-temurin-11.outputs.cache-hit == 'false' + if: matrix.java == 'temurin@17' && steps.setup-java-temurin-17.outputs.cache-hit == 'false' run: sbt +update - name: Generate site diff --git a/README.md b/README.md index 8e3f0a8df..bb22cf29e 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ The compatible versions of [Spark](http://spark.apache.org/) and | Frameless | Spark | Cats | Cats-Effect | Scala | |-----------|-----------------------------|----------|-------------|-------------| +| 0.17.0 | 4.0.2† / 3.5.8 / 3.4.4 / 3.3.4 | 2.x | 3.x | 2.12 / 2.13 | | 0.16.0 | 3.5.0 / 3.4.0 / 3.3.0 | 2.x | 3.x | 2.12 / 2.13 | | 0.15.0 | 3.4.0 / 3.3.0 / 3.2.2 | 2.x | 3.x | 2.12 / 2.13 | | 0.14.1 | 3.4.0 / 3.3.0 / 3.2.2 | 2.x | 3.x | 2.12 / 2.13 | @@ -46,6 +47,8 @@ The compatible versions of [Spark](http://spark.apache.org/) and _\* 0.11.0 has broken Spark 3.1.2 and 3.0.1 artifacts published._ +_† The Spark 4.0.x artifacts (`-spark40`) are published for **Scala 2.13 only** and require **JDK 17+**, since Spark 4 dropped Scala 2.12 and JDK 8/11. The default (unsuffixed) artifacts still target Spark 3.5._ + Starting 0.11 we introduced Spark cross published artifacts: * By default, frameless artifacts depend on the most recent Spark version @@ -53,9 +56,10 @@ Starting 0.11 we introduced Spark cross published artifacts: Artifact names examples: -* `frameless-dataset` (the latest Spark dependency) +* `frameless-dataset` (the default Spark 3.5.x dependency) +* `frameless-dataset-spark40` (Spark 4.0.x dependency; Scala 2.13 + JDK 17 only) +* `frameless-dataset-spark34` (Spark 3.4.x dependency) * `frameless-dataset-spark33` (Spark 3.3.x dependency) -* `frameless-dataset-spark32` (Spark 3.2.x dependency) Versions 0.5.x and 0.6.x have identical features. The first is compatible with Spark 2.2.1 and the second with 2.3.0. diff --git a/build.sbt b/build.sbt index a38f38f4d..19e9d5d5b 100644 --- a/build.sbt +++ b/build.sbt @@ -1,4 +1,5 @@ val sparkVersion = "3.5.8" +val spark40Version = "4.0.2" val spark34Version = "3.4.4" val spark33Version = "3.3.4" val catsCoreVersion = "2.13.0" @@ -26,12 +27,25 @@ lazy val root = project .enablePlugins(NoPublishPlugin) .settings(crossScalaVersions := Nil) .aggregate( + `root-spark40`, `root-spark35`, `root-spark34`, `root-spark33`, docs ) +lazy val `root-spark40` = project + .in(file(".spark40")) + .enablePlugins(NoPublishPlugin) + .settings(crossScalaVersions := Seq(Scala213)) + .aggregate( + core, + `cats-spark40`, + `dataset-spark40`, + `refined-spark40`, + `ml-spark40` + ) + lazy val `root-spark35` = project .in(file(".spark35")) .enablePlugins(NoPublishPlugin) @@ -76,6 +90,15 @@ lazy val `cats-spark34` = project `dataset-spark34` % "test->test;compile->compile;provided->provided" ) +lazy val `cats-spark40` = project + .settings(name := "frameless-cats-spark40") + .settings(sourceDirectory := (cats / sourceDirectory).value) + .settings(catsSettings) + .settings(spark40Settings) + .dependsOn( + `dataset-spark40` % "test->test;compile->compile;provided->provided" + ) + lazy val `cats-spark33` = project .settings(name := "frameless-cats-spark33") .settings(sourceDirectory := (cats / sourceDirectory).value) @@ -111,6 +134,20 @@ lazy val `dataset-spark34` = project .settings(spark34Settings) .dependsOn(core % "test->test;compile->compile") +lazy val `dataset-spark40` = project + .settings(name := "frameless-dataset-spark40") + .settings(sourceDirectory := (dataset / sourceDirectory).value) + .settings( + Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-4" + ) + .settings( + Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+" + ) + .settings(datasetSettings) + .settings(sparkDependencies(spark40Version)) + .settings(spark40Settings) + .dependsOn(core % "test->test;compile->compile") + lazy val `dataset-spark33` = project .settings(name := "frameless-dataset-spark33") .settings(sourceDirectory := (dataset / sourceDirectory).value) @@ -139,6 +176,15 @@ lazy val `refined-spark34` = project `dataset-spark34` % "test->test;compile->compile;provided->provided" ) +lazy val `refined-spark40` = project + .settings(name := "frameless-refined-spark40") + .settings(sourceDirectory := (refined / sourceDirectory).value) + .settings(refinedSettings) + .settings(spark40Settings) + .dependsOn( + `dataset-spark40` % "test->test;compile->compile;provided->provided" + ) + lazy val `refined-spark33` = project .settings(name := "frameless-refined-spark33") .settings(sourceDirectory := (refined / sourceDirectory).value) @@ -168,6 +214,17 @@ lazy val `ml-spark34` = project `dataset-spark34` % "test->test;compile->compile;provided->provided" ) +lazy val `ml-spark40` = project + .settings(name := "frameless-ml-spark40") + .settings(sourceDirectory := (ml / sourceDirectory).value) + .settings(mlSettings) + .settings(sparkMlDependencies(spark40Version)) + .settings(spark40Settings) + .dependsOn( + core % "test->test;compile->compile", + `dataset-spark40` % "test->test;compile->compile;provided->provided" + ) + lazy val `ml-spark33` = project .settings(name := "frameless-ml-spark33") .settings(sourceDirectory := (ml / sourceDirectory).value) @@ -191,7 +248,14 @@ lazy val docs = project "org.typelevel" % "kind-projector" % "0.13.4" cross CrossVersion.full ), scalacOptions += "-Ydelambdafy:inline", - libraryDependencies += "org.typelevel" %% "mouse" % "1.3.2" + libraryDependencies += "org.typelevel" %% "mouse" % "1.3.2", + // mdoc executes Spark code via `Compile / runMain`; on JDK 17 (the site CI job) Spark + // needs the module --add-opens flags, so fork the run and pass them through. Forking + // changes the working directory, so pin it to the repo root where the docs read their + // relative data files (e.g. docs/iris.data). + Compile / run / fork := true, + Compile / run / javaOptions ++= sparkJava17Options, + Compile / run / baseDirectory := (LocalRootProject / baseDirectory).value ) .dependsOn(dataset, cats, ml) @@ -241,7 +305,13 @@ lazy val datasetSettings = mc("frameless.functions.FramelessLit"), mc(f"frameless.functions.FramelessLit$$"), dmm("frameless.functions.package.litAggr"), - dmm("org.apache.spark.sql.FramelessInternals.column") + dmm("org.apache.spark.sql.FramelessInternals.column"), + // FramelessInternals is internal plumbing (Spark-version compat seam), not part of + // the intended public API. Spark 4 required reworking it: `column` is now the + // Expression->Column bridge and `mkDataset` derives the session from the source + // Dataset instead of taking a SQLContext. + imt("org.apache.spark.sql.FramelessInternals.column"), + imt("org.apache.spark.sql.FramelessInternals.mkDataset") ) }, coverageExcludedPackages := "org.apache.spark.sql.reflection", @@ -304,6 +374,27 @@ lazy val scalacOptionSettings = Def.setting { baseScalacOptions(scalaVersion.value) } +// JVM flags Spark needs on JDK 17+ (the module system blocks its reflective access +// to java.base internals otherwise). Empty on JDK 8/11. Reused by tests and the docs run. +lazy val sparkJava17Options: Seq[String] = + if (sys.props("java.specification.version").toDouble >= 17.0) { + Seq( + "--add-opens=java.base/java.lang=ALL-UNNAMED", + "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", + "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED", + "--add-opens=java.base/java.io=ALL-UNNAMED", + "--add-opens=java.base/java.net=ALL-UNNAMED", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + "--add-opens=java.base/java.util=ALL-UNNAMED", + "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED", + "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", + "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED", + "--add-opens=java.base/sun.security.action=ALL-UNNAMED", + "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED" + ) + } else Seq.empty + lazy val framelessSettings = Seq( scalacOptions ++= scalacOptionSettings.value, Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oDF"), @@ -313,28 +404,7 @@ lazy val framelessSettings = Seq( "org.scalatestplus" %% "scalatestplus-scalacheck" % scalatestplus % Test, "org.scalacheck" %% "scalacheck" % scalacheck % Test ), - Test / javaOptions ++= { - val baseOptions = Seq("-Xmx1G", "-ea") - val java17Options = - if (sys.props("java.specification.version").toDouble >= 17.0) { - Seq( - "--add-opens=java.base/java.lang=ALL-UNNAMED", - "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", - "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED", - "--add-opens=java.base/java.io=ALL-UNNAMED", - "--add-opens=java.base/java.net=ALL-UNNAMED", - "--add-opens=java.base/java.nio=ALL-UNNAMED", - "--add-opens=java.base/java.util=ALL-UNNAMED", - "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED", - "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED", - "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", - "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED", - "--add-opens=java.base/sun.security.action=ALL-UNNAMED", - "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED" - ) - } else Seq.empty - baseOptions ++ java17Options - }, + Test / javaOptions ++= Seq("-Xmx1G", "-ea") ++ sparkJava17Options, Test / fork := true, Test / parallelExecution := false, mimaPreviousArtifacts ~= { @@ -352,6 +422,15 @@ lazy val framelessSettings = Seq( libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always ) ++ consoleSettings +lazy val spark40Settings = Seq[Setting[_]]( + // Spark 4 dropped Scala 2.12 support; this module is 2.13-only. + crossScalaVersions := Seq(Scala213), + scalaVersion := Scala213, + tlVersionIntroduced := Map("2.13" -> "0.17.0"), + // Brand-new artifact: no previously published version to check binary compatibility against. + mimaPreviousArtifacts := Set.empty +) + lazy val spark34Settings = Seq[Setting[_]]( tlVersionIntroduced := Map("2.12" -> "0.14.1", "2.13" -> "0.14.1"), mimaPreviousArtifacts := Set( @@ -427,12 +506,32 @@ ThisBuild / developers := List( ThisBuild / tlCiReleaseBranches := Seq("master") ThisBuild / tlSitePublishBranch := Some("master") -val roots = List("root-spark33", "root-spark34", "root-spark35") +// Spark 3.x roots: 3.3/3.4 build on 2.12 only, 3.5 builds on both 2.12 and 2.13. +val spark3Roots = List("root-spark33", "root-spark34", "root-spark35") +// Spark 4.x roots: Scala 2.13 only (Spark 4 dropped 2.12). +val spark4Roots = List("root-spark40") +val roots = spark3Roots ++ spark4Roots + +// Spark 3.x builds/tests on JDK 8; Spark 4 requires JDK 17+. +val spark3Java = JavaSpec.temurin("8") +val spark4Java = JavaSpec.temurin("17") + +ThisBuild / githubWorkflowJavaVersions := Seq(spark3Java, spark4Java) ThisBuild / githubWorkflowBuildMatrixAdditions += "project" -> roots -ThisBuild / githubWorkflowBuildMatrixExclusions ++= roots.init.map { project => - MatrixExclude(Map("scala" -> "2.13", "project" -> project)) -} +ThisBuild / githubWorkflowBuildMatrixExclusions ++= + // 3.3/3.4 are 2.12-only; 3.5 builds both. Spark 4 is 2.13-only. + spark3Roots.init.map { project => + MatrixExclude(Map("scala" -> "2.13", "project" -> project)) + } ++ spark4Roots.map { project => + MatrixExclude(Map("scala" -> "2.12", "project" -> project)) + } ++ + // Pin each Spark line to its JDK: 3.x on JDK 8, 4.x on JDK 17. + spark3Roots.map { project => + MatrixExclude(Map("java" -> spark4Java.render, "project" -> project)) + } ++ spark4Roots.map { project => + MatrixExclude(Map("java" -> spark3Java.render, "project" -> project)) + } ThisBuild / githubWorkflowEnv += "SBT_OPTS" -> "-Xms1g -Xmx4g" diff --git a/dataset/src/main/scala/frameless/TypedColumn.scala b/dataset/src/main/scala/frameless/TypedColumn.scala index 0bbaf6fed..2888d8608 100644 --- a/dataset/src/main/scala/frameless/TypedColumn.scala +++ b/dataset/src/main/scala/frameless/TypedColumn.scala @@ -1,11 +1,16 @@ package frameless -import frameless.functions.{litAggr, lit => flit} +import frameless.functions.{ litAggr, lit => flit } import frameless.syntax._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.DecimalType -import org.apache.spark.sql.{Column, FramelessInternals} +import org.apache.spark.sql.{ Column, FramelessInternals } + +// Spark 4 added org.apache.spark.sql.catalyst.expressions.With, which the wildcard import +// above would otherwise bind in preference to frameless.With. Alias frameless.With so its +// references resolve consistently on every supported Spark version. +import frameless.{ With => FWith } import shapeless._ import shapeless.ops.record.Selector @@ -21,98 +26,128 @@ sealed trait UntypedExpression[T] { override def toString: String = expr.toString() } -/** Expression used in `select`-like constructions. - */ -sealed class TypedColumn[T, U](expr: Expression)( - implicit val uenc: TypedEncoder[U] -) extends AbstractTypedColumn[T, U](expr) { +/** + * Expression used in `select`-like constructions. + */ +sealed class TypedColumn[T, U]( + expr: Expression + )(implicit + val uenc: TypedEncoder[U]) + extends AbstractTypedColumn[T, U](expr) { type ThisType[A, B] = TypedColumn[A, B] - def this(column: Column)(implicit uencoder: TypedEncoder[U]) = + def this( + column: Column + )(implicit + uencoder: TypedEncoder[U] + ) = this(FramelessInternals.expr(column)) - override def typed[W, U1: TypedEncoder](c: Column): TypedColumn[W, U1] = c.typedColumn + override def typed[W, U1: TypedEncoder](c: Column): TypedColumn[W, U1] = + c.typedColumn override def lit[U1: TypedEncoder](c: U1): TypedColumn[T, U1] = flit(c) } -/** Expression used in `agg`-like constructions. - */ -sealed class TypedAggregate[T, U](expr: Expression)( - implicit val uenc: TypedEncoder[U] -) extends AbstractTypedColumn[T, U](expr) { +/** + * Expression used in `agg`-like constructions. + */ +sealed class TypedAggregate[T, U]( + expr: Expression + )(implicit + val uenc: TypedEncoder[U]) + extends AbstractTypedColumn[T, U](expr) { type ThisType[A, B] = TypedAggregate[A, B] - def this(column: Column)(implicit uencoder: TypedEncoder[U]) = { + def this( + column: Column + )(implicit + uencoder: TypedEncoder[U] + ) = { this(FramelessInternals.expr(column)) } - override def typed[W, U1: TypedEncoder](c: Column): TypedAggregate[W, U1] = c.typedAggregate + override def typed[W, U1: TypedEncoder](c: Column): TypedAggregate[W, U1] = + c.typedAggregate override def lit[U1: TypedEncoder](c: U1): TypedAggregate[T, U1] = litAggr(c) } -/** Generic representation of a typed column. A typed column can either be a [[TypedAggregate]] or - * a [[frameless.TypedColumn]]. - * - * Documentation marked "apache/spark" is thanks to apache/spark Contributors - * at https://github.com/apache/spark, licensed under Apache v2.0 available at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * @tparam T phantom type representing the dataset on which this columns is - * selected. When `T = A with B` the selection is on either A or B. - * @tparam U type of column - */ -abstract class AbstractTypedColumn[T, U] - (val expr: Expression) - (implicit val uencoder: TypedEncoder[U]) +/** + * Generic representation of a typed column. A typed column can either be a [[TypedAggregate]] or + * a [[frameless.TypedColumn]]. + * + * Documentation marked "apache/spark" is thanks to apache/spark Contributors + * at https://github.com/apache/spark, licensed under Apache v2.0 available at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * @tparam T phantom type representing the dataset on which this columns is + * selected. When `T = A with B` the selection is on either A or B. + * @tparam U type of column + */ +abstract class AbstractTypedColumn[T, U]( + val expr: Expression + )(implicit + val uencoder: TypedEncoder[U]) extends UntypedExpression[T] { self => type ThisType[A, B] <: AbstractTypedColumn[A, B] - /** A helper class to make to simplify working with Optional fields. - * - * {{{ - * val x: TypedColumn[Option[Int]] = _ - * x.opt.map(_*2) // This only compiles if the type of x is Option[X] (in this example X is of type Int) - * }}} - * - * @note Known issue: map() will NOT work when the applied function is a udf(). - * It will compile and then throw a runtime error. - **/ + /** + * A helper class to make to simplify working with Optional fields. + * + * {{{ + * val x: TypedColumn[Option[Int]] = _ + * x.opt.map(_*2) // This only compiles if the type of x is Option[X] (in this example X is of type Int) + * }}} + * + * @note Known issue: map() will NOT work when the applied function is a udf(). + * It will compile and then throw a runtime error. + */ trait Mapper[X] { - def map[G, OutputType[_,_]](u: ThisType[T, X] => OutputType[T,G]) - (implicit - ev: OutputType[T,G] <:< AbstractTypedColumn[T, G] + + def map[G, OutputType[_, _]]( + u: ThisType[T, X] => OutputType[T, G] + )(implicit + ev: OutputType[T, G] <:< AbstractTypedColumn[T, G] ): OutputType[T, Option[G]] = { - u(self.asInstanceOf[ThisType[T, X]]).asInstanceOf[OutputType[T, Option[G]]] + u(self.asInstanceOf[ThisType[T, X]]) + .asInstanceOf[OutputType[T, Option[G]]] } } - /** Makes it easier to work with Optional columns. It returns an instance of `Mapper[X]` - * where `X` is type of the unwrapped Optional. E.g., in the case of `Option[Long]`, - * `X` is of type Long. - * - * {{{ - * val x: TypedColumn[Option[Int]] = _ - * x.opt.map(_*2) - * }}} - * */ - def opt[X](implicit x: U <:< Option[X]): Mapper[X] = new Mapper[X] {} + /** + * Makes it easier to work with Optional columns. It returns an instance of `Mapper[X]` + * where `X` is type of the unwrapped Optional. E.g., in the case of `Option[Long]`, + * `X` is of type Long. + * + * {{{ + * val x: TypedColumn[Option[Int]] = _ + * x.opt.map(_*2) + * }}} + */ + def opt[X]( + implicit + x: U <:< Option[X] + ): Mapper[X] = new Mapper[X] {} /** Fall back to an untyped Column */ - def untyped: Column = new Column(expr) + def untyped: Column = FramelessInternals.column(expr) - private def equalsTo[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = typed { + private def equalsTo[TT, W]( + other: ThisType[TT, U] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed { if (uencoder.nullable) EqualNullSafe(self.expr, other.expr) else EqualTo(self.expr, other.expr) } /** Creates a typed column of either TypedColumn or TypedAggregate from an expression. */ protected def typed[W, U1: TypedEncoder](e: Expression): ThisType[W, U1] = - typed(new Column(e)) + typed(FramelessInternals.column(e)) /** Creates a typed column of either TypedColumn or TypedAggregate. */ def typed[W, U1: TypedEncoder](c: Column): ThisType[W, U1] @@ -120,790 +155,1150 @@ abstract class AbstractTypedColumn[T, U] /** Creates a typed column of either TypedColumn or TypedAggregate. */ def lit[U1: TypedEncoder](c: U1): ThisType[T, U1] - /** Equality test. - * {{{ - * df.filter( df.col('a) === 1 ) - * }}} - * - * apache/spark - */ + /** + * Equality test. + * {{{ + * df.filter( df.col('a) === 1 ) + * }}} + * + * apache/spark + */ def ===(u: U): ThisType[T, Boolean] = equalsTo(lit(u)) - /** Equality test. - * {{{ - * df.filter( df.col('a) === df.col('b) ) - * }}} - * - * apache/spark - */ - def ===[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Equality test. + * {{{ + * df.filter( df.col('a) === df.col('b) ) + * }}} + * + * apache/spark + */ + def ===[TT, W]( + other: ThisType[TT, U] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = equalsTo(other) - /** Inequality test. - * - * {{{ - * df.filter(df.col('a) =!= df.col('b)) - * }}} - * - * apache/spark - */ - def =!=[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Inequality test. + * + * {{{ + * df.filter(df.col('a) =!= df.col('b)) + * }}} + * + * apache/spark + */ + def =!=[TT, W]( + other: ThisType[TT, U] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(Not(equalsTo(other).expr)) - /** Inequality test. - * - * {{{ - * df.filter(df.col('a) =!= "a") - * }}} - * - * apache/spark - */ + /** + * Inequality test. + * + * {{{ + * df.filter(df.col('a) =!= "a") + * }}} + * + * apache/spark + */ def =!=(u: U): ThisType[T, Boolean] = typed(Not(equalsTo(lit(u)).expr)) - /** True if the current expression is an Option and it's None. - * - * apache/spark - */ - def isNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] = + /** + * True if the current expression is an Option and it's None. + * + * apache/spark + */ + def isNone( + implicit + i0: U <:< Option[_] + ): ThisType[T, Boolean] = typed(IsNull(expr)) - /** True if the current expression is an Option and it's not None. - * - * apache/spark - */ - def isNotNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] = + /** + * True if the current expression is an Option and it's not None. + * + * apache/spark + */ + def isNotNone( + implicit + i0: U <:< Option[_] + ): ThisType[T, Boolean] = typed(IsNotNull(expr)) - /** True if the current expression is a fractional number and is not NaN. - * - * apache/spark - */ - def isNaN(implicit n: CatalystNaN[U]): ThisType[T, Boolean] = + /** + * True if the current expression is a fractional number and is not NaN. + * + * apache/spark + */ + def isNaN( + implicit + n: CatalystNaN[U] + ): ThisType[T, Boolean] = typed(self.untyped.isNaN) /** - * True if the value for this optional column `exists` as expected - * (see `Option.exists`). - * - * {{{ - * df.col('opt).isSome(_ === someOtherCol) - * }}} - */ - def isSome[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, false) + * True if the value for this optional column `exists` as expected + * (see `Option.exists`). + * + * {{{ + * df.col('opt).isSome(_ === someOtherCol) + * }}} + */ + def isSome[V]( + exists: ThisType[T, V] => ThisType[T, Boolean] + )(implicit + i0: U <:< Option[V] + ): ThisType[T, Boolean] = someOr[V](exists, false) /** - * True if the value for this optional column `exists` as expected, - * or is `None`. (see `Option.forall`). - * - * {{{ - * df.col('opt).isSomeOrNone(_ === someOtherCol) - * }}} - */ - def isSomeOrNone[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, true) - - private def someOr[V](exists: ThisType[T, V] => ThisType[T, Boolean], default: Boolean)(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = { + * True if the value for this optional column `exists` as expected, + * or is `None`. (see `Option.forall`). + * + * {{{ + * df.col('opt).isSomeOrNone(_ === someOtherCol) + * }}} + */ + def isSomeOrNone[V]( + exists: ThisType[T, V] => ThisType[T, Boolean] + )(implicit + i0: U <:< Option[V] + ): ThisType[T, Boolean] = someOr[V](exists, true) + + private def someOr[V]( + exists: ThisType[T, V] => ThisType[T, Boolean], + default: Boolean + )(implicit + i0: U <:< Option[V] + ): ThisType[T, Boolean] = { val defaultExpr = if (default) Literal.TrueLiteral else Literal.FalseLiteral typed(Coalesce(Seq(opt(i0).map(exists).expr, defaultExpr))) } - /** Convert an Optional column by providing a default value. - * - * {{{ - * df(df('opt).getOrElse(df('defaultValue))) - * }}} - */ - def getOrElse[TT, W, Out](default: ThisType[TT, Out])(implicit i0: U =:= Option[Out], i1: With.Aux[T, TT, W]): ThisType[W, Out] = + /** + * Convert an Optional column by providing a default value. + * + * {{{ + * df(df('opt).getOrElse(df('defaultValue))) + * }}} + */ + def getOrElse[TT, W, Out]( + default: ThisType[TT, Out] + )(implicit + i0: U =:= Option[Out], + i1: FWith.Aux[T, TT, W] + ): ThisType[W, Out] = typed(Coalesce(Seq(expr, default.expr)))(default.uencoder) - /** Convert an Optional column by providing a default value. - * - * {{{ - * df( df('opt).getOrElse(defaultConstant) ) - * }}} - */ - def getOrElse[Out: TypedEncoder](default: Out)(implicit i0: U =:= Option[Out]): ThisType[T, Out] = + /** + * Convert an Optional column by providing a default value. + * + * {{{ + * df( df('opt).getOrElse(defaultConstant) ) + * }}} + */ + def getOrElse[Out: TypedEncoder]( + default: Out + )(implicit + i0: U =:= Option[Out] + ): ThisType[T, Out] = getOrElse(lit[Out](default)) - /** Sum of this expression and another expression. - * - * {{{ - * // The following selects the sum of a person's height and weight. - * people.select( people.col('height) plus people.col('weight) ) - * }}} - * - * apache/spark - */ - def plus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Sum of this expression and another expression. + * + * {{{ + * // The following selects the sum of a person's height and weight. + * people.select( people.col('height) plus people.col('weight) ) + * }}} + * + * apache/spark + */ + def plus[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = typed(self.untyped.plus(other.untyped)) - /** Sum of this expression and another expression. - * {{{ - * // The following selects the sum of a person's height and weight. - * people.select( people.col('height) + people.col('weight) ) - * }}} - * - * apache/spark - */ - def +[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Sum of this expression and another expression. + * {{{ + * // The following selects the sum of a person's height and weight. + * people.select( people.col('height) + people.col('weight) ) + * }}} + * + * apache/spark + */ + def +[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = plus(other) - /** Sum of this expression (column) with a constant. - * {{{ - * // The following selects the sum of a person's height and weight. - * people.select( people('height) + 2 ) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def +(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = + /** + * Sum of this expression (column) with a constant. + * {{{ + * // The following selects the sum of a person's height and weight. + * people.select( people('height) + 2 ) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def +( + u: U + )(implicit + n: CatalystNumeric[U] + ): ThisType[T, U] = typed(self.untyped.plus(u)) /** - * Inversion of boolean expression, i.e. NOT. - * {{{ - * // Select rows that are not active (isActive === false) - * df.filter( !df('isActive) ) - * }}} - * - * apache/spark - */ - def unary_!(implicit i0: U <:< Boolean): ThisType[T, Boolean] = + * Inversion of boolean expression, i.e. NOT. + * {{{ + * // Select rows that are not active (isActive === false) + * df.filter( !df('isActive) ) + * }}} + * + * apache/spark + */ + def unary_!( + implicit + i0: U <:< Boolean + ): ThisType[T, Boolean] = typed(!untyped) - /** Unary minus, i.e. negate the expression. - * {{{ - * // Select the amount column and negates all values. - * df.select( -df('amount) ) - * }}} - * - * apache/spark - */ - def unary_-(implicit n: CatalystNumeric[U]): ThisType[T, U] = + /** + * Unary minus, i.e. negate the expression. + * {{{ + * // Select the amount column and negates all values. + * df.select( -df('amount) ) + * }}} + * + * apache/spark + */ + def unary_-( + implicit + n: CatalystNumeric[U] + ): ThisType[T, U] = typed(-self.untyped) - /** Subtraction. Subtract the other expression from this expression. - * {{{ - * // The following selects the difference between people's height and their weight. - * people.select( people.col('height) minus people.col('weight) ) - * }}} - * - * apache/spark - */ - def minus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Subtraction. Subtract the other expression from this expression. + * {{{ + * // The following selects the difference between people's height and their weight. + * people.select( people.col('height) minus people.col('weight) ) + * }}} + * + * apache/spark + */ + def minus[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = typed(self.untyped.minus(other.untyped)) - /** Subtraction. Subtract the other expression from this expression. - * {{{ - * // The following selects the difference between people's height and their weight. - * people.select( people.col('height) - people.col('weight) ) - * }}} - * - * apache/spark - */ - def -[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Subtraction. Subtract the other expression from this expression. + * {{{ + * // The following selects the difference between people's height and their weight. + * people.select( people.col('height) - people.col('weight) ) + * }}} + * + * apache/spark + */ + def -[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = minus(other) - /** Subtraction. Subtract the other expression from this expression. - * {{{ - * // The following selects the difference between people's height and their weight. - * people.select( people('height) - 1 ) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def -(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = + /** + * Subtraction. Subtract the other expression from this expression. + * {{{ + * // The following selects the difference between people's height and their weight. + * people.select( people('height) - 1 ) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def -( + u: U + )(implicit + n: CatalystNumeric[U] + ): ThisType[T, U] = typed(self.untyped.minus(u)) - /** Multiplication of this expression and another expression. - * {{{ - * // The following multiplies a person's height by their weight. - * people.select( people.col('height) multiply people.col('weight) ) - * }}} - * - * apache/spark - */ - def multiply[TT, W] - (other: ThisType[TT, U]) - (implicit + /** + * Multiplication of this expression and another expression. + * {{{ + * // The following multiplies a person's height by their weight. + * people.select( people.col('height) multiply people.col('weight) ) + * }}} + * + * apache/spark + */ + def multiply[TT, W]( + other: ThisType[TT, U] + )(implicit n: CatalystNumeric[U], - w: With.Aux[T, TT, W], + w: FWith.Aux[T, TT, W], t: ClassTag[U] ): ThisType[W, U] = typed { - if (t.runtimeClass == BigDecimal(0).getClass) { - // That's apparently the only way to get sound multiplication. - // See https://issues.apache.org/jira/browse/SPARK-22036 - val dt = DecimalType(20, 14) - self.untyped.cast(dt).multiply(other.untyped.cast(dt)) - } else { - self.untyped.multiply(other.untyped) - } + if (t.runtimeClass == BigDecimal(0).getClass) { + // That's apparently the only way to get sound multiplication. + // See https://issues.apache.org/jira/browse/SPARK-22036 + val dt = DecimalType(20, 14) + self.untyped.cast(dt).multiply(other.untyped.cast(dt)) + } else { + self.untyped.multiply(other.untyped) } + } - /** Multiplication of this expression and another expression. - * {{{ - * // The following multiplies a person's height by their weight. - * people.select( people.col('height) * people.col('weight) ) - * }}} - * - * apache/spark - */ - def *[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W], t: ClassTag[U]): ThisType[W, U] = + /** + * Multiplication of this expression and another expression. + * {{{ + * // The following multiplies a person's height by their weight. + * people.select( people.col('height) * people.col('weight) ) + * }}} + * + * apache/spark + */ + def *[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W], + t: ClassTag[U] + ): ThisType[W, U] = multiply(other) - /** Multiplication of this expression a constant. - * {{{ - * // The following multiplies a person's height by their weight. - * people.select( people.col('height) * people.col('weight) ) - * }}} - * - * apache/spark - */ - def *(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = + /** + * Multiplication of this expression a constant. + * {{{ + * // The following multiplies a person's height by their weight. + * people.select( people.col('height) * people.col('weight) ) + * }}} + * + * apache/spark + */ + def *( + u: U + )(implicit + n: CatalystNumeric[U] + ): ThisType[T, U] = typed(self.untyped.multiply(u)) - /** Modulo (a.k.a. remainder) expression. - * - * apache/spark - */ - def mod[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, Out] = + /** + * Modulo (a.k.a. remainder) expression. + * + * apache/spark + */ + def mod[Out: TypedEncoder, TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Out] = typed(self.untyped.mod(other.untyped)) - /** Modulo (a.k.a. remainder) expression. - * - * apache/spark - */ - def %[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Modulo (a.k.a. remainder) expression. + * + * apache/spark + */ + def %[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystNumeric[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = mod(other) - /** Modulo (a.k.a. remainder) expression. - * - * apache/spark - */ - def %(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = + /** + * Modulo (a.k.a. remainder) expression. + * + * apache/spark + */ + def %( + u: U + )(implicit + n: CatalystNumeric[U] + ): ThisType[T, U] = typed(self.untyped.mod(u)) - /** Division this expression by another expression. - * {{{ - * // The following divides a person's height by their weight. - * people.select( people('height) / people('weight) ) - * }}} - * - * @param other another column of the same type - * apache/spark - */ - def divide[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], w: With.Aux[T, TT, W]): ThisType[W, Out] = + /** + * Division this expression by another expression. + * {{{ + * // The following divides a person's height by their weight. + * people.select( people('height) / people('weight) ) + * }}} + * + * @param other another column of the same type + * apache/spark + */ + def divide[Out: TypedEncoder, TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystDivisible[U, Out], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Out] = typed(self.untyped.divide(other.untyped)) - /** Division this expression by another expression. - * {{{ - * // The following divides a person's height by their weight. - * people.select( people('height) / people('weight) ) - * }}} - * - * @param other another column of the same type - * apache/spark - */ - def /[Out, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], e: TypedEncoder[Out], w: With.Aux[T, TT, W]): ThisType[W, Out] = + /** + * Division this expression by another expression. + * {{{ + * // The following divides a person's height by their weight. + * people.select( people('height) / people('weight) ) + * }}} + * + * @param other another column of the same type + * apache/spark + */ + def /[Out, TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystDivisible[U, Out], + e: TypedEncoder[Out], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Out] = divide(other) - /** Division this expression by another expression. - * {{{ - * // The following divides a person's height by their weight. - * people.select( people('height) / 2 ) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def /(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, Double] = + /** + * Division this expression by another expression. + * {{{ + * // The following divides a person's height by their weight. + * people.select( people('height) / 2 ) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def /( + u: U + )(implicit + n: CatalystNumeric[U] + ): ThisType[T, Double] = typed(self.untyped.divide(u)) - /** Returns a descending ordering used in sorting - * - * apache/spark - */ - def desc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] = + /** + * Returns a descending ordering used in sorting + * + * apache/spark + */ + def desc( + implicit + catalystOrdered: CatalystOrdered[U] + ): SortedTypedColumn[T, U] = new SortedTypedColumn[T, U](untyped.desc) - /** Returns an ascending ordering used in sorting - * - * apache/spark - */ - def asc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] = + /** + * Returns an ascending ordering used in sorting + * + * apache/spark + */ + def asc( + implicit + catalystOrdered: CatalystOrdered[U] + ): SortedTypedColumn[T, U] = new SortedTypedColumn[T, U](untyped.asc) - /** Bitwise AND this expression and another expression. - * {{{ - * df.select(df.col('colA) bitwiseAND (df.col('colB))) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def bitwiseAND(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = + /** + * Bitwise AND this expression and another expression. + * {{{ + * df.select(df.col('colA) bitwiseAND (df.col('colB))) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def bitwiseAND( + u: U + )(implicit + n: CatalystBitwise[U] + ): ThisType[T, U] = typed(self.untyped.bitwiseAND(u)) - /** Bitwise AND this expression and another expression. - * {{{ - * df.select(df.col('colA) bitwiseAND (df.col('colB))) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def bitwiseAND[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Bitwise AND this expression and another expression. + * {{{ + * df.select(df.col('colA) bitwiseAND (df.col('colB))) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def bitwiseAND[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystBitwise[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = typed(self.untyped.bitwiseAND(other.untyped)) - /** Bitwise AND this expression and another expression (of same type). - * {{{ - * df.select(df.col('colA).cast[Int] & -1) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def &(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = + /** + * Bitwise AND this expression and another expression (of same type). + * {{{ + * df.select(df.col('colA).cast[Int] & -1) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def &( + u: U + )(implicit + n: CatalystBitwise[U] + ): ThisType[T, U] = bitwiseAND(u) - /** Bitwise AND this expression and another expression. - * {{{ - * df.select(df.col('colA) & (df.col('colB))) - * }}} - * - * @param other a constant of the same type - * apache/spark - */ - def &[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Bitwise AND this expression and another expression. + * {{{ + * df.select(df.col('colA) & (df.col('colB))) + * }}} + * + * @param other a constant of the same type + * apache/spark + */ + def &[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystBitwise[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = bitwiseAND(other) - /** Bitwise OR this expression and another expression. - * {{{ - * df.select(df.col('colA) bitwiseOR (df.col('colB))) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def bitwiseOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = + /** + * Bitwise OR this expression and another expression. + * {{{ + * df.select(df.col('colA) bitwiseOR (df.col('colB))) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def bitwiseOR( + u: U + )(implicit + n: CatalystBitwise[U] + ): ThisType[T, U] = typed(self.untyped.bitwiseOR(u)) - /** Bitwise OR this expression and another expression. - * {{{ - * df.select(df.col('colA) bitwiseOR (df.col('colB))) - * }}} - * - * @param other a constant of the same type - * apache/spark - */ - def bitwiseOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Bitwise OR this expression and another expression. + * {{{ + * df.select(df.col('colA) bitwiseOR (df.col('colB))) + * }}} + * + * @param other a constant of the same type + * apache/spark + */ + def bitwiseOR[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystBitwise[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = typed(self.untyped.bitwiseOR(other.untyped)) - /** Bitwise OR this expression and another expression (of same type). - * {{{ - * df.select(df.col('colA).cast[Long] | 1L) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def |(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = + /** + * Bitwise OR this expression and another expression (of same type). + * {{{ + * df.select(df.col('colA).cast[Long] | 1L) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def |( + u: U + )(implicit + n: CatalystBitwise[U] + ): ThisType[T, U] = bitwiseOR(u) - /** Bitwise OR this expression and another expression. - * {{{ - * df.select(df.col('colA) | (df.col('colB))) - * }}} - * - * @param other a constant of the same type - * apache/spark - */ - def |[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Bitwise OR this expression and another expression. + * {{{ + * df.select(df.col('colA) | (df.col('colB))) + * }}} + * + * @param other a constant of the same type + * apache/spark + */ + def |[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystBitwise[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = bitwiseOR(other) - /** Bitwise XOR this expression and another expression. - * {{{ - * df.select(df.col('colA) bitwiseXOR (df.col('colB))) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def bitwiseXOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = + /** + * Bitwise XOR this expression and another expression. + * {{{ + * df.select(df.col('colA) bitwiseXOR (df.col('colB))) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def bitwiseXOR( + u: U + )(implicit + n: CatalystBitwise[U] + ): ThisType[T, U] = typed(self.untyped.bitwiseXOR(u)) - /** Bitwise XOR this expression and another expression. - * {{{ - * df.select(df.col('colA) bitwiseXOR (df.col('colB))) - * }}} - * - * @param other a constant of the same type - * apache/spark - */ - def bitwiseXOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Bitwise XOR this expression and another expression. + * {{{ + * df.select(df.col('colA) bitwiseXOR (df.col('colB))) + * }}} + * + * @param other a constant of the same type + * apache/spark + */ + def bitwiseXOR[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystBitwise[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = typed(self.untyped.bitwiseXOR(other.untyped)) - /** Bitwise XOR this expression and another expression (of same type). - * {{{ - * df.select(df.col('colA).cast[Long] ^ 1L) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def ^(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = + /** + * Bitwise XOR this expression and another expression (of same type). + * {{{ + * df.select(df.col('colA).cast[Long] ^ 1L) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def ^( + u: U + )(implicit + n: CatalystBitwise[U] + ): ThisType[T, U] = bitwiseXOR(u) - /** Bitwise XOR this expression and another expression. - * {{{ - * df.select(df.col('colA) ^ (df.col('colB))) - * }}} - * - * @param other a constant of the same type - * apache/spark - */ - def ^[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] = + /** + * Bitwise XOR this expression and another expression. + * {{{ + * df.select(df.col('colA) ^ (df.col('colB))) + * }}} + * + * @param other a constant of the same type + * apache/spark + */ + def ^[TT, W]( + other: ThisType[TT, U] + )(implicit + n: CatalystBitwise[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, U] = bitwiseXOR(other) - /** Casts the column to a different type. - * {{{ - * df.select(df('a).cast[Int]) - * }}} - */ - def cast[A: TypedEncoder](implicit c: CatalystCast[U, A]): ThisType[T, A] = + /** + * Casts the column to a different type. + * {{{ + * df.select(df('a).cast[Int]) + * }}} + */ + def cast[A: TypedEncoder]( + implicit + c: CatalystCast[U, A] + ): ThisType[T, A] = typed(self.untyped.cast(TypedEncoder[A].catalystRepr)) /** - * An expression that returns a substring - * {{{ - * df.select(df('a).substr(0, 5)) - * }}} - * - * @param startPos starting position - * @param len length of the substring - */ - def substr(startPos: Int, len: Int)(implicit ev: U =:= String): ThisType[T, String] = + * An expression that returns a substring + * {{{ + * df.select(df('a).substr(0, 5)) + * }}} + * + * @param startPos starting position + * @param len length of the substring + */ + def substr( + startPos: Int, + len: Int + )(implicit + ev: U =:= String + ): ThisType[T, String] = typed(self.untyped.substr(startPos, len)) /** - * An expression that returns a substring - * {{{ - * df.select(df('a).substr(df('b), df('c))) - * }}} - * - * @param startPos expression for the starting position - * @param len expression for the length of the substring - */ - def substr[TT1, TT2, W1, W2](startPos: ThisType[TT1, Int], len: ThisType[TT2, Int]) - (implicit - ev: U =:= String, - w1: With.Aux[T, TT1, W1], - w2: With.Aux[W1, TT2, W2]): ThisType[W2, String] = + * An expression that returns a substring + * {{{ + * df.select(df('a).substr(df('b), df('c))) + * }}} + * + * @param startPos expression for the starting position + * @param len expression for the length of the substring + */ + def substr[TT1, TT2, W1, W2]( + startPos: ThisType[TT1, Int], + len: ThisType[TT2, Int] + )(implicit + ev: U =:= String, + w1: FWith.Aux[T, TT1, W1], + w2: FWith.Aux[W1, TT2, W2] + ): ThisType[W2, String] = typed(self.untyped.substr(startPos.untyped, len.untyped)) - /** SQL like expression. Returns a boolean column based on a SQL LIKE match. - * {{{ - * val ds = TypedDataset.create(X2("foo", "bar") :: Nil) - * // true - * ds.select(ds('a).like("foo")) - * - * // Selected column has value "bar" - * ds.select(when(ds('a).like("f"), ds('a)).otherwise(ds('b)) - * }}} - * apache/spark - */ - def like(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] = + /** + * SQL like expression. Returns a boolean column based on a SQL LIKE match. + * {{{ + * val ds = TypedDataset.create(X2("foo", "bar") :: Nil) + * // true + * ds.select(ds('a).like("foo")) + * + * // Selected column has value "bar" + * ds.select(when(ds('a).like("f"), ds('a)).otherwise(ds('b)) + * }}} + * apache/spark + */ + def like( + literal: String + )(implicit + ev: U =:= String + ): ThisType[T, Boolean] = typed(self.untyped.like(literal)) - /** SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex match. - * {{{ - * val ds = TypedDataset.create(X1("foo") :: Nil) - * // true - * ds.select(ds('a).rlike("foo")) - * - * // true - * ds.select(ds('a).rlike(".*)) - * }}} - * apache/spark - */ - def rlike(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] = + /** + * SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex match. + * {{{ + * val ds = TypedDataset.create(X1("foo") :: Nil) + * // true + * ds.select(ds('a).rlike("foo")) + * + * // true + * ds.select(ds('a).rlike(".*)) + * }}} + * apache/spark + */ + def rlike( + literal: String + )(implicit + ev: U =:= String + ): ThisType[T, Boolean] = typed(self.untyped.rlike(literal)) - /** String contains another string literal. - * {{{ - * df.filter ( df.col('a).contains("foo") ) - * }}} - * - * @param other a string that is being tested against. - * apache/spark - */ - def contains(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] = + /** + * String contains another string literal. + * {{{ + * df.filter ( df.col('a).contains("foo") ) + * }}} + * + * @param other a string that is being tested against. + * apache/spark + */ + def contains( + other: String + )(implicit + ev: U =:= String + ): ThisType[T, Boolean] = typed(self.untyped.contains(other)) - /** String contains. - * {{{ - * df.filter ( df.col('a).contains(df.col('b) ) - * }}} - * - * @param other a column which values is used as a string that is being tested against. - * apache/spark - */ - def contains[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * String contains. + * {{{ + * df.filter ( df.col('a).contains(df.col('b) ) + * }}} + * + * @param other a column which values is used as a string that is being tested against. + * apache/spark + */ + def contains[TT, W]( + other: ThisType[TT, U] + )(implicit + ev: U =:= String, + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped.contains(other.untyped)) - /** String starts with another string literal. - * {{{ - * df.filter ( df.col('a).startsWith("foo") - * }}} - * - * @param other a prefix that is being tested against. - * apache/spark - */ - def startsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] = + /** + * String starts with another string literal. + * {{{ + * df.filter ( df.col('a).startsWith("foo") + * }}} + * + * @param other a prefix that is being tested against. + * apache/spark + */ + def startsWith( + other: String + )(implicit + ev: U =:= String + ): ThisType[T, Boolean] = typed(self.untyped.startsWith(other)) - /** String starts with. - * {{{ - * df.filter ( df.col('a).startsWith(df.col('b)) - * }}} - * - * @param other a column which values is used as a prefix that is being tested against. - * apache/spark - */ - def startsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * String starts with. + * {{{ + * df.filter ( df.col('a).startsWith(df.col('b)) + * }}} + * + * @param other a column which values is used as a prefix that is being tested against. + * apache/spark + */ + def startsWith[TT, W]( + other: ThisType[TT, U] + )(implicit + ev: U =:= String, + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped.startsWith(other.untyped)) - /** String ends with another string literal. - * {{{ - * df.filter ( df.col('a).endsWith("foo") - * }}} - * - * @param other a suffix that is being tested against. - * apache/spark - */ - def endsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] = + /** + * String ends with another string literal. + * {{{ + * df.filter ( df.col('a).endsWith("foo") + * }}} + * + * @param other a suffix that is being tested against. + * apache/spark + */ + def endsWith( + other: String + )(implicit + ev: U =:= String + ): ThisType[T, Boolean] = typed(self.untyped.endsWith(other)) - /** String ends with. - * {{{ - * df.filter ( df.col('a).endsWith(df.col('b)) - * }}} - * - * @param other a column which values is used as a suffix that is being tested against. - * apache/spark - */ - def endsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * String ends with. + * {{{ + * df.filter ( df.col('a).endsWith(df.col('b)) + * }}} + * + * @param other a column which values is used as a suffix that is being tested against. + * apache/spark + */ + def endsWith[TT, W]( + other: ThisType[TT, U] + )(implicit + ev: U =:= String, + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped.endsWith(other.untyped)) - /** Boolean AND. - * {{{ - * df.filter ( (df.col('a) === 1).and(df.col('b) > 5) ) - * }}} - */ - def and[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Boolean AND. + * {{{ + * df.filter ( (df.col('a) === 1).and(df.col('b) > 5) ) + * }}} + */ + def and[TT, W]( + other: ThisType[TT, Boolean] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped.and(other.untyped)) - /** Boolean AND. - * {{{ - * df.filter ( df.col('a) === 1 && df.col('b) > 5) - * }}} - */ - def && [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Boolean AND. + * {{{ + * df.filter ( df.col('a) === 1 && df.col('b) > 5) + * }}} + */ + def &&[TT, W]( + other: ThisType[TT, Boolean] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = and(other) - /** Boolean OR. - * {{{ - * df.filter ( (df.col('a) === 1).or(df.col('b) > 5) ) - * }}} - */ - def or[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Boolean OR. + * {{{ + * df.filter ( (df.col('a) === 1).or(df.col('b) > 5) ) + * }}} + */ + def or[TT, W]( + other: ThisType[TT, Boolean] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped.or(other.untyped)) - /** Boolean OR. - * {{{ - * df.filter ( df.col('a) === 1 || df.col('b) > 5) - * }}} - */ - def || [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Boolean OR. + * {{{ + * df.filter ( df.col('a) === 1 || df.col('b) > 5) + * }}} + */ + def ||[TT, W]( + other: ThisType[TT, Boolean] + )(implicit + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = or(other) - /** Less than. - * - * {{{ - * // The following selects people younger than the maxAge column. - * df.select(df('age) < df('maxAge) ) - * }}} - * - * @param other another column of the same type - * apache/spark - */ - def <[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Less than. + * + * {{{ + * // The following selects people younger than the maxAge column. + * df.select(df('age) < df('maxAge) ) + * }}} + * + * @param other another column of the same type + * apache/spark + */ + def <[TT, W]( + other: ThisType[TT, U] + )(implicit + i0: CatalystOrdered[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped < other.untyped) - /** Less than or equal to. - * - * {{{ - * // The following selects people younger or equal than the maxAge column. - * df.select(df('age) <= df('maxAge) - * }}} - * - * @param other another column of the same type - * apache/spark - */ - def <=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Less than or equal to. + * + * {{{ + * // The following selects people younger or equal than the maxAge column. + * df.select(df('age) <= df('maxAge) + * }}} + * + * @param other another column of the same type + * apache/spark + */ + def <=[TT, W]( + other: ThisType[TT, U] + )(implicit + i0: CatalystOrdered[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped <= other.untyped) - /** Greater than. - * {{{ - * // The following selects people older than the maxAge column. - * df.select( df('age) > df('maxAge) ) - * }}} - * - * @param other another column of the same type - * apache/spark - */ - def >[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Greater than. + * {{{ + * // The following selects people older than the maxAge column. + * df.select( df('age) > df('maxAge) ) + * }}} + * + * @param other another column of the same type + * apache/spark + */ + def >[TT, W]( + other: ThisType[TT, U] + )(implicit + i0: CatalystOrdered[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped > other.untyped) - /** Greater than or equal. - * {{{ - * // The following selects people older or equal than the maxAge column. - * df.select( df('age) >= df('maxAge) ) - * }}} - * - * @param other another column of the same type - * apache/spark - */ - def >=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] = + /** + * Greater than or equal. + * {{{ + * // The following selects people older or equal than the maxAge column. + * df.select( df('age) >= df('maxAge) ) + * }}} + * + * @param other another column of the same type + * apache/spark + */ + def >=[TT, W]( + other: ThisType[TT, U] + )(implicit + i0: CatalystOrdered[U], + w: FWith.Aux[T, TT, W] + ): ThisType[W, Boolean] = typed(self.untyped >= other.untyped) - /** Less than. - * {{{ - * // The following selects people younger than 21. - * df.select( df('age) < 21 ) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def <(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] = + /** + * Less than. + * {{{ + * // The following selects people younger than 21. + * df.select( df('age) < 21 ) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def <( + u: U + )(implicit + i0: CatalystOrdered[U] + ): ThisType[T, Boolean] = typed(self.untyped < lit(u)(self.uencoder).untyped) - /** Less than or equal to. - * {{{ - * // The following selects people younger than 22. - * df.select( df('age) <= 2 ) - * }}} - * - * @param u a constant of the same type - * apache/spark - */ - def <=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] = + /** + * Less than or equal to. + * {{{ + * // The following selects people younger than 22. + * df.select( df('age) <= 2 ) + * }}} + * + * @param u a constant of the same type + * apache/spark + */ + def <=( + u: U + )(implicit + i0: CatalystOrdered[U] + ): ThisType[T, Boolean] = typed(self.untyped <= lit(u)(self.uencoder).untyped) - /** Greater than. - * {{{ - * // The following selects people older than 21. - * df.select( df('age) > 21 ) - * }}} - * - * @param u another column of the same type - * apache/spark - */ - def >(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] = + /** + * Greater than. + * {{{ + * // The following selects people older than 21. + * df.select( df('age) > 21 ) + * }}} + * + * @param u another column of the same type + * apache/spark + */ + def >( + u: U + )(implicit + i0: CatalystOrdered[U] + ): ThisType[T, Boolean] = typed(self.untyped > lit(u)(self.uencoder).untyped) - /** Greater than or equal. - * {{{ - * // The following selects people older than 20. - * df.select( df('age) >= 21 ) - * }}} - * - * @param u another column of the same type - * apache/spark - */ - def >=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] = + /** + * Greater than or equal. + * {{{ + * // The following selects people older than 20. + * df.select( df('age) >= 21 ) + * }}} + * + * @param u another column of the same type + * apache/spark + */ + def >=( + u: U + )(implicit + i0: CatalystOrdered[U] + ): ThisType[T, Boolean] = typed(self.untyped >= lit(u)(self.uencoder).untyped) /** - * Returns true if the value of this column is contained in of the arguments. - * {{{ - * // The following selects people with age 15, 20, or 30. - * df.select( df('age).isin(15, 20, 30) ) - * }}} - * - * @param values are constants of the same type - * apache/spark - */ - def isin(values: U*)(implicit e: CatalystIsin[U]): ThisType[T, Boolean] = - typed(self.untyped.isin(values:_*)) - - /** - * True if the current column is between the lower bound and upper bound, inclusive. - * - * @param lowerBound a constant of the same type - * @param upperBound a constant of the same type - * apache/spark - */ - def between(lowerBound: U, upperBound: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] = - typed(self.untyped.between(lit(lowerBound)(self.uencoder).untyped, lit(upperBound)(self.uencoder).untyped)) - - /** - * True if the current column is between the lower bound and upper bound, inclusive. - * - * @param lowerBound another column of the same type - * @param upperBound another column of the same type - * apache/spark - */ - def between[TT1, TT2, W1, W2](lowerBound: ThisType[TT1, U], upperBound: ThisType[TT2, U]) - (implicit + * Returns true if the value of this column is contained in of the arguments. + * {{{ + * // The following selects people with age 15, 20, or 30. + * df.select( df('age).isin(15, 20, 30) ) + * }}} + * + * @param values are constants of the same type + * apache/spark + */ + def isin( + values: U* + )(implicit + e: CatalystIsin[U] + ): ThisType[T, Boolean] = + typed(self.untyped.isin(values: _*)) + + /** + * True if the current column is between the lower bound and upper bound, inclusive. + * + * @param lowerBound a constant of the same type + * @param upperBound a constant of the same type + * apache/spark + */ + def between( + lowerBound: U, + upperBound: U + )(implicit + i0: CatalystOrdered[U] + ): ThisType[T, Boolean] = + typed( + self.untyped.between( + lit(lowerBound)(self.uencoder).untyped, + lit(upperBound)(self.uencoder).untyped + ) + ) + + /** + * True if the current column is between the lower bound and upper bound, inclusive. + * + * @param lowerBound another column of the same type + * @param upperBound another column of the same type + * apache/spark + */ + def between[TT1, TT2, W1, W2]( + lowerBound: ThisType[TT1, U], + upperBound: ThisType[TT2, U] + )(implicit i0: CatalystOrdered[U], - w0: With.Aux[T, TT1, W1], - w1: With.Aux[TT2, W1, W2] + w0: FWith.Aux[T, TT1, W1], + w1: FWith.Aux[TT2, W1, W2] ): ThisType[W2, Boolean] = - typed(self.untyped.between(lowerBound.untyped, upperBound.untyped)) + typed(self.untyped.between(lowerBound.untyped, upperBound.untyped)) /** - * Returns a nested column matching the field `symbol`. - * - * @param symbol the field symbol - * @tparam V the type of the nested field - */ - def field[V](symbol: Witness.Lt[Symbol])(implicit + * Returns a nested column matching the field `symbol`. + * + * @param symbol the field symbol + * @tparam V the type of the nested field + */ + def field[V]( + symbol: Witness.Lt[Symbol] + )(implicit i0: TypedColumn.Exists[U, symbol.T, V], i1: TypedEncoder[V] - ): ThisType[T, V] = + ): ThisType[T, V] = typed(self.untyped.getField(symbol.value.name)) } - -sealed class SortedTypedColumn[T, U](val expr: Expression)( - implicit - val uencoder: TypedEncoder[U] -) extends UntypedExpression[T] { - - def this(column: Column)(implicit e: TypedEncoder[U]) = { +sealed class SortedTypedColumn[T, U]( + val expr: Expression + )(implicit + val uencoder: TypedEncoder[U]) + extends UntypedExpression[T] { + + def this( + column: Column + )(implicit + e: TypedEncoder[U] + ) = { this(FramelessInternals.expr(column)) } - def untyped: Column = new Column(expr) + def untyped: Column = FramelessInternals.column(expr) } object SortedTypedColumn { - implicit def defaultAscending[T, U : CatalystOrdered](typedColumn: TypedColumn[T, U]): SortedTypedColumn[T, U] = + + implicit def defaultAscending[T, U: CatalystOrdered]( + typedColumn: TypedColumn[T, U] + ): SortedTypedColumn[T, U] = new SortedTypedColumn[T, U](typedColumn.untyped.asc)(typedColumn.uencoder) - object defaultAscendingPoly extends Poly1 { - implicit def caseTypedColumn[T, U : CatalystOrdered] = at[TypedColumn[T, U]](c => defaultAscending(c)) - implicit def caseTypeSortedColumn[T, U] = at[SortedTypedColumn[T, U]](identity) - } + object defaultAscendingPoly extends Poly1 { + + implicit def caseTypedColumn[T, U: CatalystOrdered] = + at[TypedColumn[T, U]](c => defaultAscending(c)) + + implicit def caseTypeSortedColumn[T, U] = + at[SortedTypedColumn[T, U]](identity) + } } object TypedColumn { + /** Evidence that type `T` has column `K` with type `V`. */ @implicitNotFound(msg = "No column ${K} of type ${V} in ${T}") trait Exists[T, K, V] @@ -912,37 +1307,46 @@ object TypedColumn { trait ExistsMany[T, K <: HList, V] object ExistsMany { - implicit def deriveCons[T, KH, KT <: HList, V0, V1] - (implicit + + implicit def deriveCons[T, KH, KT <: HList, V0, V1]( + implicit head: Exists[T, KH, V0], tail: ExistsMany[V0, KT, V1] ): ExistsMany[T, KH :: KT, V1] = - new ExistsMany[T, KH :: KT, V1] {} + new ExistsMany[T, KH :: KT, V1] {} - implicit def deriveHNil[T, K, V](implicit head: Exists[T, K, V]): ExistsMany[T, K :: HNil, V] = + implicit def deriveHNil[T, K, V]( + implicit + head: Exists[T, K, V] + ): ExistsMany[T, K :: HNil, V] = new ExistsMany[T, K :: HNil, V] {} } object Exists { - def apply[T, V](column: Witness)(implicit e: Exists[T, column.T, V]): Exists[T, column.T, V] = e - implicit def deriveRecord[T, H <: HList, K, V] - (implicit + def apply[T, V]( + column: Witness + )(implicit + e: Exists[T, column.T, V] + ): Exists[T, column.T, V] = e + + implicit def deriveRecord[T, H <: HList, K, V]( + implicit i0: LabelledGeneric.Aux[T, H], i1: Selector.Aux[H, K, V] ): Exists[T, K, V] = new Exists[T, K, V] {} } /** - * {{{ - * import frameless.TypedColumn - * - * case class Foo(id: Int, bar: String) - * - * val colbar: TypedColumn[Foo, String] = TypedColumn { foo: Foo => foo.bar } - * val colid = TypedColumn[Foo, Int](_.id) - * }}} - */ + * {{{ + * import frameless.TypedColumn + * + * case class Foo(id: Int, bar: String) + * + * val colbar: TypedColumn[Foo, String] = TypedColumn { foo: Foo => foo.bar } + * val colid = TypedColumn[Foo, Int](_.id) + * }}} + */ def apply[T, U](x: T => U): TypedColumn[T, U] = macro TypedColumnMacroImpl.applyImpl[T, U] diff --git a/dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala b/dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala index 62fa2765d..23502ef3b 100644 --- a/dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala +++ b/dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala @@ -4,7 +4,10 @@ import scala.reflect.macros.whitebox private[frameless] object TypedColumnMacroImpl { - def applyImpl[T: c.WeakTypeTag, U: c.WeakTypeTag](c: whitebox.Context)(x: c.Tree): c.Expr[TypedColumn[T, U]] = { + def applyImpl[T: c.WeakTypeTag, U: c.WeakTypeTag]( + c: whitebox.Context + )(x: c.Tree + ): c.Expr[TypedColumn[T, U]] = { import c.universe._ val t = c.weakTypeOf[T] @@ -13,7 +16,9 @@ private[frameless] object TypedColumnMacroImpl { def buildExpression(path: List[String]): c.Expr[TypedColumn[T, U]] = { val columnName = path.mkString(".") - c.Expr[TypedColumn[T, U]](q"new _root_.frameless.TypedColumn[$t, $u]((org.apache.spark.sql.functions.col($columnName)).expr)") + c.Expr[TypedColumn[T, U]]( + q"new _root_.frameless.TypedColumn[$t, $u](_root_.org.apache.spark.sql.FramelessInternals.expr(org.apache.spark.sql.functions.col($columnName)))" + ) } def abort(msg: String) = c.abort(c.enclosingPosition, msg) @@ -48,34 +53,39 @@ private[frameless] object TypedColumnMacroImpl { } x match { - case fn: Function => fn.body match { - case select: Select if select.name.isTermName => - val expectedRoot: Option[String] = fn.vparams match { - case List(rt) if rt.rhs == EmptyTree => - Option.empty[String] - - case List(rt) => - Some(rt.toString) + case fn: Function => + fn.body match { + case select: Select if select.name.isTermName => + val expectedRoot: Option[String] = fn.vparams match { + case List(rt) if rt.rhs == EmptyTree => + Option.empty[String] + + case List(rt) => + Some(rt.toString) + + case u => + abort( + s"Select expression must have a single parameter: ${u mkString ", "}" + ) + } - case u => - abort(s"Select expression must have a single parameter: ${u mkString ", "}") - } + path(select, List.empty) match { + case root :: tail + if (expectedRoot.forall(_ == root) && check(t, tail)) => { + val colPath = tail.mkString(".") - path(select, List.empty) match { - case root :: tail if ( - expectedRoot.forall(_ == root) && check(t, tail)) => { - val colPath = tail.mkString(".") + c.Expr[TypedColumn[T, U]]( + q"new _root_.frameless.TypedColumn[$t, $u](_root_.org.apache.spark.sql.FramelessInternals.expr(org.apache.spark.sql.functions.col($colPath)))" + ) + } - c.Expr[TypedColumn[T, U]](q"new _root_.frameless.TypedColumn[$t, $u]((org.apache.spark.sql.functions.col($colPath)).expr)") + case _ => + abort(s"Invalid select expression: $select") } - case _ => - abort(s"Invalid select expression: $select") - } - - case t => - abort(s"Select expression expected: $t") - } + case t => + abort(s"Select expression expected: $t") + } case _ => abort(s"Function expected: $x") diff --git a/dataset/src/main/scala/frameless/TypedDataset.scala b/dataset/src/main/scala/frameless/TypedDataset.scala index add2170b2..82a016a3a 100644 --- a/dataset/src/main/scala/frameless/TypedDataset.scala +++ b/dataset/src/main/scala/frameless/TypedDataset.scala @@ -4,36 +4,58 @@ import java.util import frameless.functions.CatalystExplodableCollection import frameless.ops._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Column, DataFrame, Dataset, FramelessInternals, SparkSession} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint} +import org.apache.spark.sql.{ + Column, + DataFrame, + Dataset, + FramelessInternals, + SparkSession +} +import org.apache.spark.sql.catalyst.expressions.{ + Attribute, + AttributeReference, + Literal +} +import org.apache.spark.sql.catalyst.plans.logical.{ Join, JoinHint } import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.types.StructType import shapeless._ import shapeless.labelled.FieldType -import shapeless.ops.hlist.{Diff, IsHCons, Mapper, Prepend, ToTraversable, Tupler} -import shapeless.ops.record.{Keys, Modifier, Remover, Values} +import shapeless.ops.hlist.{ + Diff, + IsHCons, + Mapper, + Prepend, + ToTraversable, + Tupler +} +import shapeless.ops.record.{ Keys, Modifier, Remover, Values } import scala.language.experimental.macros -/** [[TypedDataset]] is a safer interface for working with `Dataset`. - * - * NOTE: Prefer `TypedDataset.create` over `new TypedDataset` unless you - * know what you are doing. - * - * Documentation marked "apache/spark" is thanks to apache/spark Contributors - * at https://github.com/apache/spark, licensed under Apache v2.0 available at - * http://www.apache.org/licenses/LICENSE-2.0 - */ -class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val encoder: TypedEncoder[T]) +/** + * [[TypedDataset]] is a safer interface for working with `Dataset`. + * + * NOTE: Prefer `TypedDataset.create` over `new TypedDataset` unless you + * know what you are doing. + * + * Documentation marked "apache/spark" is thanks to apache/spark Contributors + * at https://github.com/apache/spark, licensed under Apache v2.0 available at + * http://www.apache.org/licenses/LICENSE-2.0 + */ +class TypedDataset[T] protected[frameless] ( + val dataset: Dataset[T] + )(implicit + val encoder: TypedEncoder[T]) extends TypedDatasetForwarded[T] { self => private implicit val spark: SparkSession = dataset.sparkSession - /** Aggregates on the entire Dataset without groups. - * - * apache/spark - */ + /** + * Aggregates on the entire Dataset without groups. + * + * apache/spark + */ def agg[A](ca: TypedAggregate[T, A]): TypedDataset[A] = { implicit val ea = ca.uencoder val tuple1: TypedDataset[Tuple1[A]] = aggMany(ca) @@ -42,10 +64,8 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val TypedEncoder[A].catalystRepr match { case StructType(_) => // if column is struct, we use all its fields - val df = tuple1 - .dataset - .selectExpr("_1.*") - .as[A](TypedExpressionEncoder[A]) + val df = + tuple1.dataset.selectExpr("_1.*").as[A](TypedExpressionEncoder[A]) TypedDataset.create(df) case other => @@ -54,52 +74,59 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val } } - /** Aggregates on the entire Dataset without groups. - * - * apache/spark - */ + /** + * Aggregates on the entire Dataset without groups. + * + * apache/spark + */ def agg[A, B]( - ca: TypedAggregate[T, A], - cb: TypedAggregate[T, B] - ): TypedDataset[(A, B)] = { + ca: TypedAggregate[T, A], + cb: TypedAggregate[T, B] + ): TypedDataset[(A, B)] = { implicit val (ea, eb) = (ca.uencoder, cb.uencoder) aggMany(ca, cb) } - /** Aggregates on the entire Dataset without groups. - * - * apache/spark - */ + /** + * Aggregates on the entire Dataset without groups. + * + * apache/spark + */ def agg[A, B, C]( - ca: TypedAggregate[T, A], - cb: TypedAggregate[T, B], - cc: TypedAggregate[T, C] - ): TypedDataset[(A, B, C)] = { + ca: TypedAggregate[T, A], + cb: TypedAggregate[T, B], + cc: TypedAggregate[T, C] + ): TypedDataset[(A, B, C)] = { implicit val (ea, eb, ec) = (ca.uencoder, cb.uencoder, cc.uencoder) aggMany(ca, cb, cc) } - /** Aggregates on the entire Dataset without groups. - * - * apache/spark - */ + /** + * Aggregates on the entire Dataset without groups. + * + * apache/spark + */ def agg[A, B, C, D]( - ca: TypedAggregate[T, A], - cb: TypedAggregate[T, B], - cc: TypedAggregate[T, C], - cd: TypedAggregate[T, D] - ): TypedDataset[(A, B, C, D)] = { - implicit val (ea, eb, ec, ed) = (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder) + ca: TypedAggregate[T, A], + cb: TypedAggregate[T, B], + cc: TypedAggregate[T, C], + cd: TypedAggregate[T, D] + ): TypedDataset[(A, B, C, D)] = { + implicit val (ea, eb, ec, ed) = + (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder) aggMany(ca, cb, cc, cd) } - /** Aggregates on the entire Dataset without groups. - * - * apache/spark - */ + /** + * Aggregates on the entire Dataset without groups. + * + * apache/spark + */ object aggMany extends ProductArgs { - def applyProduct[U <: HList, Out0 <: HList, Out](columns: U) - (implicit + + def applyProduct[U <: HList, Out0 <: HList, Out]( + columns: U + )(implicit i0: AggregateTypes.Aux[T, U, Out0], i1: ToTraversable.Aux[U, List, UntypedExpression[T]], i2: Tupler.Aux[Out0, Out], @@ -109,7 +136,7 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val val underlyingColumns = columns.toList[UntypedExpression[T]] val cols: Seq[Column] = for { (c, i) <- columns.toList[UntypedExpression[T]].zipWithIndex - } yield new Column(c.expr).as(s"_${i+1}") + } yield FramelessInternals.column(c.expr).as(s"_${i + 1}") // Workaround to SPARK-20346. One alternative is to allow the result to be Vector(null) for empty DataFrames. // Another one would be to return an Option. @@ -117,129 +144,163 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val for { (c, i) <- underlyingColumns.zipWithIndex if !c.uencoder.nullable - } yield s"_${i+1} is not null" - ).mkString(" or ") + } yield s"_${i + 1} is not null" + ).mkString(" or ") - val selected = dataset.toDF().agg(cols.head, cols.tail:_*).as[Out](TypedExpressionEncoder[Out]) - TypedDataset.create[Out](if (filterStr.isEmpty) selected else selected.filter(filterStr)) + val selected = dataset + .toDF() + .agg(cols.head, cols.tail: _*) + .as[Out](TypedExpressionEncoder[Out]) + TypedDataset.create[Out]( + if (filterStr.isEmpty) selected else selected.filter(filterStr) + ) } } /** Returns a new [[TypedDataset]] where each record has been mapped on to the specified type. */ - def as[U]()(implicit as: As[T, U]): TypedDataset[U] = { + def as[U]( + )(implicit + as: As[T, U] + ): TypedDataset[U] = { implicit val uencoder = as.encoder TypedDataset.create(dataset.as[U](TypedExpressionEncoder[U])) } - /** Returns a checkpointed version of this [[TypedDataset]]. Checkpointing can be used to truncate the - * logical plan of this Dataset, which is especially useful in iterative algorithms where the - * plan may grow exponentially. It will be saved to files inside the checkpoint - * directory set with `SparkContext#setCheckpointDir`. - * - * Differs from `Dataset#checkpoint` by wrapping its result into an effect-suspending `F[_]`. - * - * apache/spark - */ - def checkpoint[F[_]](eager: Boolean)(implicit F: SparkDelay[F]): F[TypedDataset[T]] = + /** + * Returns a checkpointed version of this [[TypedDataset]]. Checkpointing can be used to truncate the + * logical plan of this Dataset, which is especially useful in iterative algorithms where the + * plan may grow exponentially. It will be saved to files inside the checkpoint + * directory set with `SparkContext#setCheckpointDir`. + * + * Differs from `Dataset#checkpoint` by wrapping its result into an effect-suspending `F[_]`. + * + * apache/spark + */ + def checkpoint[F[_]]( + eager: Boolean + )(implicit + F: SparkDelay[F] + ): F[TypedDataset[T]] = F.delay(TypedDataset.create[T](dataset.checkpoint(eager))) - /** Returns a new [[TypedDataset]] where each record has been mapped on to the specified type. - * Unlike `as` the projection U may include a subset of the columns of T and the column names and types must agree. - * - * {{{ - * case class Foo(i: Int, j: String) - * case class Bar(j: String) - * - * val t: TypedDataset[Foo] = ... - * val b: TypedDataset[Bar] = t.project[Bar] - * - * case class BarErr(e: String) - * // The following does not compile because `Foo` doesn't have a field with name `e` - * val e: TypedDataset[BarErr] = t.project[BarErr] - * }}} - */ - def project[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = projector.apply(this) - - /** Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]] - * combined. - * - * Note that, this function is not a typical set union operation, in that it does not eliminate - * duplicate items. As such, it is analogous to `UNION ALL` in SQL. - * - * Differs from `Dataset#union` by aligning fields if possible. - * It will not compile if `Datasets` have not compatible schema. - * - * Example: - * {{{ - * case class Foo(x: Int, y: Long) - * case class Bar(y: Long, x: Int) - * case class Faz(x: Int, y: Int, z: Int) - * - * foo: TypedDataset[Foo] = ... - * bar: TypedDataset[Bar] = ... - * faz: TypedDataset[Faz] = ... - * - * foo union bar: TypedDataset[Foo] - * foo union faz: TypedDataset[Foo] - * // won't compile, you need to reverse order, you can't project from less fields to more - * faz union foo - * - * }}} - * - * apache/spark - */ - def union[U: TypedEncoder](other: TypedDataset[U])(implicit projector: SmartProject[U, T]): TypedDataset[T] = + /** + * Returns a new [[TypedDataset]] where each record has been mapped on to the specified type. + * Unlike `as` the projection U may include a subset of the columns of T and the column names and types must agree. + * + * {{{ + * case class Foo(i: Int, j: String) + * case class Bar(j: String) + * + * val t: TypedDataset[Foo] = ... + * val b: TypedDataset[Bar] = t.project[Bar] + * + * case class BarErr(e: String) + * // The following does not compile because `Foo` doesn't have a field with name `e` + * val e: TypedDataset[BarErr] = t.project[BarErr] + * }}} + */ + def project[U]( + implicit + projector: SmartProject[T, U] + ): TypedDataset[U] = projector.apply(this) + + /** + * Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]] + * combined. + * + * Note that, this function is not a typical set union operation, in that it does not eliminate + * duplicate items. As such, it is analogous to `UNION ALL` in SQL. + * + * Differs from `Dataset#union` by aligning fields if possible. + * It will not compile if `Datasets` have not compatible schema. + * + * Example: + * {{{ + * case class Foo(x: Int, y: Long) + * case class Bar(y: Long, x: Int) + * case class Faz(x: Int, y: Int, z: Int) + * + * foo: TypedDataset[Foo] = ... + * bar: TypedDataset[Bar] = ... + * faz: TypedDataset[Faz] = ... + * + * foo union bar: TypedDataset[Foo] + * foo union faz: TypedDataset[Foo] + * // won't compile, you need to reverse order, you can't project from less fields to more + * faz union foo + * + * }}} + * + * apache/spark + */ + def union[U: TypedEncoder]( + other: TypedDataset[U] + )(implicit + projector: SmartProject[U, T] + ): TypedDataset[T] = TypedDataset.create(dataset.union(other.project[T].dataset)) - /** Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]] - * combined. - * - * Note that, this function is not a typical set union operation, in that it does not eliminate - * duplicate items. As such, it is analogous to `UNION ALL` in SQL. - * - * apache/spark - */ + /** + * Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]] + * combined. + * + * Note that, this function is not a typical set union operation, in that it does not eliminate + * duplicate items. As such, it is analogous to `UNION ALL` in SQL. + * + * apache/spark + */ def union(other: TypedDataset[T]): TypedDataset[T] = { TypedDataset.create(dataset.union(other.dataset)) } - /** Returns the number of elements in the [[TypedDataset]]. - * - * Differs from `Dataset#count` by wrapping its result into an effect-suspending `F[_]`. - */ - def count[F[_]]()(implicit F: SparkDelay[F]): F[Long] = + /** + * Returns the number of elements in the [[TypedDataset]]. + * + * Differs from `Dataset#count` by wrapping its result into an effect-suspending `F[_]`. + */ + def count[F[_]]( + )(implicit + F: SparkDelay[F] + ): F[Long] = F.delay(dataset.count()) - /** Returns `TypedColumn` of type `A` given its name (alias for `col`). - * - * {{{ - * tf('id) - * }}} - * - * It is statically checked that column with such name exists and has type `A`. - */ - def apply[A](column: Witness.Lt[Symbol]) - (implicit + /** + * Returns `TypedColumn` of type `A` given its name (alias for `col`). + * + * {{{ + * tf('id) + * }}} + * + * It is statically checked that column with such name exists and has type `A`. + */ + def apply[A]( + column: Witness.Lt[Symbol] + )(implicit i0: TypedColumn.Exists[T, column.T, A], i1: TypedEncoder[A] ): TypedColumn[T, A] = col(column) - /** Returns `TypedColumn` of type `A` given its name. - * - * {{{ - * tf.col('id) - * }}} - * - * It is statically checked that column with such name exists and has type `A`. - */ - def col[A](column: Witness.Lt[Symbol]) - (implicit + /** + * Returns `TypedColumn` of type `A` given its name. + * + * {{{ + * tf.col('id) + * }}} + * + * It is statically checked that column with such name exists and has type `A`. + */ + def col[A]( + column: Witness.Lt[Symbol] + )(implicit i0: TypedColumn.Exists[T, column.T, A], i1: TypedEncoder[A] ): TypedColumn[T, A] = - new TypedColumn[T, A](dataset(column.value.name).as[A](TypedExpressionEncoder[A])) + new TypedColumn[T, A]( + dataset(column.value.name).as[A](TypedExpressionEncoder[A]) + ) - /** Returns `TypedColumn` of type `A` given a lambda indicating the field. + /** + * Returns `TypedColumn` of type `A` given a lambda indicating the field. * * {{{ * td.col(_.id) @@ -250,12 +311,13 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val def col[A](x: Function1[T, A]): TypedColumn[T, A] = macro TypedColumnMacroImpl.applyImpl[T, A] - /** Projects the entire `TypedDataset[T]` into a single column of type `TypedColumn[T,T]`. - * {{{ - * ts: TypedDataset[Foo] = ... - * ts.select(ts.asCol, ts.asCol): TypedDataset[(Foo,Foo)] - * }}} - */ + /** + * Projects the entire `TypedDataset[T]` into a single column of type `TypedColumn[T,T]`. + * {{{ + * ts: TypedDataset[Foo] = ... + * ts.select(ts.asCol, ts.asCol): TypedDataset[(Foo,Foo)] + * }}} + */ def asCol: TypedColumn[T, T] = { val projectedColumn: Column = encoder.catalystRepr match { case StructType(_) => @@ -265,78 +327,98 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val case _ => dataset.col(dataset.columns.head) } - - new TypedColumn[T,T](projectedColumn) + + new TypedColumn[T, T](projectedColumn) } - /** References the entire `TypedDataset[T]` as a single column - * of type `TypedColumn[T,T]` so it can be used in a join operation. - * - * {{{ - * def nameJoin(ds1: TypedDataset[Person], ds2: TypedDataset[Name]) = - * ds1.joinLeftSemi(ds2)(ds1.col('name) === ds2.asJoinColValue) - * }}} - */ - def asJoinColValue(implicit i0: IsValueClass[T]): TypedColumn[T, T] = { + /** + * References the entire `TypedDataset[T]` as a single column + * of type `TypedColumn[T,T]` so it can be used in a join operation. + * + * {{{ + * def nameJoin(ds1: TypedDataset[Person], ds2: TypedDataset[Name]) = + * ds1.joinLeftSemi(ds2)(ds1.col('name) === ds2.asJoinColValue) + * }}} + */ + def asJoinColValue( + implicit + i0: IsValueClass[T] + ): TypedColumn[T, T] = { import _root_.frameless.syntax._ dataset.col("value").typedColumn } object colMany extends SingletonProductArgs { - def applyProduct[U <: HList, Out](columns: U) - (implicit + + def applyProduct[U <: HList, Out]( + columns: U + )(implicit i0: TypedColumn.ExistsMany[T, U, Out], i1: TypedEncoder[Out], i2: ToTraversable.Aux[U, List, Symbol] ): TypedColumn[T, Out] = { - val names = columns.toList[Symbol].map(_.name) - val colExpr = FramelessInternals.resolveExpr(dataset, names) - new TypedColumn[T, Out](colExpr) - } + val names = columns.toList[Symbol].map(_.name) + val colExpr = FramelessInternals.resolveExpr(dataset, names) + new TypedColumn[T, Out](colExpr) + } } - /** Right hand side disambiguation of `col` for join expressions. - * To be used when writting self-joins, noop in other circumstances. - * - * Note: In vanilla Spark, disambiguation in self-joins is acheaved using - * String based aliases, which is obviously unsafe. - */ - def colRight[A](column: Witness.Lt[Symbol]) - (implicit + /** + * Right hand side disambiguation of `col` for join expressions. + * To be used when writting self-joins, noop in other circumstances. + * + * Note: In vanilla Spark, disambiguation in self-joins is acheaved using + * String based aliases, which is obviously unsafe. + */ + def colRight[A]( + column: Witness.Lt[Symbol] + )(implicit i0: TypedColumn.Exists[T, column.T, A], i1: TypedEncoder[A] ): TypedColumn[T, A] = - new TypedColumn[T, A](FramelessInternals.DisambiguateRight(col(column).expr)) - - /** Left hand side disambiguation of `col` for join expressions. - * To be used when writting self-joins, noop in other circumstances. - * - * Note: In vanilla Spark, disambiguation in self-joins is acheaved using - * String based aliases, which is obviously unsafe. - */ - def colLeft[A](column: Witness.Lt[Symbol]) - (implicit + new TypedColumn[T, A]( + FramelessInternals.DisambiguateRight(col(column).expr) + ) + + /** + * Left hand side disambiguation of `col` for join expressions. + * To be used when writting self-joins, noop in other circumstances. + * + * Note: In vanilla Spark, disambiguation in self-joins is acheaved using + * String based aliases, which is obviously unsafe. + */ + def colLeft[A]( + column: Witness.Lt[Symbol] + )(implicit i0: TypedColumn.Exists[T, column.T, A], i1: TypedEncoder[A] ): TypedColumn[T, A] = - new TypedColumn[T, A](FramelessInternals.DisambiguateLeft(col(column).expr)) - - /** Returns a `Seq` that contains all the elements in this [[TypedDataset]]. - * - * Running this operation requires moving all the data into the application's driver process, and - * doing so on a very large [[TypedDataset]] can crash the driver process with OutOfMemoryError. - * - * Differs from `Dataset#collect` by wrapping its result into an effect-suspending `F[_]`. - */ - def collect[F[_]]()(implicit F: SparkDelay[F]): F[Seq[T]] = + new TypedColumn[T, A](FramelessInternals.DisambiguateLeft(col(column).expr)) + + /** + * Returns a `Seq` that contains all the elements in this [[TypedDataset]]. + * + * Running this operation requires moving all the data into the application's driver process, and + * doing so on a very large [[TypedDataset]] can crash the driver process with OutOfMemoryError. + * + * Differs from `Dataset#collect` by wrapping its result into an effect-suspending `F[_]`. + */ + def collect[F[_]]( + )(implicit + F: SparkDelay[F] + ): F[Seq[T]] = F.delay(dataset.collect().toSeq) - /** Optionally returns the first element in this [[TypedDataset]]. - * - * Differs from `Dataset#first` by wrapping its result into an `Option` and an effect-suspending `F[_]`. - */ - def firstOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] = + /** + * Optionally returns the first element in this [[TypedDataset]]. + * + * Differs from `Dataset#first` by wrapping its result into an `Option` and an effect-suspending `F[_]`. + */ + def firstOption[F[_]]( + )(implicit + F: SparkDelay[F] + ): F[Option[T]] = F.delay { try { Option(dataset.first()) @@ -345,354 +427,468 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val } } - /** Returns the first `num` elements of this [[TypedDataset]] as a `Seq`. - * - * Running take requires moving data into the application's driver process, and doing so with - * a very large `num` can crash the driver process with OutOfMemoryError. - * - * Differs from `Dataset#take` by wrapping its result into an effect-suspending `F[_]`. - * - * apache/spark - */ - def take[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] = + /** + * Returns the first `num` elements of this [[TypedDataset]] as a `Seq`. + * + * Running take requires moving data into the application's driver process, and doing so with + * a very large `num` can crash the driver process with OutOfMemoryError. + * + * Differs from `Dataset#take` by wrapping its result into an effect-suspending `F[_]`. + * + * apache/spark + */ + def take[F[_]]( + num: Int + )(implicit + F: SparkDelay[F] + ): F[Seq[T]] = F.delay(dataset.take(num).toSeq) - /** Return an iterator that contains all rows in this [[TypedDataset]]. - * - * The iterator will consume as much memory as the largest partition in this [[TypedDataset]]. - * - * NOTE: this results in multiple Spark jobs, and if the input [[TypedDataset]] is the result - * of a wide transformation (e.g. join with different partitioners), to avoid - * recomputing the input [[TypedDataset]] should be cached first. - * - * Differs from `Dataset#toLocalIterator()` by wrapping its result into an effect-suspending `F[_]`. - * - * apache/spark - */ - def toLocalIterator[F[_]]()(implicit F: SparkDelay[F]): F[util.Iterator[T]] = + /** + * Return an iterator that contains all rows in this [[TypedDataset]]. + * + * The iterator will consume as much memory as the largest partition in this [[TypedDataset]]. + * + * NOTE: this results in multiple Spark jobs, and if the input [[TypedDataset]] is the result + * of a wide transformation (e.g. join with different partitioners), to avoid + * recomputing the input [[TypedDataset]] should be cached first. + * + * Differs from `Dataset#toLocalIterator()` by wrapping its result into an effect-suspending `F[_]`. + * + * apache/spark + */ + def toLocalIterator[F[_]]( + )(implicit + F: SparkDelay[F] + ): F[util.Iterator[T]] = F.delay(dataset.toLocalIterator()) - /** Alias for firstOption(). - */ - def headOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] = firstOption() + /** + * Alias for firstOption(). + */ + def headOption[F[_]]( + )(implicit + F: SparkDelay[F] + ): F[Option[T]] = firstOption() - /** Alias for take(). - */ - def head[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] = take(num) + /** + * Alias for take(). + */ + def head[F[_]]( + num: Int + )(implicit + F: SparkDelay[F] + ): F[Seq[T]] = take(num) // $COVERAGE-OFF$ - /** Alias for firstOption(). - */ - @deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0") + /** + * Alias for firstOption(). + */ + @deprecated( + "Method may throw exception. Use headOption or firstOption instead.", + "0.5.0" + ) def head: T = dataset.head() - /** Alias for firstOption(). - */ - @deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0") + /** + * Alias for firstOption(). + */ + @deprecated( + "Method may throw exception. Use headOption or firstOption instead.", + "0.5.0" + ) def first: T = dataset.head() // $COVERAGE-ONN$ - /** Displays the content of this [[TypedDataset]] in a tabular form. Strings more than 20 characters - * will be truncated, and all cells will be aligned right. For example: - * {{{ - * year month AVG('Adj Close) MAX('Adj Close) - * 1980 12 0.503218 0.595103 - * 1981 01 0.523289 0.570307 - * 1982 02 0.436504 0.475256 - * 1983 03 0.410516 0.442194 - * 1984 04 0.450090 0.483521 - * }}} - * @param numRows Number of rows to show - * @param truncate Whether truncate long strings. If true, strings more than 20 characters will - * be truncated and all cells will be aligned right - * - * Differs from `Dataset#show` by wrapping its result into an effect-suspending `F[_]`. - * - * apache/spark - */ - def show[F[_]](numRows: Int = 20, truncate: Boolean = true)(implicit F: SparkDelay[F]): F[Unit] = + /** + * Displays the content of this [[TypedDataset]] in a tabular form. Strings more than 20 characters + * will be truncated, and all cells will be aligned right. For example: + * {{{ + * year month AVG('Adj Close) MAX('Adj Close) + * 1980 12 0.503218 0.595103 + * 1981 01 0.523289 0.570307 + * 1982 02 0.436504 0.475256 + * 1983 03 0.410516 0.442194 + * 1984 04 0.450090 0.483521 + * }}} + * @param numRows Number of rows to show + * @param truncate Whether truncate long strings. If true, strings more than 20 characters will + * be truncated and all cells will be aligned right + * + * Differs from `Dataset#show` by wrapping its result into an effect-suspending `F[_]`. + * + * apache/spark + */ + def show[F[_]]( + numRows: Int = 20, + truncate: Boolean = true + )(implicit + F: SparkDelay[F] + ): F[Unit] = F.delay(dataset.show(numRows, truncate)) - /** Returns a new [[frameless.TypedDataset]] that only contains elements where `column` is `true`. - * - * Differs from `TypedDatasetForward#filter` by taking a `TypedColumn[T, Boolean]` instead of a - * `T => Boolean`. Using a column expression instead of a regular function save one Spark → Scala - * deserialization which leads to better performance. - */ + /** + * Returns a new [[frameless.TypedDataset]] that only contains elements where `column` is `true`. + * + * Differs from `TypedDatasetForward#filter` by taking a `TypedColumn[T, Boolean]` instead of a + * `T => Boolean`. Using a column expression instead of a regular function save one Spark → Scala + * deserialization which leads to better performance. + */ def filter(column: TypedColumn[T, Boolean]): TypedDataset[T] = { - val filtered = dataset.toDF() - .filter(column.untyped) - .as[T](TypedExpressionEncoder[T]) + val filtered = + dataset.toDF().filter(column.untyped).as[T](TypedExpressionEncoder[T]) TypedDataset.create[T](filtered) } - /** Runs `func` on each element of this [[TypedDataset]]. - * - * Differs from `Dataset#foreach` by wrapping its result into an effect-suspending `F[_]`. - */ - def foreach[F[_]](func: T => Unit)(implicit F: SparkDelay[F]): F[Unit] = + /** + * Runs `func` on each element of this [[TypedDataset]]. + * + * Differs from `Dataset#foreach` by wrapping its result into an effect-suspending `F[_]`. + */ + def foreach[F[_]]( + func: T => Unit + )(implicit + F: SparkDelay[F] + ): F[Unit] = F.delay(dataset.foreach(func)) - /** Runs `func` on each partition of this [[TypedDataset]]. - * - * Differs from `Dataset#foreachPartition` by wrapping its result into an effect-suspending `F[_]`. - */ - def foreachPartition[F[_]](func: Iterator[T] => Unit)(implicit F: SparkDelay[F]): F[Unit] = + /** + * Runs `func` on each partition of this [[TypedDataset]]. + * + * Differs from `Dataset#foreachPartition` by wrapping its result into an effect-suspending `F[_]`. + */ + def foreachPartition[F[_]]( + func: Iterator[T] => Unit + )(implicit + F: SparkDelay[F] + ): F[Unit] = F.delay(dataset.foreachPartition(func)) /** - * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified column, - * so we can run aggregation on it. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`. - * - * apache/spark - */ + * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified column, + * so we can run aggregation on it. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`. + * + * apache/spark + */ def cube[K1]( - c1: TypedColumn[T, K1] - ): Cube1Ops[K1, T] = new Cube1Ops[K1, T](this, c1) - - /** - * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns, - * so we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`. - * - * apache/spark - */ + c1: TypedColumn[T, K1] + ): Cube1Ops[K1, T] = new Cube1Ops[K1, T](this, c1) + + /** + * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns, + * so we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`. + * + * apache/spark + */ def cube[K1, K2]( - c1: TypedColumn[T, K1], - c2: TypedColumn[T, K2] - ): Cube2Ops[K1, K2, T] = new Cube2Ops[K1, K2, T](this, c1, c2) - - /** - * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns, - * so we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * {{{ - * case class MyClass(a: Int, b: Int, c: Int) - * val ds: TypedDataset[MyClass] - - * val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] = - * ds.cubeMany(ds('a), ds('b)).agg(count[MyClass]()) - * - * // original dataset: - * a b c - * 10 20 1 - * 15 25 2 - * - * // after aggregation: - * _1 _2 _3 - * 15 null 1 - * 15 25 1 - * null null 2 - * null 25 1 - * null 20 1 - * 10 null 1 - * 10 20 1 - * - * }}} - * - * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`. - * - * apache/spark - */ + c1: TypedColumn[T, K1], + c2: TypedColumn[T, K2] + ): Cube2Ops[K1, K2, T] = new Cube2Ops[K1, K2, T](this, c1, c2) + + /** + * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns, + * so we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * {{{ + * case class MyClass(a: Int, b: Int, c: Int) + * val ds: TypedDataset[MyClass] + * + * val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] = + * ds.cubeMany(ds('a), ds('b)).agg(count[MyClass]()) + * + * // original dataset: + * a b c + * 10 20 1 + * 15 25 2 + * + * // after aggregation: + * _1 _2 _3 + * 15 null 1 + * 15 25 1 + * null null 2 + * null 25 1 + * null 20 1 + * 10 null 1 + * 10 20 1 + * + * }}} + * + * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`. + * + * apache/spark + */ object cubeMany extends ProductArgs { - def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK) - (implicit + + def applyProduct[TK <: HList, K <: HList, KT]( + groupedBy: TK + )(implicit i0: ColumnTypes.Aux[T, TK, K], i1: Tupler.Aux[K, KT], i2: ToTraversable.Aux[TK, List, UntypedExpression[T]] - ): CubeManyOps[T, TK, K, KT] = new CubeManyOps[T, TK, K, KT](self, groupedBy) + ): CubeManyOps[T, TK, K, KT] = + new CubeManyOps[T, TK, K, KT](self, groupedBy) } /** - * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * apache/spark - */ + * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * apache/spark + */ def groupBy[K1]( - c1: TypedColumn[T, K1] - ): GroupedBy1Ops[K1, T] = new GroupedBy1Ops[K1, T](this, c1) + c1: TypedColumn[T, K1] + ): GroupedBy1Ops[K1, T] = new GroupedBy1Ops[K1, T](this, c1) /** - * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * apache/spark - */ + * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * apache/spark + */ def groupBy[K1, K2]( - c1: TypedColumn[T, K1], - c2: TypedColumn[T, K2] - ): GroupedBy2Ops[K1, K2, T] = new GroupedBy2Ops[K1, K2, T](this, c1, c2) - - /** - * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * {{{ - * case class MyClass(a: Int, b: Int, c: Int) - * val ds: TypedDataset[MyClass] - * - * val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] = - * ds.groupByMany(ds('a), ds('b)).agg(count[MyClass]()) - * - * // original dataset: - * a b c - * 10 20 1 - * 15 25 2 - * - * // after aggregation: - * _1 _2 _3 - * 10 20 1 - * 15 25 1 - * - * }}} - * - * apache/spark - */ + c1: TypedColumn[T, K1], + c2: TypedColumn[T, K2] + ): GroupedBy2Ops[K1, K2, T] = new GroupedBy2Ops[K1, K2, T](this, c1, c2) + + /** + * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * {{{ + * case class MyClass(a: Int, b: Int, c: Int) + * val ds: TypedDataset[MyClass] + * + * val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] = + * ds.groupByMany(ds('a), ds('b)).agg(count[MyClass]()) + * + * // original dataset: + * a b c + * 10 20 1 + * 15 25 2 + * + * // after aggregation: + * _1 _2 _3 + * 10 20 1 + * 15 25 1 + * + * }}} + * + * apache/spark + */ object groupByMany extends ProductArgs { - def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK) - (implicit + + def applyProduct[TK <: HList, K <: HList, KT]( + groupedBy: TK + )(implicit i0: ColumnTypes.Aux[T, TK, K], i1: Tupler.Aux[K, KT], i2: ToTraversable.Aux[TK, List, UntypedExpression[T]] - ): GroupedByManyOps[T, TK, K, KT] = new GroupedByManyOps[T, TK, K, KT](self, groupedBy) + ): GroupedByManyOps[T, TK, K, KT] = + new GroupedByManyOps[T, TK, K, KT](self, groupedBy) } /** - * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified column, - * so we can run aggregation on it. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`. - * - * apache/spark - */ + * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified column, + * so we can run aggregation on it. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`. + * + * apache/spark + */ def rollup[K1]( - c1: TypedColumn[T, K1] - ): Rollup1Ops[K1, T] = new Rollup1Ops[K1, T](this, c1) - - /** - * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns, - * so we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`. - * - * apache/spark - */ + c1: TypedColumn[T, K1] + ): Rollup1Ops[K1, T] = new Rollup1Ops[K1, T](this, c1) + + /** + * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns, + * so we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`. + * + * apache/spark + */ def rollup[K1, K2]( - c1: TypedColumn[T, K1], - c2: TypedColumn[T, K2] - ): Rollup2Ops[K1, K2, T] = new Rollup2Ops[K1, K2, T](this, c1, c2) - - /** - * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns, - * so we can run aggregation on them. - * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. - * - * {{{ - * case class MyClass(a: Int, b: Int, c: Int) - * val ds: TypedDataset[MyClass] - * - * val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] = - * ds.rollupMany(ds('a), ds('b)).agg(count[MyClass]()) - * - * // original dataset: - * a b c - * 10 20 1 - * 15 25 2 - * - * // after aggregation: - * _1 _2 _3 - * 15 null 1 - * 15 25 1 - * null null 2 - * 10 null 1 - * 10 20 1 - * - * }}} - * - * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`. - * - * apache/spark - */ + c1: TypedColumn[T, K1], + c2: TypedColumn[T, K2] + ): Rollup2Ops[K1, K2, T] = new Rollup2Ops[K1, K2, T](this, c1, c2) + + /** + * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns, + * so we can run aggregation on them. + * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions. + * + * {{{ + * case class MyClass(a: Int, b: Int, c: Int) + * val ds: TypedDataset[MyClass] + * + * val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] = + * ds.rollupMany(ds('a), ds('b)).agg(count[MyClass]()) + * + * // original dataset: + * a b c + * 10 20 1 + * 15 25 2 + * + * // after aggregation: + * _1 _2 _3 + * 15 null 1 + * 15 25 1 + * null null 2 + * 10 null 1 + * 10 20 1 + * + * }}} + * + * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`. + * + * apache/spark + */ object rollupMany extends ProductArgs { - def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK) - (implicit + + def applyProduct[TK <: HList, K <: HList, KT]( + groupedBy: TK + )(implicit i0: ColumnTypes.Aux[T, TK, K], i1: Tupler.Aux[K, KT], i2: ToTraversable.Aux[TK, List, UntypedExpression[T]] - ): RollupManyOps[T, TK, K, KT] = new RollupManyOps[T, TK, K, KT](self, groupedBy) + ): RollupManyOps[T, TK, K, KT] = + new RollupManyOps[T, TK, K, KT](self, groupedBy) } /** Computes the cartesian project of `this` `Dataset` with the `other` `Dataset` */ - def joinCross[U](other: TypedDataset[U]) - (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] = - new TypedDataset(self.dataset.joinWith(other.dataset, new Column(Literal(true)), "cross")) - - /** Computes the full outer join of `this` `Dataset` with the `other` `Dataset`, - * returning a `Tuple2` for each pair where condition evaluates to true. - */ - def joinFull[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]) - (implicit e: TypedEncoder[(Option[T], Option[U])]): TypedDataset[(Option[T], Option[U])] = - new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "full") - .as[(Option[T], Option[U])](TypedExpressionEncoder[(Option[T], Option[U])])) - - /** Computes the inner join of `this` `Dataset` with the `other` `Dataset`, - * returning a `Tuple2` for each pair where condition evaluates to true. - */ - def joinInner[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]) - (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] = { - import FramelessInternals._ - - val leftPlan = logicalPlan(dataset) - val rightPlan = logicalPlan(other.dataset) - val join = disambiguate(Join(leftPlan, rightPlan, Inner, Some(condition.expr), JoinHint.NONE)) - val joinedPlan = joinPlan(dataset, join, leftPlan, rightPlan) - val joinedDs = mkDataset(dataset.sqlContext, joinedPlan, TypedExpressionEncoder[(T, U)]) - - TypedDataset.create[(T, U)](joinedDs) - } + def joinCross[U]( + other: TypedDataset[U] + )(implicit + e: TypedEncoder[(T, U)] + ): TypedDataset[(T, U)] = + new TypedDataset( + self.dataset + .joinWith( + other.dataset, + FramelessInternals.column(Literal(true)), + "cross" + ) + .as[(T, U)](TypedExpressionEncoder[(T, U)]) + ) - /** Computes the left outer join of `this` `Dataset` with the `other` `Dataset`, - * returning a `Tuple2` for each pair where condition evaluates to true. - */ - def joinLeft[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]) - (implicit e: TypedEncoder[(T, Option[U])]): TypedDataset[(T, Option[U])] = - new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "left_outer") - .as[(T, Option[U])](TypedExpressionEncoder[(T, Option[U])])) - - /** Computes the left semi join of `this` `Dataset` with the `other` `Dataset`, - * returning a `Tuple2` for each pair where condition evaluates to true. - */ - def joinLeftSemi[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] = - new TypedDataset(self.dataset.join(other.dataset, condition.untyped, "leftsemi") - .as[T](TypedExpressionEncoder(encoder))) - - /** Computes the left anti join of `this` `Dataset` with the `other` `Dataset`, - * returning a `Tuple2` for each pair where condition evaluates to true. - */ - def joinLeftAnti[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] = - new TypedDataset(self.dataset.join(other.dataset, condition.untyped, "leftanti") - .as[T](TypedExpressionEncoder(encoder))) - - /** Computes the right outer join of `this` `Dataset` with the `other` `Dataset`, - * returning a `Tuple2` for each pair where condition evaluates to true. - */ - def joinRight[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]) - (implicit e: TypedEncoder[(Option[T], U)]): TypedDataset[(Option[T], U)] = - new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "right_outer") - .as[(Option[T], U)](TypedExpressionEncoder[(Option[T], U)])) + /** + * Computes the full outer join of `this` `Dataset` with the `other` `Dataset`, + * returning a `Tuple2` for each pair where condition evaluates to true. + */ + def joinFull[U]( + other: TypedDataset[U] + )(condition: TypedColumn[T with U, Boolean] + )(implicit + e: TypedEncoder[(Option[T], Option[U])] + ): TypedDataset[(Option[T], Option[U])] = + new TypedDataset( + self.dataset + .joinWith(other.dataset, condition.untyped, "full") + .as[(Option[T], Option[U])]( + TypedExpressionEncoder[(Option[T], Option[U])] + ) + ) + + /** + * Computes the inner join of `this` `Dataset` with the `other` `Dataset`, + * returning a `Tuple2` for each pair where condition evaluates to true. + */ + def joinInner[U]( + other: TypedDataset[U] + )(condition: TypedColumn[T with U, Boolean] + )(implicit + e: TypedEncoder[(T, U)] + ): TypedDataset[(T, U)] = { + import FramelessInternals._ + + val leftPlan = logicalPlan(dataset) + val rightPlan = logicalPlan(other.dataset) + val join = disambiguate( + Join(leftPlan, rightPlan, Inner, Some(condition.expr), JoinHint.NONE) + ) + val joinedPlan = joinPlan(dataset, join, leftPlan, rightPlan) + val joinedDs = + mkDataset(dataset, joinedPlan, TypedExpressionEncoder[(T, U)]) + + TypedDataset.create[(T, U)](joinedDs) + } + + /** + * Computes the left outer join of `this` `Dataset` with the `other` `Dataset`, + * returning a `Tuple2` for each pair where condition evaluates to true. + */ + def joinLeft[U]( + other: TypedDataset[U] + )(condition: TypedColumn[T with U, Boolean] + )(implicit + e: TypedEncoder[(T, Option[U])] + ): TypedDataset[(T, Option[U])] = + new TypedDataset( + self.dataset + .joinWith(other.dataset, condition.untyped, "left_outer") + .as[(T, Option[U])](TypedExpressionEncoder[(T, Option[U])]) + ) + + /** + * Computes the left semi join of `this` `Dataset` with the `other` `Dataset`, + * returning a `Tuple2` for each pair where condition evaluates to true. + */ + def joinLeftSemi[U]( + other: TypedDataset[U] + )(condition: TypedColumn[T with U, Boolean] + ): TypedDataset[T] = + new TypedDataset( + self.dataset + .join(other.dataset, condition.untyped, "leftsemi") + .as[T](TypedExpressionEncoder(encoder)) + ) + + /** + * Computes the left anti join of `this` `Dataset` with the `other` `Dataset`, + * returning a `Tuple2` for each pair where condition evaluates to true. + */ + def joinLeftAnti[U]( + other: TypedDataset[U] + )(condition: TypedColumn[T with U, Boolean] + ): TypedDataset[T] = + new TypedDataset( + self.dataset + .join(other.dataset, condition.untyped, "leftanti") + .as[T](TypedExpressionEncoder(encoder)) + ) + + /** + * Computes the right outer join of `this` `Dataset` with the `other` `Dataset`, + * returning a `Tuple2` for each pair where condition evaluates to true. + */ + def joinRight[U]( + other: TypedDataset[U] + )(condition: TypedColumn[T with U, Boolean] + )(implicit + e: TypedEncoder[(Option[T], U)] + ): TypedDataset[(Option[T], U)] = + new TypedDataset( + self.dataset + .joinWith(other.dataset, condition.untyped, "right_outer") + .as[(Option[T], U)](TypedExpressionEncoder[(Option[T], U)]) + ) private def disambiguate(join: Join): Join = { - val plan = FramelessInternals.ofRows(dataset.sparkSession, join).queryExecution.analyzed.asInstanceOf[Join] + val plan = FramelessInternals + .ofRows(dataset.sparkSession, join) + .queryExecution + .analyzed + .asInstanceOf[Join] val disambiguated = plan.condition.map(_.transform { case FramelessInternals.DisambiguateLeft(tagged: AttributeReference) => val leftDs = FramelessInternals.ofRows(spark, plan.left) @@ -707,43 +903,82 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val plan.copy(condition = disambiguated) } - /** Takes a function from A => R and converts it to a UDF for TypedColumn[T, A] => TypedColumn[T, R]. - */ - def makeUDF[A: TypedEncoder, R: TypedEncoder](f: A => R): - TypedColumn[T, A] => TypedColumn[T, R] = functions.udf(f) - - /** Takes a function from (A1, A2) => R and converts it to a UDF for - * (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R]. - */ - def makeUDF[A1: TypedEncoder, A2: TypedEncoder, R: TypedEncoder](f: (A1, A2) => R): - (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] = functions.udf(f) - - /** Takes a function from (A1, A2, A3) => R and converts it to a UDF for - * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R]. - */ - def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3) => R): - (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] = functions.udf(f) - - /** Takes a function from (A1, A2, A3, A4) => R and converts it to a UDF for - * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R]. - */ - def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4) => R): - (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R] = functions.udf(f) - - /** Takes a function from (A1, A2, A3, A4, A5) => R and converts it to a UDF for - * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R]. - */ - def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, A5: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4, A5) => R): - (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R] = functions.udf(f) - - /** Type-safe projection from type T to Tuple1[A] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Takes a function from A => R and converts it to a UDF for TypedColumn[T, A] => TypedColumn[T, R]. + */ + def makeUDF[A: TypedEncoder, R: TypedEncoder]( + f: A => R + ): TypedColumn[T, A] => TypedColumn[T, R] = functions.udf(f) + + /** + * Takes a function from (A1, A2) => R and converts it to a UDF for + * (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R]. + */ + def makeUDF[A1: TypedEncoder, A2: TypedEncoder, R: TypedEncoder]( + f: (A1, A2) => R + ): (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] = + functions.udf(f) + + /** + * Takes a function from (A1, A2, A3) => R and converts it to a UDF for + * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R]. + */ + def makeUDF[ + A1: TypedEncoder, + A2: TypedEncoder, + A3: TypedEncoder, + R: TypedEncoder + ](f: (A1, A2, A3) => R + ): (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] = + functions.udf(f) + + /** + * Takes a function from (A1, A2, A3, A4) => R and converts it to a UDF for + * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R]. + */ + def makeUDF[ + A1: TypedEncoder, + A2: TypedEncoder, + A3: TypedEncoder, + A4: TypedEncoder, + R: TypedEncoder + ](f: (A1, A2, A3, A4) => R + ): ( + TypedColumn[T, A1], + TypedColumn[T, A2], + TypedColumn[T, A3], + TypedColumn[T, A4] + ) => TypedColumn[T, R] = functions.udf(f) + + /** + * Takes a function from (A1, A2, A3, A4, A5) => R and converts it to a UDF for + * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R]. + */ + def makeUDF[ + A1: TypedEncoder, + A2: TypedEncoder, + A3: TypedEncoder, + A4: TypedEncoder, + A5: TypedEncoder, + R: TypedEncoder + ](f: (A1, A2, A3, A4, A5) => R + ): ( + TypedColumn[T, A1], + TypedColumn[T, A2], + TypedColumn[T, A3], + TypedColumn[T, A4], + TypedColumn[T, A5] + ) => TypedColumn[T, R] = functions.udf(f) + + /** + * Type-safe projection from type T to Tuple1[A] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A]( - ca: TypedColumn[T, A] - ): TypedDataset[A] = { + ca: TypedColumn[T, A] + ): TypedDataset[A] = { implicit val ea = ca.uencoder val tuple1: TypedDataset[Tuple1[A]] = selectMany(ca) @@ -753,10 +988,8 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val TypedEncoder[A].catalystRepr match { case StructType(_) => // if column is struct, we use all its fields - val df = tuple1 - .dataset - .selectExpr("_1.*") - .as[A](TypedExpressionEncoder[A]) + val df = + tuple1.dataset.selectExpr("_1.*").as[A](TypedExpressionEncoder[A]) TypedDataset.create(df) case other => @@ -765,217 +998,290 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val } } - /** Type-safe projection from type T to Tuple2[A,B] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple2[A,B] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B] - ): TypedDataset[(A, B)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B] + ): TypedDataset[(A, B)] = { implicit val (ea, eb) = (ca.uencoder, cb.uencoder) selectMany(ca, cb) } - /** Type-safe projection from type T to Tuple3[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple3[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C] - ): TypedDataset[(A, B, C)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C] + ): TypedDataset[(A, B, C)] = { implicit val (ea, eb, ec) = (ca.uencoder, cb.uencoder, cc.uencoder) selectMany(ca, cb, cc) } - /** Type-safe projection from type T to Tuple4[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple4[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D] - ): TypedDataset[(A, B, C, D)] = { - implicit val (ea, eb, ec, ed) = (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder) + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D] + ): TypedDataset[(A, B, C, D)] = { + implicit val (ea, eb, ec, ed) = + (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder) selectMany(ca, cb, cc, cd) } - /** Type-safe projection from type T to Tuple5[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple5[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D, E]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D], - ce: TypedColumn[T, E] - ): TypedDataset[(A, B, C, D, E)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D], + ce: TypedColumn[T, E] + ): TypedDataset[(A, B, C, D, E)] = { implicit val (ea, eb, ec, ed, ee) = (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder) selectMany(ca, cb, cc, cd, ce) } - /** Type-safe projection from type T to Tuple6[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple6[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D, E, F]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D], - ce: TypedColumn[T, E], - cf: TypedColumn[T, F] - ): TypedDataset[(A, B, C, D, E, F)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D], + ce: TypedColumn[T, E], + cf: TypedColumn[T, F] + ): TypedDataset[(A, B, C, D, E, F)] = { implicit val (ea, eb, ec, ed, ee, ef) = - (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder) + ( + ca.uencoder, + cb.uencoder, + cc.uencoder, + cd.uencoder, + ce.uencoder, + cf.uencoder + ) selectMany(ca, cb, cc, cd, ce, cf) } - /** Type-safe projection from type T to Tuple7[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple7[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D, E, F, G]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D], - ce: TypedColumn[T, E], - cf: TypedColumn[T, F], - cg: TypedColumn[T, G] - ): TypedDataset[(A, B, C, D, E, F, G)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D], + ce: TypedColumn[T, E], + cf: TypedColumn[T, F], + cg: TypedColumn[T, G] + ): TypedDataset[(A, B, C, D, E, F, G)] = { implicit val (ea, eb, ec, ed, ee, ef, eg) = - (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder) + ( + ca.uencoder, + cb.uencoder, + cc.uencoder, + cd.uencoder, + ce.uencoder, + cf.uencoder, + cg.uencoder + ) selectMany(ca, cb, cc, cd, ce, cf, cg) } - /** Type-safe projection from type T to Tuple8[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple8[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D, E, F, G, H]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D], - ce: TypedColumn[T, E], - cf: TypedColumn[T, F], - cg: TypedColumn[T, G], - ch: TypedColumn[T, H] - ): TypedDataset[(A, B, C, D, E, F, G, H)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D], + ce: TypedColumn[T, E], + cf: TypedColumn[T, F], + cg: TypedColumn[T, G], + ch: TypedColumn[T, H] + ): TypedDataset[(A, B, C, D, E, F, G, H)] = { implicit val (ea, eb, ec, ed, ee, ef, eg, eh) = - (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder) + ( + ca.uencoder, + cb.uencoder, + cc.uencoder, + cd.uencoder, + ce.uencoder, + cf.uencoder, + cg.uencoder, + ch.uencoder + ) selectMany(ca, cb, cc, cd, ce, cf, cg, ch) } - /** Type-safe projection from type T to Tuple9[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple9[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D, E, F, G, H, I]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D], - ce: TypedColumn[T, E], - cf: TypedColumn[T, F], - cg: TypedColumn[T, G], - ch: TypedColumn[T, H], - ci: TypedColumn[T, I] - ): TypedDataset[(A, B, C, D, E, F, G, H, I)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D], + ce: TypedColumn[T, E], + cf: TypedColumn[T, F], + cg: TypedColumn[T, G], + ch: TypedColumn[T, H], + ci: TypedColumn[T, I] + ): TypedDataset[(A, B, C, D, E, F, G, H, I)] = { implicit val (ea, eb, ec, ed, ee, ef, eg, eh, ei) = - (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder, ci.uencoder) + ( + ca.uencoder, + cb.uencoder, + cc.uencoder, + cd.uencoder, + ce.uencoder, + cf.uencoder, + cg.uencoder, + ch.uencoder, + ci.uencoder + ) selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci) } - /** Type-safe projection from type T to Tuple10[A,B,...] - * {{{ - * d.select( d('a), d('a)+d('b), ... ) - * }}} - */ + /** + * Type-safe projection from type T to Tuple10[A,B,...] + * {{{ + * d.select( d('a), d('a)+d('b), ... ) + * }}} + */ def select[A, B, C, D, E, F, G, H, I, J]( - ca: TypedColumn[T, A], - cb: TypedColumn[T, B], - cc: TypedColumn[T, C], - cd: TypedColumn[T, D], - ce: TypedColumn[T, E], - cf: TypedColumn[T, F], - cg: TypedColumn[T, G], - ch: TypedColumn[T, H], - ci: TypedColumn[T, I], - cj: TypedColumn[T, J] - ): TypedDataset[(A, B, C, D, E, F, G, H, I, J)] = { + ca: TypedColumn[T, A], + cb: TypedColumn[T, B], + cc: TypedColumn[T, C], + cd: TypedColumn[T, D], + ce: TypedColumn[T, E], + cf: TypedColumn[T, F], + cg: TypedColumn[T, G], + ch: TypedColumn[T, H], + ci: TypedColumn[T, I], + cj: TypedColumn[T, J] + ): TypedDataset[(A, B, C, D, E, F, G, H, I, J)] = { implicit val (ea, eb, ec, ed, ee, ef, eg, eh, ei, ej) = - (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder, ci.uencoder, cj.uencoder) + ( + ca.uencoder, + cb.uencoder, + cc.uencoder, + cd.uencoder, + ce.uencoder, + cf.uencoder, + cg.uencoder, + ch.uencoder, + ci.uencoder, + cj.uencoder + ) selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci, cj) } object selectMany extends ProductArgs { - def applyProduct[U <: HList, Out0 <: HList, Out](columns: U) - (implicit + + def applyProduct[U <: HList, Out0 <: HList, Out]( + columns: U + )(implicit i0: ColumnTypes.Aux[T, U, Out0], i1: ToTraversable.Aux[U, List, UntypedExpression[T]], i2: Tupler.Aux[Out0, Out], i3: TypedEncoder[Out] ): TypedDataset[Out] = { - val base = dataset.toDF() - .select(columns.toList[UntypedExpression[T]].map(c => new Column(c.expr)):_*) - val selected = base.as[Out](TypedExpressionEncoder[Out]) + val base = dataset + .toDF() + .select( + columns + .toList[UntypedExpression[T]] + .map(c => FramelessInternals.column(c.expr)): _* + ) + val selected = base.as[Out](TypedExpressionEncoder[Out]) - TypedDataset.create[Out](selected) - } + TypedDataset.create[Out](selected) + } } /** Sort each partition in the dataset using the columns selected. */ - def sortWithinPartitions[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] = + def sortWithinPartitions[A: CatalystOrdered]( + ca: SortedTypedColumn[T, A] + ): TypedDataset[T] = sortWithinPartitionsMany(ca) /** Sort each partition in the dataset using the columns selected. */ def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered]( - ca: SortedTypedColumn[T, A], - cb: SortedTypedColumn[T, B] - ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb) + ca: SortedTypedColumn[T, A], + cb: SortedTypedColumn[T, B] + ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb) /** Sort each partition in the dataset using the columns selected. */ - def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered]( - ca: SortedTypedColumn[T, A], - cb: SortedTypedColumn[T, B], - cc: SortedTypedColumn[T, C] - ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb, cc) - - /** Sort each partition in the dataset by the given column expressions - * Default sort order is ascending. - * {{{ - * d.sortWithinPartitionsMany(d('a), d('b).desc, d('c).asc) - * }}} - */ + def sortWithinPartitions[ + A: CatalystOrdered, + B: CatalystOrdered, + C: CatalystOrdered + ](ca: SortedTypedColumn[T, A], + cb: SortedTypedColumn[T, B], + cc: SortedTypedColumn[T, C] + ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb, cc) + + /** + * Sort each partition in the dataset by the given column expressions + * Default sort order is ascending. + * {{{ + * d.sortWithinPartitionsMany(d('a), d('b).desc, d('c).asc) + * }}} + */ object sortWithinPartitionsMany extends ProductArgs { - def applyProduct[U <: HList, O <: HList](columns: U) - (implicit + + def applyProduct[U <: HList, O <: HList]( + columns: U + )(implicit i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O], i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]] ): TypedDataset[T] = { - val sorted = dataset.toDF() - .sortWithinPartitions(i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped):_*) + val sorted = dataset + .toDF() + .sortWithinPartitions( + i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped): _* + ) .as[T](TypedExpressionEncoder[T]) TypedDataset.create[T](sorted) @@ -983,273 +1289,316 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val } /** Orders the TypedDataset using the column selected. */ - def orderBy[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] = + def orderBy[A: CatalystOrdered]( + ca: SortedTypedColumn[T, A] + ): TypedDataset[T] = orderByMany(ca) /** Orders the TypedDataset using the columns selected. */ def orderBy[A: CatalystOrdered, B: CatalystOrdered]( - ca: SortedTypedColumn[T, A], - cb: SortedTypedColumn[T, B] - ): TypedDataset[T] = orderByMany(ca, cb) - - /** Orders the TypedDataset using the columns selected. */ - def orderBy[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered]( - ca: SortedTypedColumn[T, A], - cb: SortedTypedColumn[T, B], - cc: SortedTypedColumn[T, C] - ): TypedDataset[T] = orderByMany(ca, cb, cc) - - /** Sort the dataset by any number of column expressions. - * Default sort order is ascending. - * {{{ - * d.orderByMany(d('a), d('b).desc, d('c).asc) - * }}} - */ + ca: SortedTypedColumn[T, A], + cb: SortedTypedColumn[T, B] + ): TypedDataset[T] = orderByMany(ca, cb) + + /** Orders the TypedDataset using the columns selected. */ + def orderBy[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered]( + ca: SortedTypedColumn[T, A], + cb: SortedTypedColumn[T, B], + cc: SortedTypedColumn[T, C] + ): TypedDataset[T] = orderByMany(ca, cb, cc) + + /** + * Sort the dataset by any number of column expressions. + * Default sort order is ascending. + * {{{ + * d.orderByMany(d('a), d('b).desc, d('c).asc) + * }}} + */ object orderByMany extends ProductArgs { - def applyProduct[U <: HList, O <: HList](columns: U) - (implicit + + def applyProduct[U <: HList, O <: HList]( + columns: U + )(implicit i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O], i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]] ): TypedDataset[T] = { - val sorted = dataset.toDF() - .orderBy(i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped):_*) + val sorted = dataset + .toDF() + .orderBy(i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped): _*) .as[T](TypedExpressionEncoder[T]) TypedDataset.create[T](sorted) } } - /** Returns a new Dataset as a tuple with the specified - * column dropped. - * Does not allow for dropping from a single column TypedDataset - * - * {{{ - * val d: TypedDataset[Foo(a: String, b: Int...)] = ??? - * val result = TypedDataset[(Int, ...)] = d.drop('a) - * }}} - * @param column column to drop specified as a Symbol - * @param i0 LabelledGeneric derived for T - * @param i1 Remover derived for TRep and column - * @param i2 values of T with column removed - * @param i3 tupler of values - * @param i4 evidence of encoder of the tupled values - * @tparam Out Tupled return type - * @tparam TRep shapeless' record representation of T - * @tparam Removed record of T with column removed - * @tparam ValuesFromRemoved values of T with column removed as an HList - * @tparam V value type of column in T - * @return - */ - def dropTupled[Out, TRep <: HList, Removed <: HList, ValuesFromRemoved <: HList, V] - (column: Witness.Lt[Symbol]) - (implicit + /** + * Returns a new Dataset as a tuple with the specified + * column dropped. + * Does not allow for dropping from a single column TypedDataset + * + * {{{ + * val d: TypedDataset[Foo(a: String, b: Int...)] = ??? + * val result = TypedDataset[(Int, ...)] = d.drop('a) + * }}} + * @param column column to drop specified as a Symbol + * @param i0 LabelledGeneric derived for T + * @param i1 Remover derived for TRep and column + * @param i2 values of T with column removed + * @param i3 tupler of values + * @param i4 evidence of encoder of the tupled values + * @tparam Out Tupled return type + * @tparam TRep shapeless' record representation of T + * @tparam Removed record of T with column removed + * @tparam ValuesFromRemoved values of T with column removed as an HList + * @tparam V value type of column in T + * @return + */ + def dropTupled[ + Out, + TRep <: HList, + Removed <: HList, + ValuesFromRemoved <: HList, + V + ](column: Witness.Lt[Symbol] + )(implicit i0: LabelledGeneric.Aux[T, TRep], i1: Remover.Aux[TRep, column.T, (V, Removed)], i2: Values.Aux[Removed, ValuesFromRemoved], i3: Tupler.Aux[ValuesFromRemoved, Out], i4: TypedEncoder[Out] ): TypedDataset[Out] = { - val dropped = dataset - .toDF() - .drop(column.value.name) - .as[Out](TypedExpressionEncoder[Out]) + val dropped = dataset + .toDF() + .drop(column.value.name) + .as[Out](TypedExpressionEncoder[Out]) - TypedDataset.create[Out](dropped) - } + TypedDataset.create[Out](dropped) + } /** - * Drops columns as necessary to return `U` - * - * @example - * {{{ - * case class X(i: Int, j: Int, k: Boolean) - * case class Y(i: Int, k: Boolean) - * val f: TypedDataset[X] = ??? - * val fNew: TypedDataset[Y] = f.drop[Y] - * }}} - * - * @tparam U the output type - * - * @see [[frameless.TypedDataset#project]] - */ - def drop[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = project[U] - - /** Prepends a new column to the Dataset. - * - * {{{ - * case class X(i: Int, j: Int) - * val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) - * val fNew: TypedDataset[(Int,Int,Boolean)] = f.withColumnTupled(f('j) === 10) - * }}} - */ - def withColumnTupled[A: TypedEncoder, H <: HList, FH <: HList, Out] - (ca: TypedColumn[T, A]) - (implicit + * Drops columns as necessary to return `U` + * + * @example + * {{{ + * case class X(i: Int, j: Int, k: Boolean) + * case class Y(i: Int, k: Boolean) + * val f: TypedDataset[X] = ??? + * val fNew: TypedDataset[Y] = f.drop[Y] + * }}} + * + * @tparam U the output type + * + * @see [[frameless.TypedDataset#project]] + */ + def drop[U]( + implicit + projector: SmartProject[T, U] + ): TypedDataset[U] = project[U] + + /** + * Prepends a new column to the Dataset. + * + * {{{ + * case class X(i: Int, j: Int) + * val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) + * val fNew: TypedDataset[(Int,Int,Boolean)] = f.withColumnTupled(f('j) === 10) + * }}} + */ + def withColumnTupled[A: TypedEncoder, H <: HList, FH <: HList, Out]( + ca: TypedColumn[T, A] + )(implicit i0: Generic.Aux[T, H], i1: Prepend.Aux[H, A :: HNil, FH], i2: Tupler.Aux[FH, Out], i3: TypedEncoder[Out] ): TypedDataset[Out] = { - // Giving a random name to the new column (the proper name will be given by the Tuple-based encoder) - val selected = dataset.toDF().withColumn("I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI", ca.untyped) - .as[Out](TypedExpressionEncoder[Out]) + // Giving a random name to the new column (the proper name will be given by the Tuple-based encoder) + val selected = dataset + .toDF() + .withColumn("I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI", ca.untyped) + .as[Out](TypedExpressionEncoder[Out]) - TypedDataset.create[Out](selected) + TypedDataset.create[Out](selected) } - /** Returns a new [[frameless.TypedDataset]] with the specified column updated with a new value - * {{{ - * case class X(i: Int, j: Int) - * val f: TypedDataset[X] = TypedDataset.create(X(1,10) :: Nil) - * val fNew: TypedDataset[X] = f.withColumn('j, f('i)) // results in X(1, 1) :: Nil - * }}} - * @param column column given as a symbol to replace - * @param replacement column to replace the value with - * @param i0 Evidence that a column with the correct type and name exists - */ + /** + * Returns a new [[frameless.TypedDataset]] with the specified column updated with a new value + * {{{ + * case class X(i: Int, j: Int) + * val f: TypedDataset[X] = TypedDataset.create(X(1,10) :: Nil) + * val fNew: TypedDataset[X] = f.withColumn('j, f('i)) // results in X(1, 1) :: Nil + * }}} + * @param column column given as a symbol to replace + * @param replacement column to replace the value with + * @param i0 Evidence that a column with the correct type and name exists + */ def withColumnReplaced[A]( - column: Witness.Lt[Symbol], - replacement: TypedColumn[T, A] - )(implicit - i0: TypedColumn.Exists[T, column.T, A] - ): TypedDataset[T] = { - val updated = dataset.toDF().withColumn(column.value.name, replacement.untyped) + column: Witness.Lt[Symbol], + replacement: TypedColumn[T, A] + )(implicit + i0: TypedColumn.Exists[T, column.T, A] + ): TypedDataset[T] = { + val updated = dataset + .toDF() + .withColumn(column.value.name, replacement.untyped) .as[T](TypedExpressionEncoder[T]) TypedDataset.create[T](updated) } - /** Adds a column to a Dataset so long as the specified output type, `U`, has - * an extra column from `T` that has type `A`. - * - * @example - * {{{ - * case class X(i: Int, j: Int) - * case class Y(i: Int, j: Int, k: Boolean) - * val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) - * val fNew: TypedDataset[Y] = f.withColumn[Y](f('j) === 10) - * }}} - * @param ca The typed column to add - * @param i0 TypeEncoder for output type U - * @param i1 TypeEncoder for added column type A - * @param i2 the LabelledGeneric derived for T - * @param i3 the LabelledGeneric derived for U - * @param i4 proof no fields have been removed - * @param i5 diff from T to U - * @param i6 keys from newFields - * @param i7 the one and only new key - * @param i8 the one and only new field enforcing the type of A exists - * @param i9 the keys of U - * @param iA allows for traversing the keys of U - * @tparam U the output type - * @tparam A The added column type - * @tparam TRep shapeless' record representation of T - * @tparam URep shapeless' record representation of U - * @tparam UKeys the keys of U as an HList - * @tparam NewFields the added fields to T to get U - * @tparam NewKeys the keys of NewFields as an HList - * @tparam NewKey the first, and only, key in NewKey - * - * @see [[frameless.TypedDataset.WithColumnApply#apply]] - */ + /** + * Adds a column to a Dataset so long as the specified output type, `U`, has + * an extra column from `T` that has type `A`. + * + * @example + * {{{ + * case class X(i: Int, j: Int) + * case class Y(i: Int, j: Int, k: Boolean) + * val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) + * val fNew: TypedDataset[Y] = f.withColumn[Y](f('j) === 10) + * }}} + * @param ca The typed column to add + * @param i0 TypeEncoder for output type U + * @param i1 TypeEncoder for added column type A + * @param i2 the LabelledGeneric derived for T + * @param i3 the LabelledGeneric derived for U + * @param i4 proof no fields have been removed + * @param i5 diff from T to U + * @param i6 keys from newFields + * @param i7 the one and only new key + * @param i8 the one and only new field enforcing the type of A exists + * @param i9 the keys of U + * @param iA allows for traversing the keys of U + * @tparam U the output type + * @tparam A The added column type + * @tparam TRep shapeless' record representation of T + * @tparam URep shapeless' record representation of U + * @tparam UKeys the keys of U as an HList + * @tparam NewFields the added fields to T to get U + * @tparam NewKeys the keys of NewFields as an HList + * @tparam NewKey the first, and only, key in NewKey + * + * @see [[frameless.TypedDataset.WithColumnApply#apply]] + */ def withColumn[U] = new WithColumnApply[U] class WithColumnApply[U] { - def apply[A, TRep <: HList, URep <: HList, UKeys <: HList, NewFields <: HList, NewKeys <: HList, NewKey <: Symbol] - (ca: TypedColumn[T, A]) - (implicit - i0: TypedEncoder[U], - i1: TypedEncoder[A], - i2: LabelledGeneric.Aux[T, TRep], - i3: LabelledGeneric.Aux[U, URep], - i4: Diff.Aux[TRep, URep, HNil], - i5: Diff.Aux[URep, TRep, NewFields], - i6: Keys.Aux[NewFields, NewKeys], - i7: IsHCons.Aux[NewKeys, NewKey, HNil], - i8: IsHCons.Aux[NewFields, FieldType[NewKey, A], HNil], - i9: Keys.Aux[URep, UKeys], - iA: ToTraversable.Aux[UKeys, Seq, Symbol] - ): TypedDataset[U] = { + + def apply[ + A, + TRep <: HList, + URep <: HList, + UKeys <: HList, + NewFields <: HList, + NewKeys <: HList, + NewKey <: Symbol + ](ca: TypedColumn[T, A] + )(implicit + i0: TypedEncoder[U], + i1: TypedEncoder[A], + i2: LabelledGeneric.Aux[T, TRep], + i3: LabelledGeneric.Aux[U, URep], + i4: Diff.Aux[TRep, URep, HNil], + i5: Diff.Aux[URep, TRep, NewFields], + i6: Keys.Aux[NewFields, NewKeys], + i7: IsHCons.Aux[NewKeys, NewKey, HNil], + i8: IsHCons.Aux[NewFields, FieldType[NewKey, A], HNil], + i9: Keys.Aux[URep, UKeys], + iA: ToTraversable.Aux[UKeys, Seq, Symbol] + ): TypedDataset[U] = { val newColumnName = i7.head(i6()).name - val dfWithNewColumn = dataset - .toDF() - .withColumn(newColumnName, ca.untyped) + val dfWithNewColumn = dataset.toDF().withColumn(newColumnName, ca.untyped) val newColumns = i9.apply().to[Seq].map(_.name).map(dfWithNewColumn.col) - val selected = dfWithNewColumn - .select(newColumns: _*) - .as[U](TypedExpressionEncoder[U]) + val selected = + dfWithNewColumn.select(newColumns: _*).as[U](TypedExpressionEncoder[U]) TypedDataset.create[U](selected) } } /** - * Explodes a single column at a time. It only compiles if the type of column supports this operation. - * - * @example - * - * {{{ - * case class X(i: Int, j: Array[Int]) - * case class Y(i: Int, j: Int) - * - * val f: TypedDataset[X] = ??? - * val fNew: TypedDataset[Y] = f.explode('j).as[Y] - * }}} - * @param column the column we wish to explode - */ - def explode[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out] - (column: Witness.Lt[Symbol]) - (implicit - i0: TypedColumn.Exists[T, column.T, V[A]], - i1: TypedEncoder[A], - i2: CatalystExplodableCollection[V], - i3: LabelledGeneric.Aux[T, TRep], - i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod], - i5: Values.Aux[OutMod, OutModValues], - i6: Tupler.Aux[OutModValues, Out], - i7: TypedEncoder[Out] - ): TypedDataset[Out] = { - import org.apache.spark.sql.functions.{explode => sparkExplode} + * Explodes a single column at a time. It only compiles if the type of column supports this operation. + * + * @example + * + * {{{ + * case class X(i: Int, j: Array[Int]) + * case class Y(i: Int, j: Int) + * + * val f: TypedDataset[X] = ??? + * val fNew: TypedDataset[Y] = f.explode('j).as[Y] + * }}} + * @param column the column we wish to explode + */ + def explode[ + A, + TRep <: HList, + V[_], + OutMod <: HList, + OutModValues <: HList, + Out + ](column: Witness.Lt[Symbol] + )(implicit + i0: TypedColumn.Exists[T, column.T, V[A]], + i1: TypedEncoder[A], + i2: CatalystExplodableCollection[V], + i3: LabelledGeneric.Aux[T, TRep], + i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod], + i5: Values.Aux[OutMod, OutModValues], + i6: Tupler.Aux[OutModValues, Out], + i7: TypedEncoder[Out] + ): TypedDataset[Out] = { + import org.apache.spark.sql.functions.{ explode => sparkExplode } val df = dataset.toDF() val trans = - df - .withColumn(column.value.name, sparkExplode(df(column.value.name))) + df.withColumn(column.value.name, sparkExplode(df(column.value.name))) .as[Out](TypedExpressionEncoder[Out]) TypedDataset.create[Out](trans) } /** - * Explodes a single column at a time. It only compiles if the type of column supports this operation. - * - * @example - * - * {{{ - * case class X(i: Int, j: Map[Int, Int]) - * case class Y(i: Int, j: (Int, Int)) - * - * val f: TypedDataset[X] = ??? - * val fNew: TypedDataset[Y] = f.explodeMap('j).as[Y] - * }}} - * @param column the column we wish to explode - */ - def explodeMap[A, B, V[_, _], TRep <: HList, OutMod <: HList, OutModValues <: HList, Out] - (column: Witness.Lt[Symbol]) - (implicit - i0: TypedColumn.Exists[T, column.T, V[A, B]], - i1: TypedEncoder[A], - i2: TypedEncoder[B], - i3: LabelledGeneric.Aux[T, TRep], - i4: Modifier.Aux[TRep, column.T, V[A,B], (A, B), OutMod], - i5: Values.Aux[OutMod, OutModValues], - i6: Tupler.Aux[OutModValues, Out], - i7: TypedEncoder[Out] - ): TypedDataset[Out] = { - import org.apache.spark.sql.functions.{explode => sparkExplode, struct => sparkStruct, col => sparkCol} + * Explodes a single column at a time. It only compiles if the type of column supports this operation. + * + * @example + * + * {{{ + * case class X(i: Int, j: Map[Int, Int]) + * case class Y(i: Int, j: (Int, Int)) + * + * val f: TypedDataset[X] = ??? + * val fNew: TypedDataset[Y] = f.explodeMap('j).as[Y] + * }}} + * @param column the column we wish to explode + */ + def explodeMap[ + A, + B, + V[_, _], + TRep <: HList, + OutMod <: HList, + OutModValues <: HList, + Out + ](column: Witness.Lt[Symbol] + )(implicit + i0: TypedColumn.Exists[T, column.T, V[A, B]], + i1: TypedEncoder[A], + i2: TypedEncoder[B], + i3: LabelledGeneric.Aux[T, TRep], + i4: Modifier.Aux[TRep, column.T, V[A, B], (A, B), OutMod], + i5: Values.Aux[OutMod, OutModValues], + i6: Tupler.Aux[OutModValues, Out], + i7: TypedEncoder[Out] + ): TypedDataset[Out] = { + import org.apache.spark.sql.functions.{ + explode => sparkExplode, + struct => sparkStruct, + col => sparkCol + } val df = dataset.toDF() // select all columns, all original columns and [key, value] columns appeared after the map explode @@ -1271,7 +1620,10 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val exploded // map explode explodes it into [key, value] columns // the only way to put it into a column is to create a struct - .withColumn(columnRenamed, sparkStruct(exploded("key"), exploded("value"))) + .withColumn( + columnRenamed, + sparkStruct(exploded("key"), exploded("value")) + ) // selecting only original columns, we don't need [key, value] columns left in the DataFrame after the map explode .select(columns: _*) // rename columns back and form the result @@ -1281,72 +1633,81 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val } /** - * Flattens a column of type Option[A]. Compiles only if the selected column is of type Option[A]. - * - * - * @example - * - * {{{ - * case class X(i: Int, j: Option[Int]) - * case class Y(i: Int, j: Int) - * - * val f: TypedDataset[X] = ??? - * val fNew: TypedDataset[Y] = f.flattenOption('j).as[Y] - * }}} - * - * @param column the column we wish to flatten - */ - def flattenOption[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out] - (column: Witness.Lt[Symbol]) - (implicit - i0: TypedColumn.Exists[T, column.T, V[A]], - i1: TypedEncoder[A], - i2: V[A] =:= Option[A], - i3: LabelledGeneric.Aux[T, TRep], - i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod], - i5: Values.Aux[OutMod, OutModValues], - i6: Tupler.Aux[OutModValues, Out], - i7: TypedEncoder[Out] - ): TypedDataset[Out] = { + * Flattens a column of type Option[A]. Compiles only if the selected column is of type Option[A]. + * + * @example + * + * {{{ + * case class X(i: Int, j: Option[Int]) + * case class Y(i: Int, j: Int) + * + * val f: TypedDataset[X] = ??? + * val fNew: TypedDataset[Y] = f.flattenOption('j).as[Y] + * }}} + * + * @param column the column we wish to flatten + */ + def flattenOption[ + A, + TRep <: HList, + V[_], + OutMod <: HList, + OutModValues <: HList, + Out + ](column: Witness.Lt[Symbol] + )(implicit + i0: TypedColumn.Exists[T, column.T, V[A]], + i1: TypedEncoder[A], + i2: V[A] =:= Option[A], + i3: LabelledGeneric.Aux[T, TRep], + i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod], + i5: Values.Aux[OutMod, OutModValues], + i6: Tupler.Aux[OutModValues, Out], + i7: TypedEncoder[Out] + ): TypedDataset[Out] = { val df = dataset.toDF() - val trans = df.filter(df(column.value.name).isNotNull). - as[Out](TypedExpressionEncoder[Out]) + val trans = df + .filter(df(column.value.name).isNotNull) + .as[Out](TypedExpressionEncoder[Out]) TypedDataset.create[Out](trans) } } object TypedDataset { - def create[A](data: Seq[A]) - (implicit + + def create[A]( + data: Seq[A] + )(implicit encoder: TypedEncoder[A], sqlContext: SparkSession ): TypedDataset[A] = { - val dataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A]) + val dataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A]) - TypedDataset.create[A](dataset) - } + TypedDataset.create[A](dataset) + } - def create[A](data: RDD[A]) - (implicit + def create[A]( + data: RDD[A] + )(implicit encoder: TypedEncoder[A], sqlContext: SparkSession ): TypedDataset[A] = { - val dataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A]) + val dataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A]) - TypedDataset.create[A](dataset) - } + TypedDataset.create[A](dataset) + } def create[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] = createUnsafe(dataset.toDF()) /** - * Creates a [[frameless.TypedDataset]] from a Spark [[org.apache.spark.sql.DataFrame]]. - * Note that the names and types need to align! - * - * This is an unsafe operation: If the schemas do not align, - * the error will be captured at runtime (not during compilation). - */ + * Creates a [[frameless.TypedDataset]] from a Spark [[org.apache.spark.sql.DataFrame]]. + * Note that the names and types need to align! + * + * This is an unsafe operation: If the schemas do not align, + * the error will be captured at runtime (not during compilation). + */ def createUnsafe[A: TypedEncoder](df: DataFrame): TypedDataset[A] = { val e = TypedEncoder[A] val output: Seq[Attribute] = df.queryExecution.analyzed.output @@ -1358,7 +1719,8 @@ object TypedDataset { throw new IllegalStateException( s"Unsupported creation of TypedDataset with ${targetFields.size} column(s) " + s"from a DataFrame with ${output.size} columns. " + - "Try to `select()` the proper columns in the right order before calling `create()`.") + "Try to `select()` the proper columns in the right order before calling `create()`." + ) } // Adapt names if they are not the same (note: types still might not match) @@ -1368,7 +1730,7 @@ object TypedDataset { val canSelect = targetColNames.toSet.subsetOf(output.map(_.name).toSet) val reshaped = if (shouldReshape && canSelect) { - df.select(targetColNames.head, targetColNames.tail:_*) + df.select(targetColNames.head, targetColNames.tail: _*) } else if (shouldReshape) { df.toDF(targetColNames: _*) } else { @@ -1378,9 +1740,14 @@ object TypedDataset { new TypedDataset[A](reshaped.as[A](TypedExpressionEncoder[A])) } - /** Prefer `TypedDataset.create` over `TypedDataset.unsafeCreate` unless you - * know what you are doing. */ - @deprecated("Prefer TypedDataset.create over TypedDataset.unsafeCreate", "0.3.0") + /** + * Prefer `TypedDataset.create` over `TypedDataset.unsafeCreate` unless you + * know what you are doing. + */ + @deprecated( + "Prefer TypedDataset.create over TypedDataset.unsafeCreate", + "0.3.0" + ) def unsafeCreate[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] = { new TypedDataset[A](dataset) } diff --git a/dataset/src/main/scala/frameless/TypedDatasetForwarded.scala b/dataset/src/main/scala/frameless/TypedDatasetForwarded.scala index d417caf8e..0856732f2 100644 --- a/dataset/src/main/scala/frameless/TypedDatasetForwarded.scala +++ b/dataset/src/main/scala/frameless/TypedDatasetForwarded.scala @@ -6,366 +6,429 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.streaming.DataStreamWriter import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, DataFrameWriter, SQLContext, SparkSession} +import org.apache.spark.sql.{ + DataFrame, + DataFrameWriter, + FramelessInternals, + SQLContext, + SparkSession +} import org.apache.spark.storage.StorageLevel import scala.util.Random -/** This trait implements [[TypedDataset]] methods that have the same signature - * than their `Dataset` equivalent. Each method simply forwards the call to the - * underlying `Dataset`. - * - * Documentation marked "apache/spark" is thanks to apache/spark Contributors - * at https://github.com/apache/spark, licensed under Apache v2.0 available at - * http://www.apache.org/licenses/LICENSE-2.0 - */ +/** + * This trait implements [[TypedDataset]] methods that have the same signature + * than their `Dataset` equivalent. Each method simply forwards the call to the + * underlying `Dataset`. + * + * Documentation marked "apache/spark" is thanks to apache/spark Contributors + * at https://github.com/apache/spark, licensed under Apache v2.0 available at + * http://www.apache.org/licenses/LICENSE-2.0 + */ trait TypedDatasetForwarded[T] { self: TypedDataset[T] => override def toString: String = dataset.toString /** - * Returns a `SparkSession` from this [[TypedDataset]]. - */ + * Returns a `SparkSession` from this [[TypedDataset]]. + */ def sparkSession: SparkSession = dataset.sparkSession /** - * Returns a `SQLContext` from this [[TypedDataset]]. - */ + * Returns a `SQLContext` from this [[TypedDataset]]. + */ def sqlContext: SQLContext = - dataset.sqlContext + FramelessInternals.sqlContext(dataset) /** - * Returns the schema of this Dataset. - * - * apache/spark - */ + * Returns the schema of this Dataset. + * + * apache/spark + */ def schema: StructType = dataset.schema - /** Prints the schema of the underlying `Dataset` to the console in a nice tree format. - * - * apache/spark + /** + * Prints the schema of the underlying `Dataset` to the console in a nice tree format. + * + * apache/spark */ def printSchema(): Unit = dataset.printSchema() - /** Prints the plans (logical and physical) to the console for debugging purposes. - * - * apache/spark + /** + * Prints the plans (logical and physical) to the console for debugging purposes. + * + * apache/spark */ def explain(extended: Boolean = false): Unit = dataset.explain(extended) /** - * Returns a `QueryExecution` from this [[TypedDataset]]. - * - * It is the primary workflow for executing relational queries using Spark. Designed to allow easy - * access to the intermediate phases of query execution for developers. - * - * apache/spark - */ + * Returns a `QueryExecution` from this [[TypedDataset]]. + * + * It is the primary workflow for executing relational queries using Spark. Designed to allow easy + * access to the intermediate phases of query execution for developers. + * + * apache/spark + */ def queryExecution: QueryExecution = dataset.queryExecution - /** Converts this strongly typed collection of data to generic Dataframe. In contrast to the - * strongly typed objects that Dataset operations work on, a Dataframe returns generic Row - * objects that allow fields to be accessed by ordinal or name. - * - * apache/spark - */ + /** + * Converts this strongly typed collection of data to generic Dataframe. In contrast to the + * strongly typed objects that Dataset operations work on, a Dataframe returns generic Row + * objects that allow fields to be accessed by ordinal or name. + * + * apache/spark + */ def toDF(): DataFrame = dataset.toDF() - /** Converts this [[TypedDataset]] to an RDD. - * - * apache/spark - */ + /** + * Converts this [[TypedDataset]] to an RDD. + * + * apache/spark + */ def rdd: RDD[T] = dataset.rdd - /** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions. - * - * apache/spark - */ + /** + * Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions. + * + * apache/spark + */ def repartition(numPartitions: Int): TypedDataset[T] = TypedDataset.create(dataset.repartition(numPartitions)) - /** - * Get the [[TypedDataset]]'s current storage level, or StorageLevel.NONE if not persisted. - * - * apache/spark - */ + * Get the [[TypedDataset]]'s current storage level, or StorageLevel.NONE if not persisted. + * + * apache/spark + */ def storageLevel(): StorageLevel = dataset.storageLevel /** - * Returns the content of the [[TypedDataset]] as a Dataset of JSON strings. - * - * apache/spark - */ + * Returns the content of the [[TypedDataset]] as a Dataset of JSON strings. + * + * apache/spark + */ def toJSON: TypedDataset[String] = TypedDataset.create(dataset.toJSON) /** - * Interface for saving the content of the non-streaming [[TypedDataset]] out into external storage. - * - * apache/spark - */ + * Interface for saving the content of the non-streaming [[TypedDataset]] out into external storage. + * + * apache/spark + */ def write: DataFrameWriter[T] = dataset.write /** - * Interface for saving the content of the streaming Dataset out into external storage. - * - * apache/spark - */ + * Interface for saving the content of the streaming Dataset out into external storage. + * + * apache/spark + */ def writeStream: DataStreamWriter[T] = dataset.writeStream - - /** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions. - * Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. - * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of - * the 100 new partitions will claim 10 of the current partitions. - * - * apache/spark - */ + + /** + * Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions. + * Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. + * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of + * the 100 new partitions will claim 10 of the current partitions. + * + * apache/spark + */ def coalesce(numPartitions: Int): TypedDataset[T] = TypedDataset.create(dataset.coalesce(numPartitions)) /** - * Returns an `Array` that contains all column names in this [[TypedDataset]]. - */ + * Returns an `Array` that contains all column names in this [[TypedDataset]]. + */ def columns: Array[String] = dataset.columns - /** Concise syntax for chaining custom transformations. - * - * apache/spark - */ + /** + * Concise syntax for chaining custom transformations. + * + * apache/spark + */ def transform[U](t: TypedDataset[T] => TypedDataset[U]): TypedDataset[U] = t(this) - /** Returns a new Dataset by taking the first `n` rows. The difference between this function - * and `head` is that `head` is an action and returns an array (by triggering query execution) - * while `limit` returns a new Dataset. - * - * apache/spark - */ + /** + * Returns a new Dataset by taking the first `n` rows. The difference between this function + * and `head` is that `head` is an action and returns an array (by triggering query execution) + * while `limit` returns a new Dataset. + * + * apache/spark + */ def limit(n: Int): TypedDataset[T] = TypedDataset.create(dataset.limit(n)) - /** Returns a new [[TypedDataset]] by sampling a fraction of records. - * - * apache/spark - */ - def sample(withReplacement: Boolean, fraction: Double, seed: Long = Random.nextLong()): TypedDataset[T] = + /** + * Returns a new [[TypedDataset]] by sampling a fraction of records. + * + * apache/spark + */ + def sample( + withReplacement: Boolean, + fraction: Double, + seed: Long = Random.nextLong() + ): TypedDataset[T] = TypedDataset.create(dataset.sample(withReplacement, fraction, seed)) - /** Returns a new [[TypedDataset]] that contains only the unique elements of this [[TypedDataset]]. - * - * Note that, equality checking is performed directly on the encoded representation of the data - * and thus is not affected by a custom `equals` function defined on `T`. - * - * apache/spark - */ + /** + * Returns a new [[TypedDataset]] that contains only the unique elements of this [[TypedDataset]]. + * + * Note that, equality checking is performed directly on the encoded representation of the data + * and thus is not affected by a custom `equals` function defined on `T`. + * + * apache/spark + */ def distinct: TypedDataset[T] = TypedDataset.create(dataset.distinct()) /** - * Returns a best-effort snapshot of the files that compose this [[TypedDataset]]. This method simply - * asks each constituent BaseRelation for its respective files and takes the union of all results. - * Depending on the source relations, this may not find all input files. Duplicates are removed. - * - * apache/spark - */ + * Returns a best-effort snapshot of the files that compose this [[TypedDataset]]. This method simply + * asks each constituent BaseRelation for its respective files and takes the union of all results. + * Depending on the source relations, this may not find all input files. Duplicates are removed. + * + * apache/spark + */ def inputFiles: Array[String] = dataset.inputFiles /** - * Returns true if the `collect` and `take` methods can be run locally - * (without any Spark executors). - * - * apache/spark - */ + * Returns true if the `collect` and `take` methods can be run locally + * (without any Spark executors). + * + * apache/spark + */ def isLocal: Boolean = dataset.isLocal /** - * Returns true if this [[TypedDataset]] contains one or more sources that continuously - * return data as it arrives. A [[TypedDataset]] that reads data from a streaming source - * must be executed as a `StreamingQuery` using the `start()` method in - * `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or - * `collect()`, will throw an `AnalysisException` when there is a streaming - * source present. - * - * apache/spark - */ + * Returns true if this [[TypedDataset]] contains one or more sources that continuously + * return data as it arrives. A [[TypedDataset]] that reads data from a streaming source + * must be executed as a `StreamingQuery` using the `start()` method in + * `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or + * `collect()`, will throw an `AnalysisException` when there is a streaming + * source present. + * + * apache/spark + */ def isStreaming: Boolean = dataset.isStreaming - /** Returns a new [[TypedDataset]] that contains only the elements of this [[TypedDataset]] that are also - * present in `other`. - * - * Note that, equality checking is performed directly on the encoded representation of the data - * and thus is not affected by a custom `equals` function defined on `T`. - * - * apache/spark - */ + /** + * Returns a new [[TypedDataset]] that contains only the elements of this [[TypedDataset]] that are also + * present in `other`. + * + * Note that, equality checking is performed directly on the encoded representation of the data + * and thus is not affected by a custom `equals` function defined on `T`. + * + * apache/spark + */ def intersect(other: TypedDataset[T]): TypedDataset[T] = TypedDataset.create(dataset.intersect(other.dataset)) /** - * Randomly splits this [[TypedDataset]] with the provided weights. - * Weights for splits, will be normalized if they don't sum to 1. - * - * apache/spark - */ + * Randomly splits this [[TypedDataset]] with the provided weights. + * Weights for splits, will be normalized if they don't sum to 1. + * + * apache/spark + */ // $COVERAGE-OFF$ We can not test this method because it is non-deterministic. def randomSplit(weights: Array[Double]): Array[TypedDataset[T]] = dataset.randomSplit(weights).map(TypedDataset.create[T]) // $COVERAGE-ON$ /** - * Randomly splits this [[TypedDataset]] with the provided weights. - * Weights for splits, will be normalized if they don't sum to 1. - * - * apache/spark - */ + * Randomly splits this [[TypedDataset]] with the provided weights. + * Weights for splits, will be normalized if they don't sum to 1. + * + * apache/spark + */ def randomSplit(weights: Array[Double], seed: Long): Array[TypedDataset[T]] = dataset.randomSplit(weights, seed).map(TypedDataset.create[T]) /** - * Returns a Java list that contains randomly split [[TypedDataset]] with the provided weights. - * Weights for splits, will be normalized if they don't sum to 1. - * - * apache/spark - */ - def randomSplitAsList(weights: Array[Double], seed: Long): util.List[TypedDataset[T]] = { + * Returns a Java list that contains randomly split [[TypedDataset]] with the provided weights. + * Weights for splits, will be normalized if they don't sum to 1. + * + * apache/spark + */ + def randomSplitAsList( + weights: Array[Double], + seed: Long + ): util.List[TypedDataset[T]] = { val values = randomSplit(weights, seed) java.util.Arrays.asList(values: _*) } - - /** Returns a new Dataset containing rows in this Dataset but not in another Dataset. - * This is equivalent to `EXCEPT` in SQL. - * - * Note that, equality checking is performed directly on the encoded representation of the data - * and thus is not affected by a custom `equals` function defined on `T`. - * - * apache/spark - */ + /** + * Returns a new Dataset containing rows in this Dataset but not in another Dataset. + * This is equivalent to `EXCEPT` in SQL. + * + * Note that, equality checking is performed directly on the encoded representation of the data + * and thus is not affected by a custom `equals` function defined on `T`. + * + * apache/spark + */ def except(other: TypedDataset[T]): TypedDataset[T] = TypedDataset.create(dataset.except(other.dataset)) - /** Persist this [[TypedDataset]] with the default storage level (`MEMORY_AND_DISK`). - * - * apache/spark - */ + /** + * Persist this [[TypedDataset]] with the default storage level (`MEMORY_AND_DISK`). + * + * apache/spark + */ def cache(): TypedDataset[T] = TypedDataset.create(dataset.cache()) - /** Persist this [[TypedDataset]] with the given storage level. - * @param newLevel One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`, - * `MEMORY_AND_DISK_SER`, `DISK_ONLY`, `MEMORY_ONLY_2`, `MEMORY_AND_DISK_2`, etc. - * - * apache/spark - */ - def persist(newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK): TypedDataset[T] = + /** + * Persist this [[TypedDataset]] with the given storage level. + * @param newLevel One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`, + * `MEMORY_AND_DISK_SER`, `DISK_ONLY`, `MEMORY_ONLY_2`, `MEMORY_AND_DISK_2`, etc. + * + * apache/spark + */ + def persist( + newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK + ): TypedDataset[T] = TypedDataset.create(dataset.persist(newLevel)) - /** Mark the [[TypedDataset]] as non-persistent, and remove all blocks for it from memory and disk. - * @param blocking Whether to block until all blocks are deleted. - * - * apache/spark - */ + /** + * Mark the [[TypedDataset]] as non-persistent, and remove all blocks for it from memory and disk. + * @param blocking Whether to block until all blocks are deleted. + * + * apache/spark + */ def unpersist(blocking: Boolean = false): TypedDataset[T] = TypedDataset.create(dataset.unpersist(blocking)) // $COVERAGE-OFF$ We do not test deprecated method since forwarded methods are tested. - @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0") + @deprecated( + "deserialized methods have moved to a separate section to highlight their runtime overhead", + "0.4.0" + ) def map[U: TypedEncoder](func: T => U): TypedDataset[U] = deserialized.map(func) - @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0") - def mapPartitions[U: TypedEncoder](func: Iterator[T] => Iterator[U]): TypedDataset[U] = + @deprecated( + "deserialized methods have moved to a separate section to highlight their runtime overhead", + "0.4.0" + ) + def mapPartitions[U: TypedEncoder]( + func: Iterator[T] => Iterator[U] + ): TypedDataset[U] = deserialized.mapPartitions(func) - @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0") + @deprecated( + "deserialized methods have moved to a separate section to highlight their runtime overhead", + "0.4.0" + ) def flatMap[U: TypedEncoder](func: T => TraversableOnce[U]): TypedDataset[U] = deserialized.flatMap(func) - @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0") + @deprecated( + "deserialized methods have moved to a separate section to highlight their runtime overhead", + "0.4.0" + ) def filter(func: T => Boolean): TypedDataset[T] = deserialized.filter(func) - @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0") + @deprecated( + "deserialized methods have moved to a separate section to highlight their runtime overhead", + "0.4.0" + ) def reduceOption[F[_]: SparkDelay](func: (T, T) => T): F[Option[T]] = deserialized.reduceOption(func) // $COVERAGE-ON$ - /** Methods on `TypedDataset[T]` that go through a full serialization and - * deserialization of `T`, and execute outside of the Catalyst runtime. - * - * @example The correct way to do a projection on a single column is to - * use the `select` method as follows: - * - * {{{ - * ds: TypedDataset[(String, String, String)] -> ds.select(ds('_2)).run() - * }}} - * - * Spark provides an alternative way to obtain the same resulting `Dataset`, - * using the `map` method: - * - * {{{ - * ds: TypedDataset[(String, String, String)] -> ds.deserialized.map(_._2).run() - * }}} - * - * This second approach is however substantially slower than the first one, - * and should be avoided as possible. Indeed, under the hood this `map` will - * deserialize the entire `Tuple3` to an full JVM object, call the apply - * method of the `_._2` closure on it, and serialize the resulting String back - * to its Catalyst representation. - */ + /** + * Methods on `TypedDataset[T]` that go through a full serialization and + * deserialization of `T`, and execute outside of the Catalyst runtime. + * + * @example The correct way to do a projection on a single column is to + * use the `select` method as follows: + * + * {{{ + * ds: TypedDataset[(String, String, String)] -> ds.select(ds('_2)).run() + * }}} + * + * Spark provides an alternative way to obtain the same resulting `Dataset`, + * using the `map` method: + * + * {{{ + * ds: TypedDataset[(String, String, String)] -> ds.deserialized.map(_._2).run() + * }}} + * + * This second approach is however substantially slower than the first one, + * and should be avoided as possible. Indeed, under the hood this `map` will + * deserialize the entire `Tuple3` to an full JVM object, call the apply + * method of the `_._2` closure on it, and serialize the resulting String back + * to its Catalyst representation. + */ object deserialized { - /** Returns a new [[TypedDataset]] that contains the result of applying `func` to each element. - * - * apache/spark - */ + + /** + * Returns a new [[TypedDataset]] that contains the result of applying `func` to each element. + * + * apache/spark + */ def map[U: TypedEncoder](func: T => U): TypedDataset[U] = TypedDataset.create(self.dataset.map(func)(TypedExpressionEncoder[U])) - /** Returns a new [[TypedDataset]] that contains the result of applying `func` to each partition. - * - * apache/spark - */ - def mapPartitions[U: TypedEncoder](func: Iterator[T] => Iterator[U]): TypedDataset[U] = - TypedDataset.create(self.dataset.mapPartitions(func)(TypedExpressionEncoder[U])) - - /** Returns a new [[TypedDataset]] by first applying a function to all elements of this [[TypedDataset]], - * and then flattening the results. - * - * apache/spark - */ - def flatMap[U: TypedEncoder](func: T => TraversableOnce[U]): TypedDataset[U] = + /** + * Returns a new [[TypedDataset]] that contains the result of applying `func` to each partition. + * + * apache/spark + */ + def mapPartitions[U: TypedEncoder]( + func: Iterator[T] => Iterator[U] + ): TypedDataset[U] = + TypedDataset.create( + self.dataset.mapPartitions(func)(TypedExpressionEncoder[U]) + ) + + /** + * Returns a new [[TypedDataset]] by first applying a function to all elements of this [[TypedDataset]], + * and then flattening the results. + * + * apache/spark + */ + def flatMap[U: TypedEncoder]( + func: T => TraversableOnce[U] + ): TypedDataset[U] = TypedDataset.create(self.dataset.flatMap(func)(TypedExpressionEncoder[U])) - /** Returns a new [[TypedDataset]] that only contains elements where `func` returns `true`. - * - * apache/spark - */ + /** + * Returns a new [[TypedDataset]] that only contains elements where `func` returns `true`. + * + * apache/spark + */ def filter(func: T => Boolean): TypedDataset[T] = TypedDataset.create(self.dataset.filter(func)) - /** Optionally reduces the elements of this [[TypedDataset]] using the specified binary function. The given - * `func` must be commutative and associative or the result may be non-deterministic. - * - * Differs from `Dataset#reduce` by wrapping its result into an `Option` and an effect-suspending `F`. - */ - def reduceOption[F[_]](func: (T, T) => T)(implicit F: SparkDelay[F]): F[Option[T]] = + /** + * Optionally reduces the elements of this [[TypedDataset]] using the specified binary function. The given + * `func` must be commutative and associative or the result may be non-deterministic. + * + * Differs from `Dataset#reduce` by wrapping its result into an `Option` and an effect-suspending `F`. + */ + def reduceOption[F[_]]( + func: (T, T) => T + )(implicit + F: SparkDelay[F] + ): F[Option[T]] = F.delay { try { Option(self.dataset.reduce(func)) diff --git a/dataset/src/main/scala/frameless/TypedExpressionEncoder.scala b/dataset/src/main/scala/frameless/TypedExpressionEncoder.scala index 5b78cd292..71fa286a5 100644 --- a/dataset/src/main/scala/frameless/TypedExpressionEncoder.scala +++ b/dataset/src/main/scala/frameless/TypedExpressionEncoder.scala @@ -1,20 +1,25 @@ package frameless import org.apache.spark.sql.Encoder +import org.apache.spark.sql.FramelessInternals import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{BoundReference, CreateNamedStruct, If} +import org.apache.spark.sql.catalyst.expressions.{ + BoundReference, + CreateNamedStruct, + If +} import org.apache.spark.sql.types.StructType object TypedExpressionEncoder { - /** In Spark, DataFrame has always schema of StructType - * - * DataFrames of primitive types become records - * with a single field called "value" set in ExpressionEncoder. - */ + /** + * In Spark, DataFrame has always schema of StructType + * + * DataFrames of primitive types become records + * with a single field called "value" set in ExpressionEncoder. + */ def targetStructType[A](encoder: TypedEncoder[A]): StructType = - encoder.catalystRepr match { + encoder.catalystRepr match { case x: StructType => if (encoder.nullable) StructType(x.fields.map(_.copy(nullable = true))) else x @@ -22,7 +27,10 @@ object TypedExpressionEncoder { case dt => new StructType().add("value", dt, nullable = encoder.nullable) } - def apply[T](implicit encoder: TypedEncoder[T]): Encoder[T] = { + def apply[T]( + implicit + encoder: TypedEncoder[T] + ): Encoder[T] = { val in = BoundReference(0, encoder.jvmRepr, encoder.nullable) val (out, serializer) = encoder.toCatalyst(in) match { @@ -39,11 +47,10 @@ object TypedExpressionEncoder { } } - new ExpressionEncoder[T]( + FramelessInternals.expressionEncoder[T]( objSerializer = serializer, objDeserializer = encoder.fromCatalyst(out), - clsTag = encoder.classTag + classTag = encoder.classTag ) } } - diff --git a/dataset/src/main/scala/frameless/ops/GroupByOps.scala b/dataset/src/main/scala/frameless/ops/GroupByOps.scala index 3feeaca59..e6f51a407 100644 --- a/dataset/src/main/scala/frameless/ops/GroupByOps.scala +++ b/dataset/src/main/scala/frameless/ops/GroupByOps.scala @@ -3,36 +3,54 @@ package ops import org.apache.spark.sql.catalyst.analysis.UnresolvedAlias import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.{Column, Dataset, FramelessInternals, RelationalGroupedDataset} +import org.apache.spark.sql.{ + Column, + Dataset, + FramelessInternals, + RelationalGroupedDataset +} import shapeless._ -import shapeless.ops.hlist.{Length, Mapped, Prepend, ToList, ToTraversable, Tupler} +import shapeless.ops.hlist.{ + Length, + Mapped, + Prepend, + ToList, + ToTraversable, + Tupler +} -class GroupedByManyOps[T, TK <: HList, K <: HList, KT] - (self: TypedDataset[T], groupedBy: TK) - (implicit +class GroupedByManyOps[T, TK <: HList, K <: HList, KT]( + self: TypedDataset[T], + groupedBy: TK + )(implicit i0: ColumnTypes.Aux[T, TK, K], i1: ToTraversable.Aux[TK, List, UntypedExpression[T]], - i3: Tupler.Aux[K, KT] - ) extends AggregatingOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.groupBy(cols: _*)) { + i3: Tupler.Aux[K, KT]) + extends AggregatingOps[T, TK, K, KT]( + self, + groupedBy, + (dataset, cols) => dataset.groupBy(cols: _*) + ) { + object agg extends ProductArgs { - def applyProduct[TC <: HList, C <: HList, Out0 <: HList, Out1] - (columns: TC) - (implicit + + def applyProduct[TC <: HList, C <: HList, Out0 <: HList, Out1]( + columns: TC + )(implicit i3: AggregateTypes.Aux[T, TC, C], i4: Prepend.Aux[K, C, Out0], i5: Tupler.Aux[Out0, Out1], i6: TypedEncoder[Out1], i7: ToTraversable.Aux[TC, List, UntypedExpression[T]] ): TypedDataset[Out1] = { - aggregate[TC, Out1](columns) - } + aggregate[TC, Out1](columns) + } } } class GroupedBy1Ops[K1, V]( - self: TypedDataset[V], - g1: TypedColumn[V, K1] -) { + self: TypedDataset[V], + g1: TypedColumn[V, K1]) { private def underlying = new GroupedByManyOps(self, g1 :: HNil) private implicit def eg1 = g1.uencoder @@ -41,49 +59,77 @@ class GroupedBy1Ops[K1, V]( underlying.agg(c1) } - def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(K1, U1, U2)] = { + def agg[U1, U2]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2] + ): TypedDataset[(K1, U1, U2)] = { implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder underlying.agg(c1, c2) } - def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(K1, U1, U2, U3)] = { - implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder + def agg[U1, U2, U3]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2], + c3: TypedAggregate[V, U3] + ): TypedDataset[(K1, U1, U2, U3)] = { + implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; + implicit val e3 = c3.uencoder underlying.agg(c1, c2, c3) } - def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(K1, U1, U2, U3, U4)] = { - implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder + def agg[U1, U2, U3, U4]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2], + c3: TypedAggregate[V, U3], + c4: TypedAggregate[V, U4] + ): TypedDataset[(K1, U1, U2, U3, U4)] = { + implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; + implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder underlying.agg(c1, c2, c3, c4) } - def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(K1, U1, U2, U3, U4, U5)] = { - implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder + def agg[U1, U2, U3, U4, U5]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2], + c3: TypedAggregate[V, U3], + c4: TypedAggregate[V, U4], + c5: TypedAggregate[V, U5] + ): TypedDataset[(K1, U1, U2, U3, U4, U5)] = { + implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; + implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; + implicit val e5 = c5.uencoder underlying.agg(c1, c2, c3, c4, c5) } - /** Methods on `TypedDataset[T]` that go through a full serialization and - * deserialization of `T`, and execute outside of the Catalyst runtime. - */ + /** + * Methods on `TypedDataset[T]` that go through a full serialization and + * deserialization of `T`, and execute outside of the Catalyst runtime. + */ object deserialized { - def mapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => U): TypedDataset[U] = { + + def mapGroups[U: TypedEncoder]( + f: (K1, Iterator[V]) => U + ): TypedDataset[U] = { underlying.deserialized.mapGroups(AggregatingOps.tuple1(f)) } - def flatMapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = { + def flatMapGroups[U: TypedEncoder]( + f: (K1, Iterator[V]) => TraversableOnce[U] + ): TypedDataset[U] = { underlying.deserialized.flatMapGroups(AggregatingOps.tuple1(f)) } } - def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]): PivotNotValues[V, TypedColumn[V,K1] :: HNil, P] = + def pivot[P: CatalystPivotable]( + pivotColumn: TypedColumn[V, P] + ): PivotNotValues[V, TypedColumn[V, K1] :: HNil, P] = PivotNotValues(self, g1 :: HNil, pivotColumn) } - class GroupedBy2Ops[K1, K2, V]( - self: TypedDataset[V], - g1: TypedColumn[V, K1], - g2: TypedColumn[V, K2] -) { + self: TypedDataset[V], + g1: TypedColumn[V, K1], + g2: TypedColumn[V, K2]) { private def underlying = new GroupedByManyOps(self, g1 :: g2 :: HNil) private implicit def eg1 = g1.uencoder private implicit def eg2 = g2.uencoder @@ -93,58 +139,90 @@ class GroupedBy2Ops[K1, K2, V]( underlying.agg(c1) } - def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(K1, K2, U1, U2)] = { + def agg[U1, U2]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2] + ): TypedDataset[(K1, K2, U1, U2)] = { implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder underlying.agg(c1, c2) } - def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(K1, K2, U1, U2, U3)] = { - implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder + def agg[U1, U2, U3]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2], + c3: TypedAggregate[V, U3] + ): TypedDataset[(K1, K2, U1, U2, U3)] = { + implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; + implicit val e3 = c3.uencoder underlying.agg(c1, c2, c3) } - def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(K1, K2, U1, U2, U3, U4)] = { - implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder - underlying.agg(c1 , c2 , c3 , c4) + def agg[U1, U2, U3, U4]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2], + c3: TypedAggregate[V, U3], + c4: TypedAggregate[V, U4] + ): TypedDataset[(K1, K2, U1, U2, U3, U4)] = { + implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; + implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder + underlying.agg(c1, c2, c3, c4) } - def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(K1, K2, U1, U2, U3, U4, U5)] = { - implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder + def agg[U1, U2, U3, U4, U5]( + c1: TypedAggregate[V, U1], + c2: TypedAggregate[V, U2], + c3: TypedAggregate[V, U3], + c4: TypedAggregate[V, U4], + c5: TypedAggregate[V, U5] + ): TypedDataset[(K1, K2, U1, U2, U3, U4, U5)] = { + implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; + implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; + implicit val e5 = c5.uencoder underlying.agg(c1, c2, c3, c4, c5) } - - /** Methods on `TypedDataset[T]` that go through a full serialization and - * deserialization of `T`, and execute outside of the Catalyst runtime. - */ + /** + * Methods on `TypedDataset[T]` that go through a full serialization and + * deserialization of `T`, and execute outside of the Catalyst runtime. + */ object deserialized { - def mapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => U): TypedDataset[U] = { + + def mapGroups[U: TypedEncoder]( + f: ((K1, K2), Iterator[V]) => U + ): TypedDataset[U] = { underlying.deserialized.mapGroups(f) } - def flatMapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = { + def flatMapGroups[U: TypedEncoder]( + f: ((K1, K2), Iterator[V]) => TraversableOnce[U] + ): TypedDataset[U] = { underlying.deserialized.flatMapGroups(f) } } - def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]): - PivotNotValues[V, TypedColumn[V,K1] :: TypedColumn[V, K2] :: HNil, P] = - PivotNotValues(self, g1 :: g2 :: HNil, pivotColumn) + def pivot[P: CatalystPivotable]( + pivotColumn: TypedColumn[V, P] + ): PivotNotValues[V, TypedColumn[V, K1] :: TypedColumn[V, K2] :: HNil, P] = + PivotNotValues(self, g1 :: g2 :: HNil, pivotColumn) } -private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT] - (self: TypedDataset[T], groupedBy: TK, groupingFunc: (Dataset[T], Seq[Column]) => RelationalGroupedDataset) - (implicit +private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT]( + self: TypedDataset[T], + groupedBy: TK, + groupingFunc: (Dataset[T], Seq[Column]) => RelationalGroupedDataset + )(implicit i0: ColumnTypes.Aux[T, TK, K], i1: ToTraversable.Aux[TK, List, UntypedExpression[T]], - i2: Tupler.Aux[K, KT] - ) { - def aggregate[TC <: HList, Out1](columns: TC) - (implicit - i7: TypedEncoder[Out1], - i8: ToTraversable.Aux[TC, List, UntypedExpression[T]] - ): TypedDataset[Out1] = { - def expr(c: UntypedExpression[T]): Column = new Column(c.expr) + i2: Tupler.Aux[K, KT]) { + + def aggregate[TC <: HList, Out1]( + columns: TC + )(implicit + i7: TypedEncoder[Out1], + i8: ToTraversable.Aux[TC, List, UntypedExpression[T]] + ): TypedDataset[Out1] = { + def expr(c: UntypedExpression[T]): Column = + FramelessInternals.column(c.expr) val groupByExprs = groupedBy.toList[UntypedExpression[T]].map(expr) val aggregates = @@ -159,25 +237,32 @@ private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT] TypedDataset.create[Out1](aggregated) } - /** Methods on `TypedDataset[T]` that go through a full serialization and - * deserialization of `T`, and execute outside of the Catalyst runtime. - */ + /** + * Methods on `TypedDataset[T]` that go through a full serialization and + * deserialization of `T`, and execute outside of the Catalyst runtime. + */ object deserialized { + def mapGroups[U: TypedEncoder]( - f: (KT, Iterator[T]) => U - )(implicit e: TypedEncoder[KT]): TypedDataset[U] = { + f: (KT, Iterator[T]) => U + )(implicit + e: TypedEncoder[KT] + ): TypedDataset[U] = { val func = (key: KT, it: Iterator[T]) => Iterator(f(key, it)) flatMapGroups(func) } def flatMapGroups[U: TypedEncoder]( - f: (KT, Iterator[T]) => TraversableOnce[U] - )(implicit e: TypedEncoder[KT]): TypedDataset[U] = { + f: (KT, Iterator[T]) => TraversableOnce[U] + )(implicit + e: TypedEncoder[KT] + ): TypedDataset[U] = { implicit val tendcoder = self.encoder val cols = groupedBy.toList[UntypedExpression[T]] val logicalPlan = FramelessInternals.logicalPlan(self.dataset) - val withKeyColumns = logicalPlan.output ++ cols.map(_.expr).map(UnresolvedAlias(_)) + val withKeyColumns = + logicalPlan.output ++ cols.map(_.expr).map(UnresolvedAlias(_)) val withKey = Project(withKeyColumns, logicalPlan) val executed = FramelessInternals.executePlan(self.dataset, withKey) val keyAttributes = executed.analyzed.output.takeRight(cols.size) @@ -188,10 +273,14 @@ private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT] keyAttributes, dataAttributes, executed.analyzed - )(TypedExpressionEncoder[KT], TypedExpressionEncoder[T], TypedExpressionEncoder[U]) + )( + TypedExpressionEncoder[KT], + TypedExpressionEncoder[T], + TypedExpressionEncoder[U] + ) val groupedAndFlatMapped = FramelessInternals.mkDataset( - self.dataset.sqlContext, + self.dataset, mapGroups, TypedExpressionEncoder[U] ) @@ -201,66 +290,97 @@ private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT] } private def retainGroupColumns: Boolean = { - self.dataset.sqlContext.getConf("spark.sql.retainGroupColumns", "true").toBoolean + FramelessInternals + .getConf(self.dataset, "spark.sql.retainGroupColumns", "true") + .toBoolean } - def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[T, P]): PivotNotValues[T, TK, P] = + def pivot[P: CatalystPivotable]( + pivotColumn: TypedColumn[T, P] + ): PivotNotValues[T, TK, P] = PivotNotValues(self, groupedBy, pivotColumn) } private[ops] object AggregatingOps { + /** Utility function to help Spark with serialization of closures */ - def tuple1[K1, V, U](f: (K1, Iterator[V]) => U): (Tuple1[K1], Iterator[V]) => U = { - (x: Tuple1[K1], it: Iterator[V]) => f(x._1, it) + def tuple1[K1, V, U]( + f: (K1, Iterator[V]) => U + ): (Tuple1[K1], Iterator[V]) => U = { (x: Tuple1[K1], it: Iterator[V]) => + f(x._1, it) } } -/** Represents a typed Pivot operation. - */ +/** + * Represents a typed Pivot operation. + */ final case class Pivot[T, GroupedColumns <: HList, PivotType, Values <: HList]( - ds: TypedDataset[T], - groupedBy: GroupedColumns, - pivotedBy: TypedColumn[T, PivotType], - values: Values -) { + ds: TypedDataset[T], + groupedBy: GroupedColumns, + pivotedBy: TypedColumn[T, PivotType], + values: Values) { object agg extends ProductArgs { - def applyProduct[AggrColumns <: HList, AggrColumnTypes <: HList, GroupedColumnTypes <: HList, NumValues <: Nat, TypesForPivotedValues <: HList, TypesForPivotedValuesOpt <: HList, OutAsHList <: HList, Out] - (aggrColumns: AggrColumns) - (implicit + + def applyProduct[ + AggrColumns <: HList, + AggrColumnTypes <: HList, + GroupedColumnTypes <: HList, + NumValues <: Nat, + TypesForPivotedValues <: HList, + TypesForPivotedValuesOpt <: HList, + OutAsHList <: HList, + Out + ](aggrColumns: AggrColumns + )(implicit i0: AggregateTypes.Aux[T, AggrColumns, AggrColumnTypes], i1: ColumnTypes.Aux[T, GroupedColumns, GroupedColumnTypes], i2: Length.Aux[Values, NumValues], i3: Repeat.Aux[AggrColumnTypes, NumValues, TypesForPivotedValues], i4: Mapped.Aux[TypesForPivotedValues, Option, TypesForPivotedValuesOpt], - i5: Prepend.Aux[GroupedColumnTypes, TypesForPivotedValuesOpt, OutAsHList], + i5: Prepend.Aux[ + GroupedColumnTypes, + TypesForPivotedValuesOpt, + OutAsHList + ], i6: Tupler.Aux[OutAsHList, Out], i7: TypedEncoder[Out] ): TypedDataset[Out] = { - def mapAny[X](h: HList)(f: Any => X): List[X] = - h match { - case HNil => Nil - case x :: xs => f(x) :: mapAny(xs)(f) - } - - val aggCols: Seq[Column] = mapAny(aggrColumns)(x => new Column(x.asInstanceOf[TypedAggregate[_,_]].expr)) - val tmp = ds.dataset.toDF() - .groupBy(mapAny(groupedBy)(_.asInstanceOf[TypedColumn[_, _]].untyped): _*) - .pivot(pivotedBy.untyped.toString, mapAny(values)(identity)) - .agg(aggCols.head, aggCols.tail:_*) - .as[Out](TypedExpressionEncoder[Out]) - TypedDataset.create(tmp) - } + def mapAny[X](h: HList)(f: Any => X): List[X] = + h match { + case HNil => Nil + case x :: xs => f(x) :: mapAny(xs)(f) + } + + val aggCols: Seq[Column] = mapAny(aggrColumns)(x => + FramelessInternals.column(x.asInstanceOf[TypedAggregate[_, _]].expr) + ) + val tmp = ds.dataset + .toDF() + .groupBy( + mapAny(groupedBy)(_.asInstanceOf[TypedColumn[_, _]].untyped): _* + ) + .pivot(pivotedBy.untyped.toString, mapAny(values)(identity)) + .agg(aggCols.head, aggCols.tail: _*) + .as[Out](TypedExpressionEncoder[Out]) + TypedDataset.create(tmp) + } } } final case class PivotNotValues[T, GroupedColumns <: HList, PivotType]( - ds: TypedDataset[T], - groupedBy: GroupedColumns, - pivotedBy: TypedColumn[T, PivotType] -) extends ProductArgs { - - def onProduct[Values <: HList](values: Values)( - implicit validValues: ToList[Values, PivotType] // validValues: FilterNot.Aux[Values, PivotType, HNil] // did not work - ): Pivot[T, GroupedColumns, PivotType, Values] = Pivot(ds, groupedBy, pivotedBy, values) + ds: TypedDataset[T], + groupedBy: GroupedColumns, + pivotedBy: TypedColumn[T, PivotType]) + extends ProductArgs { + + def onProduct[Values <: HList]( + values: Values + )(implicit + validValues: ToList[ + Values, + PivotType + ] // validValues: FilterNot.Aux[Values, PivotType, HNil] // did not work + ): Pivot[T, GroupedColumns, PivotType, Values] = + Pivot(ds, groupedBy, pivotedBy, values) } diff --git a/dataset/src/main/scala/org/apache/spark/sql/FramelessInternals.scala b/dataset/src/main/scala/org/apache/spark/sql/FramelessInternals.scala deleted file mode 100644 index 5459230d4..000000000 --- a/dataset/src/main/scala/org/apache/spark/sql/FramelessInternals.scala +++ /dev/null @@ -1,73 +0,0 @@ -package org.apache.spark.sql - -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.expressions.{Alias, CreateStruct} -import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} -import org.apache.spark.sql.execution.QueryExecution -import org.apache.spark.sql.types._ -import org.apache.spark.sql.types.ObjectType -import scala.reflect.ClassTag - -object FramelessInternals { - def objectTypeFor[A](implicit classTag: ClassTag[A]): ObjectType = ObjectType(classTag.runtimeClass) - - def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = { - ds.toDF().queryExecution.analyzed.resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver).getOrElse { - throw new AnalysisException( - s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames.mkString(", ")})""") - } - } - - def expr(column: Column): Expression = column.expr - - def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan - - def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = - ds.sparkSession.sessionState.executePlan(plan) - - def joinPlan(ds: Dataset[_], plan: LogicalPlan, leftPlan: LogicalPlan, rightPlan: LogicalPlan): LogicalPlan = { - val joined = executePlan(ds, plan) - val leftOutput = joined.analyzed.output.take(leftPlan.output.length) - val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length) - - Project(List( - Alias(CreateStruct(leftOutput), "_1")(), - Alias(CreateStruct(rightOutput), "_2")() - ), joined.analyzed) - } - - def mkDataset[T](sqlContext: SQLContext, plan: LogicalPlan, encoder: Encoder[T]): Dataset[T] = - new Dataset(sqlContext, plan, encoder) - - def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = - Dataset.ofRows(sparkSession, logicalPlan) - - // because org.apache.spark.sql.types.UserDefinedType is private[spark] - type UserDefinedType[A >: Null] = org.apache.spark.sql.types.UserDefinedType[A] - - // below only tested in SelfJoinTests.colLeft and colRight are equivalent to col outside of joins - // - via files (codegen) forces doGenCode eval. - /** Expression to tag columns from the left hand side of join expression. */ - case class DisambiguateLeft[T](tagged: Expression) extends Expression with NonSQLExpression { - def eval(input: InternalRow): Any = tagged.eval(input) - def nullable: Boolean = false - def children: Seq[Expression] = tagged :: Nil - def dataType: DataType = tagged.dataType - protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = tagged.genCode(ctx) - protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(newChildren.head) - } - - /** Expression to tag columns from the right hand side of join expression. */ - case class DisambiguateRight[T](tagged: Expression) extends Expression with NonSQLExpression { - def eval(input: InternalRow): Any = tagged.eval(input) - def nullable: Boolean = false - def children: Seq[Expression] = tagged :: Nil - def dataType: DataType = tagged.dataType - protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = tagged.genCode(ctx) - protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(newChildren.head) - } -} diff --git a/dataset/src/main/spark-3.4+/org/apache/spark/sql/FramelessInternals.scala b/dataset/src/main/spark-3.4+/org/apache/spark/sql/FramelessInternals.scala new file mode 100644 index 000000000..3022bf23c --- /dev/null +++ b/dataset/src/main/spark-3.4+/org/apache/spark/sql/FramelessInternals.scala @@ -0,0 +1,127 @@ +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.{ Alias, CreateStruct } +import org.apache.spark.sql.catalyst.expressions.{ Expression, NamedExpression } +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Project } +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.ObjectType +import scala.reflect.ClassTag + +object FramelessInternals { + + def objectTypeFor[A]( + implicit + classTag: ClassTag[A] + ): ObjectType = ObjectType(classTag.runtimeClass) + + def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = { + ds.toDF() + .queryExecution + .analyzed + .resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver) + .getOrElse { + throw new AnalysisException( + s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames + .mkString(", ")})""" + ) + } + } + + /** Wraps a Catalyst `Expression` into a `Column`. */ + def column(e: Expression): Column = new Column(e) + + def expr(column: Column): Expression = column.expr + + def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan + + def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = + ds.sparkSession.sessionState.executePlan(plan) + + def sqlContext(ds: Dataset[_]): SQLContext = ds.sqlContext + + def getConf(ds: Dataset[_], key: String, default: String): String = + ds.sqlContext.getConf(key, default) + + def joinPlan( + ds: Dataset[_], + plan: LogicalPlan, + leftPlan: LogicalPlan, + rightPlan: LogicalPlan + ): LogicalPlan = { + val joined = executePlan(ds, plan) + val leftOutput = joined.analyzed.output.take(leftPlan.output.length) + val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length) + + Project( + List( + Alias(CreateStruct(leftOutput), "_1")(), + Alias(CreateStruct(rightOutput), "_2")() + ), + joined.analyzed + ) + } + + def mkDataset[T]( + source: Dataset[_], + plan: LogicalPlan, + encoder: Encoder[T] + ): Dataset[T] = + new Dataset(source.sparkSession, plan, encoder) + + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = + Dataset.ofRows(sparkSession, logicalPlan) + + /** Builds an `ExpressionEncoder` from frameless' own serializer/deserializer expressions. */ + def expressionEncoder[T]( + objSerializer: Expression, + objDeserializer: Expression, + classTag: ClassTag[T] + ): ExpressionEncoder[T] = + new ExpressionEncoder[T](objSerializer, objDeserializer, classTag) + + // because org.apache.spark.sql.types.UserDefinedType is private[spark] + type UserDefinedType[A >: Null] = + org.apache.spark.sql.types.UserDefinedType[A] + + // below only tested in SelfJoinTests.colLeft and colRight are equivalent to col outside of joins + // - via files (codegen) forces doGenCode eval. + /** Expression to tag columns from the left hand side of join expression. */ + case class DisambiguateLeft[T](tagged: Expression) + extends Expression + with NonSQLExpression { + def eval(input: InternalRow): Any = tagged.eval(input) + def nullable: Boolean = false + def children: Seq[Expression] = tagged :: Nil + def dataType: DataType = tagged.dataType + + protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + tagged.genCode(ctx) + + protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression] + ): Expression = copy(newChildren.head) + } + + /** Expression to tag columns from the right hand side of join expression. */ + case class DisambiguateRight[T](tagged: Expression) + extends Expression + with NonSQLExpression { + def eval(input: InternalRow): Any = tagged.eval(input) + def nullable: Boolean = false + def children: Seq[Expression] = tagged :: Nil + def dataType: DataType = tagged.dataType + + protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + tagged.genCode(ctx) + + protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression] + ): Expression = copy(newChildren.head) + } +} diff --git a/dataset/src/main/spark-3/org/apache/spark/sql/FramelessInternals.scala b/dataset/src/main/spark-3/org/apache/spark/sql/FramelessInternals.scala new file mode 100644 index 000000000..3022bf23c --- /dev/null +++ b/dataset/src/main/spark-3/org/apache/spark/sql/FramelessInternals.scala @@ -0,0 +1,127 @@ +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.{ Alias, CreateStruct } +import org.apache.spark.sql.catalyst.expressions.{ Expression, NamedExpression } +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Project } +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.ObjectType +import scala.reflect.ClassTag + +object FramelessInternals { + + def objectTypeFor[A]( + implicit + classTag: ClassTag[A] + ): ObjectType = ObjectType(classTag.runtimeClass) + + def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = { + ds.toDF() + .queryExecution + .analyzed + .resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver) + .getOrElse { + throw new AnalysisException( + s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames + .mkString(", ")})""" + ) + } + } + + /** Wraps a Catalyst `Expression` into a `Column`. */ + def column(e: Expression): Column = new Column(e) + + def expr(column: Column): Expression = column.expr + + def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan + + def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = + ds.sparkSession.sessionState.executePlan(plan) + + def sqlContext(ds: Dataset[_]): SQLContext = ds.sqlContext + + def getConf(ds: Dataset[_], key: String, default: String): String = + ds.sqlContext.getConf(key, default) + + def joinPlan( + ds: Dataset[_], + plan: LogicalPlan, + leftPlan: LogicalPlan, + rightPlan: LogicalPlan + ): LogicalPlan = { + val joined = executePlan(ds, plan) + val leftOutput = joined.analyzed.output.take(leftPlan.output.length) + val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length) + + Project( + List( + Alias(CreateStruct(leftOutput), "_1")(), + Alias(CreateStruct(rightOutput), "_2")() + ), + joined.analyzed + ) + } + + def mkDataset[T]( + source: Dataset[_], + plan: LogicalPlan, + encoder: Encoder[T] + ): Dataset[T] = + new Dataset(source.sparkSession, plan, encoder) + + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = + Dataset.ofRows(sparkSession, logicalPlan) + + /** Builds an `ExpressionEncoder` from frameless' own serializer/deserializer expressions. */ + def expressionEncoder[T]( + objSerializer: Expression, + objDeserializer: Expression, + classTag: ClassTag[T] + ): ExpressionEncoder[T] = + new ExpressionEncoder[T](objSerializer, objDeserializer, classTag) + + // because org.apache.spark.sql.types.UserDefinedType is private[spark] + type UserDefinedType[A >: Null] = + org.apache.spark.sql.types.UserDefinedType[A] + + // below only tested in SelfJoinTests.colLeft and colRight are equivalent to col outside of joins + // - via files (codegen) forces doGenCode eval. + /** Expression to tag columns from the left hand side of join expression. */ + case class DisambiguateLeft[T](tagged: Expression) + extends Expression + with NonSQLExpression { + def eval(input: InternalRow): Any = tagged.eval(input) + def nullable: Boolean = false + def children: Seq[Expression] = tagged :: Nil + def dataType: DataType = tagged.dataType + + protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + tagged.genCode(ctx) + + protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression] + ): Expression = copy(newChildren.head) + } + + /** Expression to tag columns from the right hand side of join expression. */ + case class DisambiguateRight[T](tagged: Expression) + extends Expression + with NonSQLExpression { + def eval(input: InternalRow): Any = tagged.eval(input) + def nullable: Boolean = false + def children: Seq[Expression] = tagged :: Nil + def dataType: DataType = tagged.dataType + + protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + tagged.genCode(ctx) + + protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression] + ): Expression = copy(newChildren.head) + } +} diff --git a/dataset/src/main/spark-4/frameless/MapGroups.scala b/dataset/src/main/spark-4/frameless/MapGroups.scala new file mode 100644 index 000000000..25411420b --- /dev/null +++ b/dataset/src/main/spark-4/frameless/MapGroups.scala @@ -0,0 +1,25 @@ +package frameless + +import org.apache.spark.sql.Encoder +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.{ + LogicalPlan, + MapGroups => SMapGroups +} + +object MapGroups { + + def apply[K: Encoder, T: Encoder, U: Encoder]( + func: (K, Iterator[T]) => TraversableOnce[U], + groupingAttributes: Seq[Attribute], + dataAttributes: Seq[Attribute], + child: LogicalPlan + ): LogicalPlan = + SMapGroups( + func, + groupingAttributes, + dataAttributes, + Seq(), // #698 - no order given + child + ) +} diff --git a/dataset/src/main/spark-4/org/apache/spark/sql/FramelessInternals.scala b/dataset/src/main/spark-4/org/apache/spark/sql/FramelessInternals.scala new file mode 100644 index 000000000..6daf2b4e4 --- /dev/null +++ b/dataset/src/main/spark-4/org/apache/spark/sql/FramelessInternals.scala @@ -0,0 +1,168 @@ +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.{ Alias, CreateStruct } +import org.apache.spark.sql.catalyst.expressions.{ Expression, NamedExpression } +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.JavaBeanEncoder +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Project } +import org.apache.spark.sql.classic.{ + Dataset => ClassicDataset, + SparkSession => ClassicSparkSession, + ExpressionUtils, + ColumnNodeToExpressionConverter +} +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.ObjectType +import scala.reflect.ClassTag + +/** + * Spark 4 split `Dataset`/`SparkSession`/`Column` into abstract API types + * (`org.apache.spark.sql.*`) and concrete implementations (`org.apache.spark.sql.classic.*`). + * The `Dataset`/`SparkSession` instances frameless holds are always the `classic` + * implementations at runtime, so the internal-only helpers below downcast to reach the + * `logicalPlan`/`sessionState`/`sqlContext` members that the abstract API no longer exposes. + * `Column` no longer wraps a Catalyst `Expression`; `classic.ExpressionUtils` is Spark's + * own bridge between the two. + */ +object FramelessInternals { + + def objectTypeFor[A]( + implicit + classTag: ClassTag[A] + ): ObjectType = ObjectType(classTag.runtimeClass) + + private def classic(ds: Dataset[_]): ClassicDataset[_] = + ds.asInstanceOf[ClassicDataset[_]] + + def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = { + val cds = classic(ds) + cds.queryExecution.analyzed + .resolve(colNames, cds.sparkSession.sessionState.analyzer.resolver) + .getOrElse { + throw new AnalysisException( + errorClass = "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + messageParameters = Map("objectName" -> colNames.mkString(".")) + ) + } + } + + /** Wraps a Catalyst `Expression` into a `Column` (Spark 4 bridge). */ + def column(e: Expression): Column = ExpressionUtils.column(e) + + /** + * Extracts the Catalyst `Expression` from a `Column`. + * + * `ExpressionUtils.expression` would return a lazy `ColumnNodeExpression` wrapper, which is + * `Unevaluable` and exposes no children. frameless builds join plans by hand and rewrites + * disambiguation markers via `Expression.transform`, both of which require a real, traversable + * expression tree - so convert the column's node eagerly instead. + */ + def expr(column: Column): Expression = + ColumnNodeToExpressionConverter(column.node) + + def logicalPlan(ds: Dataset[_]): LogicalPlan = classic(ds).logicalPlan + + def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = + classic(ds).sparkSession.sessionState.executePlan(plan) + + def sqlContext(ds: Dataset[_]): SQLContext = classic(ds).sqlContext + + def getConf(ds: Dataset[_], key: String, default: String): String = + classic(ds).sparkSession.conf.get(key, default) + + def joinPlan( + ds: Dataset[_], + plan: LogicalPlan, + leftPlan: LogicalPlan, + rightPlan: LogicalPlan + ): LogicalPlan = { + val joined = executePlan(ds, plan) + val leftOutput = joined.analyzed.output.take(leftPlan.output.length) + val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length) + + Project( + List( + Alias(CreateStruct(leftOutput), "_1")(), + Alias(CreateStruct(rightOutput), "_2")() + ), + joined.analyzed + ) + } + + def mkDataset[T]( + source: Dataset[_], + plan: LogicalPlan, + encoder: Encoder[T] + ): Dataset[T] = + new ClassicDataset[T](classic(source).sparkSession, plan, encoder) + + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = + ClassicDataset.ofRows( + sparkSession.asInstanceOf[ClassicSparkSession], + logicalPlan + ) + + /** + * Builds an `ExpressionEncoder` from frameless' own serializer/deserializer expressions. + * + * Spark 4's `ExpressionEncoder` takes a leading `AgnosticEncoder` (SPARK-49025), but it is + * only read for its `clsTag` and an Option-wrapping check - the serializer, deserializer and + * schema are all derived from the expressions frameless supplies. A minimal `JavaBeanEncoder` + * carrying the right `ClassTag` is therefore a correct, metadata-only stand-in. + */ + def expressionEncoder[T]( + objSerializer: Expression, + objDeserializer: Expression, + classTag: ClassTag[T] + ): ExpressionEncoder[T] = + new ExpressionEncoder[T]( + JavaBeanEncoder(classTag, Nil), + objSerializer, + objDeserializer + ) + + // because org.apache.spark.sql.types.UserDefinedType is private[spark] + type UserDefinedType[A >: Null] = + org.apache.spark.sql.types.UserDefinedType[A] + + // below only tested in SelfJoinTests.colLeft and colRight are equivalent to col outside of joins + // - via files (codegen) forces doGenCode eval. + /** Expression to tag columns from the left hand side of join expression. */ + case class DisambiguateLeft[T](tagged: Expression) + extends Expression + with NonSQLExpression { + def eval(input: InternalRow): Any = tagged.eval(input) + def nullable: Boolean = false + def children: Seq[Expression] = tagged :: Nil + def dataType: DataType = tagged.dataType + + protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + tagged.genCode(ctx) + + protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression] + ): Expression = copy(newChildren.head) + } + + /** Expression to tag columns from the right hand side of join expression. */ + case class DisambiguateRight[T](tagged: Expression) + extends Expression + with NonSQLExpression { + def eval(input: InternalRow): Any = tagged.eval(input) + def nullable: Boolean = false + def children: Seq[Expression] = tagged :: Nil + def dataType: DataType = tagged.dataType + + protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + tagged.genCode(ctx) + + protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression] + ): Expression = copy(newChildren.head) + } +} diff --git a/dataset/src/test/scala/frameless/SchemaTests.scala b/dataset/src/test/scala/frameless/SchemaTests.scala index 92fd33057..89fed7f86 100644 --- a/dataset/src/test/scala/frameless/SchemaTests.scala +++ b/dataset/src/test/scala/frameless/SchemaTests.scala @@ -2,7 +2,7 @@ package frameless import frameless.functions.aggregate._ import frameless.functions._ -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{ Metadata, StructType } import org.scalacheck.Prop import org.scalacheck.Prop._ import org.scalatest.matchers.should.Matchers @@ -10,10 +10,19 @@ import org.scalatest.matchers.should.Matchers class SchemaTests extends TypedDatasetSuite with Matchers { def structToNonNullable(struct: StructType): StructType = { - StructType(struct.fields.map( f => f.copy(nullable = false))) + // Spark 4 attaches metadata to aggregate output columns; clear it (and nullability) + // so comparisons consider only field names and types. No-op on Spark 3.x. + StructType( + struct.fields.map(f => + f.copy(nullable = false, metadata = Metadata.empty) + ) + ) } - def prop[A](dataset: TypedDataset[A], ignoreNullable: Boolean = false): Prop = { + def prop[A]( + dataset: TypedDataset[A], + ignoreNullable: Boolean = false + ): Prop = { val schema = dataset.dataset.schema Prop.all( @@ -24,7 +33,9 @@ class SchemaTests extends TypedDatasetSuite with Matchers { if (!ignoreNullable) TypedExpressionEncoder.targetStructType(dataset.encoder) ?= schema else - structToNonNullable(TypedExpressionEncoder.targetStructType(dataset.encoder)) ?= structToNonNullable(schema) + structToNonNullable( + TypedExpressionEncoder.targetStructType(dataset.encoder) + ) ?= structToNonNullable(schema) ) } diff --git a/dataset/src/test/scala/frameless/SelfJoinTests.scala b/dataset/src/test/scala/frameless/SelfJoinTests.scala index cede7be2a..742429108 100644 --- a/dataset/src/test/scala/frameless/SelfJoinTests.scala +++ b/dataset/src/test/scala/frameless/SelfJoinTests.scala @@ -2,13 +2,18 @@ package frameless import org.scalacheck.Prop import org.scalacheck.Prop._ -import org.apache.spark.sql.{SparkSession, functions => sparkFunctions} +import org.apache.spark.sql.{ SparkSession, functions => sparkFunctions } class SelfJoinTests extends TypedDatasetSuite { + // Without crossJoin.enabled=true Spark doesn't like trivial join conditions: // [error] Join condition is missing or trivial. // [error] Use the CROSS JOIN syntax to allow cartesian products between these relations. - def allowTrivialJoin[T](body: => T)(implicit session: SparkSession): T = { + def allowTrivialJoin[T]( + body: => T + )(implicit + session: SparkSession + ): T = { val crossJoin = "spark.sql.crossJoin.enabled" val oldSetting = session.conf.get(crossJoin) session.conf.set(crossJoin, "true") @@ -17,7 +22,11 @@ class SelfJoinTests extends TypedDatasetSuite { result } - def allowAmbiguousJoin[T](body: => T)(implicit session: SparkSession): T = { + def allowAmbiguousJoin[T]( + body: => T + )(implicit + session: SparkSession + ): T = { val crossJoin = "spark.sql.analyzer.failAmbiguousSelfJoin" val oldSetting = session.conf.get(crossJoin) session.conf.set(crossJoin, "false") @@ -27,22 +36,26 @@ class SelfJoinTests extends TypedDatasetSuite { } test("self join with colLeft/colRight disambiguation") { - def prop[ - A : TypedEncoder : Ordering, - B : TypedEncoder : Ordering - ](dx: List[X2[A, B]], d: X2[A, B]): Prop = allowAmbiguousJoin { + def prop[A: TypedEncoder: Ordering, B: TypedEncoder: Ordering]( + dx: List[X2[A, B]], + d: X2[A, B] + ): Prop = allowAmbiguousJoin { val data = d :: dx val ds = TypedDataset.create(data) // This is the way to write unambiguous self-join in vanilla, see https://goo.gl/XnkSUD val df1 = ds.dataset.as("df1") val df2 = ds.dataset.as("df2") - val vanilla = df1.join(df2, - sparkFunctions.col("df1.a") === sparkFunctions.col("df2.a")).count() + val vanilla = df1 + .join(df2, sparkFunctions.col("df1.a") === sparkFunctions.col("df2.a")) + .count() - val typed = ds.joinInner(ds)( - ds.colLeft('a) === ds.colRight('a) - ).count().run() + val typed = ds + .joinInner(ds)( + ds.colLeft('a) === ds.colRight('a) + ) + .count() + .run() vanilla ?= typed } @@ -50,48 +63,92 @@ class SelfJoinTests extends TypedDatasetSuite { check(prop[Int, Int] _) } + test("self join collects correct values via colLeft/colRight") { + def prop[A: TypedEncoder: Ordering, B: TypedEncoder: Ordering]( + dx: List[X2[A, B]], + d: X2[A, B] + ): Prop = allowAmbiguousJoin { + val data = d :: dx + val ds = TypedDataset.create(data) + + // Collecting the joined tuples exercises the colLeft/colRight disambiguation and the + // (T, U) ExpressionEncoder end to end, not just the row count: a regression guard for + // Spark 4, where columns no longer wrap Catalyst expressions directly. + val typed = ds + .joinInner(ds)(ds.colLeft('a) === ds.colRight('a)) + .collect() + .run() + .toVector + .sorted + + val expected = (for { + l <- data + r <- data + if l.a == r.a + } yield (l, r)).toVector.sorted + + typed ?= expected + } + + check(prop[Int, Int] _) + check(prop[String, Long] _) + } + test("trivial self join") { - def prop[ - A : TypedEncoder : Ordering, - B : TypedEncoder : Ordering - ](dx: List[X2[A, B]], d: X2[A, B]): Prop = - allowTrivialJoin { allowAmbiguousJoin { - - val data = d :: dx - val ds = TypedDataset.create(data) - val untyped = ds.dataset - // Interestingly, even with aliasing it seems that it's impossible to - // obtain a trivial join condition of shape df1.a == df1.a, Spark we - // always interpret that as df1.a == df2.a. For the purpose of this - // test we fall-back to lit(true) instead. - // val trivial = sparkFunctions.col("df1.a") === sparkFunctions.col("df1.a") - val trivial = sparkFunctions.lit(true) - val vanilla = untyped.as("df1").join(untyped.as("df2"), trivial).count() - - val typed = ds.joinInner(ds)(ds.colLeft('a) === ds.colLeft('a)).count().run - vanilla ?= typed - } } + def prop[A: TypedEncoder: Ordering, B: TypedEncoder: Ordering]( + dx: List[X2[A, B]], + d: X2[A, B] + ): Prop = + allowTrivialJoin { + allowAmbiguousJoin { + + val data = d :: dx + val ds = TypedDataset.create(data) + val untyped = ds.dataset + // Interestingly, even with aliasing it seems that it's impossible to + // obtain a trivial join condition of shape df1.a == df1.a, Spark we + // always interpret that as df1.a == df2.a. For the purpose of this + // test we fall-back to lit(true) instead. + // val trivial = sparkFunctions.col("df1.a") === sparkFunctions.col("df1.a") + val trivial = sparkFunctions.lit(true) + val vanilla = + untyped.as("df1").join(untyped.as("df2"), trivial).count() + + val typed = + ds.joinInner(ds)(ds.colLeft('a) === ds.colLeft('a)).count().run + vanilla ?= typed + } + } check(prop[Int, Int] _) } test("self join with unambiguous expression") { def prop[ - A : TypedEncoder : CatalystNumeric : Ordering, - B : TypedEncoder : Ordering - ](data: List[X3[A, A, B]]): Prop = allowAmbiguousJoin { + A: TypedEncoder: CatalystNumeric: Ordering, + B: TypedEncoder: Ordering + ](data: List[X3[A, A, B]] + ): Prop = allowAmbiguousJoin { val ds = TypedDataset.create(data) val df1 = ds.dataset.alias("df1") val df2 = ds.dataset.alias("df2") - val vanilla = df1.join(df2, - (sparkFunctions.col("df1.a") + sparkFunctions.col("df1.b")) === - (sparkFunctions.col("df2.a") + sparkFunctions.col("df2.b"))).count() - - val typed = ds.joinInner(ds)( - (ds.colLeft('a) + ds.colLeft('b)) === (ds.colRight('a) + ds.colRight('b)) - ).count().run() + val vanilla = df1 + .join( + df2, + (sparkFunctions.col("df1.a") + sparkFunctions.col("df1.b")) === + (sparkFunctions.col("df2.a") + sparkFunctions.col("df2.b")) + ) + .count() + + val typed = ds + .joinInner(ds)( + (ds.colLeft('a) + ds.colLeft('b)) === (ds.colRight('a) + ds + .colRight('b)) + ) + .count() + .run() vanilla ?= typed } @@ -99,41 +156,57 @@ class SelfJoinTests extends TypedDatasetSuite { check(prop[Int, Int] _) } - test("Do you want ambiguous self join? This is how you get ambiguous self join.") { + test( + "Do you want ambiguous self join? This is how you get ambiguous self join." + ) { def prop[ - A : TypedEncoder : CatalystNumeric : Ordering, - B : TypedEncoder : Ordering - ](data: List[X3[A, A, B]]): Prop = - allowTrivialJoin { allowAmbiguousJoin { - val ds = TypedDataset.create(data) - - // The point I'm making here is that it "behaves just like Spark". I - // don't know (or really care about how) how Spark disambiguates that - // internally... - val vanilla = ds.dataset.join(ds.dataset, - (ds.dataset("a") + ds.dataset("b")) === - (ds.dataset("a") + ds.dataset("b"))).count() - - val typed = ds.joinInner(ds)( - (ds.col('a) + ds.col('b)) === (ds.col('a) + ds.col('b)) - ).count().run() - - vanilla ?= typed - } } - - check(prop[Int, Int] _) - } + A: TypedEncoder: CatalystNumeric: Ordering, + B: TypedEncoder: Ordering + ](data: List[X3[A, A, B]] + ): Prop = + allowTrivialJoin { + allowAmbiguousJoin { + val ds = TypedDataset.create(data) + + // The point I'm making here is that it "behaves just like Spark". I + // don't know (or really care about how) how Spark disambiguates that + // internally... + val vanilla = ds.dataset + .join( + ds.dataset, + (ds.dataset("a") + ds.dataset("b")) === + (ds.dataset("a") + ds.dataset("b")) + ) + .count() + + val typed = ds + .joinInner(ds)( + (ds.col('a) + ds.col('b)) === (ds.col('a) + ds.col('b)) + ) + .count() + .run() + + vanilla ?= typed + } + } + + check(prop[Int, Int] _) + } test("colLeft and colRight are equivalent to col outside of joins") { - def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( - implicit - ea: TypedEncoder[A], - ex4: TypedEncoder[X4[A, B, C, D]] - ): Prop = { + def prop[A, B, C, D]( + data: Vector[X4[A, B, C, D]] + )(implicit + ea: TypedEncoder[A], + ex4: TypedEncoder[X4[A, B, C, D]] + ): Prop = { val dataset = TypedDataset.create(data) - val selectedCol = dataset.select(dataset.col [A]('a)).collect().run().toVector - val selectedColLeft = dataset.select(dataset.colLeft [A]('a)).collect().run().toVector - val selectedColRight = dataset.select(dataset.colRight[A]('a)).collect().run().toVector + val selectedCol = + dataset.select(dataset.col[A]('a)).collect().run().toVector + val selectedColLeft = + dataset.select(dataset.colLeft[A]('a)).collect().run().toVector + val selectedColRight = + dataset.select(dataset.colRight[A]('a)).collect().run().toVector (selectedCol ?= selectedColLeft) && (selectedCol ?= selectedColRight) } @@ -145,16 +218,26 @@ class SelfJoinTests extends TypedDatasetSuite { } test("colLeft and colRight are equivalent to col outside of joins - via files (codegen)") { - def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( - implicit - ea: TypedEncoder[A], - ex4: TypedEncoder[X4[A, B, C, D]] - ): Prop = { - TypedDataset.create(data).write.mode("overwrite").parquet("./target/testData") - val dataset = TypedDataset.createUnsafe[X4[A, B, C, D]](session.read.parquet("./target/testData")) - val selectedCol = dataset.select(dataset.col [A]('a)).collect().run().toVector - val selectedColLeft = dataset.select(dataset.colLeft [A]('a)).collect().run().toVector - val selectedColRight = dataset.select(dataset.colRight[A]('a)).collect().run().toVector + def prop[A, B, C, D]( + data: Vector[X4[A, B, C, D]] + )(implicit + ea: TypedEncoder[A], + ex4: TypedEncoder[X4[A, B, C, D]] + ): Prop = { + TypedDataset + .create(data) + .write + .mode("overwrite") + .parquet("./target/testData") + val dataset = TypedDataset.createUnsafe[X4[A, B, C, D]]( + session.read.parquet("./target/testData") + ) + val selectedCol = + dataset.select(dataset.col[A]('a)).collect().run().toVector + val selectedColLeft = + dataset.select(dataset.colLeft[A]('a)).collect().run().toVector + val selectedColRight = + dataset.select(dataset.colRight[A]('a)).collect().run().toVector (selectedCol ?= selectedColLeft) && (selectedCol ?= selectedColRight) } diff --git a/dataset/src/test/scala/frameless/TypedDatasetSuite.scala b/dataset/src/test/scala/frameless/TypedDatasetSuite.scala index 8a4697835..e31be7cbc 100644 --- a/dataset/src/test/scala/frameless/TypedDatasetSuite.scala +++ b/dataset/src/test/scala/frameless/TypedDatasetSuite.scala @@ -2,28 +2,35 @@ package frameless import com.globalmentor.apache.hadoop.fs.BareLocalFileSystem import org.apache.hadoop.fs.local.StreamingFS -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.sql.{ SQLContext, SparkSession } import org.scalactic.anyvals.PosZInt import org.scalatest.BeforeAndAfterAll import org.scalatestplus.scalacheck.Checkers import org.scalacheck.Prop import org.scalacheck.Prop._ -import scala.util.{Properties, Try} +import scala.util.{ Properties, Try } import org.scalatest.funsuite.AnyFunSuite trait SparkTesting { self: BeforeAndAfterAll => - val appID: String = new java.util.Date().toString + math.floor(math.random * 10E4).toLong.toString + val appID: String = new java.util.Date().toString + math + .floor(math.random * 10e4) + .toLong + .toString /** * Allows bare naked to be used instead of winutils for testing / dev */ def registerFS(sparkConf: SparkConf): SparkConf = { if (System.getProperty("os.name").startsWith("Windows")) - sparkConf.set("spark.hadoop.fs.file.impl", classOf[BareLocalFileSystem].getName). - set("spark.hadoop.fs.AbstractFileSystem.file.impl", classOf[StreamingFS].getName) + sparkConf + .set("spark.hadoop.fs.file.impl", classOf[BareLocalFileSystem].getName) + .set( + "spark.hadoop.fs.AbstractFileSystem.file.impl", + classOf[StreamingFS].getName + ) else sparkConf } @@ -33,6 +40,11 @@ trait SparkTesting { self: BeforeAndAfterAll => .setAppName("test") .set("spark.ui.enabled", "false") .set("spark.app.id", appID) + // Spark 4 enables ANSI SQL mode by default, which makes the property-based + // generators (extreme numeric values, malformed date strings) raise overflow / + // cast errors instead of the wrap-around / null semantics these tests assert. + // No-op on Spark 3.x, where ANSI is already disabled by default. + .set("spark.sql.ansi.enabled", "false") private var s: SparkSession = _ @@ -40,9 +52,9 @@ trait SparkTesting { self: BeforeAndAfterAll => implicit def sc: SparkContext = session.sparkContext implicit def sqlContext: SQLContext = session.sqlContext - def registerOptimizations(sqlContext: SQLContext): Unit = { } + def registerOptimizations(sqlContext: SQLContext): Unit = {} - def addSparkConfigProperties(config: SparkConf): Unit = { } + def addSparkConfigProperties(config: SparkConf): Unit = {} override def beforeAll(): Unit = { assert(s == null) @@ -59,11 +71,16 @@ trait SparkTesting { self: BeforeAndAfterAll => } } +class TypedDatasetSuite + extends AnyFunSuite + with Checkers + with BeforeAndAfterAll + with SparkTesting { -class TypedDatasetSuite extends AnyFunSuite with Checkers with BeforeAndAfterAll with SparkTesting { // Limit size of generated collections and number of checks to avoid OutOfMemoryError implicit override val generatorDrivenConfig: PropertyCheckConfiguration = { - def getPosZInt(name: String, default: PosZInt) = Properties.envOrNone(s"FRAMELESS_GEN_${name}") + def getPosZInt(name: String, default: PosZInt) = Properties + .envOrNone(s"FRAMELESS_GEN_${name}") .flatMap(s => Try(s.toInt).toOption) .flatMap(PosZInt.from) .getOrElse(default) @@ -75,17 +92,24 @@ class TypedDatasetSuite extends AnyFunSuite with Checkers with BeforeAndAfterAll implicit val sparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob - def approximatelyEqual[A](a: A, b: A)(implicit numeric: Numeric[A]): Prop = { + def approximatelyEqual[A]( + a: A, + b: A + )(implicit + numeric: Numeric[A] + ): Prop = { val da = numeric.toDouble(a) val db = numeric.toDouble(b) - val epsilon = 1E-6 + val epsilon = 1e-6 // Spark has a weird behaviour concerning expressions that should return Inf // Most of the time they return NaN instead, for instance stddev of Seq(-7.827553978923477E227, -5.009124275715786E153) - if((da.isNaN || da.isInfinity) && (db.isNaN || db.isInfinity)) proved + if ((da.isNaN || da.isInfinity) && (db.isNaN || db.isInfinity)) proved else if ( (da - db).abs < epsilon || - (da - db).abs < da.abs / 100) - proved - else falsified :| s"Expected $a but got $b, which is more than 1% off and greater than epsilon = $epsilon." + (da - db).abs < da.abs / 100 + ) + proved + else + falsified :| s"Expected $a but got $b, which is more than 1% off and greater than epsilon = $epsilon." } } diff --git a/dataset/src/test/scala/frameless/forward/SQLContextTests.scala b/dataset/src/test/scala/frameless/forward/SQLContextTests.scala index 700f29b05..06c14f651 100644 --- a/dataset/src/test/scala/frameless/forward/SQLContextTests.scala +++ b/dataset/src/test/scala/frameless/forward/SQLContextTests.scala @@ -1,14 +1,15 @@ package frameless import org.scalacheck.Prop -import org.scalacheck.Prop.{forAll, _} +import org.scalacheck.Prop.{ forAll, _ } class SQLContextTests extends TypedDatasetSuite { test("sqlContext") { def prop[A: TypedEncoder](data: Vector[A]): Prop = { val dataset = TypedDataset.create[A](data) - dataset.sqlContext =? dataset.dataset.sqlContext + dataset.sqlContext =? org.apache.spark.sql.FramelessInternals + .sqlContext(dataset.dataset) } check(forAll(prop[Int] _)) diff --git a/dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala b/dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala index 36a443fb5..1df361b9b 100644 --- a/dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala +++ b/dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala @@ -2,19 +2,21 @@ package frameless.sql.rules import frameless._ import frameless.functions.Lit -import org.apache.spark.sql.catalyst.util.DateTimeUtils.{currentTimestamp, microsToInstant} -import org.apache.spark.sql.sources.{EqualTo, GreaterThanOrEqual, IsNotNull} +import org.apache.spark.sql.catalyst.util.DateTimeUtils.microsToInstant +import org.apache.spark.sql.sources.{ EqualTo, GreaterThanOrEqual, IsNotNull } import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import java.time.Instant class FramelessLitPushDownTests extends SQLRulesSuite { - private val now: Long = currentTimestamp() + // micros since epoch; DateTimeUtils.currentTimestamp was removed in Spark 4. + private val now: Long = System.currentTimeMillis() * 1000L test("java.sql.Timestamp push-down") { val expected = java.sql.Timestamp.from(microsToInstant(now)) val expectedStructure = X1(SQLTimestamp(now)) - val expectedPushDownFilters = List(IsNotNull("a"), GreaterThanOrEqual("a", expected)) + val expectedPushDownFilters = + List(IsNotNull("a"), GreaterThanOrEqual("a", expected)) predicatePushDownTest[SQLTimestamp]( expectedStructure, @@ -27,7 +29,8 @@ class FramelessLitPushDownTests extends SQLRulesSuite { test("java.time.Instant push-down") { val expected = java.sql.Timestamp.from(microsToInstant(now)) val expectedStructure = X1(microsToInstant(now)) - val expectedPushDownFilters = List(IsNotNull("a"), GreaterThanOrEqual("a", expected)) + val expectedPushDownFilters = + List(IsNotNull("a"), GreaterThanOrEqual("a", expected)) predicatePushDownTest[Instant]( expectedStructure, @@ -40,7 +43,10 @@ class FramelessLitPushDownTests extends SQLRulesSuite { test("struct push-down") { type Payload = X4[Int, Int, Int, Int] val expectedStructure = X1(X4(1, 2, 3, 4)) - val expected = new GenericRowWithSchema(Array(1, 2, 3, 4), TypedExpressionEncoder[Payload].schema) + val expected = new GenericRowWithSchema( + Array(1, 2, 3, 4), + TypedExpressionEncoder[Payload].schema + ) val expectedPushDownFilters = List(IsNotNull("a"), EqualTo("a", expected)) predicatePushDownTest[Payload](