
Commit 5a27dc9

[SPARK-44761][CONNECT] Support DataStreamWriter.foreachBatch(VoidFunction2)
### What changes were proposed in this pull request?
This PR adds `DataStreamWriter.foreachBatch(VoidFunction2)`.

### Why are the changes needed?
To increase binary compatibility with the APIs in `sql/core`.

### Does this PR introduce _any_ user-facing change?
Yes. It adds a new method to `DataStreamWriter`.

### How was this patch tested?
I modified an existing code path.

Closes apache#42430 from hvanhovell/SPARK-44761.

Lead-authored-by: Herman van Hovell <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Herman van Hovell <[email protected]>
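For context (not part of the commit itself), a minimal sketch of how a caller would use the new Java-friendly overload. The connect endpoint, the rate source, and the handler body are illustrative assumptions:

```scala
import org.apache.spark.api.java.function.VoidFunction2
import org.apache.spark.sql.{Dataset, Row, SparkSession}

// Illustrative setup: the endpoint and source are assumptions, not part of this commit.
val spark = SparkSession.builder().remote("sc://localhost").getOrCreate()

val query = spark.readStream
  .format("rate")
  .load()
  .writeStream
  // New overload: a Java VoidFunction2 instead of a Scala (Dataset[T], Long) => Unit,
  // mirroring the signature that sql/core already exposes.
  .foreachBatch(new VoidFunction2[Dataset[Row], java.lang.Long] {
    override def call(batch: Dataset[Row], batchId: java.lang.Long): Unit = {
      // batchId is stable per micro-batch, so it can key idempotent writes.
      println(s"Batch $batchId contains ${batch.count()} rows")
    }
  })
  .start()
```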
1 parent: cb16591

File tree: 4 files changed (+29, −8 lines)


connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala

Lines changed: 20 additions & 2 deletions
```diff
@@ -25,13 +25,13 @@ import scala.collection.JavaConverters._
 import com.google.protobuf.ByteString

 import org.apache.spark.annotation.Evolving
+import org.apache.spark.api.java.function.VoidFunction2
 import org.apache.spark.connect.proto
 import org.apache.spark.connect.proto.Command
 import org.apache.spark.connect.proto.WriteStreamOperationStart
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{Dataset, ForeachWriter}
-import org.apache.spark.sql.connect.common.DataTypeProtoConverter
-import org.apache.spark.sql.connect.common.ForeachWriterPacket
+import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, ForeachWriterPacket, UdfUtils}
 import org.apache.spark.sql.execution.streaming.AvailableNowTrigger
 import org.apache.spark.sql.execution.streaming.ContinuousTrigger
 import org.apache.spark.sql.execution.streaming.OneTimeTrigger
@@ -247,6 +247,24 @@ final class DataStreamWriter[T] private[sql] (ds: Dataset[T]) extends Logging {
     this
   }

+  /**
+   * :: Experimental ::
+   *
+   * (Java-specific) Sets the output of the streaming query to be processed using the provided
+   * function. This is supported only in the micro-batch execution modes (that is, when the
+   * trigger is not continuous). In every micro-batch, the provided function will be called in
+   * every micro-batch with (i) the output rows as a Dataset and (ii) the batch identifier. The
+   * batchId can be used to deduplicate and transactionally write the output (that is, the
+   * provided Dataset) to external systems. The output Dataset is guaranteed to be exactly the
+   * same for the same batchId (assuming all operations are deterministic in the query).
+   *
+   * @since 3.5.0
+   */
+  @Evolving
+  def foreachBatch(function: VoidFunction2[Dataset[T], java.lang.Long]): DataStreamWriter[T] = {
+    foreachBatch(UdfUtils.foreachBatchFuncToScalaFunc(function))
+  }
+
   /**
    * Starts the execution of the streaming query, which will continually output results to the
    * given path as new data arrives. The returned [[StreamingQuery]] object can be used to
```
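The new method only adapts the Java function and delegates to the pre-existing Scala-specific overload, so both spellings below end up on the same code path. A minimal sketch, assuming `stream: Dataset[Row]` is already defined; binding the handlers to explicitly typed values also keeps overload resolution unambiguous:

```scala
import org.apache.spark.api.java.function.VoidFunction2
import org.apache.spark.sql.{Dataset, Row}

// Pre-existing Scala-specific overload: (Dataset[T], Long) => Unit.
val scalaHandler: (Dataset[Row], Long) => Unit =
  (batch, batchId) => println(s"Batch $batchId: ${batch.count()} rows")
stream.writeStream.foreachBatch(scalaHandler)

// Java-specific overload added here; SAM conversion builds the VoidFunction2,
// which UdfUtils.foreachBatchFuncToScalaFunc then adapts to the Scala form.
val javaHandler: VoidFunction2[Dataset[Row], java.lang.Long] =
  (batch, batchId) => println(s"Batch $batchId: ${batch.count()} rows")
stream.writeStream.foreachBatch(javaHandler)
```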

connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala

Lines changed: 0 additions & 3 deletions
```diff
@@ -234,9 +234,6 @@ object CheckConnectJvmClientCompatibility {
     // DataStreamWriter
     ProblemFilters.exclude[MissingClassProblem](
       "org.apache.spark.sql.streaming.DataStreamWriter$"),
-    ProblemFilters.exclude[Problem](
-      "org.apache.spark.sql.streaming.DataStreamWriter.foreachBatch" // TODO(SPARK-42944)
-    ),
     ProblemFilters.exclude[Problem](
       "org.apache.spark.sql.streaming.DataStreamWriter.SOURCE*" // These are constant vals.
     ),
```

connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/streaming/ClientStreamingQuerySuite.scala

Lines changed: 6 additions & 3 deletions
```diff
@@ -27,6 +27,7 @@ import org.scalatest.concurrent.Eventually.eventually
 import org.scalatest.concurrent.Futures.timeout
 import org.scalatest.time.SpanSugar._

+import org.apache.spark.api.java.function.VoidFunction2
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{DataFrame, ForeachWriter, Row, SparkSession, SQLHelper}
 import org.apache.spark.sql.connect.client.util.QueryTest
@@ -412,11 +413,13 @@ class EventCollector extends StreamingQueryListener {
   }
 }

-class ForeachBatchFn(val viewName: String) extends ((DataFrame, Long) => Unit) with Serializable {
-  override def apply(df: DataFrame, batchId: Long): Unit = {
+class ForeachBatchFn(val viewName: String)
+    extends VoidFunction2[DataFrame, java.lang.Long]
+    with Serializable {
+  override def call(df: DataFrame, batchId: java.lang.Long): Unit = {
     val count = df.count()
     df.sparkSession
-      .createDataFrame(Seq((batchId, count)))
+      .createDataFrame(Seq((batchId.toLong, count)))
       .createOrReplaceGlobalTempView(viewName)
   }
 }
```
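The wiring that drives `ForeachBatchFn` sits outside this diff; a sketch of typical usage, where the session, rate source, view name, and timeout are assumptions:

```scala
// Hypothetical driver: each micro-batch overwrites a global temp view with
// (batchId, rowCount), which the polling assertion below then observes.
val query = spark.readStream
  .format("rate")
  .load()
  .writeStream
  .foreachBatch(new ForeachBatchFn("my_view"))
  .start()

eventually(timeout(30.seconds)) {
  assert(spark.sql("SELECT * FROM global_temp.my_view").collect().nonEmpty)
}
query.stop()
```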

connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/UdfUtils.scala

Lines changed: 3 additions & 0 deletions
```diff
@@ -59,6 +59,9 @@ private[sql] object UdfUtils extends Serializable {
   def foreachPartitionFuncToScalaFunc[T](f: ForeachPartitionFunction[T]): Iterator[T] => Unit =
     x => f.call(x.asJava)

+  def foreachBatchFuncToScalaFunc[D](f: VoidFunction2[D, java.lang.Long]): (D, Long) => Unit =
+    (d, i) => f.call(d, i)
+
   def flatMapFuncToScalaFunc[T, U](f: FlatMapFunction[T, U]): T => TraversableOnce[U] = x =>
     f.call(x).asScala

```
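To see what the adapter does in isolation: since `UdfUtils` is `private[sql]`, the sketch below inlines an equivalent definition rather than calling it directly; all names are illustrative.

```scala
import org.apache.spark.api.java.function.VoidFunction2

// Inlined equivalent of the new UdfUtils.foreachBatchFuncToScalaFunc; the
// Scala Long argument auto-boxes to java.lang.Long at the call site.
def toScalaFunc[D](f: VoidFunction2[D, java.lang.Long]): (D, Long) => Unit =
  (d, batchId) => f.call(d, batchId)

val handler: VoidFunction2[String, java.lang.Long] =
  (d, batchId) => println(s"$d @ batch $batchId")

toScalaFunc(handler)("payload", 42L) // prints: payload @ batch 42
```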
