[SPARK-28702][SQL] Display useful error message (instead of NPE) for invalid Dataset operations
### What changes were proposed in this pull request? Added a proper error message instead of an NPE for invalid Dataset operations (e.g. calling actions inside of transformations), similar to SPARK-5063 for RDDs ### Why are the changes needed? To inform the user of the exact issue instead of an NPE ### Does this PR introduce any user-facing change? No ### How was this patch tested? Manually tested ```scala test code snap "import spark.implicits._ val ds1 = spark.sparkContext.parallelize(1 to 100, 100).toDS() val ds2 = spark.sparkContext.parallelize(1 to 100, 100).toDS() ds1.map(x => { // scalastyle:off println(ds2.count + x) x }).collect()" ``` Closes #25503 from shivusondur/jira28702. Authored-by: shivusondur <shivusondur@gmail.com> Signed-off-by: Josh Rosen <rosenville@gmail.com>
This commit is contained in:
parent
33e45ec7b8
commit
23bed0d3c0
|
@ -26,7 +26,7 @@ import scala.util.control.NonFatal
|
|||
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
|
||||
import org.apache.spark.TaskContext
|
||||
import org.apache.spark.{SparkException, TaskContext}
|
||||
import org.apache.spark.annotation.{DeveloperApi, Evolving, Experimental, Stable, Unstable}
|
||||
import org.apache.spark.api.java.JavaRDD
|
||||
import org.apache.spark.api.java.function._
|
||||
|
@ -184,11 +184,23 @@ private[sql] object Dataset {
|
|||
*/
|
||||
@Stable
|
||||
class Dataset[T] private[sql](
|
||||
@transient val sparkSession: SparkSession,
|
||||
@transient private val _sparkSession: SparkSession,
|
||||
@DeveloperApi @Unstable @transient val queryExecution: QueryExecution,
|
||||
@DeveloperApi @Unstable @transient val encoder: Encoder[T])
|
||||
extends Serializable {
|
||||
|
||||
@transient lazy val sparkSession: SparkSession = {
|
||||
if (_sparkSession == null) {
|
||||
throw new SparkException(
|
||||
"Dataset transformations and actions can only be invoked by the driver, not inside of" +
|
||||
" other Dataset transformations; for example, dataset1.map(x => dataset2.values.count()" +
|
||||
" * x) is invalid because the values transformation and count action cannot be " +
|
||||
"performed inside of the dataset1.map transformation. For more information," +
|
||||
" see SPARK-28702.")
|
||||
}
|
||||
_sparkSession
|
||||
}
|
||||
|
||||
// A globally unique id of this Dataset.
|
||||
private val id = Dataset.curId.getAndIncrement()
|
||||
|
||||
|
|
Loading…
Reference in a new issue