[SPARK-26952][SQL] Row count statistics should respect the data reported by the data source

## What changes were proposed in this pull request? In data source v2, if the data source scan implements `SupportsReportStatistics`, `DataSourceV2Relation` should respect the row count reported by the data source. ## How was this patch tested? New unit test. Closes #23853 from ConeyLiu/report-row-count. Authored-by: Xianyang Liu <xianyang.liu@intel.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2019-02-26 14:10:54 +08:00 · 2019-02-26 14:10:54 +08:00 · bc03c8b3fa
parent 52a180f25f
commit bc03c8b3fa
3 changed files with 130 additions and 4 deletions
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.sources.v2._
-import org.apache.spark.sql.sources.v2.reader._
+import org.apache.spark.sql.sources.v2.reader.{Statistics => V2Statistics, _}
 import org.apache.spark.sql.sources.v2.reader.streaming.{Offset, SparkDataStream}
 import org.apache.spark.sql.sources.v2.writer._
@ -56,7 +56,7 @@ case class DataSourceV2Relation(
    scan match {
      case r: SupportsReportStatistics =>
        val statistics = r.estimateStatistics()
-        Statistics(sizeInBytes = statistics.sizeInBytes().orElse(conf.defaultSizeInBytes))
+        DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
      case _ =>
        Statistics(sizeInBytes = conf.defaultSizeInBytes)
    }
@ -89,7 +89,7 @@ case class StreamingDataSourceV2Relation(
  override def computeStats(): Statistics = scan match {
    case r: SupportsReportStatistics =>
      val statistics = r.estimateStatistics()
-      Statistics(sizeInBytes = statistics.sizeInBytes().orElse(conf.defaultSizeInBytes))
+      DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
    case _ =>
      Statistics(sizeInBytes = conf.defaultSizeInBytes)
  }
@ -100,4 +100,21 @@ object DataSourceV2Relation {
    val output = table.schema().toAttributes
    DataSourceV2Relation(table, output, options)
  }
  /**
   * Converts the statistics reported by a data source v2 scan into logical [[Statistics]].
   *
   * @param v2Statistics statistics object returned by the data source
   * @param defaultRowCount row count to use when `v2Statistics.numRows()` is empty
   * @param defaultSizeInBytes size to use when `v2Statistics.sizeInBytes()` is empty
   * @return logical statistics holding the reported values, or the given defaults where the
   *         data source reported nothing
   */
  def transformV2Stats(
      v2Statistics: V2Statistics,
      defaultRowCount: Option[BigInt],
      defaultSizeInBytes: Long): Statistics = {
    // `numRows()` is implemented by the data source, so ask for it exactly once: a second call
    // could repeat work or return a different answer than the one we just checked.
    val reportedRows = v2Statistics.numRows()
    val numRows: Option[BigInt] =
      if (reportedRows.isPresent) Some(BigInt(reportedRows.getAsLong)) else defaultRowCount
    Statistics(
      sizeInBytes = v2Statistics.sizeInBytes().orElse(defaultSizeInBytes),
      rowCount = numRows)
  }
 }
--- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaReportStatisticsDataSource.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaReportStatisticsDataSource.java
@ -0,0 +1,65 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package test.org.apache.spark.sql.sources.v2;
 import java.util.OptionalLong;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.Table;
 import org.apache.spark.sql.sources.v2.TableProvider;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
 import org.apache.spark.sql.sources.v2.reader.ScanBuilder;
 import org.apache.spark.sql.sources.v2.reader.Statistics;
 import org.apache.spark.sql.sources.v2.reader.SupportsReportStatistics;
 /**
 * Test {@link TableProvider} whose scan builder reports fixed statistics:
 * a size of 80 bytes and a row count of 10.
 */
 public class JavaReportStatisticsDataSource implements TableProvider {
  /** Scan builder that reports constant statistics and plans two range partitions. */
  class MyScanBuilder extends JavaSimpleScanBuilder implements SupportsReportStatistics {
    @Override
    public Statistics estimateStatistics() {
      // Both values are always present, so callers never fall back to their defaults.
      Statistics reported = new Statistics() {
        @Override
        public OptionalLong sizeInBytes() {
          return OptionalLong.of(80);
        }
        @Override
        public OptionalLong numRows() {
          return OptionalLong.of(10);
        }
      };
      return reported;
    }
    @Override
    public InputPartition[] planInputPartitions() {
      // Two partitions built from the bounds (0, 5) and (5, 10).
      return new InputPartition[] {
        new JavaRangeInputPartition(0, 5),
        new JavaRangeInputPartition(5, 10)
      };
    }
  }
  /** Returns a batch table whose scans are built by {@link MyScanBuilder}. */
  @Override
  public Table getTable(DataSourceOptions options) {
    return new JavaSimpleBatchTable() {
      @Override
      public ScanBuilder newScanBuilder(DataSourceOptions scanOptions) {
        return new MyScanBuilder();
      }
    };
  }
 }
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala
@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources.v2
 import java.io.File
 import java.util.OptionalLong
 import test.org.apache.spark.sql.sources.v2._
@ -182,6 +183,24 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
    }
  }
  // Both the Scala and the Java reporting sources must surface their reported row count (10)
  // and size in bytes (80) through DataSourceV2Relation.computeStats().
  test("statistics report data source") {
    val reportingSources = Seq(
      classOf[ReportStatisticsDataSource],
      classOf[JavaReportStatisticsDataSource])
    for (cls <- reportingSources) {
      withClue(cls.getName) {
        val optimized = spark.read.format(cls.getName).load().queryExecution.optimizedPlan
        // The first DataSourceV2Relation in the optimized plan is the one under test.
        val relation = optimized.collect { case d: DataSourceV2Relation => d }.head
        val stats = relation.computeStats()
        assert(stats.rowCount.isDefined && stats.rowCount.get === 10,
          "Row count statics should be reported by data source")
        assert(stats.sizeInBytes === 80,
          "Size in bytes statics should be reported by data source")
      }
    }
  }
  test("SPARK-23574: no shuffle exchange with single partition") {
    val df = spark.read.format(classOf[SimpleSinglePartitionSource].getName).load().agg(count("*"))
    assert(df.queryExecution.executedPlan.collect { case e: Exchange => e }.isEmpty)
@ -621,7 +640,6 @@ object ColumnarReaderFactory extends PartitionReaderFactory {
  }
 }
 class PartitionAwareDataSource extends TableProvider {
  class MyScanBuilder extends SimpleScanBuilder
@ -689,3 +707,29 @@ class SimpleWriteOnlyDataSource extends SimpleWritableDataSource {
    }
  }
 }
 /**
 * Test [[TableProvider]] whose scan builder reports fixed statistics:
 * a size of 80 bytes and a row count of 10.
 */
 class ReportStatisticsDataSource extends TableProvider {
  /** Scan builder that reports constant statistics and plans two range partitions. */
  class MyScanBuilder extends SimpleScanBuilder with SupportsReportStatistics {
    // Both values are always present, so callers never fall back to their defaults.
    override def estimateStatistics(): Statistics = new Statistics {
      override def sizeInBytes(): OptionalLong = OptionalLong.of(80)
      override def numRows(): OptionalLong = OptionalLong.of(10)
    }
    override def planInputPartitions(): Array[InputPartition] = {
      // Two partitions built from the bounds (0, 5) and (5, 10).
      val lowerHalf = RangeInputPartition(0, 5)
      val upperHalf = RangeInputPartition(5, 10)
      Array[InputPartition](lowerHalf, upperHalf)
    }
  }
  /** Returns a batch table whose scans are built by [[MyScanBuilder]]. */
  override def getTable(options: DataSourceOptions): Table = new SimpleBatchTable {
    override def newScanBuilder(options: DataSourceOptions): ScanBuilder = new MyScanBuilder
  }
 }