diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/CustomTaskMetric.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/CustomTaskMetric.java
index 47644a3267..99ea8f6b4a 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/CustomTaskMetric.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/CustomTaskMetric.java
@@ -25,7 +25,7 @@ import org.apache.spark.sql.connector.read.PartitionReader;
  * at the executor side. During query execution, Spark will collect the task metrics per partition
  * by {@link PartitionReader} and update internal metrics based on collected metric values.
  * For streaming query, Spark will collect and combine metrics for a final result per micro batch.
- *
+ * <p>
  * The metrics will be gathered during query execution back to the driver and then combined. How
  * the task metrics are combined is defined by corresponding {@link CustomMetric} with same metric
  * name. The final result will be shown up in the data source scan operator in Spark UI.
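For context only (not part of the patch): a minimal sketch of the metric pairing this doc describes. The `bytesRead` name and both classes are hypothetical; only the `CustomTaskMetric`/`CustomMetric` interfaces come from the API.

```java
import org.apache.spark.sql.connector.CustomMetric;
import org.apache.spark.sql.connector.CustomTaskMetric;

// Hypothetical task-side metric, reported per partition reader on executors.
class BytesReadTaskMetric implements CustomTaskMetric {
  private final long bytes;

  BytesReadTaskMetric(long bytes) { this.bytes = bytes; }

  @Override public String name() { return "bytesRead"; }
  @Override public long value() { return bytes; }
}

// Driver-side counterpart with the same name; it defines how the collected task values combine.
class BytesReadMetric implements CustomMetric {
  @Override public String name() { return "bytesRead"; }
  @Override public String description() { return "total bytes read"; }

  @Override public String aggregateTaskMetrics(long[] taskValues) {
    long sum = 0;
    for (long v : taskValues) sum += v;
    return sum + " bytes";
  }
}
```

A source would typically surface the task-side value through `PartitionReader#currentMetricsValues()` and register the driver-side metric via `Scan#supportedCustomMetrics()`.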

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagement.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagement.java
index 6fd271cf91..e2c693f2d0 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagement.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagement.java
@@ -30,15 +30,16 @@ import org.apache.spark.sql.catalyst.analysis.PartitionsAlreadyExistException;
  * <p>
  * These APIs are used to modify table partition or partition metadata,
  * they will change the table data as well.
- * ${@link #createPartitions}:
- *     add an array of partitions and any data they contain to the table
- * ${@link #dropPartitions}:
- *     remove an array of partitions and any data they contain from the table
- * ${@link #purgePartitions}:
- *     remove an array of partitions and any data they contain from the table by skipping
- *     a trash even if it is supported
- * ${@link #truncatePartitions}:
- *     truncate an array of partitions by removing partitions data
+ * <ul>
+ *   <li>{@link #createPartitions}: add an array of partitions and any data they contain to the
+ *   table</li>
+ *   <li>{@link #dropPartitions}: remove an array of partitions and any data they contain from
+ *   the table</li>
+ *   <li>{@link #purgePartitions}: remove an array of partitions and any data they contain from
+ *   the table by skipping a trash even if it is supported</li>
+ *   <li>{@link #truncatePartitions}: truncate an array of partitions by removing partitions
+ *   data</li>
+ * </ul>
  *
  * @since 3.1.0
  */

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java
index 8531522f61..ec2b61a766 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java
@@ -32,17 +32,15 @@ import org.apache.spark.sql.types.StructType;
  * <p>
  * These APIs are used to modify table partition identifier or partition metadata.
  * In some cases, they will change the table data as well.
- * ${@link #createPartition}:
- *     add a partition and any data it contains to the table
- * ${@link #dropPartition}:
- *     remove a partition and any data it contains from the table
- * ${@link #purgePartition}:
- *     remove a partition and any data it contains from the table by skipping a trash
- *     even if it is supported.
- * ${@link #replacePartitionMetadata}:
- *     point a partition to a new location, which will swap one location's data for the other
- * ${@link #truncatePartition}:
- *     remove partition data from the table
+ * <ul>
+ *   <li>{@link #createPartition}: add a partition and any data it contains to the table</li>
+ *   <li>{@link #dropPartition}: remove a partition and any data it contains from the table</li>
+ *   <li>{@link #purgePartition}: remove a partition and any data it contains from the table by
+ *   skipping a trash even if it is supported.</li>
+ *   <li>{@link #replacePartitionMetadata}: point a partition to a new location, which will swap
+ *   one location's data for the other</li>
+ *   <li>{@link #truncatePartition}: remove partition data from the table</li>
+ * </ul>
  *
  * @since 3.1.0
  */
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableProvider.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableProvider.java
index 4881fde06c..6b68fbe123 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableProvider.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableProvider.java
@@ -78,14 +78,17 @@ public interface TableProvider {
   /**
    * Returns true if the source has the ability of accepting external table metadata when getting
    * tables. The external table metadata includes:
-   * 1. For table reader: user-specified schema from `DataFrameReader`/`DataStreamReader` and
-   *    schema/partitioning stored in Spark catalog.
-   * 2. For table writer: the schema of the input `Dataframe` of
-   *    `DataframeWriter`/`DataStreamWriter`.
+   * <ol>
+   *   <li>For table reader: user-specified schema from {@code DataFrameReader}/{@code
+   *   DataStreamReader} and schema/partitioning stored in Spark catalog.</li>
+   *   <li>For table writer: the schema of the input {@code Dataframe} of
+   *   {@code DataframeWriter}/{@code DataStreamWriter}.</li>
+   * </ol>
    *
    * By default this method returns false, which means the schema and partitioning passed to
-   * `getTable` are from the infer methods. Please override it if this source has expensive
-   * schema/partitioning inference and wants external table metadata to avoid inference.
+   * {@link #getTable(StructType, Transform[], Map)} are from the infer methods. Please override it
+   * if this source has expensive schema/partitioning inference and wants external table metadata
+   * to avoid inference.
    */
   default boolean supportsExternalMetadata() {
     return false;
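To illustrate the override being documented (not part of this patch), a sketch of a provider that opts into external metadata. The class name is invented and `getTable` is left abstract for brevity.

```java
import org.apache.spark.sql.connector.catalog.TableProvider;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

// Hypothetical provider that never infers schema itself; it relies on the schema/partitioning
// Spark passes to getTable (user-specified or stored in the catalog).
abstract class ExternalMetadataProvider implements TableProvider {

  @Override
  public boolean supportsExternalMetadata() {
    return true;  // let Spark pass external schema/partitioning instead of calling the infer methods
  }

  @Override
  public StructType inferSchema(CaseInsensitiveStringMap options) {
    // Inference would be expensive (or impossible) for this source, so it is simply not offered.
    throw new UnsupportedOperationException("Schema must be supplied by the user or the catalog");
  }
}
```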

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/Batch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/Batch.java
index 73aefa55ae..6861f168b9 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/Batch.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/Batch.java
@@ -36,10 +36,8 @@
    * <p>
    * If the {@link Scan} supports filter pushdown, this Batch is likely configured with a filter
    * and is responsible for creating splits for that filter, which is not a full scan.
-   *
    * <p>
    * This method will be called only once during a data source scan, to launch one Spark job.
-   *
    */
  InputPartition[] planInputPartitions();
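For context (not part of the patch), a sketch of a `planInputPartitions` implementation under the one-partition-per-split model described above; the file-list source is invented and `createReaderFactory` is left abstract.

```java
import org.apache.spark.sql.connector.read.Batch;
import org.apache.spark.sql.connector.read.InputPartition;

// Hypothetical batch that plans one InputPartition per input file; each element becomes one
// Spark task (and one RDD partition) when the job is launched.
abstract class FileListBatch implements Batch {
  private final String[] files;  // assumed to be resolved earlier, e.g. while building the Scan

  FileListBatch(String[] files) { this.files = files; }

  static class FilePartition implements InputPartition {
    final String path;
    FilePartition(String path) { this.path = path; }
  }

  @Override
  public InputPartition[] planInputPartitions() {
    // Called only once per scan. If filters were pushed down, `files` would already be the
    // pruned list, so this is not necessarily a full scan.
    InputPartition[] partitions = new InputPartition[files.length];
    for (int i = 0; i < files.length; i++) {
      partitions[i] = new FilePartition(files[i]);
    }
    return partitions;
  }
}
```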

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/InputPartition.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/InputPartition.java
index 03eec00168..b9e7c375b4 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/InputPartition.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/InputPartition.java
@@ -24,7 +24,7 @@ import org.apache.spark.annotation.Evolving;
 /**
  * A serializable representation of an input partition returned by
  * {@link Batch#planInputPartitions()} and the corresponding ones in streaming .
- *
+ * <p>
  * Note that {@link InputPartition} will be serialized and sent to executors, then
  * {@link PartitionReader} will be created by
  * {@link PartitionReaderFactory#createReader(InputPartition)} or
@@ -42,11 +42,11 @@ public interface InputPartition extends Serializable {
    * faster, but Spark does not guarantee to run the input partition reader on these locations.
    * The implementations should make sure that it can be run on any location.
    * The location is a string representing the host name.
-   *
+   * <p>
    * Note that if a host name cannot be recognized by Spark, it will be ignored as it was not in
    * the returned locations. The default return value is empty string array, which means this
    * input partition's reader has no location preference.
-   *
+   * <p>
    * If this method fails (by throwing an exception), the action will fail and no Spark job will be
    * submitted.
    */
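A small sketch (not part of the patch) of the locality hint described above; the host names are purely illustrative.

```java
import org.apache.spark.sql.connector.read.InputPartition;

// Hypothetical partition that knows which hosts hold its data block and reports them as a hint.
class BlockInputPartition implements InputPartition {
  private final String[] hosts;

  BlockInputPartition(String[] hosts) { this.hosts = hosts; }

  @Override
  public String[] preferredLocations() {
    // Host names only. Spark may still schedule the reader elsewhere, and hosts it cannot
    // recognize are silently ignored.
    return hosts;
  }
}
```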

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReader.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReader.java
index dfecb77c66..c91f2b4bf3 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReader.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReader.java
@@ -27,7 +27,7 @@ import org.apache.spark.sql.connector.CustomTaskMetric;
  * A partition reader returned by {@link PartitionReaderFactory#createReader(InputPartition)} or
  * {@link PartitionReaderFactory#createColumnarReader(InputPartition)}. It's responsible for
  * outputting data for a RDD partition.
- *
+ * <p>
  * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.catalyst.InternalRow}
  * for normal data sources, or {@link org.apache.spark.sql.vectorized.ColumnarBatch} for columnar
  * data sources(whose {@link PartitionReaderFactory#supportColumnarReads(InputPartition)}
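For reference (not part of the patch), a minimal row-based reader; the range source is hypothetical and emits a single LongType column.

```java
import java.io.IOException;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;

// Hypothetical reader producing the values [start, end) as one-column InternalRows.
class RangePartitionReader implements PartitionReader<InternalRow> {
  private final long end;
  private long current;

  RangePartitionReader(long start, long end) {
    this.current = start - 1;
    this.end = end;
  }

  @Override
  public boolean next() throws IOException {
    current += 1;
    return current < end;
  }

  @Override
  public InternalRow get() {
    // Only called after next() returned true.
    return new GenericInternalRow(new Object[] { current });
  }

  @Override
  public void close() throws IOException {
    // Nothing to release for this in-memory example.
  }
}
```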

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReaderFactory.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReaderFactory.java
index 9dded247e8..52204fd3bf 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReaderFactory.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/PartitionReaderFactory.java
@@ -25,7 +25,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch;
 /**
  * A factory used to create {@link PartitionReader} instances.
- *
+ * <p>
  * If Spark fails to execute any methods in the implementations of this interface or in the returned
  * {@link PartitionReader} (by throwing an exception), corresponding Spark task would fail and
  * get retried until hitting the maximum retry times.
@@ -37,7 +37,7 @@ public interface PartitionReaderFactory extends Serializable {
   /**
    * Returns a row-based partition reader to read data from the given {@link InputPartition}.
-   *
+   * <p>
    * Implementations probably need to cast the input partition to the concrete
    * {@link InputPartition} class defined for the data source.
    */
@@ -45,7 +45,7 @@
   /**
    * Returns a columnar partition reader to read data from the given {@link InputPartition}.
-   *
+   * <p>
    * Implementations probably need to cast the input partition to the concrete
    * {@link InputPartition} class defined for the data source.
    */
@@ -57,7 +57,7 @@
    * Returns true if the given {@link InputPartition} should be read by Spark in a columnar way.
    * This means, implementations must also implement {@link #createColumnarReader(InputPartition)}
    * for the input partitions that this method returns true.
-   *
+   * <p>
    * As of Spark 2.4, Spark can only read all input partition in a columnar way, or none of them.
    * Data source can't mix columnar and row-based partitions. This may be relaxed in future
    * versions.
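And the matching factory sketch (not part of the patch), reusing the `RangePartitionReader` shown above; the cast mirrors the "cast the input partition to the concrete class" note in the doc.

```java
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.read.InputPartition;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.sql.connector.read.PartitionReaderFactory;

// Hypothetical factory: casts the generic InputPartition back to this source's own partition type
// and builds a row-based reader for it. Columnar reads stay disabled.
class RangeReaderFactory implements PartitionReaderFactory {

  // The concrete partition type this source returned from Batch#planInputPartitions().
  static class RangePartition implements InputPartition {
    final long start;
    final long end;
    RangePartition(long start, long end) { this.start = start; this.end = end; }
  }

  @Override
  public PartitionReader<InternalRow> createReader(InputPartition partition) {
    RangePartition p = (RangePartition) partition;  // safe: the factory only sees its own partitions
    return new RangePartitionReader(p.start, p.end);
  }

  @Override
  public boolean supportColumnarReads(InputPartition partition) {
    return false;  // every partition is read row by row; mixing modes is not allowed
  }
}
```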

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownFilters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownFilters.java
index 6594af2773..5ab9092c9a 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownFilters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownFilters.java
@@ -39,14 +39,17 @@ public interface SupportsPushDownFilters extends ScanBuilder {
   /**
    * Returns the filters that are pushed to the data source via {@link #pushFilters(Filter[])}.
-   *
+   * <p>
    * There are 3 kinds of filters:
-   * 1. pushable filters which don't need to be evaluated again after scanning.
-   * 2. pushable filters which still need to be evaluated after scanning, e.g. parquet
-   *    row group filter.
-   * 3. non-pushable filters.
+   * <ol>
+   *   <li>pushable filters which don't need to be evaluated again after scanning.</li>
+   *   <li>pushable filters which still need to be evaluated after scanning, e.g. parquet row
+   *   group filter.</li>
+   *   <li>non-pushable filters.</li>
+   * </ol>
+   * <p>
    * Both case 1 and 2 should be considered as pushed filters and should be returned by this method.
-   *
+   * <p>
    * It's possible that there is no filters in the query and {@link #pushFilters(Filter[])}
    * is never called, empty array should be returned for this case.
    */
   Filter[] pushedFilters();
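As a concrete illustration of the three kinds of filters (not part of the patch), a sketch of a builder that pushes only `IsNotNull` (case 1) and returns everything else for Spark to evaluate (case 3); a case-2 filter would additionally appear in both arrays. `build()` is left abstract.

```java
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.connector.read.SupportsPushDownFilters;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.IsNotNull;

// Hypothetical ScanBuilder that can evaluate only IsNotNull at the source.
abstract class NullAwareScanBuilder implements SupportsPushDownFilters {
  private Filter[] pushed = new Filter[0];

  @Override
  public Filter[] pushFilters(Filter[] filters) {
    List<Filter> accepted = new ArrayList<>();  // fully handled by the source (case 1)
    List<Filter> postScan = new ArrayList<>();  // Spark must evaluate these after scanning (case 3)
    for (Filter f : filters) {
      if (f instanceof IsNotNull) {
        accepted.add(f);
      } else {
        postScan.add(f);
      }
    }
    pushed = accepted.toArray(new Filter[0]);
    return postScan.toArray(new Filter[0]);
  }

  @Override
  public Filter[] pushedFilters() {
    return pushed;  // empty array when pushFilters was never called
  }
}
```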

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownRequiredColumns.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownRequiredColumns.java
index 4f7da3c2a3..34de67bcf4 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownRequiredColumns.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownRequiredColumns.java
@@ -32,11 +32,11 @@ public interface SupportsPushDownRequiredColumns extends ScanBuilder {
   /**
    * Applies column pruning w.r.t. the given requiredSchema.
-   *
+   * <p>
    * Implementation should try its best to prune the unnecessary columns or nested fields, but it's
    * also OK to do the pruning partially, e.g., a data source may not be able to prune nested
    * fields, and only prune top-level columns.
-   *
+   * <p>
    * Note that, {@link Scan#readSchema()} implementation should take care of the column
    * pruning applied here.
    */
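A short sketch of the contract (not part of the patch): whatever schema is recorded here must be the one the resulting `Scan#readSchema()` reports. The class is invented and `build()` is left out.

```java
import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns;
import org.apache.spark.sql.types.StructType;

// Hypothetical builder that remembers the pruned schema for the Scan it will eventually build.
abstract class PruningScanBuilder implements SupportsPushDownRequiredColumns {
  protected StructType requiredSchema;

  @Override
  public void pruneColumns(StructType requiredSchema) {
    // Top-level pruning only; a source that cannot prune nested fields may keep structs whole.
    this.requiredSchema = requiredSchema;
  }
}
```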

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java
index e7a27e0749..51d56bdf0a 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java
@@ -23,7 +23,7 @@ import org.apache.spark.sql.connector.read.partitioning.Partitioning;
 /**
  * A mix in interface for {@link Scan}. Data sources can implement this interface to
  * report data partitioning and try to avoid shuffle at Spark side.
- *
+ * <p>
  * Note that, when a {@link Scan} implementation creates exactly one {@link InputPartition},
  * Spark may avoid adding a shuffle even if the reader does not implement this interface.
  *
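For context (not part of the patch), a Scan could report its layout as in the sketch below, using the hypothetical `ColumnClusteredPartitioning` defined after the Distribution diff further down; the column name and partition count are invented.

```java
import org.apache.spark.sql.connector.read.SupportsReportPartitioning;
import org.apache.spark.sql.connector.read.partitioning.Partitioning;

// Hypothetical Scan whose output is already clustered by user_id across 16 input partitions,
// so Spark may skip a shuffle for operations that only need clustering on user_id.
abstract class UserClusteredScan implements SupportsReportPartitioning {
  @Override
  public Partitioning outputPartitioning() {
    return new ColumnClusteredPartitioning(new String[] {"user_id"}, 16);
  }
}
```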

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportStatistics.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportStatistics.java
index 1e0c9ca7c7..031749dee0 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportStatistics.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportStatistics.java
@@ -22,7 +22,7 @@ import org.apache.spark.annotation.Evolving;
 /**
  * A mix in interface for {@link Scan}. Data sources can implement this interface to
  * report statistics to Spark.
- *
+ * <p>
  * As of Spark 3.0, statistics are reported to the optimizer after operators are pushed to the
  * data source. Implementations may return more accurate statistics based on pushed operators
  * which may improve query performance by providing better information to the optimizer.
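A sketch of post-pushdown statistics (not part of the patch); the byte/row counts are assumed to be derived from whatever files or splits survived pushdown.

```java
import java.util.OptionalLong;
import org.apache.spark.sql.connector.read.Statistics;
import org.apache.spark.sql.connector.read.SupportsReportStatistics;

// Hypothetical Scan that reports sizes for the data it will actually read after pushdown.
abstract class SizedScan implements SupportsReportStatistics {
  private final long totalBytes;
  private final long totalRows;

  SizedScan(long totalBytes, long totalRows) {
    this.totalBytes = totalBytes;
    this.totalRows = totalRows;
  }

  @Override
  public Statistics estimateStatistics() {
    return new Statistics() {
      @Override public OptionalLong sizeInBytes() { return OptionalLong.of(totalBytes); }
      @Override public OptionalLong numRows() { return OptionalLong.of(totalRows); }
    };
  }
}
```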

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/partitioning/Distribution.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/partitioning/Distribution.java
index 264b268e24..a5911a820a 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/partitioning/Distribution.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/partitioning/Distribution.java
@@ -24,14 +24,15 @@ import org.apache.spark.sql.connector.read.PartitionReader;
  * An interface to represent data distribution requirement, which specifies how the records should
  * be distributed among the data partitions (one {@link PartitionReader} outputs data for one
  * partition).
+ * <p>
  * Note that this interface has nothing to do with the data ordering inside one
  * partition(the output records of a single {@link PartitionReader}).
- *
+ * <p>
  * The instance of this interface is created and provided by Spark, then consumed by
  * {@link Partitioning#satisfy(Distribution)}. This means data source developers don't need to
  * implement this interface, but need to catch as more concrete implementations of this interface
  * as possible in {@link Partitioning#satisfy(Distribution)}.
- *
+ * <p>
  * Concrete implementations until now:
  * <ul>
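To show what "catch as many concrete implementations as possible" means in practice (not part of the patch), a sketch of a `Partitioning` whose `satisfy` handles the one implementation listed here. The check assumes the usual rule that clustering on a subset of the required columns is sufficient.

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.spark.sql.connector.read.partitioning.ClusteredDistribution;
import org.apache.spark.sql.connector.read.partitioning.Distribution;
import org.apache.spark.sql.connector.read.partitioning.Partitioning;

// Hypothetical partitioning for a source whose rows are grouped by a fixed set of columns.
class ColumnClusteredPartitioning implements Partitioning {
  private final Set<String> clusteredBy;
  private final int numPartitions;

  ColumnClusteredPartitioning(String[] clusteredBy, int numPartitions) {
    this.clusteredBy = new HashSet<>(Arrays.asList(clusteredBy));
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public boolean satisfy(Distribution distribution) {
    if (distribution instanceof ClusteredDistribution) {
      Set<String> required =
        new HashSet<>(Arrays.asList(((ClusteredDistribution) distribution).clusteredColumns));
      // Rows that agree on all required columns also agree on this source's clustering columns
      // (a subset of them), so they are guaranteed to land in the same partition.
      return required.containsAll(clusteredBy);
    }
    return false;  // unknown Distribution implementations cannot be proven satisfied
  }
}
```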