[MINOR] Data source v2 docs update.
## What changes were proposed in this pull request?

This patch includes some doc updates for data source API v2. I was reading the code and noticed some minor issues.

## How was this patch tested?

This is a doc-only change.

Author: Reynold Xin <rxin@databricks.com>

Closes #19626 from rxin/dsv2-update.
commit d43e1f06bd
parent 1ffe03d9e8
```diff
@@ -20,12 +20,11 @@ package org.apache.spark.sql.sources.v2;
 import org.apache.spark.annotation.InterfaceStability;
 
 /**
- * The base interface for data source v2. Implementations must have a public, no arguments
- * constructor.
+ * The base interface for data source v2. Implementations must have a public, 0-arg constructor.
  *
- * Note that this is an empty interface, data source implementations should mix-in at least one of
- * the plug-in interfaces like {@link ReadSupport}. Otherwise it's just a dummy data source which is
- * un-readable/writable.
+ * Note that this is an empty interface. Data source implementations should mix-in at least one of
+ * the plug-in interfaces like {@link ReadSupport} and {@link WriteSupport}. Otherwise it's just
+ * a dummy data source which is un-readable/writable.
  */
 @InterfaceStability.Evolving
 public interface DataSourceV2 {}
```
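To make the mix-in requirement concrete, here is a minimal sketch of a readable source. It assumes the reader-side API as of this commit (`ReadSupport#createReader(DataSourceV2Options)`, `DataSourceV2Reader#readSchema()` and `createReadTasks()`); `EmptyDataSource` is a hypothetical name, and several of these classes were renamed in later releases.

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.DataSourceV2Options;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.StructType;

// Hypothetical minimal source. DataSourceV2 alone is an empty marker, so
// ReadSupport is mixed in to make the source readable; the required public
// 0-arg constructor is implied by declaring no constructor at all.
public class EmptyDataSource implements DataSourceV2, ReadSupport {
  @Override
  public DataSourceV2Reader createReader(DataSourceV2Options options) {
    return new DataSourceV2Reader() {
      @Override
      public StructType readSchema() {
        return new StructType();        // this toy source exposes an empty schema
      }

      @Override
      public List<ReadTask<Row>> createReadTasks() {
        return Collections.emptyList(); // and has no partitions to read
      }
    };
  }
}
```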
```diff
@@ -36,8 +36,8 @@ public interface WriteSupport {
    * sources can return None if there is no writing needed to be done according to the save mode.
    *
    * @param jobId A unique string for the writing job. It's possible that there are many writing
-   *              jobs running at the same time, and the returned {@link DataSourceV2Writer} should
-   *              use this job id to distinguish itself with writers of other jobs.
+   *              jobs running at the same time, and the returned {@link DataSourceV2Writer} can
+   *              use this job id to distinguish itself from other jobs.
    * @param schema the schema of the data to be written.
    * @param mode the save mode which determines what to do when the data are already in this data
    *             source, please refer to {@link SaveMode} for more details.
```
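The hunk above shows only part of the javadoc; a sketch of the contract it describes follows. The full `createWriter` signature (including a trailing `DataSourceV2Options` parameter not visible in the hunk) is assumed from the API at the time of this commit, and `SketchSink` and `StagingWriter` are hypothetical names (`StagingWriter` is fleshed out in the `DataSourceV2Writer` sketch further down).

```java
import java.util.Optional;

import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.DataSourceV2Options;
import org.apache.spark.sql.sources.v2.WriteSupport;
import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer;
import org.apache.spark.sql.types.StructType;

// Hypothetical writable source illustrating the createWriter contract.
public class SketchSink implements DataSourceV2, WriteSupport {
  @Override
  public Optional<DataSourceV2Writer> createWriter(
      String jobId, StructType schema, SaveMode mode, DataSourceV2Options options) {
    if (mode == SaveMode.Ignore && dataAlreadyExists()) {
      // Returning empty ("None") tells Spark no writing is needed for this save mode.
      return Optional.empty();
    }
    // The jobId can distinguish this job from concurrent writing jobs, e.g. by
    // staging output under a job-scoped temporary path.
    return Optional.of(new StagingWriter(jobId));
  }

  private boolean dataAlreadyExists() {
    return false; // existence check elided in this sketch
  }
}
```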
```diff
@@ -34,11 +34,11 @@ import org.apache.spark.sql.types.StructType;
  *
  * There are mainly 3 kinds of query optimizations:
  *   1. Operators push-down. E.g., filter push-down, required columns push-down(aka column
- *      pruning), etc. These push-down interfaces are named like `SupportsPushDownXXX`.
- *   2. Information Reporting. E.g., statistics reporting, ordering reporting, etc. These
- *      reporting interfaces are named like `SupportsReportingXXX`.
- *   3. Special scans. E.g, columnar scan, unsafe row scan, etc. These scan interfaces are named
- *      like `SupportsScanXXX`.
+ *      pruning), etc. Names of these interfaces start with `SupportsPushDown`.
+ *   2. Information Reporting. E.g., statistics reporting, ordering reporting, etc.
+ *      Names of these interfaces start with `SupportsReporting`.
+ *   3. Special scans. E.g, columnar scan, unsafe row scan, etc.
+ *      Names of these interfaces start with `SupportsScan`.
  *
  * Spark first applies all operator push-down optimizations that this data source supports. Then
  * Spark collects information this data source reported for further optimizations. Finally Spark
```
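As an illustration of the `SupportsPushDown` naming convention described in this hunk, here is a sketch of a reader that accepts column pruning. The `SupportsPushDownRequiredColumns#pruneColumns(StructType)` signature is assumed from the API at the time of this commit; `PruningReader` and its schema are made up for the example.

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.sources.v2.reader.SupportsPushDownRequiredColumns;
import org.apache.spark.sql.types.StructType;

// Hypothetical reader: mixing in SupportsPushDownRequiredColumns lets Spark
// prune unused columns before read tasks are created.
public class PruningReader implements DataSourceV2Reader, SupportsPushDownRequiredColumns {
  // Start from the full schema; Spark may narrow it during optimization.
  private StructType schema = new StructType().add("id", "long").add("payload", "string");

  @Override
  public void pruneColumns(StructType requiredSchema) {
    this.schema = requiredSchema;   // remember the narrowed schema
  }

  @Override
  public StructType readSchema() {
    return schema;                  // report the (possibly pruned) schema
  }

  @Override
  public List<ReadTask<Row>> createReadTasks() {
    return Collections.emptyList(); // task creation elided in this sketch
  }
}
```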
```diff
@@ -31,8 +31,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression;
  * {@link SupportsPushDownFilters}, Spark will ignore {@link SupportsPushDownFilters} and only
  * process this interface.
  */
-@InterfaceStability.Evolving
-@Experimental
 @InterfaceStability.Unstable
 public interface SupportsPushDownCatalystFilters {
 
```
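The precedence rule in the hunk's context lines (the catalyst variant wins over `SupportsPushDownFilters`) can be shown with a small extension of the hypothetical `PruningReader` from the earlier sketch; the `pushCatalystFilters` signature is assumed from the API at the time of this commit.

```java
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.sources.v2.reader.SupportsPushDownCatalystFilters;

// Hypothetical mix-in: because this reader implements the catalyst variant,
// Spark would ignore SupportsPushDownFilters even if it were also implemented.
public class CatalystFilteringReader extends PruningReader
    implements SupportsPushDownCatalystFilters {
  @Override
  public Expression[] pushCatalystFilters(Expression[] filters) {
    // Push nothing in this sketch: returning every filter tells Spark to keep
    // evaluating all of them itself.
    return filters;
  }
}
```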
```diff
@@ -30,8 +30,6 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
  * This is an experimental and unstable interface, as {@link UnsafeRow} is not public and may get
  * changed in the future Spark versions.
  */
-@InterfaceStability.Evolving
-@Experimental
 @InterfaceStability.Unstable
 public interface SupportsScanUnsafeRow extends DataSourceV2Reader {
 
```
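For completeness, a sketch of the unsafe-row scan mix-in, again reusing the hypothetical `PruningReader`. The `createUnsafeRowReadTasks` method name is assumed from the API at the time of this commit and changed in later releases.

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.sources.v2.reader.SupportsScanUnsafeRow;

// Hypothetical reader that hands Spark UnsafeRows directly instead of Rows,
// avoiding a per-record conversion at the cost of depending on an unstable class.
public class UnsafeScanReader extends PruningReader implements SupportsScanUnsafeRow {
  @Override
  public List<ReadTask<UnsafeRow>> createUnsafeRowReadTasks() {
    return Collections.emptyList(); // task creation elided in this sketch
  }
}
```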
```diff
@@ -40,15 +40,10 @@ import org.apache.spark.sql.types.StructType;
 * some writers are aborted, or the job failed with an unknown reason, call
 * {@link #abort(WriterCommitMessage[])}.
 *
- * Spark won't retry failed writing jobs, users should do it manually in their Spark applications if
- * they want to retry.
+ * While Spark will retry failed writing tasks, Spark won't retry failed writing jobs. Users should
+ * do it manually in their Spark applications if they want to retry.
  *
- * Please refer to the document of commit/abort methods for detailed specifications.
- *
- * Note that, this interface provides a protocol between Spark and data sources for transactional
- * data writing, but the transaction here is Spark-level transaction, which may not be the
- * underlying storage transaction. For example, Spark successfully writes data to a Cassandra data
- * source, but Cassandra may need some more time to reach consistency at storage level.
+ * Please refer to the documentation of commit/abort methods for detailed specifications.
  */
 @InterfaceStability.Evolving
 public interface DataSourceV2Writer {
```
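The commit/abort protocol in the rewritten javadoc is easier to see in code. A sketch assuming the writer API at the time of this commit; `StagingWriter` is the hypothetical class referenced from the `WriteSupport` sketch above.

```java
import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer;
import org.apache.spark.sql.sources.v2.writer.DataWriterFactory;
import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage;

// Hypothetical job-level writer. Task output stays invisible until commit()
// publishes it, and since Spark does not retry a failed job, abort() must
// clean up everything the tasks staged.
public class StagingWriter implements DataSourceV2Writer {
  private final String jobId;

  public StagingWriter(String jobId) {
    this.jobId = jobId; // used to pick a job-scoped staging location
  }

  @Override
  public DataWriterFactory<Row> createWriterFactory() {
    // Each task-side DataWriter produced by the factory would stage its output
    // under a temporary location derived from jobId; elided in this sketch.
    throw new UnsupportedOperationException("factory construction not shown");
  }

  @Override
  public void commit(WriterCommitMessage[] messages) {
    // All writers succeeded: atomically move the staged output into place.
  }

  @Override
  public void abort(WriterCommitMessage[] messages) {
    // Some writer failed or the job died: delete whatever was staged.
  }
}
```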
```diff
@@ -57,8 +57,8 @@ public interface DataWriter<T> {
   /**
    * Writes one record.
    *
-   * If this method fails(throw exception), {@link #abort()} will be called and this data writer is
-   * considered to be failed.
+   * If this method fails (by throwing an exception), {@link #abort()} will be called and this
+   * data writer is considered to have been failed.
    */
   void write(T record);
 
```
```diff
@@ -70,10 +70,10 @@ public interface DataWriter<T> {
    * The written data should only be visible to data source readers after
    * {@link DataSourceV2Writer#commit(WriterCommitMessage[])} succeeds, which means this method
    * should still "hide" the written data and ask the {@link DataSourceV2Writer} at driver side to
-   * do the final commitment via {@link WriterCommitMessage}.
+   * do the final commit via {@link WriterCommitMessage}.
    *
-   * If this method fails(throw exception), {@link #abort()} will be called and this data writer is
-   * considered to be failed.
+   * If this method fails (by throwing an exception), {@link #abort()} will be called and this
+   * data writer is considered to have been failed.
    */
   WriterCommitMessage commit();
 
```
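The two hunks above both concern the task-side `DataWriter` contract: a failed `write()` or `commit()` triggers `abort()`, and `commit()` must keep the data hidden until the driver-side `DataSourceV2Writer` finishes the job. A sketch under those rules; `BufferingDataWriter` and `StagedMessage` are hypothetical names.

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.writer.DataWriter;
import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage;

// Hypothetical task-level writer: records are buffered (i.e. "hidden"), and
// commit() only reports what was staged; the driver-side DataSourceV2Writer
// performs the final commit that makes the data visible.
public class BufferingDataWriter implements DataWriter<Row> {
  private final List<Row> buffer = new ArrayList<>();

  @Override
  public void write(Row record) {
    // An exception thrown here marks this writer as failed and Spark calls abort().
    buffer.add(record);
  }

  @Override
  public WriterCommitMessage commit() {
    // Flush the buffer to a staging location and describe it in the message;
    // the data must stay invisible to readers until the driver commits.
    return new StagedMessage(buffer.size());
  }

  @Override
  public void abort() {
    buffer.clear(); // drop anything not yet committed
  }

  // Hypothetical commit message carrying what the driver needs to finalize.
  private static class StagedMessage implements WriterCommitMessage {
    final int recordCount;

    StagedMessage(int recordCount) {
      this.recordCount = recordCount;
    }
  }
}
```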