diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md index 90353ef998..decd880762 100644 --- a/docs/sql-data-sources-jdbc.md +++ b/docs/sql-data-sources-jdbc.md @@ -39,18 +39,24 @@ following command: ./bin/spark-shell --driver-class-path postgresql-9.4.1207.jar --jars postgresql-9.4.1207.jar {% endhighlight %} -Tables from the remote database can be loaded as a DataFrame or Spark SQL temporary view using -the Data Sources API. Users can specify the JDBC connection properties in the data source options. +## Data Source Option + +Spark supports the following case-insensitive options for JDBC. The Data source options of JDBC can be set via: +* the `.option`/`.options` methods of + * `DataFrameReader` + * `DataFrameWriter` +* `OPTIONS` clause at [CREATE TABLE USING DATA_SOURCE](sql-ref-syntax-ddl-create-table-datasource.html) + +For connection properties, users can specify the JDBC connection properties in the data source options. user and password are normally provided as connection properties for -logging into the data sources. In addition to the connection properties, Spark also supports -the following case-insensitive options: +logging into the data sources. @@ -159,7 +165,7 @@ the following case-insensitive options: diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index f9e37341dc..fa3e829a88 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -478,23 +478,12 @@ class DataFrameReader(OptionUtils): Parameters ---------- - url : str - a JDBC URL of the form ``jdbc:subprotocol:subname`` table : str the name of the table column : str, optional - the name of a column of numeric, date, or timestamp type - that will be used for partitioning; - if this parameter is specified, then ``numPartitions``, ``lowerBound`` - (inclusive), and ``upperBound`` (exclusive) will form partition strides - for generated WHERE clause expressions used to split the column - ``column`` evenly - lowerBound : str or int, optional - the minimum value of ``column`` used to decide partition stride - upperBound : str or int, optional - the maximum value of ``column`` used to decide partition stride - numPartitions : int, optional - the number of partitions + alias of ``partitionColumn`` option. Refer to ``partitionColumn`` in + `Data Source Option `_ + in the version you use. predicates : list, optional a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the :class:`DataFrame` @@ -503,6 +492,15 @@ class DataFrameReader(OptionUtils): least properties "user" and "password" with their corresponding values. For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } + Other Parameters + ---------------- + Extra options + For the extra options, refer to + `Data Source Option `_ + in the version you use. + + .. # noqa + Notes ----- Don't create too many partitions in parallel on a large cluster; @@ -985,8 +983,6 @@ class DataFrameWriter(OptionUtils): Parameters ---------- - url : str - a JDBC URL of the form ``jdbc:subprotocol:subname`` table : str Name of the table in the external database. mode : str, optional @@ -1002,6 +998,15 @@ class DataFrameWriter(OptionUtils): least properties "user" and "password" with their corresponding values. For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } + Other Parameters + ---------------- + Extra options + For the extra options, refer to + `Data Source Option `_ + in the version you use. + + .. 
# noqa + Notes ----- Don't create too many partitions in parallel on a large cluster; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 8a066bf298..a878e98668 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -282,6 +282,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. * + * You can find the JDBC-specific option and parameter documentation for reading tables + * via JDBC in + * + * Data Source Option in the version you use. + * * @since 1.4.0 */ def jdbc(url: String, table: String, properties: Properties): DataFrame = { @@ -293,6 +298,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { format("jdbc").load() } + // scalastyle:off line.size.limit /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table. Partitions of the table will be retrieved in parallel based on the parameters @@ -301,16 +307,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash * your external database systems. * - * @param url JDBC database url of the form `jdbc:subprotocol:subname`. + * You can find the JDBC-specific option and parameter documentation for reading tables via JDBC in + * + * Data Source Option in the version you use. + * * @param table Name of the table in the external database. - * @param columnName the name of a column of numeric, date, or timestamp type - * that will be used for partitioning. - * @param lowerBound the minimum value of `columnName` used to decide partition stride. - * @param upperBound the maximum value of `columnName` used to decide partition stride. - * @param numPartitions the number of partitions. This, along with `lowerBound` (inclusive), - * `upperBound` (exclusive), form partition strides for generated WHERE - * clause expressions used to split the column `columnName` evenly. When - * the input is less than 1, the number is set to 1. + * @param columnName Alias of `partitionColumn` option. Refer to `partitionColumn` in + * + * Data Source Option in the version you use. * @param connectionProperties JDBC database connection arguments, a list of arbitrary string * tag/value. Normally at least a "user" and "password" property * should be included. "fetchsize" can be used to control the @@ -318,6 +322,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * for a Statement object to execute to the given number of seconds. * @since 1.4.0 */ + // scalastyle:on line.size.limit def jdbc( url: String, table: String, @@ -344,7 +349,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash * your external database systems. * - * @param url JDBC database url of the form `jdbc:subprotocol:subname` + * You can find the JDBC-specific option and parameter documentation for reading tables + * via JDBC in + * + * Data Source Option in the version you use. + * * @param table Name of the table in the external database. 
 * @param predicates Condition in the where clause for each partition.
 * @param connectionProperties JDBC database connection arguments, a list of arbitrary string
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index a8af7c8ba8..c96fa9e4f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -733,18 +733,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
 * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
 * your external database systems.
 *
- * You can set the following JDBC-specific option(s) for storing JDBC:
- * <ul>
- * <li>`truncate` (default `false`): use `TRUNCATE TABLE` instead of `DROP TABLE`.</li>
- * </ul>
+ * JDBC-specific option and parameter documentation for storing tables via JDBC in
+ * <a href="https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option">
+ *   Data Source Option</a> in the version you use.
 *
- * In case of failures, users should turn off `truncate` option to use `DROP TABLE` again. Also,
- * due to the different behavior of `TRUNCATE TABLE` among DBMS, it's not always safe to use this.
- * MySQLDialect, DB2Dialect, MsSqlServerDialect, DerbyDialect, and OracleDialect supports this
- * while PostgresDialect and default JDBCDirect doesn't. For unknown and unsupported JDBCDirect,
- * the user option `truncate` is ignored.
- *
- * @param url JDBC database url of the form `jdbc:subprotocol:subname`
 * @param table Name of the table in the external database.
 * @param connectionProperties JDBC database connection arguments, a list of arbitrary string
 *                             tag/value. Normally at least a "user" and "password" property
  <tr><th>Property Name</th><th>Meaning</th></tr>
  <tr>
    <td><code>url</code></td>
    <td>
-     The JDBC URL to connect to. The source-specific connection properties may be specified in the URL. e.g., <code>jdbc:postgresql://localhost/test?user=fred&password=secret</code>
+     The JDBC URL of the form <code>jdbc:subprotocol:subname</code> to connect to. The source-specific connection properties may be specified in the URL. e.g., <code>jdbc:postgresql://localhost/test?user=fred&password=secret</code>
    </td>
  </tr>
  <tr>
    <td><code>truncate</code></td>
    <td>
-     This is a JDBC writer related option. When <code>SaveMode.Overwrite</code> is enabled, this option causes Spark to truncate an existing table instead of dropping and recreating it. This can be more efficient, and prevents the table metadata (e.g., indices) from being removed. However, it will not work in some cases, such as when the new data has a different schema. It defaults to <code>false</code>. This option applies only to writing.
+     This is a JDBC writer related option. When <code>SaveMode.Overwrite</code> is enabled, this option causes Spark to truncate an existing table instead of dropping and recreating it. This can be more efficient, and prevents the table metadata (e.g., indices) from being removed. However, it will not work in some cases, such as when the new data has a different schema. It defaults to <code>false</code>. This option applies only to writing. In case of failures, users should turn off the <code>truncate</code> option to use <code>DROP TABLE</code> again. Also, due to the different behavior of <code>TRUNCATE TABLE</code> among DBMSs, it's not always safe to use this. MySQLDialect, DB2Dialect, MsSqlServerDialect, DerbyDialect, and OracleDialect support this, while PostgresDialect and the default JDBCDialect don't. For an unknown or unsupported JDBCDialect, the user option <code>truncate</code> is ignored.
    </td>
  </tr>
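
To make the new "Data Source Option" section above concrete, here is a minimal Scala sketch of the two ways it lists for supplying JDBC options: the `.option`/`.options` methods of `DataFrameReader`, and the `OPTIONS` clause of `CREATE TABLE ... USING`. The connection details echo the example URL in the `url` row (`jdbc:postgresql://localhost/test`, user `fred`, password `secret`); the `people` table name is a made-up placeholder, not part of this patch.

{% highlight scala %}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("jdbc-data-source-options").getOrCreate()

// Options set through DataFrameReader.option/options; the keys are the
// case-insensitive names from the Data Source Option table.
val jdbcDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://localhost/test")
  .option("dbtable", "people")
  .options(Map("user" -> "fred", "password" -> "secret"))
  .load()

// The same options supplied through the OPTIONS clause of CREATE TABLE USING.
spark.sql(
  """CREATE TABLE people_jdbc
    |USING org.apache.spark.sql.jdbc
    |OPTIONS (
    |  url 'jdbc:postgresql://localhost/test',
    |  dbtable 'people',
    |  user 'fred',
    |  password 'secret'
    |)""".stripMargin)
{% endhighlight %}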
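
The `partitionColumn` behavior that the Python and Scala docstrings now delegate to the Data Source Option page drives parallel reads. Below is an illustrative sketch of both entry points, assuming the same hypothetical database plus a numeric `id` column and a `country` column that are not part of this patch.

{% highlight scala %}
import java.util.Properties

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("jdbc-partitioned-read").getOrCreate()

// partitionColumn, lowerBound, upperBound, and numPartitions together form the
// WHERE-clause strides used to split the read across partitions.
val partitioned = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://localhost/test")
  .option("dbtable", "people")
  .option("user", "fred")
  .option("password", "secret")
  .option("partitionColumn", "id")  // numeric, date, or timestamp column
  .option("lowerBound", "1")
  .option("upperBound", "100000")
  .option("numPartitions", "8")
  .load()

// The jdbc overload with explicit predicates: one partition per expression.
val connectionProperties = new Properties()
connectionProperties.put("user", "fred")
connectionProperties.put("password", "secret")
val byCountry = spark.read.jdbc(
  "jdbc:postgresql://localhost/test",
  "people",
  Array("country = 'US'", "country <> 'US'"),
  connectionProperties)
{% endhighlight %}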
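
The expanded `truncate` entry above describes what happens on an overwrite. Here is a small sketch of that write path, pointed at a hypothetical MySQL endpoint since MySQLDialect is listed among the dialects that support truncation; the table name and credentials are placeholders.

{% highlight scala %}
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().appName("jdbc-truncate-write").getOrCreate()
val df = spark.range(10).toDF("id")

// With SaveMode.Overwrite and truncate=true, a supporting dialect issues
// TRUNCATE TABLE and keeps existing table metadata such as indices;
// without it, the table is dropped and recreated.
df.write
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost/test")
  .option("dbtable", "people_copy")
  .option("user", "fred")
  .option("password", "secret")
  .option("truncate", "true")
  .mode(SaveMode.Overwrite)
  .save()
{% endhighlight %}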