diff --git a/docs/sql-data-sources-orc.md b/docs/sql-data-sources-orc.md
index f5c9677c34..e4194d038a 100644
--- a/docs/sql-data-sources-orc.md
+++ b/docs/sql-data-sources-orc.md
@@ -172,3 +172,29 @@ When reading from Hive metastore ORC tables and inserting to Hive metastore ORC
2.0.0 |
+
+## Data Source Option
+
+Data source options of ORC can be set via:
+* the `.option`/`.options` methods of
+ * `DataFrameReader`
+ * `DataFrameWriter`
+ * `DataStreamReader`
+ * `DataStreamWriter`
+
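+For example, a minimal sketch of setting these options on a batch reader/writer and a streaming reader (the `spark` session, paths, and data are illustrative, not part of this change):
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+
+# DataFrameReader: set a single option with .option ...
+df = spark.read.option("mergeSchema", "true").orc("/tmp/orc/input")
+
+# ... or several at once with .options (shown here on DataFrameWriter).
+df.write.options(compression="zlib").mode("overwrite").orc("/tmp/orc/output")
+
+# DataStreamReader/DataStreamWriter expose the same methods; streaming file
+# sources additionally require an explicit schema.
+sdf = spark.readStream.schema(df.schema).option("mergeSchema", "true").orc("/tmp/orc/input")
+```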
+
+| Property Name | Default | Meaning | Scope |
+| --- | --- | --- | --- |
+| `mergeSchema` | None | Sets whether we should merge schemas collected from all ORC part-files. This overrides `spark.sql.orc.mergeSchema`; the default value is taken from `spark.sql.orc.mergeSchema`. | read |
+| `compression` | None | Compression codec to use when saving to file. This can be one of the known case-insensitive shortened names (`none`, `snappy`, `zlib`, `lzo`, and `zstd`). This overrides `orc.compress` and `spark.sql.orc.compression.codec`; if `None` is set, the value of `spark.sql.orc.compression.codec` is used. | write |
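+
+As a concrete illustration of the two options above (reusing the `spark` session from the earlier sketch; paths are again illustrative), a round trip might look like this:
+
+```python
+# Merge schemas collected from all ORC part-files while reading; this
+# overrides spark.sql.orc.mergeSchema for this read only.
+people = spark.read.option("mergeSchema", "true").orc("/tmp/orc/people")
+
+# Save with zstd compression; this overrides orc.compress and
+# spark.sql.orc.compression.codec for this write only.
+people.write.option("compression", "zstd").mode("overwrite").orc("/tmp/orc/people_zstd")
+```
+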
+Other generic options can be found in [Generic File Source Options](sql-data-sources-generic-options.html).
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 31c1f2f7ca..bbb32a643d 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -793,28 +793,13 @@ class DataFrameReader(OptionUtils):
Parameters
----------
path : str or list
- mergeSchema : str or bool, optional
- sets whether we should merge schemas collected from all
- ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
- The default value is specified in ``spark.sql.orc.mergeSchema``.
- pathGlobFilter : str or bool
- an optional glob pattern to only include files with paths matching
- the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
- It does not change the behavior of
- `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
- recursiveFileLookup : str or bool
- recursively scan a directory for files. Using this option
- disables
- `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
- modifiedBefore : an optional timestamp to only include files with
- modification times occurring before the specified time. The provided timestamp
- must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
- modifiedAfter : an optional timestamp to only include files with
- modification times occurring after the specified time. The provided timestamp
- must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
+ Other Parameters
+ ----------------
+ Extra options
+ For the extra options, refer to
+ `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_  # noqa
+ in the version you use.
Examples
--------
@@ -1417,12 +1402,13 @@ class DataFrameWriter(OptionUtils):
exists.
partitionBy : str or list, optional
names of partitioning columns
- compression : str, optional
- compression codec to use when saving to file. This can be one of the
- known case-insensitive shorten names (none, snappy, zlib, lzo, and zstd).
- This will override ``orc.compress`` and
- ``spark.sql.orc.compression.codec``. If None is set, it uses the value
- specified in ``spark.sql.orc.compression.codec``.
+
+ Other Parameters
+ ----------------
+ Extra options
+ For the extra options, refer to
+ `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_  # noqa
+ in the version you use.
Examples
--------
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 2c90d7f2de..fd94d6fbf4 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -637,20 +637,12 @@ class DataStreamReader(OptionUtils):
.. versionadded:: 2.3.0
- Parameters
- ----------
- mergeSchema : str or bool, optional
- sets whether we should merge schemas collected from all
- ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
- The default value is specified in ``spark.sql.orc.mergeSchema``.
- pathGlobFilter : str or bool, optional
- an optional glob pattern to only include files with paths matching
- the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
- It does not change the behavior of `partition discovery`_.
- recursiveFileLookup : str or bool, optional
- recursively scan a directory for files. Using this option
- disables
- `partition discovery `_. # noqa
+ Other Parameters
+ ----------------
+ Extra options
+ For the extra options, refer to
+ `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_  # noqa
+ in the version you use.
Examples
--------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index f7e1903da6..fad9a0e15c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -874,23 +874,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads ORC files and returns the result as a `DataFrame`.
*
- * You can set the following ORC-specific option(s) for reading ORC files:
- *
- * - `mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether
- * we should merge schemas collected from all ORC part-files. This will override
- * `spark.sql.orc.mergeSchema`.
- * - `pathGlobFilter`: an optional glob pattern to only include files with paths matching
- * the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
- * It does not change the behavior of partition discovery.
- * - `modifiedBefore` (batch only): an optional timestamp to only include files with
- * modification times occurring before the specified Time. The provided timestamp
- * must be in the following form: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
- * - `modifiedAfter` (batch only): an optional timestamp to only include files with
- * modification times occurring after the specified Time. The provided timestamp
- * must be in the following form: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
- * - `recursiveFileLookup`: recursively scan a directory for files. Using this option
- * disables partition discovery
- *
+ * ORC-specific option(s) for reading ORC files can be found in
+ * <a href=
+ *   "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
+ *   Data Source Option</a> in the version you use.
*
* @param paths input paths
* @since 2.0.0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index fe6572cff5..9873582082 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -881,14 +881,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* format("orc").save(path)
* }}}
*
- * You can set the following ORC-specific option(s) for writing ORC files:
- *
- * - `compression` (default is the value specified in `spark.sql.orc.compression.codec`):
- * compression codec to use when saving to file. This can be one of the known case-insensitive
- * shorten names(`none`, `snappy`, `zlib`, `lzo`, and `zstd`). This will override
- * `orc.compress` and `spark.sql.orc.compression.codec`. If `orc.compress` is given,
- * it overrides `spark.sql.orc.compression.codec`.
- *
+ * ORC-specific option(s) for writing ORC files can be found in
+ * <a href=
+ *   "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
+ *   Data Source Option</a> in the version you use.
*
* @since 1.5.0
*/
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index 1798f6e2c8..bb33d892c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -453,20 +453,17 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Logging {
/**
* Loads a ORC file stream, returning the result as a `DataFrame`.
*
- * You can set the following ORC-specific option(s) for reading ORC files:
+ * You can set the following option(s):
*
* - `maxFilesPerTrigger` (default: no max limit): sets the maximum number of new files to be
* considered in every trigger.
- * - `mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether
- * we should merge schemas collected from all ORC part-files. This will override
- * `spark.sql.orc.mergeSchema`.
- * - `pathGlobFilter`: an optional glob pattern to only include files with paths matching
- * the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
- * It does not change the behavior of partition discovery.
- * - `recursiveFileLookup`: recursively scan a directory for files. Using this option
- * disables partition discovery
*
*
+ * ORC-specific option(s) for reading ORC file stream can be found in
+ * <a href=
+ *   "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
+ *   Data Source Option</a> in the version you use.
+ *
* @since 2.3.0
*/
def orc(path: String): DataFrame = {