diff --git a/docs/sql-data-sources-orc.md b/docs/sql-data-sources-orc.md
index f5c9677c34..e4194d038a 100644
--- a/docs/sql-data-sources-orc.md
+++ b/docs/sql-data-sources-orc.md
@@ -172,3 +172,29 @@ When reading from Hive metastore ORC tables and inserting to Hive metastore ORC
     <td>2.0.0</td>
   </tr>
 </table>
+
+## Data Source Option
+
+Data source options of ORC can be set via:
+* the `.option`/`.options` methods of
+  * `DataFrameReader`
+  * `DataFrameWriter`
+  * `DataStreamReader`
+  * `DataStreamWriter`
+
+<table class="table">
+  <tr><th><b>Property Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th><th><b>Scope</b></th></tr>
+  <tr>
+    <td><code>mergeSchema</code></td>
+    <td>None</td>
+    <td>sets whether we should merge schemas collected from all ORC part-files. This will override <code>spark.sql.orc.mergeSchema</code>. The default value is specified in <code>spark.sql.orc.mergeSchema</code>.</td>
+    <td>read</td>
+  </tr>
+  <tr>
+    <td><code>compression</code></td>
+    <td>None</td>
+    <td>compression codec to use when saving to file. This can be one of the known case-insensitive shortened names (none, snappy, zlib, lzo, and zstd). This will override <code>orc.compress</code> and <code>spark.sql.orc.compression.codec</code>. If None is set, it uses the value specified in <code>spark.sql.orc.compression.codec</code>.</td>
+    <td>write</td>
+  </tr>
+</table>
+Other generic options can be found in [Generic File Source Options](sql-data-sources-generic-options.html).
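
For reference (not part of this patch), a minimal Scala sketch of how the two entries in the table above are passed through the `.option`/`.options` methods; the session setup and paths are illustrative assumptions:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: /tmp/orc/input and /tmp/orc/output are hypothetical paths.
val spark = SparkSession.builder().appName("orc-data-source-options").getOrCreate()

// Read scope: `mergeSchema` overrides spark.sql.orc.mergeSchema for this read.
val df = spark.read
  .option("mergeSchema", "true")
  .orc("/tmp/orc/input")

// Write scope: `compression` overrides orc.compress and spark.sql.orc.compression.codec.
df.write
  .options(Map("compression" -> "zstd"))
  .orc("/tmp/orc/output")
```
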
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 31c1f2f7ca..bbb32a643d 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -793,28 +793,13 @@ class DataFrameReader(OptionUtils):
         Parameters
         ----------
         path : str or list
 
-        mergeSchema : str or bool, optional
-            sets whether we should merge schemas collected from all
-            ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
-            The default value is specified in ``spark.sql.orc.mergeSchema``.
-        pathGlobFilter : str or bool
-            an optional glob pattern to only include files with paths matching
-            the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
-            It does not change the behavior of
-            `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
-        recursiveFileLookup : str or bool
-            recursively scan a directory for files. Using this option
-            disables
-            `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
-            modification times occurring before the specified time. The provided timestamp
-            must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
-        modifiedBefore : an optional timestamp to only include files with
-            modification times occurring before the specified time. The provided timestamp
-            must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
-        modifiedAfter : an optional timestamp to only include files with
-            modification times occurring after the specified time. The provided timestamp
-            must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)
+        Other Parameters
+        ----------------
+        Extra options
+            For the extra options, refer to
+            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_  # noqa
+            in the version you use.
 
         Examples
         --------
@@ -1417,12 +1402,13 @@
             exists.
         partitionBy : str or list, optional
             names of partitioning columns
-        compression : str, optional
-            compression codec to use when saving to file. This can be one of the
-            known case-insensitive shorten names (none, snappy, zlib, lzo, and zstd).
-            This will override ``orc.compress`` and
-            ``spark.sql.orc.compression.codec``. If None is set, it uses the value
-            specified in ``spark.sql.orc.compression.codec``.
+
+        Other Parameters
+        ----------------
+        Extra options
+            For the extra options, refer to
+            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_  # noqa
+            in the version you use.
 
         Examples
         --------
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 2c90d7f2de..fd94d6fbf4 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -637,20 +637,12 @@ class DataStreamReader(OptionUtils):
 
         .. versionadded:: 2.3.0
 
-        Parameters
-        ----------
-        mergeSchema : str or bool, optional
-            sets whether we should merge schemas collected from all
-            ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
-            The default value is specified in ``spark.sql.orc.mergeSchema``.
-        pathGlobFilter : str or bool, optional
-            an optional glob pattern to only include files with paths matching
-            the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
-            It does not change the behavior of `partition discovery`_.
-        recursiveFileLookup : str or bool, optional
-            recursively scan a directory for files. Using this option
-            disables
-            `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
+        Other Parameters
+        ----------------
+        Extra options
+            For the extra options, refer to
+            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_  # noqa
+            in the version you use.
 
         Examples
         --------
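
The removed PySpark parameter docs above (pathGlobFilter, recursiveFileLookup, modifiedBefore/modifiedAfter) now live behind the same "Data Source Option" link. As a rough illustration (not part of this patch), they remain plain read options; the path and timestamp below are made-up assumptions:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: /tmp/orc/landing and the timestamp are illustrative assumptions.
val spark = SparkSession.builder().appName("orc-generic-options").getOrCreate()

val recent = spark.read
  .option("pathGlobFilter", "*.orc")               // only files whose names match the glob
  .option("recursiveFileLookup", "true")           // scan nested directories; disables partition discovery
  .option("modifiedAfter", "2021-03-01T00:00:00")  // batch-only filter on file modification time
  .orc("/tmp/orc/landing")

recent.printSchema()
```
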
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index f7e1903da6..fad9a0e15c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -874,23 +874,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   /**
    * Loads ORC files and returns the result as a `DataFrame`.
    *
-   * You can set the following ORC-specific option(s) for reading ORC files:
-   * <ul>
-   * <li>`mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether
-   * we should merge schemas collected from all ORC part-files. This will override
-   * `spark.sql.orc.mergeSchema`.</li>
-   * <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
-   * the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
-   * It does not change the behavior of partition discovery.</li>
-   * <li>`modifiedBefore` (batch only): an optional timestamp to only include files with
-   * modification times occurring before the specified Time. The provided timestamp
-   * must be in the following form: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)</li>
-   * <li>`modifiedAfter` (batch only): an optional timestamp to only include files with
-   * modification times occurring after the specified Time. The provided timestamp
-   * must be in the following form: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00)</li>
-   * <li>`recursiveFileLookup`: recursively scan a directory for files. Using this option
-   * disables partition discovery</li>
-   * </ul>
+   * ORC-specific option(s) for reading ORC files can be found in
+   * <a href=
+   *   "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
+   *   Data Source Option</a> in the version you use.
    *
    * @param paths input paths
    * @since 2.0.0
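
To make the removed `mergeSchema` wording concrete, here is a small sketch (not part of this patch) with a hypothetical `/tmp/orc/events` directory written in two passes with different columns:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: the directory and the two tiny datasets are illustrative assumptions.
val spark = SparkSession.builder().appName("orc-merge-schema").getOrCreate()
import spark.implicits._

Seq((1, "open")).toDF("id", "event").write.mode("append").orc("/tmp/orc/events")
Seq((2, "close", 42L)).toDF("id", "event", "latencyMs").write.mode("append").orc("/tmp/orc/events")

// Without the option the schema comes from one part-file picked at random;
// with mergeSchema=true the schemas of all part-files are unified.
spark.read.option("mergeSchema", "true").orc("/tmp/orc/events").printSchema()
```
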
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index fe6572cff5..9873582082 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -881,14 +881,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    *   format("orc").save(path)
    * }}}
    *
-   * You can set the following ORC-specific option(s) for writing ORC files:
-   * <ul>
-   * <li>`compression` (default is the value specified in `spark.sql.orc.compression.codec`):
-   * compression codec to use when saving to file. This can be one of the known case-insensitive
-   * shorten names(`none`, `snappy`, `zlib`, `lzo`, and `zstd`). This will override
-   * `orc.compress` and `spark.sql.orc.compression.codec`. If `orc.compress` is given,
-   * it overrides `spark.sql.orc.compression.codec`.</li>
-   * </ul>
+   * ORC-specific option(s) for writing ORC files can be found in
+   * <a href=
+   *   "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
+   *   Data Source Option</a> in the version you use.
    *
    * @since 1.5.0
    */
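
A short sketch (not part of this patch) of the precedence the removed lines describe: `compression` wins over both `orc.compress` and `spark.sql.orc.compression.codec`, and `orc.compress`, when given, wins over the SQL config. Paths and sample data are illustrative:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: the paths and sample data are illustrative assumptions.
val spark = SparkSession.builder().appName("orc-compression").getOrCreate()
import spark.implicits._

val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

// `compression` overrides orc.compress and spark.sql.orc.compression.codec.
df.write.option("compression", "zstd").orc("/tmp/orc/zstd-output")

// If only `orc.compress` is set, it overrides spark.sql.orc.compression.codec.
df.write.option("orc.compress", "SNAPPY").orc("/tmp/orc/snappy-output")
```
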
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index 1798f6e2c8..bb33d892c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -453,20 +453,17 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
   /**
    * Loads a ORC file stream, returning the result as a `DataFrame`.
    *
-   * You can set the following ORC-specific option(s) for reading ORC files:
+   * You can set the following option(s):
    * <ul>
    * <li>`maxFilesPerTrigger` (default: no max limit): sets the maximum number of new files to be
    * considered in every trigger.</li>
-   * <li>`mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether
-   * we should merge schemas collected from all ORC part-files. This will override
-   * `spark.sql.orc.mergeSchema`.</li>
-   * <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
-   * the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
-   * It does not change the behavior of partition discovery.</li>
-   * <li>`recursiveFileLookup`: recursively scan a directory for files. Using this option
-   * disables partition discovery</li>
    * </ul>
    *
+   * ORC-specific option(s) for reading ORC file stream can be found in
+   * <a href=
+   *   "https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option">
+   *   Data Source Option</a> in the version you use.
+   *
    * @since 2.3.0
    */
   def orc(path: String): DataFrame = {
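
Finally, a streaming sketch (not part of this patch) combining the retained `maxFilesPerTrigger` option with an ORC file-stream source; the watched directory and the DDL schema are assumptions for the example:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: /tmp/orc/incoming and the schema string are illustrative assumptions.
val spark = SparkSession.builder().appName("orc-stream").getOrCreate()

val stream = spark.readStream
  .schema("id INT, event STRING")       // file-based streaming sources need an explicit schema
  .option("maxFilesPerTrigger", "10")   // cap the number of new files picked up per micro-batch
  .orc("/tmp/orc/incoming")

val query = stream.writeStream
  .format("console")
  .start()

query.awaitTermination()
```
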