[SPARK-25656][SQL][DOC][EXAMPLE] Add a doc and examples about extra data source options
## What changes were proposed in this pull request? Our current doc does not explain how we are passing the data source specific options to the underlying data source. According to [the review comment](https://github.com/apache/spark/pull/22622#discussion_r222911529), this PR aims to add more detailed information and examples ## How was this patch tested? Manual. Closes #22801 from dongjoon-hyun/SPARK-25656. Authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
This commit is contained in:
parent
65a8d1b87f
commit
4506dad8a9
|
@ -82,6 +82,50 @@ To load a CSV file you can use:
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
The extra options are also used during write operations.
|
||||||
|
For example, you can control bloom filters and dictionary encodings for ORC data sources.
|
||||||
|
The following ORC example will create a bloom filter and use dictionary encoding only for `favorite_color`.
|
||||||
|
For Parquet, a similar option, `parquet.enable.dictionary`, also exists.
|
||||||
|
To find more detailed information about the extra ORC/Parquet options,
|
||||||
|
visit the official Apache ORC/Parquet websites.
|
||||||
|
|
||||||
|
<div class="codetabs">
|
||||||
|
|
||||||
|
<div data-lang="scala" markdown="1">
|
||||||
|
{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="java" markdown="1">
|
||||||
|
{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="python" markdown="1">
|
||||||
|
{% include_example manual_save_options_orc python/sql/datasource.py %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="r" markdown="1">
|
||||||
|
{% include_example manual_save_options_orc r/RSparkSQLExample.R %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="sql" markdown="1">
|
||||||
|
|
||||||
|
{% highlight sql %}
|
||||||
|
CREATE TABLE users_with_options (
|
||||||
|
name STRING,
|
||||||
|
favorite_color STRING,
|
||||||
|
favorite_numbers array<integer>
|
||||||
|
) USING ORC
|
||||||
|
OPTIONS (
|
||||||
|
orc.bloom.filter.columns 'favorite_color',
|
||||||
|
orc.dictionary.key.threshold '1.0',
|
||||||
|
orc.column.encoding.direct 'name'
|
||||||
|
)
|
||||||
|
{% endhighlight %}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
### Run SQL on files directly
|
### Run SQL on files directly
|
||||||
|
|
||||||
Instead of using read API to load a file into DataFrame and query it, you can also query that
|
Instead of using read API to load a file into DataFrame and query it, you can also query that
|
||||||
|
|
|
@ -123,6 +123,13 @@ public class JavaSQLDataSourceExample {
|
||||||
.option("header", "true")
|
.option("header", "true")
|
||||||
.load("examples/src/main/resources/people.csv");
|
.load("examples/src/main/resources/people.csv");
|
||||||
// $example off:manual_load_options_csv$
|
// $example off:manual_load_options_csv$
|
||||||
|
// $example on:manual_save_options_orc$
|
||||||
|
usersDF.write().format("orc")
|
||||||
|
.option("orc.bloom.filter.columns", "favorite_color")
|
||||||
|
.option("orc.dictionary.key.threshold", "1.0")
|
||||||
|
.option("orc.column.encoding.direct", "name")
|
||||||
|
.save("users_with_options.orc");
|
||||||
|
// $example off:manual_save_options_orc$
|
||||||
// $example on:direct_sql$
|
// $example on:direct_sql$
|
||||||
Dataset<Row> sqlDF =
|
Dataset<Row> sqlDF =
|
||||||
spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");
|
spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");
|
||||||
|
|
|
@ -57,6 +57,15 @@ def basic_datasource_example(spark):
|
||||||
format="csv", sep=":", inferSchema="true", header="true")
|
format="csv", sep=":", inferSchema="true", header="true")
|
||||||
# $example off:manual_load_options_csv$
|
# $example off:manual_load_options_csv$
|
||||||
|
|
||||||
|
# $example on:manual_save_options_orc$
|
||||||
|
df = spark.read.orc("examples/src/main/resources/users.orc")
|
||||||
|
(df.write.format("orc")
|
||||||
|
.option("orc.bloom.filter.columns", "favorite_color")
|
||||||
|
.option("orc.dictionary.key.threshold", "1.0")
|
||||||
|
.option("orc.column.encoding.direct", "name")
|
||||||
|
.save("users_with_options.orc"))
|
||||||
|
# $example off:manual_save_options_orc$
|
||||||
|
|
||||||
# $example on:write_sorting_and_bucketing$
|
# $example on:write_sorting_and_bucketing$
|
||||||
df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
|
df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
|
||||||
# $example off:write_sorting_and_bucketing$
|
# $example off:write_sorting_and_bucketing$
|
||||||
|
|
|
@ -114,10 +114,14 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet")
|
||||||
|
|
||||||
|
|
||||||
# $example on:manual_load_options_csv$
|
# $example on:manual_load_options_csv$
|
||||||
df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T)
|
df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferSchema = TRUE, header = TRUE)
|
||||||
namesAndAges <- select(df, "name", "age")
|
namesAndAges <- select(df, "name", "age")
|
||||||
# $example off:manual_load_options_csv$
|
# $example off:manual_load_options_csv$
|
||||||
|
|
||||||
|
# $example on:manual_save_options_orc$
|
||||||
|
df <- read.df("examples/src/main/resources/users.orc", "orc")
|
||||||
|
write.orc(df, "users_with_options.orc", orc.bloom.filter.columns = "favorite_color", orc.dictionary.key.threshold = 1.0, orc.column.encoding.direct = "name")
|
||||||
|
# $example off:manual_save_options_orc$
|
||||||
|
|
||||||
# $example on:direct_sql$
|
# $example on:direct_sql$
|
||||||
df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
|
df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
|
||||||
|
|
BIN
examples/src/main/resources/users.orc
Normal file
BIN
examples/src/main/resources/users.orc
Normal file
Binary file not shown.
|
@ -56,6 +56,13 @@ object SQLDataSourceExample {
|
||||||
.option("header", "true")
|
.option("header", "true")
|
||||||
.load("examples/src/main/resources/people.csv")
|
.load("examples/src/main/resources/people.csv")
|
||||||
// $example off:manual_load_options_csv$
|
// $example off:manual_load_options_csv$
|
||||||
|
// $example on:manual_save_options_orc$
|
||||||
|
usersDF.write.format("orc")
|
||||||
|
.option("orc.bloom.filter.columns", "favorite_color")
|
||||||
|
.option("orc.dictionary.key.threshold", "1.0")
|
||||||
|
.option("orc.column.encoding.direct", "name")
|
||||||
|
.save("users_with_options.orc")
|
||||||
|
// $example off:manual_save_options_orc$
|
||||||
|
|
||||||
// $example on:direct_sql$
|
// $example on:direct_sql$
|
||||||
val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
|
val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
|
||||||
|
|
Loading…
Reference in a new issue