[SPARK-25656][SQL][DOC][EXAMPLE] Add a doc and examples about extra data source options

## What changes were proposed in this pull request?

Our current doc does not explain how we are passing the data source specific options to the underlying data source. According to [the review comment](https://github.com/apache/spark/pull/22622#discussion_r222911529), this PR aims to add more detailed information and examples.

## How was this patch tested?

Manual.

Closes #22801 from dongjoon-hyun/SPARK-25656.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
This commit is contained in:
Dongjoon Hyun 2018-10-23 12:41:20 -07:00
parent 65a8d1b87f
commit 4506dad8a9
No known key found for this signature in database
GPG key ID: EDA00CE834F0FC5C
6 changed files with 72 additions and 1 deletion

View file

@ -82,6 +82,50 @@ To load a CSV file you can use:
</div>
</div>
The extra options are also used during write operation.
For example, you can control bloom filters and dictionary encodings for ORC data sources.
The following ORC example will create a bloom filter and use dictionary encoding only for `favorite_color`.
For Parquet, there exists `parquet.enable.dictionary`, too.
To find more detailed information about the extra ORC/Parquet options,
visit the official Apache ORC/Parquet websites.
<div class="codetabs">
<div data-lang="scala" markdown="1">
{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
</div>
<div data-lang="java" markdown="1">
{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
</div>
<div data-lang="python" markdown="1">
{% include_example manual_save_options_orc python/sql/datasource.py %}
</div>
<div data-lang="r" markdown="1">
{% include_example manual_save_options_orc r/RSparkSQLExample.R %}
</div>
<div data-lang="sql" markdown="1">
{% highlight sql %}
CREATE TABLE users_with_options (
name STRING,
favorite_color STRING,
favorite_numbers array<integer>
) USING ORC
OPTIONS (
orc.bloom.filter.columns 'favorite_color',
orc.dictionary.key.threshold '1.0',
orc.column.encoding.direct 'name'
)
{% endhighlight %}
</div>
</div>
### Run SQL on files directly
Instead of using read API to load a file into DataFrame and query it, you can also query that

View file

@ -123,6 +123,13 @@ public class JavaSQLDataSourceExample {
.option("header", "true") .option("header", "true")
.load("examples/src/main/resources/people.csv"); .load("examples/src/main/resources/people.csv");
// $example off:manual_load_options_csv$ // $example off:manual_load_options_csv$
// $example on:manual_save_options_orc$
usersDF.write().format("orc")
.option("orc.bloom.filter.columns", "favorite_color")
.option("orc.dictionary.key.threshold", "1.0")
.option("orc.column.encoding.direct", "name")
.save("users_with_options.orc");
// $example off:manual_save_options_orc$
// $example on:direct_sql$ // $example on:direct_sql$
Dataset<Row> sqlDF = Dataset<Row> sqlDF =
spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");

View file

@ -57,6 +57,15 @@ def basic_datasource_example(spark):
format="csv", sep=":", inferSchema="true", header="true") format="csv", sep=":", inferSchema="true", header="true")
# $example off:manual_load_options_csv$ # $example off:manual_load_options_csv$
# $example on:manual_save_options_orc$
df = spark.read.orc("examples/src/main/resources/users.orc")
(df.write.format("orc")
.option("orc.bloom.filter.columns", "favorite_color")
.option("orc.dictionary.key.threshold", "1.0")
.option("orc.column.encoding.direct", "name")
.save("users_with_options.orc"))
# $example off:manual_save_options_orc$
# $example on:write_sorting_and_bucketing$ # $example on:write_sorting_and_bucketing$
df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed") df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
# $example off:write_sorting_and_bucketing$ # $example off:write_sorting_and_bucketing$

View file

@ -114,10 +114,14 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet")
# $example on:manual_load_options_csv$ # $example on:manual_load_options_csv$
df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T) df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferSchema = TRUE, header = TRUE)
namesAndAges <- select(df, "name", "age") namesAndAges <- select(df, "name", "age")
# $example off:manual_load_options_csv$ # $example off:manual_load_options_csv$
# $example on:manual_save_options_orc$
df <- read.df("examples/src/main/resources/users.orc", "orc")
write.orc(df, "users_with_options.orc", orc.bloom.filter.columns = "favorite_color", orc.dictionary.key.threshold = 1.0, orc.column.encoding.direct = "name")
# $example off:manual_save_options_orc$
# $example on:direct_sql$ # $example on:direct_sql$
df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")

Binary file not shown.

View file

@ -56,6 +56,13 @@ object SQLDataSourceExample {
.option("header", "true") .option("header", "true")
.load("examples/src/main/resources/people.csv") .load("examples/src/main/resources/people.csv")
// $example off:manual_load_options_csv$ // $example off:manual_load_options_csv$
// $example on:manual_save_options_orc$
usersDF.write.format("orc")
.option("orc.bloom.filter.columns", "favorite_color")
.option("orc.dictionary.key.threshold", "1.0")
.option("orc.column.encoding.direct", "name")
.save("users_with_options.orc")
// $example off:manual_save_options_orc$
// $example on:direct_sql$ // $example on:direct_sql$
val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")