From 4506dad8a9613d4b6b319c0240119927265a67c1 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 23 Oct 2018 12:41:20 -0700 Subject: [PATCH] [SPARK-25656][SQL][DOC][EXAMPLE] Add a doc and examples about extra data source options ## What changes were proposed in this pull request? Our current doc does not explain how we are passing the data source specific options to the underlying data source. According to [the review comment](https://github.com/apache/spark/pull/22622#discussion_r222911529), this PR aims to add more detailed information and examples ## How was this patch tested? Manual. Closes #22801 from dongjoon-hyun/SPARK-25656. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- docs/sql-data-sources-load-save-functions.md | 44 ++++++++++++++++++ .../sql/JavaSQLDataSourceExample.java | 7 +++ examples/src/main/python/sql/datasource.py | 9 ++++ examples/src/main/r/RSparkSQLExample.R | 6 ++- examples/src/main/resources/users.orc | Bin 0 -> 547 bytes .../examples/sql/SQLDataSourceExample.scala | 7 +++ 6 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 examples/src/main/resources/users.orc diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md index e1dd0a3f54..e4c7b1766f 100644 --- a/docs/sql-data-sources-load-save-functions.md +++ b/docs/sql-data-sources-load-save-functions.md @@ -82,6 +82,50 @@ To load a CSV file you can use: +The extra options are also used during write operation. +For example, you can control bloom filters and dictionary encodings for ORC data sources. +The following ORC example will create bloom filter and use dictionary encoding only for `favorite_color`. +For Parquet, there exists `parquet.enable.dictionary`, too. +To find more detailed information about the extra ORC/Parquet options, +visit the official Apache ORC/Parquet websites. + +
+ +
+{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} +
+ +
+{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} +
+ +
+{% include_example manual_save_options_orc python/sql/datasource.py %} +
+ +
+{% include_example manual_save_options_orc r/RSparkSQLExample.R %} +
+ +
+ +{% highlight sql %} +CREATE TABLE users_with_options ( + name STRING, + favorite_color STRING, + favorite_numbers array +) USING ORC +OPTIONS ( + orc.bloom.filter.columns 'favorite_color', + orc.dictionary.key.threshold '1.0', + orc.column.encoding.direct 'name' +) +{% endhighlight %} + +
+ +
+ ### Run SQL on files directly Instead of using read API to load a file into DataFrame and query it, you can also query that diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java index ef3c904775..cbe9dfdaa9 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java @@ -123,6 +123,13 @@ public class JavaSQLDataSourceExample { .option("header", "true") .load("examples/src/main/resources/people.csv"); // $example off:manual_load_options_csv$ + // $example on:manual_save_options_orc$ + usersDF.write().format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc"); + // $example off:manual_save_options_orc$ // $example on:direct_sql$ Dataset sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py index d8c879dfe0..04660724b3 100644 --- a/examples/src/main/python/sql/datasource.py +++ b/examples/src/main/python/sql/datasource.py @@ -57,6 +57,15 @@ def basic_datasource_example(spark): format="csv", sep=":", inferSchema="true", header="true") # $example off:manual_load_options_csv$ + # $example on:manual_save_options_orc$ + df = spark.read.orc("examples/src/main/resources/users.orc") + (df.write.format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc")) + # $example off:manual_save_options_orc$ + # $example on:write_sorting_and_bucketing$ df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed") # $example off:write_sorting_and_bucketing$ diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index effba948e5..196a110f35 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -114,10 +114,14 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet") # $example on:manual_load_options_csv$ -df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T) +df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferSchema = TRUE, header = TRUE) namesAndAges <- select(df, "name", "age") # $example off:manual_load_options_csv$ +# $example on:manual_save_options_orc$ +df <- read.df("examples/src/main/resources/users.orc", "orc") +write.orc(df, "users_with_options.orc", orc.bloom.filter.columns = "favorite_color", orc.dictionary.key.threshold = 1.0, orc.column.encoding.direct = "name") +# $example off:manual_save_options_orc$ # $example on:direct_sql$ df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") diff --git a/examples/src/main/resources/users.orc b/examples/src/main/resources/users.orc new file mode 100644 index 0000000000000000000000000000000000000000..12478a5d03c26cb30b35af232a5764e076eaab1f GIT binary patch literal 547 zcmZ`#Jxc>Y5S`t<-He+fZZ-y&D1Mwxz(Or-4vQp$h^RSIU8P1nQP2b~QLz($LH>cQ z{tF8ce~yK{&c!BZT$uM}cG)*?rrFvo0%&DDwa5Zri!?d488{WO8PdpWktqx{m#HrO)INGvp)yr>5J3sxN1B9l z09)*Y`hL|YB_YBFybP~vd4M;e{Nxp&KdZC<;PMiYxg|pG772wj63;#7=$#qnd}2(&;MDi2oBw}Np|@jC6Rq*6F*-*nT9esXxyz3iqHb5=Cf&h^!ClJ)|Q zIq7{gBx=h%s>CV}haSWKJceUEhKjAnu?l}#tPS?J0iPHx>i*sY9Qt^a{vGU literal 0 HcmV?d00001 diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala index 7d83aacb11..18615d9b9b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala @@ -56,6 +56,13 @@ object SQLDataSourceExample { .option("header", "true") .load("examples/src/main/resources/people.csv") // $example off:manual_load_options_csv$ + // $example on:manual_save_options_orc$ + usersDF.write.format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc") + // $example off:manual_save_options_orc$ // $example on:direct_sql$ val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")