[SPARK-7462][SQL] Update documentation for retaining grouping columns in DataFrames.
Author: Reynold Xin <rxin@databricks.com>
Closes #6062 from rxin/agg-retain-doc and squashes the following commits:
43e511e [Reynold Xin] [SPARK-7462][SQL] Update documentation for retaining grouping columns in DataFrames.
(cherry picked from commit 3a9b6997df)
Signed-off-by: Reynold Xin <rxin@databricks.com>
This commit is contained in:
parent
0dbfe16814
commit
eaa6116200
|
@ -1594,6 +1594,64 @@ options.
|
||||||
|
|
||||||
# Migration Guide
|
# Migration Guide
|
||||||
|
|
||||||
|
## Upgrading from Spark SQL 1.3 to 1.4
|
||||||
|
|
||||||
|
Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
|
||||||
|
|
||||||
|
<div class="codetabs">
|
||||||
|
<div data-lang="scala" markdown="1">
|
||||||
|
{% highlight scala %}
|
||||||
|
|
||||||
|
// In 1.3.x, in order for the grouping column "department" to show up,
|
||||||
|
// it must be included explicitly as part of the agg function call.
|
||||||
|
df.groupBy("department").agg($"department", max("age"), sum("expense"))
|
||||||
|
|
||||||
|
// In 1.4+, grouping column "department" is included automatically.
|
||||||
|
df.groupBy("department").agg(max("age"), sum("expense"))
|
||||||
|
|
||||||
|
// Revert to 1.3 behavior (not retaining grouping column) by:
|
||||||
|
sqlContext.setConf("spark.sql.retainGroupColumns", "false")
|
||||||
|
|
||||||
|
{% endhighlight %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="java" markdown="1">
|
||||||
|
{% highlight java %}
|
||||||
|
|
||||||
|
// In 1.3.x, in order for the grouping column "department" to show up,
|
||||||
|
// it must be included explicitly as part of the agg function call.
|
||||||
|
df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
|
||||||
|
|
||||||
|
// In 1.4+, grouping column "department" is included automatically.
|
||||||
|
df.groupBy("department").agg(max("age"), sum("expense"));
|
||||||
|
|
||||||
|
// Revert to 1.3 behavior (not retaining grouping column) by:
|
||||||
|
sqlContext.setConf("spark.sql.retainGroupColumns", "false");
|
||||||
|
|
||||||
|
{% endhighlight %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="python" markdown="1">
|
||||||
|
{% highlight python %}
|
||||||
|
|
||||||
|
import pyspark.sql.functions as func
|
||||||
|
|
||||||
|
# In 1.3.x, in order for the grouping column "department" to show up,
|
||||||
|
# it must be included explicitly as part of the agg function call.
|
||||||
|
df.groupBy("department").agg(df["department"], func.max("age"), func.sum("expense"))
|
||||||
|
|
||||||
|
# In 1.4+, grouping column "department" is included automatically.
|
||||||
|
df.groupBy("department").agg(func.max("age"), func.sum("expense"))
|
||||||
|
|
||||||
|
# Revert to 1.3.x behavior (not retaining grouping column) by:
|
||||||
|
sqlContext.setConf("spark.sql.retainGroupColumns", "false")
|
||||||
|
|
||||||
|
{% endhighlight %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
## Upgrading from Spark SQL 1.0-1.2 to 1.3
|
## Upgrading from Spark SQL 1.0-1.2 to 1.3
|
||||||
|
|
||||||
In Spark 1.3 we removed the "Alpha" label from Spark SQL and as part of this did a cleanup of the
|
In Spark 1.3 we removed the "Alpha" label from Spark SQL and as part of this did a cleanup of the
|
||||||
|
@ -1651,7 +1709,7 @@ moved into the udf object in `SQLContext`.
|
||||||
|
|
||||||
<div class="codetabs">
|
<div class="codetabs">
|
||||||
<div data-lang="scala" markdown="1">
|
<div data-lang="scala" markdown="1">
|
||||||
{% highlight java %}
|
{% highlight scala %}
|
||||||
|
|
||||||
sqlContext.udf.register("strLen", (s: String) => s.length())
|
sqlContext.udf.register("strLen", (s: String) => s.length())
|
||||||
|
|
||||||
|
|
|
@ -1228,12 +1228,14 @@ class Row(tuple):
|
||||||
raise AttributeError(item)
|
raise AttributeError(item)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
"""Returns a tuple so Python knows how to pickle Row."""
|
||||||
if hasattr(self, "__fields__"):
|
if hasattr(self, "__fields__"):
|
||||||
return (_create_row, (self.__fields__, tuple(self)))
|
return (_create_row, (self.__fields__, tuple(self)))
|
||||||
else:
|
else:
|
||||||
return tuple.__reduce__(self)
|
return tuple.__reduce__(self)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
"""Printable representation of Row used in Python REPL."""
|
||||||
if hasattr(self, "__fields__"):
|
if hasattr(self, "__fields__"):
|
||||||
return "Row(%s)" % ", ".join("%s=%r" % (k, v)
|
return "Row(%s)" % ", ".join("%s=%r" % (k, v)
|
||||||
for k, v in zip(self.__fields__, tuple(self)))
|
for k, v in zip(self.__fields__, tuple(self)))
|
||||||
|
|
|
@ -146,11 +146,21 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
|
||||||
*
|
*
|
||||||
* // Scala:
|
* // Scala:
|
||||||
* import org.apache.spark.sql.functions._
|
* import org.apache.spark.sql.functions._
|
||||||
* df.groupBy("department").agg($"department", max($"age"), sum($"expense"))
|
* df.groupBy("department").agg(max("age"), sum("expense"))
|
||||||
*
|
*
|
||||||
* // Java:
|
* // Java:
|
||||||
* import static org.apache.spark.sql.functions.*;
|
* import static org.apache.spark.sql.functions.*;
|
||||||
* df.groupBy("department").agg(col("department"), max(col("age")), sum(col("expense")));
|
* df.groupBy("department").agg(max("age"), sum("expense"));
|
||||||
|
* }}}
|
||||||
|
*
|
||||||
|
* Note that before Spark 1.4, the default behavior is to NOT retain grouping columns. To change
|
||||||
|
* to that behavior, set config variable `spark.sql.retainGroupColumns` to `false`.
|
||||||
|
* {{{
|
||||||
|
* // Scala, 1.3.x:
|
||||||
|
* df.groupBy("department").agg($"department", max("age"), sum("expense"))
|
||||||
|
*
|
||||||
|
* // Java, 1.3.x:
|
||||||
|
* df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
|
||||||
* }}}
|
* }}}
|
||||||
*/
|
*/
|
||||||
@scala.annotation.varargs
|
@scala.annotation.varargs
|
||||||
|
|
Loading…
Reference in a new issue