[SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames
Reduced the `take` size used when collecting pair counts for the contingency table from 1e8 to 1e6.
cc rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5900 from brkyvz/df-cont-followup and squashes the following commits:
c11e762 [Burak Yavuz] fix grammar
b30ace2 [Burak Yavuz] address comments
a417ba5 [Burak Yavuz] [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames
(cherry picked from commit 18340d7be5)
Signed-off-by: Reynold Xin <rxin@databricks.com>
This commit is contained in:
parent
29350eef30
commit
598902b549
|
@ -934,10 +934,11 @@ class DataFrame(object):
|
||||||
def crosstab(self, col1, col2):
|
def crosstab(self, col1, col2):
|
||||||
"""
|
"""
|
||||||
Computes a pair-wise frequency table of the given columns. Also known as a contingency
|
Computes a pair-wise frequency table of the given columns. Also known as a contingency
|
||||||
table. The number of distinct values for each column should be less than 1e4. The first
|
table. The number of distinct values for each column should be less than 1e4. At most 1e6
|
||||||
column of each row will be the distinct values of `col1` and the column names will be the
|
non-zero pair frequencies will be returned.
|
||||||
distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
|
The first column of each row will be the distinct values of `col1` and the column names
|
||||||
have no occurrences will have `null` as their counts.
|
will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
|
||||||
|
Pairs that have no occurrences will have `null` as their counts.
|
||||||
:func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
|
:func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
|
||||||
|
|
||||||
:param col1: The name of the first column. Distinct items will make the first item of
|
:param col1: The name of the first column. Distinct items will make the first item of
|
||||||
|
|
|
@ -65,10 +65,11 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
|
* Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
|
||||||
* The number of distinct values for each column should be less than 1e4. The first
|
* The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero
|
||||||
* column of each row will be the distinct values of `col1` and the column names will be the
|
* pair frequencies will be returned.
|
||||||
* distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be
|
* The first column of each row will be the distinct values of `col1` and the column names will
|
||||||
* returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
|
* be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
|
||||||
|
* will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
|
||||||
*
|
*
|
||||||
* @param col1 The name of the first column. Distinct items will make the first item of
|
* @param col1 The name of the first column. Distinct items will make the first item of
|
||||||
* each row.
|
* each row.
|
||||||
|
|
|
@ -102,9 +102,9 @@ private[sql] object StatFunctions extends Logging {
|
||||||
/** Generate a table of frequencies for the elements of two columns. */
|
/** Generate a table of frequencies for the elements of two columns. */
|
||||||
private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
|
private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
|
||||||
val tableName = s"${col1}_$col2"
|
val tableName = s"${col1}_$col2"
|
||||||
val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt)
|
val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e6.toInt)
|
||||||
if (counts.length == 1e8.toInt) {
|
if (counts.length == 1e6.toInt) {
|
||||||
logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " +
|
logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " +
|
||||||
"the pairs. Please try reducing the amount of distinct items in your columns.")
|
"the pairs. Please try reducing the amount of distinct items in your columns.")
|
||||||
}
|
}
|
||||||
// get the distinct values of column 2, so that we can make them the column names
|
// get the distinct values of column 2, so that we can make them the column names
|
||||||
|
|
Loading…
Reference in a new issue