[SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames

Reduced take size from 1e8 to 1e6.

cc rxin

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #5900 from brkyvz/df-cont-followup and squashes the following commits:

c11e762 [Burak Yavuz] fix grammar
b30ace2 [Burak Yavuz] address comments
a417ba5 [Burak Yavuz] [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames

(cherry picked from commit 18340d7be5)
Signed-off-by: Reynold Xin <rxin@databricks.com>
2015-05-05 11:01:25 -07:00 · 2015-05-05 11:01:25 -07:00 · 598902b549
parent 29350eef30
commit 598902b549
3 changed files with 13 additions and 11 deletions
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -934,10 +934,11 @@ class DataFrame(object):
    def crosstab(self, col1, col2):
        """
        Computes a pair-wise frequency table of the given columns. Also known as a contingency
-        table. The number of distinct values for each column should be less than 1e4. The first
+        table. The number of distinct values for each column should be less than 1e4. At most 1e6
-        column of each row will be the distinct values of `col1` and the column names will be the
+        non-zero pair frequencies will be returned.
-        distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
+        The first column of each row will be the distinct values of `col1` and the column names
-        have no occurrences will have `null` as their counts.
+        will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
+        Pairs that have no occurrences will have `null` as their counts.
        :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
        :param col1: The name of the first column. Distinct items will make the first item of
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -65,10 +65,11 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
  /**
   * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
-   * The number of distinct values for each column should be less than 1e4. The first
+   * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero
-   * column of each row will be the distinct values of `col1` and the column names will be the
+   * pair frequencies will be returned.
-   * distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be
+   * The first column of each row will be the distinct values of `col1` and the column names will
-   * returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
+   * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
+   * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
   *
   * @param col1 The name of the first column. Distinct items will make the first item of
   *             each row.
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -102,9 +102,9 @@ private[sql] object StatFunctions extends Logging {
  /** Generate a table of frequencies for the elements of two columns. */
  private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
    val tableName = s"${col1}_$col2"
-    val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt)
+    val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e6.toInt)
-    if (counts.length == 1e8.toInt) {
+    if (counts.length == 1e6.toInt) {
-      logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " +
+      logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " +
        "the pairs. Please try reducing the amount of distinct items in your columns.")
    }
    // get the distinct values of column 2, so that we can make them the column names