[SPARK-7319][SQL] Improve the output from DataFrame.show()

Author: 云峤 <chensong.cs@alibaba-inc.com>

Closes #5865 from kaka1992/df.show and squashes the following commits:

c79204b [云峤] Update
a1338f6 [云峤] Update python dataFrame show test and add empty df unit test.
734369c [云峤] Update python dataFrame show test and add empty df unit test.
84aec3e [云峤] Update python dataFrame show test and add empty df unit test.
159b3d5 [云峤] update
03ef434 [云峤] update
7394fd5 [云峤] update test show
ced487a [云峤] update pep8
b6e690b [云峤] Merge remote-tracking branch 'upstream/master' into df.show
30ac311 [云峤] [SPARK-7294] ADD BETWEEN
7d62368 [云峤] [SPARK-7294] ADD BETWEEN
baf839b [云峤] [SPARK-7294] ADD BETWEEN
d11d5b9 [云峤] [SPARK-7294] ADD BETWEEN

(cherry picked from commit f32e69ecc3)
Signed-off-by: Reynold Xin <rxin@databricks.com>
Authored by 云峤 on 2015-05-04 12:08:38 -07:00; committed by Reynold Xin
parent 893b3103fe
commit 34edaa8ac2
5 changed files with 112 additions and 44 deletions
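
In short, this change replaces the old whitespace-padded output of DataFrame.show() with an ASCII-bordered grid. A minimal sketch of the new format, assuming a spark-shell session where `sqlContext` and its implicits are available (the sample column names and rows are illustrative, not part of this commit):

    // Hypothetical spark-shell session showing the new show() layout.
    import sqlContext.implicits._

    val df = Seq((2, "Alice"), (5, "Bob")).toDF("age", "name")
    df.show()
    // +---+-----+
    // |age| name|
    // +---+-----+
    // |  2|Alice|
    // |  5|  Bob|
    // +---+-----+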


@@ -167,7 +167,7 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+            callJMethod(x@sdf, "showString", numToInt(numRows))
           })

 #' show


@@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {
 test_that("showDF()", {
   df <- jsonFile(sqlCtx, jsonPath)
-  expect_output(showDF(df), "age  name   \nnull Michael\n30   Andy   \n19   Justin ")
+  expect_output(showDF(df), "+----+-------+\n| age|   name|\n+----+-------+\n|null|Michael|\n|  30|   Andy|\n|  19| Justin|\n+----+-------+\n")
 })

 test_that("isLocal()", {


@@ -275,9 +275,12 @@ class DataFrame(object):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))
@@ -591,12 +594,15 @@ class DataFrame(object):
         given, this function computes statistics for all numerical columns.

         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ class DataFrame(object):
         :param subset: optional list of column names to consider.

         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+

         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ class DataFrame(object):
         then the non-string column is simply ignored.

         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+

         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+

         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1241,11 +1262,17 @@ class Column(object):
         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+

         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]
@@ -1255,11 +1282,17 @@ class Column(object):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+

         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))


@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.sql.DriverManager

 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal

 import com.fasterxml.jackson.core.JsonFactory
+import org.apache.commons.lang3.StringUtils

 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
    * @param numRows Number of rows to show
    */
   private[sql] def showString(numRows: Int): String = {
+    val sb = new StringBuilder
     val data = take(numRows)
     val numCols = schema.fieldNames.length
@@ -194,12 +197,25 @@ class DataFrame private[sql](
       }
     }

-    // Pad the cells
-    rows.map { row =>
-      row.zipWithIndex.map { case (cell, i) =>
-        String.format(s"%-${colWidths(i)}s", cell)
-      }.mkString(" ")
-    }.mkString("\n")
+    // Create SeparateLine
+    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+    // column names
+    rows.head.zipWithIndex.map { case (cell, i) =>
+      StringUtils.leftPad(cell.toString, colWidths(i))
+    }.addString(sb, "|", "|", "|\n")
+
+    sb.append(sep)
+
+    // data
+    rows.tail.map {
+      _.zipWithIndex.map { case (cell, i) =>
+        StringUtils.leftPad(cell.toString, colWidths(i))
+      }.addString(sb, "|", "|", "|\n")
+    }
+
+    sb.append(sep)
+    sb.toString()
   }

   override def toString: String = {
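
For reference, a self-contained sketch of the table-building approach used above: compute per-column widths, build a "+---+...+" separator line, right-align each cell by left-padding it, and append the padded rows with addString. The leftPad helper here is a stand-in for org.apache.commons.lang3.StringUtils.leftPad, and the sample rows are illustrative, not taken from this commit:

    // Standalone sketch mirroring the separator + leftPad idea in showString.
    object ShowStringSketch {
      // Stand-in for org.apache.commons.lang3.StringUtils.leftPad
      def leftPad(s: String, width: Int): String = " " * math.max(0, width - s.length) + s

      def main(args: Array[String]): Unit = {
        // First row is the header, the rest are data cells (already rendered as strings)
        val rows = Seq(Seq("age", "name"), Seq("2", "Alice"), Seq("5", "Bob"))
        // Each column is as wide as its widest cell
        val colWidths = rows.transpose.map(_.map(_.length).max)

        // Separator line such as +---+-----+
        val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")

        val sb = new StringBuilder
        sb.append(sep)
        // Header row, right-aligned within each column
        rows.head.zipWithIndex.map { case (cell, i) => leftPad(cell, colWidths(i)) }
          .addString(sb, "|", "|", "|\n")
        sb.append(sep)
        // Data rows
        rows.tail.foreach { row =>
          row.zipWithIndex.map { case (cell, i) => leftPad(cell, colWidths(i)) }
            .addString(sb, "|", "|", "|\n")
        }
        sb.append(sep)

        print(sb.toString())
        // +---+-----+
        // |age| name|
        // +---+-----+
        // |  2|Alice|
        // |  5|  Bob|
        // +---+-----+
      }
    }

Note that the committed code builds `sep` with addString on the shared StringBuilder, which also appends the first separator line as a side effect; the sketch appends it explicitly instead.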


@@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }

+  test("SPARK-7319 showString") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           ||  1|    1|
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").showString(1) === expectedAnswer)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
+  }
+
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
     val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))