[SPARK-7319][SQL] Improve the output from DataFrame.show()
Author: 云峤 <chensong.cs@alibaba-inc.com>
Closes #5865 from kaka1992/df.show and squashes the following commits:
c79204b [云峤] Update
a1338f6 [云峤] Update python dataFrame show test and add empty df unit test.
734369c [云峤] Update python dataFrame show test and add empty df unit test.
84aec3e [云峤] Update python dataFrame show test and add empty df unit test.
159b3d5 [云峤] update
03ef434 [云峤] update
7394fd5 [云峤] update test show
ced487a [云峤] update pep8
b6e690b [云峤] Merge remote-tracking branch 'upstream/master' into df.show
30ac311 [云峤] [SPARK-7294] ADD BETWEEN
7d62368 [云峤] [SPARK-7294] ADD BETWEEN
baf839b [云峤] [SPARK-7294] ADD BETWEEN
d11d5b9 [云峤] [SPARK-7294] ADD BETWEEN
(cherry picked from commit f32e69ecc3
)
Signed-off-by: Reynold Xin <rxin@databricks.com>
This commit is contained in:
parent
893b3103fe
commit
34edaa8ac2
|
@ -167,7 +167,7 @@ setMethod("isLocal",
|
||||||
setMethod("showDF",
|
setMethod("showDF",
|
||||||
signature(x = "DataFrame"),
|
signature(x = "DataFrame"),
|
||||||
function(x, numRows = 20) {
|
function(x, numRows = 20) {
|
||||||
cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
|
callJMethod(x@sdf, "showString", numToInt(numRows))
|
||||||
})
|
})
|
||||||
|
|
||||||
#' show
|
#' show
|
||||||
|
|
|
@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {
|
||||||
|
|
||||||
test_that("showDF()", {
|
test_that("showDF()", {
|
||||||
df <- jsonFile(sqlCtx, jsonPath)
|
df <- jsonFile(sqlCtx, jsonPath)
|
||||||
expect_output(showDF(df), "age name \nnull Michael\n30 Andy \n19 Justin ")
|
expect_output(showDF(df), "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("isLocal()", {
|
test_that("isLocal()", {
|
||||||
|
|
|
@ -275,9 +275,12 @@ class DataFrame(object):
|
||||||
>>> df
|
>>> df
|
||||||
DataFrame[age: int, name: string]
|
DataFrame[age: int, name: string]
|
||||||
>>> df.show()
|
>>> df.show()
|
||||||
age name
|
+---+-----+
|
||||||
2 Alice
|
|age| name|
|
||||||
5 Bob
|
+---+-----+
|
||||||
|
| 2|Alice|
|
||||||
|
| 5| Bob|
|
||||||
|
+---+-----+
|
||||||
"""
|
"""
|
||||||
print(self._jdf.showString(n))
|
print(self._jdf.showString(n))
|
||||||
|
|
||||||
|
@ -591,12 +594,15 @@ class DataFrame(object):
|
||||||
given, this function computes statistics for all numerical columns.
|
given, this function computes statistics for all numerical columns.
|
||||||
|
|
||||||
>>> df.describe().show()
|
>>> df.describe().show()
|
||||||
summary age
|
+-------+---+
|
||||||
count 2
|
|summary|age|
|
||||||
mean 3.5
|
+-------+---+
|
||||||
stddev 1.5
|
| count| 2|
|
||||||
min 2
|
| mean|3.5|
|
||||||
max 5
|
| stddev|1.5|
|
||||||
|
| min| 2|
|
||||||
|
| max| 5|
|
||||||
|
+-------+---+
|
||||||
"""
|
"""
|
||||||
jdf = self._jdf.describe(self._jseq(cols))
|
jdf = self._jdf.describe(self._jseq(cols))
|
||||||
return DataFrame(jdf, self.sql_ctx)
|
return DataFrame(jdf, self.sql_ctx)
|
||||||
|
@ -801,12 +807,18 @@ class DataFrame(object):
|
||||||
:param subset: optional list of column names to consider.
|
:param subset: optional list of column names to consider.
|
||||||
|
|
||||||
>>> df4.dropna().show()
|
>>> df4.dropna().show()
|
||||||
age height name
|
+---+------+-----+
|
||||||
10 80 Alice
|
|age|height| name|
|
||||||
|
+---+------+-----+
|
||||||
|
| 10| 80|Alice|
|
||||||
|
+---+------+-----+
|
||||||
|
|
||||||
>>> df4.na.drop().show()
|
>>> df4.na.drop().show()
|
||||||
age height name
|
+---+------+-----+
|
||||||
10 80 Alice
|
|age|height| name|
|
||||||
|
+---+------+-----+
|
||||||
|
| 10| 80|Alice|
|
||||||
|
+---+------+-----+
|
||||||
"""
|
"""
|
||||||
if how is not None and how not in ['any', 'all']:
|
if how is not None and how not in ['any', 'all']:
|
||||||
raise ValueError("how ('" + how + "') should be 'any' or 'all'")
|
raise ValueError("how ('" + how + "') should be 'any' or 'all'")
|
||||||
|
@ -837,25 +849,34 @@ class DataFrame(object):
|
||||||
then the non-string column is simply ignored.
|
then the non-string column is simply ignored.
|
||||||
|
|
||||||
>>> df4.fillna(50).show()
|
>>> df4.fillna(50).show()
|
||||||
age height name
|
+---+------+-----+
|
||||||
10 80 Alice
|
|age|height| name|
|
||||||
5 50 Bob
|
+---+------+-----+
|
||||||
50 50 Tom
|
| 10| 80|Alice|
|
||||||
50 50 null
|
| 5| 50| Bob|
|
||||||
|
| 50| 50| Tom|
|
||||||
|
| 50| 50| null|
|
||||||
|
+---+------+-----+
|
||||||
|
|
||||||
>>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
|
>>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
|
||||||
age height name
|
+---+------+-------+
|
||||||
10 80 Alice
|
|age|height| name|
|
||||||
5 null Bob
|
+---+------+-------+
|
||||||
50 null Tom
|
| 10| 80| Alice|
|
||||||
50 null unknown
|
| 5| null| Bob|
|
||||||
|
| 50| null| Tom|
|
||||||
|
| 50| null|unknown|
|
||||||
|
+---+------+-------+
|
||||||
|
|
||||||
>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
|
>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
|
||||||
age height name
|
+---+------+-------+
|
||||||
10 80 Alice
|
|age|height| name|
|
||||||
5 null Bob
|
+---+------+-------+
|
||||||
50 null Tom
|
| 10| 80| Alice|
|
||||||
50 null unknown
|
| 5| null| Bob|
|
||||||
|
| 50| null| Tom|
|
||||||
|
| 50| null|unknown|
|
||||||
|
+---+------+-------+
|
||||||
"""
|
"""
|
||||||
if not isinstance(value, (float, int, long, basestring, dict)):
|
if not isinstance(value, (float, int, long, basestring, dict)):
|
||||||
raise ValueError("value should be a float, int, long, string, or dict")
|
raise ValueError("value should be a float, int, long, string, or dict")
|
||||||
|
@ -1241,11 +1262,17 @@ class Column(object):
|
||||||
|
|
||||||
>>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
|
>>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
|
||||||
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
|
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
|
||||||
l[0] d[key]
|
+----+------+
|
||||||
1 value
|
|l[0]|d[key]|
|
||||||
|
+----+------+
|
||||||
|
| 1| value|
|
||||||
|
+----+------+
|
||||||
>>> df.select(df.l[0], df.d["key"]).show()
|
>>> df.select(df.l[0], df.d["key"]).show()
|
||||||
l[0] d[key]
|
+----+------+
|
||||||
1 value
|
|l[0]|d[key]|
|
||||||
|
+----+------+
|
||||||
|
| 1| value|
|
||||||
|
+----+------+
|
||||||
"""
|
"""
|
||||||
return self[key]
|
return self[key]
|
||||||
|
|
||||||
|
@ -1255,11 +1282,17 @@ class Column(object):
|
||||||
>>> from pyspark.sql import Row
|
>>> from pyspark.sql import Row
|
||||||
>>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
|
>>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
|
||||||
>>> df.select(df.r.getField("b")).show()
|
>>> df.select(df.r.getField("b")).show()
|
||||||
r.b
|
+---+
|
||||||
b
|
|r.b|
|
||||||
|
+---+
|
||||||
|
| b|
|
||||||
|
+---+
|
||||||
>>> df.select(df.r.a).show()
|
>>> df.select(df.r.a).show()
|
||||||
r.a
|
+---+
|
||||||
1
|
|r.a|
|
||||||
|
+---+
|
||||||
|
| 1|
|
||||||
|
+---+
|
||||||
"""
|
"""
|
||||||
return Column(self._jc.getField(name))
|
return Column(self._jc.getField(name))
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.spark.sql
|
||||||
import java.io.CharArrayWriter
|
import java.io.CharArrayWriter
|
||||||
import java.sql.DriverManager
|
import java.sql.DriverManager
|
||||||
|
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConversions._
|
||||||
import scala.language.implicitConversions
|
import scala.language.implicitConversions
|
||||||
import scala.reflect.ClassTag
|
import scala.reflect.ClassTag
|
||||||
|
@ -28,6 +29,7 @@ import scala.util.control.NonFatal
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonFactory
|
import com.fasterxml.jackson.core.JsonFactory
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils
|
||||||
import org.apache.spark.annotation.{DeveloperApi, Experimental}
|
import org.apache.spark.annotation.{DeveloperApi, Experimental}
|
||||||
import org.apache.spark.api.java.JavaRDD
|
import org.apache.spark.api.java.JavaRDD
|
||||||
import org.apache.spark.api.python.SerDeUtil
|
import org.apache.spark.api.python.SerDeUtil
|
||||||
|
@ -175,6 +177,7 @@ class DataFrame private[sql](
|
||||||
* @param numRows Number of rows to show
|
* @param numRows Number of rows to show
|
||||||
*/
|
*/
|
||||||
private[sql] def showString(numRows: Int): String = {
|
private[sql] def showString(numRows: Int): String = {
|
||||||
|
val sb = new StringBuilder
|
||||||
val data = take(numRows)
|
val data = take(numRows)
|
||||||
val numCols = schema.fieldNames.length
|
val numCols = schema.fieldNames.length
|
||||||
|
|
||||||
|
@ -194,12 +197,25 @@ class DataFrame private[sql](
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pad the cells
|
// Create SeparateLine
|
||||||
rows.map { row =>
|
val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
|
||||||
row.zipWithIndex.map { case (cell, i) =>
|
|
||||||
String.format(s"%-${colWidths(i)}s", cell)
|
// column names
|
||||||
}.mkString(" ")
|
rows.head.zipWithIndex.map { case (cell, i) =>
|
||||||
}.mkString("\n")
|
StringUtils.leftPad(cell.toString, colWidths(i))
|
||||||
|
}.addString(sb, "|", "|", "|\n")
|
||||||
|
|
||||||
|
sb.append(sep)
|
||||||
|
|
||||||
|
// data
|
||||||
|
rows.tail.map {
|
||||||
|
_.zipWithIndex.map { case (cell, i) =>
|
||||||
|
StringUtils.leftPad(cell.toString, colWidths(i))
|
||||||
|
}.addString(sb, "|", "|", "|\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
sb.append(sep)
|
||||||
|
sb.toString()
|
||||||
}
|
}
|
||||||
|
|
||||||
override def toString: String = {
|
override def toString: String = {
|
||||||
|
|
|
@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
|
||||||
testData.select($"*").show(1000)
|
testData.select($"*").show(1000)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("SPARK-7319 showString") {
|
||||||
|
val expectedAnswer = """+---+-----+
|
||||||
|
||key|value|
|
||||||
|
|+---+-----+
|
||||||
|
|| 1| 1|
|
||||||
|
|+---+-----+
|
||||||
|
|""".stripMargin
|
||||||
|
assert(testData.select($"*").showString(1) === expectedAnswer)
|
||||||
|
}
|
||||||
|
|
||||||
|
test("SPARK-7327 show with empty dataFrame") {
|
||||||
|
val expectedAnswer = """+---+-----+
|
||||||
|
||key|value|
|
||||||
|
|+---+-----+
|
||||||
|
|+---+-----+
|
||||||
|
|""".stripMargin
|
||||||
|
assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
|
||||||
|
}
|
||||||
|
|
||||||
test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
|
test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
|
||||||
val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
|
val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
|
||||||
val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
|
val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
|
||||||
|
|
Loading…
Reference in a new issue