[SPARK-33690][SQL] Escape meta-characters in showString

### What changes were proposed in this pull request?

This PR escapes meta-characters (e.g., `\n` and `\t`) in `Dataset.showString` so that values containing them no longer break the table layout.
Before this PR:
```
scala> Seq("aaa\nbbb\t\tccccc").toDF("value").show()
+--------------+
|         value|
+--------------+
|aaa
bbb		ccccc|
+--------------+
```
After this PR:
```
+-----------------+
|            value|
+-----------------+
|aaa\nbbb\t\tccccc|
+-----------------+
```
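
For reference, the escaping itself is a single `replaceAll` per rendered cell (see the `Dataset` hunk further down). Below is a minimal standalone sketch of that transform; the `escapeMeta` helper name is hypothetical and not part of the Spark API:

```scala
object EscapeMetaSketch {
  // Hypothetical helper, not part of the Spark API.
  def escapeMeta(cell: String): String =
    // The Scala literal "\\\\n" is the two-backslash sequence \\n; the regex
    // replacement engine turns it into a literal backslash followed by 'n'.
    cell.replaceAll("\n", "\\\\n").replaceAll("\t", "\\\\t")

  def main(args: Array[String]): Unit = {
    println(escapeMeta("aaa\nbbb\t\tccccc")) // prints: aaa\nbbb\t\tccccc
  }
}
```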

### Why are the changes needed?

Values containing `\n` or `\t` currently break the row layout of `show()`, as in the "Before" example above; escaping them keeps each row on a single, aligned line.

### Does this PR introduce _any_ user-facing change?

Yes. The output of `show()` changes for values that contain `\n` or `\t` (see the example above), and the change is noted in the SQL migration guide.

### How was this patch tested?

Added a unit test.

Closes #30647 from maropu/EscapeMetaInShow.

Authored-by: Takeshi Yamamuro <yamamuro@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
Commit 8197ee3b15 (parent 45af3c9688), committed by Dongjoon Hyun on 2020-12-13 15:04:23 -08:00.
4 changed files with 47 additions and 5 deletions

@@ -26,6 +26,8 @@ license: |
 - In Spark 3.2, `spark.sql.adaptive.enabled` is enabled by default. To restore the behavior before Spark 3.2, you can set `spark.sql.adaptive.enabled` to `false`.
+- In Spark 3.2, the meta-characters `\n` and `\t` are escaped in the `show()` action. In Spark 3.1 or earlier, the two metacharacters are output as it is.
 
 ## Upgrading from Spark SQL 3.0 to 3.1
 
 - In Spark 3.1, statistical aggregation function includes `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, `corr` will return `NULL` instead of `Double.NaN` when `DivideByZero` occurs during expression evaluation, for example, when `stddev_samp` applied on a single element set. In Spark version 3.0 and earlier, it will return `Double.NaN` in such case. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`.

@@ -308,7 +308,9 @@ class Dataset[T] private[sql](
       val str = cell match {
         case null => "null"
         case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
-        case _ => cell.toString
+        case _ =>
+          // Escapes meta-characters not to break the `showString` format
+          cell.toString.replaceAll("\n", "\\\\n").replaceAll("\t", "\\\\t")
       }
       if (truncate > 0 && str.length > truncate) {
         // do not show ellipses for strings shorter than 4 characters.

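One detail worth noting in the hunk above: the escaping happens before the `truncate` length check, so that check sees the escaped form (a newline now counts as the two characters `\n`). The sketch below mirrors that ordering; the `render` helper and its ellipsis rule are assumptions based on the comment in the hunk, not the actual `Dataset` internals:

```scala
object TruncateAfterEscapeSketch {
  // Illustrative only: escape first, then truncate on the escaped length,
  // mirroring the order of operations shown in the hunk above.
  def render(cell: String, truncate: Int): String = {
    val str = cell.replaceAll("\n", "\\\\n").replaceAll("\t", "\\\\t")
    if (truncate > 0 && str.length > truncate) {
      // Assumed ellipsis rule, per the "do not show ellipses for strings
      // shorter than 4 characters" comment in the hunk.
      if (truncate < 4) str.substring(0, truncate)
      else str.substring(0, truncate - 3) + "..."
    } else {
      str
    }
  }

  def main(args: Array[String]): Unit = {
    // "aaa\nbbb" escapes to the 8-character text aaa\nbbb, so it gets truncated.
    println(render("aaa\nbbb", truncate = 6)) // prints: aaa...
  }
}
```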

@@ -1235,6 +1235,44 @@ class DataFrameSuite extends QueryTest
     assert(df.showString(10, vertical = true) === expectedAnswer)
   }
 
+  test("SPARK-33690: showString: escape meta-characters") {
+    val df1 = Seq("aaa\nbbb\tccc").toDF("value")
+    assert(df1.showString(1, truncate = 0) ===
+      """+-------------+
+        ||value        |
+        |+-------------+
+        ||aaa\nbbb\tccc|
+        |+-------------+
+        |""".stripMargin)
+
+    val df2 = Seq(Seq("aaa\nbbb\tccc")).toDF("value")
+    assert(df2.showString(1, truncate = 0) ===
+      """+---------------+
+        ||value          |
+        |+---------------+
+        ||[aaa\nbbb\tccc]|
+        |+---------------+
+        |""".stripMargin)
+
+    val df3 = Seq(Map("aaa\nbbb\tccc" -> "aaa\nbbb\tccc")).toDF("value")
+    assert(df3.showString(1, truncate = 0) ===
+      """+--------------------------------+
+        ||value                           |
+        |+--------------------------------+
+        ||{aaa\nbbb\tccc -> aaa\nbbb\tccc}|
+        |+--------------------------------+
+        |""".stripMargin)
+
+    val df4 = Seq("aaa\nbbb\tccc").toDF("value").selectExpr("named_struct('v', value)")
+    assert(df4.showString(1, truncate = 0) ===
+      """+----------------------+
+        ||named_struct(v, value)|
+        |+----------------------+
+        ||{aaa\nbbb\tccc}       |
+        |+----------------------+
+        |""".stripMargin)
+  }
+
   test("SPARK-7319 showString") {
     val expectedAnswer = """+---+-----+
                            ||key|value|

@@ -261,11 +261,11 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
       "PartitionFilters: \\[isnotnull\\(k#xL\\), dynamicpruningexpression\\(k#xL " +
         "IN subquery#x\\)\\]"
     val expected_pattern3 =
-      "Location: InMemoryFileIndex \\[.*org.apache.spark.sql.ExplainSuite" +
-        "/df2/.*, ... 99 entries\\]"
+      "Location: InMemoryFileIndex \\[\\S*org.apache.spark.sql.ExplainSuite" +
+        "/df2/\\S*, ... 99 entries\\]"
     val expected_pattern4 =
-      "Location: InMemoryFileIndex \\[.*org.apache.spark.sql.ExplainSuite" +
-        "/df1/.*, ... 999 entries\\]"
+      "Location: InMemoryFileIndex \\[\\S*org.apache.spark.sql.ExplainSuite" +
+        "/df1/\\S*, ... 999 entries\\]"
     withNormalizedExplain(sqlText) { normalizedOutput =>
       assert(expected_pattern1.r.findAllMatchIn(normalizedOutput).length == 1)
       assert(expected_pattern2.r.findAllMatchIn(normalizedOutput).length == 1)
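
The test-only change above replaces `.*` with `\S*` in the location patterns. As a self-contained illustration of the difference (using a hypothetical input line, not real explain output): `\S*` stops at whitespace, while `.*` matches across any other tokens on the same line.

```scala
object LocationPatternSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical line: a space appears inside the bracketed location.
    val line = "Location: InMemoryFileIndex [file:/tmp/extra token " +
      "org.apache.spark.sql.ExplainSuite/df2/part-0, ... 99 entries]"
    val loose =
      "InMemoryFileIndex \\[.*org.apache.spark.sql.ExplainSuite/df2/.*, ... 99 entries\\]".r
    val strict =
      "InMemoryFileIndex \\[\\S*org.apache.spark.sql.ExplainSuite/df2/\\S*, ... 99 entries\\]".r
    println(loose.findAllMatchIn(line).length)  // 1: .* crosses the space before "org..."
    println(strict.findAllMatchIn(line).length) // 0: \S* cannot cross whitespace
  }
}
```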