From e8bf8fe213c0f66f6d32f845f4dc391fa5c530f3 Mon Sep 17 00:00:00 2001
From: Kousuke Saruta
Date: Thu, 29 Apr 2021 09:50:15 -0700
Subject: [PATCH] [SPARK-35047][SQL] Allow Json datasources to write non-ascii
 characters as codepoints

### What changes were proposed in this pull request?

This PR proposes to enable the JSON datasources to write non-ASCII characters as codepoints. To enable/disable this feature, I introduce a new option, `writeNonAsciiCharacterAsCodePoint`, for JSON datasources (see the usage sketch after the diff).

### Why are the changes needed?

The JSON specification allows `\uXXXX` codepoint escapes in string literals, but Spark SQL's JSON datasources provide no way to write them. Writing non-ASCII characters as codepoints is useful because it is a platform-neutral representation.

### Does this PR introduce _any_ user-facing change?

Yes. Users can write non-ASCII characters as codepoints with JSON datasources.

### How was this patch tested?

New test.

Closes #32147 from sarutak/json-unicode-write.

Authored-by: Kousuke Saruta
Signed-off-by: Dongjoon Hyun
---
 .../spark/sql/catalyst/json/JSONOptions.scala |  6 ++
 .../sql/catalyst/json/JacksonGenerator.scala  |  8 ++-
 .../datasources/json/JsonSuite.scala          | 55 +++++++++++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
index c145f26472..47be83a41d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -135,6 +135,12 @@ private[sql] class JSONOptions(
    */
   val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false)
 
+  /**
+   * Generates `\u0000`-style codepoint escapes for non-ASCII characters if this option is enabled.
+   */
+  val writeNonAsciiCharacterAsCodePoint: Boolean =
+    parameters.get("writeNonAsciiCharacterAsCodePoint").map(_.toBoolean).getOrElse(false)
+
   /** Build a Jackson [[JsonFactory]] using JSON options. */
   def buildJsonFactory(): JsonFactory = {
     new JsonFactoryBuilder()

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
index d70fceb0c3..8007c3bd99 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
@@ -74,7 +74,13 @@ private[sql] class JacksonGenerator(
 
   private val gen = {
     val generator = new JsonFactory().createGenerator(writer).setRootValueSeparator(null)
-    if (options.pretty) generator.setPrettyPrinter(new DefaultPrettyPrinter("")) else generator
+    if (options.pretty) {
+      generator.setPrettyPrinter(new DefaultPrettyPrinter(""))
+    }
+    if (options.writeNonAsciiCharacterAsCodePoint) {
+      generator.setHighestNonEscapedChar(0x7F)
+    }
+    generator
   }
 
   private val lineSeparator: String = options.lineSeparatorInWrite

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 870701151e..b707a48413 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2845,6 +2845,61 @@ abstract class JsonSuite
     }
   }
 
+  test("SPARK-35047: Write Non-ASCII character as codepoint") {
+    // scalastyle:off nonascii
+    withTempPaths(2) { paths =>
+      paths.foreach(_.delete())
+      val seq = Seq("a", "\n", "\u3042")
+      val df = seq.toDF
+
+      val basePath1 = paths(0).getCanonicalPath
+      df.write.option("writeNonAsciiCharacterAsCodePoint", "true")
+        .option("pretty", "false").json(basePath1)
+      val actualText1 = spark.read.option("wholetext", "true").text(basePath1)
+        .sort("value").map(_.getString(0)).collect().mkString
+      val expectedText1 =
+        s"""{"value":"\\n"}
+           |{"value":"\\u3042"}
+           |{"value":"a"}
+           |""".stripMargin
+      assert(actualText1 === expectedText1)
+
+      val actualJson1 = spark.read.json(basePath1)
+        .sort("value").map(_.getString(0)).collect().mkString
+      val expectedJson1 = "\na\u3042"
+      assert(actualJson1 === expectedJson1)
+
+      // Test for pretty printed JSON.
+      // If the multiLine option is set to true, the format should be
+      // one JSON record per file, so LEAF_NODE_DEFAULT_PARALLELISM is set here.
+      withSQLConf(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key -> s"${seq.length}") {
+        val basePath2 = paths(1).getCanonicalPath
+        df.write.option("writeNonAsciiCharacterAsCodePoint", "true")
+          .option("pretty", "true").json(basePath2)
+        val actualText2 = spark.read.option("wholetext", "true").text(basePath2)
+          .sort("value").map(_.getString(0)).collect().mkString
+        val expectedText2 =
+          s"""{
+             |  "value" : "\\n"
+             |}
+             |{
+             |  "value" : "\\u3042"
+             |}
+             |{
+             |  "value" : "a"
+             |}
+             |""".stripMargin
+        assert(actualText2 === expectedText2)
+
+        val actualJson2 = spark.read.option("multiLine", "true").json(basePath2)
+          .sort("value").map(_.getString(0)).collect().mkString
+        val expectedJson2 = "\na\u3042"
+        assert(actualJson2 === expectedJson2)
+      }
+    }
+    // scalastyle:on nonascii
+  }
+
   test("SPARK-35104: Fix wrong indentation for multiple JSON even if `pretty` option is true") {
     withSQLConf(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key -> "1") {
       withTempPath { path =>
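
For reference, here is a minimal, self-contained sketch of how the new option would be used from the DataFrame writer API; the object name, master setting, and output path are illustrative assumptions, not part of the patch:

```scala
import org.apache.spark.sql.SparkSession

object WriteNonAsciiAsCodePointExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("WriteNonAsciiAsCodePointExample")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // "\u3042" is HIRAGANA LETTER A, the same character the new test uses.
    val df = Seq("a", "\u3042").toDF("value")

    // With writeNonAsciiCharacterAsCodePoint enabled, the second record is
    // written as {"value":"\u3042"} instead of containing the raw character.
    df.write
      .option("writeNonAsciiCharacterAsCodePoint", "true")
      .json("/tmp/json-codepoints") // illustrative output path

    spark.stop()
  }
}
```

With the option left at its default (`false`), the same write emits the raw UTF-8 character.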
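The write path relies on Jackson's `JsonGenerator.setHighestNonEscapedChar`, which escapes every character above the given code point as `\uXXXX`. Below is a standalone sketch of that behavior, assuming only `jackson-core` 2.x on the classpath (object and variable names are illustrative):

```scala
import java.io.StringWriter

import com.fasterxml.jackson.core.JsonFactory

object HighestNonEscapedCharDemo {
  def main(args: Array[String]): Unit = {
    val writer = new StringWriter()
    val gen = new JsonFactory().createGenerator(writer)

    // Request that every character whose code point is above U+007F
    // (i.e. anything outside 7-bit ASCII) be emitted as a \uXXXX escape.
    gen.setHighestNonEscapedChar(0x7F)

    gen.writeStartObject()
    gen.writeStringField("value", "\u3042")
    gen.writeEndObject()
    gen.close()

    // Expected output: {"value":"\u3042"}
    println(writer.toString)
  }
}
```

Passing `0x7F` means exactly the 7-bit ASCII range passes through unescaped, which matches the option's name, `writeNonAsciiCharacterAsCodePoint`.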