[SPARK-35047][SQL] Allow Json datasources to write non-ascii characters as codepoints
### What changes were proposed in this pull request? This PR proposes to enable the JSON datasources to write non-ascii characters as codepoints. To enable/disable this feature, I introduce a new option `writeNonAsciiCharacterAsCodePoint` for JSON datasources. ### Why are the changes needed? The JSON specification allows code points as literals, but Spark SQL's JSON datasources don't support a way to write them. It would be useful to write non-ASCII characters as code points, which is a platform-neutral representation. ### Does this PR introduce _any_ user-facing change? Yes. Users can write non-ASCII characters as code points with JSON datasources. ### How was this patch tested? New test. Closes #32147 from sarutak/json-unicode-write. Authored-by: Kousuke Saruta <sarutak@oss.nttdata.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
parent
8a5af37c25
commit
e8bf8fe213
|
@ -135,6 +135,12 @@ private[sql] class JSONOptions(
|
|||
*/
|
||||
val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false)
|
||||
|
||||
/**
|
||||
* Generates \u0000-style code points for non-ASCII characters when this option is enabled.
|
||||
*/
|
||||
val writeNonAsciiCharacterAsCodePoint: Boolean =
|
||||
parameters.get("writeNonAsciiCharacterAsCodePoint").map(_.toBoolean).getOrElse(false)
|
||||
|
||||
/** Build a Jackson [[JsonFactory]] using JSON options. */
|
||||
def buildJsonFactory(): JsonFactory = {
|
||||
new JsonFactoryBuilder()
|
||||
|
|
|
@ -74,7 +74,13 @@ private[sql] class JacksonGenerator(
|
|||
|
||||
private val gen = {
|
||||
val generator = new JsonFactory().createGenerator(writer).setRootValueSeparator(null)
|
||||
if (options.pretty) generator.setPrettyPrinter(new DefaultPrettyPrinter("")) else generator
|
||||
if (options.pretty) {
|
||||
generator.setPrettyPrinter(new DefaultPrettyPrinter(""))
|
||||
}
|
||||
if (options.writeNonAsciiCharacterAsCodePoint) {
|
||||
generator.setHighestNonEscapedChar(0x7F)
|
||||
}
|
||||
generator
|
||||
}
|
||||
|
||||
private val lineSeparator: String = options.lineSeparatorInWrite
|
||||
|
|
|
@ -2845,6 +2845,61 @@ abstract class JsonSuite
|
|||
}
|
||||
}
|
||||
|
||||
test("SPARK-35047: Write Non-ASCII character as codepoint") {
|
||||
// scalastyle:off nonascii
|
||||
withTempPaths(2) { paths =>
|
||||
paths.foreach(_.delete())
|
||||
val seq = Seq("a", "\n", "\u3042")
|
||||
val df = seq.toDF
|
||||
|
||||
val basePath1 = paths(0).getCanonicalPath
|
||||
df.write.option("writeNonAsciiCharacterAsCodePoint", "true")
|
||||
.option("pretty", "false").json(basePath1)
|
||||
val actualText1 = spark.read.option("wholetext", "true").text(basePath1)
|
||||
.sort("value").map(_.getString(0)).collect().mkString
|
||||
val expectedText1 =
|
||||
s"""{"value":"\\n"}
|
||||
|{"value":"\\u3042"}
|
||||
|{"value":"a"}
|
||||
|""".stripMargin
|
||||
assert(actualText1 === expectedText1)
|
||||
|
||||
val actualJson1 = spark.read.json(basePath1)
|
||||
.sort("value").map(_.getString(0)).collect().mkString
|
||||
val expectedJson1 = "\na\u3042"
|
||||
assert(actualJson1 === expectedJson1)
|
||||
|
||||
// Test for pretty printed JSON.
|
||||
// If multiLine option is set to true, the format should be
|
||||
// one JSON record per file. So LEAF_NODE_DEFAULT_PARALLELISM is set here.
|
||||
withSQLConf(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key -> s"${seq.length}") {
|
||||
val basePath2 = paths(1).getCanonicalPath
|
||||
df.write.option("writeNonAsciiCharacterAsCodePoint", "true")
|
||||
.option("pretty", "true").json(basePath2)
|
||||
val actualText2 = spark.read.option("wholetext", "true").text(basePath2)
|
||||
.sort("value").map(_.getString(0)).collect().mkString
|
||||
val expectedText2 =
|
||||
s"""{
|
||||
| "value" : "\\n"
|
||||
|}
|
||||
|{
|
||||
| "value" : "\\u3042"
|
||||
|}
|
||||
|{
|
||||
| "value" : "a"
|
||||
|}
|
||||
|""".stripMargin
|
||||
assert(actualText2 === expectedText2)
|
||||
|
||||
val actualJson2 = spark.read.option("multiLine", "true").json(basePath2)
|
||||
.sort("value").map(_.getString(0)).collect().mkString
|
||||
val expectedJson2 = "\na\u3042"
|
||||
assert(actualJson2 === expectedJson2)
|
||||
}
|
||||
}
|
||||
// scalastyle:on nonascii
|
||||
}
|
||||
|
||||
test("SPARK-35104: Fix wrong indentation for multiple JSON even if `pretty` option is true") {
|
||||
withSQLConf(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM.key -> "1") {
|
||||
withTempPath { path =>
|
||||
|
|
Loading…
Reference in a new issue