[SPARK-21167][SS] Decode the path generated by File sink to handle special characters
## What changes were proposed in this pull request?

Decode the path generated by the file sink so that special characters (such as spaces) are handled correctly.

## How was this patch tested?

The added unit test.

Author: Shixiong Zhu <shixiong@databricks.com>

Closes #18381 from zsxwing/SPARK-21167.
Commit d66b143eec (parent 53543374ce)
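Background for the fix: when the file sink records an output file in its metadata log, the path is stored in its URI string form, in which special characters such as spaces are percent-encoded. Rebuilding a `Path` directly from that string keeps the escapes as literal characters, so the reconstructed path no longer matches the file on disk; decoding through `java.net.URI` first restores the original name. A minimal standalone sketch of this behavior (not part of the commit; the directory name is hypothetical, and `hadoop-common` is assumed on the classpath):

```scala
import java.net.URI
import org.apache.hadoop.fs.Path

// A partition directory containing a space, as partitionBy("value") would
// create for the value "hello world".
val original = new Path("/out/value=hello world")

// The URI string form percent-encodes the space; this is the shape in which
// the path ends up persisted as a string.
val stored = original.toUri.toString
assert(stored == "/out/value=hello%20world")

// Rebuilding with new Path(stored) treats "%20" as three literal characters,
// so the path no longer refers to the directory that exists on disk.
assert(new Path(stored).toUri.getPath == "/out/value=hello%20world")

// Decoding through java.net.URI first restores the original path; this is
// exactly what the fix below does in SinkFileStatus.toFileStatus.
assert(new Path(new URI(stored)).toUri.getPath == "/out/value=hello world")
```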
In `SinkFileStatus` (the file sink's metadata log entry), `toFileStatus` now decodes the stored path through `java.net.URI` instead of passing the encoded string straight to `Path`:

```diff
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.streaming
 
+import java.net.URI
+
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.json4s.NoTypeHints
 import org.json4s.jackson.Serialization
@@ -47,7 +49,8 @@ case class SinkFileStatus(
     action: String) {
 
   def toFileStatus: FileStatus = {
-    new FileStatus(size, isDir, blockReplication, blockSize, modificationTime, new Path(path))
+    new FileStatus(
+      size, isDir, blockReplication, blockSize, modificationTime, new Path(new URI(path)))
   }
 }
```
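The log entry keeps the encoded string, so only the read side changes. A hedged round-trip sketch (hypothetical field values; the field names are the ones visible in the hunk above, and `SinkFileStatus` is internal to `org.apache.spark.sql.execution.streaming`):

```scala
import org.apache.spark.sql.execution.streaming.SinkFileStatus

// Hypothetical log entry for a file under a partition directory with a space.
val entry = SinkFileStatus(
  path = "file:///out/value=hello%20world/part-00000.parquet", // encoded form
  size = 1024L,
  isDir = false,
  modificationTime = 0L,
  blockReplication = 1,
  blockSize = 134217728L,
  action = "add")

// With the fix, "%20" is decoded back to a space when the FileStatus is
// rebuilt, so the path matches the file that actually exists on disk.
assert(entry.toFileStatus.getPath.toUri.getPath ==
  "/out/value=hello world/part-00000.parquet")
```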
The added test in `FileStreamSinkSuite`:

```diff
@@ -64,6 +64,35 @@ class FileStreamSinkSuite extends StreamTest {
     }
   }
 
+  test("SPARK-21167: encode and decode path correctly") {
+    val inputData = MemoryStream[String]
+    val ds = inputData.toDS()
+
+    val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath
+    val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath
+
+    val query = ds.map(s => (s, s.length))
+      .toDF("value", "len")
+      .writeStream
+      .partitionBy("value")
+      .option("checkpointLocation", checkpointDir)
+      .format("parquet")
+      .start(outputDir)
+
+    try {
+      // The output is partitioned by "value", so the value will appear in the file path.
+      // This is to test whether we handle spaces in the path correctly.
+      inputData.addData("hello world")
+      failAfter(streamingTimeout) {
+        query.processAllAvailable()
+      }
+      val outputDf = spark.read.parquet(outputDir)
+      checkDatasetUnorderly(outputDf.as[(Int, String)], ("hello world".length, "hello world"))
+    } finally {
+      query.stop()
+    }
+  }
+
   test("partitioned writing and batch reading") {
     val inputData = MemoryStream[Int]
     val ds = inputData.toDS()
```
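Two details make this test exercise the bug: `partitionBy("value")` puts the data itself into the directory name (`value=hello world`), so the space flows through the metadata log's encode/decode cycle; and because partition columns are appended after the data columns when the directory is read back, the result is typed `(Int, String)`, i.e. (`len`, `value`), rather than the original column order.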