[SPARK-4742][SQL] The name of Parquet File generated by AppendingParquetOutputFormat should be zero padded
When a Parquet file is written as output via ParquetOutputFormat#getDefaultWorkFile, the file name is not zero-padded, whereas RDD#saveAsTextFile does zero-pad its part file names. Author: Sasaki Toru <sasakitoa@nttdata.co.jp> Closes #3602 from sasakitoa/parquet-zeroPadding and squashes the following commits: 6b0e58f [Sasaki Toru] Merge branch 'master' of git://github.com/apache/spark into parquet-zeroPadding 20dc79d [Sasaki Toru] Fixed the name of Parquet File generated by AppendingParquetOutputFormat
This commit is contained in:
parent
0abbff2862
commit
8091dd62ea
|
@ -20,6 +20,7 @@ package org.apache.spark.sql.parquet
|
|||
import java.io.IOException
|
||||
import java.lang.{Long => JLong}
|
||||
import java.text.SimpleDateFormat
|
||||
import java.text.NumberFormat
|
||||
import java.util.concurrent.{Callable, TimeUnit}
|
||||
import java.util.{ArrayList, Collections, Date, List => JList}
|
||||
|
||||
|
@ -338,9 +339,13 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int)
|
|||
|
||||
// override to choose output filename so as not to overwrite existing ones
|
||||
override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
|
||||
val numfmt = NumberFormat.getInstance()
|
||||
numfmt.setMinimumIntegerDigits(5)
|
||||
numfmt.setGroupingUsed(false)
|
||||
|
||||
val taskId: TaskID = getTaskAttemptID(context).getTaskID
|
||||
val partition: Int = taskId.getId
|
||||
val filename = s"part-r-${partition + offset}.parquet"
|
||||
val filename = "part-r-" + numfmt.format(partition + offset) + ".parquet"
|
||||
val committer: FileOutputCommitter =
|
||||
getOutputCommitter(context).asInstanceOf[FileOutputCommitter]
|
||||
new Path(committer.getWorkPath, filename)
|
||||
|
|
Loading…
Reference in a new issue