[SPARK-20773][SQL] ParquetWriteSupport.writeFields is quadratic in number of fields
Fix quadratic List indexing in ParquetWriteSupport. I noticed this function while profiling some code today. It showed up as a significant factor in a table with twenty columns; with hundreds of columns, it could dominate any other function call.

## What changes were proposed in this pull request?

The writeFields method iterates from 0 until the number of fields, indexing into rootFieldWriters for each element. rootFieldWriters is a List, so each index lookup is a linear-time operation, which makes writeFields as a whole quadratic in the number of fields.

Solution: explicitly convert rootFieldWriters to an Array (implicitly convertible to WrappedArray) for constant-time indexing.

## How was this patch tested?

This is a one-line change for performance reasons.

Author: tpoterba <tpoterba@broadinstitute.org>
Author: Tim Poterba <tpoterba@gmail.com>

Closes #18005 from tpoterba/tpoterba-patch-1.
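For intuition, here is a minimal standalone Scala sketch (not part of the patch; the object name, sizes, and timing helper are made up for illustration) contrasting positional indexing into a `List`-backed `Seq` against an `Array`, using the same `while`-loop shape as `writeFields`:

```scala
// Hypothetical micro-benchmark illustrating the complexity difference.
// List.apply(i) walks i links, so summing by index is O(n^2) overall;
// Array.apply(i) is constant time, so the same loop is O(n).
object IndexingSketch extends App {
  val n = 20000
  val asList: Seq[Int] = List.tabulate(n)(identity)
  val asArray: Array[Int] = asList.toArray

  def sumByIndex(get: Int => Int): Long = {
    var i = 0
    var sum = 0L
    while (i < n) {   // same loop shape as writeFields
      sum += get(i)
      i += 1
    }
    sum
  }

  def time[A](label: String)(body: => A): A = {
    val start = System.nanoTime()
    val result = body
    println(f"$label: ${(System.nanoTime() - start) / 1e6}%.1f ms")
    result
  }

  time("Seq backed by List")(sumByIndex(asList.apply))   // quadratic overall
  time("Array")(sumByIndex(asArray.apply))               // linear overall
}
```

On any realistic field count the gap is dramatic, which matches the profiling observation above: twenty columns already shows up, and hundreds of columns would dominate.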
commit 3f2cd51ee0
parent ce8edb8bf4
```diff
@@ -58,7 +58,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   private var schema: StructType = _
 
   // `ValueWriter`s for all fields of the schema
-  private var rootFieldWriters: Seq[ValueWriter] = _
+  private var rootFieldWriters: Array[ValueWriter] = _
 
   // The Parquet `RecordConsumer` to which all `InternalRow`s are written
   private var recordConsumer: RecordConsumer = _
@@ -90,7 +90,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
     }
 
-    this.rootFieldWriters = schema.map(_.dataType).map(makeWriter)
+    this.rootFieldWriters = schema.map(_.dataType).map(makeWriter).toArray[ValueWriter]
 
     val messageType = new ParquetSchemaConverter(configuration).convert(schema)
     val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava
@@ -116,7 +116,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   }
 
   private def writeFields(
-      row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = {
+      row: InternalRow, schema: StructType, fieldWriters: Array[ValueWriter]): Unit = {
     var i = 0
     while (i < row.numFields) {
       if (!row.isNullAt(i)) {
@@ -192,7 +192,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
         makeDecimalWriter(precision, scale)
 
       case t: StructType =>
-        val fieldWriters = t.map(_.dataType).map(makeWriter)
+        val fieldWriters = t.map(_.dataType).map(makeWriter).toArray[ValueWriter]
         (row: SpecializedGetters, ordinal: Int) =>
          consumeGroup {
            writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)
```
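A side note on the commit message's mention of WrappedArray: under the Scala 2.11/2.12 collections that Spark used at the time, an `Array` passed where a `Seq` is expected is implicitly wrapped by `Predef`, and the wrapper keeps constant-time indexing. A small illustrative sketch, with a stand-in `ValueWriter` type rather than the real Spark one:

```scala
import scala.collection.mutable.WrappedArray

object WrappingSketch extends App {
  // Hypothetical stand-in for ParquetWriteSupport's ValueWriter type.
  type ValueWriter = Any => Unit

  val writers: Seq[ValueWriter] = List(
    (v: Any) => println(s"a: $v"),
    (v: Any) => println(s"b: $v"))

  // The fix: force a real JVM array so apply(i) is O(1).
  val asArray: Array[ValueWriter] = writers.toArray[ValueWriter]

  // Predef implicitly wraps the Array when a Seq is required;
  // WrappedArray delegates apply(i) to the underlying array, still O(1).
  val asSeq: Seq[ValueWriter] = asArray
  println(asSeq.isInstanceOf[WrappedArray[_]])  // true on Scala 2.11/2.12

  asSeq(1)("hello")  // constant-time lookup, then invoke the writer
}
```

This is why the patch can change only the declared types and the two `.toArray[ValueWriter]` call sites: any remaining code that expects a `Seq` still compiles, while the hot `writeFields` loop now indexes an array.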