[SPARK-20773][SQL] ParquetWriteSupport.writeFields is quadratic in number of fields

Fix quadratic List indexing in ParquetWriteSupport.

I noticed this method while profiling some code today. It showed up as a significant factor in a table with twenty columns; with hundreds of columns, it could dominate any other function call.

## What changes were proposed in this pull request?

The writeFields method iterates from 0 until the number of fields, indexing into rootFieldWriters for each element. rootFieldWriters is a List, so each lookup is a linear-time operation. The complexity of writeFields is therefore quadratic in the number of fields.
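For illustration, a minimal self-contained sketch of that access pattern (not the actual Spark code; the object and names here are made up): indexing a `List` inside the per-field loop makes each lookup O(i), so writing a single row costs O(n²) for n fields.

```scala
// Sketch of the quadratic pattern: `fieldWriters` is a List, and `fieldWriters(i)`
// walks the list from its head on every call.
object QuadraticLookupSketch {
  def main(args: Array[String]): Unit = {
    val numFields = 500
    val fieldWriters: Seq[Int => Unit] = List.fill(numFields)((_: Int) => ())

    var i = 0
    while (i < numFields) {
      fieldWriters(i)(i) // O(i) lookup, O(n^2) total per row
      i += 1
    }
  }
}
```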

Solution: explicitly convert rootFieldWriters to an Array (which Scala implicitly wraps as a WrappedArray where a Seq is expected) for constant-time indexing.
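As a rough illustration of why the Array conversion matters (this micro-benchmark is not part of the patch; `IndexingSketch`, the sizes, and the timing helper are all made up for the sketch), compare indexed access into a `List` versus an `Array` using the same while-loop shape as writeFields:

```scala
// Illustrative micro-benchmark: List indexing vs. Array indexing in a tight loop.
// Timings are indicative only; for real measurements use a proper harness (e.g. JMH).
object IndexingSketch {
  private def time(label: String)(body: => Long): Unit = {
    val start = System.nanoTime()
    val result = body
    println(f"$label%-20s ${(System.nanoTime() - start) / 1e6}%.1f ms (checksum $result)")
  }

  def main(args: Array[String]): Unit = {
    val numFields = 2000
    val rows = 10000
    val asList: Seq[Int] = List.tabulate(numFields)(identity)
    val asArray: Array[Int] = asList.toArray

    time("List (Seq) indexing") {
      var sum = 0L
      var r = 0
      while (r < rows) {
        var i = 0
        while (i < numFields) { sum += asList(i); i += 1 } // O(i) per lookup
        r += 1
      }
      sum
    }

    time("Array indexing") {
      var sum = 0L
      var r = 0
      while (r < rows) {
        var i = 0
        while (i < numFields) { sum += asArray(i); i += 1 } // O(1) per lookup
        r += 1
      }
      sum
    }
  }
}
```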

## How was this patch tested?

This is a one-line change for performance reasons.

Author: tpoterba <tpoterba@broadinstitute.org>
Author: Tim Poterba <tpoterba@gmail.com>

Closes #18005 from tpoterba/tpoterba-patch-1.
Authored by tpoterba on 2017-05-19 14:17:12 +02:00; committed by Herman van Hovell
parent ce8edb8bf4
commit 3f2cd51ee0

@@ -58,7 +58,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   private var schema: StructType = _
   // `ValueWriter`s for all fields of the schema
-  private var rootFieldWriters: Seq[ValueWriter] = _
+  private var rootFieldWriters: Array[ValueWriter] = _
   // The Parquet `RecordConsumer` to which all `InternalRow`s are written
   private var recordConsumer: RecordConsumer = _

@@ -90,7 +90,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
     }
-    this.rootFieldWriters = schema.map(_.dataType).map(makeWriter)
+    this.rootFieldWriters = schema.map(_.dataType).map(makeWriter).toArray[ValueWriter]
     val messageType = new ParquetSchemaConverter(configuration).convert(schema)
     val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava

@@ -116,7 +116,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   }
   private def writeFields(
-      row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = {
+      row: InternalRow, schema: StructType, fieldWriters: Array[ValueWriter]): Unit = {
     var i = 0
     while (i < row.numFields) {
       if (!row.isNullAt(i)) {

@@ -192,7 +192,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
         makeDecimalWriter(precision, scale)
       case t: StructType =>
-        val fieldWriters = t.map(_.dataType).map(makeWriter)
+        val fieldWriters = t.map(_.dataType).map(makeWriter).toArray[ValueWriter]
         (row: SpecializedGetters, ordinal: Int) =>
           consumeGroup {
             writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)