From 3f2cd51ee06f2c6d735754e5440bc4b74f8dcbc8 Mon Sep 17 00:00:00 2001
From: tpoterba
Date: Fri, 19 May 2017 14:17:12 +0200
Subject: [PATCH] [SPARK-20773][SQL] ParquetWriteSupport.writeFields is
 quadratic in number of fields

Fix quadratic List indexing in ParquetWriteSupport.

I noticed this function while profiling some code today. It showed up as a
significant factor in a table with twenty columns; with hundreds of columns,
it could dominate any other function call.

## What changes were proposed in this pull request?

The writeFields method iterates from 0 until the number of fields, indexing
into rootFieldWriters for each element. rootFieldWriters is a List, so each
positional lookup is a linear operation, which makes writeFields quadratic in
the number of fields.

Solution: explicitly convert rootFieldWriters to an Array (implicitly
converted to WrappedArray) for constant-time indexing.

## How was this patch tested?

This is a one-line change made purely for performance reasons.

Author: tpoterba
Author: Tim Poterba

Closes #18005 from tpoterba/tpoterba-patch-1.

---
 .../datasources/parquet/ParquetWriteSupport.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
index 38b0e33937..63a8666f0d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
@@ -58,7 +58,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   private var schema: StructType = _
 
   // `ValueWriter`s for all fields of the schema
-  private var rootFieldWriters: Seq[ValueWriter] = _
+  private var rootFieldWriters: Array[ValueWriter] = _
 
   // The Parquet `RecordConsumer` to which all `InternalRow`s are written
   private var recordConsumer: RecordConsumer = _
@@ -90,7 +90,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
     }
 
-    this.rootFieldWriters = schema.map(_.dataType).map(makeWriter)
+    this.rootFieldWriters = schema.map(_.dataType).map(makeWriter).toArray[ValueWriter]
 
     val messageType = new ParquetSchemaConverter(configuration).convert(schema)
     val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava
@@ -116,7 +116,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
   }
 
   private def writeFields(
-      row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = {
+      row: InternalRow, schema: StructType, fieldWriters: Array[ValueWriter]): Unit = {
     var i = 0
     while (i < row.numFields) {
       if (!row.isNullAt(i)) {
@@ -192,7 +192,7 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit
         makeDecimalWriter(precision, scale)
 
       case t: StructType =>
-        val fieldWriters = t.map(_.dataType).map(makeWriter)
+        val fieldWriters = t.map(_.dataType).map(makeWriter).toArray[ValueWriter]
         (row: SpecializedGetters, ordinal: Int) => consumeGroup {
           writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)
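
---

For readers unfamiliar with why this one-line change matters: below is a
minimal, self-contained sketch of the List-vs-Array indexing cost the commit
message describes. It is not part of the patch; `ValueWriter` is a
hypothetical function-type stand-in for the real trait, and `numFields = 5000`
is an arbitrary choice.

```scala
// Sketch (not part of the patch): positional indexing into a List-backed
// Seq makes a writeFields-style loop quadratic; an Array makes it linear.
object IndexingSketch {
  type ValueWriter = Int => Unit  // hypothetical stand-in for the real trait

  private def time(label: String)(body: => Unit): Unit = {
    val start = System.nanoTime()
    body
    println(f"$label%-6s ${(System.nanoTime() - start) / 1e6}%8.1f ms")
  }

  def main(args: Array[String]): Unit = {
    val numFields = 5000
    val listWriters: Seq[ValueWriter] = List.fill(numFields)((_: Int) => ())
    val arrayWriters: Array[ValueWriter] = listWriters.toArray

    // listWriters(i) walks the list from its head: O(i) per lookup,
    // O(n^2) for the whole loop -- the shape of the bug in writeFields.
    time("List") {
      var i = 0
      while (i < numFields) { listWriters(i)(i); i += 1 }
    }

    // arrayWriters(i) is a constant-time memory access: O(n) for the loop.
    time("Array") {
      var i = 0
      while (i < numFields) { arrayWriters(i)(i); i += 1 }
    }
  }
}
```

As numFields grows, the List loop should slow down roughly quadratically
while the Array loop stays linear, consistent with the commit message's
observation that the cost is noticeable at twenty columns and could dominate
at hundreds.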