[SPARK-4386] Improve performance when writing Parquet files.
If you profile the writing of a Parquet file, the single most time-consuming call inside org.apache.spark.sql.parquet.MutableRowWriteSupport.write is actually the scala.collection.AbstractSeq.size call. This is because the size call ends up COUNTING the elements in scala.collection.LinearSeqOptimized.length ("optimized?"). This doesn't need to be done on every iteration. Previously, "size" was called repeatedly wherever it was needed; this change calls it once at the top of the method and stores the result in a 'val'. Author: Jim Carroll <jim@dontcallme.com> Closes #3254 from jimfcarroll/parquet-perf and squashes the following commits: 30cc0b5 [Jim Carroll] Improve performance when writing Parquet files.
This commit is contained in:
parent
0c7b66bd44
commit
f76b968370
|
@ -152,14 +152,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
|
|||
}
|
||||
|
||||
override def write(record: Row): Unit = {
|
||||
if (attributes.size > record.size) {
|
||||
val attributesSize = attributes.size
|
||||
if (attributesSize > record.size) {
|
||||
throw new IndexOutOfBoundsException(
|
||||
s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
|
||||
s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
|
||||
}
|
||||
|
||||
var index = 0
|
||||
writer.startMessage()
|
||||
while(index < attributes.size) {
|
||||
while(index < attributesSize) {
|
||||
// null values indicate optional fields but we do not check currently
|
||||
if (record(index) != null) {
|
||||
writer.startField(attributes(index).name, index)
|
||||
|
@ -312,14 +313,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
|
|||
// Optimized for non-nested rows
|
||||
private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
|
||||
override def write(record: Row): Unit = {
|
||||
if (attributes.size > record.size) {
|
||||
val attributesSize = attributes.size
|
||||
if (attributesSize > record.size) {
|
||||
throw new IndexOutOfBoundsException(
|
||||
s"Trying to write more fields than contained in row (${attributes.size}>${record.size})")
|
||||
s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
|
||||
}
|
||||
|
||||
var index = 0
|
||||
writer.startMessage()
|
||||
while(index < attributes.size) {
|
||||
while(index < attributesSize) {
|
||||
// null values indicate optional fields but we do not check currently
|
||||
if (record(index) != null && record(index) != Nil) {
|
||||
writer.startField(attributes(index).name, index)
|
||||
|
|
Loading…
Reference in a new issue