[SPARK-36821][SQL] Make class ColumnarBatch extendable - addendum

### What changes were proposed in this pull request?
A follow up of https://github.com/apache/spark/pull/34054. Three things changed:
1. Add a test for extendable class `ColumnarBatch`
2. Make `ColumnarBatchRow` public.
3. Change private fields to protected fields.

### Why are the changes needed?
A follow up of https://github.com/apache/spark/pull/34054. Class ColumnarBatch needs to be extendable to support better vectorized reading in multiple data sources. For example, Iceberg needs to filter out deleted rows in a batch before Spark consumes it, to support row-level deletes (apache/iceberg#3141) in vectorized reads.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
A new test is added

Closes #34087 from flyrain/SPARK-36821.

Authored-by: Yufei Gu <yufei_gu@apple.com>
Signed-off-by: DB Tsai <d_tsai@apple.com>
This commit is contained in:
Yufei Gu 2021-09-27 18:26:59 +00:00 committed by DB Tsai
parent e4e64c7552
commit d03999ab88
3 changed files with 305 additions and 175 deletions

View file

@ -18,25 +18,21 @@ package org.apache.spark.sql.vectorized;
import java.util.*;
import org.apache.spark.annotation.Evolving;
import org.apache.spark.annotation.DeveloperApi;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.*;
import org.apache.spark.unsafe.types.CalendarInterval;
import org.apache.spark.unsafe.types.UTF8String;
/**
* This class wraps multiple ColumnVectors as a row-wise table. It provides a row view of this
* batch so that Spark can access the data row by row. Instance of it is meant to be reused during
* the entire data loading process.
* the entire data loading process. A data source may extend this class with customized logic.
*/
@Evolving
@DeveloperApi
public class ColumnarBatch implements AutoCloseable {
private int numRows;
private final ColumnVector[] columns;
protected int numRows;
protected final ColumnVector[] columns;
// Staging row returned from `getRow`.
private final ColumnarBatchRow row;
protected final ColumnarBatchRow row;
/**
* Called to close all the columns in this batch. It is not valid to access the data after
@ -125,167 +121,3 @@ public class ColumnarBatch implements AutoCloseable {
this.row = new ColumnarBatchRow(columns);
}
}
/**
 * An internal class, which wraps an array of {@link ColumnVector} and provides a row view.
 * A single instance is reused as a staging row by the batch's row iterator; callers must
 * call {@link #copy()} to retain a row's data after the iterator advances.
 */
class ColumnarBatchRow extends InternalRow {
// Index of the row this view currently points at; advanced externally by the batch iterator.
public int rowId;
private final ColumnVector[] columns;
ColumnarBatchRow(ColumnVector[] columns) {
this.columns = columns;
}
@Override
public int numFields() { return columns.length; }
/**
 * Materializes the current row into a fresh {@link GenericInternalRow} detached from the
 * backing column vectors (strings are deep-copied), so the result stays valid after the
 * iterator advances or the batch is closed. Only primitive, string, binary, decimal,
 * date and timestamp columns are supported here.
 */
@Override
public InternalRow copy() {
GenericInternalRow row = new GenericInternalRow(columns.length);
for (int i = 0; i < numFields(); i++) {
if (isNullAt(i)) {
row.setNullAt(i);
} else {
DataType dt = columns[i].dataType();
if (dt instanceof BooleanType) {
row.setBoolean(i, getBoolean(i));
} else if (dt instanceof ByteType) {
row.setByte(i, getByte(i));
} else if (dt instanceof ShortType) {
row.setShort(i, getShort(i));
} else if (dt instanceof IntegerType) {
row.setInt(i, getInt(i));
} else if (dt instanceof LongType) {
row.setLong(i, getLong(i));
} else if (dt instanceof FloatType) {
row.setFloat(i, getFloat(i));
} else if (dt instanceof DoubleType) {
row.setDouble(i, getDouble(i));
} else if (dt instanceof StringType) {
// Deep copy: the UTF8String returned by the vector may point into a reused buffer.
row.update(i, getUTF8String(i).copy());
} else if (dt instanceof BinaryType) {
row.update(i, getBinary(i));
} else if (dt instanceof DecimalType) {
DecimalType t = (DecimalType)dt;
row.setDecimal(i, getDecimal(i, t.precision(), t.scale()), t.precision());
} else if (dt instanceof DateType) {
// Dates are read via their underlying int representation.
row.setInt(i, getInt(i));
} else if (dt instanceof TimestampType) {
// Timestamps are read via their underlying long representation.
row.setLong(i, getLong(i));
} else {
throw new RuntimeException("Not implemented. " + dt);
}
}
}
return row;
}
// Not supported: null checks are delegated per-ordinal to the column vectors.
@Override
public boolean anyNull() {
throw new UnsupportedOperationException();
}
// Typed accessors: each delegates to the corresponding column at the current rowId.
@Override
public boolean isNullAt(int ordinal) { return columns[ordinal].isNullAt(rowId); }
@Override
public boolean getBoolean(int ordinal) { return columns[ordinal].getBoolean(rowId); }
@Override
public byte getByte(int ordinal) { return columns[ordinal].getByte(rowId); }
@Override
public short getShort(int ordinal) { return columns[ordinal].getShort(rowId); }
@Override
public int getInt(int ordinal) { return columns[ordinal].getInt(rowId); }
@Override
public long getLong(int ordinal) { return columns[ordinal].getLong(rowId); }
@Override
public float getFloat(int ordinal) { return columns[ordinal].getFloat(rowId); }
@Override
public double getDouble(int ordinal) { return columns[ordinal].getDouble(rowId); }
@Override
public Decimal getDecimal(int ordinal, int precision, int scale) {
return columns[ordinal].getDecimal(rowId, precision, scale);
}
@Override
public UTF8String getUTF8String(int ordinal) {
return columns[ordinal].getUTF8String(rowId);
}
@Override
public byte[] getBinary(int ordinal) {
return columns[ordinal].getBinary(rowId);
}
@Override
public CalendarInterval getInterval(int ordinal) {
return columns[ordinal].getInterval(rowId);
}
@Override
public ColumnarRow getStruct(int ordinal, int numFields) {
// The numFields argument is ignored: the vector knows its own struct arity.
return columns[ordinal].getStruct(rowId);
}
@Override
public ColumnarArray getArray(int ordinal) {
return columns[ordinal].getArray(rowId);
}
@Override
public ColumnarMap getMap(int ordinal) {
return columns[ordinal].getMap(rowId);
}
/**
 * Generic accessor that dispatches on the given {@link DataType} to the matching
 * typed getter above.
 */
@Override
public Object get(int ordinal, DataType dataType) {
if (dataType instanceof BooleanType) {
return getBoolean(ordinal);
} else if (dataType instanceof ByteType) {
return getByte(ordinal);
} else if (dataType instanceof ShortType) {
return getShort(ordinal);
} else if (dataType instanceof IntegerType) {
return getInt(ordinal);
} else if (dataType instanceof LongType) {
return getLong(ordinal);
} else if (dataType instanceof FloatType) {
return getFloat(ordinal);
} else if (dataType instanceof DoubleType) {
return getDouble(ordinal);
} else if (dataType instanceof StringType) {
return getUTF8String(ordinal);
} else if (dataType instanceof BinaryType) {
return getBinary(ordinal);
} else if (dataType instanceof DecimalType) {
DecimalType t = (DecimalType) dataType;
return getDecimal(ordinal, t.precision(), t.scale());
} else if (dataType instanceof DateType) {
return getInt(ordinal);
} else if (dataType instanceof TimestampType) {
return getLong(ordinal);
} else if (dataType instanceof ArrayType) {
return getArray(ordinal);
} else if (dataType instanceof StructType) {
return getStruct(ordinal, ((StructType)dataType).fields().length);
} else if (dataType instanceof MapType) {
return getMap(ordinal);
} else {
throw new UnsupportedOperationException("Datatype not supported " + dataType);
}
}
// This row view is read-only; mutation is unsupported.
@Override
public void update(int ordinal, Object value) { throw new UnsupportedOperationException(); }
@Override
public void setNullAt(int ordinal) { throw new UnsupportedOperationException(); }
}

View file

@ -0,0 +1,189 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.vectorized;
import org.apache.spark.annotation.DeveloperApi;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.*;
import org.apache.spark.unsafe.types.CalendarInterval;
import org.apache.spark.unsafe.types.UTF8String;
/**
 * This class wraps an array of {@link ColumnVector} and provides a row view.
 * A single instance is reused as a staging row by {@link ColumnarBatch#rowIterator()};
 * callers must call {@link #copy()} to retain a row's data after the iterator advances.
 */
@DeveloperApi
public final class ColumnarBatchRow extends InternalRow {
// Index of the row this view currently points at; advanced externally by the batch iterator.
public int rowId;
private final ColumnVector[] columns;
public ColumnarBatchRow(ColumnVector[] columns) {
this.columns = columns;
}
@Override
public int numFields() { return columns.length; }
/**
 * Materializes the current row into a fresh {@link GenericInternalRow} detached from the
 * backing column vectors (strings are deep-copied), so the result stays valid after the
 * iterator advances or the batch is closed. Only primitive, string, binary, decimal,
 * date and timestamp columns are supported here.
 */
@Override
public InternalRow copy() {
GenericInternalRow row = new GenericInternalRow(columns.length);
for (int i = 0; i < numFields(); i++) {
if (isNullAt(i)) {
row.setNullAt(i);
} else {
DataType dt = columns[i].dataType();
if (dt instanceof BooleanType) {
row.setBoolean(i, getBoolean(i));
} else if (dt instanceof ByteType) {
row.setByte(i, getByte(i));
} else if (dt instanceof ShortType) {
row.setShort(i, getShort(i));
} else if (dt instanceof IntegerType) {
row.setInt(i, getInt(i));
} else if (dt instanceof LongType) {
row.setLong(i, getLong(i));
} else if (dt instanceof FloatType) {
row.setFloat(i, getFloat(i));
} else if (dt instanceof DoubleType) {
row.setDouble(i, getDouble(i));
} else if (dt instanceof StringType) {
// Deep copy: the UTF8String returned by the vector may point into a reused buffer.
row.update(i, getUTF8String(i).copy());
} else if (dt instanceof BinaryType) {
row.update(i, getBinary(i));
} else if (dt instanceof DecimalType) {
DecimalType t = (DecimalType)dt;
row.setDecimal(i, getDecimal(i, t.precision(), t.scale()), t.precision());
} else if (dt instanceof DateType) {
// Dates are read via their underlying int representation.
row.setInt(i, getInt(i));
} else if (dt instanceof TimestampType) {
// Timestamps are read via their underlying long representation.
row.setLong(i, getLong(i));
} else {
throw new RuntimeException("Not implemented. " + dt);
}
}
}
return row;
}
// Not supported: null checks are delegated per-ordinal to the column vectors.
@Override
public boolean anyNull() {
throw new UnsupportedOperationException();
}
// Typed accessors: each delegates to the corresponding column at the current rowId.
@Override
public boolean isNullAt(int ordinal) { return columns[ordinal].isNullAt(rowId); }
@Override
public boolean getBoolean(int ordinal) { return columns[ordinal].getBoolean(rowId); }
@Override
public byte getByte(int ordinal) { return columns[ordinal].getByte(rowId); }
@Override
public short getShort(int ordinal) { return columns[ordinal].getShort(rowId); }
@Override
public int getInt(int ordinal) { return columns[ordinal].getInt(rowId); }
@Override
public long getLong(int ordinal) { return columns[ordinal].getLong(rowId); }
@Override
public float getFloat(int ordinal) { return columns[ordinal].getFloat(rowId); }
@Override
public double getDouble(int ordinal) { return columns[ordinal].getDouble(rowId); }
@Override
public Decimal getDecimal(int ordinal, int precision, int scale) {
return columns[ordinal].getDecimal(rowId, precision, scale);
}
@Override
public UTF8String getUTF8String(int ordinal) {
return columns[ordinal].getUTF8String(rowId);
}
@Override
public byte[] getBinary(int ordinal) {
return columns[ordinal].getBinary(rowId);
}
@Override
public CalendarInterval getInterval(int ordinal) {
return columns[ordinal].getInterval(rowId);
}
@Override
public ColumnarRow getStruct(int ordinal, int numFields) {
// The numFields argument is ignored: the vector knows its own struct arity.
return columns[ordinal].getStruct(rowId);
}
@Override
public ColumnarArray getArray(int ordinal) {
return columns[ordinal].getArray(rowId);
}
@Override
public ColumnarMap getMap(int ordinal) {
return columns[ordinal].getMap(rowId);
}
/**
 * Generic accessor that dispatches on the given {@link DataType} to the matching
 * typed getter above.
 */
@Override
public Object get(int ordinal, DataType dataType) {
if (dataType instanceof BooleanType) {
return getBoolean(ordinal);
} else if (dataType instanceof ByteType) {
return getByte(ordinal);
} else if (dataType instanceof ShortType) {
return getShort(ordinal);
} else if (dataType instanceof IntegerType) {
return getInt(ordinal);
} else if (dataType instanceof LongType) {
return getLong(ordinal);
} else if (dataType instanceof FloatType) {
return getFloat(ordinal);
} else if (dataType instanceof DoubleType) {
return getDouble(ordinal);
} else if (dataType instanceof StringType) {
return getUTF8String(ordinal);
} else if (dataType instanceof BinaryType) {
return getBinary(ordinal);
} else if (dataType instanceof DecimalType) {
DecimalType t = (DecimalType) dataType;
return getDecimal(ordinal, t.precision(), t.scale());
} else if (dataType instanceof DateType) {
return getInt(ordinal);
} else if (dataType instanceof TimestampType) {
return getLong(ordinal);
} else if (dataType instanceof ArrayType) {
return getArray(ordinal);
} else if (dataType instanceof StructType) {
return getStruct(ordinal, ((StructType)dataType).fields().length);
} else if (dataType instanceof MapType) {
return getMap(ordinal);
} else {
throw new UnsupportedOperationException("Datatype not supported " + dataType);
}
}
// This row view is read-only; mutation is unsupported.
@Override
public void update(int ordinal, Object value) { throw new UnsupportedOperationException(); }
@Override
public void setNullAt(int ordinal) { throw new UnsupportedOperationException(); }
}

View file

@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.vectorized
import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.nio.charset.StandardCharsets
import java.util
import java.util.NoSuchElementException
import scala.collection.JavaConverters._
import scala.collection.mutable
@ -36,7 +38,7 @@ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, DateTimeUtils,
import org.apache.spark.sql.execution.RowToColumnConverter
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnarBatchRow, ColumnVector}
import org.apache.spark.unsafe.Platform
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
@ -1151,6 +1153,113 @@ class ColumnarBatchSuite extends SparkFunSuite {
}}
}
// Verifies that a data source can subclass ColumnarBatch and customize row iteration:
// CustomizedColumnarBatch (defined below) skips rowIds 0 and 2 in rowIterator().
// Exercised under both on-heap and off-heap memory modes.
test("ColumnarBatch customization") {
(MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => {
val schema = new StructType()
.add("intCol", IntegerType)
.add("doubleCol", DoubleType)
.add("intCol2", IntegerType)
.add("string", BinaryType)
val capacity = 4 * 1024
val columns = schema.fields.map { field =>
allocate(capacity, field.dataType, memMode)
}
val batch = new CustomizedColumnarBatch(columns.toArray)
assert(batch.numCols() == 4)
assert(batch.numRows() == 0)
assert(batch.rowIterator().hasNext == false)
// Add a row [1, 1.1, NULL, "Hello"]
columns(0).putInt(0, 1)
columns(1).putDouble(0, 1.1)
columns(2).putNull(0)
columns(3).putByteArray(0, "Hello".getBytes(StandardCharsets.UTF_8))
batch.setNumRows(1)
// Verify the results of the row.
assert(batch.numCols() == 4)
assert(batch.numRows() == 1)
// rowId 0 is skipped, so the customized iterator sees an empty batch.
assert(batch.rowIterator().hasNext == false)
// Reset and add 3 rows
columns.foreach(_.reset())
// Add rows [NULL, 2.2, 2, "abc"], [3, NULL, 3, ""], [4, 4.4, 4, "world"]
columns(0).putNull(0)
columns(1).putDouble(0, 2.2)
columns(2).putInt(0, 2)
columns(3).putByteArray(0, "abc".getBytes(StandardCharsets.UTF_8))
columns(0).putInt(1, 3)
columns(1).putNull(1)
columns(2).putInt(1, 3)
columns(3).putByteArray(1, "".getBytes(StandardCharsets.UTF_8))
columns(0).putInt(2, 4)
columns(1).putDouble(2, 4.4)
columns(2).putInt(2, 4)
columns(3).putByteArray(2, "world".getBytes(StandardCharsets.UTF_8))
batch.setNumRows(3)
// Compares an InternalRow from the batch against an expected external Row,
// column by column, treating NULLs consistently on both sides.
def rowEquals(x: InternalRow, y: Row): Unit = {
assert(x.isNullAt(0) == y.isNullAt(0))
if (!x.isNullAt(0)) assert(x.getInt(0) == y.getInt(0))
assert(x.isNullAt(1) == y.isNullAt(1))
if (!x.isNullAt(1)) assert(x.getDouble(1) == y.getDouble(1))
assert(x.isNullAt(2) == y.isNullAt(2))
if (!x.isNullAt(2)) assert(x.getInt(2) == y.getInt(2))
assert(x.isNullAt(3) == y.isNullAt(3))
if (!x.isNullAt(3)) assert(x.getString(3) == y.getString(3))
}
// Verify
assert(batch.numRows() == 3)
val it2 = batch.rowIterator()
// Only second row is valid: rowIds 0 and 2 are skipped by the subclass.
rowEquals(it2.next(), Row(3, null, 3, ""))
assert(!it2.hasNext)
batch.close()
}}
}
// Test subclass of ColumnarBatch whose row iterator hides a fixed set of row ids,
// modeling a data source (e.g. one with row-level deletes) that filters rows
// before Spark consumes the batch.
class CustomizedColumnarBatch(columns: Array[ColumnVector]) extends ColumnarBatch(columns) {
val skipRowIds = List(0, 2)

override def rowIterator(): util.Iterator[InternalRow] = {
val totalRows: Int = numRows
// Single staging row reused across iterations, as in the base class.
val stagingRow = new ColumnarBatchRow(columns)
new util.Iterator[InternalRow]() {
private var cursor = 0

// Moves the cursor past any ids that should be hidden from the consumer.
private def advancePastSkipped(): Unit = {
while (skipRowIds.contains(cursor)) {
cursor += 1
}
}

override def hasNext: Boolean = {
advancePastSkipped()
cursor < totalRows
}

override def next: InternalRow = {
advancePastSkipped()
if (cursor >= totalRows) throw new NoSuchElementException
stagingRow.rowId = cursor
cursor += 1
stagingRow
}

override def remove(): Unit = throw new UnsupportedOperationException
}
}
}
private def doubleEquals(d1: Double, d2: Double): Boolean = {
if (d1.isNaN && d2.isNaN) {
true