[SPARK-36821][SQL] Make class ColumnarBatch extendable - addendum
### What changes were proposed in this pull request? A follow-up of https://github.com/apache/spark/pull/34054. Three things changed: 1. Added a test for the extendable class `ColumnarBatch`. 2. Made `ColumnarBatchRow` public. 3. Changed private fields to protected fields. ### Why are the changes needed? A follow-up of https://github.com/apache/spark/pull/34054. The class `ColumnarBatch` needs to be extendable to support better vectorized reading in multiple data sources. For example, Iceberg needs to filter out deleted rows in a batch before Spark consumes it, to support row-level deletes (apache/iceberg#3141) in vectorized reads. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? A new test is added. Closes #34087 from flyrain/SPARK-36821. Authored-by: Yufei Gu <yufei_gu@apple.com> Signed-off-by: DB Tsai <d_tsai@apple.com>
This commit is contained in:
parent
e4e64c7552
commit
d03999ab88
|
@ -18,25 +18,21 @@ package org.apache.spark.sql.vectorized;
|
|||
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.spark.annotation.Evolving;
|
||||
import org.apache.spark.annotation.DeveloperApi;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
|
||||
import org.apache.spark.sql.types.*;
|
||||
import org.apache.spark.unsafe.types.CalendarInterval;
|
||||
import org.apache.spark.unsafe.types.UTF8String;
|
||||
|
||||
/**
|
||||
* This class wraps multiple ColumnVectors as a row-wise table. It provides a row view of this
|
||||
* batch so that Spark can access the data row by row. Instance of it is meant to be reused during
|
||||
* the entire data loading process.
|
||||
* the entire data loading process. A data source may extend this class with customized logic.
|
||||
*/
|
||||
@Evolving
|
||||
@DeveloperApi
|
||||
public class ColumnarBatch implements AutoCloseable {
|
||||
private int numRows;
|
||||
private final ColumnVector[] columns;
|
||||
protected int numRows;
|
||||
protected final ColumnVector[] columns;
|
||||
|
||||
// Staging row returned from `getRow`.
|
||||
private final ColumnarBatchRow row;
|
||||
protected final ColumnarBatchRow row;
|
||||
|
||||
/**
|
||||
* Called to close all the columns in this batch. It is not valid to access the data after
|
||||
|
@ -125,167 +121,3 @@ public class ColumnarBatch implements AutoCloseable {
|
|||
this.row = new ColumnarBatchRow(columns);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An internal class, which wraps an array of {@link ColumnVector} and provides a row view.
|
||||
*/
|
||||
class ColumnarBatchRow extends InternalRow {
|
||||
public int rowId;
|
||||
private final ColumnVector[] columns;
|
||||
|
||||
ColumnarBatchRow(ColumnVector[] columns) {
|
||||
this.columns = columns;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numFields() { return columns.length; }
|
||||
|
||||
@Override
|
||||
public InternalRow copy() {
|
||||
GenericInternalRow row = new GenericInternalRow(columns.length);
|
||||
for (int i = 0; i < numFields(); i++) {
|
||||
if (isNullAt(i)) {
|
||||
row.setNullAt(i);
|
||||
} else {
|
||||
DataType dt = columns[i].dataType();
|
||||
if (dt instanceof BooleanType) {
|
||||
row.setBoolean(i, getBoolean(i));
|
||||
} else if (dt instanceof ByteType) {
|
||||
row.setByte(i, getByte(i));
|
||||
} else if (dt instanceof ShortType) {
|
||||
row.setShort(i, getShort(i));
|
||||
} else if (dt instanceof IntegerType) {
|
||||
row.setInt(i, getInt(i));
|
||||
} else if (dt instanceof LongType) {
|
||||
row.setLong(i, getLong(i));
|
||||
} else if (dt instanceof FloatType) {
|
||||
row.setFloat(i, getFloat(i));
|
||||
} else if (dt instanceof DoubleType) {
|
||||
row.setDouble(i, getDouble(i));
|
||||
} else if (dt instanceof StringType) {
|
||||
row.update(i, getUTF8String(i).copy());
|
||||
} else if (dt instanceof BinaryType) {
|
||||
row.update(i, getBinary(i));
|
||||
} else if (dt instanceof DecimalType) {
|
||||
DecimalType t = (DecimalType)dt;
|
||||
row.setDecimal(i, getDecimal(i, t.precision(), t.scale()), t.precision());
|
||||
} else if (dt instanceof DateType) {
|
||||
row.setInt(i, getInt(i));
|
||||
} else if (dt instanceof TimestampType) {
|
||||
row.setLong(i, getLong(i));
|
||||
} else {
|
||||
throw new RuntimeException("Not implemented. " + dt);
|
||||
}
|
||||
}
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean anyNull() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNullAt(int ordinal) { return columns[ordinal].isNullAt(rowId); }
|
||||
|
||||
@Override
|
||||
public boolean getBoolean(int ordinal) { return columns[ordinal].getBoolean(rowId); }
|
||||
|
||||
@Override
|
||||
public byte getByte(int ordinal) { return columns[ordinal].getByte(rowId); }
|
||||
|
||||
@Override
|
||||
public short getShort(int ordinal) { return columns[ordinal].getShort(rowId); }
|
||||
|
||||
@Override
|
||||
public int getInt(int ordinal) { return columns[ordinal].getInt(rowId); }
|
||||
|
||||
@Override
|
||||
public long getLong(int ordinal) { return columns[ordinal].getLong(rowId); }
|
||||
|
||||
@Override
|
||||
public float getFloat(int ordinal) { return columns[ordinal].getFloat(rowId); }
|
||||
|
||||
@Override
|
||||
public double getDouble(int ordinal) { return columns[ordinal].getDouble(rowId); }
|
||||
|
||||
@Override
|
||||
public Decimal getDecimal(int ordinal, int precision, int scale) {
|
||||
return columns[ordinal].getDecimal(rowId, precision, scale);
|
||||
}
|
||||
|
||||
@Override
|
||||
public UTF8String getUTF8String(int ordinal) {
|
||||
return columns[ordinal].getUTF8String(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBinary(int ordinal) {
|
||||
return columns[ordinal].getBinary(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public CalendarInterval getInterval(int ordinal) {
|
||||
return columns[ordinal].getInterval(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnarRow getStruct(int ordinal, int numFields) {
|
||||
return columns[ordinal].getStruct(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnarArray getArray(int ordinal) {
|
||||
return columns[ordinal].getArray(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnarMap getMap(int ordinal) {
|
||||
return columns[ordinal].getMap(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object get(int ordinal, DataType dataType) {
|
||||
if (dataType instanceof BooleanType) {
|
||||
return getBoolean(ordinal);
|
||||
} else if (dataType instanceof ByteType) {
|
||||
return getByte(ordinal);
|
||||
} else if (dataType instanceof ShortType) {
|
||||
return getShort(ordinal);
|
||||
} else if (dataType instanceof IntegerType) {
|
||||
return getInt(ordinal);
|
||||
} else if (dataType instanceof LongType) {
|
||||
return getLong(ordinal);
|
||||
} else if (dataType instanceof FloatType) {
|
||||
return getFloat(ordinal);
|
||||
} else if (dataType instanceof DoubleType) {
|
||||
return getDouble(ordinal);
|
||||
} else if (dataType instanceof StringType) {
|
||||
return getUTF8String(ordinal);
|
||||
} else if (dataType instanceof BinaryType) {
|
||||
return getBinary(ordinal);
|
||||
} else if (dataType instanceof DecimalType) {
|
||||
DecimalType t = (DecimalType) dataType;
|
||||
return getDecimal(ordinal, t.precision(), t.scale());
|
||||
} else if (dataType instanceof DateType) {
|
||||
return getInt(ordinal);
|
||||
} else if (dataType instanceof TimestampType) {
|
||||
return getLong(ordinal);
|
||||
} else if (dataType instanceof ArrayType) {
|
||||
return getArray(ordinal);
|
||||
} else if (dataType instanceof StructType) {
|
||||
return getStruct(ordinal, ((StructType)dataType).fields().length);
|
||||
} else if (dataType instanceof MapType) {
|
||||
return getMap(ordinal);
|
||||
} else {
|
||||
throw new UnsupportedOperationException("Datatype not supported " + dataType);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void update(int ordinal, Object value) { throw new UnsupportedOperationException(); }
|
||||
|
||||
@Override
|
||||
public void setNullAt(int ordinal) { throw new UnsupportedOperationException(); }
|
||||
}
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.spark.sql.vectorized;
|
||||
|
||||
import org.apache.spark.annotation.DeveloperApi;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
|
||||
import org.apache.spark.sql.types.*;
|
||||
import org.apache.spark.unsafe.types.CalendarInterval;
|
||||
import org.apache.spark.unsafe.types.UTF8String;
|
||||
|
||||
/**
|
||||
* This class wraps an array of {@link ColumnVector} and provides a row view.
|
||||
*/
|
||||
@DeveloperApi
|
||||
public final class ColumnarBatchRow extends InternalRow {
|
||||
public int rowId;
|
||||
private final ColumnVector[] columns;
|
||||
|
||||
public ColumnarBatchRow(ColumnVector[] columns) {
|
||||
this.columns = columns;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numFields() { return columns.length; }
|
||||
|
||||
@Override
|
||||
public InternalRow copy() {
|
||||
GenericInternalRow row = new GenericInternalRow(columns.length);
|
||||
for (int i = 0; i < numFields(); i++) {
|
||||
if (isNullAt(i)) {
|
||||
row.setNullAt(i);
|
||||
} else {
|
||||
DataType dt = columns[i].dataType();
|
||||
if (dt instanceof BooleanType) {
|
||||
row.setBoolean(i, getBoolean(i));
|
||||
} else if (dt instanceof ByteType) {
|
||||
row.setByte(i, getByte(i));
|
||||
} else if (dt instanceof ShortType) {
|
||||
row.setShort(i, getShort(i));
|
||||
} else if (dt instanceof IntegerType) {
|
||||
row.setInt(i, getInt(i));
|
||||
} else if (dt instanceof LongType) {
|
||||
row.setLong(i, getLong(i));
|
||||
} else if (dt instanceof FloatType) {
|
||||
row.setFloat(i, getFloat(i));
|
||||
} else if (dt instanceof DoubleType) {
|
||||
row.setDouble(i, getDouble(i));
|
||||
} else if (dt instanceof StringType) {
|
||||
row.update(i, getUTF8String(i).copy());
|
||||
} else if (dt instanceof BinaryType) {
|
||||
row.update(i, getBinary(i));
|
||||
} else if (dt instanceof DecimalType) {
|
||||
DecimalType t = (DecimalType)dt;
|
||||
row.setDecimal(i, getDecimal(i, t.precision(), t.scale()), t.precision());
|
||||
} else if (dt instanceof DateType) {
|
||||
row.setInt(i, getInt(i));
|
||||
} else if (dt instanceof TimestampType) {
|
||||
row.setLong(i, getLong(i));
|
||||
} else {
|
||||
throw new RuntimeException("Not implemented. " + dt);
|
||||
}
|
||||
}
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean anyNull() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNullAt(int ordinal) { return columns[ordinal].isNullAt(rowId); }
|
||||
|
||||
@Override
|
||||
public boolean getBoolean(int ordinal) { return columns[ordinal].getBoolean(rowId); }
|
||||
|
||||
@Override
|
||||
public byte getByte(int ordinal) { return columns[ordinal].getByte(rowId); }
|
||||
|
||||
@Override
|
||||
public short getShort(int ordinal) { return columns[ordinal].getShort(rowId); }
|
||||
|
||||
@Override
|
||||
public int getInt(int ordinal) { return columns[ordinal].getInt(rowId); }
|
||||
|
||||
@Override
|
||||
public long getLong(int ordinal) { return columns[ordinal].getLong(rowId); }
|
||||
|
||||
@Override
|
||||
public float getFloat(int ordinal) { return columns[ordinal].getFloat(rowId); }
|
||||
|
||||
@Override
|
||||
public double getDouble(int ordinal) { return columns[ordinal].getDouble(rowId); }
|
||||
|
||||
@Override
|
||||
public Decimal getDecimal(int ordinal, int precision, int scale) {
|
||||
return columns[ordinal].getDecimal(rowId, precision, scale);
|
||||
}
|
||||
|
||||
@Override
|
||||
public UTF8String getUTF8String(int ordinal) {
|
||||
return columns[ordinal].getUTF8String(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBinary(int ordinal) {
|
||||
return columns[ordinal].getBinary(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public CalendarInterval getInterval(int ordinal) {
|
||||
return columns[ordinal].getInterval(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnarRow getStruct(int ordinal, int numFields) {
|
||||
return columns[ordinal].getStruct(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnarArray getArray(int ordinal) {
|
||||
return columns[ordinal].getArray(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ColumnarMap getMap(int ordinal) {
|
||||
return columns[ordinal].getMap(rowId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object get(int ordinal, DataType dataType) {
|
||||
if (dataType instanceof BooleanType) {
|
||||
return getBoolean(ordinal);
|
||||
} else if (dataType instanceof ByteType) {
|
||||
return getByte(ordinal);
|
||||
} else if (dataType instanceof ShortType) {
|
||||
return getShort(ordinal);
|
||||
} else if (dataType instanceof IntegerType) {
|
||||
return getInt(ordinal);
|
||||
} else if (dataType instanceof LongType) {
|
||||
return getLong(ordinal);
|
||||
} else if (dataType instanceof FloatType) {
|
||||
return getFloat(ordinal);
|
||||
} else if (dataType instanceof DoubleType) {
|
||||
return getDouble(ordinal);
|
||||
} else if (dataType instanceof StringType) {
|
||||
return getUTF8String(ordinal);
|
||||
} else if (dataType instanceof BinaryType) {
|
||||
return getBinary(ordinal);
|
||||
} else if (dataType instanceof DecimalType) {
|
||||
DecimalType t = (DecimalType) dataType;
|
||||
return getDecimal(ordinal, t.precision(), t.scale());
|
||||
} else if (dataType instanceof DateType) {
|
||||
return getInt(ordinal);
|
||||
} else if (dataType instanceof TimestampType) {
|
||||
return getLong(ordinal);
|
||||
} else if (dataType instanceof ArrayType) {
|
||||
return getArray(ordinal);
|
||||
} else if (dataType instanceof StructType) {
|
||||
return getStruct(ordinal, ((StructType)dataType).fields().length);
|
||||
} else if (dataType instanceof MapType) {
|
||||
return getMap(ordinal);
|
||||
} else {
|
||||
throw new UnsupportedOperationException("Datatype not supported " + dataType);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void update(int ordinal, Object value) { throw new UnsupportedOperationException(); }
|
||||
|
||||
@Override
|
||||
public void setNullAt(int ordinal) { throw new UnsupportedOperationException(); }
|
||||
}
|
|
@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.vectorized
|
|||
import java.nio.ByteBuffer
|
||||
import java.nio.ByteOrder
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.util
|
||||
import java.util.NoSuchElementException
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
|
@ -36,7 +38,7 @@ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, DateTimeUtils,
|
|||
import org.apache.spark.sql.execution.RowToColumnConverter
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.sql.util.ArrowUtils
|
||||
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
|
||||
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnarBatchRow, ColumnVector}
|
||||
import org.apache.spark.unsafe.Platform
|
||||
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
|
||||
|
||||
|
@ -1151,6 +1153,113 @@ class ColumnarBatchSuite extends SparkFunSuite {
|
|||
}}
|
||||
}
|
||||
|
||||
test("ColumnarBatch customization") {
|
||||
(MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => {
|
||||
val schema = new StructType()
|
||||
.add("intCol", IntegerType)
|
||||
.add("doubleCol", DoubleType)
|
||||
.add("intCol2", IntegerType)
|
||||
.add("string", BinaryType)
|
||||
|
||||
val capacity = 4 * 1024
|
||||
val columns = schema.fields.map { field =>
|
||||
allocate(capacity, field.dataType, memMode)
|
||||
}
|
||||
val batch = new CustomizedColumnarBatch(columns.toArray)
|
||||
assert(batch.numCols() == 4)
|
||||
assert(batch.numRows() == 0)
|
||||
assert(batch.rowIterator().hasNext == false)
|
||||
|
||||
// Add a row [1, 1.1, NULL, "Hello"]
|
||||
columns(0).putInt(0, 1)
|
||||
columns(1).putDouble(0, 1.1)
|
||||
columns(2).putNull(0)
|
||||
columns(3).putByteArray(0, "Hello".getBytes(StandardCharsets.UTF_8))
|
||||
batch.setNumRows(1)
|
||||
|
||||
// Verify the results of the row.
|
||||
assert(batch.numCols() == 4)
|
||||
assert(batch.numRows() == 1)
|
||||
// rowId 0 is skipped
|
||||
assert(batch.rowIterator().hasNext == false)
|
||||
|
||||
// Reset and add 3 rows
|
||||
columns.foreach(_.reset())
|
||||
// Add rows [NULL, 2.2, 2, "abc"], [3, NULL, 3, ""], [4, 4.4, 4, "world"]
|
||||
columns(0).putNull(0)
|
||||
columns(1).putDouble(0, 2.2)
|
||||
columns(2).putInt(0, 2)
|
||||
columns(3).putByteArray(0, "abc".getBytes(StandardCharsets.UTF_8))
|
||||
|
||||
columns(0).putInt(1, 3)
|
||||
columns(1).putNull(1)
|
||||
columns(2).putInt(1, 3)
|
||||
columns(3).putByteArray(1, "".getBytes(StandardCharsets.UTF_8))
|
||||
|
||||
columns(0).putInt(2, 4)
|
||||
columns(1).putDouble(2, 4.4)
|
||||
columns(2).putInt(2, 4)
|
||||
columns(3).putByteArray(2, "world".getBytes(StandardCharsets.UTF_8))
|
||||
batch.setNumRows(3)
|
||||
|
||||
def rowEquals(x: InternalRow, y: Row): Unit = {
|
||||
assert(x.isNullAt(0) == y.isNullAt(0))
|
||||
if (!x.isNullAt(0)) assert(x.getInt(0) == y.getInt(0))
|
||||
|
||||
assert(x.isNullAt(1) == y.isNullAt(1))
|
||||
if (!x.isNullAt(1)) assert(x.getDouble(1) == y.getDouble(1))
|
||||
|
||||
assert(x.isNullAt(2) == y.isNullAt(2))
|
||||
if (!x.isNullAt(2)) assert(x.getInt(2) == y.getInt(2))
|
||||
|
||||
assert(x.isNullAt(3) == y.isNullAt(3))
|
||||
if (!x.isNullAt(3)) assert(x.getString(3) == y.getString(3))
|
||||
}
|
||||
|
||||
// Verify
|
||||
assert(batch.numRows() == 3)
|
||||
val it2 = batch.rowIterator()
|
||||
// Only second row is valid
|
||||
rowEquals(it2.next(), Row(3, null, 3, ""))
|
||||
assert(!it2.hasNext)
|
||||
|
||||
batch.close()
|
||||
}}
|
||||
}
|
||||
|
||||
class CustomizedColumnarBatch(columns: Array[ColumnVector]) extends ColumnarBatch(columns) {
|
||||
val skipRowIds = List(0, 2)
|
||||
override def rowIterator(): util.Iterator[InternalRow] = {
|
||||
val maxRows: Int = numRows
|
||||
val row = new ColumnarBatchRow(columns)
|
||||
new util.Iterator[InternalRow]() {
|
||||
var rowId = 0
|
||||
|
||||
override def hasNext: Boolean = {
|
||||
while (skipRowIds.contains(rowId)) {
|
||||
rowId += 1
|
||||
}
|
||||
rowId < maxRows
|
||||
}
|
||||
|
||||
override def next: InternalRow = {
|
||||
while (skipRowIds.contains(rowId)) {
|
||||
rowId += 1
|
||||
}
|
||||
|
||||
if (rowId >= maxRows) throw new NoSuchElementException
|
||||
row.rowId = rowId
|
||||
rowId += 1
|
||||
row
|
||||
}
|
||||
|
||||
override def remove(): Unit = {
|
||||
throw new UnsupportedOperationException
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private def doubleEquals(d1: Double, d2: Double): Boolean = {
|
||||
if (d1.isNaN && d2.isNaN) {
|
||||
true
|
||||
|
|
Loading…
Reference in a new issue