[SPARK-26151][SQL] Return partial results for bad CSV records
## What changes were proposed in this pull request? In the PR, I propose to change behaviour of `UnivocityParser` and `FailureSafeParser`, and return all fields that were parsed and converted to expected types successfully instead of just returning a row with all `null`s for a bad input in the `PERMISSIVE` mode. For example, for CSV line `0,2013-111-11 12:13:14` and DDL schema `a int, b timestamp`, new result is `Row(0, null)`. ## How was this patch tested? It was checked by existing tests from `CsvSuite` and `CsvFunctionsSuite`. Closes #23120 from MaxGekk/failuresafe-partial-result. Authored-by: Maxim Gekk <max.gekk@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
bfa3d32f77
commit
11e5f1bcd4
|
@ -243,21 +243,24 @@ class UnivocityParser(
|
||||||
() => getPartialResult(),
|
() => getPartialResult(),
|
||||||
new RuntimeException("Malformed CSV record"))
|
new RuntimeException("Malformed CSV record"))
|
||||||
} else {
|
} else {
|
||||||
try {
|
// When the length of the returned tokens is identical to the length of the parsed schema,
|
||||||
// When the length of the returned tokens is identical to the length of the parsed schema,
|
// we just need to convert the tokens that correspond to the required columns.
|
||||||
// we just need to convert the tokens that correspond to the required columns.
|
var badRecordException: Option[Throwable] = None
|
||||||
var i = 0
|
var i = 0
|
||||||
while (i < requiredSchema.length) {
|
while (i < requiredSchema.length) {
|
||||||
|
try {
|
||||||
row(i) = valueConverters(i).apply(getToken(tokens, i))
|
row(i) = valueConverters(i).apply(getToken(tokens, i))
|
||||||
i += 1
|
} catch {
|
||||||
|
case NonFatal(e) =>
|
||||||
|
badRecordException = badRecordException.orElse(Some(e))
|
||||||
}
|
}
|
||||||
|
i += 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if (badRecordException.isEmpty) {
|
||||||
row
|
row
|
||||||
} catch {
|
} else {
|
||||||
case NonFatal(e) =>
|
throw BadRecordException(() => getCurrentInput, () => Some(row), badRecordException.get)
|
||||||
// For corrupted records with the number of tokens same as the schema,
|
|
||||||
// CSV reader doesn't support partial results. All fields other than the field
|
|
||||||
// configured by `columnNameOfCorruptRecord` are set to `null`.
|
|
||||||
throw BadRecordException(() => getCurrentInput, () => None, e)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,26 +33,21 @@ class FailureSafeParser[IN](
|
||||||
private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord)
|
private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord)
|
||||||
private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord))
|
private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord))
|
||||||
private val resultRow = new GenericInternalRow(schema.length)
|
private val resultRow = new GenericInternalRow(schema.length)
|
||||||
private val nullResult = new GenericInternalRow(schema.length)
|
|
||||||
|
|
||||||
// This function takes 2 parameters: an optional partial result, and the bad record. If the given
|
// This function takes 2 parameters: an optional partial result, and the bad record. If the given
|
||||||
// schema doesn't contain a field for corrupted record, we just return the partial result or a
|
// schema doesn't contain a field for corrupted record, we just return the partial result or a
|
||||||
// row with all fields null. If the given schema contains a field for corrupted record, we will
|
// row with all fields null. If the given schema contains a field for corrupted record, we will
|
||||||
// set the bad record to this field, and set other fields according to the partial result or null.
|
// set the bad record to this field, and set other fields according to the partial result or null.
|
||||||
private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = {
|
private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = {
|
||||||
if (corruptFieldIndex.isDefined) {
|
(row, badRecord) => {
|
||||||
(row, badRecord) => {
|
var i = 0
|
||||||
var i = 0
|
while (i < actualSchema.length) {
|
||||||
while (i < actualSchema.length) {
|
val from = actualSchema(i)
|
||||||
val from = actualSchema(i)
|
resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull
|
||||||
resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull
|
i += 1
|
||||||
i += 1
|
|
||||||
}
|
|
||||||
resultRow(corruptFieldIndex.get) = badRecord()
|
|
||||||
resultRow
|
|
||||||
}
|
}
|
||||||
} else {
|
corruptFieldIndex.foreach(index => resultRow(index) = badRecord())
|
||||||
(row, _) => row.getOrElse(nullResult)
|
resultRow
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,7 @@ class CsvFunctionsSuite extends QueryTest with SharedSQLContext {
|
||||||
"mode" -> "Permissive", "columnNameOfCorruptRecord" -> columnNameOfCorruptRecord)))
|
"mode" -> "Permissive", "columnNameOfCorruptRecord" -> columnNameOfCorruptRecord)))
|
||||||
|
|
||||||
checkAnswer(df2, Seq(
|
checkAnswer(df2, Seq(
|
||||||
Row(Row(null, null, "0,2013-111-11 12:13:14")),
|
Row(Row(0, null, "0,2013-111-11 12:13:14")),
|
||||||
Row(Row(1, java.sql.Date.valueOf("1983-08-04"), null))))
|
Row(Row(1, java.sql.Date.valueOf("1983-08-04"), null))))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1116,7 +1116,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
|
||||||
.schema(schema)
|
.schema(schema)
|
||||||
.csv(testFile(valueMalformedFile))
|
.csv(testFile(valueMalformedFile))
|
||||||
checkAnswer(df1,
|
checkAnswer(df1,
|
||||||
Row(null, null) ::
|
Row(0, null) ::
|
||||||
Row(1, java.sql.Date.valueOf("1983-08-04")) ::
|
Row(1, java.sql.Date.valueOf("1983-08-04")) ::
|
||||||
Nil)
|
Nil)
|
||||||
|
|
||||||
|
@ -1131,7 +1131,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
|
||||||
.schema(schemaWithCorrField1)
|
.schema(schemaWithCorrField1)
|
||||||
.csv(testFile(valueMalformedFile))
|
.csv(testFile(valueMalformedFile))
|
||||||
checkAnswer(df2,
|
checkAnswer(df2,
|
||||||
Row(null, null, "0,2013-111-11 12:13:14") ::
|
Row(0, null, "0,2013-111-11 12:13:14") ::
|
||||||
Row(1, java.sql.Date.valueOf("1983-08-04"), null) ::
|
Row(1, java.sql.Date.valueOf("1983-08-04"), null) ::
|
||||||
Nil)
|
Nil)
|
||||||
|
|
||||||
|
@ -1148,7 +1148,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
|
||||||
.schema(schemaWithCorrField2)
|
.schema(schemaWithCorrField2)
|
||||||
.csv(testFile(valueMalformedFile))
|
.csv(testFile(valueMalformedFile))
|
||||||
checkAnswer(df3,
|
checkAnswer(df3,
|
||||||
Row(null, "0,2013-111-11 12:13:14", null) ::
|
Row(0, "0,2013-111-11 12:13:14", null) ::
|
||||||
Row(1, null, java.sql.Date.valueOf("1983-08-04")) ::
|
Row(1, null, java.sql.Date.valueOf("1983-08-04")) ::
|
||||||
Nil)
|
Nil)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue