[SPARK-14103][SQL] Parse unescaped quotes in CSV data source.
## What changes were proposed in this pull request? This PR fixes the parsing of unescaped quotes in input data for the CSV data source. For example, the data below: ``` "a"b,ccc,ddd e,f,g ``` is parsed as shown below: - **Before** ```bash ["a"b,ccc,ddd[\n]e,f,g] <- parsed as a single value. ``` - **After** ```bash ["a"b], [ccc], [ddd] [e], [f], [g] ``` This PR bumps the Univocity parser's version to `2.0.2`, in which this problem was fixed (see https://github.com/uniVocity/univocity-parsers/issues/60). ## How was this patch tested? Unit tests in `CSVSuite` and `sbt/sbt scalastyle`. Author: hyukjinkwon <gurwls223@gmail.com> Closes #12226 from HyukjinKwon/SPARK-14103-quote.
This commit is contained in:
parent
04fb7dba70
commit
725b860e2b
|
@ -175,7 +175,7 @@ stax-api-1.0.1.jar
|
|||
stream-2.7.0.jar
|
||||
stringtemplate-3.2.1.jar
|
||||
super-csv-2.2.0.jar
|
||||
univocity-parsers-1.5.6.jar
|
||||
univocity-parsers-2.0.2.jar
|
||||
xbean-asm5-shaded-4.4.jar
|
||||
xmlenc-0.52.jar
|
||||
xz-1.0.jar
|
||||
|
|
|
@ -166,7 +166,7 @@ stax-api-1.0.1.jar
|
|||
stream-2.7.0.jar
|
||||
stringtemplate-3.2.1.jar
|
||||
super-csv-2.2.0.jar
|
||||
univocity-parsers-1.5.6.jar
|
||||
univocity-parsers-2.0.2.jar
|
||||
xbean-asm5-shaded-4.4.jar
|
||||
xmlenc-0.52.jar
|
||||
xz-1.0.jar
|
||||
|
|
|
@ -167,7 +167,7 @@ stax-api-1.0.1.jar
|
|||
stream-2.7.0.jar
|
||||
stringtemplate-3.2.1.jar
|
||||
super-csv-2.2.0.jar
|
||||
univocity-parsers-1.5.6.jar
|
||||
univocity-parsers-2.0.2.jar
|
||||
xbean-asm5-shaded-4.4.jar
|
||||
xmlenc-0.52.jar
|
||||
xz-1.0.jar
|
||||
|
|
|
@ -173,7 +173,7 @@ stax-api-1.0.1.jar
|
|||
stream-2.7.0.jar
|
||||
stringtemplate-3.2.1.jar
|
||||
super-csv-2.2.0.jar
|
||||
univocity-parsers-1.5.6.jar
|
||||
univocity-parsers-2.0.2.jar
|
||||
xbean-asm5-shaded-4.4.jar
|
||||
xercesImpl-2.9.1.jar
|
||||
xmlenc-0.52.jar
|
||||
|
|
|
@ -174,7 +174,7 @@ stax-api-1.0.1.jar
|
|||
stream-2.7.0.jar
|
||||
stringtemplate-3.2.1.jar
|
||||
super-csv-2.2.0.jar
|
||||
univocity-parsers-1.5.6.jar
|
||||
univocity-parsers-2.0.2.jar
|
||||
xbean-asm5-shaded-4.4.jar
|
||||
xercesImpl-2.9.1.jar
|
||||
xmlenc-0.52.jar
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
<dependency>
|
||||
<groupId>com.univocity</groupId>
|
||||
<artifactId>univocity-parsers</artifactId>
|
||||
<version>1.5.6</version>
|
||||
<version>2.0.2</version>
|
||||
<type>jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
|
|
@ -47,6 +47,7 @@ private[sql] abstract class CsvReader(params: CSVOptions, headers: Seq[String])
|
|||
settings.setMaxColumns(params.maxColumns)
|
||||
settings.setNullValue(params.nullValue)
|
||||
settings.setMaxCharsPerColumn(params.maxCharsPerColumn)
|
||||
settings.setParseUnescapedQuotesUntilDelimiter(true)
|
||||
if (headers != null) settings.setHeaders(headers: _*)
|
||||
|
||||
new CsvParser(settings)
|
||||
|
|
2
sql/core/src/test/resources/unescaped-quotes.csv
Normal file
2
sql/core/src/test/resources/unescaped-quotes.csv
Normal file
|
@ -0,0 +1,2 @@
|
|||
"a"b,ccc,ddd
|
||||
ab,cc"c,ddd"
|
Can't render this file because it contains an unexpected character in line 1 and column 3.
|
|
@ -45,6 +45,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
|
|||
private val disableCommentsFile = "disable_comments.csv"
|
||||
private val boolFile = "bool.csv"
|
||||
private val simpleSparseFile = "simple_sparse.csv"
|
||||
private val unescapedQuotesFile = "unescaped-quotes.csv"
|
||||
|
||||
private def testFile(fileName: String): String = {
|
||||
Thread.currentThread().getContextClassLoader.getResource(fileName).toString
|
||||
|
@ -140,6 +141,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
|
|||
verifyCars(cars, withHeader = true)
|
||||
}
|
||||
|
||||
test("parse unescaped quotes with maxCharsPerColumn") {
|
||||
val rows = sqlContext.read
|
||||
.format("csv")
|
||||
.option("maxCharsPerColumn", "4")
|
||||
.load(testFile(unescapedQuotesFile))
|
||||
|
||||
val expectedRows = Seq(Row("\"a\"b", "ccc", "ddd"), Row("ab", "cc\"c", "ddd\""))
|
||||
|
||||
checkAnswer(rows, expectedRows)
|
||||
}
|
||||
|
||||
test("bad encoding name") {
|
||||
val exception = intercept[UnsupportedCharsetException] {
|
||||
sqlContext
|
||||
|
|
Loading…
Reference in a new issue