[SPARK-14103][SQL] Parse unescaped quotes in CSV data source.

## What changes were proposed in this pull request?

This PR fixes a problem with parsing unescaped quotes in input data. For example, currently the data below:

```
"a"b,ccc,ddd
e,f,g
```

produces the following output:

- **Before**

```bash
["a"b,ccc,ddd[\n]e,f,g]  <- as a value.
```

- **After**

```bash
["a"b], [ccc], [ddd]
[e], [f], [g]
```

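This PR bumps the Univocity parser's version from `1.5.6` to `2.0.2`, where the underlying issue was fixed (see https://github.com/uniVocity/univocity-parsers/issues/60), and enables `setParseUnescapedQuotesUntilDelimiter(true)` in the CSV reader settings.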

## How was this patch tested?

Unit tests in `CSVSuite` and `sbt/sbt scalastyle`.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #12226 from HyukjinKwon/SPARK-14103-quote.
This commit is contained in:
hyukjinkwon 2016-04-08 00:28:59 -07:00 committed by Reynold Xin
parent 04fb7dba70
commit 725b860e2b
9 changed files with 21 additions and 6 deletions

View file

@ -175,7 +175,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
univocity-parsers-2.0.2.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar

View file

@ -166,7 +166,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
univocity-parsers-2.0.2.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar

View file

@ -167,7 +167,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
univocity-parsers-2.0.2.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar

View file

@ -173,7 +173,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
univocity-parsers-2.0.2.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
xmlenc-0.52.jar

View file

@ -174,7 +174,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
univocity-parsers-2.0.2.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
xmlenc-0.52.jar

View file

@ -39,7 +39,7 @@
<dependency>
<groupId>com.univocity</groupId>
<artifactId>univocity-parsers</artifactId>
<version>1.5.6</version>
<version>2.0.2</version>
<type>jar</type>
</dependency>
<dependency>

View file

@ -47,6 +47,7 @@ private[sql] abstract class CsvReader(params: CSVOptions, headers: Seq[String])
settings.setMaxColumns(params.maxColumns)
settings.setNullValue(params.nullValue)
settings.setMaxCharsPerColumn(params.maxCharsPerColumn)
settings.setParseUnescapedQuotesUntilDelimiter(true)
if (headers != null) settings.setHeaders(headers: _*)
new CsvParser(settings)

View file

@ -0,0 +1,2 @@
"a"b,ccc,ddd
ab,cc"c,ddd"
Can't render this file because it contains an unexpected character in line 1 and column 3.

View file

@ -45,6 +45,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
private val disableCommentsFile = "disable_comments.csv"
private val boolFile = "bool.csv"
private val simpleSparseFile = "simple_sparse.csv"
private val unescapedQuotesFile = "unescaped-quotes.csv"
private def testFile(fileName: String): String = {
Thread.currentThread().getContextClassLoader.getResource(fileName).toString
@ -140,6 +141,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
verifyCars(cars, withHeader = true)
}
// Loads the unescaped-quotes fixture through the CSV data source with
// maxCharsPerColumn set to 4 and checks that both rows come back as three
// separate fields, with the quote characters preserved inside the values.
test("parse unescaped quotes with maxCharsPerColumn") {
  // Expected fields; every one of them is at most four characters long.
  val expectedRows = Seq(
    Row("\"a\"b", "ccc", "ddd"),
    Row("ab", "cc\"c", "ddd\""))

  val csvReader = sqlContext.read.format("csv").option("maxCharsPerColumn", "4")
  val rows = csvReader.load(testFile(unescapedQuotesFile))

  checkAnswer(rows, expectedRows)
}
test("bad encoding name") {
val exception = intercept[UnsupportedCharsetException] {
sqlContext