[SPARK-10835][ML] Word2Vec should accept string arrays with non-nullable elements, in addition to the existing string arrays with nullable elements

## What changes were proposed in this pull request?

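Word2Vec previously accepted only an input column of string arrays with nullable elements (the type Tokenizer produces). Accept string arrays with non-nullable elements as well, so that Word2Vec is compatible with the output of NGram.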

## How was this patch tested?

Jenkins tests.

Author: Sean Owen <sowen@cloudera.com>

Closes #15179 from srowen/SPARK-10835.
This commit is contained in:
Sean Owen 2016-09-24 08:06:41 +01:00
parent 7c382524a9
commit f3fe55439e
No known key found for this signature in database
GPG key ID: BEB3956D6717BDDC
2 changed files with 23 additions and 1 deletion

View file

@ -108,7 +108,8 @@ private[feature] trait Word2VecBase extends Params
* Validate and transform the input schema.
*/
protected def validateAndTransformSchema(schema: StructType): StructType = {
// Diff note: the hunk header (-108,7 +108,8) means this span holds one removed line and
// two added lines, shown here without their -/+ markers.
// Evidently the removed pre-change line: it names a single accepted type for the input
// column, a string array constructed with `true` as its second argument.
SchemaUtils.checkColumnType(schema, $(inputCol), new ArrayType(StringType, true))
// Evidently the two added lines: both string-array variants (second argument `true` and
// `false`) are listed as candidate types and passed to the plural checkColumnTypes.
val typeCandidates = List(new ArrayType(StringType, true), new ArrayType(StringType, false))
SchemaUtils.checkColumnTypes(schema, $(inputCol), typeCandidates)
// The method's result is whatever appendColumn yields for the output column typed as
// VectorUDT. NOTE(review): presumably a new StructType with the column added — confirm.
SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT)
}
}

View file

@ -207,5 +207,26 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
val newInstance = testDefaultReadWrite(instance)
assert(newInstance.getVectors.collect() === instance.getVectors.collect())
}
test("Word2Vec works with input that is non-nullable (NGram)") {
  // A stable identifier is required before its implicits (used by toDF) can be imported.
  val spark = this.spark
  import spark.implicits._

  // Two identical documents, each split on single spaces into an array of tokens.
  val text = "a q s t q s t b b b s t m s t m q "
  val tokenized = sc.parallelize(Seq(text, text)).map(_.split(" "))

  // Feed the tokens through NGram so that Word2Vec is given NGram's output column type.
  val bigrams = new NGram()
    .setN(2)
    .setInputCol("text")
    .setOutputCol("ngrams")
    .transform(tokenized.toDF("text"))

  val word2Vec = new Word2Vec()
    .setVectorSize(2)
    .setInputCol("ngrams")
    .setOutputCol("result")
  val fitted = word2Vec.fit(bigrams)

  // Passing means no exception was thrown; the resulting vectors are not inspected.
  fitted.transform(bigrams).collect()
}
}