[SPARK-10835][ML] Word2Vec should accept non-nullable string arrays, in addition to the existing nullable string arrays
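## What changes were proposed in this pull request?

To match Tokenizer and for compatibility with Word2Vec, output a nullable string array type in NGram. (Note: the diff shown below achieves this compatibility on the Word2Vec side instead, by making its input schema check accept both nullable and non-nullable string arrays.)

## How was this patch tested?

Jenkins tests.

Author: Sean Owen <sowen@cloudera.com>

Closes #15179 from srowen/SPARK-10835.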
This commit is contained in:
parent
7c382524a9
commit
f3fe55439e
|
@ -108,7 +108,8 @@ private[feature] trait Word2VecBase extends Params
|
|||
* Validate and transform the input schema.
|
||||
*/
|
||||
protected def validateAndTransformSchema(schema: StructType): StructType = {
|
||||
// NOTE(review): this is a flattened diff view. The hunk header (-108,7 +108,8) says one
// line was dropped and two were added, so the single-type check directly below appears to
// be the pre-change line, which accepted only ArrayType(StringType, true) for inputCol.
SchemaUtils.checkColumnType(schema, $(inputCol), new ArrayType(StringType, true))
|
||||
// Replacement check: inputCol may be a string array with either value of ArrayType's
// second argument (presumably containsNull -- confirm against the ArrayType API).
val typeCandidates = List(new ArrayType(StringType, true), new ArrayType(StringType, false))
|
||||
SchemaUtils.checkColumnTypes(schema, $(inputCol), typeCandidates)
|
||||
// Result of the method: whatever appendColumn returns for outputCol typed as VectorUDT
// (presumably the input schema with that column added -- verify in SchemaUtils).
SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -207,5 +207,26 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
|
|||
val newInstance = testDefaultReadWrite(instance)
|
||||
assert(newInstance.getVectors.collect() === instance.getVectors.collect())
|
||||
}
|
||||
|
||||
// Smoke test: fits Word2Vec on the "ngrams" column produced by NGram and then transforms the
// same DataFrame. Per the test name, that column is expected to be a non-nullable string
// array; the test itself never inspects the schema.
test("Word2Vec works with input that is non-nullable (NGram)") {
|
||||
// Bind the session to a local val so that its implicits can be imported (used by toDF below).
val spark = this.spark
|
||||
import spark.implicits._
|
||||
|
||||
// Two identical documents, each split on single spaces into an array of tokens.
val sentence = "a q s t q s t b b b s t m s t m q "
|
||||
val docDF = sc.parallelize(Seq(sentence, sentence)).map(_.split(" ")).toDF("text")
|
||||
|
||||
// NGram with n = 2 reads "text" and writes its output to "ngrams".
val ngram = new NGram().setN(2).setInputCol("text").setOutputCol("ngrams")
|
||||
val ngramDF = ngram.transform(docDF)
|
||||
|
||||
// Fit Word2Vec (vector size 2) directly on the NGram output column.
val model = new Word2Vec()
|
||||
.setVectorSize(2)
|
||||
.setInputCol("ngrams")
|
||||
.setOutputCol("result")
|
||||
.fit(ngramDF)
|
||||
|
||||
// Just test that this transformation succeeds
// (no assertions are made on the collected rows; the result of collect() is discarded).
|
||||
model.transform(ngramDF).collect()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue