[SPARK-12874][ML] ML StringIndexer does not protect itself from column name duplication

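## What changes were proposed in this pull request? ML StringIndexer does not protect itself from column name duplication. This patch makes `StringIndexerModel.transform` call `validateAndTransformSchema` on the input dataset's schema, so that transforming a dataset which already contains the output column fails instead of silently producing a duplicate column. The way `StringIndexer` and `StringIndexerModel` validate a schema could still be improved; however, that would be better addressed in a separate issue. ## How was this patch tested? Unit test. Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com> Closes #11370 from yu-iskw/SPARK-12874.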
2016-02-25 13:21:33 -08:00 · 2016-02-25 13:21:33 -08:00 · 14e2700de2
parent fb8bb04766
commit 14e2700de2
2 changed files with 12 additions and 0 deletions
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -150,6 +150,7 @@ class StringIndexerModel (
        "Skip StringIndexerModel.")
      return dataset
    }
+    validateAndTransformSchema(dataset.schema)

    val indexer = udf { label: String =>
      if (labelToIndex.contains(label)) {
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -118,6 +118,17 @@ class StringIndexerSuite
    assert(indexerModel.transform(df).eq(df))
  }

+  test("StringIndexerModel can't overwrite output column") {
+    val df = sqlContext.createDataFrame(Seq((1, 2), (3, 4))).toDF("input", "output")
+    val indexer = new StringIndexer()
+      .setInputCol("input")
+      .setOutputCol("output")
+      .fit(df)
+    intercept[IllegalArgumentException] {
+      indexer.transform(df)
+    }
+  }
+
  test("StringIndexer read/write") {
    val t = new StringIndexer()
      .setInputCol("myInputCol")