[SPARK-31304][ML][EXAMPLES] Add examples for ml.stat.ANOVATest

### What changes were proposed in this pull request?

Add examples for ml.stat.ANOVATest in Python, Java, and Scala.

### Why are the changes needed?

Improve ML example

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Manually ran the examples.

Closes #28073 from kevinyu98/add-ANOVA-example.

Authored-by: Qianyang Yu <qyu@us.ibm.com>
Signed-off-by: Sean Owen <srowen@gmail.com>
This commit is contained in:
Qianyang Yu 2020-03-31 16:33:26 -05:00 committed by Sean Owen
parent 34c7ec8e0c
commit e65c21e093
3 changed files with 190 additions and 0 deletions

View file

@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
import org.apache.spark.sql.SparkSession;
// $example on$
import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.stat.ANOVATest;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;
// $example off$
/**
 * Runs an ANOVA test over a small in-memory dataset and prints the result.
 *
 * <p>The dataset has six rows, each holding a double label and a six-element
 * dense feature vector. The first row of the DataFrame returned by
 * {@code ANOVATest.test} is read and three of its fields are printed.
 *
 * <p>Run with
 * <pre>
 * bin/run-example ml.JavaANOVATestExample
 * </pre>
 */
public class JavaANOVATestExample {
  public static void main(String[] args) {
    SparkSession spark =
      SparkSession.builder().appName("JavaANOVATestExample").getOrCreate();

    // $example on$
    // Column layout of the input: a non-nullable double label followed by a
    // non-nullable ML vector of features.
    StructField labelField =
      DataTypes.createStructField("label", DataTypes.DoubleType, false);
    StructField featuresField =
      DataTypes.createStructField("features", new VectorUDT(), false);
    StructType schema =
      DataTypes.createStructType(new StructField[]{labelField, featuresField});

    // One Row per observation: (label, features).
    List<Row> observations = Arrays.asList(
      RowFactory.create(3.0, Vectors.dense(1.7, 4.4, 7.6, 5.8, 9.6, 2.3)),
      RowFactory.create(2.0, Vectors.dense(8.8, 7.3, 5.7, 7.3, 2.2, 4.1)),
      RowFactory.create(1.0, Vectors.dense(1.2, 9.5, 2.5, 3.1, 8.7, 2.5)),
      RowFactory.create(2.0, Vectors.dense(3.7, 9.2, 6.1, 4.1, 7.5, 3.8)),
      RowFactory.create(4.0, Vectors.dense(8.9, 5.2, 7.8, 8.3, 5.2, 3.0)),
      RowFactory.create(4.0, Vectors.dense(7.9, 8.5, 9.2, 4.0, 9.4, 2.1))
    );

    Dataset<Row> df = spark.createDataFrame(observations, schema);

    // Only the first row of the test output is inspected; its fields are read
    // by position (0, 1, 2) and printed under the labels below.
    Row result = ANOVATest.test(df, "features", "label").head();
    System.out.println("pValues: " + result.get(0).toString());
    System.out.println("degreesOfFreedom: " + result.getList(1).toString());
    System.out.println("fValues: " + result.get(2).toString());
    // $example off$

    spark.stop();
  }
}

View file

@ -0,0 +1,52 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
An example for ANOVA testing.
Run with:
bin/spark-submit examples/src/main/python/ml/anova_test_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ANOVATest
# $example off$
if __name__ == "__main__":
    # Obtain (or reuse) a session named after this example.
    spark = (SparkSession
             .builder
             .appName("ANOVATestExample")
             .getOrCreate())

    # $example on$
    # Each observation is (label, six raw feature values); the values are
    # wrapped into dense ML vectors before the DataFrame is built.
    observations = [
        (3.0, [1.7, 4.4, 7.6, 5.8, 9.6, 2.3]),
        (2.0, [8.8, 7.3, 5.7, 7.3, 2.2, 4.1]),
        (1.0, [1.2, 9.5, 2.5, 3.1, 8.7, 2.5]),
        (2.0, [3.7, 9.2, 6.1, 4.1, 7.5, 3.8]),
        (4.0, [8.9, 5.2, 7.8, 8.3, 5.2, 3.0]),
        (4.0, [7.9, 8.5, 9.2, 4.0, 9.4, 2.1]),
    ]
    rows = [(label, Vectors.dense(values)) for label, values in observations]
    df = spark.createDataFrame(rows, ["label", "features"])

    # Only the first row of the test output is inspected.
    result = ANOVATest.test(df, "features", "label").head()
    print("pValues: " + str(result.pValues))
    print("degreesOfFreedom: " + str(result.degreesOfFreedom))
    print("fValues: " + str(result.fValues))
    # $example off$

    spark.stop()

View file

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ANOVATest
// $example off$
import org.apache.spark.sql.SparkSession
/**
 * An example for ANOVA testing.
 *
 * Builds a six-row DataFrame of (label, features) pairs, runs `ANOVATest.test` on it, and
 * prints three fields of the first result row, each as `name = value`.
 *
 * Run with
 * {{{
 * bin/run-example ml.ANOVATestExample
 * }}}
 */
object ANOVATestExample {

  /**
   * Entry point of the example.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ANOVATestExample")
      .getOrCreate()
    // Needed for the `toDF` conversion on the local Seq below.
    import spark.implicits._

    // $example on$
    // Each tuple is (label, dense feature vector with six entries).
    val data = Seq(
      (3.0, Vectors.dense(1.7, 4.4, 7.6, 5.8, 9.6, 2.3)),
      (2.0, Vectors.dense(8.8, 7.3, 5.7, 7.3, 2.2, 4.1)),
      (1.0, Vectors.dense(1.2, 9.5, 2.5, 3.1, 8.7, 2.5)),
      (2.0, Vectors.dense(3.7, 9.2, 6.1, 4.1, 7.5, 3.8)),
      (4.0, Vectors.dense(8.9, 5.2, 7.8, 8.3, 5.2, 3.0)),
      (4.0, Vectors.dense(7.9, 8.5, 9.2, 4.0, 9.4, 2.1))
    )

    val df = data.toDF("label", "features")
    // Only the first row of the test output is inspected; fields are read by position.
    val anova = ANOVATest.test(df, "features", "label").head
    println(s"pValues = ${anova.getAs[Vector](0)}")
    // NOTE(review): the element type given to getSeq is not checked here because the values
    // are only joined into a string; confirm it against the ANOVATest result schema.
    println(s"degreesOfFreedom = ${anova.getSeq[Int](1).mkString("[", ",", "]")}")
    println(s"fValues = ${anova.getAs[Vector](2)}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println