From 43ef1e52bfe359f0f051a607a8dc77cc3b269508 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 21 Mar 2016 17:42:30 -0700 Subject: [PATCH] Revert "[SPARK-13019][DOCS] Replace example code in mllib-statistics.md using include_example" This reverts commit 1af8de200c4d3357bcb09e7bbc6deece00e885f2. --- docs/mllib-statistics.md | 438 +++++++++++++++--- .../mllib/JavaCorrelationsExample.java | 70 --- .../mllib/JavaHypothesisTestingExample.java | 84 ---- ...isTestingKolmogorovSmirnovTestExample.java | 49 -- .../JavaKernelDensityEstimationExample.java | 53 --- .../mllib/JavaStratifiedSamplingExample.java | 75 --- .../mllib/JavaSummaryStatisticsExample.java | 56 --- .../main/python/mllib/correlations_example.py | 48 -- .../mllib/hypothesis_testing_example.py | 65 --- ...testing_kolmogorov_smirnov_test_example.py | 40 -- .../kernel_density_estimation_example.py | 44 -- .../mllib/stratified_sampling_example.py | 38 -- .../mllib/summary_statistics_example.py | 42 -- .../examples/mllib/CorrelationsExample.scala | 62 --- .../mllib/HypothesisTestingExample.scala | 80 ---- ...sTestingKolmogorovSmirnovTestExample.scala | 54 --- .../KernelDensityEstimationExample.scala | 54 --- .../mllib/StratifiedSamplingExample.scala | 53 --- .../mllib/SummaryStatisticsExample.scala | 53 --- 19 files changed, 382 insertions(+), 1076 deletions(-) delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java delete mode 100644 examples/src/main/python/mllib/correlations_example.py delete mode 100644 examples/src/main/python/mllib/hypothesis_testing_example.py delete mode 100644 examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py delete mode 100644 examples/src/main/python/mllib/kernel_density_estimation_example.py delete mode 100644 examples/src/main/python/mllib/stratified_sampling_example.py delete mode 100644 examples/src/main/python/mllib/summary_statistics_example.py delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 02b81f153b..b773031bc7 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -10,24 +10,24 @@ displayTitle: Basic Statistics - spark.mllib `\[ \newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} +\newcommand{\E}{\mathbb{E}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} 
\newcommand{\wv}{\mathbf{w}} \newcommand{\av}{\mathbf{\alpha}} \newcommand{\bv}{\mathbf{b}} \newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} \newcommand{\zero}{\mathbf{0}} \]` -## Summary statistics +## Summary statistics -We provide column summary statistics for `RDD[Vector]` through the function `colStats` +We provide column summary statistics for `RDD[Vector]` through the function `colStats` available in `Statistics`.
@@ -40,7 +40,19 @@ total count.

Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.

-{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
+import org.apache.spark.rdd.RDD
+
+val observations: RDD[Vector] = ... // an RDD of Vectors
+
+// Compute column summary statistics.
+val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
+println(summary.mean) // a dense vector containing the mean value for each column
+println(summary.variance) // column-wise variance
+println(summary.numNonzeros) // number of nonzeros in each column
+
+{% endhighlight %}
@@ -52,7 +64,24 @@ total count.

Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.

-{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
+import org.apache.spark.mllib.stat.Statistics;
+
+JavaSparkContext jsc = ...
+
+JavaRDD<Vector> mat = ... // an RDD of Vectors
+
+// Compute column summary statistics.
+MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
+System.out.println(summary.mean()); // a dense vector containing the mean value for each column
+System.out.println(summary.variance()); // column-wise variance
+System.out.println(summary.numNonzeros()); // number of nonzeros in each column
+
+{% endhighlight %}
@@ -63,7 +92,20 @@ total count. Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API. -{% include_example python/mllib/summary_statistics_example.py %} +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +mat = ... # an RDD of Vectors + +# Compute column summary statistics. +summary = Statistics.colStats(mat) +print(summary.mean()) +print(summary.variance()) +print(summary.numNonzeros()) + +{% endhighlight %}
@@ -71,38 +113,96 @@ Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.m ## Correlations Calculating the correlation between two series of data is a common operation in Statistics. In `spark.mllib` -we provide the flexibility to calculate pairwise correlations among many series. The supported +we provide the flexibility to calculate pairwise correlations among many series. The supported correlation methods are currently Pearson's and Spearman's correlation. - +
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
-calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or
an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively.

Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.

-{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.rdd.RDD
+
+val sc: SparkContext = ...
+
+val seriesX: RDD[Double] = ... // a series
+val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX
+
+// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+// method is not specified, Pearson's method will be used by default.
+val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
+
+val data: RDD[Vector] = ... // note that each Vector is a row and not a column
+
+// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+// If a method is not specified, Pearson's method will be used by default.
+val correlMatrix: Matrix = Statistics.corr(data, "pearson")
+
+{% endhighlight %}
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` respectively.

Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.

-{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
+{% highlight java %}
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.stat.Statistics;
+
+JavaSparkContext jsc = ...
+
+JavaDoubleRDD seriesX = ... // a series
+JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX
+
+// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+// method is not specified, Pearson's method will be used by default.
+Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
+
+JavaRDD<Vector> data = ... // note that each Vector is a row and not a column
+
+// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+// If a method is not specified, Pearson's method will be used by default.
+Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
+
+{% endhighlight %}
-[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/mllib/correlations_example.py %} +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +seriesX = ... # a series +seriesY = ... # must have the same number of partitions and cardinality as seriesX + +# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +# method is not specified, Pearson's method will be used by default. +print(Statistics.corr(seriesX, seriesY, method="pearson")) + +data = ... # an RDD of Vectors +# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +# If a method is not specified, Pearson's method will be used by default. +print(Statistics.corr(data, method="pearson")) + +{% endhighlight %}
@@ -111,76 +211,187 @@ Refer to the [`Statistics` Python docs](api/python/pyspark.ml

## Stratified sampling

Unlike the other statistics functions, which reside in `spark.mllib`, stratified sampling methods,
`sampleByKey` and `sampleByKeyExact`, can be performed on RDDs of key-value pairs. For stratified
-sampling, the keys can be thought of as a label and the value as a specific attribute. For example
-the key can be man or woman, or document ids, and the respective values can be the list of ages
-of the people in the population or the list of words in the documents. The `sampleByKey` method
-will flip a coin to decide whether an observation will be sampled or not, therefore requires one
-pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant
+sampling, the keys can be thought of as a label and the value as a specific attribute. For example,
+the key can be man or woman, or document ids, and the respective values can be the list of ages
+of the people in the population or the list of words in the documents. The `sampleByKey` method
+will flip a coin to decide whether an observation will be sampled or not, therefore requires one
+pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significantly
 more resources than the per-stratum simple random sampling used in `sampleByKey`, but will provide
-the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in
+the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in
 Python.
[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to
-sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
+sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of
-keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
+keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.

-{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.PairRDDFunctions
+
+val sc: SparkContext = ...
+
+val data = ... // an RDD[(K, V)] of any key value pairs
+val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key
+
+// Get an approximate sample from each stratum
+val approxSample = data.sampleByKey(withReplacement = false, fractions)
+// Get an exact sample from each stratum
+val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
+
+{% endhighlight %}
[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to
-sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
+sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of
-keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
+keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.

-{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
+{% highlight java %}
+import java.util.Map;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+JavaSparkContext jsc = ...
+
+JavaPairRDD<K, V> data = ... // an RDD of any key value pairs
+Map<K, Object> fractions = ... // specify the exact fraction desired from each key
+
+// Get an approximate sample from each stratum
+JavaPairRDD<K, V> approxSample = data.sampleByKey(false, fractions);
+// Get an exact sample from each stratum
+JavaPairRDD<K, V> exactSample = data.sampleByKeyExact(false, fractions);
+
+{% endhighlight %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to
-sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
-desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
+sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
+desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
set of keys.

*Note:* `sampleByKeyExact()` is currently not supported in Python.

-{% include_example python/mllib/stratified_sampling_example.py %}
+{% highlight python %}
+
+sc = ... # SparkContext
+
+data = ... # an RDD of any key value pairs
+fractions = ... # specify the exact fraction desired from each key as a dictionary
+
+approxSample = data.sampleByKey(False, fractions)
+
+{% endhighlight %}
## Hypothesis testing

-Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
-significant, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
+Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
+significant, i.e. whether or not it occurred by chance. `spark.mllib` currently supports Pearson's
 chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine
-whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
+whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
 an input type of `Vector`, whereas the independence test requires a `Matrix` as input.
-`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
+`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
 independence tests.
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
 hypothesis tests.

-{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.rdd.RDD
+
+val sc: SparkContext = ...
+
+val vec: Vector = ... // a vector composed of the frequencies of events
+
+// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+// the test runs against a uniform distribution.
+val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom,
+                                 // test statistic, the method used, and the null hypothesis.
+
+val mat: Matrix = ... // a contingency matrix
+
+// conduct Pearson's independence test on the input contingency matrix
+val independenceTestResult = Statistics.chiSqTest(mat)
+println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...
+
+val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.
+
+// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+// against the label.
+val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
+var i = 1
+featureTestResults.foreach { result =>
+  println(s"Column $i:\n$result")
+  i += 1
+} // summary of the test
+
+{% endhighlight %}
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
 hypothesis tests.

Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.

-{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %}
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.ChiSqTestResult;
+
+JavaSparkContext jsc = ...
+
+Vector vec = ... // a vector composed of the frequencies of events
+
+// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+// the test runs against a uniform distribution.
+ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
+// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
+// and the null hypothesis.
+System.out.println(goodnessOfFitTestResult);
+
+Matrix mat = ... // a contingency matrix
+
+// conduct Pearson's independence test on the input contingency matrix
+ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
+// summary of the test including the p-value, degrees of freedom...
+System.out.println(independenceTestResult);
+
+JavaRDD<LabeledPoint> obs = ... // an RDD of labeled points
+
+// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+// against the label.
+ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
+int i = 1;
+for (ChiSqTestResult result : featureTestResults) {
+  System.out.println("Column " + i + ":");
+  System.out.println(result); // summary of the test
+  i++;
+}
+
+{% endhighlight %}
@@ -190,18 +401,50 @@ hypothesis tests.

Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.

-{% include_example python/mllib/hypothesis_testing_example.py %}
+{% highlight python %}
+from pyspark import SparkContext
+from pyspark.mllib.linalg import Vectors, Matrices
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+
+sc = SparkContext()
+
+vec = Vectors.dense(...) # a vector composed of the frequencies of events
+
+# compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+# the test runs against a uniform distribution.
+goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom,
+                               # test statistic, the method used, and the null hypothesis.
+
+mat = Matrices.dense(...) # a contingency matrix
+
+# conduct Pearson's independence test on the input contingency matrix
+independenceTestResult = Statistics.chiSqTest(mat)
+print(independenceTestResult) # summary of the test including the p-value, degrees of freedom...
+
+obs = sc.parallelize(...) # an RDD of LabeledPoint(feature, label) pairs
+
+# The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+# the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+# against the label.
+featureTestResults = Statistics.chiSqTest(obs)
+
+for i, result in enumerate(featureTestResults):
+    print("Column %d:" % (i + 1))
+    print(result)
+{% endhighlight %}
Additionally, `spark.mllib` provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov
(KS) test for equality of probability distributions. By providing the name of a theoretical distribution
-(currently solely supported for the normal distribution) and its parameters, or a function to
+(currently only the normal distribution is supported) and its parameters, or a function to
calculate the cumulative distribution according to a given theoretical distribution, the user can
test the null hypothesis that their sample is drawn from that distribution. In the case that the
user tests against the normal distribution (`distName="norm"`), but does not provide distribution
-parameters, the test initializes to the standard normal distribution and logs an appropriate
+parameters, the test initializes to the standard normal distribution and logs an appropriate
message.
@@ -212,7 +455,21 @@ and interpret the hypothesis tests.

Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.

-{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
+{% highlight scala %}
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.rdd.RDD
+
+val data: RDD[Double] = ... // an RDD of sample data
+
+// run a KS test for the sample versus a standard normal distribution
+val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+println(testResult) // summary of the test including the p-value, test statistic,
+                    // and null hypothesis
+                    // if our p-value indicates significance, we can reject the null hypothesis
+
+// perform a KS test using a cumulative distribution function of our making
+val myCDF: Double => Double = ...
+val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
+{% endhighlight %}
@@ -222,7 +479,23 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %} +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; + +JavaSparkContext jsc = ... +JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); +KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); +// summary of the test including the p-value, test statistic, +// and null hypothesis +// if our p-value indicates significance, we can reject the null hypothesis +System.out.println(testResult); +{% endhighlight %}
@@ -232,7 +505,19 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %} +{% highlight python %} +from pyspark.mllib.stat import Statistics + +parallelData = sc.parallelize([1.0, 2.0, ... ]) + +# run a KS test for the sample versus a standard normal distribution +testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) +print(testResult) # summary of the test including the p-value, test statistic, + # and null hypothesis + # if our p-value indicates significance, we can reject the null hypothesis +# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with +# a lambda to calculate the CDF is not made available in the Python API +{% endhighlight %}
@@ -366,7 +651,21 @@ to do so. Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %} +{% highlight scala %} +import org.apache.spark.mllib.stat.KernelDensity +import org.apache.spark.rdd.RDD + +val data: RDD[Double] = ... // an RDD of sample data + +// Construct the density estimator with the sample data and a standard deviation for the Gaussian +// kernels +val kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0) + +// Find density estimates for the given values +val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) +{% endhighlight %}
@@ -376,7 +675,21 @@ to do so.

Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.

-{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
+{% highlight java %}
+import org.apache.spark.mllib.stat.KernelDensity;
+import org.apache.spark.rdd.RDD;
+
+RDD<Double> data = ... // an RDD of sample data
+
+// Construct the density estimator with the sample data and a standard deviation for the Gaussian
+// kernels
+KernelDensity kd = new KernelDensity()
+  .setSample(data)
+  .setBandwidth(3.0);
+
+// Find density estimates for the given values
+double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
+{% endhighlight %}
@@ -386,7 +699,20 @@ to do so. Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API. -{% include_example python/mllib/kernel_density_estimation_example.py %} +{% highlight python %} +from pyspark.mllib.stat import KernelDensity + +data = ... # an RDD of sample data + +# Construct the density estimator with the sample data and a standard deviation for the Gaussian +# kernels +kd = KernelDensity() +kd.setSample(data) +kd.setBandwidth(3.0) + +# Find density estimates for the given values +densities = kd.estimate([-1.0, 2.0, 5.0]) +{% endhighlight %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java deleted file mode 100644 index fd19b43504..0000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.stat.Statistics; -// $example off$ - -public class JavaCorrelationsExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaDoubleRDD seriesX = jsc.parallelizeDoubles( - Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series - - // must have the same number of partitions and cardinality as seriesX - JavaDoubleRDD seriesY = jsc.parallelizeDoubles( - Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0)); - - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. - // If a method is not specified, Pearson's method will be used by default. - Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - System.out.println("Correlation is: " + correlation); - - // note that each Vector is a row and not a column - JavaRDD data = jsc.parallelize( - Arrays.asList( - Vectors.dense(1.0, 10.0, 100.0), - Vectors.dense(2.0, 20.0, 200.0), - Vectors.dense(5.0, 33.0, 366.0) - ) - ); - - // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. - // If a method is not specified, Pearson's method will be used by default. - Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); - System.out.println(correlMatrix.toString()); - // $example off$ - - jsc.stop(); - } -} - diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java deleted file mode 100644 index b48b95ff1d..0000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Matrices; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.ChiSqTestResult; -// $example off$ - -public class JavaHypothesisTestingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // a vector composed of the frequencies of events - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); - - // compute the goodness of fit. If a second vector to test against is not supplied - // as a parameter, the test runs against a uniform distribution. - ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); - // summary of the test including the p-value, degrees of freedom, test statistic, - // the method used, and the null hypothesis. - System.out.println(goodnessOfFitTestResult + "\n"); - - // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); - - // conduct Pearson's independence test on the input contingency matrix - ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); - // summary of the test including the p-value, degrees of freedom... - System.out.println(independenceTestResult + "\n"); - - // an RDD of labeled points - JavaRDD obs = jsc.parallelize( - Arrays.asList( - new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)), - new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)), - new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) - ) - ); - - // The contingency table is constructed from the raw (feature, label) pairs and used to conduct - // the independence test. Returns an array containing the ChiSquaredTestResult for every feature - // against the label. 
- ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); - int i = 1; - for (ChiSqTestResult result : featureTestResults) { - System.out.println("Column " + i + ":"); - System.out.println(result + "\n"); // summary of the test - i++; - } - // $example off$ - - jsc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java deleted file mode 100644 index fe611c9ae6..0000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; -// $example off$ - -public class JavaHypothesisTestingKolmogorovSmirnovTestExample { - public static void main(String[] args) { - - SparkConf conf = - new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); - KolmogorovSmirnovTestResult testResult = - Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); - // summary of the test including the p-value, test statistic, and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - System.out.println(testResult); - // $example off$ - - jsc.stop(); - } -} - diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java deleted file mode 100644 index 41de0d90ec..0000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.stat.KernelDensity; -// $example off$ - -public class JavaKernelDensityEstimationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // an RDD of sample data - JavaRDD data = jsc.parallelize( - Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); - - // Construct the density estimator with the sample data - // and a standard deviation for the Gaussian kernels - KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); - - // Find density estimates for the given values - double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); - - System.out.println(Arrays.toString(densities)); - // $example off$ - - jsc.stop(); - } -} - diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java deleted file mode 100644 index f5a451019b..0000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import com.google.common.collect.ImmutableMap; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import java.util.*; - -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.function.VoidFunction; -// $example off$ - -public class JavaStratifiedSamplingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - List> list = new ArrayList>( - Arrays.>asList( - new Tuple2(1, 'a'), - new Tuple2(1, 'b'), - new Tuple2(2, 'c'), - new Tuple2(2, 'd'), - new Tuple2(2, 'e'), - new Tuple2(3, 'f') - ) - ); - - JavaPairRDD data = jsc.parallelizePairs(list); - - // specify the exact fraction desired from each key Map - ImmutableMap fractions = - ImmutableMap.of(1, (Object)0.1, 2, (Object) 0.6, 3, (Object) 0.3); - - // Get an approximate sample from each stratum - JavaPairRDD approxSample = data.sampleByKey(false, fractions); - // Get an exact sample from each stratum - JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); - // $example off$ - - System.out.println("approxSample size is " + approxSample.collect().size()); - for (Tuple2 t : approxSample.collect()) { - System.out.println(t._1() + " " + t._2()); - } - - System.out.println("exactSample size is " + exactSample.collect().size()); - for (Tuple2 t : exactSample.collect()) { - System.out.println(t._1() + " " + t._2()); - } - - jsc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java deleted file mode 100644 index 278706bc8f..0000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; -import org.apache.spark.mllib.stat.Statistics; -// $example off$ - -public class JavaSummaryStatisticsExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaRDD mat = jsc.parallelize( - Arrays.asList( - Vectors.dense(1.0, 10.0, 100.0), - Vectors.dense(2.0, 20.0, 200.0), - Vectors.dense(3.0, 30.0, 300.0) - ) - ); // an RDD of Vectors - - // Compute column summary statistics. - MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); - System.out.println(summary.mean()); // a dense vector containing the mean value for each column - System.out.println(summary.variance()); // column-wise variance - System.out.println(summary.numNonzeros()); // number of nonzeros in each column - // $example off$ - - jsc.stop(); - } -} diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py deleted file mode 100644 index 66d18f6e5d..0000000000 --- a/examples/src/main/python/mllib/correlations_example.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import numpy as np - -from pyspark import SparkContext -# $example on$ -from pyspark.mllib.stat import Statistics -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="CorrelationsExample") # SparkContext - - # $example on$ - seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series - # seriesY must have the same number of partitions and cardinality as seriesX - seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) - - # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. - # If a method is not specified, Pearson's method will be used by default. - print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson"))) - - data = sc.parallelize( - [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])] - ) # an RDD of Vectors - - # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. - # If a method is not specified, Pearson's method will be used by default. 
- print(Statistics.corr(data, method="pearson")) - # $example off$ - - sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py deleted file mode 100644 index e566ead0d3..0000000000 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ /dev/null @@ -1,65 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -from pyspark import SparkContext -# $example on$ -from pyspark.mllib.linalg import Matrices, Vectors -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.stat import Statistics -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="HypothesisTestingExample") - - # $example on$ - vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events - - # compute the goodness of fit. If a second vector to test against - # is not supplied as a parameter, the test runs against a uniform distribution. - goodnessOfFitTestResult = Statistics.chiSqTest(vec) - - # summary of the test including the p-value, degrees of freedom, - # test statistic, the method used, and the null hypothesis. - print("%s\n" % goodnessOfFitTestResult) - - mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix - - # conduct Pearson's independence test on the input contingency matrix - independenceTestResult = Statistics.chiSqTest(mat) - - # summary of the test including the p-value, degrees of freedom, - # test statistic, the method used, and the null hypothesis. - print("%s\n" % independenceTestResult) - - obs = sc.parallelize( - [LabeledPoint(1.0, [1.0, 0.0, 3.0]), - LabeledPoint(1.0, [1.0, 2.0, 0.0]), - LabeledPoint(1.0, [-1.0, 0.0, -0.5])] - ) # LabeledPoint(feature, label) - - # The contingency table is constructed from an RDD of LabeledPoint and used to conduct - # the independence test. Returns an array containing the ChiSquaredTestResult for every feature - # against the label. - featureTestResults = Statistics.chiSqTest(obs) - - for i, result in enumerate(featureTestResults): - print("Column %d:\n%s" % (i + 1, result)) - # $example off$ - - sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py deleted file mode 100644 index ef380dee79..0000000000 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -from pyspark import SparkContext -# $example on$ -from pyspark.mllib.stat import Statistics -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") - - # $example on$ - parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) - - # run a KS test for the sample versus a standard normal distribution - testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - # summary of the test including the p-value, test statistic, and null hypothesis - # if our p-value indicates significance, we can reject the null hypothesis - # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with - # a lambda to calculate the CDF is not made available in the Python API - print(testResult) - # $example off$ - - sc.stop() diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py deleted file mode 100644 index 3e8f7241a4..0000000000 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function - -from pyspark import SparkContext -# $example on$ -from pyspark.mllib.stat import KernelDensity -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext - - # $example on$ - # an RDD of sample data - data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]) - - # Construct the density estimator with the sample data and a standard deviation for the Gaussian - # kernels - kd = KernelDensity() - kd.setSample(data) - kd.setBandwidth(3.0) - - # Find density estimates for the given values - densities = kd.estimate([-1.0, 2.0, 5.0]) - # $example off$ - - print(densities) - - sc.stop() diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py deleted file mode 100644 index a13f8f08dd..0000000000 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -from pyspark import SparkContext - -if __name__ == "__main__": - sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext - - # $example on$ - # an RDD of any key value pairs - data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) - - # specify the exact fraction desired from each key as a dictionary - fractions = {1: 0.1, 2: 0.6, 3: 0.3} - - approxSample = data.sampleByKey(False, fractions) - # $example off$ - - for each in approxSample.collect(): - print(each) - - sc.stop() diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py deleted file mode 100644 index d55d1a2c2d..0000000000 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function - -from pyspark import SparkContext -# $example on$ -import numpy as np - -from pyspark.mllib.stat import Statistics -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext - - # $example on$ - mat = sc.parallelize( - [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])] - ) # an RDD of Vectors - - # Compute column summary statistics. - summary = Statistics.colStats(mat) - print(summary.mean()) # a dense vector containing the mean value for each column - print(summary.variance()) # column-wise variance - print(summary.numNonzeros()) # number of nonzeros in each column - # $example off$ - - sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala deleted file mode 100644 index 1202caf534..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD -// $example off$ - -object CorrelationsExample { - - def main(args: Array[String]): Unit = { - - val conf = new SparkConf().setAppName("CorrelationsExample") - val sc = new SparkContext(conf) - - // $example on$ - val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series - // must have the same number of partitions and cardinality as seriesX - val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) - - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - // method is not specified, Pearson's method will be used by default. - val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") - println(s"Correlation is: $correlation") - - val data: RDD[Vector] = sc.parallelize( - Seq( - Vectors.dense(1.0, 10.0, 100.0), - Vectors.dense(2.0, 20.0, 200.0), - Vectors.dense(5.0, 33.0, 366.0)) - ) // note that each Vector is a row and not a column - - // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method - // If a method is not specified, Pearson's method will be used by default. 
- val correlMatrix: Matrix = Statistics.corr(data, "pearson") - println(correlMatrix.toString) - // $example off$ - - sc.stop() - } -} -// scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala deleted file mode 100644 index 0d391a3637..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.mllib.stat.test.ChiSqTestResult -import org.apache.spark.rdd.RDD -// $example off$ - -object HypothesisTestingExample { - - def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("HypothesisTestingExample") - val sc = new SparkContext(conf) - - // $example on$ - // a vector composed of the frequencies of events - val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) - - // compute the goodness of fit. If a second vector to test against is not supplied - // as a parameter, the test runs against a uniform distribution. - val goodnessOfFitTestResult = Statistics.chiSqTest(vec) - // summary of the test including the p-value, degrees of freedom, test statistic, the method - // used, and the null hypothesis. - println(s"$goodnessOfFitTestResult\n") - - // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) - - // conduct Pearson's independence test on the input contingency matrix - val independenceTestResult = Statistics.chiSqTest(mat) - // summary of the test including the p-value, degrees of freedom - println(s"$independenceTestResult\n") - - val obs: RDD[LabeledPoint] = - sc.parallelize( - Seq( - LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)), - LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)), - LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5) - ) - ) - ) // (feature, label) pairs. - - // The contingency table is constructed from the raw (feature, label) pairs and used to conduct - // the independence test. Returns an array containing the ChiSquaredTestResult for every feature - // against the label. 
- val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) - featureTestResults.zipWithIndex.foreach { case (k, v) => - println("Column " + (v + 1).toString + ":") - println(k) - } // summary of the test - // $example off$ - - sc.stop() - } -} -// scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala deleted file mode 100644 index 840874cf3c..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD -// $example off$ - -object HypothesisTestingKolmogorovSmirnovTestExample { - - def main(args: Array[String]): Unit = { - - val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample") - val sc = new SparkContext(conf) - - // $example on$ - val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data - - // run a KS test for the sample versus a standard normal distribution - val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) - // summary of the test including the p-value, test statistic, and null hypothesis if our p-value - // indicates significance, we can reject the null hypothesis. - println(testResult) - println() - - // perform a KS test using a cumulative distribution function of our making - val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) - val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) - println(testResult2) - // $example off$ - - sc.stop() - } -} -// scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala deleted file mode 100644 index cc5d159b36..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.stat.KernelDensity -import org.apache.spark.rdd.RDD -// $example off$ - -object KernelDensityEstimationExample { - - def main(args: Array[String]): Unit = { - - val conf = new SparkConf().setAppName("KernelDensityEstimationExample") - val sc = new SparkContext(conf) - - // $example on$ - // an RDD of sample data - val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9)) - - // Construct the density estimator with the sample data and a standard deviation - // for the Gaussian kernels - val kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0) - - // Find density estimates for the given values - val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) - // $example off$ - - densities.foreach(println) - - sc.stop() - } -} -// scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala deleted file mode 100644 index 169467926c..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} - -object StratifiedSamplingExample { - - def main(args: Array[String]): Unit = { - - val conf = new SparkConf().setAppName("StratifiedSamplingExample") - val sc = new SparkContext(conf) - - // $example on$ - // an RDD[(K, V)] of any key value pairs - val data = sc.parallelize( - Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) - - // specify the exact fraction desired from each key - val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) - - // Get an approximate sample from each stratum - val approxSample = data.sampleByKey(withReplacement = false, fractions) - // Get an exact sample from each stratum - val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - // $example off$ - - println("approxSample size is " + approxSample.collect().size.toString) - approxSample.collect().foreach(println) - - println("exactSample its size is " + exactSample.collect().size.toString) - exactSample.collect().foreach(println) - - sc.stop() - } -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala deleted file mode 100644 index 948b443c0a..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} -// $example off$ - -object SummaryStatisticsExample { - - def main(args: Array[String]): Unit = { - - val conf = new SparkConf().setAppName("SummaryStatisticsExample") - val sc = new SparkContext(conf) - - // $example on$ - val observations = sc.parallelize( - Seq( - Vectors.dense(1.0, 10.0, 100.0), - Vectors.dense(2.0, 20.0, 200.0), - Vectors.dense(3.0, 30.0, 300.0) - ) - ) - - // Compute column summary statistics. - val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) - println(summary.mean) // a dense vector containing the mean value for each column - println(summary.variance) // column-wise variance - println(summary.numNonzeros) // number of nonzeros in each column - // $example off$ - - sc.stop() - } -} -// scalastyle:on println
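
For reference, the column-summary and correlation functionality exercised by the deleted CorrelationsExample.scala and SummaryStatisticsExample.scala above can be condensed into one small, self-contained program. This is a minimal sketch, not part of the patch: it assumes spark-core and spark-mllib on the classpath, and the object and app name "StatisticsSketch" are illustrative only; the data values are taken from the examples shown in the hunks above.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

object StatisticsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StatisticsSketch"))

    // Column summary statistics over an RDD[Vector]; each Vector is a row.
    val observations = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)))
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)        // dense vector of per-column means
    println(summary.variance)    // column-wise variance
    println(summary.numNonzeros) // number of nonzeros in each column

    // Pearson correlation between two series with the same cardinality
    // and partitioning ("spearman" selects Spearman's method instead).
    val seriesX = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
    val seriesY = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))
    println(Statistics.corr(seriesX, seriesY, "pearson"))

    // Correlation matrix over the rows of an RDD[Vector].
    val correlMatrix: Matrix = Statistics.corr(observations, "pearson")
    println(correlMatrix.toString)

    sc.stop()
  }
}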
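
Likewise, the hypothesis-testing, Kolmogorov-Smirnov, and kernel-density examples deleted above reduce to the following sketch. Again this is illustrative rather than part of the patch ("TestingSketch" is an assumed name); the inputs mirror the deleted HypothesisTestingExample.scala, HypothesisTestingKolmogorovSmirnovTestExample.scala, and KernelDensityEstimationExample.scala.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.{KernelDensity, Statistics}

object TestingSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TestingSketch"))

    // Chi-squared goodness of fit; with no second vector supplied the test
    // runs against a uniform distribution.
    val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
    println(Statistics.chiSqTest(vec))

    // Pearson's independence test on a 3x2 contingency matrix
    // ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), stored column-major.
    val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    println(Statistics.chiSqTest(mat))

    // Kolmogorov-Smirnov test of the sample against a standard normal CDF.
    val data = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))
    println(Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1))

    // Kernel density estimation with Gaussian kernels of bandwidth 3.0,
    // evaluated at a few query points.
    val samples = sc.parallelize(
      Seq(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0))
    val kd = new KernelDensity().setSample(samples).setBandwidth(3.0)
    kd.estimate(Array(-1.0, 2.0, 5.0)).foreach(println)

    sc.stop()
  }
}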