diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index b773031bc7..02b81f153b 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -10,24 +10,24 @@ displayTitle: Basic Statistics - spark.mllib `\[ \newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} +\newcommand{\E}{\mathbb{E}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \newcommand{\wv}{\mathbf{w}} \newcommand{\av}{\mathbf{\alpha}} \newcommand{\bv}{\mathbf{b}} \newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} \newcommand{\zero}{\mathbf{0}} \]` -## Summary statistics +## Summary statistics -We provide column summary statistics for `RDD[Vector]` through the function `colStats` +We provide column summary statistics for `RDD[Vector]` through the function `colStats` available in `Statistics`.
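For reference, the snippet being factored out into `SummaryStatisticsExample.scala` reduces to the following sketch; it assumes an already-constructed `SparkContext` named `sc`, and the sample values are illustrative only:

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))  // an RDD of Vectors

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean)        // [2.0,20.0,200.0]: the mean value for each column
println(summary.variance)    // [1.0,100.0,10000.0]: column-wise (unbiased) variance
println(summary.numNonzeros) // [3.0,3.0,3.0]: number of nonzeros in each column
{% endhighlight %}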
@@ -40,19 +40,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} - -val observations: RDD[Vector] = ... // an RDD of Vectors - -// Compute column summary statistics. -val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) -println(summary.mean) // a dense vector containing the mean value for each column -println(summary.variance) // column-wise variance -println(summary.numNonzeros) // number of nonzeros in each column - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
@@ -64,24 +52,7 @@ total count.

Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.

-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
-import org.apache.spark.mllib.stat.Statistics;
-
-JavaSparkContext jsc = ...
-
-JavaRDD<Vector> mat = ... // an RDD of Vectors
-
-// Compute column summary statistics.
-MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
-System.out.println(summary.mean()); // a dense vector containing the mean value for each column
-System.out.println(summary.variance()); // column-wise variance
-System.out.println(summary.numNonzeros()); // number of nonzeros in each column
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
@@ -92,20 +63,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -sc = ... # SparkContext - -mat = ... # an RDD of Vectors - -# Compute column summary statistics. -summary = Statistics.colStats(mat) -print(summary.mean()) -print(summary.variance()) -print(summary.numNonzeros()) - -{% endhighlight %} +{% include_example python/mllib/summary_statistics_example.py %}
@@ -113,96 +71,38 @@ print(summary.numNonzeros()) ## Correlations Calculating the correlation between two series of data is a common operation in Statistics. In `spark.mllib` -we provide the flexibility to calculate pairwise correlations among many series. The supported +we provide the flexibility to calculate pairwise correlations among many series. The supported correlation methods are currently Pearson's and Spearman's correlation. - +
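The distinction the prose draws — two `RDD[Double]`s yield a scalar, an `RDD[Vector]` yields a matrix — looks like this in Scala (a sketch assuming a live `SparkContext` named `sc`; the data values are illustrative only):

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))

// Two RDD[Double]s => a single correlation coefficient.
// "pearson" is the default; pass "spearman" for rank correlation.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// An RDD[Vector] => the pairwise correlation Matrix of its columns
// (each Vector is a row, i.e. one observation).
val data: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0)))
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}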
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.stat.Statistics - -val sc: SparkContext = ... - -val seriesX: RDD[Double] = ... // a series -val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX - -// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -// method is not specified, Pearson's method will be used by default. -val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") - -val data: RDD[Vector] = ... // note that each Vector is a row and not a column - -// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -// If a method is not specified, Pearson's method will be used by default. -val correlMatrix: Matrix = Statistics.corr(data, "pearson") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
-a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` respectively.
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or
+a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` respectively.

Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.

-{% highlight java %}
-import org.apache.spark.api.java.JavaDoubleRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.stat.Statistics;
-
-JavaSparkContext jsc = ...
-
-JavaDoubleRDD seriesX = ... // a series
-JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX
-
-// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
-// method is not specified, Pearson's method will be used by default.
-Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
-
-JavaRDD<Vector> data = ... // note that each Vector is a row and not a column
-
-// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
-// If a method is not specified, Pearson's method will be used by default.
-Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
-[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -sc = ... # SparkContext - -seriesX = ... # a series -seriesY = ... # must have the same number of partitions and cardinality as seriesX - -# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -# method is not specified, Pearson's method will be used by default. -print(Statistics.corr(seriesX, seriesY, method="pearson")) - -data = ... # an RDD of Vectors -# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -# If a method is not specified, Pearson's method will be used by default. -print(Statistics.corr(data, method="pearson")) - -{% endhighlight %} +{% include_example python/mllib/correlations_example.py %}
@@ -211,187 +111,76 @@ print(Statistics.corr(data, method="pearson"))

Unlike the other statistics functions, which reside in `spark.mllib`, stratified sampling methods,
-`sampleByKey` and `sampleByKeyExact`, can be performed on RDD's of key-value pairs. For stratified
-sampling, the keys can be thought of as a label and the value as a specific attribute. For example
-the key can be man or woman, or document ids, and the respective values can be the list of ages
-of the people in the population or the list of words in the documents. The `sampleByKey` method
-will flip a coin to decide whether an observation will be sampled or not, therefore requires one
-pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant
+`sampleByKey` and `sampleByKeyExact`, can be performed on RDDs of key-value pairs. For stratified
+sampling, the keys can be thought of as a label and the value as a specific attribute. For example,
+the keys can be male or female, or document IDs, and the respective values can be the list of ages
+of the people in the population or the list of words in the documents. The `sampleByKey` method
+flips a coin to decide whether an observation will be sampled, and therefore requires only one
+pass over the data, providing an *expected* sample size. `sampleByKeyExact` requires significantly
more resources than the per-stratum simple random sampling used in `sampleByKey`, but will provide
-the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in
-python.
+the exact sample size with 99.99% confidence. `sampleByKeyExact` is currently not supported in
+Python.
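The trade-off described above — one pass with an expected size versus extra passes for an exact size — can be summarized in a short Scala sketch (assuming a `SparkContext` named `sc`; the keys and fractions are illustrative only):

{% highlight scala %}
// a stratum per key: 1, 2 and 3
val data = sc.parallelize(
  Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))

// desired sampling fraction for each key
val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

// one pass over the data, *expected* sample size per stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)

// additional pass(es), exact sample size with 99.99% confidence
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
{% endhighlight %}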
[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to -sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of -keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample +keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.PairRDDFunctions - -val sc: SparkContext = ... - -val data = ... // an RDD[(K, V)] of any key value pairs -val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key - -// Get an exact sample from each stratum -val approxSample = data.sampleByKey(withReplacement = false, fractions) -val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to
-sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
+sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired
fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of
-keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
+keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.

-{% highlight java %}
-import java.util.Map;
-
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-
-JavaSparkContext jsc = ...
-
-JavaPairRDD<K, V> data = ... // an RDD of any key value pairs
-Map<K, Object> fractions = ... // specify the exact fraction desired from each key
-
-// Get an exact sample from each stratum
-JavaPairRDD<K, V> approxSample = data.sampleByKey(false, fractions);
-JavaPairRDD<K, V> exactSample = data.sampleByKeyExact(false, fractions);
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to -sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the -desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the +sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the +desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of keys. *Note:* `sampleByKeyExact()` is currently not supported in Python. -{% highlight python %} - -sc = ... # SparkContext - -data = ... # an RDD of any key value pairs -fractions = ... # specify the exact fraction desired from each key as a dictionary - -approxSample = data.sampleByKey(False, fractions); - -{% endhighlight %} +{% include_example python/mllib/stratified_sampling_example.py %}
## Hypothesis testing

-Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
-significant, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
-chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine
-whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
+Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
+significant, i.e. whether or not it occurred by chance. `spark.mllib` currently supports Pearson's
+chi-squared ($\chi^2$) tests for goodness of fit and independence. The input data types determine
+whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
an input type of `Vector`, whereas the independence test requires a `Matrix` as input.

-`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
+`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
independence tests.
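Concretely, the input type selects the test; here is a minimal Scala sketch of the dispatch described above (the frequency and contingency values are illustrative only):

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.Statistics

// Vector input => goodness-of-fit test; with no second vector supplied,
// the observed frequencies are tested against a uniform distribution.
val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
println(Statistics.chiSqTest(vec))

// Matrix input => independence test on the contingency matrix
// ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), stored column-major.
val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
println(Statistics.chiSqTest(mat))
{% endhighlight %}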
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to -run Pearson's chi-squared tests. The following example demonstrates how to run and interpret +[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.stat.Statistics._ - -val sc: SparkContext = ... - -val vec: Vector = ... // a vector composed of the frequencies of events - -// compute the goodness of fit. If a second vector to test against is not supplied as a parameter, -// the test runs against a uniform distribution. -val goodnessOfFitTestResult = Statistics.chiSqTest(vec) -println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, - // test statistic, the method used, and the null hypothesis. - -val mat: Matrix = ... // a contingency matrix - -// conduct Pearson's independence test on the input contingency matrix -val independenceTestResult = Statistics.chiSqTest(mat) -println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... - -val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. - -// The contingency table is constructed from the raw (feature, label) pairs and used to conduct -// the independence test. Returns an array containing the ChiSquaredTestResult for every feature -// against the label. -val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) -var i = 1 -featureTestResults.foreach { result => - println(s"Column $i:\n$result") - i += 1 -} // summary of the test - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.

Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.

-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.stat.Statistics;
-import org.apache.spark.mllib.stat.test.ChiSqTestResult;
-
-JavaSparkContext jsc = ...
-
-Vector vec = ... // a vector composed of the frequencies of events
-
-// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-// the test runs against a uniform distribution.
-ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
-// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
-// and the null hypothesis.
-System.out.println(goodnessOfFitTestResult);
-
-Matrix mat = ... // a contingency matrix
-
-// conduct Pearson's independence test on the input contingency matrix
-ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
-// summary of the test including the p-value, degrees of freedom...
-System.out.println(independenceTestResult);
-
-JavaRDD<LabeledPoint> obs = ... // an RDD of labeled points
-
-// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
-// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-// against the label.
-ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
-int i = 1;
-for (ChiSqTestResult result : featureTestResults) {
-  System.out.println("Column " + i + ":");
-  System.out.println(result); // summary of the test
-  i++;
-}
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %}
@@ -401,50 +190,18 @@ hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.linalg import Vectors, Matrices -from pyspark.mllib.regresssion import LabeledPoint -from pyspark.mllib.stat import Statistics - -sc = SparkContext() - -vec = Vectors.dense(...) # a vector composed of the frequencies of events - -# compute the goodness of fit. If a second vector to test against is not supplied as a parameter, -# the test runs against a uniform distribution. -goodnessOfFitTestResult = Statistics.chiSqTest(vec) -print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, - # test statistic, the method used, and the null hypothesis. - -mat = Matrices.dense(...) # a contingency matrix - -# conduct Pearson's independence test on the input contingency matrix -independenceTestResult = Statistics.chiSqTest(mat) -print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... - -obs = sc.parallelize(...) # LabeledPoint(feature, label) . - -# The contingency table is constructed from an RDD of LabeledPoint and used to conduct -# the independence test. Returns an array containing the ChiSquaredTestResult for every feature -# against the label. -featureTestResults = Statistics.chiSqTest(obs) - -for i, result in enumerate(featureTestResults): - print("Column $d:" % (i + 1)) - print(result) -{% endhighlight %} +{% include_example python/mllib/hypothesis_testing_example.py %}
Additionally, `spark.mllib` provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov (KS) test
for equality of probability distributions. By providing the name of a theoretical distribution
-(currently solely supported for the normal distribution) and its parameters, or a function to
+(currently only the normal distribution is supported) and its parameters, or a function to
calculate the cumulative distribution according to a given theoretical distribution, the user can
test the null hypothesis that their sample is drawn from that distribution. In the case that the
user tests against the normal distribution (`distName="norm"`), but does not provide distribution
-parameters, the test initializes to the standard normal distribution and logs an appropriate
+parameters, the test initializes to the standard normal distribution and logs an appropriate
message.
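In code, the two ways of specifying the reference distribution look like this (Scala sketch; assumes a `SparkContext` named `sc`, and the uniform CDF is a hypothetical stand-in for a user-supplied function):

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))

// by name and parameters: a normal distribution with mean 0.0 and stddev 1.0;
// omitting the parameters for "norm" defaults to the standard normal
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)

// by user-supplied CDF: any Double => Double, here the CDF of Uniform(0, 1)
val myCDF: Double => Double = x => math.min(math.max(x, 0.0), 1.0)
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
{% endhighlight %}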
@@ -455,21 +212,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.stat.Statistics - -val data: RDD[Double] = ... // an RDD of sample data - -// run a KS test for the sample versus a standard normal distribution -val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) -println(testResult) // summary of the test including the p-value, test statistic, - // and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - -// perform a KS test using a cumulative distribution function of our making -val myCDF: Double => Double = ... -val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
@@ -479,23 +222,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% highlight java %} -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; - -JavaSparkContext jsc = ... -JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); -KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); -// summary of the test including the p-value, test statistic, -// and null hypothesis -// if our p-value indicates significance, we can reject the null hypothesis -System.out.println(testResult); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
@@ -505,19 +232,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -parallelData = sc.parallelize([1.0, 2.0, ... ]) - -# run a KS test for the sample versus a standard normal distribution -testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) -print(testResult) # summary of the test including the p-value, test statistic, - # and null hypothesis - # if our p-value indicates significance, we can reject the null hypothesis -# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with -# a lambda to calculate the CDF is not made available in the Python API -{% endhighlight %} +{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
@@ -651,21 +366,7 @@ to do so. Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.stat.KernelDensity -import org.apache.spark.rdd.RDD - -val data: RDD[Double] = ... // an RDD of sample data - -// Construct the density estimator with the sample data and a standard deviation for the Gaussian -// kernels -val kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0) - -// Find density estimates for the given values -val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %}
@@ -675,21 +376,7 @@ to do so.

Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.

-{% highlight java %}
-import org.apache.spark.mllib.stat.KernelDensity;
-import org.apache.spark.rdd.RDD;
-
-RDD<Double> data = ... // an RDD of sample data
-
-// Construct the density estimator with the sample data and a standard deviation for the Gaussian
-// kernels
-KernelDensity kd = new KernelDensity()
-  .setSample(data)
-  .setBandwidth(3.0);
-
-// Find density estimates for the given values
-double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
@@ -699,20 +386,7 @@ to do so. Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import KernelDensity - -data = ... # an RDD of sample data - -# Construct the density estimator with the sample data and a standard deviation for the Gaussian -# kernels -kd = KernelDensity() -kd.setSample(data) -kd.setBandwidth(3.0) - -# Find density estimates for the given values -densities = kd.estimate([-1.0, 2.0, 5.0]) -{% endhighlight %} +{% include_example python/mllib/kernel_density_estimation_example.py %}
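The `KernelDensity` API that all three included examples exercise is small enough to summarize in one sketch (Scala, assuming a `SparkContext` named `sc`; the sample values are illustrative only):

{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(
  Seq(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0))

// Gaussian kernels centered at each sample point; the bandwidth is the
// standard deviation of the kernels
val kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0)

// evaluate the estimated density function at the given points
val densities: Array[Double] = kd.estimate(Array(-1.0, 2.0, 5.0))
{% endhighlight %}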
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
new file mode 100644
index 0000000000..fd19b43504
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+public class JavaCorrelationsExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
+      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series
+
+    // must have the same number of partitions and cardinality as seriesX
+    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
+      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));
+
+    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
+    // If a method is not specified, Pearson's method will be used by default.
+    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
+    System.out.println("Correlation is: " + correlation);
+
+    // note that each Vector is a row and not a column
+    JavaRDD<Vector> data = jsc.parallelize(
+      Arrays.asList(
+        Vectors.dense(1.0, 10.0, 100.0),
+        Vectors.dense(2.0, 20.0, 200.0),
+        Vectors.dense(5.0, 33.0, 366.0)
+      )
+    );
+
+    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+    // If a method is not specified, Pearson's method will be used by default.
+    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
+    System.out.println(correlMatrix.toString());
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
new file mode 100644
index 0000000000..b48b95ff1d
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrices;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.ChiSqTestResult;
+// $example off$
+
+public class JavaHypothesisTestingExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    // a vector composed of the frequencies of events
+    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
+
+    // compute the goodness of fit. If a second vector to test against is not supplied
+    // as a parameter, the test runs against a uniform distribution.
+    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
+    // summary of the test including the p-value, degrees of freedom, test statistic,
+    // the method used, and the null hypothesis.
+    System.out.println(goodnessOfFitTestResult + "\n");
+
+    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+    // conduct Pearson's independence test on the input contingency matrix
+    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
+    // summary of the test including the p-value, degrees of freedom...
+    System.out.println(independenceTestResult + "\n");
+
+    // an RDD of labeled points
+    JavaRDD<LabeledPoint> obs = jsc.parallelize(
+      Arrays.asList(
+        new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+        new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+        new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+      )
+    );
+
+    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    // against the label.
+ ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); + int i = 1; + for (ChiSqTestResult result : featureTestResults) { + System.out.println("Column " + i + ":"); + System.out.println(result + "\n"); // summary of the test + i++; + } + // $example off$ + + jsc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java new file mode 100644 index 0000000000..fe611c9ae6 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ +import java.util.Arrays; + +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; +// $example off$ + +public class JavaHypothesisTestingKolmogorovSmirnovTestExample { + public static void main(String[] args) { + + SparkConf conf = + new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); + KolmogorovSmirnovTestResult testResult = + Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); + // summary of the test including the p-value, test statistic, and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis + System.out.println(testResult); + // $example off$ + + jsc.stop(); + } +} + diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java new file mode 100644 index 0000000000..41de0d90ec --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.stat.KernelDensity;
+// $example off$
+
+public class JavaKernelDensityEstimationExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    // an RDD of sample data
+    JavaRDD<Double> data = jsc.parallelize(
+      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));
+
+    // Construct the density estimator with the sample data
+    // and a standard deviation for the Gaussian kernels
+    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);
+
+    // Find density estimates for the given values
+    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});
+
+    System.out.println(Arrays.toString(densities));
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
new file mode 100644
index 0000000000..f5a451019b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import java.util.*;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.function.VoidFunction;
+// $example off$
+
+public class JavaStratifiedSamplingExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>(
+      Arrays.<Tuple2<Integer, Character>>asList(
+        new Tuple2<>(1, 'a'),
+        new Tuple2<>(1, 'b'),
+        new Tuple2<>(2, 'c'),
+        new Tuple2<>(2, 'd'),
+        new Tuple2<>(2, 'e'),
+        new Tuple2<>(3, 'f')
+      )
+    );
+
+    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);
+
+    // specify the exact fraction desired from each key as a Map<K, Object>
+    ImmutableMap<Integer, Object> fractions =
+      ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);
+
+    // Get an approximate sample from each stratum
+    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
+    // Get an exact sample from each stratum
+    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
+    // $example off$
+
+    System.out.println("approxSample size is " + approxSample.collect().size());
+    for (Tuple2<Integer, Character> t : approxSample.collect()) {
+      System.out.println(t._1() + " " + t._2());
+    }
+
+    System.out.println("exactSample size is " + exactSample.collect().size());
+    for (Tuple2<Integer, Character> t : exactSample.collect()) {
+      System.out.println(t._1() + " " + t._2());
+    }
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
new file mode 100644
index 0000000000..278706bc8f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+public class JavaSummaryStatisticsExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    JavaRDD<Vector> mat = jsc.parallelize(
+      Arrays.asList(
+        Vectors.dense(1.0, 10.0, 100.0),
+        Vectors.dense(2.0, 20.0, 200.0),
+        Vectors.dense(3.0, 30.0, 300.0)
+      )
+    ); // an RDD of Vectors
+
+    // Compute column summary statistics.
+    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
+    System.out.println(summary.mean());  // a dense vector containing the mean value for each column
+    System.out.println(summary.variance());  // column-wise variance
+    System.out.println(summary.numNonzeros());  // number of nonzeros in each column
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py
new file mode 100644
index 0000000000..66d18f6e5d
--- /dev/null
+++ b/examples/src/main/python/mllib/correlations_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+import numpy as np
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="CorrelationsExample")  # SparkContext
+
+    # $example on$
+    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
+    # seriesY must have the same number of partitions and cardinality as seriesX
+    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])
+
+    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
+    # If a method is not specified, Pearson's method will be used by default.
+    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))
+
+    data = sc.parallelize(
+        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
+    )  # an RDD of Vectors
+
+    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+    # If a method is not specified, Pearson's method will be used by default.
+    print(Statistics.corr(data, method="pearson"))
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
new file mode 100644
index 0000000000..e566ead0d3
--- /dev/null
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.linalg import Matrices, Vectors
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="HypothesisTestingExample")
+
+    # $example on$
+    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events
+
+    # compute the goodness of fit. If a second vector to test against
+    # is not supplied as a parameter, the test runs against a uniform distribution.
+    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+
+    # summary of the test including the p-value, degrees of freedom,
+    # test statistic, the method used, and the null hypothesis.
+    print("%s\n" % goodnessOfFitTestResult)
+
+    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix
+
+    # conduct Pearson's independence test on the input contingency matrix
+    independenceTestResult = Statistics.chiSqTest(mat)
+
+    # summary of the test including the p-value, degrees of freedom,
+    # test statistic, the method used, and the null hypothesis.
+    print("%s\n" % independenceTestResult)
+
+    obs = sc.parallelize(
+        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
+         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
+         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
+    )  # LabeledPoint(label, features)
+
+    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    # against the label.
+    featureTestResults = Statistics.chiSqTest(obs)
+
+    for i, result in enumerate(featureTestResults):
+        print("Column %d:\n%s" % (i + 1, result))
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
new file mode 100644
index 0000000000..ef380dee79
--- /dev/null
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.stat import Statistics +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") + + # $example on$ + parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) + + # run a KS test for the sample versus a standard normal distribution + testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) + # summary of the test including the p-value, test statistic, and null hypothesis + # if our p-value indicates significance, we can reject the null hypothesis + # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with + # a lambda to calculate the CDF is not made available in the Python API + print(testResult) + # $example off$ + + sc.stop() diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py new file mode 100644 index 0000000000..3e8f7241a4 --- /dev/null +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.stat import KernelDensity +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext + + # $example on$ + # an RDD of sample data + data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]) + + # Construct the density estimator with the sample data and a standard deviation for the Gaussian + # kernels + kd = KernelDensity() + kd.setSample(data) + kd.setBandwidth(3.0) + + # Find density estimates for the given values + densities = kd.estimate([-1.0, 2.0, 5.0]) + # $example off$ + + print(densities) + + sc.stop() diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py new file mode 100644 index 0000000000..a13f8f08dd --- /dev/null +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext + +if __name__ == "__main__": + sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext + + # $example on$ + # an RDD of any key value pairs + data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) + + # specify the exact fraction desired from each key as a dictionary + fractions = {1: 0.1, 2: 0.6, 3: 0.3} + + approxSample = data.sampleByKey(False, fractions) + # $example off$ + + for each in approxSample.collect(): + print(each) + + sc.stop() diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py new file mode 100644 index 0000000000..d55d1a2c2d --- /dev/null +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +import numpy as np + +from pyspark.mllib.stat import Statistics +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext + + # $example on$ + mat = sc.parallelize( + [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])] + ) # an RDD of Vectors + + # Compute column summary statistics. + summary = Statistics.colStats(mat) + print(summary.mean()) # a dense vector containing the mean value for each column + print(summary.variance()) # column-wise variance + print(summary.numNonzeros()) # number of nonzeros in each column + # $example off$ + + sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala new file mode 100644 index 0000000000..1202caf534 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.{SparkConf, SparkContext} +// $example on$ +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD +// $example off$ + +object CorrelationsExample { + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf().setAppName("CorrelationsExample") + val sc = new SparkContext(conf) + + // $example on$ + val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series + // must have the same number of partitions and cardinality as seriesX + val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) + + // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a + // method is not specified, Pearson's method will be used by default. + val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") + println(s"Correlation is: $correlation") + + val data: RDD[Vector] = sc.parallelize( + Seq( + Vectors.dense(1.0, 10.0, 100.0), + Vectors.dense(2.0, 20.0, 200.0), + Vectors.dense(5.0, 33.0, 366.0)) + ) // note that each Vector is a row and not a column + + // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method + // If a method is not specified, Pearson's method will be used by default. 
+    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
+    println(correlMatrix.toString)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
new file mode 100644
index 0000000000..0d391a3637
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object HypothesisTestingExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("HypothesisTestingExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // a vector composed of the frequencies of events
+    val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
+
+    // compute the goodness of fit. If a second vector to test against is not supplied
+    // as a parameter, the test runs against a uniform distribution.
+    val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+    // summary of the test including the p-value, degrees of freedom, test statistic, the method
+    // used, and the null hypothesis.
+    println(s"$goodnessOfFitTestResult\n")
+
+    // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+    val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
+
+    // conduct Pearson's independence test on the input contingency matrix
+    val independenceTestResult = Statistics.chiSqTest(mat)
+    // summary of the test including the p-value and degrees of freedom
+    println(s"$independenceTestResult\n")
+
+    // an RDD of (feature, label) pairs
+    val obs: RDD[LabeledPoint] =
+      sc.parallelize(
+        Seq(
+          LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+          LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+          LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+        )
+      )
+
+    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    // against the label.
+    val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
+    featureTestResults.zipWithIndex.foreach { case (result, i) =>
+      println(s"Column ${i + 1}:")
+      println(result)
+    } // summary of the test for each feature
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
new file mode 100644
index 0000000000..840874cf3c
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object HypothesisTestingKolmogorovSmirnovTestExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data
+
+    // run a KS test for the sample versus a standard normal distribution
+    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+    // summary of the test including the p-value, test statistic, and null hypothesis. If our
+    // p-value indicates significance, we can reject the null hypothesis.
+    println(testResult)
+    println()
+
+    // perform a KS test using a cumulative distribution function of our own making
+    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
+    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
+    println(testResult2)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala
new file mode 100644
index 0000000000..cc5d159b36
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.stat.KernelDensity
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object KernelDensityEstimationExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("KernelDensityEstimationExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // an RDD of sample data
+    val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9))
+
+    // Construct the density estimator with the sample data and a standard deviation
+    // for the Gaussian kernels
+    val kd = new KernelDensity()
+      .setSample(data)
+      .setBandwidth(3.0)
+
+    // Find density estimates for the given values
+    val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
+    // $example off$
+
+    densities.foreach(println)
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
new file mode 100644
index 0000000000..16b074ef60
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StratifiedSamplingExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("StratifiedSamplingExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // an RDD[(K, V)] of any key-value pairs
+    val data = sc.parallelize(
+      Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
+
+    // specify the exact fraction desired from each key
+    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)
+
+    // Get an approximate sample from each stratum
+    val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions)
+    // Get an exact sample from each stratum
+    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions)
+    // $example off$
+
+    println(s"approxSample size is ${approxSample.collect().size}")
+    approxSample.collect().foreach(println)
+
+    println(s"exactSample size is ${exactSample.collect().size}")
+    exactSample.collect().foreach(println)
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
new file mode 100644
index 0000000000..948b443c0a
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkConf, SparkContext}
+// $example on$
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
+// $example off$
+
+object SummaryStatisticsExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val observations = sc.parallelize(
+      Seq(
+        Vectors.dense(1.0, 10.0, 100.0),
+        Vectors.dense(2.0, 20.0, 200.0),
+        Vectors.dense(3.0, 30.0, 300.0)
+      )
+    )
+
+    // Compute column summary statistics.
+    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
+    println(summary.mean) // a dense vector containing the mean value for each column
+    println(summary.variance) // column-wise variance
+    println(summary.numNonzeros) // number of nonzeros in each column
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
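Usage note: assuming a standard Spark checkout built with the examples module, the Scala programs added by this patch should be runnable through Spark's usual example runner, and the Python scripts through `spark-submit`, for instance:

    ./bin/run-example mllib.SummaryStatisticsExample
    ./bin/spark-submit examples/src/main/python/mllib/summary_statistics_example.py

The `// $example on$` / `// $example off$` markers bound the snippet that the documentation's `include_example` tag extracts into the published page, which is why the SparkContext setup and shutdown deliberately sit outside the markers in every file.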