Revert "[SPARK-13019][DOCS] Replace example code in mllib-statistics.md using include_example"
This reverts commit 1af8de200c.
This commit is contained in:
parent 3f49e0766f
commit 43ef1e52bf
@ -40,7 +40,19 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.

{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
{% highlight scala %}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD // needed for the RDD type used below

val observations: RDD[Vector] = ... // an RDD of Vectors

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean) // a dense vector containing the mean value for each column
println(summary.variance) // column-wise variance
println(summary.numNonzeros) // number of nonzeros in each column

{% endhighlight %}
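
For instance, a minimal way to construct such an `RDD[Vector]` is sketched below (this assumes an existing `SparkContext` named `sc`; the three sample rows are illustrative):

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors

// Hypothetical sample data; any RDD[Vector] works here.
val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

val summary = Statistics.colStats(observations)
println(summary.count) // number of rows, 3 in this sketch
{% endhighlight %}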
</div>

<div data-lang="java" markdown="1">
@ -52,7 +64,24 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.

{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;

JavaSparkContext jsc = ...

JavaRDD<Vector> mat = ... // an RDD of Vectors

// Compute column summary statistics.
MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
System.out.println(summary.mean()); // a dense vector containing the mean value for each column
System.out.println(summary.variance()); // column-wise variance
System.out.println(summary.numNonzeros()); // number of nonzeros in each column

{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
@ -63,7 +92,20 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API.

{% include_example python/mllib/summary_statistics_example.py %}
{% highlight python %}
from pyspark.mllib.stat import Statistics

sc = ... # SparkContext

mat = ... # an RDD of Vectors

# Compute column summary statistics.
summary = Statistics.colStats(mat)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

{% endhighlight %}
</div>

</div>
@ -82,7 +124,27 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp
Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.

{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD // needed for the RDD types used below

val sc: SparkContext = ...

val seriesX: RDD[Double] = ... // a series
val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX

// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

val data: RDD[Vector] = ... // note that each Vector is a row and not a column

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
val correlMatrix: Matrix = Statistics.corr(data, "pearson")

{% endhighlight %}
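
The comments above mention Spearman rank correlation as the alternative method; a one-line sketch of that call, reusing the same inputs:

{% highlight scala %}
// Spearman's rank correlation instead of Pearson's:
val spearman: Double = Statistics.corr(seriesX, seriesY, "spearman")
{% endhighlight %}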
</div>

<div data-lang="java" markdown="1">
@ -92,7 +154,28 @@ a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` r
Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.

{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
{% highlight java %}
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD; // needed for the JavaRDD<Vector> used below
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.stat.Statistics;

JavaSparkContext jsc = ...

JavaDoubleRDD seriesX = ... // a series
JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX

// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");

JavaRDD<Vector> data = ... // note that each Vector is a row and not a column

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");

{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
@ -102,7 +185,24 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.

{% include_example python/mllib/correlations_example.py %}
{% highlight python %}
from pyspark.mllib.stat import Statistics

sc = ... # SparkContext

seriesX = ... # a series
seriesY = ... # must have the same number of partitions and cardinality as seriesX

# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
# method is not specified, Pearson's method will be used by default.
print(Statistics.corr(seriesX, seriesY, method="pearson"))

data = ... # an RDD of Vectors
# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
print(Statistics.corr(data, method="pearson"))

{% endhighlight %}
</div>

</div>
@ -128,7 +228,21 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K
keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.

{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.PairRDDFunctions

val sc: SparkContext = ...

val data = ... // an RDD[(K, V)] of any key value pairs
val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key

// Get an approximate sample from each stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)
// Get an exact sample from each stratum
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)

{% endhighlight %}
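
For a concrete picture of the `fractions` argument, here is a sketch with integer keys, mirroring the sample data used in the Java example removed later in this commit; the per-key sampling rates are illustrative and `sc` is an assumed existing `SparkContext`:

{% highlight scala %}
// Keep ~10% of key 1, ~60% of key 2, and ~30% of key 3.
val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)
val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
{% endhighlight %}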
</div>

<div data-lang="java" markdown="1">
@ -138,7 +252,22 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K
keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.

{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
{% highlight java %}
import java.util.Map;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

JavaSparkContext jsc = ...

JavaPairRDD<K, V> data = ... // an RDD of any key value pairs
Map<K, Object> fractions = ... // specify the exact fraction desired from each key

// Get an approximate sample from each stratum
JavaPairRDD<K, V> approxSample = data.sampleByKey(false, fractions);
// Get an exact sample from each stratum
JavaPairRDD<K, V> exactSample = data.sampleByKeyExact(false, fractions);

{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to
@ -148,7 +277,16 @@ set of keys.
*Note:* `sampleByKeyExact()` is currently not supported in Python.

{% include_example python/mllib/stratified_sampling_example.py %}
{% highlight python %}

sc = ... # SparkContext

data = ... # an RDD of any key value pairs
fractions = ... # specify the exact fraction desired from each key as a dictionary

approxSample = data.sampleByKey(False, fractions)

{% endhighlight %}
</div>

</div>
@ -170,7 +308,41 @@ independence tests.
run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.

{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.test.ChiSqTestResult // needed for the result type below
import org.apache.spark.rdd.RDD

val sc: SparkContext = ...

val vec: Vector = ... // a vector composed of the frequencies of events

// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
// the test runs against a uniform distribution.
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom,
                                 // test statistic, the method used, and the null hypothesis.

val mat: Matrix = ... // a contingency matrix

// conduct Pearson's independence test on the input contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...

val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
var i = 1
featureTestResults.foreach { result =>
  println(s"Column $i:\n$result")
  i += 1
} // summary of the test

{% endhighlight %}
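
The goodness-of-fit test above runs against a uniform distribution by default; `Statistics.chiSqTest` also accepts an explicit vector of expected frequencies as a second argument. A brief sketch (the observed and expected values are illustrative):

{% highlight scala %}
val observed = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
val expected = Vectors.dense(0.2, 0.2, 0.2, 0.2, 0.2)
// test the observed frequencies against the supplied expected frequencies
val result = Statistics.chiSqTest(observed, expected)
println(result)
{% endhighlight %}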
</div>

<div data-lang="java" markdown="1">
@ -180,7 +352,46 @@ hypothesis tests.
Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.

{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %}
{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;

JavaSparkContext jsc = ...

Vector vec = ... // a vector composed of the frequencies of events

// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
// the test runs against a uniform distribution.
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
// and the null hypothesis.
System.out.println(goodnessOfFitTestResult);

Matrix mat = ... // a contingency matrix

// conduct Pearson's independence test on the input contingency matrix
ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
// summary of the test including the p-value, degrees of freedom...
System.out.println(independenceTestResult);

JavaRDD<LabeledPoint> obs = ... // an RDD of labeled points

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
int i = 1;
for (ChiSqTestResult result : featureTestResults) {
  System.out.println("Column " + i + ":");
  System.out.println(result); // summary of the test
  i++;
}

{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
@ -190,7 +401,39 @@ hypothesis tests.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.

{% include_example python/mllib/hypothesis_testing_example.py %}
{% highlight python %}
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

sc = SparkContext()

vec = Vectors.dense(...) # a vector composed of the frequencies of events

# compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
# the test runs against a uniform distribution.
goodnessOfFitTestResult = Statistics.chiSqTest(vec)
print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom,
                               # test statistic, the method used, and the null hypothesis.

mat = Matrices.dense(...) # a contingency matrix

# conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(mat)
print(independenceTestResult) # summary of the test including the p-value, degrees of freedom...

obs = sc.parallelize(...) # an RDD of LabeledPoint(feature, label)

# The contingency table is constructed from an RDD of LabeledPoint and used to conduct
# the independence test. Returns an array containing the ChiSquaredTestResult for every feature
# against the label.
featureTestResults = Statistics.chiSqTest(obs)

for i, result in enumerate(featureTestResults):
    print("Column %d:" % (i + 1))
    print(result)
{% endhighlight %}
</div>

</div>
@ -212,7 +455,21 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.

{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val data: RDD[Double] = ... // an RDD of sample data

// run a KS test for the sample versus a standard normal distribution
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult) // summary of the test including the p-value, test statistic,
                    // and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis

// perform a KS test using a cumulative distribution function of our making
val myCDF: Double => Double = ...
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
{% endhighlight %}
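
To make the user-supplied CDF concrete, here is a sketch that tests the same `data` against a uniform distribution on [0, 1]; the choice of distribution is illustrative:

{% highlight scala %}
// CDF of Uniform(0, 1): F(x) = x, clamped to [0, 1]
val uniformCDF: Double => Double = x => math.min(math.max(x, 0.0), 1.0)
val uniformTestResult = Statistics.kolmogorovSmirnovTest(data, uniformCDF)
println(uniformTestResult)
{% endhighlight %}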
</div>

<div data-lang="java" markdown="1">
@ -222,7 +479,23 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.

{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
{% highlight java %}
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;

import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;

JavaSparkContext jsc = ...
JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
// summary of the test including the p-value, test statistic,
// and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis
System.out.println(testResult);
{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
@ -232,7 +505,19 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.

{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
{% highlight python %}
from pyspark.mllib.stat import Statistics

parallelData = sc.parallelize([1.0, 2.0, ... ])

# run a KS test for the sample versus a standard normal distribution
testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
print(testResult) # summary of the test including the p-value, test statistic,
                  # and null hypothesis
# if our p-value indicates significance, we can reject the null hypothesis
# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
# a lambda to calculate the CDF is not made available in the Python API
{% endhighlight %}
</div>
</div>
@ -366,7 +651,21 @@ to do so.
Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API.

{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %}
{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD

val data: RDD[Double] = ... // an RDD of sample data

// Construct the density estimator with the sample data and a standard deviation for the Gaussian
// kernels
val kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0)

// Find density estimates for the given values
val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
{% endhighlight %}
</div>

<div data-lang="java" markdown="1">
@ -376,7 +675,21 @@ to do so.
Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.

{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
{% highlight java %}
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.rdd.RDD;

RDD<Double> data = ... // an RDD of sample data

// Construct the density estimator with the sample data and a standard deviation for the Gaussian
// kernels
KernelDensity kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0);

// Find density estimates for the given values
double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
@ -386,7 +699,20 @@ to do so.
Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API.

{% include_example python/mllib/kernel_density_estimation_example.py %}
{% highlight python %}
from pyspark.mllib.stat import KernelDensity

data = ... # an RDD of sample data

# Construct the density estimator with the sample data and a standard deviation for the Gaussian
# kernels
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(3.0)

# Find density estimates for the given values
densities = kd.estimate([-1.0, 2.0, 5.0])
{% endhighlight %}
</div>

</div>

examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
@ -1,70 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.Statistics;
// $example off$

public class JavaCorrelationsExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
  }
}

examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@ -1,84 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
// $example off$

public class JavaHypothesisTestingExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // a vector composed of the frequencies of events
    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
    // summary of the test including the p-value, degrees of freedom, test statistic,
    // the method used, and the null hypothesis.
    System.out.println(goodnessOfFitTestResult + "\n");

    // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

    // conduct Pearson's independence test on the input contingency matrix
    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
    // summary of the test including the p-value, degrees of freedom...
    System.out.println(independenceTestResult + "\n");

    // an RDD of labeled points
    JavaRDD<LabeledPoint> obs = jsc.parallelize(
      Arrays.asList(
        new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
        new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
        new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
      )
    );

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
    int i = 1;
    for (ChiSqTestResult result : featureTestResults) {
      System.out.println("Column " + i + ":");
      System.out.println(result + "\n"); // summary of the test
      i++;
    }
    // $example off$

    jsc.stop();
  }
}

examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@ -1,49 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
// $example off$

public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
  public static void main(String[] args) {

    SparkConf conf =
      new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    KolmogorovSmirnovTestResult testResult =
      Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis
    // if our p-value indicates significance, we can reject the null hypothesis
    System.out.println(testResult);
    // $example off$

    jsc.stop();
  }
}

examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@ -1,53 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.stat.KernelDensity;
// $example off$

public class JavaKernelDensityEstimationExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
  }
}

examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@ -1,75 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import java.util.*;

import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.VoidFunction;
// $example off$

public class JavaStratifiedSamplingExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>(
      Arrays.<Tuple2<Integer, Character>>asList(
        new Tuple2(1, 'a'),
        new Tuple2(1, 'b'),
        new Tuple2(2, 'c'),
        new Tuple2(2, 'd'),
        new Tuple2(2, 'e'),
        new Tuple2(3, 'f')
      )
    );

    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);

    // specify the exact fraction desired from each key Map<K, Object>
    ImmutableMap<Integer, Object> fractions =
      ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);

    // Get an approximate sample from each stratum
    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
    // Get an exact sample from each stratum
    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
    // $example off$

    System.out.println("approxSample size is " + approxSample.collect().size());
    for (Tuple2<Integer, Character> t : approxSample.collect()) {
      System.out.println(t._1() + " " + t._2());
    }

    System.out.println("exactSample size is " + exactSample.collect().size());
    for (Tuple2<Integer, Character> t : exactSample.collect()) {
      System.out.println(t._1() + " " + t._2());
    }

    jsc.stop();
  }
}

examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
@ -1,56 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
// $example off$

public class JavaSummaryStatisticsExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaRDD<Vector> mat = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    ); // an RDD of Vectors

    // Compute column summary statistics.
    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
    System.out.println(summary.mean()); // a dense vector containing the mean value for each column
    System.out.println(summary.variance()); // column-wise variance
    System.out.println(summary.numNonzeros()); // number of nonzeros in each column
    // $example off$

    jsc.stop();
  }
}

examples/src/main/python/mllib/correlations_example.py
@ -1,48 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

import numpy as np

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
    )  # an RDD of Vectors

    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()

examples/src/main/python/mllib/hypothesis_testing_example.py
@ -1,65 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize(
        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
    )  # LabeledPoint(feature, label)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    # against the label.
    featureTestResults = Statistics.chiSqTest(obs)

    for i, result in enumerate(featureTestResults):
        print("Column %d:\n%s" % (i + 1, result))
    # $example off$

    sc.stop()

examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@ -1,40 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")

    # $example on$
    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])

    # run a KS test for the sample versus a standard normal distribution
    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
    # summary of the test including the p-value, test statistic, and null hypothesis
    # if our p-value indicates significance, we can reject the null hypothesis
    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
    # a lambda to calculate the CDF is not made available in the Python API
    print(testResult)
    # $example off$

    sc.stop()

examples/src/main/python/mllib/kernel_density_estimation_example.py
@ -1,44 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import KernelDensity
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext

    # $example on$
    # an RDD of sample data
    data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])

    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
    # kernels
    kd = KernelDensity()
    kd.setSample(data)
    kd.setBandwidth(3.0)

    # Find density estimates for the given values
    densities = kd.estimate([-1.0, 2.0, 5.0])
    # $example off$

    print(densities)

    sc.stop()

examples/src/main/python/mllib/stratified_sampling_example.py
@ -1,38 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="StratifiedSamplingExample")  # SparkContext

    # $example on$
    # an RDD of any key value pairs
    data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')])

    # specify the exact fraction desired from each key as a dictionary
    fractions = {1: 0.1, 2: 0.6, 3: 0.3}

    approxSample = data.sampleByKey(False, fractions)
    # $example off$

    for each in approxSample.collect():
        print(each)

    sc.stop()

examples/src/main/python/mllib/summary_statistics_example.py
@ -1,42 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
import numpy as np

from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    # $example on$
    mat = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
    )  # an RDD of Vectors

    # Compute column summary statistics.
    summary = Statistics.colStats(mat)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    # $example off$

    sc.stop()

examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
@ -1,62 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object CorrelationsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("CorrelationsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series
    // must have the same number of partitions and cardinality as seriesX
    val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555))

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
    // method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println(s"Correlation is: $correlation")

    val data: RDD[Vector] = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0))
    ) // note that each Vector is a row and not a column

    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method
    // If a method is not specified, Pearson's method will be used by default.
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println(correlMatrix.toString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
@ -1,80 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.test.ChiSqTestResult
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingExample {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("HypothesisTestingExample")
    val sc = new SparkContext(conf)

    // $example on$
    // a vector composed of the frequencies of events
    val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

    // compute the goodness of fit. If a second vector to test against is not supplied
    // as a parameter, the test runs against a uniform distribution.
    val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
    // summary of the test including the p-value, degrees of freedom, test statistic, the method
    // used, and the null hypothesis.
    println(s"$goodnessOfFitTestResult\n")

    // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
    val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))

    // conduct Pearson's independence test on the input contingency matrix
    val independenceTestResult = Statistics.chiSqTest(mat)
    // summary of the test including the p-value, degrees of freedom
    println(s"$independenceTestResult\n")

    val obs: RDD[LabeledPoint] =
      sc.parallelize(
        Seq(
          LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
          LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
          LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
        )
      ) // (feature, label) pairs.

    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    // against the label.
    val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
    featureTestResults.zipWithIndex.foreach { case (k, v) =>
      println("Column " + (v + 1).toString + ":")
      println(k)
    } // summary of the test
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
@@ -1,54 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
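    // here "norm" takes two parameters, the mean (0) and standard deviation (1)
    // of the theoretical normal distribution being tested against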
    // summary of the test including the p-value, test statistic, and null hypothesis.
    // If our p-value indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our own making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
@@ -1,54 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD
// $example off$

object KernelDensityEstimationExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("KernelDensityEstimationExample")
    val sc = new SparkContext(conf)

    // $example on$
    // an RDD of sample data
    val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9))

    // Construct the density estimator with the sample data and a standard deviation
    // for the Gaussian kernels
    val kd = new KernelDensity()
      .setSample(data)
      .setBandwidth(3.0)

    // Find density estimates for the given values
    val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
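    // each estimate is the average of the Gaussian kernels centered at the sample
    // points, evaluated at the given value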
    // $example off$

    densities.foreach(println)

    sc.stop()
  }
}
// scalastyle:on println
@@ -1,53 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}

object StratifiedSamplingExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("StratifiedSamplingExample")
    val sc = new SparkContext(conf)

    // $example on$
    // an RDD[(K, V)] of any key-value pairs
    val data = sc.parallelize(
      Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))

    // specify the exact fraction desired from each key
    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

    // Get an approximate sample from each stratum
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    // Get an exact sample from each stratum
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
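    // sampleByKeyExact makes additional passes over the RDD to guarantee the exact
    // sample size for each stratum, so it is more expensive than sampleByKey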
    // $example off$

    println("approxSample size is " + approxSample.collect().size.toString)
    approxSample.collect().foreach(println)

    println("exactSample size is " + exactSample.collect().size.toString)
    exactSample.collect().foreach(println)

    sc.stop()
  }
}
// scalastyle:on println
@@ -1,53 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean) // a dense vector containing the mean value for each column
    println(summary.variance) // column-wise variance
    println(summary.numNonzeros) // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println