2014-08-01 18:02:17 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
2015-07-20 12:00:01 -04:00
|
|
|
import sys
|
|
|
|
|
2020-07-13 22:22:44 -04:00
|
|
|
from pyspark.rdd import RDD
|
2014-10-31 01:25:18 -04:00
|
|
|
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
from pyspark.mllib.linalg import Matrix, _convert_to_vector
|
2014-11-11 01:26:16 -05:00
|
|
|
from pyspark.mllib.regression import LabeledPoint
|
2015-07-20 12:00:01 -04:00
|
|
|
from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult
|
2014-09-03 14:49:45 -04:00
|
|
|
|
|
|
|
|
2015-01-29 13:11:44 -05:00
|
|
|
__all__ = ['MultivariateStatisticalSummary', 'Statistics']
|
2014-08-13 02:47:42 -04:00
|
|
|
|
|
|
|
|
2014-10-31 01:25:18 -04:00
|
|
|
class MultivariateStatisticalSummary(JavaModelWrapper):

    """
    Trait for multivariate statistical summary of a data matrix.

    Each accessor delegates to the corresponding method of the wrapped
    JVM summary object via ``JavaModelWrapper.call``.
    """

    def _as_array(self, name):
        # Invoke the named JVM summarizer method and densify the resulting
        # vector into a NumPy array.
        return self.call(name).toArray()

    def mean(self):
        """Return the column-wise mean as an array."""
        return self._as_array("mean")

    def variance(self):
        """Return the column-wise variance as an array."""
        return self._as_array("variance")

    def count(self):
        """Return the number of rows (vectors) in the data as an int."""
        return int(self.call("count"))

    def numNonzeros(self):
        """Return the per-column number of non-zero entries as an array."""
        return self._as_array("numNonzeros")

    def max(self):
        """Return the column-wise maxima as an array."""
        return self._as_array("max")

    def min(self):
        """Return the column-wise minima as an array."""
        return self._as_array("min")

    def normL1(self):
        """Return the column-wise L1 norm as an array."""
        return self._as_array("normL1")

    def normL2(self):
        """Return the column-wise Euclidean (L2) norm as an array."""
        return self._as_array("normL2")
|
|
|
|
|
2014-08-06 15:58:24 -04:00
|
|
|
|
2014-08-01 18:02:17 -04:00
|
|
|
class Statistics(object):
|
|
|
|
|
2014-08-13 02:47:42 -04:00
|
|
|
@staticmethod
|
2014-09-19 18:01:11 -04:00
|
|
|
def colStats(rdd):
|
2014-08-13 02:47:42 -04:00
|
|
|
"""
|
|
|
|
Computes column-wise summary statistics for the input RDD[Vector].
|
|
|
|
|
2020-11-24 20:24:41 -05:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
rdd : :py:class:`pyspark.RDD`
|
|
|
|
an RDD[Vector] for which column-wise summary statistics
|
|
|
|
are to be computed.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
:class:`MultivariateStatisticalSummary`
|
|
|
|
object containing column-wise summary statistics.
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2014-09-19 18:01:11 -04:00
|
|
|
>>> from pyspark.mllib.linalg import Vectors
|
2014-08-13 02:47:42 -04:00
|
|
|
>>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
|
|
|
|
... Vectors.dense([4, 5, 0, 3]),
|
|
|
|
... Vectors.dense([6, 7, 0, 8])])
|
|
|
|
>>> cStats = Statistics.colStats(rdd)
|
|
|
|
>>> cStats.mean()
|
|
|
|
array([ 4., 4., 0., 3.])
|
|
|
|
>>> cStats.variance()
|
|
|
|
array([ 4., 13., 0., 25.])
|
|
|
|
>>> cStats.count()
|
2015-04-16 19:20:57 -04:00
|
|
|
3
|
2014-08-13 02:47:42 -04:00
|
|
|
>>> cStats.numNonzeros()
|
|
|
|
array([ 3., 2., 0., 3.])
|
|
|
|
>>> cStats.max()
|
|
|
|
array([ 6., 7., 0., 8.])
|
|
|
|
>>> cStats.min()
|
|
|
|
array([ 2., 0., 0., -2.])
|
|
|
|
"""
|
2014-10-31 01:25:18 -04:00
|
|
|
cStats = callMLlibFunc("colStats", rdd.map(_convert_to_vector))
|
|
|
|
return MultivariateStatisticalSummary(cStats)
|
2014-08-13 02:47:42 -04:00
|
|
|
|
2014-08-01 18:02:17 -04:00
|
|
|
@staticmethod
|
|
|
|
def corr(x, y=None, method=None):
|
|
|
|
"""
|
|
|
|
Compute the correlation (matrix) for the input RDD(s) using the
|
|
|
|
specified method.
|
2019-07-05 13:08:22 -04:00
|
|
|
Methods currently supported: `pearson (default), spearman`.
|
2014-08-01 18:02:17 -04:00
|
|
|
|
|
|
|
If a single RDD of Vectors is passed in, a correlation matrix
|
2019-07-05 13:08:22 -04:00
|
|
|
comparing the columns in the input RDD is returned. Use `method`
|
2014-08-01 18:02:17 -04:00
|
|
|
to specify the method to be used for single RDD inout.
|
|
|
|
If two RDDs of floats are passed in, a single float is returned.
|
|
|
|
|
2020-11-24 20:24:41 -05:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
x : :py:class:`pyspark.RDD`
|
|
|
|
an RDD of vector for which the correlation matrix is to be computed,
|
|
|
|
or an RDD of float of the same cardinality as y when y is specified.
|
|
|
|
y : :py:class:`pyspark.RDD`, optional
|
|
|
|
an RDD of float of the same cardinality as x.
|
|
|
|
method : str, optional
|
|
|
|
String specifying the method to use for computing correlation.
|
|
|
|
Supported: `pearson` (default), `spearman`
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
:py:class:`pyspark.mllib.linalg.Matrix`
|
|
|
|
Correlation matrix comparing columns in x.
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2014-08-01 18:02:17 -04:00
|
|
|
>>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
|
|
|
|
>>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
|
|
|
|
>>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
|
|
|
|
>>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
|
|
|
|
True
|
|
|
|
>>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
|
|
|
|
True
|
|
|
|
>>> Statistics.corr(x, y, "spearman")
|
|
|
|
0.5
|
|
|
|
>>> from math import isnan
|
|
|
|
>>> isnan(Statistics.corr(x, zeros))
|
|
|
|
True
|
2014-09-19 18:01:11 -04:00
|
|
|
>>> from pyspark.mllib.linalg import Vectors
|
2014-08-01 18:02:17 -04:00
|
|
|
>>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
|
|
|
|
... Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
|
[SPARK-2850] [SPARK-2626] [mllib] MLlib stats examples + small fixes
Added examples for statistical summarization:
* Scala: StatisticalSummary.scala
** Tests: correlation, MultivariateOnlineSummarizer
* python: statistical_summary.py
** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)
Added examples for random and sampled RDDs:
* Scala: RandomAndSampledRDDs.scala
* python: random_and_sampled_rdds.py
* Both test:
** RandomRDDGenerators.normalRDD, normalVectorRDD
** RDD.sample, takeSample, sampleByKey
Added sc.stop() to all examples.
CorrelationSuite.scala
* Added 1 test for RDDs with only 1 value
RowMatrix.scala
* numCols(): Added check for numRows = 0, with error message.
* computeCovariance(): Added check for numRows <= 1, with error message.
Python SparseVector (pyspark/mllib/linalg.py)
* Added toDense() function
python/run-tests script
* Added stat.py (doc test)
CC: mengxr dorx Main changes were examples to show usage across APIs.
Author: Joseph K. Bradley <joseph.kurata.bradley@gmail.com>
Closes #1878 from jkbradley/mllib-stats-api-check and squashes the following commits:
ea5c047 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
dafebe2 [Joseph K. Bradley] Bug fixes for examples SampledRDDs.scala and sampled_rdds.py: Check for division by 0 and for missing key in maps.
8d1e555 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
60c72d9 [Joseph K. Bradley] Fixed stat.py doc test to work for Python versions printing nan or NaN.
b20d90a [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
4e5d15e [Joseph K. Bradley] Changed pyspark/mllib/stat.py doc tests to use NaN instead of nan.
32173b7 [Joseph K. Bradley] Stats examples update.
c8c20dc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
cf70b07 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
0b7cec3 [Joseph K. Bradley] Small updates based on code review. Renamed statistical_summary.py to correlations.py
ab48f6e [Joseph K. Bradley] RowMatrix.scala * numCols(): Added check for numRows = 0, with error message. * computeCovariance(): Added check for numRows <= 1, with error message.
65e4ebc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
8195c78 [Joseph K. Bradley] Added examples for random and sampled RDDs: * Scala: RandomAndSampledRDDs.scala * python: random_and_sampled_rdds.py * Both test: ** RandomRDDGenerators.normalRDD, normalVectorRDD ** RDD.sample, takeSample, sampleByKey
064985b [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
ee918e9 [Joseph K. Bradley] Added examples for statistical summarization: * Scala: StatisticalSummary.scala ** Tests: correlation, MultivariateOnlineSummarizer * python: statistical_summary.py ** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)
2014-08-18 21:01:39 -04:00
|
|
|
>>> pearsonCorr = Statistics.corr(rdd)
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(str(pearsonCorr).replace('nan', 'NaN'))
|
[SPARK-2850] [SPARK-2626] [mllib] MLlib stats examples + small fixes
Added examples for statistical summarization:
* Scala: StatisticalSummary.scala
** Tests: correlation, MultivariateOnlineSummarizer
* python: statistical_summary.py
** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)
Added examples for random and sampled RDDs:
* Scala: RandomAndSampledRDDs.scala
* python: random_and_sampled_rdds.py
* Both test:
** RandomRDDGenerators.normalRDD, normalVectorRDD
** RDD.sample, takeSample, sampleByKey
Added sc.stop() to all examples.
CorrelationSuite.scala
* Added 1 test for RDDs with only 1 value
RowMatrix.scala
* numCols(): Added check for numRows = 0, with error message.
* computeCovariance(): Added check for numRows <= 1, with error message.
Python SparseVector (pyspark/mllib/linalg.py)
* Added toDense() function
python/run-tests script
* Added stat.py (doc test)
CC: mengxr dorx Main changes were examples to show usage across APIs.
Author: Joseph K. Bradley <joseph.kurata.bradley@gmail.com>
Closes #1878 from jkbradley/mllib-stats-api-check and squashes the following commits:
ea5c047 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
dafebe2 [Joseph K. Bradley] Bug fixes for examples SampledRDDs.scala and sampled_rdds.py: Check for division by 0 and for missing key in maps.
8d1e555 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
60c72d9 [Joseph K. Bradley] Fixed stat.py doc test to work for Python versions printing nan or NaN.
b20d90a [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
4e5d15e [Joseph K. Bradley] Changed pyspark/mllib/stat.py doc tests to use NaN instead of nan.
32173b7 [Joseph K. Bradley] Stats examples update.
c8c20dc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
cf70b07 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
0b7cec3 [Joseph K. Bradley] Small updates based on code review. Renamed statistical_summary.py to correlations.py
ab48f6e [Joseph K. Bradley] RowMatrix.scala * numCols(): Added check for numRows = 0, with error message. * computeCovariance(): Added check for numRows <= 1, with error message.
65e4ebc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
8195c78 [Joseph K. Bradley] Added examples for random and sampled RDDs: * Scala: RandomAndSampledRDDs.scala * python: random_and_sampled_rdds.py * Both test: ** RandomRDDGenerators.normalRDD, normalVectorRDD ** RDD.sample, takeSample, sampleByKey
064985b [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
ee918e9 [Joseph K. Bradley] Added examples for statistical summarization: * Scala: StatisticalSummary.scala ** Tests: correlation, MultivariateOnlineSummarizer * python: statistical_summary.py ** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)
2014-08-18 21:01:39 -04:00
|
|
|
[[ 1. 0.05564149 NaN 0.40047142]
|
|
|
|
[ 0.05564149 1. NaN 0.91359586]
|
|
|
|
[ NaN NaN 1. NaN]
|
|
|
|
[ 0.40047142 0.91359586 NaN 1. ]]
|
|
|
|
>>> spearmanCorr = Statistics.corr(rdd, method="spearman")
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(str(spearmanCorr).replace('nan', 'NaN'))
|
[SPARK-2850] [SPARK-2626] [mllib] MLlib stats examples + small fixes
Added examples for statistical summarization:
* Scala: StatisticalSummary.scala
** Tests: correlation, MultivariateOnlineSummarizer
* python: statistical_summary.py
** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)
Added examples for random and sampled RDDs:
* Scala: RandomAndSampledRDDs.scala
* python: random_and_sampled_rdds.py
* Both test:
** RandomRDDGenerators.normalRDD, normalVectorRDD
** RDD.sample, takeSample, sampleByKey
Added sc.stop() to all examples.
CorrelationSuite.scala
* Added 1 test for RDDs with only 1 value
RowMatrix.scala
* numCols(): Added check for numRows = 0, with error message.
* computeCovariance(): Added check for numRows <= 1, with error message.
Python SparseVector (pyspark/mllib/linalg.py)
* Added toDense() function
python/run-tests script
* Added stat.py (doc test)
CC: mengxr dorx Main changes were examples to show usage across APIs.
Author: Joseph K. Bradley <joseph.kurata.bradley@gmail.com>
Closes #1878 from jkbradley/mllib-stats-api-check and squashes the following commits:
ea5c047 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
dafebe2 [Joseph K. Bradley] Bug fixes for examples SampledRDDs.scala and sampled_rdds.py: Check for division by 0 and for missing key in maps.
8d1e555 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
60c72d9 [Joseph K. Bradley] Fixed stat.py doc test to work for Python versions printing nan or NaN.
b20d90a [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
4e5d15e [Joseph K. Bradley] Changed pyspark/mllib/stat.py doc tests to use NaN instead of nan.
32173b7 [Joseph K. Bradley] Stats examples update.
c8c20dc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
cf70b07 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
0b7cec3 [Joseph K. Bradley] Small updates based on code review. Renamed statistical_summary.py to correlations.py
ab48f6e [Joseph K. Bradley] RowMatrix.scala * numCols(): Added check for numRows = 0, with error message. * computeCovariance(): Added check for numRows <= 1, with error message.
65e4ebc [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
8195c78 [Joseph K. Bradley] Added examples for random and sampled RDDs: * Scala: RandomAndSampledRDDs.scala * python: random_and_sampled_rdds.py * Both test: ** RandomRDDGenerators.normalRDD, normalVectorRDD ** RDD.sample, takeSample, sampleByKey
064985b [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into mllib-stats-api-check
ee918e9 [Joseph K. Bradley] Added examples for statistical summarization: * Scala: StatisticalSummary.scala ** Tests: correlation, MultivariateOnlineSummarizer * python: statistical_summary.py ** Tests: correlation (since MultivariateOnlineSummarizer has no Python API)
2014-08-18 21:01:39 -04:00
|
|
|
[[ 1. 0.10540926 NaN 0.4 ]
|
|
|
|
[ 0.10540926 1. NaN 0.9486833 ]
|
|
|
|
[ NaN NaN 1. NaN]
|
|
|
|
[ 0.4 0.9486833 NaN 1. ]]
|
2014-08-01 18:02:17 -04:00
|
|
|
>>> try:
|
|
|
|
... Statistics.corr(rdd, "spearman")
|
2015-04-16 19:20:57 -04:00
|
|
|
... print("Method name as second argument without 'method=' shouldn't be allowed.")
|
2014-08-01 18:02:17 -04:00
|
|
|
... except TypeError:
|
|
|
|
... pass
|
|
|
|
"""
|
|
|
|
# Check inputs to determine whether a single value or a matrix is needed for output.
|
|
|
|
# Since it's legal for users to use the method name as the second argument, we need to
|
|
|
|
# check if y is used to specify the method name instead.
|
|
|
|
if type(y) == str:
|
|
|
|
raise TypeError("Use 'method=' to specify method name.")
|
2014-09-19 18:01:11 -04:00
|
|
|
|
2014-08-01 18:02:17 -04:00
|
|
|
if not y:
|
2014-10-31 01:25:18 -04:00
|
|
|
return callMLlibFunc("corr", x.map(_convert_to_vector), method).toArray()
|
2014-08-01 18:02:17 -04:00
|
|
|
else:
|
2014-10-31 01:25:18 -04:00
|
|
|
return callMLlibFunc("corr", x.map(float), y.map(float), method)
|
2014-08-01 18:02:17 -04:00
|
|
|
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
@staticmethod
|
|
|
|
def chiSqTest(observed, expected=None):
|
|
|
|
"""
|
|
|
|
If `observed` is Vector, conduct Pearson's chi-squared goodness
|
|
|
|
of fit test of the observed data against the expected distribution,
|
|
|
|
or against the uniform distribution (by default), with each category
|
|
|
|
having an expected frequency of `1 / len(observed)`.
|
|
|
|
|
|
|
|
If `observed` is matrix, conduct Pearson's independence test on the
|
|
|
|
input contingency matrix, which cannot contain negative entries or
|
|
|
|
columns or rows that sum up to 0.
|
|
|
|
|
|
|
|
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
|
|
|
|
test for every feature against the label across the input RDD.
|
|
|
|
For each feature, the (feature, label) pairs are converted into a
|
|
|
|
contingency matrix for which the chi-squared statistic is computed.
|
|
|
|
All label and feature values must be categorical.
|
|
|
|
|
2020-11-24 20:24:41 -05:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
observed : :py:class:`pyspark.mllib.linalg.Vector` or \
|
|
|
|
:py:class:`pyspark.mllib.linalg.Matrix`
|
|
|
|
it could be a vector containing the observed categorical
|
|
|
|
counts/relative frequencies, or the contingency matrix
|
|
|
|
(containing either counts or relative frequencies),
|
|
|
|
or an RDD of LabeledPoint containing the labeled dataset
|
|
|
|
with categorical features. Real-valued features will be
|
|
|
|
treated as categorical for each distinct value.
|
|
|
|
expected : :py:class:`pyspark.mllib.linalg.Vector`
|
|
|
|
Vector containing the expected categorical counts/relative
|
|
|
|
frequencies. `expected` is rescaled if the `expected` sum
|
|
|
|
differs from the `observed` sum.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
:py:class:`pyspark.mllib.stat.ChiSqTestResult`
|
|
|
|
object containing the test statistic, degrees
|
|
|
|
of freedom, p-value, the method used, and the null hypothesis.
|
|
|
|
|
|
|
|
Notes
|
|
|
|
-----
|
|
|
|
`observed` cannot contain negative values
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
>>> from pyspark.mllib.linalg import Vectors, Matrices
|
|
|
|
>>> observed = Vectors.dense([4, 6, 5])
|
|
|
|
>>> pearson = Statistics.chiSqTest(observed)
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(pearson.statistic)
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
0.4
|
|
|
|
>>> pearson.degreesOfFreedom
|
|
|
|
2
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(round(pearson.pValue, 4))
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
0.8187
|
|
|
|
>>> pearson.method
|
2020-07-13 22:22:44 -04:00
|
|
|
'pearson'
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
>>> pearson.nullHypothesis
|
2020-07-13 22:22:44 -04:00
|
|
|
'observed follows the same distribution as expected.'
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or againt the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
|
|
|
|
>>> observed = Vectors.dense([21, 38, 43, 80])
|
|
|
|
>>> expected = Vectors.dense([3, 5, 7, 20])
|
|
|
|
>>> pearson = Statistics.chiSqTest(observed, expected)
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(round(pearson.pValue, 4))
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
0.0027
|
|
|
|
|
|
|
|
>>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
|
|
|
|
>>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(round(chi.statistic, 4))
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
21.9958
|
|
|
|
|
|
|
|
>>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
|
|
|
|
... LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
|
|
|
|
... LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
|
|
|
|
... LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
|
|
|
|
... LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
|
|
|
|
... LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
|
|
|
|
>>> rdd = sc.parallelize(data, 4)
|
|
|
|
>>> chi = Statistics.chiSqTest(rdd)
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(chi[0].statistic)
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
0.75
|
2015-04-16 19:20:57 -04:00
|
|
|
>>> print(chi[1].statistic)
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
1.5
|
|
|
|
"""
|
|
|
|
if isinstance(observed, RDD):
|
2014-11-11 01:26:16 -05:00
|
|
|
if not isinstance(observed.first(), LabeledPoint):
|
|
|
|
raise ValueError("observed should be an RDD of LabeledPoint")
|
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
```
pyspark.mllib.stat.StatisticschiSqTest(observed, expected=None)
:: Experimental ::
If `observed` is Vector, conduct Pearson's chi-squared goodness
of fit test of the observed data against the expected distribution,
or against the uniform distribution (by default), with each category
having an expected frequency of `1 / len(observed)`.
(Note: `observed` cannot contain negative values)
If `observed` is matrix, conduct Pearson's independence test on the
input contingency matrix, which cannot contain negative entries or
columns or rows that sum up to 0.
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
test for every feature against the label across the input RDD.
For each feature, the (feature, label) pairs are converted into a
contingency matrix for which the chi-squared statistic is computed.
All label and feature values must be categorical.
:param observed: it could be a vector containing the observed categorical
counts/relative frequencies, or the contingency matrix
(containing either counts or relative frequencies),
or an RDD of LabeledPoint containing the labeled dataset
with categorical features. Real-valued features will be
treated as categorical for each distinct value.
:param expected: Vector containing the expected categorical counts/relative
frequencies. `expected` is rescaled if the `expected` sum
differs from the `observed` sum.
:return: ChiSquaredTest object containing the test statistic, degrees
of freedom, p-value, the method used, and the null hypothesis.
```
Author: Davies Liu <davies@databricks.com>
Closes #3091 from davies/his and squashes the following commits:
145d16c [Davies Liu] address comments
0ab0764 [Davies Liu] fix float
5097d54 [Davies Liu] add Hypothesis test Python API
2014-11-05 00:35:52 -05:00
|
|
|
jmodels = callMLlibFunc("chiSqTest", observed)
|
|
|
|
return [ChiSqTestResult(m) for m in jmodels]
|
|
|
|
|
|
|
|
if isinstance(observed, Matrix):
|
|
|
|
jmodel = callMLlibFunc("chiSqTest", observed)
|
|
|
|
else:
|
|
|
|
if expected and len(expected) != len(observed):
|
|
|
|
raise ValueError("`expected` should have same length with `observed`")
|
|
|
|
jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
|
|
|
|
return ChiSqTestResult(jmodel)
|
|
|
|
|
2015-07-20 12:00:01 -04:00
|
|
|
@staticmethod
|
|
|
|
def kolmogorovSmirnovTest(data, distName="norm", *params):
|
|
|
|
"""
|
|
|
|
Performs the Kolmogorov-Smirnov (KS) test for data sampled from
|
|
|
|
a continuous distribution. It tests the null hypothesis that
|
|
|
|
the data is generated from a particular distribution.
|
|
|
|
|
|
|
|
The given data is sorted and the Empirical Cumulative
|
|
|
|
Distribution Function (ECDF) is calculated
|
|
|
|
which for a given point is the number of points having a CDF
|
|
|
|
value lesser than it divided by the total number of points.
|
|
|
|
|
|
|
|
Since the data is sorted, this is a step function
|
|
|
|
that rises by (1 / length of data) for every ordered point.
|
|
|
|
|
|
|
|
The KS statistic gives us the maximum distance between the
|
|
|
|
ECDF and the CDF. Intuitively if this statistic is large, the
|
2018-08-11 22:23:36 -04:00
|
|
|
probability that the null hypothesis is true becomes small.
|
2015-07-20 12:00:01 -04:00
|
|
|
For specific details of the implementation, please have a look
|
|
|
|
at the Scala documentation.
|
|
|
|
|
|
|
|
|
2020-11-24 20:24:41 -05:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data : :py:class:`pyspark.RDD`
|
|
|
|
RDD, samples from the data
|
|
|
|
distName : str, optional
|
|
|
|
string, currently only "norm" is supported.
|
|
|
|
(Normal distribution) to calculate the
|
|
|
|
theoretical distribution of the data.
|
|
|
|
params
|
|
|
|
additional values which need to be provided for
|
|
|
|
a certain distribution.
|
|
|
|
If not provided, the default values are used.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
:py:class:`pyspark.mllib.stat.KolmogorovSmirnovTestResult`
|
|
|
|
object containing the test statistic, degrees of freedom, p-value,
|
|
|
|
the method used, and the null hypothesis.
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2015-07-20 12:00:01 -04:00
|
|
|
>>> kstest = Statistics.kolmogorovSmirnovTest
|
|
|
|
>>> data = sc.parallelize([-1.0, 0.0, 1.0])
|
|
|
|
>>> ksmodel = kstest(data, "norm")
|
|
|
|
>>> print(round(ksmodel.pValue, 3))
|
|
|
|
1.0
|
|
|
|
>>> print(round(ksmodel.statistic, 3))
|
|
|
|
0.175
|
|
|
|
>>> ksmodel.nullHypothesis
|
2020-07-13 22:22:44 -04:00
|
|
|
'Sample follows theoretical distribution'
|
2015-07-20 12:00:01 -04:00
|
|
|
|
|
|
|
>>> data = sc.parallelize([2.0, 3.0, 4.0])
|
|
|
|
>>> ksmodel = kstest(data, "norm", 3.0, 1.0)
|
|
|
|
>>> print(round(ksmodel.pValue, 3))
|
|
|
|
1.0
|
|
|
|
>>> print(round(ksmodel.statistic, 3))
|
|
|
|
0.175
|
|
|
|
"""
|
|
|
|
if not isinstance(data, RDD):
|
|
|
|
raise TypeError("data should be an RDD, got %s." % type(data))
|
2020-07-13 22:22:44 -04:00
|
|
|
if not isinstance(distName, str):
|
2015-07-20 12:00:01 -04:00
|
|
|
raise TypeError("distName should be a string, got %s." % type(distName))
|
|
|
|
|
|
|
|
params = [float(param) for param in params]
|
|
|
|
return KolmogorovSmirnovTestResult(
|
|
|
|
callMLlibFunc("kolmogorovSmirnovTest", data, distName, params))
|
|
|
|
|
2014-08-01 18:02:17 -04:00
|
|
|
|
|
|
|
def _test():
    """Run this module's doctests under a local 4-core SparkSession.

    Exits the interpreter with status -1 if any doctest fails.
    """
    import doctest
    import numpy
    from pyspark.sql import SparkSession

    try:
        # NumPy 1.14+ changed its default array string representation;
        # pin the legacy 1.13 format so doctest expected output matches
        # on every NumPy version.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        # Older NumPy releases do not accept the `legacy` keyword.
        pass

    globs = globals().copy()
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("mllib.stat.statistics tests")
        .getOrCreate()
    )
    # The doctests in this module reference `sc` directly.
    globs['sc'] = spark.sparkContext
    results = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if results.failed:
        sys.exit(-1)
|
2014-08-01 18:02:17 -04:00
|
|
|
|
|
|
|
|
|
|
|
# Allow running this module directly to execute its doctest suite;
# _test() exits with a non-zero status on any failure.
if __name__ == "__main__":
    _test()
|