2017-03-28 22:19:16 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
2018-03-08 06:38:34 -05:00
|
|
|
import sys
|
|
|
|
|
2017-03-28 22:19:16 -04:00
|
|
|
from pyspark import since, SparkContext
|
|
|
|
from pyspark.ml.common import _java2py, _py2java
|
2018-04-17 13:11:08 -04:00
|
|
|
from pyspark.ml.wrapper import JavaWrapper, _jvm
|
|
|
|
from pyspark.sql.column import Column, _to_seq
|
|
|
|
from pyspark.sql.functions import lit
|
2017-03-28 22:19:16 -04:00
|
|
|
|
|
|
|
|
|
|
|
class ChiSquareTest(object):
    """
    Conduct Pearson's independence test for every feature against the label. For each feature,
    the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared
    statistic is computed. All label and feature values must be categorical.

    The null hypothesis is that the occurrence of the outcomes is statistically independent.

    .. versionadded:: 2.2.0
    """

    @staticmethod
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform a Pearson's independence test using dataset.

        .. versionadded:: 2.2.0

        .. versionchanged:: 3.1.0
            Added optional ``flatten`` argument.

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            DataFrame of categorical labels and categorical features.
            Real-valued features will be treated as categorical for each distinct value.
        featuresCol : str
            Name of features column in dataset, of type `Vector` (`VectorUDT`).
        labelCol : str
            Name of label column in dataset, of any numerical type.
        flatten : bool, optional
            if True, flattens the returned dataframe.

        Returns
        -------
        :py:class:`pyspark.sql.DataFrame`
            DataFrame containing the test result for every feature against the label.
            If flatten is True, this DataFrame will contain one row per feature with the following
            fields:

            - `featureIndex: int`
            - `pValue: float`
            - `degreesOfFreedom: int`
            - `statistic: float`

            If flatten is False, this DataFrame will contain a single Row with the following fields:

            - `pValues: Vector`
            - `degreesOfFreedom: Array[int]`
            - `statistics: Vector`

            Each of these fields has one value per feature.

        Examples
        --------
        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
        >>> row = chiSqResult.orderBy("featureIndex").collect()
        >>> row[0].statistic
        4.0
        """
        # Bridge into the JVM implementation: convert each Python argument to its
        # Java counterpart, invoke the Scala ChiSquareTest, and convert the result back.
        ctx = SparkContext._active_spark_context
        java_test = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        java_args = [_py2java(ctx, value) for value in (dataset, featuresCol, labelCol, flatten)]
        return _java2py(ctx, java_test.test(*java_args))
|
|
|
|
|
|
|
|
|
2017-04-07 05:00:10 -04:00
|
|
|
class Correlation(object):
    """
    Compute the correlation matrix for the input dataset of Vectors using the specified method.
    Methods currently supported: `pearson` (default), `spearman`.

    .. versionadded:: 2.2.0

    Notes
    -----
    For Spearman, a rank correlation, we need to create an RDD[Double] for each column
    and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
    which is fairly costly. Cache the input Dataset before calling corr with `method = 'spearman'`
    to avoid recomputing the common lineage.
    """

    @staticmethod
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with specified method using dataset.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            A DataFrame.
        column : str
            The name of the column of vectors for which the correlation coefficient needs
            to be computed. This must be a column of the dataset, and it must contain
            Vector objects.
        method : str, optional
            String specifying the method to use for computing correlation.
            Supported: `pearson` (default), `spearman`.

        Returns
        -------
        A DataFrame that contains the correlation matrix of the column of vectors. This
        DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

        Examples
        --------
        >>> from pyspark.ml.linalg import DenseMatrix, Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        # Delegate to the JVM-side Correlation object after translating every
        # argument into its Java representation.
        ctx = SparkContext._active_spark_context
        java_corr = _jvm().org.apache.spark.ml.stat.Correlation
        java_args = [_py2java(ctx, value) for value in (dataset, column, method)]
        return _java2py(ctx, java_corr.corr(*java_args))
|
|
|
|
|
|
|
|
|
2018-04-10 14:18:14 -04:00
|
|
|
class KolmogorovSmirnovTest(object):
    """
    Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a continuous
    distribution.

    By comparing the largest difference between the empirical cumulative
    distribution of the sample data and the theoretical distribution we can provide a test for
    the null hypothesis that the sample data comes from that theoretical distribution.

    .. versionadded:: 2.4.0
    """

    @staticmethod
    def test(dataset, sampleCol, distName, *params):
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            a Dataset or a DataFrame containing the sample of data to test.
        sampleCol : str
            Name of sample column in dataset, of any numerical type.
        distName : str
            a `string` name for a theoretical distribution, currently only support "norm".
        params : float
            a list of `float` values specifying the parameters to be used for the theoretical
            distribution. For "norm" distribution, the parameters include mean and variance.

        Returns
        -------
        A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
        This DataFrame will contain a single Row with the following fields:

        - `pValue: Double`
        - `statistic: Double`

        Examples
        --------
        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        # Distribution parameters must be JVM doubles; coerce each one explicitly
        # before packing them into a Scala Seq.
        params = [float(param) for param in params]
        return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                             _jvm().PythonUtils.toSeq(params)))
|
|
|
|
|
|
|
|
|
2018-04-17 13:11:08 -04:00
|
|
|
class Summarizer(object):
    """
    Tools for vectorized statistics on MLlib Vectors.

    The methods in this package provide various statistics for Vectors contained inside DataFrames.

    This class lets users pick the statistics they would like to extract for a given column.

    .. versionadded:: 2.4.0

    Examples
    --------
    >>> from pyspark.ml.stat import Summarizer
    >>> from pyspark.sql import Row
    >>> from pyspark.ml.linalg import Vectors
    >>> summarizer = Summarizer.metrics("mean", "count")
    >>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    ...                      Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()
    >>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
    +-----------------------------------+
    |aggregate_metrics(features, weight)|
    +-----------------------------------+
    |{[1.0,1.0,1.0], 1}                 |
    +-----------------------------------+
    <BLANKLINE>
    >>> df.select(summarizer.summary(df.features)).show(truncate=False)
    +--------------------------------+
    |aggregate_metrics(features, 1.0)|
    +--------------------------------+
    |{[1.0,1.5,2.0], 2}              |
    +--------------------------------+
    <BLANKLINE>
    >>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
    +--------------+
    |mean(features)|
    +--------------+
    |[1.0,1.0,1.0] |
    +--------------+
    <BLANKLINE>
    >>> df.select(Summarizer.mean(df.features)).show(truncate=False)
    +--------------+
    |mean(features)|
    +--------------+
    |[1.0,1.5,2.0] |
    +--------------+
    <BLANKLINE>
    """

    @staticmethod
    @since("2.4.0")
    def mean(col, weightCol=None):
        """
        return a column of mean summary
        """
        return Summarizer._get_single_metric(col, weightCol, "mean")

    @staticmethod
    @since("3.0.0")
    def sum(col, weightCol=None):
        """
        return a column of sum summary
        """
        return Summarizer._get_single_metric(col, weightCol, "sum")

    @staticmethod
    @since("2.4.0")
    def variance(col, weightCol=None):
        """
        return a column of variance summary
        """
        return Summarizer._get_single_metric(col, weightCol, "variance")

    @staticmethod
    @since("3.0.0")
    def std(col, weightCol=None):
        """
        return a column of std summary
        """
        return Summarizer._get_single_metric(col, weightCol, "std")

    @staticmethod
    @since("2.4.0")
    def count(col, weightCol=None):
        """
        return a column of count summary
        """
        return Summarizer._get_single_metric(col, weightCol, "count")

    @staticmethod
    @since("2.4.0")
    def numNonZeros(col, weightCol=None):
        """
        return a column of numNonZero summary
        """
        return Summarizer._get_single_metric(col, weightCol, "numNonZeros")

    @staticmethod
    @since("2.4.0")
    def max(col, weightCol=None):
        """
        return a column of max summary
        """
        return Summarizer._get_single_metric(col, weightCol, "max")

    @staticmethod
    @since("2.4.0")
    def min(col, weightCol=None):
        """
        return a column of min summary
        """
        return Summarizer._get_single_metric(col, weightCol, "min")

    @staticmethod
    @since("2.4.0")
    def normL1(col, weightCol=None):
        """
        return a column of normL1 summary
        """
        return Summarizer._get_single_metric(col, weightCol, "normL1")

    @staticmethod
    @since("2.4.0")
    def normL2(col, weightCol=None):
        """
        return a column of normL2 summary
        """
        return Summarizer._get_single_metric(col, weightCol, "normL2")

    @staticmethod
    def _check_param(featuresCol, weightCol):
        # Default the weight to a literal 1.0 column when the caller did not
        # supply one, then validate that both arguments are Column objects.
        if weightCol is None:
            weightCol = lit(1.0)
        if not isinstance(featuresCol, Column) or not isinstance(weightCol, Column):
            raise TypeError("featureCol and weightCol should be a Column")
        return featuresCol, weightCol

    @staticmethod
    def _get_single_metric(col, weightCol, metric):
        # Build a JVM-backed aggregate Column for a single named metric.
        col, weightCol = Summarizer._check_param(col, weightCol)
        return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric,
                                                col._jc, weightCol._jc))

    @staticmethod
    def metrics(*metrics):
        """
        Given a list of metrics, provides a builder that in turn computes metrics from a column.

        See the documentation of :py:class:`Summarizer` for an example.

        The following metrics are accepted (case sensitive):
         - mean: a vector that contains the coefficient-wise mean.
         - sum: a vector that contains the coefficient-wise sum.
         - variance: a vector that contains the coefficient-wise variance.
         - std: a vector that contains the coefficient-wise standard deviation.
         - count: the count of all vectors seen.
         - numNonzeros: a vector with the number of non-zeros for each coefficients
         - max: the maximum for each coefficient.
         - min: the minimum for each coefficient.
         - normL2: the Euclidean norm for each coefficient.
         - normL1: the L1 norm of each coefficient (sum of the absolute values).

        .. versionadded:: 2.4.0

        Notes
        -----
        Currently, the performance of this interface is about 2x~3x slower than using the RDD
        interface.

        Parameters
        ----------
        metrics : str
            metrics that can be provided.

        Returns
        -------
        :py:class:`pyspark.ml.stat.SummaryBuilder`
        """
        sc = SparkContext._active_spark_context
        js = JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer.metrics",
                                       _to_seq(sc, metrics))
        return SummaryBuilder(js)
|
|
|
|
|
|
|
|
|
|
|
|
class SummaryBuilder(JavaWrapper):
    """
    A builder object that provides summary statistics about a given column.

    Users should not directly create such builders, but instead use one of the methods in
    :py:class:`pyspark.ml.stat.Summarizer`

    .. versionadded:: 2.4.0
    """

    def __init__(self, jSummaryBuilder):
        super(SummaryBuilder, self).__init__(jSummaryBuilder)

    def summary(self, featuresCol, weightCol=None):
        """
        Returns an aggregate object that contains the summary of the column with the requested
        metrics.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        featuresCol : str
            a column that contains features Vector object.
        weightCol : str, optional
            a column that contains weight value. Default weight is 1.0.

        Returns
        -------
        :py:class:`pyspark.sql.Column`
            an aggregate column that contains the statistics. The exact content of this
            structure is determined during the creation of the builder.
        """
        # Validate/normalize the inputs, then delegate to the wrapped JVM builder.
        checked_features, checked_weight = Summarizer._check_param(featuresCol, weightCol)
        java_column = self._java_obj.summary(checked_features._jc, checked_weight._jc)
        return Column(java_column)
|
|
|
|
|
|
|
|
|
2019-12-27 00:30:18 -05:00
|
|
|
class MultivariateGaussian(object):
    """Represents a (mean, cov) tuple

    .. versionadded:: 3.0.0

    Examples
    --------
    >>> from pyspark.ml.linalg import DenseMatrix, Vectors
    >>> m = MultivariateGaussian(Vectors.dense([11,12]), DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0)))
    >>> (m.mean, m.cov.toArray())
    (DenseVector([11.0, 12.0]), array([[ 1.,  5.],
                 [ 3.,  2.]]))
    """

    def __init__(self, mean, cov):
        # Simple attribute container: the arguments are stored as-is, with no
        # validation or copying.
        self.mean, self.cov = mean, cov
|
|
|
|
|
|
|
|
|
2017-03-28 22:19:16 -04:00
|
|
|
if __name__ == "__main__":
|
|
|
|
import doctest
|
[SPARK-24740][PYTHON][ML] Make PySpark's tests compatible with NumPy 1.14+
## What changes were proposed in this pull request?
This PR proposes to make PySpark's tests compatible with NumPy 0.14+
NumPy 0.14.x introduced rather radical changes about its string representation.
For example, the tests below are failed:
```
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 895, in __main__.DenseMatrix.__str__
Failed example:
print(dm)
Expected:
DenseMatrix([[ 0., 2.],
[ 1., 3.]])
Got:
DenseMatrix([[0., 2.],
[1., 3.]])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 899, in __main__.DenseMatrix.__str__
Failed example:
print(dm)
Expected:
DenseMatrix([[ 0., 1.],
[ 2., 3.]])
Got:
DenseMatrix([[0., 1.],
[2., 3.]])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 939, in __main__.DenseMatrix.toArray
Failed example:
m.toArray()
Expected:
array([[ 0., 2.],
[ 1., 3.]])
Got:
array([[0., 2.],
[1., 3.]])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 324, in __main__.DenseVector.dot
Failed example:
dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
Expected:
array([ 5., 11.])
Got:
array([ 5., 11.])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 567, in __main__.SparseVector.dot
Failed example:
a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
Expected:
array([ 22., 22.])
Got:
array([22., 22.])
```
See [release note](https://docs.scipy.org/doc/numpy-1.14.0/release.html#compatibility-notes).
## How was this patch tested?
Manually tested:
```
$ ./run-tests --python-executables=python3.6,python2.7 --modules=pyspark-ml,pyspark-mllib
Running PySpark tests. Output is in /.../spark/python/unit-tests.log
Will test against the following Python executables: ['python3.6', 'python2.7']
Will test the following Python modules: ['pyspark-ml', 'pyspark-mllib']
Starting test(python2.7): pyspark.mllib.tests
Starting test(python2.7): pyspark.ml.classification
Starting test(python3.6): pyspark.mllib.tests
Starting test(python2.7): pyspark.ml.clustering
Finished test(python2.7): pyspark.ml.clustering (54s)
Starting test(python2.7): pyspark.ml.evaluation
Finished test(python2.7): pyspark.ml.classification (74s)
Starting test(python2.7): pyspark.ml.feature
Finished test(python2.7): pyspark.ml.evaluation (27s)
Starting test(python2.7): pyspark.ml.fpm
Finished test(python2.7): pyspark.ml.fpm (0s)
Starting test(python2.7): pyspark.ml.image
Finished test(python2.7): pyspark.ml.image (17s)
Starting test(python2.7): pyspark.ml.linalg.__init__
Finished test(python2.7): pyspark.ml.linalg.__init__ (1s)
Starting test(python2.7): pyspark.ml.recommendation
Finished test(python2.7): pyspark.ml.feature (76s)
Starting test(python2.7): pyspark.ml.regression
Finished test(python2.7): pyspark.ml.recommendation (69s)
Starting test(python2.7): pyspark.ml.stat
Finished test(python2.7): pyspark.ml.regression (45s)
Starting test(python2.7): pyspark.ml.tests
Finished test(python2.7): pyspark.ml.stat (28s)
Starting test(python2.7): pyspark.ml.tuning
Finished test(python2.7): pyspark.ml.tuning (20s)
Starting test(python2.7): pyspark.mllib.classification
Finished test(python2.7): pyspark.mllib.classification (31s)
Starting test(python2.7): pyspark.mllib.clustering
Finished test(python2.7): pyspark.mllib.tests (260s)
Starting test(python2.7): pyspark.mllib.evaluation
Finished test(python3.6): pyspark.mllib.tests (266s)
Starting test(python2.7): pyspark.mllib.feature
Finished test(python2.7): pyspark.mllib.evaluation (21s)
Starting test(python2.7): pyspark.mllib.fpm
Finished test(python2.7): pyspark.mllib.feature (38s)
Starting test(python2.7): pyspark.mllib.linalg.__init__
Finished test(python2.7): pyspark.mllib.linalg.__init__ (1s)
Starting test(python2.7): pyspark.mllib.linalg.distributed
Finished test(python2.7): pyspark.mllib.fpm (34s)
Starting test(python2.7): pyspark.mllib.random
Finished test(python2.7): pyspark.mllib.clustering (64s)
Starting test(python2.7): pyspark.mllib.recommendation
Finished test(python2.7): pyspark.mllib.random (15s)
Starting test(python2.7): pyspark.mllib.regression
Finished test(python2.7): pyspark.mllib.linalg.distributed (47s)
Starting test(python2.7): pyspark.mllib.stat.KernelDensity
Finished test(python2.7): pyspark.mllib.stat.KernelDensity (0s)
Starting test(python2.7): pyspark.mllib.stat._statistics
Finished test(python2.7): pyspark.mllib.recommendation (40s)
Starting test(python2.7): pyspark.mllib.tree
Finished test(python2.7): pyspark.mllib.regression (38s)
Starting test(python2.7): pyspark.mllib.util
Finished test(python2.7): pyspark.mllib.stat._statistics (19s)
Starting test(python3.6): pyspark.ml.classification
Finished test(python2.7): pyspark.mllib.tree (26s)
Starting test(python3.6): pyspark.ml.clustering
Finished test(python2.7): pyspark.mllib.util (27s)
Starting test(python3.6): pyspark.ml.evaluation
Finished test(python3.6): pyspark.ml.evaluation (30s)
Starting test(python3.6): pyspark.ml.feature
Finished test(python2.7): pyspark.ml.tests (234s)
Starting test(python3.6): pyspark.ml.fpm
Finished test(python3.6): pyspark.ml.fpm (1s)
Starting test(python3.6): pyspark.ml.image
Finished test(python3.6): pyspark.ml.clustering (55s)
Starting test(python3.6): pyspark.ml.linalg.__init__
Finished test(python3.6): pyspark.ml.linalg.__init__ (0s)
Starting test(python3.6): pyspark.ml.recommendation
Finished test(python3.6): pyspark.ml.classification (71s)
Starting test(python3.6): pyspark.ml.regression
Finished test(python3.6): pyspark.ml.image (18s)
Starting test(python3.6): pyspark.ml.stat
Finished test(python3.6): pyspark.ml.stat (37s)
Starting test(python3.6): pyspark.ml.tests
Finished test(python3.6): pyspark.ml.regression (59s)
Starting test(python3.6): pyspark.ml.tuning
Finished test(python3.6): pyspark.ml.feature (93s)
Starting test(python3.6): pyspark.mllib.classification
Finished test(python3.6): pyspark.ml.recommendation (83s)
Starting test(python3.6): pyspark.mllib.clustering
Finished test(python3.6): pyspark.ml.tuning (29s)
Starting test(python3.6): pyspark.mllib.evaluation
Finished test(python3.6): pyspark.mllib.evaluation (26s)
Starting test(python3.6): pyspark.mllib.feature
Finished test(python3.6): pyspark.mllib.classification (43s)
Starting test(python3.6): pyspark.mllib.fpm
Finished test(python3.6): pyspark.mllib.clustering (81s)
Starting test(python3.6): pyspark.mllib.linalg.__init__
Finished test(python3.6): pyspark.mllib.linalg.__init__ (2s)
Starting test(python3.6): pyspark.mllib.linalg.distributed
Finished test(python3.6): pyspark.mllib.fpm (48s)
Starting test(python3.6): pyspark.mllib.random
Finished test(python3.6): pyspark.mllib.feature (54s)
Starting test(python3.6): pyspark.mllib.recommendation
Finished test(python3.6): pyspark.mllib.random (18s)
Starting test(python3.6): pyspark.mllib.regression
Finished test(python3.6): pyspark.mllib.linalg.distributed (55s)
Starting test(python3.6): pyspark.mllib.stat.KernelDensity
Finished test(python3.6): pyspark.mllib.stat.KernelDensity (1s)
Starting test(python3.6): pyspark.mllib.stat._statistics
Finished test(python3.6): pyspark.mllib.recommendation (51s)
Starting test(python3.6): pyspark.mllib.tree
Finished test(python3.6): pyspark.mllib.regression (45s)
Starting test(python3.6): pyspark.mllib.util
Finished test(python3.6): pyspark.mllib.stat._statistics (21s)
Finished test(python3.6): pyspark.mllib.tree (27s)
Finished test(python3.6): pyspark.mllib.util (27s)
Finished test(python3.6): pyspark.ml.tests (264s)
```
Author: hyukjinkwon <gurwls223@apache.org>
Closes #21715 from HyukjinKwon/SPARK-24740.
2018-07-06 23:39:29 -04:00
|
|
|
import numpy
|
2017-03-28 22:19:16 -04:00
|
|
|
import pyspark.ml.stat
|
|
|
|
from pyspark.sql import SparkSession
|
[SPARK-24740][PYTHON][ML] Make PySpark's tests compatible with NumPy 1.14+
## What changes were proposed in this pull request?
This PR proposes to make PySpark's tests compatible with NumPy 0.14+
NumPy 0.14.x introduced rather radical changes about its string representation.
For example, the tests below are failed:
```
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 895, in __main__.DenseMatrix.__str__
Failed example:
print(dm)
Expected:
DenseMatrix([[ 0., 2.],
[ 1., 3.]])
Got:
DenseMatrix([[0., 2.],
[1., 3.]])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 899, in __main__.DenseMatrix.__str__
Failed example:
print(dm)
Expected:
DenseMatrix([[ 0., 1.],
[ 2., 3.]])
Got:
DenseMatrix([[0., 1.],
[2., 3.]])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 939, in __main__.DenseMatrix.toArray
Failed example:
m.toArray()
Expected:
array([[ 0., 2.],
[ 1., 3.]])
Got:
array([[0., 2.],
[1., 3.]])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 324, in __main__.DenseVector.dot
Failed example:
dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
Expected:
array([ 5., 11.])
Got:
array([ 5., 11.])
**********************************************************************
File "/.../spark/python/pyspark/ml/linalg/__init__.py", line 567, in __main__.SparseVector.dot
Failed example:
a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
Expected:
array([ 22., 22.])
Got:
array([22., 22.])
```
See [release note](https://docs.scipy.org/doc/numpy-1.14.0/release.html#compatibility-notes).
## How was this patch tested?
Manually tested:
```
$ ./run-tests --python-executables=python3.6,python2.7 --modules=pyspark-ml,pyspark-mllib
Running PySpark tests. Output is in /.../spark/python/unit-tests.log
Will test against the following Python executables: ['python3.6', 'python2.7']
Will test the following Python modules: ['pyspark-ml', 'pyspark-mllib']
Starting test(python2.7): pyspark.mllib.tests
Starting test(python2.7): pyspark.ml.classification
Starting test(python3.6): pyspark.mllib.tests
Starting test(python2.7): pyspark.ml.clustering
Finished test(python2.7): pyspark.ml.clustering (54s)
Starting test(python2.7): pyspark.ml.evaluation
Finished test(python2.7): pyspark.ml.classification (74s)
Starting test(python2.7): pyspark.ml.feature
Finished test(python2.7): pyspark.ml.evaluation (27s)
Starting test(python2.7): pyspark.ml.fpm
Finished test(python2.7): pyspark.ml.fpm (0s)
Starting test(python2.7): pyspark.ml.image
Finished test(python2.7): pyspark.ml.image (17s)
Starting test(python2.7): pyspark.ml.linalg.__init__
Finished test(python2.7): pyspark.ml.linalg.__init__ (1s)
Starting test(python2.7): pyspark.ml.recommendation
Finished test(python2.7): pyspark.ml.feature (76s)
Starting test(python2.7): pyspark.ml.regression
Finished test(python2.7): pyspark.ml.recommendation (69s)
Starting test(python2.7): pyspark.ml.stat
Finished test(python2.7): pyspark.ml.regression (45s)
Starting test(python2.7): pyspark.ml.tests
Finished test(python2.7): pyspark.ml.stat (28s)
Starting test(python2.7): pyspark.ml.tuning
Finished test(python2.7): pyspark.ml.tuning (20s)
Starting test(python2.7): pyspark.mllib.classification
Finished test(python2.7): pyspark.mllib.classification (31s)
Starting test(python2.7): pyspark.mllib.clustering
Finished test(python2.7): pyspark.mllib.tests (260s)
Starting test(python2.7): pyspark.mllib.evaluation
Finished test(python3.6): pyspark.mllib.tests (266s)
Starting test(python2.7): pyspark.mllib.feature
Finished test(python2.7): pyspark.mllib.evaluation (21s)
Starting test(python2.7): pyspark.mllib.fpm
Finished test(python2.7): pyspark.mllib.feature (38s)
Starting test(python2.7): pyspark.mllib.linalg.__init__
Finished test(python2.7): pyspark.mllib.linalg.__init__ (1s)
Starting test(python2.7): pyspark.mllib.linalg.distributed
Finished test(python2.7): pyspark.mllib.fpm (34s)
Starting test(python2.7): pyspark.mllib.random
Finished test(python2.7): pyspark.mllib.clustering (64s)
Starting test(python2.7): pyspark.mllib.recommendation
Finished test(python2.7): pyspark.mllib.random (15s)
Starting test(python2.7): pyspark.mllib.regression
Finished test(python2.7): pyspark.mllib.linalg.distributed (47s)
Starting test(python2.7): pyspark.mllib.stat.KernelDensity
Finished test(python2.7): pyspark.mllib.stat.KernelDensity (0s)
Starting test(python2.7): pyspark.mllib.stat._statistics
Finished test(python2.7): pyspark.mllib.recommendation (40s)
Starting test(python2.7): pyspark.mllib.tree
Finished test(python2.7): pyspark.mllib.regression (38s)
Starting test(python2.7): pyspark.mllib.util
Finished test(python2.7): pyspark.mllib.stat._statistics (19s)
Starting test(python3.6): pyspark.ml.classification
Finished test(python2.7): pyspark.mllib.tree (26s)
Starting test(python3.6): pyspark.ml.clustering
Finished test(python2.7): pyspark.mllib.util (27s)
Starting test(python3.6): pyspark.ml.evaluation
Finished test(python3.6): pyspark.ml.evaluation (30s)
Starting test(python3.6): pyspark.ml.feature
Finished test(python2.7): pyspark.ml.tests (234s)
Starting test(python3.6): pyspark.ml.fpm
Finished test(python3.6): pyspark.ml.fpm (1s)
Starting test(python3.6): pyspark.ml.image
Finished test(python3.6): pyspark.ml.clustering (55s)
Starting test(python3.6): pyspark.ml.linalg.__init__
Finished test(python3.6): pyspark.ml.linalg.__init__ (0s)
Starting test(python3.6): pyspark.ml.recommendation
Finished test(python3.6): pyspark.ml.classification (71s)
Starting test(python3.6): pyspark.ml.regression
Finished test(python3.6): pyspark.ml.image (18s)
Starting test(python3.6): pyspark.ml.stat
Finished test(python3.6): pyspark.ml.stat (37s)
Starting test(python3.6): pyspark.ml.tests
Finished test(python3.6): pyspark.ml.regression (59s)
Starting test(python3.6): pyspark.ml.tuning
Finished test(python3.6): pyspark.ml.feature (93s)
Starting test(python3.6): pyspark.mllib.classification
Finished test(python3.6): pyspark.ml.recommendation (83s)
Starting test(python3.6): pyspark.mllib.clustering
Finished test(python3.6): pyspark.ml.tuning (29s)
Starting test(python3.6): pyspark.mllib.evaluation
Finished test(python3.6): pyspark.mllib.evaluation (26s)
Starting test(python3.6): pyspark.mllib.feature
Finished test(python3.6): pyspark.mllib.classification (43s)
Starting test(python3.6): pyspark.mllib.fpm
Finished test(python3.6): pyspark.mllib.clustering (81s)
Starting test(python3.6): pyspark.mllib.linalg.__init__
Finished test(python3.6): pyspark.mllib.linalg.__init__ (2s)
Starting test(python3.6): pyspark.mllib.linalg.distributed
Finished test(python3.6): pyspark.mllib.fpm (48s)
Starting test(python3.6): pyspark.mllib.random
Finished test(python3.6): pyspark.mllib.feature (54s)
Starting test(python3.6): pyspark.mllib.recommendation
Finished test(python3.6): pyspark.mllib.random (18s)
Starting test(python3.6): pyspark.mllib.regression
Finished test(python3.6): pyspark.mllib.linalg.distributed (55s)
Starting test(python3.6): pyspark.mllib.stat.KernelDensity
Finished test(python3.6): pyspark.mllib.stat.KernelDensity (1s)
Starting test(python3.6): pyspark.mllib.stat._statistics
Finished test(python3.6): pyspark.mllib.recommendation (51s)
Starting test(python3.6): pyspark.mllib.tree
Finished test(python3.6): pyspark.mllib.regression (45s)
Starting test(python3.6): pyspark.mllib.util
Finished test(python3.6): pyspark.mllib.stat._statistics (21s)
Finished test(python3.6): pyspark.mllib.tree (27s)
Finished test(python3.6): pyspark.mllib.util (27s)
Finished test(python3.6): pyspark.ml.tests (264s)
```
Author: hyukjinkwon <gurwls223@apache.org>
Closes #21715 from HyukjinKwon/SPARK-24740.
2018-07-06 23:39:29 -04:00
|
|
|
try:
|
|
|
|
# Numpy 1.14+ changed it's string format.
|
|
|
|
numpy.set_printoptions(legacy='1.13')
|
|
|
|
except TypeError:
|
|
|
|
pass
|
2017-03-28 22:19:16 -04:00
|
|
|
|
|
|
|
globs = pyspark.ml.stat.__dict__.copy()
|
|
|
|
# The small batch size here ensures that we see multiple batches,
|
|
|
|
# even in these small test examples:
|
|
|
|
spark = SparkSession.builder \
|
|
|
|
.master("local[2]") \
|
|
|
|
.appName("ml.stat tests") \
|
|
|
|
.getOrCreate()
|
|
|
|
sc = spark.sparkContext
|
|
|
|
globs['sc'] = sc
|
|
|
|
globs['spark'] = spark
|
|
|
|
|
|
|
|
failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
|
|
|
|
spark.stop()
|
|
|
|
if failure_count:
|
2018-03-08 06:38:34 -05:00
|
|
|
sys.exit(-1)
|