spark-instrumented-optimizer/python/pyspark/ml/stat.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark import since, SparkContext
from pyspark.ml.common import _java2py, _py2java
from pyspark.ml.wrapper import _jvm


class ChiSquareTest(object):
    """
    .. note:: Experimental

    Conduct Pearson's independence test for every feature against the label. For each feature,
    the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared
    statistic is computed. All label and feature values must be categorical.

    The null hypothesis is that the occurrence of the outcomes is statistically independent.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :return:
      DataFrame containing the test result for every feature against the label.
      This DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[Int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])

    .. versionadded:: 2.2.0

    """
    @staticmethod
    @since("2.2.0")
    def test(dataset, featuresCol, labelCol):
        """
        Perform a Pearson's independence test using dataset.
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))


class Correlation(object):
    """
    .. note:: Experimental

    Compute the correlation matrix for the input dataset of Vectors using the specified method.
    Methods currently supported: `pearson` (default), `spearman`.

    .. note:: For Spearman, a rank correlation, we need to create an RDD[Double] for each column
      and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
      which is fairly costly. Cache the input Dataset before calling corr with `method = 'spearman'`
      to avoid recomputing the common lineage.

    :param dataset:
      A dataset or a dataframe.
    :param column:
      The name of the column of vectors for which the correlation coefficient needs
      to be computed. This must be a column of the dataset, and it must contain
      Vector objects.
    :param method:
      String specifying the method to use for computing correlation.
      Supported: `pearson` (default), `spearman`.
    :return:
      A dataframe that contains the correlation matrix of the column of vectors. This
      dataframe contains a single row and a single column of name
      '$METHODNAME($COLUMN)'.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])

    .. versionadded:: 2.2.0

    """
    @staticmethod
    @since("2.2.0")
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with specified method using dataset.
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))


if __name__ == "__main__":
    import doctest
    import pyspark.ml.stat
    from pyspark.sql import SparkSession

    globs = pyspark.ml.stat.__dict__.copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    spark = SparkSession.builder \
        .master("local[2]") \
        .appName("ml.stat tests") \
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark

    failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        exit(-1)
[SPARK-20040][ML][PYTHON] pyspark wrapper for ChiSquareTest ## What changes were proposed in this pull request? A pyspark wrapper for spark.ml.stat.ChiSquareTest ## How was this patch tested? unit tests doctests Author: Bago Amirbekian <bago@databricks.com> Closes #17421 from MrBago/chiSquareTestWrapper. 2017-03-28 22:19:16 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from pyspark import since, SparkContext`
			`from pyspark.ml.common import _java2py, _py2java`
			`from pyspark.ml.wrapper import _jvm`


			`class ChiSquareTest(object):`
			`"""`
			`.. note:: Experimental`

			`Conduct Pearson's independence test for every feature against the label. For each feature,`
			`the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared`
			`statistic is computed. All label and feature values must be categorical.`

			`The null hypothesis is that the occurrence of the outcomes is statistically independent.`

			`:param dataset:`
			`DataFrame of categorical labels and categorical features.`
			`Real-valued features will be treated as categorical for each distinct value.`
			`:param featuresCol:`
			Name of features column in dataset, of type `Vector` (`VectorUDT`).
			`:param labelCol:`
			`Name of label column in dataset, of any numerical type.`
			`:return:`
			`DataFrame containing the test result for every feature against the label.`
			`This DataFrame will contain a single Row with the following fields:`
			- `pValues: Vector`
			- `degreesOfFreedom: Array[Int]`
			- `statistics: Vector`
			`Each of these fields has one value per feature.`

			`>>> from pyspark.ml.linalg import Vectors`
			`>>> from pyspark.ml.stat import ChiSquareTest`
			`>>> dataset = [[0, Vectors.dense([0, 0, 1])],`
			`... [0, Vectors.dense([1, 0, 1])],`
			`... [1, Vectors.dense([2, 1, 1])],`
			`... [1, Vectors.dense([3, 1, 1])]]`
			`>>> dataset = spark.createDataFrame(dataset, ["label", "features"])`
			`>>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')`
			`>>> chiSqResult.select("degreesOfFreedom").collect()[0]`
			`Row(degreesOfFreedom=[3, 1, 0])`

			`.. versionadded:: 2.2.0`

			`"""`
			`@staticmethod`
			`@since("2.2.0")`
			`def test(dataset, featuresCol, labelCol):`
			`"""`
			`Perform a Pearson's independence test using dataset.`
			`"""`
			`sc = SparkContext._active_spark_context`
			`javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest`
			`args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]`
			`return _java2py(sc, javaTestObj.test(*args))`


[SPARK-20076][ML][PYSPARK] Add Python interface for ml.stats.Correlation ## What changes were proposed in this pull request? The Dataframes-based support for the correlation statistics is added in #17108. This patch adds the Python interface for it. ## How was this patch tested? Python unit test. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Liang-Chi Hsieh <viirya@gmail.com> Closes #17494 from viirya/correlation-python-api. 2017-04-07 05:00:10 -04:00			`class Correlation(object):`
			`"""`
			`.. note:: Experimental`

			`Compute the correlation matrix for the input dataset of Vectors using the specified method.`
			Methods currently supported: `pearson` (default), `spearman`.

			`.. note:: For Spearman, a rank correlation, we need to create an RDD[Double] for each column`
			`and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],`
			which is fairly costly. Cache the input Dataset before calling corr with `method = 'spearman'`
			`to avoid recomputing the common lineage.`

			`:param dataset:`
			`A dataset or a dataframe.`
			`:param column:`
			`The name of the column of vectors for which the correlation coefficient needs`
			`to be computed. This must be a column of the dataset, and it must contain`
			`Vector objects.`
			`:param method:`
			`String specifying the method to use for computing correlation.`
			Supported: `pearson` (default), `spearman`.
			`:return:`
			`A dataframe that contains the correlation matrix of the column of vectors. This`
			`dataframe contains a single row and a single column of name`
			`'$METHODNAME($COLUMN)'.`

			`>>> from pyspark.ml.linalg import Vectors`
			`>>> from pyspark.ml.stat import Correlation`
			`>>> dataset = [[Vectors.dense([1, 0, 0, -2])],`
			`... [Vectors.dense([4, 5, 0, 3])],`
			`... [Vectors.dense([6, 7, 0, 8])],`
			`... [Vectors.dense([9, 0, 0, 1])]]`
			`>>> dataset = spark.createDataFrame(dataset, ['features'])`
			`>>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]`
			`>>> print(str(pearsonCorr).replace('nan', 'NaN'))`
			`DenseMatrix([[ 1. , 0.0556..., NaN, 0.4004...],`
			`[ 0.0556..., 1. , NaN, 0.9135...],`
			`[ NaN, NaN, 1. , NaN],`
			`[ 0.4004..., 0.9135..., NaN, 1. ]])`
			`>>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]`
			`>>> print(str(spearmanCorr).replace('nan', 'NaN'))`
			`DenseMatrix([[ 1. , 0.1054..., NaN, 0.4 ],`
			`[ 0.1054..., 1. , NaN, 0.9486... ],`
			`[ NaN, NaN, 1. , NaN],`
			`[ 0.4 , 0.9486... , NaN, 1. ]])`

			`.. versionadded:: 2.2.0`

			`"""`
			`@staticmethod`
			`@since("2.2.0")`
			`def corr(dataset, column, method="pearson"):`
			`"""`
			`Compute the correlation matrix with specified method using dataset.`
			`"""`
			`sc = SparkContext._active_spark_context`
			`javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation`
			`args = [_py2java(sc, arg) for arg in (dataset, column, method)]`
			`return _java2py(sc, javaCorrObj.corr(*args))`


[SPARK-20040][ML][PYTHON] pyspark wrapper for ChiSquareTest ## What changes were proposed in this pull request? A pyspark wrapper for spark.ml.stat.ChiSquareTest ## How was this patch tested? unit tests doctests Author: Bago Amirbekian <bago@databricks.com> Closes #17421 from MrBago/chiSquareTestWrapper. 2017-03-28 22:19:16 -04:00			`if __name__ == "__main__":`
			`import doctest`
			`import pyspark.ml.stat`
			`from pyspark.sql import SparkSession`

			`globs = pyspark.ml.stat.__dict__.copy()`
			`# The small batch size here ensures that we see multiple batches,`
			`# even in these small test examples:`
			`spark = SparkSession.builder \`
			`.master("local[2]") \`
			`.appName("ml.stat tests") \`
			`.getOrCreate()`
			`sc = spark.sparkContext`
			`globs['sc'] = sc`
			`globs['spark'] = spark`

			`failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)`
			`spark.stop()`
			`if failure_count:`
			`exit(-1)`