spark-instrumented-optimizer/python/pyspark/pandas/ml.py
Hyukjin Kwon e9cc81151d [SPARK-36146][PYTHON][INFRA][TESTS] Upgrade Python version from 3.6 to 3.9 in GitHub Actions' linter/docs
This PR proposes to use Python 3.9 for the documentation and linter jobs in GitHub Actions. It also contains fixes for the mypy check failures introduced by the Python 3.9 upgrade:

```
python/pyspark/sql/pandas/_typing/protocols/frame.pyi:64: error: Name "np.ndarray" is not defined
python/pyspark/sql/pandas/_typing/protocols/frame.pyi:91: error: Name "np.recarray" is not defined
python/pyspark/sql/pandas/_typing/protocols/frame.pyi:165: error: Name "np.ndarray" is not defined
python/pyspark/pandas/categorical.py:82: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories"
python/pyspark/pandas/categorical.py:109: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered"
python/pyspark/ml/linalg/__init__.pyi:184: error: Return type "ndarray[Any, Any]" of "toArray" incompatible with return type "NoReturn" in supertype "Matrix"
python/pyspark/ml/linalg/__init__.pyi:217: error: Return type "ndarray[Any, Any]" of "toArray" incompatible with return type "NoReturn" in supertype "Matrix"
python/pyspark/pandas/typedef/typehints.py:163: error: Module has no attribute "bool"; maybe "bool_" or "bool8"?
python/pyspark/pandas/typedef/typehints.py:174: error: Module has no attribute "float"; maybe "float_", "cfloat", or "float96"?
python/pyspark/pandas/typedef/typehints.py:180: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"?
python/pyspark/pandas/ml.py:81: error: Value of type variable "_DTypeScalar_co" of "dtype" cannot be "object"
python/pyspark/pandas/indexing.py:1649: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"?
python/pyspark/pandas/indexing.py:1656: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"?
python/pyspark/pandas/frame.py:4969: error: Function "numpy.array" is not valid as a type
python/pyspark/pandas/frame.py:4969: note: Perhaps you need "Callable[...]" or a callback protocol?
python/pyspark/pandas/frame.py:4970: error: Function "numpy.array" is not valid as a type
python/pyspark/pandas/frame.py:4970: note: Perhaps you need "Callable[...]" or a callback protocol?
python/pyspark/pandas/frame.py:7402: error: "List[Any]" has no attribute "tolist"
python/pyspark/pandas/series.py:1030: error: Module has no attribute "_NoValue"
python/pyspark/pandas/series.py:1031: error: Module has no attribute "_NoValue"
python/pyspark/pandas/indexes/category.py:159: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories"
python/pyspark/pandas/indexes/category.py:180: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered"
python/pyspark/pandas/namespace.py:2036: error: Argument 1 to "column_name" has incompatible type "float"; expected "str"
python/pyspark/pandas/mlflow.py:59: error: Incompatible types in assignment (expression has type "Type[floating[Any]]", variable has type "str")
python/pyspark/pandas/data_type_ops/categorical_ops.py:43: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories"
python/pyspark/pandas/data_type_ops/categorical_ops.py:43: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered"
python/pyspark/pandas/data_type_ops/categorical_ops.py:56: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories"
python/pyspark/pandas/tests/test_typedef.py:70: error: Name "np.float" is not defined
python/pyspark/pandas/tests/test_typedef.py:77: error: Name "np.float" is not defined
python/pyspark/pandas/tests/test_typedef.py:85: error: Name "np.float" is not defined
python/pyspark/pandas/tests/test_typedef.py:100: error: Name "np.float" is not defined
python/pyspark/pandas/tests/test_typedef.py:108: error: Name "np.float" is not defined
python/pyspark/mllib/clustering.pyi:152: error: Incompatible types in assignment (expression has type "ndarray[Any, Any]", base class "KMeansModel" defined the type as "List[ndarray[Any, Any]]")
python/pyspark/mllib/classification.pyi:93: error: Signature of "predict" incompatible with supertype "LinearClassificationModel"
Found 32 errors in 15 files (checked 315 source files)
```
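Most of these failures trace back to NumPy 1.20+, which removed long-deprecated scalar aliases (`np.bool`, `np.float`, `np.int`) and ships stricter type stubs. A minimal sketch of the typical fixes (the variable names below are illustrative, not from the patch; the `# type: ignore` mirrors the one used in this file for the ml.py:81 error):

```python
import numpy as np

# NumPy >= 1.20 removed the deprecated aliases np.bool, np.float, np.int,
# which is what mypy reports above ('maybe "bool_" or "bool8"?').
# Use the builtin types or the explicit NumPy scalar types instead:
flags = np.array([True, False], dtype=np.bool_)  # was: dtype=np.bool
values = np.array([1.0, 2.0], dtype=np.float64)  # was: dtype=np.float

# The ml.py:81 error is different: the stricter stubs for np.dtype reject
# some arguments that are fine at runtime, so the patch suppresses the
# check in place rather than changing runtime behavior:
accepted_types = {np.dtype(dt) for dt in [np.int64, np.float64]}  # type: ignore
```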

Why the changes are needed: Python 3.6 was deprecated in SPARK-35938.

Does this PR introduce any user-facing change? No. Static analysis results from type hints may differ slightly, but those differences are non-breaking.

How was this patch tested? Manually, via a GitHub Actions build in a forked repository.

Closes #33356 from HyukjinKwon/SPARK-36146.

Authored-by: Hyukjin Kwon <gurwls223@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit a71dd6af2f)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
2021-07-16 11:41:53 +09:00

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Tuple, TYPE_CHECKING, cast

import numpy as np
import pandas as pd

import pyspark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.pandas._typing import Label
from pyspark.pandas.utils import column_labels_level

if TYPE_CHECKING:
    import pyspark.pandas as ps  # noqa: F401 (SPARK-34943)

CORRELATION_OUTPUT_COLUMN = "__correlation_output__"


def corr(psdf: "ps.DataFrame", method: str = "pearson") -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param psdf: the pandas-on-Spark dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ps.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ("pearson", "spearman")
    ndf, column_labels = to_numeric_df(psdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    # Correlation.corr returns a one-row DataFrame whose single cell holds the
    # correlation Matrix; pull it out and convert it to a NumPy array.
    pcorr = cast(pd.DataFrame, corr.toPandas())
    arr = pcorr.iloc[0, 0].toArray()
    if column_labels_level(column_labels) > 1:
        idx = pd.MultiIndex.from_tuples(column_labels)
    else:
        idx = pd.Index([label[0] for label in column_labels])
    return pd.DataFrame(arr, columns=idx, index=idx)


def to_numeric_df(psdf: "ps.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Label]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called
    '__correlation_output__'.

    TODO: index is not preserved currently
    :param psdf: the pandas-on-Spark dataframe.
    :return: a pair of (dataframe, column labels); the labels name the columns
             that were converted to numerical types

    >>> to_numeric_df(ps.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[__correlation_output__: vector], [('A',), ('B',)])
    """
    # TODO: it should be more robust.
    accepted_types = {
        np.dtype(dt)  # type: ignore
        for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]
    }
    numeric_column_labels = [
        label for label in psdf._internal.column_labels if psdf[label].dtype in accepted_types
    ]
    numeric_df = psdf._internal.spark_frame.select(
        *[psdf._internal.spark_column_for(idx) for idx in numeric_column_labels]
    )
    # Assemble all numeric columns into a single vector column, which is the
    # input format Correlation.corr expects.
    va = VectorAssembler(inputCols=numeric_df.columns, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_column_labels


def _test() -> None:
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.ml

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.ml.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]").appName("pyspark.pandas.ml tests").getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.ml, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()