a71dd6af2f
### What changes were proposed in this pull request? This PR proposes to use Python 3.9 in documentation and linter at GitHub Actions. This PR also contains the fixes for mypy check (introduced by Python 3.9 upgrade) ``` python/pyspark/sql/pandas/_typing/protocols/frame.pyi:64: error: Name "np.ndarray" is not defined python/pyspark/sql/pandas/_typing/protocols/frame.pyi:91: error: Name "np.recarray" is not defined python/pyspark/sql/pandas/_typing/protocols/frame.pyi:165: error: Name "np.ndarray" is not defined python/pyspark/pandas/categorical.py:82: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/categorical.py:109: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered" python/pyspark/ml/linalg/__init__.pyi:184: error: Return type "ndarray[Any, Any]" of "toArray" incompatible with return type "NoReturn" in supertype "Matrix" python/pyspark/ml/linalg/__init__.pyi:217: error: Return type "ndarray[Any, Any]" of "toArray" incompatible with return type "NoReturn" in supertype "Matrix" python/pyspark/pandas/typedef/typehints.py:163: error: Module has no attribute "bool"; maybe "bool_" or "bool8"? python/pyspark/pandas/typedef/typehints.py:174: error: Module has no attribute "float"; maybe "float_", "cfloat", or "float96"? python/pyspark/pandas/typedef/typehints.py:180: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"? python/pyspark/pandas/ml.py:81: error: Value of type variable "_DTypeScalar_co" of "dtype" cannot be "object" python/pyspark/pandas/indexing.py:1649: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"? python/pyspark/pandas/indexing.py:1656: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"? python/pyspark/pandas/frame.py:4969: error: Function "numpy.array" is not valid as a type python/pyspark/pandas/frame.py:4969: note: Perhaps you need "Callable[...]" or a callback protocol? 
python/pyspark/pandas/frame.py:4970: error: Function "numpy.array" is not valid as a type python/pyspark/pandas/frame.py:4970: note: Perhaps you need "Callable[...]" or a callback protocol? python/pyspark/pandas/frame.py:7402: error: "List[Any]" has no attribute "tolist" python/pyspark/pandas/series.py:1030: error: Module has no attribute "_NoValue" python/pyspark/pandas/series.py:1031: error: Module has no attribute "_NoValue" python/pyspark/pandas/indexes/category.py:159: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/indexes/category.py:180: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered" python/pyspark/pandas/namespace.py:2036: error: Argument 1 to "column_name" has incompatible type "float"; expected "str" python/pyspark/pandas/mlflow.py:59: error: Incompatible types in assignment (expression has type "Type[floating[Any]]", variable has type "str") python/pyspark/pandas/data_type_ops/categorical_ops.py:43: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/data_type_ops/categorical_ops.py:43: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered" python/pyspark/pandas/data_type_ops/categorical_ops.py:56: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/tests/test_typedef.py:70: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:77: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:85: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:100: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:108: error: Name "np.float" is not defined python/pyspark/mllib/clustering.pyi:152: error: Incompatible types in assignment (expression has type "ndarray[Any, Any]", base class "KMeansModel" defined the type as "List[ndarray[Any,
Any]]") python/pyspark/mllib/classification.pyi:93: error: Signature of "predict" incompatible with supertype "LinearClassificationModel" Found 32 errors in 15 files (checked 315 source files) 1 ``` ### Why are the changes needed? Python 3.6 is deprecated at SPARK-35938 ### Does this PR introduce _any_ user-facing change? No. Maybe static analysis, etc. by some type hints but they are really non-breaking. ### How was this patch tested? I manually checked by GitHub Actions build in forked repository. Closes #33356 from HyukjinKwon/SPARK-36146. Authored-by: Hyukjin Kwon <gurwls223@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
117 lines
4.1 KiB
Python
117 lines
4.1 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from typing import List, Tuple, TYPE_CHECKING, cast
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pyspark
|
|
|
|
from pyspark.ml.feature import VectorAssembler
|
|
from pyspark.ml.stat import Correlation
|
|
|
|
from pyspark.pandas._typing import Label
|
|
from pyspark.pandas.utils import column_labels_level
|
|
|
|
if TYPE_CHECKING:
|
|
import pyspark.pandas as ps # noqa: F401 (SPARK-34943)
|
|
|
|
|
|
# Temporary column name used to hold the assembled feature vector passed to
# pyspark.ml.stat.Correlation; double-underscored to avoid clashing with user columns.
CORRELATION_OUTPUT_COLUMN = "__correlation_output__"
|
|
|
|
|
|
def corr(psdf: "ps.DataFrame", method: str = "pearson") -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param psdf: the pandas-on-Spark dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ps.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ("pearson", "spearman")
    # Project to a single vector column of the numeric columns, then let Spark ML
    # compute the correlation matrix on it.
    numeric_sdf, labels = to_numeric_df(psdf)
    spark_corr = Correlation.corr(numeric_sdf, CORRELATION_OUTPUT_COLUMN, method)
    local_corr = cast(pd.DataFrame, spark_corr.toPandas())
    # The result is a 1x1 DataFrame holding a Matrix; materialize it as ndarray.
    matrix = local_corr.iloc[0, 0].toArray()
    # Multi-level column labels become a MultiIndex, flat labels a plain Index.
    if column_labels_level(labels) > 1:
        axis_index: pd.Index = pd.MultiIndex.from_tuples(labels)
    else:
        axis_index = pd.Index([label[0] for label in labels])
    return pd.DataFrame(matrix, columns=axis_index, index=axis_index)
|
|
|
|
|
|
def to_numeric_df(psdf: "ps.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Label]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called '_1'.

    TODO: index is not preserved currently
    :param psdf: the pandas-on-Spark dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns
             that were converted to numerical types)

    >>> to_numeric_df(ps.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[__correlation_output__: vector], [('A',), ('B',)])
    """
    # TODO, it should be more robust.
    numeric_types = [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]
    accepted_types = {np.dtype(t) for t in numeric_types}  # type: ignore
    # Keep only the column labels whose dtype is one of the accepted numeric dtypes.
    numeric_column_labels = [
        label for label in psdf._internal.column_labels if psdf[label].dtype in accepted_types
    ]
    spark_columns = [psdf._internal.spark_column_for(label) for label in numeric_column_labels]
    numeric_sdf = psdf._internal.spark_frame.select(*spark_columns)
    # Pack every numeric column into one vector column for pyspark.ml consumption.
    assembler = VectorAssembler(
        inputCols=numeric_sdf.columns, outputCol=CORRELATION_OUTPUT_COLUMN
    )
    assembled = assembler.transform(numeric_sdf).select(CORRELATION_OUTPUT_COLUMN)
    return assembled, numeric_column_labels
|
|
|
|
|
|
def _test() -> None:
    """Run this module's doctests against a local SparkSession."""
    import os
    import doctest
    import sys

    from pyspark.sql import SparkSession
    import pyspark.pandas.ml

    # Doctests assume the Spark source tree is the working directory.
    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.ml.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.ml tests")
        .getOrCreate()
    )
    failure_count, test_count = doctest.testmod(
        pyspark.pandas.ml,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    # Propagate a non-zero exit code so CI marks the run as failed.
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
|