From b8740a1d1ef6f7e08873a37c7ff1b4e3980679f0 Mon Sep 17 00:00:00 2001 From: itholic Date: Sun, 6 Jun 2021 17:30:07 -0700 Subject: [PATCH] [SPARK-35499][PYTHON] Apply black to pandas API on Spark codes ### What changes were proposed in this pull request? This PR proposes applying `black` to pandas API on Spark codes, for improving static analysis. By executing the `./dev/reformat-python` in the spark home directory, all the code of the pandas API on Spark is fixed according to the static analysis rules. ### Why are the changes needed? This can be reduces the cost of static analysis during development. It has been used continuously for about a year in the Koalas project and its convenience has been proven. ### Does this PR introduce _any_ user-facing change? No, it's dev-only. ### How was this patch tested? Manually reformat the pandas API on Spark codes by running the `./dev/reformat-python`, and checked the `./dev/lint-python` is passed. Closes #32779 from itholic/SPARK-35499. Authored-by: itholic Signed-off-by: Liang-Chi Hsieh --- .github/workflows/build_and_test.yml | 2 +- dev/lint-python | 32 +++++++++ dev/reformat-python | 32 +++++++++ dev/requirements.txt | 3 + dev/tox.ini | 4 +- python/pyspark/pandas/accessors.py | 4 +- python/pyspark/pandas/base.py | 4 +- python/pyspark/pandas/config.py | 2 +- python/pyspark/pandas/data_type_ops/base.py | 9 +-- .../pandas/data_type_ops/binary_ops.py | 8 ++- .../pandas/data_type_ops/boolean_ops.py | 44 ++++++++---- .../pandas/data_type_ops/categorical_ops.py | 2 +- .../pandas/data_type_ops/complex_ops.py | 15 ++-- .../pyspark/pandas/data_type_ops/date_ops.py | 2 +- .../pandas/data_type_ops/datetime_ops.py | 2 +- .../pyspark/pandas/data_type_ops/num_ops.py | 10 ++- .../pandas/data_type_ops/string_ops.py | 2 +- python/pyspark/pandas/extensions.py | 4 +- python/pyspark/pandas/frame.py | 23 ++++-- python/pyspark/pandas/groupby.py | 4 +- python/pyspark/pandas/indexes/multi.py | 9 +-- python/pyspark/pandas/indexing.py | 22 +++--- python/pyspark/pandas/internal.py | 69 +++++++++--------- python/pyspark/pandas/namespace.py | 14 +++- python/pyspark/pandas/plot/core.py | 12 +++- python/pyspark/pandas/plot/matplotlib.py | 2 +- python/pyspark/pandas/series.py | 25 ++++--- python/pyspark/pandas/spark/accessors.py | 4 +- .../tests/data_type_ops/test_binary_ops.py | 11 +-- .../tests/data_type_ops/test_boolean_ops.py | 6 +- .../data_type_ops/test_categorical_ops.py | 3 +- .../tests/data_type_ops/test_complex_ops.py | 31 ++++---- .../tests/data_type_ops/test_date_ops.py | 9 ++- .../tests/data_type_ops/test_datetime_ops.py | 3 +- .../tests/data_type_ops/test_num_ops.py | 12 ++-- .../tests/data_type_ops/test_string_ops.py | 6 +- .../tests/data_type_ops/testing_utils.py | 1 + .../pyspark/pandas/tests/indexes/test_base.py | 3 +- .../tests/plot/test_frame_plot_matplotlib.py | 2 +- .../tests/plot/test_frame_plot_plotly.py | 3 +- .../tests/plot/test_series_plot_matplotlib.py | 2 +- .../tests/plot/test_series_plot_plotly.py | 3 +- .../pyspark/pandas/tests/test_categorical.py | 6 +- python/pyspark/pandas/tests/test_dataframe.py | 72 ++++++++++++------- .../pandas/tests/test_dataframe_spark_io.py | 3 +- python/pyspark/pandas/tests/test_expanding.py | 3 +- python/pyspark/pandas/tests/test_groupby.py | 24 ++++--- python/pyspark/pandas/tests/test_namespace.py | 5 +- .../pandas/tests/test_ops_on_diff_frames.py | 37 +++++++--- .../tests/test_ops_on_diff_frames_groupby.py | 3 +- python/pyspark/pandas/tests/test_series.py | 27 ++++--- python/pyspark/pandas/typedef/typehints.py | 2 
+- .../pandas/usage_logging/usage_logger.py | 2 +- python/pyspark/pandas/utils.py | 12 ++-- 54 files changed, 428 insertions(+), 228 deletions(-) create mode 100755 dev/reformat-python diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2d9198bc3b..6a513cf3f7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -366,7 +366,7 @@ jobs: # See also https://github.com/sphinx-doc/sphinx/issues/7551. # Jinja2 3.0.0+ causes error when building with Sphinx. # See also https://issues.apache.org/jira/browse/SPARK-35375. - python3.6 -m pip install flake8 pydata_sphinx_theme mypy numpydoc 'jinja2<3.0.0' + python3.6 -m pip install flake8 pydata_sphinx_theme mypy numpydoc 'jinja2<3.0.0' 'black==21.5b2' - name: Install R linter dependencies and SparkR run: | apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev diff --git a/dev/lint-python b/dev/lint-python index 918718dd1e..02b9706c9d 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -25,6 +25,8 @@ MINIMUM_PYCODESTYLE="2.7.0" PYTHON_EXECUTABLE="python3" +BLACK_BUILD="$PYTHON_EXECUTABLE -m black" + function satisfies_min_version { local provided_version="$1" local expected_version="$2" @@ -185,6 +187,35 @@ flake8 checks failed." fi } +function black_test { + local BLACK_REPORT= + local BLACK_STATUS= + + # Skip check if black is not installed. + $BLACK_BUILD 2> /dev/null + if [ $? -ne 0 ]; then + echo "The $BLACK_BUILD command was not found. Skipping black checks for now." + echo + return + fi + + echo "starting black test..." + # Black is only applied for pandas API on Spark for now. + BLACK_REPORT=$( ($BLACK_BUILD python/pyspark/pandas --line-length 100 --check ) 2>&1) + BLACK_STATUS=$? + + if [ "$BLACK_STATUS" -ne 0 ]; then + echo "black checks failed:" + echo "$BLACK_REPORT" + echo "Please run 'dev/reformat-python' script." + echo "$BLACK_STATUS" + exit "$BLACK_STATUS" + else + echo "black checks passed." + echo + fi +} + SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "${SCRIPT_DIR}")" @@ -194,6 +225,7 @@ pushd "$SPARK_ROOT_DIR" &> /dev/null PYTHON_SOURCE="$(find . -path ./docs/.local_ruby_bundle -prune -false -o -name "*.py")" compile_python_test "$PYTHON_SOURCE" +black_test pycodestyle_test "$PYTHON_SOURCE" flake8_test mypy_test diff --git a/dev/reformat-python b/dev/reformat-python new file mode 100755 index 0000000000..1b5ee654b5 --- /dev/null +++ b/dev/reformat-python @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The current directory of the script. +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +FWDIR="$( cd "$DIR"/.. 
&& pwd )" +cd "$FWDIR" + +BLACK_BUILD="python -m black" +BLACK_VERSION="21.5b2" +$BLACK_BUILD 2> /dev/null +if [ $? -ne 0 ]; then + echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'." + exit 1 +fi + +# This script is only applied for pandas API on Spark for now. +$BLACK_BUILD python/pyspark/pandas --line-length 100 diff --git a/dev/requirements.txt b/dev/requirements.txt index 768223f1d6..f5d662b2f0 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -32,3 +32,6 @@ sphinx-plotly-directive # Development scripts jira PyGithub + +# pandas API on Spark Code formatter. +black diff --git a/dev/tox.ini b/dev/tox.ini index 069f0fd7d2..a0f04ef0cb 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -14,11 +14,13 @@ # limitations under the License. [pycodestyle] -ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504 +ignore=E203,E226,E241,E305,E402,E722,E731,E741,W503,W504 max-line-length=100 exclude=*/target/*,python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* [flake8] select = E901,E999,F821,F822,F823,F401,F405,B006 +# Ignore F821 for plot documents in pandas API on Spark. +ignore = F821 exclude = python/docs/build/html/*,*/target/*,python/pyspark/cloudpickle/*.py,shared.py*,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*,python/out,python/pyspark/sql/pandas/functions.pyi,python/pyspark/sql/column.pyi,python/pyspark/worker.pyi,python/pyspark/java_gateway.pyi max-line-length = 100 diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py index ad011c99f1..69a2ab9f75 100644 --- a/python/pyspark/pandas/accessors.py +++ b/python/pyspark/pandas/accessors.py @@ -48,7 +48,7 @@ if TYPE_CHECKING: class PandasOnSparkFrameMethods(object): - """ pandas-on-Spark specific features for DataFrame. """ + """pandas-on-Spark specific features for DataFrame.""" def __init__(self, frame: "DataFrame"): self._psdf = frame @@ -696,7 +696,7 @@ class PandasOnSparkFrameMethods(object): class PandasOnSparkSeriesMethods(object): - """ pandas-on-Spark specific features for Series. 
""" + """pandas-on-Spark specific features for Series.""" def __init__(self, series: "Series"): self._psser = series diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 55a2c48bf9..49d3cf38a4 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -1068,9 +1068,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): if isinstance(self, MultiIndex): raise NotImplementedError("notna is not defined for MultiIndex") - return (~self.isnull()).rename( - self.name # type: ignore - ) + return (~self.isnull()).rename(self.name) # type: ignore notna = notnull diff --git a/python/pyspark/pandas/config.py b/python/pyspark/pandas/config.py index 9498202772..b03f2e1379 100644 --- a/python/pyspark/pandas/config.py +++ b/python/pyspark/pandas/config.py @@ -381,7 +381,7 @@ def _check_option(key: str) -> None: class DictWrapper: - """ provide attribute-style access to a nested dict""" + """provide attribute-style access to a nested dict""" def __init__(self, d: Dict[str, Option], prefix: str = ""): object.__setattr__(self, "d", d) diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py index 1a43380e4e..1fcea517f7 100644 --- a/python/pyspark/pandas/data_type_ops/base.py +++ b/python/pyspark/pandas/data_type_ops/base.py @@ -45,11 +45,7 @@ if TYPE_CHECKING: from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) -def is_valid_operand_for_numeric_arithmetic( - operand: Any, - *, - allow_bool: bool = True -) -> bool: +def is_valid_operand_for_numeric_arithmetic(operand: Any, *, allow_bool: bool = True) -> bool: """Check whether the operand is valid for arithmetic operations against numerics.""" if isinstance(operand, numbers.Number) and not isinstance(operand, bool): return True @@ -58,7 +54,8 @@ def is_valid_operand_for_numeric_arithmetic( return False else: return isinstance(operand.spark.data_type, NumericType) or ( - allow_bool and isinstance(operand.spark.data_type, BooleanType)) + allow_bool and isinstance(operand.spark.data_type, BooleanType) + ) else: return False diff --git a/python/pyspark/pandas/data_type_ops/binary_ops.py b/python/pyspark/pandas/data_type_ops/binary_ops.py index 872503600c..71c18bfe25 100644 --- a/python/pyspark/pandas/data_type_ops/binary_ops.py +++ b/python/pyspark/pandas/data_type_ops/binary_ops.py @@ -34,7 +34,7 @@ class BinaryOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'binaries' + return "binaries" def add(self, left, right) -> Union["Series", "Index"]: if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BinaryType): @@ -43,11 +43,13 @@ class BinaryOps(DataTypeOps): return column_op(F.concat)(left, F.lit(right)) else: raise TypeError( - "Concatenation can not be applied to %s and the given type." % self.pretty_name) + "Concatenation can not be applied to %s and the given type." % self.pretty_name + ) def radd(self, left, right) -> Union["Series", "Index"]: if isinstance(right, bytes): return left._with_new_scol(F.concat(F.lit(right), left.spark.column)) else: raise TypeError( - "Concatenation can not be applied to %s and the given type." % self.pretty_name) + "Concatenation can not be applied to %s and the given type." 
% self.pretty_name + ) diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py index ec39971927..a93841f0d9 100644 --- a/python/pyspark/pandas/data_type_ops/boolean_ops.py +++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py @@ -38,12 +38,13 @@ class BooleanOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'booleans' + return "booleans" def add(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "Addition can not be applied to %s and the given type." % self.pretty_name) + "Addition can not be applied to %s and the given type." % self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) @@ -56,7 +57,8 @@ class BooleanOps(DataTypeOps): def sub(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "Subtraction can not be applied to %s and the given type." % self.pretty_name) + "Subtraction can not be applied to %s and the given type." % self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) return left - right @@ -68,7 +70,8 @@ class BooleanOps(DataTypeOps): def mul(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "Multiplication can not be applied to %s and the given type." % self.pretty_name) + "Multiplication can not be applied to %s and the given type." % self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) return left * right @@ -80,7 +83,8 @@ class BooleanOps(DataTypeOps): def truediv(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "True division can not be applied to %s and the given type." % self.pretty_name) + "True division can not be applied to %s and the given type." % self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) return left / right @@ -92,7 +96,8 @@ class BooleanOps(DataTypeOps): def floordiv(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "Floor division can not be applied to %s and the given type." % self.pretty_name) + "Floor division can not be applied to %s and the given type." % self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) return left // right @@ -104,7 +109,8 @@ class BooleanOps(DataTypeOps): def mod(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "Modulo can not be applied to %s and the given type." % self.pretty_name) + "Modulo can not be applied to %s and the given type." 
% self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) return left % right @@ -116,7 +122,8 @@ class BooleanOps(DataTypeOps): def pow(self, left, right) -> Union["Series", "Index"]: if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False): raise TypeError( - "Exponentiation can not be applied to %s and the given type." % self.pretty_name) + "Exponentiation can not be applied to %s and the given type." % self.pretty_name + ) if isinstance(right, numbers.Number) and not isinstance(right, bool): left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right)))) return left ** right @@ -131,7 +138,8 @@ class BooleanOps(DataTypeOps): return right + left else: raise TypeError( - "Addition can not be applied to %s and the given type." % self.pretty_name) + "Addition can not be applied to %s and the given type." % self.pretty_name + ) def rsub(self, left, right) -> Union["Series", "Index"]: if isinstance(right, numbers.Number) and not isinstance(right, bool): @@ -139,7 +147,8 @@ class BooleanOps(DataTypeOps): return right - left else: raise TypeError( - "Subtraction can not be applied to %s and the given type." % self.pretty_name) + "Subtraction can not be applied to %s and the given type." % self.pretty_name + ) def rmul(self, left, right) -> Union["Series", "Index"]: if isinstance(right, numbers.Number) and not isinstance(right, bool): @@ -147,7 +156,8 @@ class BooleanOps(DataTypeOps): return right * left else: raise TypeError( - "Multiplication can not be applied to %s and the given type." % self.pretty_name) + "Multiplication can not be applied to %s and the given type." % self.pretty_name + ) def rtruediv(self, left, right) -> Union["Series", "Index"]: if isinstance(right, numbers.Number) and not isinstance(right, bool): @@ -155,7 +165,8 @@ class BooleanOps(DataTypeOps): return right / left else: raise TypeError( - "True division can not be applied to %s and the given type." % self.pretty_name) + "True division can not be applied to %s and the given type." % self.pretty_name + ) def rfloordiv(self, left, right) -> Union["Series", "Index"]: if isinstance(right, numbers.Number) and not isinstance(right, bool): @@ -163,7 +174,8 @@ class BooleanOps(DataTypeOps): return right // left else: raise TypeError( - "Floor division can not be applied to %s and the given type." % self.pretty_name) + "Floor division can not be applied to %s and the given type." % self.pretty_name + ) def rpow(self, left, right) -> Union["Series", "Index"]: if isinstance(right, numbers.Number) and not isinstance(right, bool): @@ -171,7 +183,8 @@ class BooleanOps(DataTypeOps): return right ** left else: raise TypeError( - "Exponentiation can not be applied to %s and the given type." % self.pretty_name) + "Exponentiation can not be applied to %s and the given type." % self.pretty_name + ) def rmod(self, left, right) -> Union["Series", "Index"]: if isinstance(right, numbers.Number) and not isinstance(right, bool): @@ -179,4 +192,5 @@ class BooleanOps(DataTypeOps): return right % left else: raise TypeError( - "Modulo can not be applied to %s and the given type." % self.pretty_name) + "Modulo can not be applied to %s and the given type." 
% self.pretty_name + ) diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py index 9c5786820e..b9c8994c3b 100644 --- a/python/pyspark/pandas/data_type_ops/categorical_ops.py +++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py @@ -25,4 +25,4 @@ class CategoricalOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'categoricals' + return "categoricals" diff --git a/python/pyspark/pandas/data_type_ops/complex_ops.py b/python/pyspark/pandas/data_type_ops/complex_ops.py index 0992fc7871..edf709f476 100644 --- a/python/pyspark/pandas/data_type_ops/complex_ops.py +++ b/python/pyspark/pandas/data_type_ops/complex_ops.py @@ -34,22 +34,25 @@ class ArrayOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'arrays' + return "arrays" def add(self, left, right) -> Union["Series", "Index"]: if not isinstance(right, IndexOpsMixin) or ( isinstance(right, IndexOpsMixin) and not isinstance(right.spark.data_type, ArrayType) ): raise TypeError( - "Concatenation can not be applied to %s and the given type." % self.pretty_name) + "Concatenation can not be applied to %s and the given type." % self.pretty_name + ) left_type = left.spark.data_type.elementType right_type = right.spark.data_type.elementType if left_type != right_type and not ( - isinstance(left_type, NumericType) and isinstance(right_type, NumericType)): + isinstance(left_type, NumericType) and isinstance(right_type, NumericType) + ): raise TypeError( - "Concatenation can only be applied to %s of the same type" % self.pretty_name) + "Concatenation can only be applied to %s of the same type" % self.pretty_name + ) return column_op(F.concat)(left, right) @@ -61,7 +64,7 @@ class MapOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'maps' + return "maps" class StructOps(DataTypeOps): @@ -71,4 +74,4 @@ class StructOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'structs' + return "structs" diff --git a/python/pyspark/pandas/data_type_ops/date_ops.py b/python/pyspark/pandas/data_type_ops/date_ops.py index a5a40d8c06..3d71bbf885 100644 --- a/python/pyspark/pandas/data_type_ops/date_ops.py +++ b/python/pyspark/pandas/data_type_ops/date_ops.py @@ -37,7 +37,7 @@ class DateOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'dates' + return "dates" def sub(self, left, right) -> Union["Series", "Index"]: # Note that date subtraction casts arguments to integer. This is to mimic pandas's diff --git a/python/pyspark/pandas/data_type_ops/datetime_ops.py b/python/pyspark/pandas/data_type_ops/datetime_ops.py index 314bf1d041..f54bc5cee0 100644 --- a/python/pyspark/pandas/data_type_ops/datetime_ops.py +++ b/python/pyspark/pandas/data_type_ops/datetime_ops.py @@ -38,7 +38,7 @@ class DatetimeOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'datetimes' + return "datetimes" def sub(self, left, right) -> Union["Series", "Index"]: # Note that timestamp subtraction casts arguments to integer. 
This is to mimic pandas's diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py index ad3f595fcf..e79eda94cf 100644 --- a/python/pyspark/pandas/data_type_ops/num_ops.py +++ b/python/pyspark/pandas/data_type_ops/num_ops.py @@ -46,7 +46,7 @@ class NumericOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'numerics' + return "numerics" def add(self, left, right) -> Union["Series", "Index"]: if ( @@ -159,7 +159,7 @@ class IntegralOps(NumericOps): @property def pretty_name(self) -> str: - return 'integrals' + return "integrals" def mul(self, left, right) -> Union["Series", "Index"]: if isinstance(right, str): @@ -211,9 +211,7 @@ class IntegralOps(NumericOps): return F.when(F.lit(right is np.nan), np.nan).otherwise( F.when( F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right)) - ).otherwise( - F.lit(np.inf).__div__(left) - ) + ).otherwise(F.lit(np.inf).__div__(left)) ) return numpy_column_op(floordiv)(left, right) @@ -253,7 +251,7 @@ class FractionalOps(NumericOps): @property def pretty_name(self) -> str: - return 'fractions' + return "fractions" def mul(self, left, right) -> Union["Series", "Index"]: if isinstance(right, str): diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py index cd651b3769..9695affa63 100644 --- a/python/pyspark/pandas/data_type_ops/string_ops.py +++ b/python/pyspark/pandas/data_type_ops/string_ops.py @@ -38,7 +38,7 @@ class StringOps(DataTypeOps): @property def pretty_name(self) -> str: - return 'strings' + return "strings" def add(self, left, right) -> Union["Series", "Index"]: if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType): diff --git a/python/pyspark/pandas/extensions.py b/python/pyspark/pandas/extensions.py index f9aa3e6b07..cb4c342c74 100644 --- a/python/pyspark/pandas/extensions.py +++ b/python/pyspark/pandas/extensions.py @@ -124,7 +124,9 @@ def _register_accessor( ) warnings.warn( - msg, UserWarning, stacklevel=2, + msg, + UserWarning, + stacklevel=2, ) setattr(cls, name, CachedAccessor(name, accessor)) return accessor diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index b97ee60718..baf6abbe3d 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -512,7 +512,7 @@ class DataFrame(Frame, Generic[T]): @property def _pssers(self): - """ Return a dict of column label -> Series which anchors `self`. 
""" + """Return a dict of column label -> Series which anchors `self`.""" from pyspark.pandas.series import Series if not hasattr(self, "_psseries"): @@ -2945,10 +2945,10 @@ defaultdict(, {'col..., 'col...})] else: index_spark_columns = ( internal.index_spark_columns[:level] - + internal.index_spark_columns[level + len(key):] + + internal.index_spark_columns[level + len(key) :] ) - index_names = internal.index_names[:level] + internal.index_names[level + len(key):] - index_dtypes = internal.index_dtypes[:level] + internal.index_dtypes[level + len(key):] + index_names = internal.index_names[:level] + internal.index_names[level + len(key) :] + index_dtypes = internal.index_dtypes[:level] + internal.index_dtypes[level + len(key) :] internal = internal.copy( index_spark_columns=index_spark_columns, @@ -5445,7 +5445,13 @@ defaultdict(, {'col..., 'col...})] return psdf def replace( - self, to_replace=None, value=None, inplace=False, limit=None, regex=False, method="pad", + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", ) -> Optional["DataFrame"]: """ Returns a new DataFrame replacing a value with another value. @@ -7110,7 +7116,10 @@ defaultdict(, {'col..., 'col...})] self._internal.index_dtypes, ) ) - index_map[i], index_map[j], = index_map[j], index_map[i] + index_map[i], index_map[j], = ( + index_map[j], + index_map[i], + ) index_spark_columns, index_names, index_dtypes = zip(*index_map) internal = self._internal.copy( index_spark_columns=list(index_spark_columns), @@ -7620,7 +7629,7 @@ defaultdict(, {'col..., 'col...})] column_labels.append(label) for label in right_internal.column_labels: # recover `right_prefix` here. - col = right_internal.spark_column_name_for(label)[len(right_prefix):] + col = right_internal.spark_column_name_for(label)[len(right_prefix) :] scol = right_scol_for(label).alias(col) if label in duplicate_columns: spark_column_name = left_internal.spark_column_name_for(label) diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 2840510a31..d0f159ef55 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -1806,7 +1806,9 @@ class GroupBy(object, metaclass=ABCMeta): ] psdf, groupkey_labels, _ = GroupBy._prepare_group_map_apply( - psdf, self._groupkeys, agg_columns, + psdf, + self._groupkeys, + agg_columns, ) groupkey_scols = [psdf._internal.spark_column_for(label) for label in groupkey_labels] diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index ee40d06f2c..6d21e38a5e 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -447,7 +447,10 @@ class MultiIndex(Index): self._internal.index_dtypes, ) ) - index_map[i], index_map[j], = index_map[j], index_map[i] + index_map[i], index_map[j], = ( + index_map[j], + index_map[i], + ) index_spark_columns, index_names, index_dtypes = zip(*index_map) internal = self._internal.copy( index_spark_columns=list(index_spark_columns), @@ -684,9 +687,7 @@ class MultiIndex(Index): raise NotImplementedError("nunique is not defined for MultiIndex") # TODO: add 'name' parameter after pd.MultiIndex.name is implemented - def copy( # type: ignore[override] - self, deep: Optional[bool] = None - ) -> "MultiIndex": + def copy(self, deep: Optional[bool] = None) -> "MultiIndex": # type: ignore[override] """ Make a copy of this object. 
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index fdd6924c43..7e477ef328 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -333,35 +333,35 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): def _select_rows_by_series( self, rows_sel: "Series" ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]: - """ Select rows by `Series` type key. """ + """Select rows by `Series` type key.""" pass @abstractmethod def _select_rows_by_spark_column( self, rows_sel: spark.Column ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]: - """ Select rows by Spark `Column` type key. """ + """Select rows by Spark `Column` type key.""" pass @abstractmethod def _select_rows_by_slice( self, rows_sel: slice ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]: - """ Select rows by `slice` type key. """ + """Select rows by `slice` type key.""" pass @abstractmethod def _select_rows_by_iterable( self, rows_sel: Iterable ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]: - """ Select rows by `Iterable` type key. """ + """Select rows by `Iterable` type key.""" pass @abstractmethod def _select_rows_else( self, rows_sel: Any ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]: - """ Select rows by other type key. """ + """Select rows by other type key.""" pass # Methods for col selection @@ -372,7 +372,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): ) -> Tuple[ List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple] ]: - """ Select columns by `Series` type key. """ + """Select columns by `Series` type key.""" pass @abstractmethod @@ -381,7 +381,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): ) -> Tuple[ List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple] ]: - """ Select columns by Spark `Column` type key. """ + """Select columns by Spark `Column` type key.""" pass @abstractmethod @@ -390,7 +390,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): ) -> Tuple[ List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple] ]: - """ Select columns by `slice` type key. """ + """Select columns by `slice` type key.""" pass @abstractmethod @@ -399,7 +399,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): ) -> Tuple[ List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple] ]: - """ Select columns by `Iterable` type key. """ + """Select columns by `Iterable` type key.""" pass @abstractmethod @@ -408,7 +408,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): ) -> Tuple[ List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple] ]: - """ Select columns by other type key. """ + """Select columns by other type key.""" pass def __getitem__(self, key) -> Union["Series", "DataFrame"]: @@ -1140,7 +1140,7 @@ class LocIndexer(LocIndexerLike): def _get_from_multiindex_column( self, key, missing_keys, labels=None, recursed=0 ) -> Tuple[List[Tuple], Optional[List[spark.Column]], Any, bool, Optional[Tuple]]: - """ Select columns from multi-index columns. 
""" + """Select columns from multi-index columns.""" assert isinstance(key, tuple) if labels is None: labels = [(label, label) for label in self._internal.column_labels] diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py index 2e2372173d..46c0f5c16e 100644 --- a/python/pyspark/pandas/internal.py +++ b/python/pyspark/pandas/internal.py @@ -803,7 +803,7 @@ class InternalFrame(object): ) def spark_column_for(self, label: Tuple) -> spark.Column: - """ Return Spark Column for the given column label. """ + """Return Spark Column for the given column label.""" column_labels_to_scol = dict(zip(self.column_labels, self.data_spark_columns)) if label in column_labels_to_scol: return column_labels_to_scol[label] @@ -811,7 +811,7 @@ class InternalFrame(object): raise KeyError(name_like_string(label)) def spark_column_name_for(self, label_or_scol: Union[Tuple, spark.Column]) -> str: - """ Return the actual Spark column name for the given column label. """ + """Return the actual Spark column name for the given column label.""" if isinstance(label_or_scol, spark.Column): scol = label_or_scol else: @@ -819,7 +819,7 @@ class InternalFrame(object): return self.spark_frame.select(scol).columns[0] def spark_type_for(self, label_or_scol: Union[Tuple, spark.Column]) -> DataType: - """ Return DataType for the given column label. """ + """Return DataType for the given column label.""" if isinstance(label_or_scol, spark.Column): scol = label_or_scol else: @@ -827,7 +827,7 @@ class InternalFrame(object): return self.spark_frame.select(scol).schema[0].dataType def spark_column_nullable_for(self, label_or_scol: Union[Tuple, spark.Column]) -> bool: - """ Return nullability for the given column label. """ + """Return nullability for the given column label.""" if isinstance(label_or_scol, spark.Column): scol = label_or_scol else: @@ -835,7 +835,7 @@ class InternalFrame(object): return self.spark_frame.select(scol).schema[0].nullable def dtype_for(self, label: Tuple) -> Dtype: - """ Return dtype for the given column label. """ + """Return dtype for the given column label.""" column_labels_to_dtype = dict(zip(self.column_labels, self.data_dtypes)) if label in column_labels_to_dtype: return column_labels_to_dtype[label] @@ -844,80 +844,77 @@ class InternalFrame(object): @property def spark_frame(self) -> spark.DataFrame: - """ Return the managed Spark DataFrame. """ + """Return the managed Spark DataFrame.""" return self._sdf @lazy_property def data_spark_column_names(self) -> List[str]: - """ Return the managed column field names. """ + """Return the managed column field names.""" return self.spark_frame.select(self.data_spark_columns).columns @property def data_spark_columns(self) -> List[spark.Column]: - """ Return Spark Columns for the managed data columns. """ + """Return Spark Columns for the managed data columns.""" return self._data_spark_columns @property def index_spark_column_names(self) -> List[str]: - """ Return the managed index field names. """ + """Return the managed index field names.""" return self.spark_frame.select(self.index_spark_columns).columns @property def index_spark_columns(self) -> List[spark.Column]: - """ Return Spark Columns for the managed index columns. """ + """Return Spark Columns for the managed index columns.""" return self._index_spark_columns @lazy_property def spark_column_names(self) -> List[str]: - """ Return all the field names including index field names. 
""" + """Return all the field names including index field names.""" return self.spark_frame.select(self.spark_columns).columns @lazy_property def spark_columns(self) -> List[spark.Column]: - """ Return Spark Columns for the managed columns including index columns. """ + """Return Spark Columns for the managed columns including index columns.""" index_spark_columns = self.index_spark_columns return index_spark_columns + [ spark_column for spark_column in self.data_spark_columns - if all( - not spark_column_equals(spark_column, scol) - for scol in index_spark_columns - ) + if all(not spark_column_equals(spark_column, scol) for scol in index_spark_columns) ] @property def index_names(self) -> List[Optional[Tuple]]: - """ Return the managed index names. """ + """Return the managed index names.""" return self._index_names @lazy_property def index_level(self) -> int: - """ Return the level of the index. """ + """Return the level of the index.""" return len(self._index_names) @property def column_labels(self) -> List[Tuple]: - """ Return the managed column index. """ + """Return the managed column index.""" return self._column_labels @lazy_property def column_labels_level(self) -> int: - """ Return the level of the column index. """ + """Return the level of the column index.""" return len(self._column_label_names) @property def column_label_names(self) -> List[Optional[Tuple]]: - """ Return names of the index levels. """ + """Return names of the index levels.""" return self._column_label_names @property def index_dtypes(self) -> List[Dtype]: - """ Return dtypes for the managed index columns. """ + """Return dtypes for the managed index columns.""" return self._index_dtypes @property def data_dtypes(self) -> List[Dtype]: - """ Return dtypes for the managed columns. """ + """Return dtypes for the managed columns.""" return self._data_dtypes @lazy_property @@ -929,16 +926,13 @@ class InternalFrame(object): index_spark_columns = self.index_spark_columns data_columns = [] for spark_column in self.data_spark_columns: - if all( - not spark_column_equals(spark_column, scol) - for scol in index_spark_columns - ): + if all(not spark_column_equals(spark_column, scol) for scol in index_spark_columns): data_columns.append(spark_column) return self.spark_frame.select(index_spark_columns + data_columns) @lazy_property def to_pandas_frame(self) -> pd.DataFrame: - """ Return as pandas DataFrame. """ + """Return as pandas DataFrame.""" sdf = self.to_internal_spark_frame pdf = sdf.toPandas() if len(pdf) == 0 and len(sdf.schema) > 0: @@ -950,7 +944,7 @@ class InternalFrame(object): @lazy_property def arguments_for_restore_index(self) -> Dict: - """ Create arguments for `restore_index`. """ + """Create arguments for `restore_index`.""" column_names = [] ext_dtypes = { col: dtype @@ -1055,14 +1049,15 @@ class InternalFrame(object): pdf.columns = pd.MultiIndex.from_tuples(column_labels, names=names) else: pdf.columns = pd.Index( - [None if label is None else label[0] for label in column_labels], name=names[0], + [None if label is None else label[0] for label in column_labels], + name=names[0], ) return pdf @lazy_property def resolved_copy(self) -> "InternalFrame": - """ Copy the immutable InternalFrame with the updates resolved. 
""" + """Copy the immutable InternalFrame with the updates resolved.""" sdf = self.spark_frame.select(self.spark_columns + list(HIDDEN_COLUMNS)) return self.copy( spark_frame=sdf, @@ -1078,7 +1073,7 @@ class InternalFrame(object): data_columns: Optional[List[str]] = None, data_dtypes: Optional[List[Dtype]] = None ) -> "InternalFrame": - """ Copy the immutable InternalFrame with the updates by the specified Spark DataFrame. + """Copy the immutable InternalFrame with the updates by the specified Spark DataFrame. :param spark_frame: the new Spark DataFrame :param index_dtypes: the index dtypes. If None, the original dtyeps are used. @@ -1207,7 +1202,7 @@ class InternalFrame(object): ) def with_filter(self, pred: Union[spark.Column, "Series"]) -> "InternalFrame": - """ Copy the immutable InternalFrame with the updates by the predicate. + """Copy the immutable InternalFrame with the updates by the predicate. :param pred: the predicate to filter. :return: the copied InternalFrame. @@ -1280,7 +1275,7 @@ class InternalFrame(object): data_dtypes: Union[Optional[List[Dtype]], _NoValueType] = _NoValue, column_label_names: Union[Optional[List[Optional[Tuple]]], _NoValueType] = _NoValue ) -> "InternalFrame": - """ Copy the immutable InternalFrame. + """Copy the immutable InternalFrame. :param spark_frame: the new Spark DataFrame. If not specified, the original one is used. :param index_spark_columns: the list of Spark Column. @@ -1324,7 +1319,7 @@ class InternalFrame(object): @staticmethod def from_pandas(pdf: pd.DataFrame) -> "InternalFrame": - """ Create an immutable DataFrame from pandas DataFrame. + """Create an immutable DataFrame from pandas DataFrame. :param pdf: :class:`pd.DataFrame` :return: the created immutable DataFrame @@ -1354,7 +1349,9 @@ class InternalFrame(object): schema = StructType( [ StructField( - name, infer_pd_series_spark_type(col, dtype), nullable=bool(col.isnull().any()), + name, + infer_pd_series_spark_type(col, dtype), + nullable=bool(col.isnull().any()), ) for (name, col), dtype in zip(pdf.iteritems(), index_dtypes + data_dtypes) ] diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index a74456d5c8..bf3f388ad0 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -1903,7 +1903,7 @@ def get_dummies( raise KeyError(name_like_string(columns)) if prefix is None: prefix = [ - str(label[len(columns):]) + str(label[len(columns) :]) if len(label) > len(columns) + 1 else label[len(columns)] if len(label) == len(columns) + 1 @@ -2212,11 +2212,19 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=False) -> Union[ for psdf in psdfs_not_same_anchor: if join == "inner": concat_psdf = align_diff_frames( - resolve_func, concat_psdf, psdf, fillna=False, how="inner", + resolve_func, + concat_psdf, + psdf, + fillna=False, + how="inner", ) elif join == "outer": concat_psdf = align_diff_frames( - resolve_func, concat_psdf, psdf, fillna=False, how="full", + resolve_func, + concat_psdf, + psdf, + fillna=False, + how="full", ) concat_psdf = concat_psdf[column_labels] diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index 1e95e3e489..cb412f1464 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -374,11 +374,19 @@ class KdePlotBase: if ind is None: min_val, max_val = calc_min_max() sample_range = max_val - min_val - ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, 1000,) + ind = np.linspace( + min_val - 0.5 
* sample_range, + max_val + 0.5 * sample_range, + 1000, + ) elif is_integer(ind): min_val, max_val = calc_min_max() sample_range = max_val - min_val - ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, ind,) + ind = np.linspace( + min_val - 0.5 * sample_range, + max_val + 0.5 * sample_range, + ind, + ) return ind @staticmethod diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index e8ac06203b..91387805c4 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -111,7 +111,7 @@ class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase): precision=None, ): def update_dict(dictionary, rc_name, properties): - """ Loads properties in the dictionary from rc file if not already + """Loads properties in the dictionary from rc file if not already in the dictionary""" rc_str = "boxplot.{0}.{1}" if dictionary is None: diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index a4f3aa4fe5..ae22c5b469 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -1592,7 +1592,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]): else: return first_series(psdf) - def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None,) -> "Series": + def reindex( + self, + index: Optional[Any] = None, + fill_value: Optional[Any] = None, + ) -> "Series": """ Conform Series to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. A new object @@ -3485,7 +3489,8 @@ class Series(Frame, IndexOpsMixin, Generic[T]): if method == "first": window = ( Window.orderBy( - asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)), + asc_func(self.spark.column), + asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)), ) .partitionBy(*part_cols) .rowsBetween(Window.unboundedPreceding, Window.currentRow) @@ -3958,7 +3963,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ) internal = self._internal - scols = internal.index_spark_columns[len(item):] + [self.spark.column] + scols = internal.index_spark_columns[len(item) :] + [self.spark.column] rows = [internal.spark_columns[level] == index for level, index in enumerate(item)] sdf = internal.spark_frame.filter(reduce(lambda x, y: x & y, rows)).select(scols) @@ -3985,10 +3990,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]): internal = internal.copy( spark_frame=sdf, index_spark_columns=[ - scol_for(sdf, col) for col in internal.index_spark_column_names[len(item):] + scol_for(sdf, col) for col in internal.index_spark_column_names[len(item) :] ], - index_dtypes=internal.index_dtypes[len(item):], - index_names=self._internal.index_names[len(item):], + index_dtypes=internal.index_dtypes[len(item) :], + index_names=self._internal.index_names[len(item) :], data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])], ) return first_series(DataFrame(internal)) @@ -4660,7 +4665,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): internal = self._internal scols = ( internal.index_spark_columns[:level] - + internal.index_spark_columns[level + len(key):] + + internal.index_spark_columns[level + len(key) :] + [self.spark.column] ) rows = [internal.spark_columns[lvl] == index for lvl, index in enumerate(key, level)] @@ -4675,10 +4680,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]): index_spark_column_names = ( internal.index_spark_column_names[:level] - + internal.index_spark_column_names[level + len(key):] + + 
internal.index_spark_column_names[level + len(key) :] ) - index_names = internal.index_names[:level] + internal.index_names[level + len(key):] - index_dtypes = internal.index_dtypes[:level] + internal.index_dtypes[level + len(key):] + index_names = internal.index_names[:level] + internal.index_names[level + len(key) :] + index_dtypes = internal.index_dtypes[:level] + internal.index_dtypes[level + len(key) :] internal = internal.copy( spark_frame=sdf, diff --git a/python/pyspark/pandas/spark/accessors.py b/python/pyspark/pandas/spark/accessors.py index d94061acf2..3b23300fc3 100644 --- a/python/pyspark/pandas/spark/accessors.py +++ b/python/pyspark/pandas/spark/accessors.py @@ -44,12 +44,12 @@ class SparkIndexOpsMethods(metaclass=ABCMeta): @property def data_type(self) -> DataType: - """ Returns the data type as defined by Spark, as a Spark DataType object.""" + """Returns the data type as defined by Spark, as a Spark DataType object.""" return self._data._internal.spark_type_for(self._data._column_label) @property def nullable(self) -> bool: - """ Returns the nullability as defined by Spark. """ + """Returns the nullability as defined by Spark.""" return self._data._internal.spark_column_nullable_for(self._data._column_label) @property diff --git a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py index 9137d5d1d1..02efbd2a12 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py @@ -26,7 +26,7 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): @property def pser(self): - return pd.Series([b'1', b'2', b'3']) + return pd.Series([b"1", b"2", b"3"]) @property def psser(self): @@ -35,9 +35,9 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): def test_add(self): psser = self.psser pser = self.pser - self.assert_eq(psser + b'1', pser + b'1') + self.assert_eq(psser + b"1", pser + b"1") self.assert_eq(psser + psser, pser + pser) - self.assert_eq(psser + psser.astype('bytes'), pser + pser.astype('bytes')) + self.assert_eq(psser + psser.astype("bytes"), pser + pser.astype("bytes")) self.assertRaises(TypeError, lambda: psser + "x") self.assertRaises(TypeError, lambda: psser + 1) @@ -95,7 +95,7 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: self.psser ** psser) def test_radd(self): - self.assert_eq(b'1' + self.psser, b'1' + self.pser) + self.assert_eq(b"1" + self.psser, b"1" + self.pser) self.assertRaises(TypeError, lambda: "x" + self.psser) self.assertRaises(TypeError, lambda: 1 + self.psser) @@ -129,7 +129,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py index dff616fd45..6b4201a275 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py @@ -103,7 +103,8 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils): with option_context("compute.ops_on_diff_frames", True): self.assert_eq( - self.pser / 
self.float_pser, (self.psser / self.float_psser).sort_index()) + self.pser / self.float_pser, (self.psser / self.float_psser).sort_index() + ) for psser in self.non_numeric_pssers.values(): self.assertRaises(TypeError, lambda: self.psser / psser) @@ -235,7 +236,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py index 2b667dd2bb..d7d559eb20 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py @@ -122,7 +122,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py index 51c2195cd3..5cb8b92fb1 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py @@ -32,16 +32,17 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): return [ pd.Series([[1, 2, 3]]), pd.Series([[0.1, 0.2, 0.3]]), - pd.Series([[decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)]]) + pd.Series([[decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)]]), ] @property def non_numeric_array_psers(self): return { - "string": pd.Series([['x', 'y', 'z']]), - "date": pd.Series([ - [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]]), - "bool": pd.Series([[True, True, False]]) + "string": pd.Series([["x", "y", "z"]]), + "date": pd.Series( + [[datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]] + ), + "bool": pd.Series([[True, True, False]]), } @property @@ -80,16 +81,19 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): # Non-numeric array + Non-numeric array self.assertRaises( - TypeError, lambda: - self.non_numeric_array_pssers['string'] + self.non_numeric_array_pssers['bool'] + TypeError, + lambda: self.non_numeric_array_pssers["string"] + + self.non_numeric_array_pssers["bool"], ) self.assertRaises( - TypeError, lambda: - self.non_numeric_array_pssers['string'] + self.non_numeric_array_pssers['date'] + TypeError, + lambda: self.non_numeric_array_pssers["string"] + + self.non_numeric_array_pssers["date"], ) self.assertRaises( - TypeError, lambda: - self.non_numeric_array_pssers['bool'] + self.non_numeric_array_pssers['date'] + TypeError, + lambda: self.non_numeric_array_pssers["bool"] + + self.non_numeric_array_pssers["date"], ) for data_type in self.non_numeric_array_psers.keys(): @@ -97,7 +101,7 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.non_numeric_array_psers.get(data_type) + self.non_numeric_array_psers.get(data_type), self.non_numeric_array_pssers.get(data_type) - + self.non_numeric_array_pssers.get(data_type) + + self.non_numeric_array_pssers.get(data_type), ) # Numeric array + Non-numeric array @@ 
-193,7 +197,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py index cc61132450..e26d8b6272 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py @@ -55,7 +55,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - 1) self.assert_eq( - (self.pser - self.some_date).dt.days, self.psser - self.some_date, + (self.pser - self.some_date).dt.days, + self.psser - self.some_date, ) with option_context("compute.ops_on_diff_frames", True): for pser, psser in self.pser_psser_pairs: @@ -118,7 +119,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: "x" - self.psser) self.assertRaises(TypeError, lambda: 1 - self.psser) self.assert_eq( - (self.some_date - self.pser).dt.days, self.some_date - self.psser, + (self.some_date - self.pser).dt.days, + self.some_date - self.psser, ) def test_rmul(self): @@ -152,7 +154,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py index 08b8cbe4a3..7617500373 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py @@ -154,7 +154,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py index e37384c56e..b5fdb60f34 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py @@ -34,6 +34,7 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): returns float32. The underlying reason is the respective Spark operations return DoubleType always. 
""" + @property def float_pser(self): return pd.Series([1, 2, 3], dtype=float) @@ -137,7 +138,8 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: psser // self.non_numeric_pssers["datetime"]) self.assertRaises(TypeError, lambda: psser // self.non_numeric_pssers["date"]) self.assertRaises( - TypeError, lambda: psser // self.non_numeric_pssers["categorical"]) + TypeError, lambda: psser // self.non_numeric_pssers["categorical"] + ) if LooseVersion(pd.__version__) >= LooseVersion("0.25.3"): self.assert_eq( (self.float_psser // self.non_numeric_pssers["bool"]).sort_index(), @@ -146,7 +148,7 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): else: self.assert_eq( (self.float_pser // self.non_numeric_psers["bool"]).sort_index(), - ps.Series([1.0, 2.0, np.inf]) + ps.Series([1.0, 2.0, np.inf]), ) self.assertRaises(TypeError, lambda: self.float_psser // True) @@ -181,7 +183,8 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: psser ** self.non_numeric_pssers["datetime"]) self.assertRaises(TypeError, lambda: psser ** self.non_numeric_pssers["date"]) self.assertRaises( - TypeError, lambda: psser ** self.non_numeric_pssers["categorical"]) + TypeError, lambda: psser ** self.non_numeric_pssers["categorical"] + ) self.assert_eq( (self.float_psser ** self.non_numeric_pssers["bool"]).sort_index(), self.float_pser ** self.non_numeric_psers["bool"], @@ -258,7 +261,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py index 19a6d35e72..b33edfcce4 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py @@ -45,7 +45,8 @@ class StringOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: self.psser + self.non_numeric_pssers["datetime"]) self.assertRaises(TypeError, lambda: self.psser + self.non_numeric_pssers["date"]) self.assertRaises( - TypeError, lambda: self.psser + self.non_numeric_pssers["categorical"]) + TypeError, lambda: self.psser + self.non_numeric_pssers["categorical"] + ) self.assertRaises(TypeError, lambda: self.psser + self.non_numeric_pssers["bool"]) for psser in self.numeric_pssers: self.assertRaises(TypeError, lambda: self.psser + psser) @@ -135,7 +136,8 @@ if __name__ == "__main__": try: import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py index 54d24d425e..13fda52c24 100644 --- a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +++ b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py @@ -26,6 +26,7 @@ import pyspark.pandas as ps class TestCasesUtils(object): """A utility holding common test cases for arithmetic operations of different data types.""" + @property def numeric_psers(self): 
         dtypes = [np.float32, float, int, np.int32]
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index b6d4182825..1ec4228f25 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -174,7 +174,8 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
             # The `name` argument is added in pandas 0.24.
             self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x"))
             self.assert_eq(
-                psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x"),
+                psidx.to_frame(index=False, name="x"),
+                pidx.to_frame(index=False, name="x"),
             )
 
         self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"]))
diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
index 34252986a6..e7b5296994 100644
--- a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
+++ b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
@@ -425,7 +425,7 @@ class DataFramePlotMatplotlibTest(PandasOnSparkTestCase, TestUtils):
         def moving_average(a, n=10):
             ret = np.cumsum(a, dtype=float)
             ret[n:] = ret[n:] - ret[:-n]
-            return ret[n - 1:] / n
+            return ret[n - 1 :] / n
 
         def check_kde_plot(pdf, psdf, *args, **kwargs):
             _, ax1 = plt.subplots(1, 1)
diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py b/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py
index 3a50674292..eb47448aed 100644
--- a/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py
+++ b/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py
@@ -169,7 +169,8 @@ class DataFramePlotPlotlyTest(PandasOnSparkTestCase, TestUtils):
         )
 
         self.assertEqual(
-            psdf.plot(kind="pie", values="a"), express.pie(pdf, values="a"),
+            psdf.plot(kind="pie", values="a"),
+            express.pie(pdf, values="a"),
         )
 
         psdf1 = self.psdf1
diff --git a/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py b/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py
index d1097fd23c..7b18a35749 100644
--- a/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py
+++ b/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py
@@ -347,7 +347,7 @@ class SeriesPlotMatplotlibTest(PandasOnSparkTestCase, TestUtils):
         def moving_average(a, n=10):
             ret = np.cumsum(a, dtype=float)
             ret[n:] = ret[n:] - ret[:-n]
-            return ret[n - 1:] / n
+            return ret[n - 1 :] / n
 
         def check_kde_plot(pdf, psdf, *args, **kwargs):
             _, ax1 = plt.subplots(1, 1)
diff --git a/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py b/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py
index 06ab277904..d1004662ac 100644
--- a/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py
+++ b/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py
@@ -118,7 +118,8 @@ class SeriesPlotPlotlyTest(PandasOnSparkTestCase, TestUtils):
         psdf = self.psdf1
         pdf = psdf.to_pandas()
         self.assertEqual(
-            psdf["a"].plot(kind="pie"), express.pie(pdf, values=pdf.columns[0], names=pdf.index),
+            psdf["a"].plot(kind="pie"),
+            express.pie(pdf, values=pdf.columns[0], names=pdf.index),
         )
 
         # TODO: support multi-index columns
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index a6c6c230a0..2909bfd675 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -387,14 +387,16 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
             return pdf.astype(str)
         self.assert_eq(
-            psdf.pandas_on_spark.transform_batch(to_str).sort_index(), to_str(pdf).sort_index(),
+            psdf.pandas_on_spark.transform_batch(to_str).sort_index(),
+            to_str(pdf).sort_index(),
         )
 
         def to_codes(pdf) -> ps.Series[np.int8]:
             return pdf.b.cat.codes
 
         self.assert_eq(
-            psdf.pandas_on_spark.transform_batch(to_codes).sort_index(), to_codes(pdf).sort_index(),
+            psdf.pandas_on_spark.transform_batch(to_codes).sort_index(),
+            to_codes(pdf).sort_index(),
         )
 
         pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index d34f34f34d..e54b7835c2 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -1518,7 +1518,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
 
         # Assert approximate counts
         self.assert_eq(
-            ps.DataFrame({"A": range(100)}).nunique(approx=True), pd.Series([103], index=["A"]),
+            ps.DataFrame({"A": range(100)}).nunique(approx=True),
+            pd.Series([103], index=["A"]),
         )
         self.assert_eq(
             ps.DataFrame({"A": range(100)}).nunique(approx=True, rsd=0.01),
@@ -3354,7 +3355,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
 
         columns2 = pd.Index(["numbers", "2", "3"], name="cols2")
         self.assert_eq(
-            pdf.reindex(columns=columns2).sort_index(), psdf.reindex(columns=columns2).sort_index(),
+            pdf.reindex(columns=columns2).sort_index(),
+            psdf.reindex(columns=columns2).sort_index(),
         )
 
         columns = pd.Index(["numbers"], name="cols")
@@ -3398,12 +3400,14 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
 
         columns2 = pd.Index(["numbers", "2", "3"])
         self.assert_eq(
-            pdf.reindex(columns=columns2).sort_index(), psdf.reindex(columns=columns2).sort_index(),
+            pdf.reindex(columns=columns2).sort_index(),
+            psdf.reindex(columns=columns2).sort_index(),
         )
 
         columns2 = pd.Index(["numbers", "2", "3"], name="cols2")
         self.assert_eq(
-            pdf.reindex(columns=columns2).sort_index(), psdf.reindex(columns=columns2).sort_index(),
+            pdf.reindex(columns=columns2).sort_index(),
+            psdf.reindex(columns=columns2).sort_index(),
         )
 
         # Reindexing single Index on single Index
@@ -3506,7 +3510,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             [("X", "numbers"), ("Y", "2"), ("Y", "3")], names=["cols3", "cols4"]
         )
         self.assert_eq(
-            pdf.reindex(columns=columns2).sort_index(), psdf.reindex(columns=columns2).sort_index(),
+            pdf.reindex(columns=columns2).sort_index(),
+            psdf.reindex(columns=columns2).sort_index(),
         )
 
         self.assertRaises(TypeError, lambda: psdf.reindex(columns=["X"]))
@@ -3527,7 +3532,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
         psdf2 = ps.from_pandas(pdf2)
 
         self.assert_eq(
-            pdf.reindex_like(pdf2).sort_index(), psdf.reindex_like(psdf2).sort_index(),
+            pdf.reindex_like(pdf2).sort_index(),
+            psdf.reindex_like(psdf2).sort_index(),
         )
 
         pdf2 = pd.DataFrame({"index_level_1": ["A", "C", "I"]})
@@ -3546,7 +3552,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
         psdf2 = ps.from_pandas(pdf2)
 
         self.assert_eq(
-            pdf.reindex_like(pdf2).sort_index(), psdf.reindex_like(psdf2).sort_index(),
+            pdf.reindex_like(pdf2).sort_index(),
+            psdf.reindex_like(psdf2).sort_index(),
         )
 
         self.assertRaises(TypeError, lambda: psdf.reindex_like(index2))
@@ -3569,7 +3576,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
         psdf = ps.from_pandas(pdf)
 
         self.assert_eq(
-            pdf.reindex_like(pdf2).sort_index(), psdf.reindex_like(psdf2).sort_index(),
+            pdf.reindex_like(pdf2).sort_index(),
+            psdf.reindex_like(psdf2).sort_index(),
         )
 
     def test_melt(self):
@@ -3953,16 +3961,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
 
         self.assert_eq(pdf.duplicated().sort_index(), psdf.duplicated().sort_index())
         self.assert_eq(
-            pdf.duplicated(keep="last").sort_index(), psdf.duplicated(keep="last").sort_index(),
+            pdf.duplicated(keep="last").sort_index(),
+            psdf.duplicated(keep="last").sort_index(),
         )
         self.assert_eq(
-            pdf.duplicated(keep=False).sort_index(), psdf.duplicated(keep=False).sort_index(),
+            pdf.duplicated(keep=False).sort_index(),
+            psdf.duplicated(keep=False).sort_index(),
         )
         self.assert_eq(
-            pdf.duplicated(subset="b").sort_index(), psdf.duplicated(subset="b").sort_index(),
+            pdf.duplicated(subset="b").sort_index(),
+            psdf.duplicated(subset="b").sort_index(),
         )
         self.assert_eq(
-            pdf.duplicated(subset=["b"]).sort_index(), psdf.duplicated(subset=["b"]).sort_index(),
+            pdf.duplicated(subset=["b"]).sort_index(),
+            psdf.duplicated(subset=["b"]).sort_index(),
         )
         with self.assertRaisesRegex(ValueError, "'keep' only supports 'first', 'last' and False"):
             psdf.duplicated(keep="false")
@@ -4009,7 +4021,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
 
         self.assert_eq(pdf.duplicated().sort_index(), psdf.duplicated().sort_index())
         self.assert_eq(
-            pdf.duplicated(subset=10).sort_index(), psdf.duplicated(subset=10).sort_index(),
+            pdf.duplicated(subset=10).sort_index(),
+            psdf.duplicated(subset=10).sort_index(),
         )
 
     def test_ffill(self):
@@ -4651,17 +4664,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             psdf.take([-1, -2], axis=1).sort_index(), pdf.take([-1, -2], axis=1).sort_index()
         )
         self.assert_eq(
-            psdf.take(range(1, 3), axis=1).sort_index(), pdf.take(range(1, 3), axis=1).sort_index(),
+            psdf.take(range(1, 3), axis=1).sort_index(),
+            pdf.take(range(1, 3), axis=1).sort_index(),
         )
         self.assert_eq(
             psdf.take(range(-1, -3), axis=1).sort_index(),
             pdf.take(range(-1, -3), axis=1).sort_index(),
         )
         self.assert_eq(
-            psdf.take([2, 1], axis=1).sort_index(), pdf.take([2, 1], axis=1).sort_index(),
+            psdf.take([2, 1], axis=1).sort_index(),
+            pdf.take([2, 1], axis=1).sort_index(),
         )
         self.assert_eq(
-            psdf.take([-1, -2], axis=1).sort_index(), pdf.take([-1, -2], axis=1).sort_index(),
+            psdf.take([-1, -2], axis=1).sort_index(),
+            pdf.take([-1, -2], axis=1).sort_index(),
         )
 
         # MultiIndex columns
@@ -4695,17 +4711,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             psdf.take([-1, -2], axis=1).sort_index(), pdf.take([-1, -2], axis=1).sort_index()
         )
         self.assert_eq(
-            psdf.take(range(1, 3), axis=1).sort_index(), pdf.take(range(1, 3), axis=1).sort_index(),
+            psdf.take(range(1, 3), axis=1).sort_index(),
+            pdf.take(range(1, 3), axis=1).sort_index(),
        )
         self.assert_eq(
             psdf.take(range(-1, -3), axis=1).sort_index(),
             pdf.take(range(-1, -3), axis=1).sort_index(),
         )
         self.assert_eq(
-            psdf.take([2, 1], axis=1).sort_index(), pdf.take([2, 1], axis=1).sort_index(),
+            psdf.take([2, 1], axis=1).sort_index(),
+            pdf.take([2, 1], axis=1).sort_index(),
         )
         self.assert_eq(
-            psdf.take([-1, -2], axis=1).sort_index(), pdf.take([-1, -2], axis=1).sort_index(),
+            psdf.take([-1, -2], axis=1).sort_index(),
+            pdf.take([-1, -2], axis=1).sort_index(),
         )
 
         # Checking the type of indices.
@@ -5524,35 +5543,40 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
         psdf = ps.from_pandas(pdf)
         psdf.at_time("0:20")
         self.assert_eq(
-            pdf.at_time("0:20").sort_index(), psdf.at_time("0:20").sort_index(),
+            pdf.at_time("0:20").sort_index(),
+            psdf.at_time("0:20").sort_index(),
         )
 
         # Index name is 'ts'
         pdf.index.name = "ts"
         psdf = ps.from_pandas(pdf)
         self.assert_eq(
-            pdf.at_time("0:20").sort_index(), psdf.at_time("0:20").sort_index(),
+            pdf.at_time("0:20").sort_index(),
+            psdf.at_time("0:20").sort_index(),
         )
 
         # Index name is 'ts', column label is 'index'
         pdf.columns = pd.Index(["index"])
         psdf = ps.from_pandas(pdf)
         self.assert_eq(
-            pdf.at_time("0:40").sort_index(), psdf.at_time("0:40").sort_index(),
+            pdf.at_time("0:40").sort_index(),
+            psdf.at_time("0:40").sort_index(),
         )
 
         # Both index name and column label are 'index'
         pdf.index.name = "index"
         psdf = ps.from_pandas(pdf)
         self.assert_eq(
-            pdf.at_time("0:40").sort_index(), psdf.at_time("0:40").sort_index(),
+            pdf.at_time("0:40").sort_index(),
+            psdf.at_time("0:40").sort_index(),
         )
 
         # Index name is 'index', column label is ('X', 'A')
         pdf.columns = pd.MultiIndex.from_arrays([["X"], ["A"]])
         psdf = ps.from_pandas(pdf)
         self.assert_eq(
-            pdf.at_time("0:40").sort_index(), psdf.at_time("0:40").sort_index(),
+            pdf.at_time("0:40").sort_index(),
+            psdf.at_time("0:40").sort_index(),
         )
 
         with self.assertRaisesRegex(NotImplementedError, "'asof' argument is not supported"):
diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py
index 879df3245d..f31cd5c44f 100644
--- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py
+++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py
@@ -310,7 +310,8 @@ class DataFrameSparkIOTest(PandasOnSparkTestCase, TestUtils):
             self.assert_eq(psdfs["Sheet_name_2"], pdfs1_squeezed["Sheet_name_2"])
 
             self.assert_eq(
-                ps.read_excel(tmp, index_col=0, sheet_name="Sheet_name_2"), pdfs1["Sheet_name_2"],
+                ps.read_excel(tmp, index_col=0, sheet_name="Sheet_name_2"),
+                pdfs1["Sheet_name_2"],
             )
 
             for sheet_name in sheet_names:
diff --git a/python/pyspark/pandas/tests/test_expanding.py b/python/pyspark/pandas/tests/test_expanding.py
index 807e1fe7c7..57b4e48f39 100644
--- a/python/pyspark/pandas/tests/test_expanding.py
+++ b/python/pyspark/pandas/tests/test_expanding.py
@@ -96,7 +96,8 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils):
         psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=idx)
         psdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
         expected_result = pd.DataFrame(
-            {("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]}, index=idx,
+            {("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]},
+            index=idx,
         )
         self.assert_eq(psdf.expanding(2).count().sort_index(), expected_result.sort_index())
 
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 414c2fcf35..f0d0aec730 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -934,7 +934,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             expected = ps.DataFrame({"b": [2, 2]}, index=pd.Index([0, 1], name="a"))
             self.assert_eq(psdf.groupby("a").nunique().sort_index(), expected)
             self.assert_eq(
-                psdf.groupby("a").nunique(dropna=False).sort_index(), expected,
+                psdf.groupby("a").nunique(dropna=False).sort_index(),
+                expected,
             )
         else:
             self.assert_eq(
@@ -968,10 +969,12 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
         if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
             expected = ps.DataFrame({("y", "b"): [2, 2]}, index=pd.Index([0, 1], name=("x", "a")))
             self.assert_eq(
-                psdf.groupby(("x", "a")).nunique().sort_index(), expected,
+                psdf.groupby(("x", "a")).nunique().sort_index(),
+                expected,
             )
             self.assert_eq(
-                psdf.groupby(("x", "a")).nunique(dropna=False).sort_index(), expected,
+                psdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
+                expected,
             )
         else:
             self.assert_eq(
@@ -1785,7 +1788,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             pdf.groupby("A")[["B"]].bfill().sort_index(),
         )
         self.assert_eq(
-            psdf.groupby("A")["B"].bfill().sort_index(), pdf.groupby("A")["B"].bfill().sort_index(),
+            psdf.groupby("A")["B"].bfill().sort_index(),
+            pdf.groupby("A")["B"].bfill().sort_index(),
         )
         self.assert_eq(
             psdf.groupby("A")["B"].bfill()[idx[6]], pdf.groupby("A")["B"].bfill()[idx[6]]
@@ -1893,7 +1897,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             pdf.groupby("b").apply(lambda x: x + x.min()).sort_index(),
         )
         self.assert_eq(
-            psdf.groupby("b").apply(len).sort_index(), pdf.groupby("b").apply(len).sort_index(),
+            psdf.groupby("b").apply(len).sort_index(),
+            pdf.groupby("b").apply(len).sort_index(),
         )
         self.assert_eq(
             psdf.groupby("b")["a"]
@@ -2556,7 +2561,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
         psdf = ps.from_pandas(pdf)
 
         self.assert_eq(
-            psdf.groupby("class").get_group("bird"), pdf.groupby("class").get_group("bird"),
+            psdf.groupby("class").get_group("bird"),
+            pdf.groupby("class").get_group("bird"),
         )
         self.assert_eq(
             psdf.groupby("class")["name"].get_group("mammal"),
@@ -2583,7 +2589,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             (pdf.max_speed + 1).groupby(pdf["class"]).get_group("mammal"),
         )
         self.assert_eq(
-            psdf.groupby("max_speed").get_group(80.5), pdf.groupby("max_speed").get_group(80.5),
+            psdf.groupby("max_speed").get_group(80.5),
+            pdf.groupby("max_speed").get_group(80.5),
         )
 
         self.assertRaises(KeyError, lambda: psdf.groupby("class").get_group("fish"))
@@ -2646,7 +2653,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion", "mammal")),
         )
         self.assertRaises(
-            ValueError, lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion",)),
+            ValueError,
+            lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion",)),
         )
         self.assertRaises(
             ValueError, lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal",))
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index b2e92acd0a..c9c515bea0 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -266,7 +266,10 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
 
         objs = [
             ([psdf1.A, psdf1.A.rename("B")], [pdf1.A, pdf1.A.rename("B")]),
-            ([psdf3[("X", "A")], psdf3[("X", "B")]], [pdf3[("X", "A")], pdf3[("X", "B")]],),
+            (
+                [psdf3[("X", "A")], psdf3[("X", "B")]],
+                [pdf3[("X", "A")], pdf3[("X", "B")]],
+            ),
             (
                 [psdf3[("X", "A")], psdf3[("X", "B")].rename("ABC")],
                 [pdf3[("X", "A")], pdf3[("X", "B")].rename("ABC")],
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 11088a9833..12e87b202e 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -728,7 +728,8 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
         psser1 = ps.from_pandas(pser1)
         psser2 = ps.from_pandas(pser2)
         self.assert_eq(
-            pser1.compare(pser2).sort_index(), psser1.compare(psser2).sort_index(),
+            pser1.compare(pser2).sort_index(),
+            psser1.compare(psser2).sort_index(),
         )
 
         # `keep_shape=True`
@@ -757,7 +758,8 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
         psser1 = ps.from_pandas(pser1)
         psser2 = ps.from_pandas(pser2)
         self.assert_eq(
-            pser1.compare(pser2).sort_index(), psser1.compare(psser2).sort_index(),
+            pser1.compare(pser2).sort_index(),
+            psser1.compare(psser2).sort_index(),
         )
 
         # `keep_shape=True` with MultiIndex
@@ -790,14 +792,16 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
             columns=["self", "other"],
         )
         self.assert_eq(
-            expected, psser1.compare(psser2, keep_shape=True).sort_index(),
+            expected,
+            psser1.compare(psser2, keep_shape=True).sort_index(),
         )
         # `keep_equal=True`
         expected = ps.DataFrame(
             [["b", "a"], ["g", None], [None, "h"]], index=[0, 3, 4], columns=["self", "other"]
         )
         self.assert_eq(
-            expected, psser1.compare(psser2, keep_equal=True).sort_index(),
+            expected,
+            psser1.compare(psser2, keep_equal=True).sort_index(),
         )
         # `keep_shape=True` and `keep_equal=True`
         expected = ps.DataFrame(
@@ -806,7 +810,8 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
             columns=["self", "other"],
         )
         self.assert_eq(
-            expected, psser1.compare(psser2, keep_shape=True, keep_equal=True).sort_index(),
+            expected,
+            psser1.compare(psser2, keep_shape=True, keep_equal=True).sort_index(),
         )
 
         # MultiIndex
@@ -838,7 +843,8 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
             columns=["self", "other"],
         )
         self.assert_eq(
-            expected, psser1.compare(psser2, keep_shape=True).sort_index(),
+            expected,
+            psser1.compare(psser2, keep_shape=True).sort_index(),
         )
         # `keep_equal=True`
         expected = ps.DataFrame(
@@ -847,7 +853,8 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
             columns=["self", "other"],
         )
         self.assert_eq(
-            expected, psser1.compare(psser2, keep_equal=True).sort_index(),
+            expected,
+            psser1.compare(psser2, keep_equal=True).sort_index(),
         )
         # `keep_shape=True` and `keep_equal=True`
         expected = ps.DataFrame(
@@ -858,15 +865,22 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
             columns=["self", "other"],
         )
         self.assert_eq(
-            expected, psser1.compare(psser2, keep_shape=True, keep_equal=True).sort_index(),
+            expected,
+            psser1.compare(psser2, keep_shape=True, keep_equal=True).sort_index(),
         )
 
         # Different Index
         with self.assertRaisesRegex(
             ValueError, "Can only compare identically-labeled Series objects"
         ):
-            psser1 = ps.Series([1, 2, 3, 4, 5], index=pd.Index([1, 2, 3, 4, 5]),)
-            psser2 = ps.Series([2, 2, 3, 4, 1], index=pd.Index([5, 4, 3, 2, 1]),)
+            psser1 = ps.Series(
+                [1, 2, 3, 4, 5],
+                index=pd.Index([1, 2, 3, 4, 5]),
+            )
+            psser2 = ps.Series(
+                [2, 2, 3, 4, 1],
+                index=pd.Index([5, 4, 3, 2, 1]),
+            )
             psser1.compare(psser2)
         # Different MultiIndex
         with self.assertRaisesRegex(
@@ -1153,7 +1167,8 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
         self.assert_eq(psdf, pdf)
 
         with self.assertRaisesRegex(
-            ValueError, "shape mismatch",
+            ValueError,
+            "shape mismatch",
         ):
             psdf.iloc[[1, 2], [1]] = -another_psdf.max_speed
 
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
index 72840cff92..70c3089c70 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
@@ -135,7 +135,8 @@ class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
         )
 
         self.assert_eq(
-            psdf1.B.groupby(psdf2.A).sum().sort_index(), pdf1.B.groupby(pdf2.A).sum().sort_index(),
+            psdf1.B.groupby(psdf2.A).sum().sort_index(),
+            pdf1.B.groupby(pdf2.A).sum().sort_index(),
         )
         self.assert_eq(
             (psdf1.B + 1).groupby(psdf2.A).sum().sort_index(),
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index d73114e070..5fea71d662 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -264,7 +264,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         psser = ps.from_pandas(pser)
 
         self.assert_eq(
-            pser.rename_axis("index2").sort_index(), psser.rename_axis("index2").sort_index(),
+            pser.rename_axis("index2").sort_index(),
+            psser.rename_axis("index2").sort_index(),
         )
 
         self.assert_eq(
@@ -463,7 +464,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         self.assert_eq(pser, psser)
 
         self.assert_eq(
-            pser.reindex(["A", "B"]).sort_index(), psser.reindex(["A", "B"]).sort_index(),
+            pser.reindex(["A", "B"]).sort_index(),
+            psser.reindex(["A", "B"]).sort_index(),
        )
 
         self.assert_eq(
@@ -491,7 +493,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         psser2 = ps.from_pandas(pser2)
 
         self.assert_eq(
-            pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(),
+            pser.reindex_like(pser2).sort_index(),
+            psser.reindex_like(psser2).sort_index(),
         )
 
         self.assert_eq(
@@ -507,7 +510,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         psser2 = ps.from_pandas(pser2)
 
         self.assert_eq(
-            pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(),
+            pser.reindex_like(pser2).sort_index(),
+            psser.reindex_like(psser2).sort_index(),
         )
 
         self.assertRaises(TypeError, lambda: psser.reindex_like(index2))
@@ -521,7 +525,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         psser = ps.from_pandas(pser)
 
         self.assert_eq(
-            pser.reindex_like(pser2).sort_index(), psser.reindex_like(psser2).sort_index(),
+            pser.reindex_like(pser2).sort_index(),
+            psser.reindex_like(psser2).sort_index(),
         )
 
         # Reindexing with DataFrame
@@ -532,7 +537,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         psdf = ps.from_pandas(pdf)
 
         self.assert_eq(
-            pser.reindex_like(pdf).sort_index(), psser.reindex_like(psdf).sort_index(),
+            pser.reindex_like(pdf).sort_index(),
+            psser.reindex_like(psdf).sort_index(),
         )
 
     def test_fillna(self):
@@ -2897,19 +2903,22 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         pser = pd.Series([1, 2, 3, 4], index=idx)
         psser = ps.from_pandas(pser)
         self.assert_eq(
-            pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(),
+            pser.at_time("0:20").sort_index(),
+            psser.at_time("0:20").sort_index(),
        )
 
         pser.index.name = "ts"
         psser = ps.from_pandas(pser)
         self.assert_eq(
-            pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(),
+            pser.at_time("0:20").sort_index(),
+            psser.at_time("0:20").sort_index(),
         )
 
         pser.index.name = "index"
         psser = ps.from_pandas(pser)
         self.assert_eq(
-            pser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(),
+            pser.at_time("0:20").sort_index(),
+            psser.at_time("0:20").sort_index(),
         )
 
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 819dca5549..f2fc5a4c4c 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -216,7 +216,7 @@ def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) ->
 def spark_type_to_pandas_dtype(
     spark_type: types.DataType, *, use_extension_dtypes: bool = False
 ) -> Dtype:
-    """ Return the given Spark DataType to pandas dtype. """
+    """Return the given Spark DataType to pandas dtype."""
 
     if use_extension_dtypes and extension_dtypes_available:
         # IntegralType
diff --git a/python/pyspark/pandas/usage_logging/usage_logger.py b/python/pyspark/pandas/usage_logging/usage_logger.py
index 897a74eece..286b569390 100644
--- a/python/pyspark/pandas/usage_logging/usage_logger.py
+++ b/python/pyspark/pandas/usage_logging/usage_logger.py
@@ -25,7 +25,7 @@ from typing import Any, Optional
 
 
 def get_logger() -> Any:
-    """ An entry point of the plug-in and return the usage logger. """
+    """An entry point of the plug-in and return the usage logger."""
     return PandasOnSparkUsageLogger()
 
 
diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py
index c208514f2e..6dacc3b4d0 100644
--- a/python/pyspark/pandas/utils.py
+++ b/python/pyspark/pandas/utils.py
@@ -440,7 +440,7 @@ def align_diff_frames(
 
 
 def is_testing() -> bool:
-    """ Indicates whether Spark is currently running tests. """
+    """Indicates whether Spark is currently running tests."""
     return "SPARK_TESTING" in os.environ
 
 
@@ -574,12 +574,12 @@ def lazy_property(fn: Callable[[Any], Any]) -> property:
 
 
 def scol_for(sdf: spark.DataFrame, column_name: str) -> spark.Column:
-    """ Return Spark Column for the given column name. """
+    """Return Spark Column for the given column name."""
     return sdf["`{}`".format(column_name)]
 
 
 def column_labels_level(column_labels: List[Tuple]) -> int:
-    """ Return the level of the column index. """
+    """Return the level of the column index."""
     if len(column_labels) == 0:
         return 1
     else:
@@ -700,7 +700,7 @@ def is_name_like_value(
 
 
 def validate_axis(axis: Optional[Union[int, str]] = 0, none_axis: int = 0) -> int:
-    """ Check the given axis is valid. """
+    """Check the given axis is valid."""
     # convert to numeric axis
     axis = cast(
         Dict[Optional[Union[int, str]], int], {None: none_axis, "index": 0, "columns": 1}
@@ -712,7 +712,7 @@ def validate_axis(axis: Optional[Union[int, str]] = 0, none_axis: int = 0) -> in
 
 
 def validate_bool_kwarg(value: Any, arg_name: str) -> Optional[bool]:
-    """ Ensures that argument passed in arg_name is of type bool. """
+    """Ensures that argument passed in arg_name is of type bool."""
     if not (isinstance(value, bool) or value is None):
         raise TypeError(
             'For argument "{}" expected type bool, received '
@@ -722,7 +722,7 @@ def validate_bool_kwarg(value: Any, arg_name: str) -> Optional[bool]:
 
 
 def validate_how(how: str) -> str:
-    """ Check the given how for join is valid. """
+    """Check the given how for join is valid."""
     if how == "full":
         warnings.warn(
             "Warning: While pandas-on-Spark will accept 'full', you should use 'outer' "