diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 244110a986..98b8ce6dfb 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -169,7 +169,7 @@ class FeatureTests(SparkSessionTestCase): # Test an empty vocabulary with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "vocabSize.*invalid.*0"): + with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"): CountVectorizerModel.from_vocabulary([], inputCol="words") # Test model with default settings can transform diff --git a/python/pyspark/ml/tests/test_image.py b/python/pyspark/ml/tests/test_image.py index 1001598779..00e4c95a84 100644 --- a/python/pyspark/ml/tests/test_image.py +++ b/python/pyspark/ml/tests/test_image.py @@ -47,19 +47,19 @@ class ImageFileFormatTest(SparkSessionTestCase): self.assertEqual(ImageSchema.undefinedImageType, "Undefined") with QuietTest(self.sc): - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "image argument should be pyspark.sql.types.Row; however", lambda: ImageSchema.toNDArray("a")) with QuietTest(self.sc): - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "image argument should have attributes specified in", lambda: ImageSchema.toNDArray(Row(a=1))) with QuietTest(self.sc): - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "array argument should be numpy.ndarray; however, it got", lambda: ImageSchema.toImage("a")) diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 4cddf50f36..09fe21e9fd 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -308,7 +308,7 @@ class ParamTests(SparkSessionTestCase): LogisticRegression ) - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "Logistic Regression getThreshold found inconsistent.*$", LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] diff --git a/python/pyspark/ml/tests/test_persistence.py b/python/pyspark/ml/tests/test_persistence.py index 826e6cd351..0bbcfcdf50 100644 --- a/python/pyspark/ml/tests/test_persistence.py +++ b/python/pyspark/ml/tests/test_persistence.py @@ -442,7 +442,7 @@ class PersistenceTest(SparkSessionTestCase): del metadata['defaultParamMap'] metadataStr = json.dumps(metadata, separators=[',', ':']) loadedMetadata = reader._parseMetaData(metadataStr, ) - with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"): + with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"): reader.getAndSetParams(lr, loadedMetadata) # Prior to 2.4.0, metadata doesn't have `defaultParamMap`. 
diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index 729e46419a..ced32c07f2 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -499,7 +499,7 @@ class CrossValidatorTests(SparkSessionTestCase): evaluator=evaluator, numFolds=2, foldCol="fold") - with self.assertRaisesRegexp(Exception, "Fold number must be in range"): + with self.assertRaisesRegex(Exception, "Fold number must be in range"): cv.fit(dataset_with_folds) cv = CrossValidator(estimator=lr, @@ -507,7 +507,7 @@ class CrossValidatorTests(SparkSessionTestCase): evaluator=evaluator, numFolds=4, foldCol="fold") - with self.assertRaisesRegexp(Exception, "The validation data at fold 3 is empty"): + with self.assertRaisesRegex(Exception, "The validation data at fold 3 is empty"): cv.fit(dataset_with_folds) diff --git a/python/pyspark/ml/tests/test_wrapper.py b/python/pyspark/ml/tests/test_wrapper.py index 31475299c7..8ed6a6bad9 100644 --- a/python/pyspark/ml/tests/test_wrapper.py +++ b/python/pyspark/ml/tests/test_wrapper.py @@ -54,7 +54,7 @@ class JavaWrapperMemoryTests(SparkSessionTestCase): model.__del__() def condition(): - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): + with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): model._java_obj.toString() self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString()) return True @@ -67,9 +67,9 @@ class JavaWrapperMemoryTests(SparkSessionTestCase): pass def condition(): - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): + with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): model._java_obj.toString() - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): + with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): summary._java_obj.toString() return True diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index e764c42d88..bf80c62ea0 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -34,7 +34,7 @@ from pyspark.testing.utils import QuietTest if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal if have_pyarrow: import pyarrow as pa # noqa: F401 @@ -137,7 +137,7 @@ class ArrowTests(ReusedSQLTestCase): df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): with self.warnings_lock: - with self.assertRaisesRegexp(Exception, 'Unsupported type'): + with self.assertRaisesRegex(Exception, 'Unsupported type'): df.toPandas() def test_null_conversion(self): @@ -214,7 +214,7 @@ class ArrowTests(ReusedSQLTestCase): exception_udf = udf(raise_exception, IntegerType()) df = df.withColumn("error", exception_udf()) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'My error'): + with self.assertRaisesRegex(Exception, 'My error'): df.toPandas() def _createDataFrame_toggle(self, pdf, schema=None): @@ -228,7 +228,7 @@ class ArrowTests(ReusedSQLTestCase): def test_createDataFrame_toggle(self): pdf = self.create_pandas_data_frame() df_no_arrow, df_arrow = self._createDataFrame_toggle(pdf, schema=self.schema) - self.assertEquals(df_no_arrow.collect(), df_arrow.collect()) + self.assertEqual(df_no_arrow.collect(), df_arrow.collect()) def test_createDataFrame_respect_session_timezone(self): from datetime import timedelta @@ -258,7 +258,7 @@ class ArrowTests(ReusedSQLTestCase): def 
test_createDataFrame_with_schema(self): pdf = self.create_pandas_data_frame() df = self.spark.createDataFrame(pdf, schema=self.schema) - self.assertEquals(self.schema, df.schema) + self.assertEqual(self.schema, df.schema) pdf_arrow = df.toPandas() assert_frame_equal(pdf_arrow, pdf) @@ -269,7 +269,7 @@ class ArrowTests(ReusedSQLTestCase): wrong_schema = StructType(fields) with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}): with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"): + with self.assertRaisesRegex(Exception, "[D|d]ecimal.*got.*date"): self.spark.createDataFrame(pdf, schema=wrong_schema) def test_createDataFrame_with_names(self): @@ -277,23 +277,23 @@ class ArrowTests(ReusedSQLTestCase): new_names = list(map(str, range(len(self.schema.fieldNames())))) # Test that schema as a list of column names gets applied df = self.spark.createDataFrame(pdf, schema=list(new_names)) - self.assertEquals(df.schema.fieldNames(), new_names) + self.assertEqual(df.schema.fieldNames(), new_names) # Test that schema as tuple of column names gets applied df = self.spark.createDataFrame(pdf, schema=tuple(new_names)) - self.assertEquals(df.schema.fieldNames(), new_names) + self.assertEqual(df.schema.fieldNames(), new_names) def test_createDataFrame_column_name_encoding(self): pdf = pd.DataFrame({u'a': [1]}) columns = self.spark.createDataFrame(pdf).columns self.assertTrue(isinstance(columns[0], str)) - self.assertEquals(columns[0], 'a') + self.assertEqual(columns[0], 'a') columns = self.spark.createDataFrame(pdf, [u'b']).columns self.assertTrue(isinstance(columns[0], str)) - self.assertEquals(columns[0], 'b') + self.assertEqual(columns[0], 'b') def test_createDataFrame_with_single_data_type(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, ".*IntegerType.*not supported.*"): + with self.assertRaisesRegex(ValueError, ".*IntegerType.*not supported.*"): self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int") def test_createDataFrame_does_not_modify_input(self): @@ -311,7 +311,7 @@ class ArrowTests(ReusedSQLTestCase): from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema arrow_schema = to_arrow_schema(self.schema) schema_rt = from_arrow_schema(arrow_schema) - self.assertEquals(self.schema, schema_rt) + self.assertEqual(self.schema, schema_rt) def test_createDataFrame_with_array_type(self): pdf = pd.DataFrame({"a": [[1, 2], [3, 4]], "b": [[u"x", u"y"], [u"y", u"z"]]}) @@ -420,7 +420,7 @@ class ArrowTests(ReusedSQLTestCase): def test_createDataFrame_fallback_disabled(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(TypeError, 'Unsupported type'): + with self.assertRaisesRegex(TypeError, 'Unsupported type'): self.spark.createDataFrame( pd.DataFrame({"a": [[datetime.datetime(2015, 11, 1, 0, 30)]]}), "a: array") @@ -545,7 +545,7 @@ class MaxResultArrowTests(unittest.TestCase): cls.spark.stop() def test_exception_by_max_results(self): - with self.assertRaisesRegexp(Exception, "is bigger than"): + with self.assertRaisesRegex(Exception, "is bigger than"): self.spark.range(0, 10000, 1, 100).toPandas() diff --git a/python/pyspark/sql/tests/test_catalog.py b/python/pyspark/sql/tests/test_catalog.py index ca4e427a7d..56e7c97020 100644 --- a/python/pyspark/sql/tests/test_catalog.py +++ b/python/pyspark/sql/tests/test_catalog.py @@ -25,11 +25,11 @@ class CatalogTests(ReusedSQLTestCase): def test_current_database(self): spark = self.spark with self.database("some_db"): - 
self.assertEquals(spark.catalog.currentDatabase(), "default") + self.assertEqual(spark.catalog.currentDatabase(), "default") spark.sql("CREATE DATABASE some_db") spark.catalog.setCurrentDatabase("some_db") - self.assertEquals(spark.catalog.currentDatabase(), "some_db") - self.assertRaisesRegexp( + self.assertEqual(spark.catalog.currentDatabase(), "some_db") + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.setCurrentDatabase("does_not_exist")) @@ -38,10 +38,10 @@ class CatalogTests(ReusedSQLTestCase): spark = self.spark with self.database("some_db"): databases = [db.name for db in spark.catalog.listDatabases()] - self.assertEquals(databases, ["default"]) + self.assertEqual(databases, ["default"]) spark.sql("CREATE DATABASE some_db") databases = [db.name for db in spark.catalog.listDatabases()] - self.assertEquals(sorted(databases), ["default", "some_db"]) + self.assertEqual(sorted(databases), ["default", "some_db"]) def test_list_tables(self): from pyspark.sql.catalog import Table @@ -50,8 +50,8 @@ class CatalogTests(ReusedSQLTestCase): spark.sql("CREATE DATABASE some_db") with self.table("tab1", "some_db.tab2", "tab3_via_catalog"): with self.tempView("temp_tab"): - self.assertEquals(spark.catalog.listTables(), []) - self.assertEquals(spark.catalog.listTables("some_db"), []) + self.assertEqual(spark.catalog.listTables(), []) + self.assertEqual(spark.catalog.listTables("some_db"), []) spark.createDataFrame([(1, 1)]).createOrReplaceTempView("temp_tab") spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet") spark.sql("CREATE TABLE some_db.tab2 (name STRING, age INT) USING parquet") @@ -66,40 +66,40 @@ class CatalogTests(ReusedSQLTestCase): sorted(spark.catalog.listTables("default"), key=lambda t: t.name) tablesSomeDb = \ sorted(spark.catalog.listTables("some_db"), key=lambda t: t.name) - self.assertEquals(tables, tablesDefault) - self.assertEquals(len(tables), 3) - self.assertEquals(len(tablesSomeDb), 2) - self.assertEquals(tables[0], Table( + self.assertEqual(tables, tablesDefault) + self.assertEqual(len(tables), 3) + self.assertEqual(len(tablesSomeDb), 2) + self.assertEqual(tables[0], Table( name="tab1", database="default", description=None, tableType="MANAGED", isTemporary=False)) - self.assertEquals(tables[1], Table( + self.assertEqual(tables[1], Table( name="tab3_via_catalog", database="default", description=description, tableType="MANAGED", isTemporary=False)) - self.assertEquals(tables[2], Table( + self.assertEqual(tables[2], Table( name="temp_tab", database=None, description=None, tableType="TEMPORARY", isTemporary=True)) - self.assertEquals(tablesSomeDb[0], Table( + self.assertEqual(tablesSomeDb[0], Table( name="tab2", database="some_db", description=None, tableType="MANAGED", isTemporary=False)) - self.assertEquals(tablesSomeDb[1], Table( + self.assertEqual(tablesSomeDb[1], Table( name="temp_tab", database=None, description=None, tableType="TEMPORARY", isTemporary=True)) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.listTables("does_not_exist")) @@ -119,12 +119,12 @@ class CatalogTests(ReusedSQLTestCase): self.assertTrue("to_timestamp" in functions) self.assertTrue("to_unix_timestamp" in functions) self.assertTrue("current_database" in functions) - self.assertEquals(functions["+"], Function( + self.assertEqual(functions["+"], Function( name="+", description=None, className="org.apache.spark.sql.catalyst.expressions.Add", isTemporary=True)) - self.assertEquals(functions, 
functionsDefault) + self.assertEqual(functions, functionsDefault) with self.function("func1", "some_db.func2"): spark.catalog.registerFunction("temp_func", lambda x: str(x)) @@ -141,7 +141,7 @@ class CatalogTests(ReusedSQLTestCase): self.assertTrue("temp_func" in newFunctionsSomeDb) self.assertTrue("func1" not in newFunctionsSomeDb) self.assertTrue("func2" in newFunctionsSomeDb) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.listFunctions("does_not_exist")) @@ -158,16 +158,16 @@ class CatalogTests(ReusedSQLTestCase): columns = sorted(spark.catalog.listColumns("tab1"), key=lambda c: c.name) columnsDefault = \ sorted(spark.catalog.listColumns("tab1", "default"), key=lambda c: c.name) - self.assertEquals(columns, columnsDefault) - self.assertEquals(len(columns), 2) - self.assertEquals(columns[0], Column( + self.assertEqual(columns, columnsDefault) + self.assertEqual(len(columns), 2) + self.assertEqual(columns[0], Column( name="age", description=None, dataType="int", nullable=True, isPartition=False, isBucket=False)) - self.assertEquals(columns[1], Column( + self.assertEqual(columns[1], Column( name="name", description=None, dataType="string", @@ -176,26 +176,26 @@ class CatalogTests(ReusedSQLTestCase): isBucket=False)) columns2 = \ sorted(spark.catalog.listColumns("tab2", "some_db"), key=lambda c: c.name) - self.assertEquals(len(columns2), 2) - self.assertEquals(columns2[0], Column( + self.assertEqual(len(columns2), 2) + self.assertEqual(columns2[0], Column( name="nickname", description=None, dataType="string", nullable=True, isPartition=False, isBucket=False)) - self.assertEquals(columns2[1], Column( + self.assertEqual(columns2[1], Column( name="tolerance", description=None, dataType="float", nullable=True, isPartition=False, isBucket=False)) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "tab2", lambda: spark.catalog.listColumns("tab2")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.listColumns("does_not_exist")) diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index 4a9c7106a1..2ae0a9bedd 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -47,7 +47,7 @@ class ColumnTests(ReusedSQLTestCase): self.assertTrue("Column" in _to_java_column(u"a").getClass().toString()) self.assertTrue("Column" in _to_java_column(self.spark.range(1).id).getClass().toString()) - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "Invalid argument, not a string or column", lambda: _to_java_column(1)) @@ -58,7 +58,7 @@ class ColumnTests(ReusedSQLTestCase): self.assertRaises(TypeError, lambda: _to_java_column(A())) self.assertRaises(TypeError, lambda: _to_java_column([])) - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "Invalid argument, not a string or column", lambda: udf(lambda x: x)(None)) @@ -79,9 +79,9 @@ class ColumnTests(ReusedSQLTestCase): cs.startswith('a'), cs.endswith('a'), ci.eqNullSafe(cs) self.assertTrue(all(isinstance(c, Column) for c in css)) self.assertTrue(isinstance(ci.cast(LongType()), Column)) - self.assertRaisesRegexp(ValueError, - "Cannot apply 'in' operator against a column", - lambda: 1 in cs) + self.assertRaisesRegex(ValueError, + "Cannot apply 'in' operator against a column", + lambda: 1 in cs) def test_column_accessor(self): from pyspark.sql.functions import col diff --git a/python/pyspark/sql/tests/test_conf.py 
b/python/pyspark/sql/tests/test_conf.py index 1cc0c1b756..9222e2b827 100644 --- a/python/pyspark/sql/tests/test_conf.py +++ b/python/pyspark/sql/tests/test_conf.py @@ -28,7 +28,7 @@ class ConfTests(ReusedSQLTestCase): self.assertEqual(spark.conf.get("bogo"), "ta") self.assertEqual(spark.conf.get("bogo", "not.read"), "ta") self.assertEqual(spark.conf.get("not.set", "ta"), "ta") - self.assertRaisesRegexp(Exception, "not.set", lambda: spark.conf.get("not.set")) + self.assertRaisesRegex(Exception, "not.set", lambda: spark.conf.get("not.set")) spark.conf.unset("bogo") self.assertEqual(spark.conf.get("bogo", "colombia"), "colombia") diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index d941707b89..e3977e8185 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -343,7 +343,7 @@ class DataFrameTests(ReusedSQLTestCase): self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace({u"Alice": u"Bob", 10: 20}).first() - with self.assertRaisesRegexp( + with self.assertRaisesRegex( TypeError, 'value argument is required when to_replace is not a dictionary.'): self.spark.createDataFrame( @@ -390,7 +390,7 @@ class DataFrameTests(ReusedSQLTestCase): self.assertEqual(3, logical_plan.toString().count("itworks")) def test_sample(self): - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "should be a bool, float and number", lambda: self.spark.range(1).sample()) @@ -426,12 +426,12 @@ class DataFrameTests(ReusedSQLTestCase): self.assertEqual(df.collect(), data) # number of fields must match. - self.assertRaisesRegexp(Exception, "Length of object", - lambda: rdd.toDF("key: int").collect()) + self.assertRaisesRegex(Exception, "Length of object", + lambda: rdd.toDF("key: int").collect()) # field types mismatch will cause exception at runtime. - self.assertRaisesRegexp(Exception, "FloatType can not accept", - lambda: rdd.toDF("key: float, value: string").collect()) + self.assertRaisesRegex(Exception, "FloatType can not accept", + lambda: rdd.toDF("key: float, value: string").collect()) # flat schema values will be wrapped into row. 
df = rdd.map(lambda row: row.key).toDF("int") @@ -491,15 +491,15 @@ class DataFrameTests(ReusedSQLTestCase): spark.catalog.clearCache() self.assertFalse(spark.catalog.isCached("tab1")) self.assertFalse(spark.catalog.isCached("tab2")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.isCached("does_not_exist")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.cacheTable("does_not_exist")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.uncacheTable("does_not_exist")) @@ -523,12 +523,12 @@ class DataFrameTests(ReusedSQLTestCase): import numpy as np pdf = self._to_pandas() types = pdf.dtypes - self.assertEquals(types[0], np.int32) - self.assertEquals(types[1], np.object) - self.assertEquals(types[2], np.bool) - self.assertEquals(types[3], np.float32) - self.assertEquals(types[4], np.object) # datetime.date - self.assertEquals(types[5], 'datetime64[ns]') + self.assertEqual(types[0], np.int32) + self.assertEqual(types[1], np.object) + self.assertEqual(types[2], np.bool) + self.assertEqual(types[3], np.float32) + self.assertEqual(types[4], np.object) # datetime.date + self.assertEqual(types[5], 'datetime64[ns]') @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_to_pandas_with_duplicated_column_names(self): @@ -540,8 +540,8 @@ class DataFrameTests(ReusedSQLTestCase): df = self.spark.sql(sql) pdf = df.toPandas() types = pdf.dtypes - self.assertEquals(types.iloc[0], np.int32) - self.assertEquals(types.iloc[1], np.int32) + self.assertEqual(types.iloc[0], np.int32) + self.assertEqual(types.iloc[1], np.int32) @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_to_pandas_on_cross_join(self): @@ -560,13 +560,13 @@ class DataFrameTests(ReusedSQLTestCase): df = self.spark.sql(sql) pdf = df.toPandas() types = pdf.dtypes - self.assertEquals(types.iloc[0], np.int32) - self.assertEquals(types.iloc[1], np.int32) + self.assertEqual(types.iloc[0], np.int32) + self.assertEqual(types.iloc[1], np.int32) @unittest.skipIf(have_pandas, "Required Pandas was found.") def test_to_pandas_required_pandas_not_found(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): + with self.assertRaisesRegex(ImportError, 'Pandas >= .* must be installed'): self._to_pandas() @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore @@ -577,9 +577,9 @@ class DataFrameTests(ReusedSQLTestCase): data = [(1, "foo", 16777220), (None, "bar", None)] df = self.spark.createDataFrame(data, schema) types = df.toPandas().dtypes - self.assertEquals(types[0], np.float64) # doesn't convert to np.int32 due to NaN value. - self.assertEquals(types[1], np.object) - self.assertEquals(types[2], np.float64) + self.assertEqual(types[0], np.float64) # doesn't convert to np.int32 due to NaN value. 
+ self.assertEqual(types[1], np.object) + self.assertEqual(types[2], np.float64) @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_to_pandas_from_empty_dataframe(self): @@ -675,7 +675,7 @@ class DataFrameTests(ReusedSQLTestCase): @unittest.skipIf(have_pandas, "Required Pandas was found.") def test_create_dataframe_required_pandas_not_found(self): with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ImportError, "(Pandas >= .* must be installed|No module named '?pandas'?)"): import pandas as pd @@ -688,7 +688,7 @@ class DataFrameTests(ReusedSQLTestCase): @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_create_dataframe_from_pandas_with_dst(self): import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal from datetime import datetime pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]}) @@ -724,7 +724,7 @@ class DataFrameTests(ReusedSQLTestCase): ||22222|22222| |+-----+-----+ |""" - self.assertEquals(re.sub(pattern, '', expected1), df.__repr__()) + self.assertEqual(re.sub(pattern, '', expected1), df.__repr__()) with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): expected2 = """+---+-----+ ||key|value| @@ -733,7 +733,7 @@ class DataFrameTests(ReusedSQLTestCase): ||222| 222| |+---+-----+ |""" - self.assertEquals(re.sub(pattern, '', expected2), df.__repr__()) + self.assertEqual(re.sub(pattern, '', expected2), df.__repr__()) with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): expected3 = """+---+-----+ ||key|value| @@ -742,7 +742,7 @@ class DataFrameTests(ReusedSQLTestCase): |+---+-----+ |only showing top 1 row |""" - self.assertEquals(re.sub(pattern, '', expected3), df.__repr__()) + self.assertEqual(re.sub(pattern, '', expected3), df.__repr__()) # test when eager evaluation is enabled and _repr_html_ will be called with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}): @@ -752,7 +752,7 @@ class DataFrameTests(ReusedSQLTestCase): |2222222222 | |""" - self.assertEquals(re.sub(pattern, '', expected1), df._repr_html_()) + self.assertEqual(re.sub(pattern, '', expected1), df._repr_html_()) with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): expected2 = """ | @@ -760,7 +760,7 @@ class DataFrameTests(ReusedSQLTestCase): | |
                    |<tr><th>key</th><th>value</th></tr>
                    |<tr><td>1</td><td>1</td></tr>
                    |<tr><td>222</td><td>222</td></tr>
                    |</table>
                    |"""
-                self.assertEquals(re.sub(pattern, '', expected2), df._repr_html_())
+                self.assertEqual(re.sub(pattern, '', expected2), df._repr_html_())
            with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}):
                expected3 = """<table border='1'>
                    |<tr><th>key</th><th>value</th></tr>
                    |<tr><td>1</td><td>1</td></tr>
                    |</table>
                    |only showing top 1 row
                    |"""
-                self.assertEquals(re.sub(pattern, '', expected3), df._repr_html_())
+                self.assertEqual(re.sub(pattern, '', expected3), df._repr_html_())

        # test when eager evaluation is disabled and _repr_html_ will be called
        with self.sql_conf({"spark.sql.repl.eagerEval.enabled": False}):
            expected = "DataFrame[key: bigint, value: string]"
-            self.assertEquals(None, df._repr_html_())
-            self.assertEquals(expected, df.__repr__())
+            self.assertEqual(None, df._repr_html_())
+            self.assertEqual(expected, df.__repr__())
            with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}):
-                self.assertEquals(None, df._repr_html_())
-                self.assertEquals(expected, df.__repr__())
+                self.assertEqual(None, df._repr_html_())
+                self.assertEqual(expected, df.__repr__())
            with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}):
-                self.assertEquals(None, df._repr_html_())
-                self.assertEquals(expected, df.__repr__())
+                self.assertEqual(None, df._repr_html_())
+                self.assertEqual(expected, df.__repr__())

    def test_to_local_iterator(self):
        df = self.spark.range(8, numPartitions=4)
@@ -818,7 +818,7 @@ class DataFrameTests(ReusedSQLTestCase):

    def test_same_semantics_error(self):
        with QuietTest(self.sc):
-            with self.assertRaisesRegexp(ValueError, "should be of DataFrame.*int"):
+            with self.assertRaisesRegex(ValueError, "should be of DataFrame.*int"):
                self.spark.range(10).sameSemantics(1)

    def test_input_files(self):
@@ -830,7 +830,7 @@ class DataFrameTests(ReusedSQLTestCase):
            input_files_list = self.spark.read.parquet(tpath).inputFiles()
            # input files list should contain 10 entries
-            self.assertEquals(len(input_files_list), 10)
+            self.assertEqual(len(input_files_list), 10)
            # all file paths in list must contain tpath
            for file_path in input_files_list:
                self.assertTrue(tpath in file_path)
diff --git a/python/pyspark/sql/tests/test_datasources.py b/python/pyspark/sql/tests/test_datasources.py
index 9425494fb0..26a6c58dba 100644
--- a/python/pyspark/sql/tests/test_datasources.py
+++ b/python/pyspark/sql/tests/test_datasources.py
@@ -107,7 +107,7 @@ class DataSourcesTests(ReusedSQLTestCase):
        df = self.spark.read.text(['python/test_support/sql/text-test.txt',
                                   'python/test_support/sql/text-test.txt'])
        count = df.count()
-        self.assertEquals(count, 4)
+        self.assertEqual(count, 4)

    def test_json_sampling_ratio(self):
        rdd = self.spark.sparkContext.range(0, 100, 1, 1) \
@@ -115,14 +115,14 @@ class DataSourcesTests(ReusedSQLTestCase):
        schema = self.spark.read.option('inferSchema', True) \
            .option('samplingRatio', 0.5) \
            .json(rdd).schema
-        self.assertEquals(schema, StructType([StructField("a", LongType(), True)]))
+        self.assertEqual(schema, StructType([StructField("a", LongType(), True)]))

    def test_csv_sampling_ratio(self):
        rdd = self.spark.sparkContext.range(0, 100, 1, 1) \
            .map(lambda x: '0.1' if x == 1 else str(x))
        schema = self.spark.read.option('inferSchema', True)\
            .csv(rdd, samplingRatio=0.5).schema
-        self.assertEquals(schema, StructType([StructField("_c0", IntegerType(), True)]))
+        self.assertEqual(schema, StructType([StructField("_c0", IntegerType(), True)]))

    def test_checking_csv_header(self):
        path = tempfile.mkdtemp()
@@ -135,7 +135,7 @@ class DataSourcesTests(ReusedSQLTestCase):
                StructField('f1', IntegerType(), nullable=True)])
            df = self.spark.read.option('header', 'true').schema(schema)\
                .csv(path, enforceSchema=False)
-            self.assertRaisesRegexp(
+            self.assertRaisesRegex(
                Exception,
                "CSV header does not conform to the schema",
                lambda: df.collect())
@@ -154,7 +154,7 @@ class
DataSourcesTests(ReusedSQLTestCase): StructField('b', LongType(), nullable=True), StructField('c', StringType(), nullable=True)]) readback = self.spark.read.json(path, dropFieldIfAllNull=True) - self.assertEquals(readback.schema, schema) + self.assertEqual(readback.schema, schema) finally: shutil.rmtree(path) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 2858bdeca0..58599a9fa4 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -185,7 +185,7 @@ class FunctionsTests(ReusedSQLTestCase): ] df = self.spark.createDataFrame([['nick']], schema=['name']) - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "must be the same type", lambda: df.select(col('name').substr(0, lit(1)))) @@ -321,16 +321,16 @@ class FunctionsTests(ReusedSQLTestCase): df = self.spark.createDataFrame( [('Tom', 80), (None, 60), ('Alice', 50)], ["name", "height"]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.asc_nulls_first('name')).collect(), [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.asc_nulls_last('name')).collect(), [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.desc_nulls_first('name')).collect(), [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.desc_nulls_last('name')).collect(), [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)]) @@ -354,7 +354,7 @@ class FunctionsTests(ReusedSQLTestCase): df = self.spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) - self.assertEquals( + self.assertEqual( df.select(slice(df.x, 2, 2).alias("sliced")).collect(), df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(), ) @@ -364,7 +364,7 @@ class FunctionsTests(ReusedSQLTestCase): df = self.spark.range(1) - self.assertEquals( + self.assertEqual( df.select(array_repeat("id", 3)).toDF("val").collect(), df.select(array_repeat("id", lit(3))).toDF("val").collect(), ) @@ -580,14 +580,14 @@ class FunctionsTests(ReusedSQLTestCase): from datetime import date df = self.spark.range(1).selectExpr("'2017-01-22' as dateCol") parse_result = df.select(functions.to_date(functions.col("dateCol"))).first() - self.assertEquals(date(2017, 1, 22), parse_result['to_date(dateCol)']) + self.assertEqual(date(2017, 1, 22), parse_result['to_date(dateCol)']) def test_assert_true(self): from pyspark.sql.functions import assert_true df = self.spark.range(3) - self.assertEquals( + self.assertEqual( df.select(assert_true(df.id < 3)).toDF("val").collect(), [Row(val=None), Row(val=None), Row(val=None)], ) @@ -604,7 +604,7 @@ class FunctionsTests(ReusedSQLTestCase): with self.assertRaises(TypeError) as cm: df.select(assert_true(df.id < 2, 5)) - self.assertEquals( + self.assertEqual( "errMsg should be a Column or a str, got ", str(cm.exception) ) @@ -626,7 +626,7 @@ class FunctionsTests(ReusedSQLTestCase): with self.assertRaises(TypeError) as cm: df.select(raise_error(None)) - self.assertEquals( + self.assertEqual( "errMsg should be a Column or a str, got ", str(cm.exception) ) diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py index 4afc1dfcc1..3c016e04ad 100644 --- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py +++ 
b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py @@ -25,7 +25,7 @@ from pyspark.testing.utils import QuietTest if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal if have_pyarrow: import pyarrow as pa # noqa: F401 @@ -135,8 +135,8 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): .applyInPandas(lambda x, y: pd.DataFrame([(x.sum().sum(), y.sum().sum())]), 'sum1 int, sum2 int').collect() - self.assertEquals(result[0]['sum1'], 165) - self.assertEquals(result[0]['sum2'], 165) + self.assertEqual(result[0]['sum1'], 165) + self.assertEqual(result[0]['sum2'], 165) def test_with_key_left(self): self._test_with_key(self.data1, self.data1, isLeft=True) @@ -174,7 +174,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): left = self.data1 right = self.data2 with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*ArrayType.*TimestampType'): left.groupby('id').cogroup(right.groupby('id')).applyInPandas( @@ -183,7 +183,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): def test_wrong_args(self): left = self.data1 right = self.data2 - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): left.groupby('id').cogroup(right.groupby('id')) \ .applyInPandas(lambda: 1, StructType([StructField("d", DoubleType())])) @@ -194,14 +194,14 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): row = df1.groupby("ColUmn").cogroup( df1.groupby("COLUMN") ).applyInPandas(lambda r, l: r + l, "column long, value long").first() - self.assertEquals(row.asDict(), Row(column=2, value=2).asDict()) + self.assertEqual(row.asDict(), Row(column=2, value=2).asDict()) df2 = self.spark.createDataFrame([(1, 1)], ("column", "value")) row = df1.groupby("ColUmn").cogroup( df2.groupby("COLUMN") ).applyInPandas(lambda r, l: r + l, "column long, value long").first() - self.assertEquals(row.asDict(), Row(column=2, value=2).asDict()) + self.assertEqual(row.asDict(), Row(column=2, value=2).asDict()) @staticmethod def _test_with_key(left, right, isLeft): diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index a639a8d51f..64803a6574 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -33,7 +33,7 @@ from pyspark.testing.utils import QuietTest if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal if have_pyarrow: import pyarrow as pa # noqa: F401 @@ -160,7 +160,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): def test_register_grouped_map_udf(self): foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'f.*SQL_BATCHED_UDF.*SQL_SCALAR_PANDAS_UDF.*SQL_GROUPED_AGG_PANDAS_UDF.*'): self.spark.catalog.registerFunction("foo_udf", foo_udf) @@ -244,7 +244,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): def test_wrong_return_type(self): with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*grouped map Pandas UDF.*ArrayType.*TimestampType'): pandas_udf( @@ -256,20 +256,20 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): df = self.data with QuietTest(self.sc): - with 
self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(lambda x: x) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(udf(lambda x: x, DoubleType())) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(sum(df.v)) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(df.v + 1) - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): df.groupby('id').apply( pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())]))) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(pandas_udf(lambda x, y: x, DoubleType())) - with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'): + with self.assertRaisesRegex(ValueError, 'Invalid udf.*GROUPED_MAP'): df.groupby('id').apply( pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) @@ -284,7 +284,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): for unsupported_type in unsupported_types: schema = StructType([StructField('id', LongType(), True), unsupported_type]) with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, common_err_msg): + with self.assertRaisesRegex(NotImplementedError, common_err_msg): pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) # Regression test for SPARK-23314 @@ -451,9 +451,9 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}): with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "KeyError: 'id'"): + with self.assertRaisesRegex(Exception, "KeyError: 'id'"): grouped_df.apply(column_name_typo).collect() - with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"): + with self.assertRaisesRegex(Exception, "[D|d]ecimal.*got.*date"): grouped_df.apply(invalid_positional_types).collect() def test_positional_assignment_conf(self): @@ -482,7 +482,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): # this was throwing an AnalysisException before SPARK-24208 res = df_with_pandas.alias('temp0').join(df_with_pandas.alias('temp1'), col('temp0.key') == col('temp1.key')) - self.assertEquals(res.count(), 5) + self.assertEqual(res.count(), 5) def test_mixed_scalar_udfs_followed_by_groupby_apply(self): df = self.spark.range(0, 10).toDF('v1') @@ -494,7 +494,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): 'sum int', PandasUDFType.GROUPED_MAP)) - self.assertEquals(result.collect()[0]['sum'], 165) + self.assertEqual(result.collect()[0]['sum'], 165) def test_grouped_with_empty_partition(self): data = [Row(id=1, x=2), Row(id=1, x=3), Row(id=2, x=4)] @@ -604,7 +604,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): df = self.spark.createDataFrame([[1, 1]], ["column", "score"]) row = df.groupby('COLUMN').applyInPandas( my_pandas_udf, schema="column integer, score float").first() - self.assertEquals(row.asDict(), Row(column=1, score=0.5).asDict()) + self.assertEqual(row.asDict(), Row(column=1, score=0.5).asDict()) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_pandas_map.py b/python/pyspark/sql/tests/test_pandas_map.py index 3ca437f75f..d53face702 100644 --- 
a/python/pyspark/sql/tests/test_pandas_map.py +++ b/python/pyspark/sql/tests/test_pandas_map.py @@ -61,7 +61,7 @@ class MapInPandasTests(ReusedSQLTestCase): df = self.spark.range(10) actual = df.mapInPandas(func, 'id long').collect() expected = df.collect() - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) def test_multiple_columns(self): data = [(1, "foo"), (2, None), (3, "bar"), (4, "bar")] @@ -75,7 +75,7 @@ class MapInPandasTests(ReusedSQLTestCase): actual = df.mapInPandas(func, df.schema).collect() expected = df.collect() - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) def test_different_output_length(self): def func(iterator): @@ -84,7 +84,7 @@ class MapInPandasTests(ReusedSQLTestCase): df = self.spark.range(10) actual = df.repartition(1).mapInPandas(func, 'a long').collect() - self.assertEquals(set((r.a for r in actual)), set(range(100))) + self.assertEqual(set((r.a for r in actual)), set(range(100))) def test_empty_iterator(self): def empty_iter(_): @@ -110,7 +110,7 @@ class MapInPandasTests(ReusedSQLTestCase): df = self.spark.range(10) actual = df.mapInPandas(func, 'id long').mapInPandas(func, 'id long').collect() expected = df.collect() - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_pandas_udf.py b/python/pyspark/sql/tests/test_pandas_udf.py index cc742fc426..975eb4680d 100644 --- a/python/pyspark/sql/tests/test_pandas_udf.py +++ b/python/pyspark/sql/tests/test_pandas_udf.py @@ -114,31 +114,31 @@ class PandasUDFTests(ReusedSQLTestCase): @pandas_udf('blah') def foo(x): return x - with self.assertRaisesRegexp(ValueError, 'Invalid return type.*None'): + with self.assertRaisesRegex(ValueError, 'Invalid return type.*None'): @pandas_udf(functionType=PandasUDFType.SCALAR) def foo(x): return x - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): @pandas_udf('double', 100) def foo(x): return x - with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'): + with self.assertRaisesRegex(ValueError, '0-arg pandas_udfs.*not.*supported'): pandas_udf(lambda: 1, LongType(), PandasUDFType.SCALAR) - with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'): + with self.assertRaisesRegex(ValueError, '0-arg pandas_udfs.*not.*supported'): @pandas_udf(LongType(), PandasUDFType.SCALAR) def zero_with_type(): return 1 - with self.assertRaisesRegexp(TypeError, 'Invalid return type'): + with self.assertRaisesRegex(TypeError, 'Invalid return type'): @pandas_udf(returnType=PandasUDFType.GROUPED_MAP) def foo(df): return df - with self.assertRaisesRegexp(TypeError, 'Invalid return type'): + with self.assertRaisesRegex(TypeError, 'Invalid return type'): @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP) def foo(df): return df - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUPED_MAP) def foo(k, v, w): return k @@ -154,14 +154,14 @@ class PandasUDFTests(ReusedSQLTestCase): df = self.spark.range(0, 100) # plain udf (test for SPARK-23754) - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.withColumn('v', udf(foo)('id')).collect ) # pandas scalar udf - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.withColumn( @@ -170,7 
+170,7 @@ class PandasUDFTests(ReusedSQLTestCase): ) # pandas grouped map - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.groupBy('id').apply( @@ -178,7 +178,7 @@ class PandasUDFTests(ReusedSQLTestCase): ).collect ) - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.groupBy('id').apply( @@ -187,7 +187,7 @@ class PandasUDFTests(ReusedSQLTestCase): ) # pandas grouped agg - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.groupBy('id').agg( @@ -210,8 +210,8 @@ class PandasUDFTests(ReusedSQLTestCase): # Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast. with self.sql_conf({ "spark.sql.execution.pandas.convertToArrowArraySafely": True}): - with self.assertRaisesRegexp(Exception, - "Exception thrown when converting pandas.Series"): + with self.assertRaisesRegex(Exception, + "Exception thrown when converting pandas.Series"): df.select(['A']).withColumn('udf', udf('A')).collect() # Disabling Arrow safe type check. @@ -231,8 +231,8 @@ class PandasUDFTests(ReusedSQLTestCase): # When enabling safe type check, Arrow 0.11.0+ disallows overflow cast. with self.sql_conf({ "spark.sql.execution.pandas.convertToArrowArraySafely": True}): - with self.assertRaisesRegexp(Exception, - "Exception thrown when converting pandas.Series"): + with self.assertRaisesRegex(Exception, + "Exception thrown when converting pandas.Series"): df.withColumn('udf', udf('id')).collect() # Disabling safe type check, let Arrow do the cast anyway. diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py index 2cbcf31f6e..b49092ed70 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py @@ -30,7 +30,7 @@ from pyspark.testing.utils import QuietTest if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal @unittest.skipIf( @@ -145,20 +145,20 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): def test_unsupported_types(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): + with self.assertRaisesRegex(NotImplementedError, 'not supported'): pandas_udf( lambda x: x, ArrayType(ArrayType(TimestampType())), PandasUDFType.GROUPED_AGG) with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): + with self.assertRaisesRegex(NotImplementedError, 'not supported'): @pandas_udf('mean double, std double', PandasUDFType.GROUPED_AGG) def mean_and_std_udf(v): return v.mean(), v.std() with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): + with self.assertRaisesRegex(NotImplementedError, 'not supported'): @pandas_udf(ArrayType(TimestampType()), PandasUDFType.GROUPED_AGG) def mean_and_std_udf(v): return {v.mean(): v.std()} @@ -428,7 +428,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): array_udf = pandas_udf(lambda x: [1.0, 2.0], 'array', PandasUDFType.GROUPED_AGG) result1 = df.groupby('id').agg(array_udf(df['v']).alias('v2')) - self.assertEquals(result1.first()['v2'], [1.0, 2.0]) + self.assertEqual(result1.first()['v2'], [1.0, 2.0]) def test_invalid_args(self): df = self.data @@ -436,19 +436,19 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): mean_udf = self.pandas_agg_mean_udf with QuietTest(self.sc): - with self.assertRaisesRegexp( + with 
self.assertRaisesRegex( AnalysisException, 'nor.*aggregate function'): df.groupby(df.id).agg(plus_one(df.v)).collect() with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, 'aggregate function.*argument.*aggregate function'): df.groupby(df.id).agg(mean_udf(mean_udf(df.v))).collect() with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, 'mixture.*aggregate function.*group aggregate pandas UDF'): df.groupby(df.id).agg(mean_udf(df.v), mean(df.v)).collect() diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py index 5da5d043ce..2eb2dec001 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py @@ -133,7 +133,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): long_f(col('long')), float_f(col('float')), double_f(col('double')), decimal_f('decimal'), bool_f(col('bool')), array_long_f('array_long')) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_register_nondeterministic_vectorized_udf_basic(self): random_pandas_udf = pandas_udf( @@ -169,7 +169,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: bool_f = pandas_udf(lambda x: x, BooleanType(), udf_type) res = df.select(bool_f(col('bool'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_byte(self): data = [(None,), (2,), (3,), (4,)] @@ -178,7 +178,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: byte_f = pandas_udf(lambda x: x, ByteType(), udf_type) res = df.select(byte_f(col('byte'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_short(self): data = [(None,), (2,), (3,), (4,)] @@ -187,7 +187,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: short_f = pandas_udf(lambda x: x, ShortType(), udf_type) res = df.select(short_f(col('short'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_int(self): data = [(None,), (2,), (3,), (4,)] @@ -196,7 +196,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: int_f = pandas_udf(lambda x: x, IntegerType(), udf_type) res = df.select(int_f(col('int'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_long(self): data = [(None,), (2,), (3,), (4,)] @@ -205,7 +205,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: long_f = pandas_udf(lambda x: x, LongType(), udf_type) res = df.select(long_f(col('long'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_float(self): data = [(3.0,), (5.0,), (-1.0,), (None,)] @@ -214,7 +214,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: float_f = pandas_udf(lambda x: x, FloatType(), udf_type) res = df.select(float_f(col('float'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), 
res.collect()) def test_vectorized_udf_null_double(self): data = [(3.0,), (5.0,), (-1.0,), (None,)] @@ -223,7 +223,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: double_f = pandas_udf(lambda x: x, DoubleType(), udf_type) res = df.select(double_f(col('double'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_decimal(self): data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)] @@ -232,7 +232,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18), udf_type) res = df.select(decimal_f(col('decimal'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_string(self): data = [("foo",), (None,), ("bar",), ("bar",)] @@ -241,7 +241,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: str_f = pandas_udf(lambda x: x, StringType(), udf_type) res = df.select(str_f(col('str'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_string_in_udf(self): df = self.spark.range(10) @@ -255,7 +255,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): str_f = pandas_udf(f, StringType(), udf_type) actual = df.select(str_f(col('id'))) expected = df.select(col('id').cast('string')) - self.assertEquals(expected.collect(), actual.collect()) + self.assertEqual(expected.collect(), actual.collect()) def test_vectorized_udf_datatype_string(self): df = self.spark.range(10).select( @@ -279,7 +279,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): long_f(col('long')), float_f(col('float')), double_f(col('double')), decimal_f('decimal'), bool_f(col('bool'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_binary(self): data = [(bytearray(b"a"),), (None,), (bytearray(b"bb"),), (bytearray(b"ccc"),)] @@ -288,7 +288,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: str_f = pandas_udf(lambda x: x, BinaryType(), udf_type) res = df.select(str_f(col('binary'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_array_type(self): data = [([1, 2],), ([3, 4],)] @@ -297,7 +297,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()), udf_type) result = df.select(array_f(col('array'))) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_vectorized_udf_null_array(self): data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)] @@ -306,7 +306,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()), udf_type) result = df.select(array_f(col('array'))) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_vectorized_udf_struct_type(self): df = self.spark.range(10) @@ -375,7 +375,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for udf_type in [PandasUDFType.SCALAR, 
PandasUDFType.SCALAR_ITER]: with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, 'Invalid return type with scalar Pandas UDFs'): pandas_udf(lambda x: x, returnType=nested_type, functionType=udf_type) @@ -392,7 +392,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): else: map_f = pandas_udf(lambda x: x, MapType(StringType(), LongType()), udf_type) result = df.select(map_f(col('map'))) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_vectorized_udf_complex(self): df = self.spark.range(10).select( @@ -422,7 +422,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): (iter_add, iter_power2, iter_mul)]: res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c'))) expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * c')) - self.assertEquals(expected.collect(), res.collect()) + self.assertEqual(expected.collect(), res.collect()) def test_vectorized_udf_exception(self): df = self.spark.range(10) @@ -435,14 +435,14 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for raise_exception in [scalar_raise_exception, iter_raise_exception]: with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'division( or modulo)? by zero'): + with self.assertRaisesRegex(Exception, 'division( or modulo)? by zero'): df.select(raise_exception(col('id'))).collect() def test_vectorized_udf_invalid_length(self): df = self.spark.range(10) raise_exception = pandas_udf(lambda _: pd.Series(1), LongType()) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, 'Result vector from pandas_udf was not the required length'): df.select(raise_exception(col('id'))).collect() @@ -453,7 +453,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): yield pd.Series(1) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, "The length of output in Scalar iterator.*" "the length of output was 1"): @@ -469,7 +469,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}): df1 = self.spark.range(10).repartition(1) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, "pandas iterator UDF should exhaust"): df1.select(iter_udf_not_reading_all_input(col('id'))).collect() @@ -486,7 +486,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for f, g in [(scalar_f, scalar_g), (iter_f, iter_g)]: res = df.select(g(f(col('id')))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_chained_struct_type(self): df = self.spark.range(10) @@ -517,7 +517,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): def test_vectorized_udf_wrong_return_type(self): with QuietTest(self.sc): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.*TimestampType'): pandas_udf(lambda x: x, ArrayType(TimestampType()), udf_type) @@ -529,7 +529,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): PandasUDFType.SCALAR_ITER) for f in [scalar_f, iter_f]: with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Return.*type.*Series'): + with self.assertRaisesRegex(Exception, 'Return.*type.*Series'): df.select(f(col('id'))).collect() def test_vectorized_udf_decorator(self): @@ -545,14 +545,14 @@ class 
ScalarPandasUDFTests(ReusedSQLTestCase): for identity in [scalar_identity, iter_identity]: res = df.select(identity(col('id'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_empty_partition(self): df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)) for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: f = pandas_udf(lambda x: x, LongType(), udf_type) res = df.select(f(col('id'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_struct_with_empty_partition(self): df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2))\ @@ -585,16 +585,16 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for f in [scalar_f, iter_f]: res = df.select(f(col('id'), col('id'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_unsupported_types(self): with QuietTest(self.sc): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.*TimestampType'): pandas_udf(lambda x: x, ArrayType(TimestampType()), udf_type) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.StructType'): pandas_udf(lambda x: x, @@ -637,10 +637,10 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): result = df.withColumn("check_data", check_data(col("idx"), col("date"), col("date_copy"))).collect() - self.assertEquals(len(data), len(result)) + self.assertEqual(len(data), len(result)) for i in range(len(result)): - self.assertEquals(data[i][1], result[i][1]) # "date" col - self.assertEquals(data[i][1], result[i][2]) # "date_copy" col + self.assertEqual(data[i][1], result[i][1]) # "date" col + self.assertEqual(data[i][1], result[i][2]) # "date_copy" col self.assertIsNone(result[i][3]) # "check_data" col def test_vectorized_udf_timestamps(self): @@ -686,10 +686,10 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): result = df.withColumn("check_data", check_data(col("idx"), col("timestamp"), col("timestamp_copy"))).collect() # Check that collection values are correct - self.assertEquals(len(data), len(result)) + self.assertEqual(len(data), len(result)) for i in range(len(result)): - self.assertEquals(data[i][1], result[i][1]) # "timestamp" col - self.assertEquals(data[i][1], result[i][2]) # "timestamp_copy" col + self.assertEqual(data[i][1], result[i][1]) # "timestamp" col + self.assertEqual(data[i][1], result[i][2]) # "timestamp_copy" col self.assertIsNone(result[i][3]) # "check_data" col def test_vectorized_udf_return_timestamp_tz(self): @@ -713,7 +713,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): i, ts = r ts_tz = pd.Timestamp(i, unit='D', tz='America/Los_Angeles').to_pydatetime() expected = spark_ts_t.fromInternal(spark_ts_t.toInternal(ts_tz)) - self.assertEquals(expected, ts) + self.assertEqual(expected, ts) def test_vectorized_udf_check_config(self): with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}): @@ -799,9 +799,9 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): for random_udf in [self.nondeterministic_vectorized_udf, self.nondeterministic_vectorized_iter_udf]: with QuietTest(self.sc): - with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + with self.assertRaisesRegex(AnalysisException, 'nondeterministic'): 
                     df.groupby(df.id).agg(sum(random_udf(df.id))).collect()
-                with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'):
+                with self.assertRaisesRegex(AnalysisException, 'nondeterministic'):
                     df.agg(sum(random_udf(df.id))).collect()

     def test_register_vectorized_udf_basic(self):
@@ -825,8 +825,8 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
         res2 = self.spark.sql(
             "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
         expected = df.select(expr('a + b'))
-        self.assertEquals(expected.collect(), res1.collect())
-        self.assertEquals(expected.collect(), res2.collect())
+        self.assertEqual(expected.collect(), res1.collect())
+        self.assertEqual(expected.collect(), res2.collect())

     def test_scalar_iter_udf_init(self):
         import numpy as np
@@ -854,7 +854,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
             finally:
                 raise RuntimeError("reached finally block")
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(Exception, "reached finally block"):
+            with self.assertRaisesRegex(Exception, "reached finally block"):
                 self.spark.range(1).select(test_close(col("id"))).collect()

     def test_scalar_iter_udf_close_early(self):
@@ -905,7 +905,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
         for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]:
             foo_udf = pandas_udf(lambda x: x, 'timestamp', udf_type)
             result = df.withColumn('time', foo_udf(df.time))
-            self.assertEquals(df.collect(), result.collect())
+            self.assertEqual(df.collect(), result.collect())

     def test_udf_category_type(self):
@@ -1003,11 +1003,11 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
         df_chained_4 = df.withColumn('f4_f2_f1', f4(f2(f1(df['v']))))
         df_chained_5 = df.withColumn('f4_f3_f1', f4(f3(f1(df['v']))))

-        self.assertEquals(expected_chained_1, df_chained_1.collect())
-        self.assertEquals(expected_chained_2, df_chained_2.collect())
-        self.assertEquals(expected_chained_3, df_chained_3.collect())
-        self.assertEquals(expected_chained_4, df_chained_4.collect())
-        self.assertEquals(expected_chained_5, df_chained_5.collect())
+        self.assertEqual(expected_chained_1, df_chained_1.collect())
+        self.assertEqual(expected_chained_2, df_chained_2.collect())
+        self.assertEqual(expected_chained_3, df_chained_3.collect())
+        self.assertEqual(expected_chained_4, df_chained_4.collect())
+        self.assertEqual(expected_chained_5, df_chained_5.collect())

         # Test multiple mixed UDF expressions in a single projection
         df_multi_1 = df \
@@ -1045,8 +1045,8 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
             .withColumn('f4_f3_f2', f4(f3(f2(col('v'))))) \
             .withColumn('f4_f3_f2_f1', f4(f3(f2(f1(col('v'))))))

-        self.assertEquals(expected_multi, df_multi_1.collect())
-        self.assertEquals(expected_multi, df_multi_2.collect())
+        self.assertEqual(expected_multi, df_multi_1.collect())
+        self.assertEqual(expected_multi, df_multi_2.collect())

     def test_mixed_udf_and_sql(self):
         df = self.spark.range(0, 1).toDF('v')
@@ -1107,7 +1107,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
             .withColumn('f3_f1_f2', f3(f1(f2(df['v'])))) \
             .withColumn('f3_f2_f1', f3(f2(f1(df['v']))))

-        self.assertEquals(expected, df1.collect())
+        self.assertEqual(expected, df1.collect())

     # SPARK-24721
     @unittest.skipIf(not test_compiled, test_not_compiled_message)  # type: ignore
@@ -1138,17 +1138,17 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
             for df in [filesource_df, datasource_df, datasource_v2_df]:
                 result = df.withColumn('c', c1)
                 expected = df.withColumn('c', lit(2))
-                self.assertEquals(expected.collect(), result.collect())
+                self.assertEqual(expected.collect(), result.collect())

             for df in [filesource_df, datasource_df, datasource_v2_df]:
                 result = df.withColumn('c', c2)
                 expected = df.withColumn('c', col('i') + 1)
-                self.assertEquals(expected.collect(), result.collect())
+                self.assertEqual(expected.collect(), result.collect())

             for df in [filesource_df, datasource_df, datasource_v2_df]:
                 for f in [f1, f2]:
                     result = df.filter(f)
-                    self.assertEquals(0, result.count())
+                    self.assertEqual(0, result.count())

         finally:
             shutil.rmtree(path)
diff --git a/python/pyspark/sql/tests/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/test_pandas_udf_typehints.py
index d9717da4d2..e30f43181a 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_typehints.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_typehints.py
@@ -29,7 +29,7 @@ from pyspark.sql import Row
 if have_pandas:
     import pandas as pd
     import numpy as np
-    from pandas.util.testing import assert_frame_equal
+    from pandas.testing import assert_frame_equal


 @unittest.skipIf(
diff --git a/python/pyspark/sql/tests/test_pandas_udf_window.py b/python/pyspark/sql/tests/test_pandas_udf_window.py
index 5ad2ecd8f8..d861bcce9e 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_window.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_window.py
@@ -26,7 +26,7 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarro
 from pyspark.testing.utils import QuietTest

 if have_pandas:
-    from pandas.util.testing import assert_frame_equal
+    from pandas.testing import assert_frame_equal


 @unittest.skipIf(
@@ -241,14 +241,14 @@ class WindowPandasUDFTests(ReusedSQLTestCase):
         array_udf = pandas_udf(lambda x: [1.0, 2.0], 'array<double>', PandasUDFType.GROUPED_AGG)
         result1 = df.withColumn('v2', array_udf(df['v']).over(w))
-        self.assertEquals(result1.first()['v2'], [1.0, 2.0])
+        self.assertEqual(result1.first()['v2'], [1.0, 2.0])

     def test_invalid_args(self):
         df = self.data
         w = self.unbounded_window

         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(
+            with self.assertRaisesRegex(
                     AnalysisException,
                     '.*not supported within a window function'):
                 foo_udf = pandas_udf(lambda x: x, 'v double', PandasUDFType.GROUPED_MAP)
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 6b5c1ad6c4..eb4caf05d1 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -180,7 +180,7 @@ class TypesTests(ReusedSQLTestCase):
         self.assertEqual(df.columns, ['col1', '_2'])

     def test_infer_schema_fails(self):
-        with self.assertRaisesRegexp(TypeError, 'field a'):
+        with self.assertRaisesRegex(TypeError, 'field a'):
             self.spark.createDataFrame(self.spark.sparkContext.parallelize([[1, 1], ["x", 1]]),
                                        schema=["a", "b"], samplingRatio=0.99)
@@ -578,18 +578,18 @@ class TypesTests(ReusedSQLTestCase):
             ArrayType(LongType()),
             ArrayType(LongType())
         ), ArrayType(LongType()))
-        with self.assertRaisesRegexp(TypeError, 'element in array'):
+        with self.assertRaisesRegex(TypeError, 'element in array'):
             _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))

         self.assertEqual(_merge_type(
             MapType(StringType(), LongType()),
             MapType(StringType(), LongType())
         ), MapType(StringType(), LongType()))
-        with self.assertRaisesRegexp(TypeError, 'key of map'):
+        with self.assertRaisesRegex(TypeError, 'key of map'):
             _merge_type(
                 MapType(StringType(), LongType()),
                 MapType(DoubleType(), LongType()))
-        with self.assertRaisesRegexp(TypeError, 'value of map'):
+        with self.assertRaisesRegex(TypeError, 'value of map'):
             _merge_type(
                 MapType(StringType(), LongType()),
                 MapType(StringType(), DoubleType()))
@@ -598,7 +598,7 @@ class TypesTests(ReusedSQLTestCase):
             StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
             StructType([StructField("f1", LongType()), StructField("f2", StringType())])
         ), StructType([StructField("f1", LongType()), StructField("f2", StringType())]))
-        with self.assertRaisesRegexp(TypeError, 'field f1'):
+        with self.assertRaisesRegex(TypeError, 'field f1'):
             _merge_type(
                 StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
                 StructType([StructField("f1", DoubleType()), StructField("f2", StringType())]))
@@ -607,7 +607,7 @@ class TypesTests(ReusedSQLTestCase):
             StructType([StructField("f1", StructType([StructField("f2", LongType())]))]),
             StructType([StructField("f1", StructType([StructField("f2", LongType())]))])
         ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))]))
-        with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'):
+        with self.assertRaisesRegex(TypeError, 'field f2 in field f1'):
             _merge_type(
                 StructType([StructField("f1", StructType([StructField("f2", LongType())]))]),
                 StructType([StructField("f1", StructType([StructField("f2", StringType())]))]))
@@ -616,7 +616,7 @@ class TypesTests(ReusedSQLTestCase):
             StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]),
             StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])
         ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]))
-        with self.assertRaisesRegexp(TypeError, 'element in array field f1'):
+        with self.assertRaisesRegex(TypeError, 'element in array field f1'):
             _merge_type(
                 StructType([
                     StructField("f1", ArrayType(LongType())),
@@ -635,7 +635,7 @@ class TypesTests(ReusedSQLTestCase):
         ), StructType([
             StructField("f1", MapType(StringType(), LongType())),
             StructField("f2", StringType())]))
-        with self.assertRaisesRegexp(TypeError, 'value of map field f1'):
+        with self.assertRaisesRegex(TypeError, 'value of map field f1'):
             _merge_type(
                 StructType([
                     StructField("f1", MapType(StringType(), LongType())),
@@ -648,7 +648,7 @@ class TypesTests(ReusedSQLTestCase):
             StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]),
             StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])
         ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]))
-        with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'):
+        with self.assertRaisesRegex(TypeError, 'key of map element in array field f1'):
             _merge_type(
                 StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]),
                 StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))])
@@ -734,7 +734,7 @@ class TypesTests(ReusedSQLTestCase):
         unsupported_types = all_types - set(supported_types)
         # test unsupported types
         for t in unsupported_types:
-            with self.assertRaisesRegexp(TypeError, "infer the type of the field myarray"):
+            with self.assertRaisesRegex(TypeError, "infer the type of the field myarray"):
                 a = array.array(t)
                 self.spark.createDataFrame([Row(myarray=a)]).collect()
@@ -789,13 +789,13 @@ class DataTypeTests(unittest.TestCase):
 class DataTypeVerificationTests(unittest.TestCase):

     def test_verify_type_exception_msg(self):
-        self.assertRaisesRegexp(
+        self.assertRaisesRegex(
             ValueError,
             "test_name",
             lambda: _make_type_verifier(StringType(), nullable=False, name="test_name")(None))

         schema = StructType([StructField('a', StructType([StructField('b', IntegerType())]))])
-        self.assertRaisesRegexp(
+        self.assertRaisesRegex(
             TypeError,
             "field b in field a",
             lambda: _make_type_verifier(schema)([["data"]]))
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
index 9a1c0edcce..bfc55dff94 100644
--- a/python/pyspark/sql/tests/test_udf.py
+++ b/python/pyspark/sql/tests/test_udf.py
@@ -98,7 +98,7 @@ class UDFTests(ReusedSQLTestCase):
     def test_udf_registration_return_type_not_none(self):
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(TypeError, "Invalid return type"):
+            with self.assertRaisesRegex(TypeError, "Invalid return type"):
                 self.spark.catalog.registerFunction(
                     "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType())
@@ -149,9 +149,9 @@ class UDFTests(ReusedSQLTestCase):
         df = self.spark.range(10)

         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(AnalysisException, "nondeterministic"):
+            with self.assertRaisesRegex(AnalysisException, "nondeterministic"):
                 df.groupby('id').agg(sum(udf_random_col())).collect()
-            with self.assertRaisesRegexp(AnalysisException, "nondeterministic"):
+            with self.assertRaisesRegex(AnalysisException, "nondeterministic"):
                 df.agg(sum(udf_random_col())).collect()

     def test_chained_udf(self):
@@ -203,7 +203,7 @@ class UDFTests(ReusedSQLTestCase):
         # Cross join.
         df = left.join(right, f("a", "b"))
         with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
-            with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'):
+            with self.assertRaisesRegex(AnalysisException, 'Detected implicit cartesian product'):
                 df.collect()
         with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
             self.assertEqual(df.collect(), [Row(a=1, b=1)])
@@ -238,7 +238,7 @@ class UDFTests(ReusedSQLTestCase):
         f = udf(lambda a, b: a == b, BooleanType())

         def runWithJoinType(join_type, type_string):
-            with self.assertRaisesRegexp(
+            with self.assertRaisesRegex(
                     AnalysisException,
                     'Using PythonUDF.*%s is not supported.' % type_string):
                 left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect()
@@ -385,18 +385,18 @@ class UDFTests(ReusedSQLTestCase):

     def test_non_existed_udf(self):
         spark = self.spark
-        self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf",
-                                lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf"))
+        self.assertRaisesRegex(AnalysisException, "Can not load class non_existed_udf",
+                               lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf"))

         # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias.
         sqlContext = spark._wrapped
-        self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf",
-                                lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf"))
+        self.assertRaisesRegex(AnalysisException, "Can not load class non_existed_udf",
+                               lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf"))

     def test_non_existed_udaf(self):
         spark = self.spark
-        self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udaf",
-                                lambda: spark.udf.registerJavaUDAF("udaf1", "non_existed_udaf"))
+        self.assertRaisesRegex(AnalysisException, "Can not load class non_existed_udaf",
+                               lambda: spark.udf.registerJavaUDAF("udaf1", "non_existed_udaf"))

     def test_udf_with_input_file_name(self):
         from pyspark.sql.functions import input_file_name
@@ -587,17 +587,17 @@ class UDFTests(ReusedSQLTestCase):
             for df in [filesource_df, datasource_df, datasource_v2_df]:
                 result = df.withColumn('c', c1)
                 expected = df.withColumn('c', lit(2))
-                self.assertEquals(expected.collect(), result.collect())
+                self.assertEqual(expected.collect(), result.collect())

             for df in [filesource_df, datasource_df, datasource_v2_df]:
                 result = df.withColumn('c', c2)
                 expected = df.withColumn('c', col('i') + 1)
-                self.assertEquals(expected.collect(), result.collect())
+                self.assertEqual(expected.collect(), result.collect())

             for df in [filesource_df, datasource_df, datasource_v2_df]:
                 for f in [f1, f2]:
                     result = df.filter(f)
-                    self.assertEquals(0, result.count())
+                    self.assertEqual(0, result.count())

         finally:
             shutil.rmtree(path)
diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py
index b08e17208d..005f0e892b 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -31,23 +31,22 @@ class UtilsTests(ReusedSQLTestCase):
         try:
             self.spark.sql("select `中文字段`")
         except AnalysisException as e:
-            self.assertRegexpMatches(str(e), "cannot resolve '`中文字段`'")
+            self.assertRegex(str(e), "cannot resolve '`中文字段`'")

     def test_capture_parse_exception(self):
         self.assertRaises(ParseException, lambda: self.spark.sql("abc"))

     def test_capture_illegalargument_exception(self):
-        self.assertRaisesRegexp(IllegalArgumentException, "Setting negative mapred.reduce.tasks",
-                                lambda: self.spark.sql("SET mapred.reduce.tasks=-1"))
+        self.assertRaisesRegex(IllegalArgumentException, "Setting negative mapred.reduce.tasks",
+                               lambda: self.spark.sql("SET mapred.reduce.tasks=-1"))
         df = self.spark.createDataFrame([(1, 2)], ["a", "b"])
-        self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values",
-                                lambda: df.select(sha2(df.a, 1024)).collect())
+        self.assertRaisesRegex(IllegalArgumentException, "1024 is not in the permitted values",
+                               lambda: df.select(sha2(df.a, 1024)).collect())
         try:
             df.select(sha2(df.a, 1024)).collect()
         except IllegalArgumentException as e:
-            self.assertRegexpMatches(e.desc, "1024 is not in the permitted values")
-            self.assertRegexpMatches(e.stackTrace,
-                                     "org.apache.spark.sql.functions")
+            self.assertRegex(e.desc, "1024 is not in the permitted values")
+            self.assertRegex(e.stackTrace, "org.apache.spark.sql.functions")


 if __name__ == "__main__":
diff --git a/python/pyspark/tests/test_profiler.py b/python/pyspark/tests/test_profiler.py
index de72a547b0..e621321283 100644
--- a/python/pyspark/tests/test_profiler.py
+++ b/python/pyspark/tests/test_profiler.py
@@ -85,11 +85,11 @@ class ProfilerTests2(unittest.TestCase):
     def test_profiler_disabled(self):
         sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
         try:
-            self.assertRaisesRegexp(
+            self.assertRaisesRegex(
                 RuntimeError,
                 "'spark.python.profile' configuration must be set",
                 lambda: sc.show_profiles())
-            self.assertRaisesRegexp(
+            self.assertRaisesRegex(
                 RuntimeError,
                 "'spark.python.profile' configuration must be set",
                 lambda: sc.dump_profiles("/tmp/abc"))
diff --git a/python/pyspark/tests/test_rdd.py b/python/pyspark/tests/test_rdd.py
index 47b8f10a5b..b17c039889 100644
--- a/python/pyspark/tests/test_rdd.py
+++ b/python/pyspark/tests/test_rdd.py
@@ -733,25 +733,25 @@ class RDDTests(ReusedPySparkTestCase):
         keyed_rdd = self.sc.parallelize((x % 2, x) for x in range(10))
         msg = "Caught StopIteration thrown from user's code; failing the task"

-        self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.map(stopit).collect)
-        self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.filter(stopit).collect)
-        self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit)
-        self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.reduce, stopit)
-        self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.fold, 0, stopit)
-        self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit)
-        self.assertRaisesRegexp(Py4JJavaError, msg,
-                                seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect)
+        self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.map(stopit).collect)
+        self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.filter(stopit).collect)
+        self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.foreach, stopit)
+        self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.reduce, stopit)
+        self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.fold, 0, stopit)
+        self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.foreach, stopit)
+        self.assertRaisesRegex(Py4JJavaError, msg,
+                               seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect)

         # these methods call the user function both in the driver and in the executor
         # the exception raised is different according to where the StopIteration happens
         # RuntimeError is raised if in the driver
         # Py4JJavaError is raised if in the executor (wraps the RuntimeError raised in the worker)
-        self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg,
-                                keyed_rdd.reduceByKeyLocally, stopit)
-        self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg,
-                                seq_rdd.aggregate, 0, stopit, lambda *x: 1)
-        self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg,
-                                seq_rdd.aggregate, 0, lambda *x: 1, stopit)
+        self.assertRaisesRegex((Py4JJavaError, RuntimeError), msg,
+                               keyed_rdd.reduceByKeyLocally, stopit)
+        self.assertRaisesRegex((Py4JJavaError, RuntimeError), msg,
+                               seq_rdd.aggregate, 0, stopit, lambda *x: 1)
+        self.assertRaisesRegex((Py4JJavaError, RuntimeError), msg,
+                               seq_rdd.aggregate, 0, lambda *x: 1, stopit)

     def test_overwritten_global_func(self):
         # Regression test for SPARK-27000
@@ -768,7 +768,7 @@ class RDDTests(ReusedPySparkTestCase):

         rdd = self.sc.range(10).map(fail)

-        with self.assertRaisesRegexp(Exception, "local iterator error"):
+        with self.assertRaisesRegex(Exception, "local iterator error"):
             for _ in rdd.toLocalIterator():
                 pass
diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
index d7a4b84e8d..51ebee4de7 100644
--- a/python/pyspark/tests/test_worker.py
+++ b/python/pyspark/tests/test_worker.py
@@ -165,7 +165,7 @@ class WorkerTests(ReusedPySparkTestCase):
             self.sc.parallelize([1]).map(lambda x: f()).count()
         except Py4JJavaError as e:
-            self.assertRegexpMatches(str(e), "exception with 中")
+            self.assertRegex(str(e), "exception with 中")


 class WorkerReuseTest(PySparkTestCase):
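Background on the renames applied throughout this patch: assertRaisesRegexp, assertRegexpMatches, and assertEquals are deprecated aliases in the standard-library unittest module (removed entirely in recent Python releases), and pandas.testing is the supported import location for assert_frame_equal, replacing the deprecated pandas.util.testing path. The standalone sketch below is illustrative only and is not part of the patch; the test class and test names are hypothetical, and it assumes only the standard library plus an installed pandas.

import unittest

import pandas as pd
from pandas.testing import assert_frame_equal


class ModernAssertionSpellings(unittest.TestCase):
    # Demonstrates the non-deprecated method names used in the patch above.

    def test_assert_raises_regex(self):
        # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias;
        # the pattern is matched with re.search against str(exception).
        with self.assertRaisesRegex(ValueError, "invalid literal"):
            int("not a number")

    def test_assert_equal_and_assert_regex(self):
        # assertEqual and assertRegex replace assertEquals and assertRegexpMatches.
        self.assertEqual([1, 2, 3], list(range(1, 4)))
        self.assertRegex("1024 is not in the permitted values", "not in the permitted")

    def test_assert_frame_equal(self):
        # assert_frame_equal now lives in pandas.testing rather than pandas.util.testing.
        left = pd.DataFrame({"id": [1, 2], "v": [0.1, 0.2]})
        right = pd.DataFrame({"id": [1, 2], "v": [0.1, 0.2]})
        assert_frame_equal(left, right)


if __name__ == "__main__":
    unittest.main()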