From 91bd38467e607dde81d4c83fa3e1c989f8280e89 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 16 Apr 2021 17:42:03 +0900 Subject: [PATCH] [SPARK-34995] Port/integrate Koalas remaining codes into PySpark ### What changes were proposed in this pull request? There have been some more changes in Koalas, such as [databricks/koalas#2141](https://github.com/databricks/koalas/commit/c8f803d6becb3accd767afdb3774c8656d0d0b47) and [databricks/koalas#2143](https://github.com/databricks/koalas/commit/913d68868d38ee7158c640aceb837484f417267e), since the main code porting. This PR synchronizes those changes with `pyspark.pandas`. ### Why are the changes needed? We should port the entire Koalas codebase into PySpark and keep the two synchronized. ### Does this PR introduce _any_ user-facing change? Fixed some incompatible behavior with pandas 1.2.0 and added more to the `to_markdown` docstring. ### How was this patch tested? Manually tested locally. Closes #32197 from itholic/SPARK-34995-fix. Authored-by: itholic Signed-off-by: HyukjinKwon --- python/pyspark/pandas/generic.py | 4 + python/pyspark/pandas/indexing.py | 19 +++++ python/pyspark/pandas/tests/test_indexing.py | 11 +-- .../pandas/tests/test_ops_on_diff_frames.py | 76 ++++++++----------- 4 files changed, 58 insertions(+), 52 deletions(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 5e97f2aa8d..0140ed5e2a 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -2872,6 +2872,10 @@ class Frame(object, metaclass=ABCMeta): str Series or DataFrame in Markdown-friendly format. + Notes + ----- + Requires the `tabulate <https://pypi.org/project/tabulate>`_ package. 
+ Examples -------- >>> kser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal") diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index c13fd6b3ad..0b7c0b2a10 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -1698,6 +1698,25 @@ class iLocIndexer(LocIndexerLike): ) def __setitem__(self, key, value): + if is_list_like(value) and not isinstance(value, spark.Column): + iloc_item = self[key] + if not is_list_like(key) or not is_list_like(iloc_item): + raise ValueError("setting an array element with a sequence.") + else: + shape_iloc_item = iloc_item.shape + len_iloc_item = shape_iloc_item[0] + len_value = len(value) + if len_iloc_item != len_value: + if self._is_series: + raise ValueError( + "cannot set using a list-like indexer with a different length than " + "the value" + ) + else: + raise ValueError( + "shape mismatch: value array of shape ({},) could not be broadcast " + "to indexing result of shape {}".format(len_value, shape_iloc_item) + ) super().__setitem__(key, value) # Update again with resolved_copy to drop extra columns. self._kdf._update_internal_frame( diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index 89647c7fb6..8298767f67 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -1089,7 +1089,7 @@ class IndexingTest(ReusedSQLTestCase): kdf.iloc[0, 1] = 50 self.assert_eq(kdf, pdf) - with self.assertRaisesRegex(ValueError, "Incompatible indexer with Series"): + with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."): kdf.iloc[0, 0] = -kdf.max_speed with self.assertRaisesRegex(ValueError, "shape mismatch"): kdf.iloc[:, [1, 0]] = -kdf.max_speed @@ -1227,14 +1227,7 @@ class IndexingTest(ReusedSQLTestCase): self.assert_eq(kser, pser) self.assert_eq(kdf, pdf) - # TODO: matching the behavior with pandas 1.2 and uncomment below test. 
- # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser.iloc[[1]] = -kdf.b - - with self.assertRaisesRegex(ValueError, "Incompatible indexer with DataFrame"): + with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."): kser.iloc[1] = kdf[["b"]] def test_iloc_raises(self): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 9070b5ad7a..d567bae3cd 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -1151,25 +1151,17 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): pdf.iloc[[0, 1, 2], 1] = -pdf.max_speed self.assert_eq(kdf, pdf) - # TODO: matching the behavior with pandas 1.2 and uncomment below test - # with self.assertRaisesRegex( - # ValueError, - # "shape mismatch: value array of shape (3,) could not be broadcast to indexing " - # "result of shape (2,1)", - # ): - # kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed + with self.assertRaisesRegex( + ValueError, "shape mismatch", + ): + kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed kdf.iloc[[0, 1, 2], 1] = 10 * another_kdf.max_speed pdf.iloc[[0, 1, 2], 1] = 10 * pdf.max_speed self.assert_eq(kdf, pdf) - # TODO: matching the behavior with pandas 1.2 and uncomment below test - # with self.assertRaisesRegex( - # ValueError, - # "shape mismatch: value array of shape (3,) could not be broadcast to indexing " - # "result of shape (1,)", - # ): - # kdf.iloc[[0], 1] = 10 * another_kdf.max_speed + with self.assertRaisesRegex(ValueError, "shape mismatch"): + kdf.iloc[[0], 1] = 10 * another_kdf.max_speed def test_series_loc_setitem(self): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) @@ -1269,12 +1261,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): 
self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # TODO: matching the behavior with pandas 1.2 and uncomment below test. - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser.iloc[[1, 2]] = -kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kser.iloc[[1, 2]] = -kser_another kser.iloc[[0, 1, 2]] = 10 * kser_another pser.iloc[[0, 1, 2]] = 10 * pser_another @@ -1282,11 +1273,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser.iloc[[0]] = 10 * kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kser.iloc[[0]] = 10 * kser_another kser1.iloc[[0, 1, 2]] = -kser_another pser1.iloc[[0, 1, 2]] = -pser_another @@ -1294,11 +1285,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser1.iloc[[1, 2]] = -kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kser1.iloc[[1, 2]] = -kser_another pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) kdf = ps.from_pandas(pdf) @@ -1317,12 +1308,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # TODO: matching the behavior with pandas 1.2 and uncomment below test. 
- # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kiloc[[1, 2]] = -kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kiloc[[1, 2]] = -kser_another kiloc[[0, 1, 2]] = 10 * kser_another piloc[[0, 1, 2]] = 10 * pser_another @@ -1330,11 +1320,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kiloc[[0]] = 10 * kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kiloc[[0]] = 10 * kser_another def test_update(self): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]}) @@ -1863,7 +1853,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): another_kdf = ps.DataFrame(pdf) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - kdf.iloc[[1, 2], [1]] = another_kdf.max_speed + kdf.iloc[[1, 2], [1]] = another_kdf.max_speed.iloc[[1, 2]] def test_series_loc_setitem(self): pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"]) @@ -1889,7 +1879,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): kser_another = ps.from_pandas(pser_another) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - kser.iloc[[1]] = -kser_another + kser.iloc[[1]] = -kser_another.iloc[[1]] def test_where(self): pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]})