From 91bd38467e607dde81d4c83fa3e1c989f8280e89 Mon Sep 17 00:00:00 2001
From: itholic <haejoon.lee@databricks.com>
Date: Fri, 16 Apr 2021 17:42:03 +0900
Subject: [PATCH] [SPARK-34995] Port/integrate Koalas remaining codes into
 PySpark

### What changes were proposed in this pull request?

Some additional changes landed in Koalas after the main code port, such as [databricks/koalas#2141](https://github.com/databricks/koalas/commit/c8f803d6becb3accd767afdb3774c8656d0d0b47) and [databricks/koalas#2143](https://github.com/databricks/koalas/commit/913d68868d38ee7158c640aceb837484f417267e). This PR synchronizes those changes with `pyspark.pandas`.

### Why are the changes needed?

We should port the entire Koalas codebase into PySpark and keep the two synchronized.

### Does this PR introduce _any_ user-facing change?

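Yes. This fixes `iloc` item-assignment behavior that was incompatible with pandas 1.2.0 (a list-like value whose length does not match the indexing result now raises `ValueError`) and adds a note about the `tabulate` requirement to the `to_markdown` docstring.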

### How was this patch tested?

Manually tested locally.

Closes #32197 from itholic/SPARK-34995-fix.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
---
 python/pyspark/pandas/generic.py              |  4 +
 python/pyspark/pandas/indexing.py             | 19 +++++
 python/pyspark/pandas/tests/test_indexing.py  | 11 +--
 .../pandas/tests/test_ops_on_diff_frames.py   | 76 ++++++++-----------
 4 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 5e97f2aa8d..0140ed5e2a 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -2872,6 +2872,10 @@ class Frame(object, metaclass=ABCMeta):
         str
             Series or DataFrame in Markdown-friendly format.
 
+        Notes
+        -----
+        Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
+
         Examples
         --------
         >>> kser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")
diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py
index c13fd6b3ad..0b7c0b2a10 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -1698,6 +1698,25 @@ class iLocIndexer(LocIndexerLike):
             )
 
     def __setitem__(self, key, value):
+        if is_list_like(value) and not isinstance(value, spark.Column):
+            iloc_item = self[key]
+            if not is_list_like(key) or not is_list_like(iloc_item):
+                raise ValueError("setting an array element with a sequence.")
+            else:
+                shape_iloc_item = iloc_item.shape
+                len_iloc_item = shape_iloc_item[0]
+                len_value = len(value)
+                if len_iloc_item != len_value:
+                    if self._is_series:
+                        raise ValueError(
+                            "cannot set using a list-like indexer with a different length than "
+                            "the value"
+                        )
+                    else:
+                        raise ValueError(
+                            "shape mismatch: value array of shape ({},) could not be broadcast "
+                            "to indexing result of shape {}".format(len_value, shape_iloc_item)
+                        )
         super().__setitem__(key, value)
         # Update again with resolved_copy to drop extra columns.
         self._kdf._update_internal_frame(
diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py
index 89647c7fb6..8298767f67 100644
--- a/python/pyspark/pandas/tests/test_indexing.py
+++ b/python/pyspark/pandas/tests/test_indexing.py
@@ -1089,7 +1089,7 @@ class IndexingTest(ReusedSQLTestCase):
         kdf.iloc[0, 1] = 50
         self.assert_eq(kdf, pdf)
 
-        with self.assertRaisesRegex(ValueError, "Incompatible indexer with Series"):
+        with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
             kdf.iloc[0, 0] = -kdf.max_speed
         with self.assertRaisesRegex(ValueError, "shape mismatch"):
             kdf.iloc[:, [1, 0]] = -kdf.max_speed
@@ -1227,14 +1227,7 @@ class IndexingTest(ReusedSQLTestCase):
         self.assert_eq(kser, pser)
         self.assert_eq(kdf, pdf)
 
-        # TODO: matching the behavior with pandas 1.2 and uncomment below test.
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "cannot set using a list-like indexer with a different length than the value",
-        # ):
-        #     kser.iloc[[1]] = -kdf.b
-
-        with self.assertRaisesRegex(ValueError, "Incompatible indexer with DataFrame"):
+        with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
             kser.iloc[1] = kdf[["b"]]
 
     def test_iloc_raises(self):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 9070b5ad7a..d567bae3cd 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -1151,25 +1151,17 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
         pdf.iloc[[0, 1, 2], 1] = -pdf.max_speed
         self.assert_eq(kdf, pdf)
 
-        # TODO: matching the behavior with pandas 1.2 and uncomment below test
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
-        #     "result of shape (2,1)",
-        # ):
-        #     kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
+        with self.assertRaisesRegex(
+            ValueError, "shape mismatch",
+        ):
+            kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
 
         kdf.iloc[[0, 1, 2], 1] = 10 * another_kdf.max_speed
         pdf.iloc[[0, 1, 2], 1] = 10 * pdf.max_speed
         self.assert_eq(kdf, pdf)
 
-        # TODO: matching the behavior with pandas 1.2 and uncomment below test
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
-        #     "result of shape (1,)",
-        # ):
-        #     kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
+        with self.assertRaisesRegex(ValueError, "shape mismatch"):
+            kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
 
     def test_series_loc_setitem(self):
         pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
@@ -1269,12 +1261,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
         self.assert_eq(kdf, pdf)
         self.assert_eq(ksery, psery)
 
-        # TODO: matching the behavior with pandas 1.2 and uncomment below test.
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "cannot set using a list-like indexer with a different length than the value",
-        # ):
-        #     kser.iloc[[1, 2]] = -kser_another
+        with self.assertRaisesRegex(
+            ValueError,
+            "cannot set using a list-like indexer with a different length than the value",
+        ):
+            kser.iloc[[1, 2]] = -kser_another
 
         kser.iloc[[0, 1, 2]] = 10 * kser_another
         pser.iloc[[0, 1, 2]] = 10 * pser_another
@@ -1282,11 +1273,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
         self.assert_eq(kdf, pdf)
         self.assert_eq(ksery, psery)
 
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "cannot set using a list-like indexer with a different length than the value",
-        # ):
-        #     kser.iloc[[0]] = 10 * kser_another
+        with self.assertRaisesRegex(
+            ValueError,
+            "cannot set using a list-like indexer with a different length than the value",
+        ):
+            kser.iloc[[0]] = 10 * kser_another
 
         kser1.iloc[[0, 1, 2]] = -kser_another
         pser1.iloc[[0, 1, 2]] = -pser_another
@@ -1294,11 +1285,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
         self.assert_eq(kdf, pdf)
         self.assert_eq(ksery, psery)
 
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "cannot set using a list-like indexer with a different length than the value",
-        # ):
-        #     kser1.iloc[[1, 2]] = -kser_another
+        with self.assertRaisesRegex(
+            ValueError,
+            "cannot set using a list-like indexer with a different length than the value",
+        ):
+            kser1.iloc[[1, 2]] = -kser_another
 
         pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
         kdf = ps.from_pandas(pdf)
@@ -1317,12 +1308,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
         self.assert_eq(kdf, pdf)
         self.assert_eq(ksery, psery)
 
-        # TODO: matching the behavior with pandas 1.2 and uncomment below test.
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "cannot set using a list-like indexer with a different length than the value",
-        # ):
-        #     kiloc[[1, 2]] = -kser_another
+        with self.assertRaisesRegex(
+            ValueError,
+            "cannot set using a list-like indexer with a different length than the value",
+        ):
+            kiloc[[1, 2]] = -kser_another
 
         kiloc[[0, 1, 2]] = 10 * kser_another
         piloc[[0, 1, 2]] = 10 * pser_another
@@ -1330,11 +1320,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
         self.assert_eq(kdf, pdf)
         self.assert_eq(ksery, psery)
 
-        # with self.assertRaisesRegex(
-        #     ValueError,
-        #     "cannot set using a list-like indexer with a different length than the value",
-        # ):
-        #     kiloc[[0]] = 10 * kser_another
+        with self.assertRaisesRegex(
+            ValueError,
+            "cannot set using a list-like indexer with a different length than the value",
+        ):
+            kiloc[[0]] = 10 * kser_another
 
     def test_update(self):
         pdf = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
@@ -1863,7 +1853,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
         another_kdf = ps.DataFrame(pdf)
 
         with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
-            kdf.iloc[[1, 2], [1]] = another_kdf.max_speed
+            kdf.iloc[[1, 2], [1]] = another_kdf.max_speed.iloc[[1, 2]]
 
     def test_series_loc_setitem(self):
         pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"])
@@ -1889,7 +1879,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
         kser_another = ps.from_pandas(pser_another)
 
         with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
-            kser.iloc[[1]] = -kser_another
+            kser.iloc[[1]] = -kser_another.iloc[[1]]
 
     def test_where(self):
         pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]})