[SPARK-34995] Port/integrate Koalas remaining codes into PySpark

### What changes were proposed in this pull request?

There are some more changes in Koalas such as [databricks/koalas#2141](c8f803d6be), [databricks/koalas#2143](913d68868d) after the main code porting, this PR is to synchronize those changes with the `pyspark.pandas`.

### Why are the changes needed?

We should port the whole Koalas codes into PySpark and synchronize them.

### Does this PR introduce _any_ user-facing change?

Fixed some incompatible behavior with pandas 1.2.0 and added more to the `to_markdown` docstring.

### How was this patch tested?

Manually tested in local.

Closes #32154 from itholic/SPARK-34995.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
itholic 2021-04-15 19:13:08 +09:00 committed by HyukjinKwon
parent 2cb962b132
commit 9689c44b60
3 changed files with 56 additions and 43 deletions

View file

@ -2872,6 +2872,10 @@ class Frame(object, metaclass=ABCMeta):
str
Series or DataFrame in Markdown-friendly format.
Notes
-----
Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
Examples
--------
>>> kser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")

View file

@ -1698,6 +1698,25 @@ class iLocIndexer(LocIndexerLike):
)
def __setitem__(self, key, value):
if is_list_like(value) and not isinstance(value, spark.Column):
iloc_item = self[key]
if not is_list_like(key) or not is_list_like(iloc_item):
raise ValueError("setting an array element with a sequence.")
else:
shape_iloc_item = iloc_item.shape
len_iloc_item = shape_iloc_item[0]
len_value = len(value)
if len_iloc_item != len_value:
if self._is_series:
raise ValueError(
"cannot set using a list-like indexer with a different length than "
"the value"
)
else:
raise ValueError(
"shape mismatch: value array of shape ({},) could not be broadcast "
"to indexing result of shape {}".format(len_value, shape_iloc_item)
)
super().__setitem__(key, value)
# Update again with resolved_copy to drop extra columns.
self._kdf._update_internal_frame(

View file

@ -1151,25 +1151,17 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
pdf.iloc[[0, 1, 2], 1] = -pdf.max_speed
self.assert_eq(kdf, pdf)
# TODO: matching the behavior with pandas 1.2 and uncomment below test
# with self.assertRaisesRegex(
# ValueError,
# "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
# "result of shape (2,1)",
# ):
# kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
with self.assertRaisesRegex(
ValueError, "shape mismatch",
):
kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
kdf.iloc[[0, 1, 2], 1] = 10 * another_kdf.max_speed
pdf.iloc[[0, 1, 2], 1] = 10 * pdf.max_speed
self.assert_eq(kdf, pdf)
# TODO: matching the behavior with pandas 1.2 and uncomment below test
# with self.assertRaisesRegex(
# ValueError,
# "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
# "result of shape (1,)",
# ):
# kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
with self.assertRaisesRegex(ValueError, "shape mismatch"):
kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
def test_series_loc_setitem(self):
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
@ -1269,12 +1261,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
self.assert_eq(kdf, pdf)
self.assert_eq(ksery, psery)
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
# with self.assertRaisesRegex(
# ValueError,
# "cannot set using a list-like indexer with a different length than the value",
# ):
# kser.iloc[[1, 2]] = -kser_another
with self.assertRaisesRegex(
ValueError,
"cannot set using a list-like indexer with a different length than the value",
):
kser.iloc[[1, 2]] = -kser_another
kser.iloc[[0, 1, 2]] = 10 * kser_another
pser.iloc[[0, 1, 2]] = 10 * pser_another
@ -1282,11 +1273,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
self.assert_eq(kdf, pdf)
self.assert_eq(ksery, psery)
# with self.assertRaisesRegex(
# ValueError,
# "cannot set using a list-like indexer with a different length than the value",
# ):
# kser.iloc[[0]] = 10 * kser_another
with self.assertRaisesRegex(
ValueError,
"cannot set using a list-like indexer with a different length than the value",
):
kser.iloc[[0]] = 10 * kser_another
kser1.iloc[[0, 1, 2]] = -kser_another
pser1.iloc[[0, 1, 2]] = -pser_another
@ -1294,11 +1285,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
self.assert_eq(kdf, pdf)
self.assert_eq(ksery, psery)
# with self.assertRaisesRegex(
# ValueError,
# "cannot set using a list-like indexer with a different length than the value",
# ):
# kser1.iloc[[1, 2]] = -kser_another
with self.assertRaisesRegex(
ValueError,
"cannot set using a list-like indexer with a different length than the value",
):
kser1.iloc[[1, 2]] = -kser_another
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
kdf = ps.from_pandas(pdf)
@ -1317,12 +1308,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
self.assert_eq(kdf, pdf)
self.assert_eq(ksery, psery)
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
# with self.assertRaisesRegex(
# ValueError,
# "cannot set using a list-like indexer with a different length than the value",
# ):
# kiloc[[1, 2]] = -kser_another
with self.assertRaisesRegex(
ValueError,
"cannot set using a list-like indexer with a different length than the value",
):
kiloc[[1, 2]] = -kser_another
kiloc[[0, 1, 2]] = 10 * kser_another
piloc[[0, 1, 2]] = 10 * pser_another
@ -1330,11 +1320,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
self.assert_eq(kdf, pdf)
self.assert_eq(ksery, psery)
# with self.assertRaisesRegex(
# ValueError,
# "cannot set using a list-like indexer with a different length than the value",
# ):
# kiloc[[0]] = 10 * kser_another
with self.assertRaisesRegex(
ValueError,
"cannot set using a list-like indexer with a different length than the value",
):
kiloc[[0]] = 10 * kser_another
def test_update(self):
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
@ -1863,7 +1853,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
another_kdf = ps.DataFrame(pdf)
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
kdf.iloc[[1, 2], [1]] = another_kdf.max_speed
kdf.iloc[[1, 2], [1]] = another_kdf.max_speed.iloc[[1, 2]]
def test_series_loc_setitem(self):
pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"])
@ -1889,7 +1879,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
kser_another = ps.from_pandas(pser_another)
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
kser.iloc[[1]] = -kser_another
kser.iloc[[1]] = -kser_another.iloc[[1]]
def test_where(self):
pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]})