[SPARK-34995] Port/integrate Koalas remaining codes into PySpark
### What changes were proposed in this pull request?

Some further changes landed in Koalas after the main code porting, such as [databricks/koalas#2141](c8f803d6be) and [databricks/koalas#2143](913d68868d). This PR synchronizes those changes with `pyspark.pandas`.

### Why are the changes needed?

We should port the entire Koalas codebase into PySpark and keep the two in sync.

### Does this PR introduce _any_ user-facing change?

Yes. It fixes some behavior that was incompatible with pandas 1.2.0 and adds more detail to the `to_markdown` docstring.

### How was this patch tested?

Manually tested locally.

Closes #32197 from itholic/SPARK-34995-fix.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
parent
95db7e6459
commit
91bd38467e
|
@ -2872,6 +2872,10 @@ class Frame(object, metaclass=ABCMeta):
|
|||
str
|
||||
Series or DataFrame in Markdown-friendly format.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> kser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")
|
||||
|
|
|
@ -1698,6 +1698,25 @@ class iLocIndexer(LocIndexerLike):
|
|||
)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
if is_list_like(value) and not isinstance(value, spark.Column):
|
||||
iloc_item = self[key]
|
||||
if not is_list_like(key) or not is_list_like(iloc_item):
|
||||
raise ValueError("setting an array element with a sequence.")
|
||||
else:
|
||||
shape_iloc_item = iloc_item.shape
|
||||
len_iloc_item = shape_iloc_item[0]
|
||||
len_value = len(value)
|
||||
if len_iloc_item != len_value:
|
||||
if self._is_series:
|
||||
raise ValueError(
|
||||
"cannot set using a list-like indexer with a different length than "
|
||||
"the value"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"shape mismatch: value array of shape ({},) could not be broadcast "
|
||||
"to indexing result of shape {}".format(len_value, shape_iloc_item)
|
||||
)
|
||||
super().__setitem__(key, value)
|
||||
# Update again with resolved_copy to drop extra columns.
|
||||
self._kdf._update_internal_frame(
|
||||
|
|
|
@ -1089,7 +1089,7 @@ class IndexingTest(ReusedSQLTestCase):
|
|||
kdf.iloc[0, 1] = 50
|
||||
self.assert_eq(kdf, pdf)
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "Incompatible indexer with Series"):
|
||||
with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
|
||||
kdf.iloc[0, 0] = -kdf.max_speed
|
||||
with self.assertRaisesRegex(ValueError, "shape mismatch"):
|
||||
kdf.iloc[:, [1, 0]] = -kdf.max_speed
|
||||
|
@ -1227,14 +1227,7 @@ class IndexingTest(ReusedSQLTestCase):
|
|||
self.assert_eq(kser, pser)
|
||||
self.assert_eq(kdf, pdf)
|
||||
|
||||
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "cannot set using a list-like indexer with a different length than the value",
|
||||
# ):
|
||||
# kser.iloc[[1]] = -kdf.b
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "Incompatible indexer with DataFrame"):
|
||||
with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
|
||||
kser.iloc[1] = kdf[["b"]]
|
||||
|
||||
def test_iloc_raises(self):
|
||||
|
|
|
@ -1151,25 +1151,17 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
pdf.iloc[[0, 1, 2], 1] = -pdf.max_speed
|
||||
self.assert_eq(kdf, pdf)
|
||||
|
||||
# TODO: matching the behavior with pandas 1.2 and uncomment below test
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
|
||||
# "result of shape (2,1)",
|
||||
# ):
|
||||
# kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
|
||||
with self.assertRaisesRegex(
|
||||
ValueError, "shape mismatch",
|
||||
):
|
||||
kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
|
||||
|
||||
kdf.iloc[[0, 1, 2], 1] = 10 * another_kdf.max_speed
|
||||
pdf.iloc[[0, 1, 2], 1] = 10 * pdf.max_speed
|
||||
self.assert_eq(kdf, pdf)
|
||||
|
||||
# TODO: matching the behavior with pandas 1.2 and uncomment below test
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
|
||||
# "result of shape (1,)",
|
||||
# ):
|
||||
# kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
|
||||
with self.assertRaisesRegex(ValueError, "shape mismatch"):
|
||||
kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
|
||||
|
||||
def test_series_loc_setitem(self):
|
||||
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
|
||||
|
@ -1269,12 +1261,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
self.assert_eq(kdf, pdf)
|
||||
self.assert_eq(ksery, psery)
|
||||
|
||||
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "cannot set using a list-like indexer with a different length than the value",
|
||||
# ):
|
||||
# kser.iloc[[1, 2]] = -kser_another
|
||||
with self.assertRaisesRegex(
|
||||
ValueError,
|
||||
"cannot set using a list-like indexer with a different length than the value",
|
||||
):
|
||||
kser.iloc[[1, 2]] = -kser_another
|
||||
|
||||
kser.iloc[[0, 1, 2]] = 10 * kser_another
|
||||
pser.iloc[[0, 1, 2]] = 10 * pser_another
|
||||
|
@ -1282,11 +1273,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
self.assert_eq(kdf, pdf)
|
||||
self.assert_eq(ksery, psery)
|
||||
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "cannot set using a list-like indexer with a different length than the value",
|
||||
# ):
|
||||
# kser.iloc[[0]] = 10 * kser_another
|
||||
with self.assertRaisesRegex(
|
||||
ValueError,
|
||||
"cannot set using a list-like indexer with a different length than the value",
|
||||
):
|
||||
kser.iloc[[0]] = 10 * kser_another
|
||||
|
||||
kser1.iloc[[0, 1, 2]] = -kser_another
|
||||
pser1.iloc[[0, 1, 2]] = -pser_another
|
||||
|
@ -1294,11 +1285,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
self.assert_eq(kdf, pdf)
|
||||
self.assert_eq(ksery, psery)
|
||||
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "cannot set using a list-like indexer with a different length than the value",
|
||||
# ):
|
||||
# kser1.iloc[[1, 2]] = -kser_another
|
||||
with self.assertRaisesRegex(
|
||||
ValueError,
|
||||
"cannot set using a list-like indexer with a different length than the value",
|
||||
):
|
||||
kser1.iloc[[1, 2]] = -kser_another
|
||||
|
||||
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
|
||||
kdf = ps.from_pandas(pdf)
|
||||
|
@ -1317,12 +1308,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
self.assert_eq(kdf, pdf)
|
||||
self.assert_eq(ksery, psery)
|
||||
|
||||
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "cannot set using a list-like indexer with a different length than the value",
|
||||
# ):
|
||||
# kiloc[[1, 2]] = -kser_another
|
||||
with self.assertRaisesRegex(
|
||||
ValueError,
|
||||
"cannot set using a list-like indexer with a different length than the value",
|
||||
):
|
||||
kiloc[[1, 2]] = -kser_another
|
||||
|
||||
kiloc[[0, 1, 2]] = 10 * kser_another
|
||||
piloc[[0, 1, 2]] = 10 * pser_another
|
||||
|
@ -1330,11 +1320,11 @@ class OpsOnDiffFramesEnabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
self.assert_eq(kdf, pdf)
|
||||
self.assert_eq(ksery, psery)
|
||||
|
||||
# with self.assertRaisesRegex(
|
||||
# ValueError,
|
||||
# "cannot set using a list-like indexer with a different length than the value",
|
||||
# ):
|
||||
# kiloc[[0]] = 10 * kser_another
|
||||
with self.assertRaisesRegex(
|
||||
ValueError,
|
||||
"cannot set using a list-like indexer with a different length than the value",
|
||||
):
|
||||
kiloc[[0]] = 10 * kser_another
|
||||
|
||||
def test_update(self):
|
||||
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
|
||||
|
@ -1863,7 +1853,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
another_kdf = ps.DataFrame(pdf)
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
|
||||
kdf.iloc[[1, 2], [1]] = another_kdf.max_speed
|
||||
kdf.iloc[[1, 2], [1]] = another_kdf.max_speed.iloc[[1, 2]]
|
||||
|
||||
def test_series_loc_setitem(self):
|
||||
pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"])
|
||||
|
@ -1889,7 +1879,7 @@ class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
|
|||
kser_another = ps.from_pandas(pser_another)
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
|
||||
kser.iloc[[1]] = -kser_another
|
||||
kser.iloc[[1]] = -kser_another.iloc[[1]]
|
||||
|
||||
def test_where(self):
|
||||
pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]})
|
||||
|
|
Loading…
Reference in a new issue