2021-04-14 22:45:15 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
|
|
|
import datetime
|
|
|
|
from distutils.version import LooseVersion
|
|
|
|
import unittest
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
from pyspark import pandas as ps
|
|
|
|
from pyspark.pandas.exceptions import SparkPandasIndexingError
|
2021-04-22 16:07:35 -04:00
|
|
|
from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both
|
2021-04-14 22:45:15 -04:00
|
|
|
|
|
|
|
|
|
|
|
class BasicIndexingTest(ComparisonTestBase):
|
|
|
|
@property
|
|
|
|
def pdf(self):
|
|
|
|
return pd.DataFrame(
|
|
|
|
{"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]}
|
|
|
|
)
|
|
|
|
|
|
|
|
@compare_both(almost=False)
|
|
|
|
def test_indexing(self, df):
|
|
|
|
df1 = df.set_index("month")
|
|
|
|
yield df1
|
|
|
|
|
|
|
|
yield df.set_index("month", drop=False)
|
|
|
|
yield df.set_index("month", append=True)
|
|
|
|
yield df.set_index(["year", "month"])
|
|
|
|
yield df.set_index(["year", "month"], drop=False)
|
|
|
|
yield df.set_index(["year", "month"], append=True)
|
|
|
|
|
|
|
|
yield df1.set_index("year", drop=False, append=True)
|
|
|
|
|
|
|
|
df2 = df1.copy()
|
|
|
|
df2.set_index("year", append=True, inplace=True)
|
|
|
|
yield df2
|
|
|
|
|
|
|
|
self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index("unknown"))
|
|
|
|
self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index(["month", "unknown"]))
|
|
|
|
|
|
|
|
for d in [df, df1, df2]:
|
|
|
|
yield d.reset_index()
|
|
|
|
yield d.reset_index(drop=True)
|
|
|
|
|
|
|
|
yield df1.reset_index(level=0)
|
|
|
|
yield df2.reset_index(level=1)
|
|
|
|
yield df2.reset_index(level=[1, 0])
|
|
|
|
yield df1.reset_index(level="month")
|
|
|
|
yield df2.reset_index(level="year")
|
|
|
|
yield df2.reset_index(level=["month", "year"])
|
|
|
|
yield df2.reset_index(level="month", drop=True)
|
|
|
|
yield df2.reset_index(level=["month", "year"], drop=True)
|
|
|
|
|
|
|
|
self.assertRaisesRegex(
|
|
|
|
IndexError,
|
|
|
|
"Too many levels: Index has only 1 level, not 3",
|
|
|
|
lambda: df1.reset_index(level=2),
|
|
|
|
)
|
|
|
|
self.assertRaisesRegex(
|
|
|
|
IndexError,
|
|
|
|
"Too many levels: Index has only 1 level, not 4",
|
|
|
|
lambda: df1.reset_index(level=[3, 2]),
|
|
|
|
)
|
|
|
|
self.assertRaisesRegex(KeyError, "unknown.*month", lambda: df1.reset_index(level="unknown"))
|
|
|
|
self.assertRaisesRegex(
|
|
|
|
KeyError, "Level unknown not found", lambda: df2.reset_index(level="unknown")
|
|
|
|
)
|
|
|
|
|
|
|
|
df3 = df2.copy()
|
|
|
|
df3.reset_index(inplace=True)
|
|
|
|
yield df3
|
|
|
|
|
|
|
|
yield df1.sale.reset_index()
|
|
|
|
yield df1.sale.reset_index(level=0)
|
|
|
|
yield df2.sale.reset_index(level=[1, 0])
|
|
|
|
yield df1.sale.reset_index(drop=True)
|
|
|
|
yield df1.sale.reset_index(name="s")
|
|
|
|
yield df1.sale.reset_index(name="s", drop=True)
|
|
|
|
|
|
|
|
s = df1.sale
|
|
|
|
self.assertRaisesRegex(
|
|
|
|
TypeError,
|
|
|
|
"Cannot reset_index inplace on a Series to create a DataFrame",
|
|
|
|
lambda: s.reset_index(inplace=True),
|
|
|
|
)
|
|
|
|
s.reset_index(drop=True, inplace=True)
|
|
|
|
yield s
|
|
|
|
yield df1
|
|
|
|
|
|
|
|
# multi-index columns
|
|
|
|
df4 = df.copy()
|
|
|
|
df4.columns = pd.MultiIndex.from_tuples(
|
|
|
|
[("cal", "month"), ("cal", "year"), ("num", "sale")]
|
|
|
|
)
|
|
|
|
df5 = df4.set_index(("cal", "month"))
|
|
|
|
yield df5
|
|
|
|
yield df4.set_index([("cal", "month"), ("num", "sale")])
|
|
|
|
|
|
|
|
self.assertRaises(KeyError, lambda: df5.reset_index(level=("cal", "month")))
|
|
|
|
|
|
|
|
yield df5.reset_index(level=[("cal", "month")])
|
|
|
|
|
|
|
|
# non-string names
|
|
|
|
df6 = df.copy()
|
|
|
|
df6.columns = [10.0, 20.0, 30.0]
|
|
|
|
df7 = df6.set_index(10.0)
|
|
|
|
yield df7
|
|
|
|
yield df6.set_index([10.0, 30.0])
|
|
|
|
|
|
|
|
yield df7.reset_index(level=10.0)
|
|
|
|
yield df7.reset_index(level=[10.0])
|
|
|
|
|
|
|
|
df8 = df.copy()
|
|
|
|
df8.columns = pd.MultiIndex.from_tuples([(10, "month"), (10, "year"), (20, "sale")])
|
|
|
|
df9 = df8.set_index((10, "month"))
|
|
|
|
yield df9
|
|
|
|
yield df8.set_index([(10, "month"), (20, "sale")])
|
|
|
|
|
|
|
|
yield df9.reset_index(level=[(10, "month")])
|
|
|
|
|
|
|
|
def test_from_pandas_with_explicit_index(self):
|
|
|
|
pdf = self.pdf
|
|
|
|
|
|
|
|
df1 = ps.from_pandas(pdf.set_index("month"))
|
|
|
|
self.assertPandasEqual(df1.to_pandas(), pdf.set_index("month"))
|
|
|
|
|
|
|
|
df2 = ps.from_pandas(pdf.set_index(["year", "month"]))
|
|
|
|
self.assertPandasEqual(df2.to_pandas(), pdf.set_index(["year", "month"]))
|
|
|
|
|
|
|
|
def test_limitations(self):
|
|
|
|
df = self.kdf.set_index("month")
|
|
|
|
|
|
|
|
self.assertRaisesRegex(
|
|
|
|
ValueError,
|
|
|
|
"Level should be all int or all string.",
|
|
|
|
lambda: df.reset_index([1, "month"]),
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2021-04-22 16:07:35 -04:00
|
|
|
class IndexingTest(PandasOnSparkTestCase):
|
2021-04-14 22:45:15 -04:00
|
|
|
@property
|
|
|
|
def pdf(self):
|
|
|
|
return pd.DataFrame(
|
|
|
|
{"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
|
|
|
|
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
|
|
|
|
)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def kdf(self):
|
|
|
|
return ps.from_pandas(self.pdf)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def pdf2(self):
|
|
|
|
return pd.DataFrame(
|
|
|
|
{0: [1, 2, 3, 4, 5, 6, 7, 8, 9], 1: [4, 5, 6, 3, 2, 1, 0, 0, 0]},
|
|
|
|
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
|
|
|
|
)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def kdf2(self):
|
|
|
|
return ps.from_pandas(self.pdf2)
|
|
|
|
|
|
|
|
def test_at(self):
|
|
|
|
pdf = self.pdf
|
|
|
|
kdf = self.kdf
|
|
|
|
# Create the equivalent of pdf.loc[3] as a Koalas Series
|
|
|
|
# This is necessary because .loc[n] does not currently work with Koalas DataFrames (#383)
|
|
|
|
test_series = ps.Series([3, 6], index=["a", "b"], name="3")
|
|
|
|
|
|
|
|
# Assert invalided signatures raise TypeError
|
|
|
|
with self.assertRaises(TypeError, msg="Use DataFrame.at like .at[row_index, column_name]"):
|
|
|
|
kdf.at[3]
|
|
|
|
with self.assertRaises(TypeError, msg="Use DataFrame.at like .at[row_index, column_name]"):
|
|
|
|
kdf.at["ab"] # 'ab' is of length 2 but str type instead of tuple
|
|
|
|
with self.assertRaises(TypeError, msg="Use Series.at like .at[column_name]"):
|
|
|
|
test_series.at[3, "b"]
|
|
|
|
|
|
|
|
# Assert .at for DataFrames
|
|
|
|
self.assertEqual(kdf.at[3, "b"], 6)
|
|
|
|
self.assertEqual(kdf.at[3, "b"], pdf.at[3, "b"])
|
|
|
|
self.assert_eq(kdf.at[9, "b"], np.array([0, 0, 0]))
|
|
|
|
self.assert_eq(kdf.at[9, "b"], pdf.at[9, "b"])
|
|
|
|
|
|
|
|
# Assert .at for Series
|
|
|
|
self.assertEqual(test_series.at["b"], 6)
|
|
|
|
self.assertEqual(test_series.at["b"], pdf.loc[3].at["b"])
|
|
|
|
|
|
|
|
# Assert multi-character indices
|
|
|
|
self.assertEqual(
|
|
|
|
ps.Series([0, 1], index=["ab", "cd"]).at["ab"],
|
|
|
|
pd.Series([0, 1], index=["ab", "cd"]).at["ab"],
|
|
|
|
)
|
|
|
|
|
|
|
|
# Assert invalid column or index names result in a KeyError like with pandas
|
|
|
|
with self.assertRaises(KeyError, msg="x"):
|
|
|
|
kdf.at[3, "x"]
|
|
|
|
with self.assertRaises(KeyError, msg=99):
|
|
|
|
kdf.at[99, "b"]
|
|
|
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kdf.at[(3, 6), "b"]
|
|
|
|
with self.assertRaises(KeyError):
|
|
|
|
kdf.at[3, ("x", "b")]
|
|
|
|
|
|
|
|
# Assert setting values fails
|
|
|
|
with self.assertRaises(TypeError):
|
|
|
|
kdf.at[3, "b"] = 10
|
|
|
|
|
|
|
|
# non-string column names
|
|
|
|
pdf = self.pdf2
|
|
|
|
kdf = self.kdf2
|
|
|
|
|
|
|
|
# Assert .at for DataFrames
|
|
|
|
self.assertEqual(kdf.at[3, 1], 6)
|
|
|
|
self.assertEqual(kdf.at[3, 1], pdf.at[3, 1])
|
|
|
|
self.assert_eq(kdf.at[9, 1], np.array([0, 0, 0]))
|
|
|
|
self.assert_eq(kdf.at[9, 1], pdf.at[9, 1])
|
|
|
|
|
|
|
|
def test_at_multiindex(self):
|
|
|
|
pdf = self.pdf.set_index("b", append=True)
|
|
|
|
kdf = self.kdf.set_index("b", append=True)
|
|
|
|
|
|
|
|
# TODO: seems like a pandas' bug in pandas>=1.1.0
|
|
|
|
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
|
|
|
|
self.assert_eq(kdf.at[(3, 6), "a"], pdf.at[(3, 6), "a"])
|
|
|
|
self.assert_eq(kdf.at[(3,), "a"], pdf.at[(3,), "a"])
|
|
|
|
self.assert_eq(list(kdf.at[(9, 0), "a"]), list(pdf.at[(9, 0), "a"]))
|
|
|
|
self.assert_eq(list(kdf.at[(9,), "a"]), list(pdf.at[(9,), "a"]))
|
|
|
|
else:
|
|
|
|
self.assert_eq(kdf.at[(3, 6), "a"], 3)
|
|
|
|
self.assert_eq(kdf.at[(3,), "a"], np.array([3]))
|
|
|
|
self.assert_eq(list(kdf.at[(9, 0), "a"]), [7, 8, 9])
|
|
|
|
self.assert_eq(list(kdf.at[(9,), "a"]), [7, 8, 9])
|
|
|
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kdf.at[3, "a"]
|
|
|
|
|
|
|
|
def test_at_multiindex_columns(self):
|
|
|
|
arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.at["B", ("bar", "one")], pdf.at["B", ("bar", "one")])
|
|
|
|
|
|
|
|
with self.assertRaises(KeyError):
|
|
|
|
kdf.at["B", "bar"]
|
|
|
|
|
|
|
|
# non-string column names
|
|
|
|
arrays = [np.array([0, 0, 1, 1]), np.array([1, 2, 1, 2])]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.at["B", (0, 1)], pdf.at["B", (0, 1)])
|
|
|
|
|
|
|
|
def test_iat(self):
|
|
|
|
pdf = self.pdf
|
|
|
|
kdf = self.kdf
|
|
|
|
# Create the equivalent of pdf.loc[3] as a Koalas Series
|
|
|
|
# This is necessary because .loc[n] does not currently work with Koalas DataFrames (#383)
|
|
|
|
test_series = ps.Series([3, 6], index=["a", "b"], name="3")
|
|
|
|
|
|
|
|
# Assert invalided signatures raise TypeError
|
|
|
|
with self.assertRaises(
|
|
|
|
TypeError,
|
|
|
|
msg="Use DataFrame.at like .iat[row_interget_position, column_integer_position]",
|
|
|
|
):
|
|
|
|
kdf.iat[3]
|
|
|
|
with self.assertRaises(
|
|
|
|
ValueError, msg="iAt based indexing on multi-index can only have tuple values"
|
|
|
|
):
|
|
|
|
kdf.iat[3, "b"] # 'ab' is of length 2 but str type instead of tuple
|
|
|
|
with self.assertRaises(TypeError, msg="Use Series.iat like .iat[row_integer_position]"):
|
|
|
|
test_series.iat[3, "b"]
|
|
|
|
|
|
|
|
# Assert .iat for DataFrames
|
|
|
|
self.assertEqual(kdf.iat[7, 0], 8)
|
|
|
|
self.assertEqual(kdf.iat[7, 0], pdf.iat[7, 0])
|
|
|
|
|
|
|
|
# Assert .iat for Series
|
|
|
|
self.assertEqual(test_series.iat[1], 6)
|
|
|
|
self.assertEqual(test_series.iat[1], pdf.loc[3].iat[1])
|
|
|
|
|
|
|
|
# Assert invalid column or integer position result in a KeyError like with pandas
|
|
|
|
with self.assertRaises(KeyError, msg=99):
|
|
|
|
kdf.iat[0, 99]
|
|
|
|
with self.assertRaises(KeyError, msg=99):
|
|
|
|
kdf.iat[99, 0]
|
|
|
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kdf.iat[(1, 1), 1]
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kdf.iat[1, (1, 1)]
|
|
|
|
|
|
|
|
# Assert setting values fails
|
|
|
|
with self.assertRaises(TypeError):
|
|
|
|
kdf.iat[4, 1] = 10
|
|
|
|
|
|
|
|
def test_iat_multiindex(self):
|
|
|
|
pdf = self.pdf.set_index("b", append=True)
|
|
|
|
kdf = self.kdf.set_index("b", append=True)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.iat[7, 0], pdf.iat[7, 0])
|
|
|
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kdf.iat[3, "a"]
|
|
|
|
|
|
|
|
def test_iat_multiindex_columns(self):
|
|
|
|
arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.iat[1, 3], pdf.iat[1, 3])
|
|
|
|
|
|
|
|
with self.assertRaises(KeyError):
|
|
|
|
kdf.iat[0, 99]
|
|
|
|
with self.assertRaises(KeyError):
|
|
|
|
kdf.iat[99, 0]
|
|
|
|
|
|
|
|
def test_loc(self):
|
|
|
|
kdf = self.kdf
|
|
|
|
pdf = self.pdf
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[5:5], pdf.loc[5:5])
|
|
|
|
self.assert_eq(kdf.loc[3:8], pdf.loc[3:8])
|
|
|
|
self.assert_eq(kdf.loc[:8], pdf.loc[:8])
|
|
|
|
self.assert_eq(kdf.loc[3:], pdf.loc[3:])
|
|
|
|
self.assert_eq(kdf.loc[[5]], pdf.loc[[5]])
|
|
|
|
self.assert_eq(kdf.loc[:], pdf.loc[:])
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[[3, 4, 1, 8]], pdf.loc[[3, 4, 1, 8]])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[[3, 4, 1, 9]], pdf.loc[[3, 4, 1, 9]])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[np.array([3, 4, 1, 9])], pdf.loc[np.array([3, 4, 1, 9])])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.a.loc[5:5], pdf.a.loc[5:5])
|
|
|
|
self.assert_eq(kdf.a.loc[3:8], pdf.a.loc[3:8])
|
|
|
|
self.assert_eq(kdf.a.loc[:8], pdf.a.loc[:8])
|
|
|
|
self.assert_eq(kdf.a.loc[3:], pdf.a.loc[3:])
|
|
|
|
self.assert_eq(kdf.a.loc[[5]], pdf.a.loc[[5]])
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(kdf.a.loc[[3, 4, 1, 8]], pdf.a.loc[[3, 4, 1, 8]])
|
|
|
|
# TODO?: self.assert_eq(kdf.a.loc[[3, 4, 1, 9]], pdf.a.loc[[3, 4, 1, 9]])
|
|
|
|
# TODO?: self.assert_eq(kdf.a.loc[np.array([3, 4, 1, 9])],
|
|
|
|
# pdf.a.loc[np.array([3, 4, 1, 9])])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.a.loc[[]], pdf.a.loc[[]])
|
|
|
|
self.assert_eq(kdf.a.loc[np.array([])], pdf.a.loc[np.array([])])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[1000:], pdf.loc[1000:])
|
|
|
|
self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[5], pdf.loc[5])
|
|
|
|
self.assert_eq(kdf.loc[9], pdf.loc[9])
|
|
|
|
self.assert_eq(kdf.a.loc[5], pdf.a.loc[5])
|
|
|
|
self.assert_eq(kdf.a.loc[9], pdf.a.loc[9])
|
|
|
|
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[10])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.a.loc[10])
|
|
|
|
|
|
|
|
# monotonically increasing index test
|
|
|
|
pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, index=[0, 1, 1, 2, 2, 2, 4, 5, 6])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:2], pdf.loc[:2])
|
|
|
|
self.assert_eq(kdf.loc[:3], pdf.loc[:3])
|
|
|
|
self.assert_eq(kdf.loc[3:], pdf.loc[3:])
|
|
|
|
self.assert_eq(kdf.loc[4:], pdf.loc[4:])
|
|
|
|
self.assert_eq(kdf.loc[3:2], pdf.loc[3:2])
|
|
|
|
self.assert_eq(kdf.loc[-1:2], pdf.loc[-1:2])
|
|
|
|
self.assert_eq(kdf.loc[3:10], pdf.loc[3:10])
|
|
|
|
|
|
|
|
# monotonically decreasing index test
|
|
|
|
pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, index=[6, 5, 5, 4, 4, 4, 2, 1, 0])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:4], pdf.loc[:4])
|
|
|
|
self.assert_eq(kdf.loc[:3], pdf.loc[:3])
|
|
|
|
self.assert_eq(kdf.loc[3:], pdf.loc[3:])
|
|
|
|
self.assert_eq(kdf.loc[2:], pdf.loc[2:])
|
|
|
|
self.assert_eq(kdf.loc[2:3], pdf.loc[2:3])
|
|
|
|
self.assert_eq(kdf.loc[2:-1], pdf.loc[2:-1])
|
|
|
|
self.assert_eq(kdf.loc[10:3], pdf.loc[10:3])
|
|
|
|
|
|
|
|
# test when type of key is string and given value is not included in key
|
|
|
|
pdf = pd.DataFrame({"a": [1, 2, 3]}, index=["a", "b", "d"])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc["a":"z"], pdf.loc["a":"z"])
|
|
|
|
|
|
|
|
# KeyError when index is not monotonic increasing or decreasing
|
|
|
|
# and specified values don't exist in index
|
|
|
|
kdf = ps.DataFrame([[1, 2], [4, 5], [7, 8]], index=["cobra", "viper", "sidewinder"])
|
|
|
|
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc["cobra":"koalas"])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc["koalas":"viper"])
|
|
|
|
|
|
|
|
kdf = ps.DataFrame([[1, 2], [4, 5], [7, 8]], index=[10, 30, 20])
|
|
|
|
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[0:30])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[10:100])
|
|
|
|
|
|
|
|
def test_loc_non_informative_index(self):
|
|
|
|
pdf = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[20:30], pdf.loc[20:30])
|
|
|
|
|
|
|
|
pdf = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 20, 40])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
self.assert_eq(kdf.loc[20:20], pdf.loc[20:20])
|
|
|
|
|
|
|
|
def test_loc_with_series(self):
|
|
|
|
kdf = self.kdf
|
|
|
|
pdf = self.pdf
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[kdf.a % 2 == 0], pdf.loc[pdf.a % 2 == 0])
|
|
|
|
self.assert_eq(kdf.loc[kdf.a % 2 == 0, "a"], pdf.loc[pdf.a % 2 == 0, "a"])
|
|
|
|
self.assert_eq(kdf.loc[kdf.a % 2 == 0, ["a"]], pdf.loc[pdf.a % 2 == 0, ["a"]])
|
|
|
|
self.assert_eq(kdf.a.loc[kdf.a % 2 == 0], pdf.a.loc[pdf.a % 2 == 0])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[kdf.copy().a % 2 == 0], pdf.loc[pdf.copy().a % 2 == 0])
|
|
|
|
self.assert_eq(kdf.loc[kdf.copy().a % 2 == 0, "a"], pdf.loc[pdf.copy().a % 2 == 0, "a"])
|
|
|
|
self.assert_eq(kdf.loc[kdf.copy().a % 2 == 0, ["a"]], pdf.loc[pdf.copy().a % 2 == 0, ["a"]])
|
|
|
|
self.assert_eq(kdf.a.loc[kdf.copy().a % 2 == 0], pdf.a.loc[pdf.copy().a % 2 == 0])
|
|
|
|
|
|
|
|
def test_loc_noindex(self):
|
|
|
|
kdf = self.kdf
|
|
|
|
kdf = kdf.reset_index()
|
|
|
|
pdf = self.pdf
|
|
|
|
pdf = pdf.reset_index()
|
|
|
|
|
|
|
|
self.assert_eq(kdf[["a"]], pdf[["a"]])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:], pdf.loc[:])
|
|
|
|
self.assert_eq(kdf.loc[5:5], pdf.loc[5:5])
|
|
|
|
|
|
|
|
def test_loc_multiindex(self):
|
|
|
|
kdf = self.kdf
|
|
|
|
kdf = kdf.set_index("b", append=True)
|
|
|
|
pdf = self.pdf
|
|
|
|
pdf = pdf.set_index("b", append=True)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:], pdf.loc[:])
|
|
|
|
self.assert_eq(kdf.loc[5:5], pdf.loc[5:5])
|
|
|
|
self.assert_eq(kdf.loc[5:9], pdf.loc[5:9])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[5], pdf.loc[5])
|
|
|
|
self.assert_eq(kdf.loc[9], pdf.loc[9])
|
|
|
|
# TODO: self.assert_eq(kdf.loc[(5, 3)], pdf.loc[(5, 3)])
|
|
|
|
# TODO: self.assert_eq(kdf.loc[(9, 0)], pdf.loc[(9, 0)])
|
|
|
|
self.assert_eq(kdf.a.loc[5], pdf.a.loc[5])
|
|
|
|
self.assert_eq(kdf.a.loc[9], pdf.a.loc[9])
|
|
|
|
self.assertTrue((kdf.a.loc[(5, 3)] == pdf.a.loc[(5, 3)]).all())
|
|
|
|
self.assert_eq(kdf.a.loc[(9, 0)], pdf.a.loc[(9, 0)])
|
|
|
|
|
|
|
|
# monotonically increasing index test
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"a": [1, 2, 3, 4, 5]},
|
|
|
|
index=pd.MultiIndex.from_tuples(
|
|
|
|
[("x", "a"), ("x", "b"), ("y", "c"), ("y", "d"), ("z", "e")]
|
|
|
|
),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
for rows_sel in [
|
|
|
|
slice(None),
|
|
|
|
slice("y", None),
|
|
|
|
slice(None, "y"),
|
|
|
|
slice(("x", "b"), None),
|
|
|
|
slice(None, ("y", "c")),
|
|
|
|
slice(("x", "b"), ("y", "c")),
|
|
|
|
slice("x", ("y", "c")),
|
|
|
|
slice(("x", "b"), "y"),
|
|
|
|
]:
|
|
|
|
with self.subTest("monotonically increasing", rows_sel=rows_sel):
|
|
|
|
self.assert_eq(kdf.loc[rows_sel], pdf.loc[rows_sel])
|
|
|
|
self.assert_eq(kdf.a.loc[rows_sel], pdf.a.loc[rows_sel])
|
|
|
|
|
|
|
|
# monotonically increasing first index test
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"a": [1, 2, 3, 4, 5]},
|
|
|
|
index=pd.MultiIndex.from_tuples(
|
|
|
|
[("x", "a"), ("x", "b"), ("y", "c"), ("y", "a"), ("z", "e")]
|
|
|
|
),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
for rows_sel in [
|
|
|
|
slice(None),
|
|
|
|
slice("y", None),
|
|
|
|
slice(None, "y"),
|
|
|
|
]:
|
|
|
|
with self.subTest("monotonically increasing first index", rows_sel=rows_sel):
|
|
|
|
self.assert_eq(kdf.loc[rows_sel], pdf.loc[rows_sel])
|
|
|
|
self.assert_eq(kdf.a.loc[rows_sel], pdf.a.loc[rows_sel])
|
|
|
|
|
|
|
|
for rows_sel in [
|
|
|
|
slice(("x", "b"), None),
|
|
|
|
slice(None, ("y", "c")),
|
|
|
|
slice(("x", "b"), ("y", "c")),
|
|
|
|
slice("x", ("y", "c")),
|
|
|
|
slice(("x", "b"), "y"),
|
|
|
|
]:
|
|
|
|
with self.subTest("monotonically increasing first index", rows_sel=rows_sel):
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[rows_sel])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.a.loc[rows_sel])
|
|
|
|
|
|
|
|
# not monotonically increasing index test
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"a": [1, 2, 3, 4, 5]},
|
|
|
|
index=pd.MultiIndex.from_tuples(
|
|
|
|
[("z", "e"), ("y", "d"), ("y", "c"), ("x", "b"), ("x", "a")]
|
|
|
|
),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
for rows_sel in [
|
|
|
|
slice("y", None),
|
|
|
|
slice(None, "y"),
|
|
|
|
slice(("x", "b"), None),
|
|
|
|
slice(None, ("y", "c")),
|
|
|
|
slice(("x", "b"), ("y", "c")),
|
|
|
|
slice("x", ("y", "c")),
|
|
|
|
slice(("x", "b"), "y"),
|
|
|
|
]:
|
|
|
|
with self.subTest("monotonically decreasing", rows_sel=rows_sel):
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[rows_sel])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.a.loc[rows_sel])
|
|
|
|
|
|
|
|
def test_loc2d_multiindex(self):
|
|
|
|
kdf = self.kdf
|
|
|
|
kdf = kdf.set_index("b", append=True)
|
|
|
|
pdf = self.pdf
|
|
|
|
pdf = pdf.set_index("b", append=True)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:, :], pdf.loc[:, :])
|
|
|
|
self.assert_eq(kdf.loc[:, "a"], pdf.loc[:, "a"])
|
|
|
|
self.assert_eq(kdf.loc[5:5, "a"], pdf.loc[5:5, "a"])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:, "a":"a"], pdf.loc[:, "a":"a"])
|
|
|
|
self.assert_eq(kdf.loc[:, "a":"c"], pdf.loc[:, "a":"c"])
|
|
|
|
self.assert_eq(kdf.loc[:, "b":"c"], pdf.loc[:, "b":"c"])
|
|
|
|
|
|
|
|
def test_loc2d(self):
|
|
|
|
kdf = self.kdf
|
|
|
|
pdf = self.pdf
|
|
|
|
|
|
|
|
# index indexer is always regarded as slice for duplicated values
|
|
|
|
self.assert_eq(kdf.loc[5:5, "a"], pdf.loc[5:5, "a"])
|
|
|
|
self.assert_eq(kdf.loc[[5], "a"], pdf.loc[[5], "a"])
|
|
|
|
self.assert_eq(kdf.loc[5:5, ["a"]], pdf.loc[5:5, ["a"]])
|
|
|
|
self.assert_eq(kdf.loc[[5], ["a"]], pdf.loc[[5], ["a"]])
|
|
|
|
self.assert_eq(kdf.loc[:, :], pdf.loc[:, :])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[3:8, "a"], pdf.loc[3:8, "a"])
|
|
|
|
self.assert_eq(kdf.loc[:8, "a"], pdf.loc[:8, "a"])
|
|
|
|
self.assert_eq(kdf.loc[3:, "a"], pdf.loc[3:, "a"])
|
|
|
|
self.assert_eq(kdf.loc[[8], "a"], pdf.loc[[8], "a"])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[3:8, ["a"]], pdf.loc[3:8, ["a"]])
|
|
|
|
self.assert_eq(kdf.loc[:8, ["a"]], pdf.loc[:8, ["a"]])
|
|
|
|
self.assert_eq(kdf.loc[3:, ["a"]], pdf.loc[3:, ["a"]])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[[3, 4, 3], ['a']], pdf.loc[[3, 4, 3], ['a']])
|
|
|
|
|
|
|
|
self.assertRaises(SparkPandasIndexingError, lambda: kdf.loc[3, 3, 3])
|
|
|
|
self.assertRaises(SparkPandasIndexingError, lambda: kdf.a.loc[3, 3])
|
|
|
|
self.assertRaises(SparkPandasIndexingError, lambda: kdf.a.loc[3:, 3])
|
|
|
|
self.assertRaises(SparkPandasIndexingError, lambda: kdf.a.loc[kdf.a % 2 == 0, 3])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[5, "a"], pdf.loc[5, "a"])
|
|
|
|
self.assert_eq(kdf.loc[9, "a"], pdf.loc[9, "a"])
|
|
|
|
self.assert_eq(kdf.loc[5, ["a"]], pdf.loc[5, ["a"]])
|
|
|
|
self.assert_eq(kdf.loc[9, ["a"]], pdf.loc[9, ["a"]])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:, "a":"a"], pdf.loc[:, "a":"a"])
|
|
|
|
self.assert_eq(kdf.loc[:, "a":"d"], pdf.loc[:, "a":"d"])
|
|
|
|
self.assert_eq(kdf.loc[:, "c":"d"], pdf.loc[:, "c":"d"])
|
|
|
|
|
|
|
|
# bool list-like column select
|
|
|
|
bool_list = [True, False]
|
|
|
|
self.assert_eq(kdf.loc[:, bool_list], pdf.loc[:, bool_list])
|
|
|
|
self.assert_eq(kdf.loc[:, np.array(bool_list)], pdf.loc[:, np.array(bool_list)])
|
|
|
|
|
|
|
|
pser = pd.Series(bool_list, index=pdf.columns)
|
|
|
|
self.assert_eq(kdf.loc[:, pser], pdf.loc[:, pser])
|
|
|
|
pser = pd.Series(list(reversed(bool_list)), index=list(reversed(pdf.columns)))
|
|
|
|
self.assert_eq(kdf.loc[:, pser], pdf.loc[:, pser])
|
|
|
|
|
|
|
|
self.assertRaises(IndexError, lambda: kdf.loc[:, bool_list[:-1]])
|
|
|
|
self.assertRaises(IndexError, lambda: kdf.loc[:, np.array(bool_list + [True])])
|
|
|
|
self.assertRaises(SparkPandasIndexingError, lambda: kdf.loc[:, pd.Series(bool_list)])
|
|
|
|
|
|
|
|
# non-string column names
|
|
|
|
kdf = self.kdf2
|
|
|
|
pdf = self.pdf2
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[5:5, 0], pdf.loc[5:5, 0])
|
|
|
|
self.assert_eq(kdf.loc[5:5, [0]], pdf.loc[5:5, [0]])
|
|
|
|
self.assert_eq(kdf.loc[3:8, 0], pdf.loc[3:8, 0])
|
|
|
|
self.assert_eq(kdf.loc[3:8, [0]], pdf.loc[3:8, [0]])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:, 0:0], pdf.loc[:, 0:0])
|
|
|
|
self.assert_eq(kdf.loc[:, 0:3], pdf.loc[:, 0:3])
|
|
|
|
self.assert_eq(kdf.loc[:, 2:3], pdf.loc[:, 2:3])
|
|
|
|
|
|
|
|
def test_loc2d_multiindex_columns(self):
|
|
|
|
arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc["B":"B", "bar"], pdf.loc["B":"B", "bar"])
|
|
|
|
self.assert_eq(kdf.loc["B":"B", ["bar"]], pdf.loc["B":"B", ["bar"]])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:, "bar":"bar"], pdf.loc[:, "bar":"bar"])
|
|
|
|
self.assert_eq(kdf.loc[:, "bar":("baz", "one")], pdf.loc[:, "bar":("baz", "one")])
|
|
|
|
self.assert_eq(
|
|
|
|
kdf.loc[:, ("bar", "two"):("baz", "one")], pdf.loc[:, ("bar", "two"):("baz", "one")]
|
|
|
|
)
|
|
|
|
self.assert_eq(kdf.loc[:, ("bar", "two"):"bar"], pdf.loc[:, ("bar", "two"):"bar"])
|
|
|
|
self.assert_eq(kdf.loc[:, "a":"bax"], pdf.loc[:, "a":"bax"])
|
|
|
|
self.assert_eq(
|
|
|
|
kdf.loc[:, ("bar", "x"):("baz", "a")],
|
|
|
|
pdf.loc[:, ("bar", "x"):("baz", "a")],
|
|
|
|
almost=True,
|
|
|
|
)
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
np.random.randn(3, 4),
|
|
|
|
index=["A", "B", "C"],
|
|
|
|
columns=pd.MultiIndex.from_tuples(
|
|
|
|
[("bar", "two"), ("bar", "one"), ("baz", "one"), ("baz", "two")]
|
|
|
|
),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[:, "bar":"baz"], pdf.loc[:, "bar":"baz"])
|
|
|
|
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[:, "bar":("baz", "one")])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf.loc[:, ("bar", "two"):"bar"])
|
|
|
|
|
|
|
|
# bool list-like column select
|
|
|
|
bool_list = [True, False, True, False]
|
|
|
|
self.assert_eq(kdf.loc[:, bool_list], pdf.loc[:, bool_list])
|
|
|
|
self.assert_eq(kdf.loc[:, np.array(bool_list)], pdf.loc[:, np.array(bool_list)])
|
|
|
|
|
|
|
|
pser = pd.Series(bool_list, index=pdf.columns)
|
|
|
|
self.assert_eq(kdf.loc[:, pser], pdf.loc[:, pser])
|
|
|
|
|
|
|
|
pser = pd.Series(list(reversed(bool_list)), index=list(reversed(pdf.columns)))
|
|
|
|
self.assert_eq(kdf.loc[:, pser], pdf.loc[:, pser])
|
|
|
|
|
|
|
|
# non-string column names
|
|
|
|
arrays = [np.array([0, 0, 1, 1]), np.array([1, 2, 1, 2])]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc["B":"B", 0], pdf.loc["B":"B", 0])
|
|
|
|
self.assert_eq(kdf.loc["B":"B", [0]], pdf.loc["B":"B", [0]])
|
|
|
|
self.assert_eq(kdf.loc[:, 0:0], pdf.loc[:, 0:0])
|
|
|
|
self.assert_eq(kdf.loc[:, 0:(1, 1)], pdf.loc[:, 0:(1, 1)])
|
|
|
|
self.assert_eq(kdf.loc[:, (0, 2):(1, 1)], pdf.loc[:, (0, 2):(1, 1)])
|
|
|
|
self.assert_eq(kdf.loc[:, (0, 2):0], pdf.loc[:, (0, 2):0])
|
|
|
|
self.assert_eq(kdf.loc[:, -1:2], pdf.loc[:, -1:2])
|
|
|
|
|
|
|
|
def test_loc2d_with_known_divisions(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("ABCDE")
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[["a"], "A"], pdf.loc[["a"], "A"])
|
|
|
|
self.assert_eq(kdf.loc[["a"], ["A"]], pdf.loc[["a"], ["A"]])
|
|
|
|
self.assert_eq(kdf.loc["a":"o", "A"], pdf.loc["a":"o", "A"])
|
|
|
|
self.assert_eq(kdf.loc["a":"o", ["A"]], pdf.loc["a":"o", ["A"]])
|
|
|
|
self.assert_eq(kdf.loc[["n"], ["A"]], pdf.loc[["n"], ["A"]])
|
|
|
|
self.assert_eq(kdf.loc[["a", "c", "n"], ["A"]], pdf.loc[["a", "c", "n"], ["A"]])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[['t', 'b'], ['A']], pdf.loc[['t', 'b'], ['A']])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']],
|
|
|
|
# TODO?: pdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']])
|
|
|
|
|
|
|
|
@unittest.skip("TODO: should handle duplicated columns properly")
|
|
|
|
def test_loc2d_duplicated_columns(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("AABCD")
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[['a'], 'A'], pdf.loc[['a'], 'A'])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[['a'], ['A']], pdf.loc[['a'], ['A']])
|
|
|
|
self.assert_eq(kdf.loc[["j"], "B"], pdf.loc[["j"], "B"])
|
|
|
|
self.assert_eq(kdf.loc[["j"], ["B"]], pdf.loc[["j"], ["B"]])
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(kdf.loc['a':'o', 'A'], pdf.loc['a':'o', 'A'])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc['a':'o', ['A']], pdf.loc['a':'o', ['A']])
|
|
|
|
self.assert_eq(kdf.loc["j":"q", "B"], pdf.loc["j":"q", "B"])
|
|
|
|
self.assert_eq(kdf.loc["j":"q", ["B"]], pdf.loc["j":"q", ["B"]])
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(kdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D'])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D'])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A'])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A'])
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc[kdf.B > 0, "B"], pdf.loc[pdf.B > 0, "B"])
|
|
|
|
# TODO?: self.assert_eq(kdf.loc[kdf.B > 0, ['A', 'C']], pdf.loc[pdf.B > 0, ['A', 'C']])
|
|
|
|
|
|
|
|
def test_getitem(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{
|
|
|
|
"A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
|
|
|
|
"B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
|
|
|
|
"C": [True, False, True] * 3,
|
|
|
|
},
|
|
|
|
columns=list("ABC"),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
self.assert_eq(kdf["A"], pdf["A"])
|
|
|
|
|
|
|
|
self.assert_eq(kdf[["A", "B"]], pdf[["A", "B"]])
|
|
|
|
|
|
|
|
self.assert_eq(kdf[kdf.C], pdf[pdf.C])
|
|
|
|
|
|
|
|
self.assertRaises(KeyError, lambda: kdf["X"])
|
|
|
|
self.assertRaises(KeyError, lambda: kdf[["A", "X"]])
|
|
|
|
self.assertRaises(AttributeError, lambda: kdf.X)
|
|
|
|
|
|
|
|
# not str/unicode
|
|
|
|
# TODO?: pdf = pd.DataFrame(np.random.randn(10, 5))
|
|
|
|
# TODO?: kdf = ps.from_pandas(pdf)
|
|
|
|
# TODO?: self.assert_eq(kdf[0], pdf[0])
|
|
|
|
# TODO?: self.assert_eq(kdf[[1, 2]], pdf[[1, 2]])
|
|
|
|
|
|
|
|
# TODO?: self.assertRaises(KeyError, lambda: pdf[8])
|
|
|
|
# TODO?: self.assertRaises(KeyError, lambda: pdf[[1, 8]])
|
|
|
|
|
|
|
|
# non-string column names
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{
|
|
|
|
10: [1, 2, 3, 4, 5, 6, 7, 8, 9],
|
|
|
|
20: [9, 8, 7, 6, 5, 4, 3, 2, 1],
|
|
|
|
30: [True, False, True] * 3,
|
|
|
|
}
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf[10], pdf[10])
|
|
|
|
self.assert_eq(kdf[[10, 20]], pdf[[10, 20]])
|
|
|
|
|
|
|
|
def test_getitem_slice(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{
|
|
|
|
"A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
|
|
|
|
"B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
|
|
|
|
"C": [True, False, True] * 3,
|
|
|
|
},
|
|
|
|
index=list("abcdefghi"),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
self.assert_eq(kdf["a":"e"], pdf["a":"e"])
|
|
|
|
self.assert_eq(kdf["a":"b"], pdf["a":"b"])
|
|
|
|
self.assert_eq(kdf["f":], pdf["f":])
|
|
|
|
|
|
|
|
def test_loc_on_numpy_datetimes(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"x": [1, 2, 3]}, index=list(map(np.datetime64, ["2014", "2015", "2016"]))
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc["2014":"2015"], pdf.loc["2014":"2015"])
|
|
|
|
|
|
|
|
def test_loc_on_pandas_datetimes(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"x": [1, 2, 3]}, index=list(map(pd.Timestamp, ["2014", "2015", "2016"]))
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.loc["2014":"2015"], pdf.loc["2014":"2015"])
|
|
|
|
|
|
|
|
@unittest.skip("TODO?: the behavior of slice for datetime")
|
|
|
|
def test_loc_datetime_no_freq(self):
|
|
|
|
datetime_index = pd.date_range("2016-01-01", "2016-01-31", freq="12h")
|
|
|
|
datetime_index.freq = None # FORGET FREQUENCY
|
|
|
|
pdf = pd.DataFrame({"num": range(len(datetime_index))}, index=datetime_index)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
slice_ = slice("2016-01-03", "2016-01-05")
|
|
|
|
result = kdf.loc[slice_, :]
|
|
|
|
expected = pdf.loc[slice_, :]
|
|
|
|
self.assert_eq(result, expected)
|
|
|
|
|
|
|
|
@unittest.skip("TODO?: the behavior of slice for datetime")
|
|
|
|
def test_loc_timestamp_str(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"A": np.random.randn(100), "B": np.random.randn(100)},
|
|
|
|
index=pd.date_range("2011-01-01", freq="H", periods=100),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# partial string slice
|
|
|
|
# TODO?: self.assert_eq(pdf.loc['2011-01-02'],
|
|
|
|
# TODO?: kdf.loc['2011-01-02'])
|
|
|
|
self.assert_eq(pdf.loc["2011-01-02":"2011-01-05"], kdf.loc["2011-01-02":"2011-01-05"])
|
|
|
|
|
|
|
|
# series
|
|
|
|
# TODO?: self.assert_eq(pdf.A.loc['2011-01-02'],
|
|
|
|
# TODO?: kdf.A.loc['2011-01-02'])
|
|
|
|
self.assert_eq(pdf.A.loc["2011-01-02":"2011-01-05"], kdf.A.loc["2011-01-02":"2011-01-05"])
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"A": np.random.randn(100), "B": np.random.randn(100)},
|
|
|
|
index=pd.date_range("2011-01-01", freq="M", periods=100),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
# TODO?: self.assert_eq(pdf.loc['2011-01'], kdf.loc['2011-01'])
|
|
|
|
# TODO?: self.assert_eq(pdf.loc['2011'], kdf.loc['2011'])
|
|
|
|
|
|
|
|
self.assert_eq(pdf.loc["2011-01":"2012-05"], kdf.loc["2011-01":"2012-05"])
|
|
|
|
self.assert_eq(pdf.loc["2011":"2015"], kdf.loc["2011":"2015"])
|
|
|
|
|
|
|
|
# series
|
|
|
|
# TODO?: self.assert_eq(pdf.B.loc['2011-01'], kdf.B.loc['2011-01'])
|
|
|
|
# TODO?: self.assert_eq(pdf.B.loc['2011'], kdf.B.loc['2011'])
|
|
|
|
|
|
|
|
self.assert_eq(pdf.B.loc["2011-01":"2012-05"], kdf.B.loc["2011-01":"2012-05"])
|
|
|
|
self.assert_eq(pdf.B.loc["2011":"2015"], kdf.B.loc["2011":"2015"])
|
|
|
|
|
|
|
|
@unittest.skip("TODO?: the behavior of slice for datetime")
|
|
|
|
def test_getitem_timestamp_str(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"A": np.random.randn(100), "B": np.random.randn(100)},
|
|
|
|
index=pd.date_range("2011-01-01", freq="H", periods=100),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# partial string slice
|
|
|
|
# TODO?: self.assert_eq(pdf['2011-01-02'],
|
|
|
|
# TODO?: kdf['2011-01-02'])
|
|
|
|
self.assert_eq(pdf["2011-01-02":"2011-01-05"], kdf["2011-01-02":"2011-01-05"])
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"A": np.random.randn(100), "B": np.random.randn(100)},
|
|
|
|
index=pd.date_range("2011-01-01", freq="M", periods=100),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(pdf['2011-01'], kdf['2011-01'])
|
|
|
|
# TODO?: self.assert_eq(pdf['2011'], kdf['2011'])
|
|
|
|
|
|
|
|
self.assert_eq(pdf["2011-01":"2012-05"], kdf["2011-01":"2012-05"])
|
|
|
|
self.assert_eq(pdf["2011":"2015"], kdf["2011":"2015"])
|
|
|
|
|
|
|
|
@unittest.skip("TODO?: period index can't convert to DataFrame correctly")
|
|
|
|
def test_getitem_period_str(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"A": np.random.randn(100), "B": np.random.randn(100)},
|
|
|
|
index=pd.period_range("2011-01-01", freq="H", periods=100),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# partial string slice
|
|
|
|
# TODO?: self.assert_eq(pdf['2011-01-02'],
|
|
|
|
# TODO?: kdf['2011-01-02'])
|
|
|
|
self.assert_eq(pdf["2011-01-02":"2011-01-05"], kdf["2011-01-02":"2011-01-05"])
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
{"A": np.random.randn(100), "B": np.random.randn(100)},
|
|
|
|
index=pd.period_range("2011-01-01", freq="M", periods=100),
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# TODO?: self.assert_eq(pdf['2011-01'], kdf['2011-01'])
|
|
|
|
# TODO?: self.assert_eq(pdf['2011'], kdf['2011'])
|
|
|
|
|
|
|
|
self.assert_eq(pdf["2011-01":"2012-05"], kdf["2011-01":"2012-05"])
|
|
|
|
self.assert_eq(pdf["2011":"2015"], kdf["2011":"2015"])
|
|
|
|
|
|
|
|
def test_iloc(self):
|
|
|
|
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
self.assert_eq(kdf.iloc[0, 0], pdf.iloc[0, 0])
|
|
|
|
for indexer in [0, [0], [0, 1], [1, 0], [False, True, True], slice(0, 1)]:
|
|
|
|
self.assert_eq(kdf.iloc[:, indexer], pdf.iloc[:, indexer])
|
|
|
|
self.assert_eq(kdf.iloc[:1, indexer], pdf.iloc[:1, indexer])
|
|
|
|
self.assert_eq(kdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer])
|
|
|
|
# self.assert_eq(kdf.iloc[kdf.index == 2, indexer], pdf.iloc[pdf.index == 2, indexer])
|
|
|
|
|
|
|
|
def test_iloc_multiindex_columns(self):
|
|
|
|
arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
for indexer in [0, [0], [0, 1], [1, 0], [False, True, True, True], slice(0, 1)]:
|
|
|
|
self.assert_eq(kdf.iloc[:, indexer], pdf.iloc[:, indexer])
|
|
|
|
self.assert_eq(kdf.iloc[:1, indexer], pdf.iloc[:1, indexer])
|
|
|
|
self.assert_eq(kdf.iloc[:-1, indexer], pdf.iloc[:-1, indexer])
|
|
|
|
# self.assert_eq(kdf.iloc[kdf.index == "B", indexer],
|
|
|
|
# pdf.iloc[pdf.index == "B", indexer])
|
|
|
|
|
|
|
|
def test_iloc_series(self):
|
|
|
|
pser = pd.Series([1, 2, 3])
|
|
|
|
kser = ps.from_pandas(pser)
|
|
|
|
|
|
|
|
self.assert_eq(kser.iloc[0], pser.iloc[0])
|
|
|
|
self.assert_eq(kser.iloc[:], pser.iloc[:])
|
|
|
|
self.assert_eq(kser.iloc[:1], pser.iloc[:1])
|
|
|
|
self.assert_eq(kser.iloc[:-1], pser.iloc[:-1])
|
|
|
|
|
|
|
|
self.assert_eq((kser + 1).iloc[0], (pser + 1).iloc[0])
|
|
|
|
self.assert_eq((kser + 1).iloc[:], (pser + 1).iloc[:])
|
|
|
|
self.assert_eq((kser + 1).iloc[:1], (pser + 1).iloc[:1])
|
|
|
|
self.assert_eq((kser + 1).iloc[:-1], (pser + 1).iloc[:-1])
|
|
|
|
|
|
|
|
def test_iloc_slice_rows_sel(self):
|
|
|
|
pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
for rows_sel in [
|
|
|
|
slice(None),
|
|
|
|
slice(0, 1),
|
|
|
|
slice(1, 2),
|
|
|
|
slice(-3, None),
|
|
|
|
slice(None, -3),
|
|
|
|
slice(None, 0),
|
|
|
|
slice(None, None, 3),
|
|
|
|
slice(3, 8, 2),
|
|
|
|
slice(None, None, -2),
|
|
|
|
slice(8, 3, -2),
|
|
|
|
slice(8, None, -2),
|
|
|
|
slice(None, 3, -2),
|
|
|
|
]:
|
|
|
|
with self.subTest(rows_sel=rows_sel):
|
|
|
|
self.assert_eq(kdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
|
|
|
|
self.assert_eq(kdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index())
|
|
|
|
self.assert_eq(
|
|
|
|
(kdf.A + 1).iloc[rows_sel].sort_index(), (pdf.A + 1).iloc[rows_sel].sort_index()
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_iloc_iterable_rows_sel(self):
|
|
|
|
pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
for rows_sel in [
|
|
|
|
[],
|
|
|
|
np.array([0, 1]),
|
|
|
|
[1, 2],
|
|
|
|
np.array([-3]),
|
|
|
|
[3],
|
|
|
|
np.array([-2]),
|
|
|
|
[8, 3, -5],
|
|
|
|
]:
|
|
|
|
with self.subTest(rows_sel=rows_sel):
|
|
|
|
self.assert_eq(kdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
|
|
|
|
self.assert_eq(kdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index())
|
|
|
|
self.assert_eq(
|
|
|
|
(kdf.A + 1).iloc[rows_sel].sort_index(), (pdf.A + 1).iloc[rows_sel].sort_index()
|
|
|
|
)
|
|
|
|
|
|
|
|
with self.subTest(rows_sel=rows_sel):
|
|
|
|
self.assert_eq(
|
|
|
|
kdf.iloc[rows_sel, :].sort_index(), pdf.iloc[rows_sel, :].sort_index()
|
|
|
|
)
|
|
|
|
|
|
|
|
with self.subTest(rows_sel=rows_sel):
|
|
|
|
self.assert_eq(
|
|
|
|
kdf.iloc[rows_sel, :1].sort_index(), pdf.iloc[rows_sel, :1].sort_index()
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_frame_loc_setitem(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
[[1, 2], [4, 5], [7, 8]],
|
|
|
|
index=["cobra", "viper", "sidewinder"],
|
|
|
|
columns=["max_speed", "shield"],
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pser1 = pdf.max_speed
|
|
|
|
pser2 = pdf.shield
|
|
|
|
kser1 = kdf.max_speed
|
|
|
|
kser2 = kdf.shield
|
|
|
|
|
|
|
|
pdf.loc[["viper", "sidewinder"], ["shield", "max_speed"]] = 10
|
|
|
|
kdf.loc[["viper", "sidewinder"], ["shield", "max_speed"]] = 10
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
pdf.loc[["viper", "sidewinder"], "shield"] = 50
|
|
|
|
kdf.loc[["viper", "sidewinder"], "shield"] = 50
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
pdf.loc["cobra", "max_speed"] = 30
|
|
|
|
kdf.loc["cobra", "max_speed"] = 30
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
pdf.loc[pdf.max_speed < 5, "max_speed"] = -pdf.max_speed
|
|
|
|
kdf.loc[kdf.max_speed < 5, "max_speed"] = -kdf.max_speed
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
pdf.loc[pdf.max_speed < 2, "max_speed"] = -pdf.max_speed
|
|
|
|
kdf.loc[kdf.max_speed < 2, "max_speed"] = -kdf.max_speed
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
pdf.loc[:, "min_speed"] = 0
|
|
|
|
kdf.loc[:, "min_speed"] = 0
|
|
|
|
self.assert_eq(kdf, pdf, almost=True)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(ValueError, "Incompatible indexer with Series"):
|
|
|
|
kdf.loc["cobra", "max_speed"] = -kdf.max_speed
|
|
|
|
with self.assertRaisesRegex(ValueError, "shape mismatch"):
|
|
|
|
kdf.loc[:, ["shield", "max_speed"]] = -kdf.max_speed
|
|
|
|
with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"):
|
|
|
|
kdf.loc[:, "max_speed"] = kdf
|
|
|
|
|
|
|
|
# multi-index columns
|
|
|
|
columns = pd.MultiIndex.from_tuples(
|
|
|
|
[("x", "max_speed"), ("x", "shield"), ("y", "min_speed")]
|
|
|
|
)
|
|
|
|
pdf.columns = columns
|
|
|
|
kdf.columns = columns
|
|
|
|
|
|
|
|
pdf.loc[:, ("y", "shield")] = -pdf[("x", "shield")]
|
|
|
|
kdf.loc[:, ("y", "shield")] = -kdf[("x", "shield")]
|
|
|
|
self.assert_eq(kdf, pdf, almost=True)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
pdf.loc[:, "z"] = 100
|
|
|
|
kdf.loc[:, "z"] = 100
|
|
|
|
self.assert_eq(kdf, pdf, almost=True)
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kser2, pser2)
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(KeyError, "Key length \\(3\\) exceeds index depth \\(2\\)"):
|
|
|
|
kdf.loc[:, [("x", "max_speed", "foo")]] = -kdf[("x", "shield")]
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
[[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"]
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pdf.loc[:, "max_speed"] = pdf
|
|
|
|
kdf.loc[:, "max_speed"] = kdf
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
|
|
|
|
def test_frame_iloc_setitem(self):
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
[[1, 2], [4, 5], [7, 8]],
|
|
|
|
index=["cobra", "viper", "sidewinder"],
|
|
|
|
columns=["max_speed", "shield"],
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pdf.iloc[[1, 2], [1, 0]] = 10
|
|
|
|
kdf.iloc[[1, 2], [1, 0]] = 10
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
|
|
|
|
pdf.iloc[0, 1] = 50
|
|
|
|
kdf.iloc[0, 1] = 50
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
|
2021-04-16 04:42:03 -04:00
|
|
|
with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
|
2021-04-14 22:45:15 -04:00
|
|
|
kdf.iloc[0, 0] = -kdf.max_speed
|
|
|
|
with self.assertRaisesRegex(ValueError, "shape mismatch"):
|
|
|
|
kdf.iloc[:, [1, 0]] = -kdf.max_speed
|
|
|
|
with self.assertRaisesRegex(ValueError, "Only a dataframe with one column can be assigned"):
|
|
|
|
kdf.iloc[:, 0] = kdf
|
|
|
|
|
|
|
|
pdf = pd.DataFrame(
|
|
|
|
[[1], [4], [7]], index=["cobra", "viper", "sidewinder"], columns=["max_speed"]
|
|
|
|
)
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pdf.iloc[:, 0] = pdf
|
|
|
|
kdf.iloc[:, 0] = kdf
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
|
|
|
|
def test_series_loc_setitem(self):
|
|
|
|
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pser = pdf.x
|
|
|
|
psery = pdf.y
|
|
|
|
kser = kdf.x
|
|
|
|
ksery = kdf.y
|
|
|
|
|
|
|
|
pser.loc[pser % 2 == 1] = -pser
|
|
|
|
kser.loc[kser % 2 == 1] = -kser
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(ksery, psery)
|
|
|
|
|
|
|
|
for key, value in [
|
|
|
|
(["viper", "sidewinder"], 10),
|
|
|
|
("viper", 50),
|
|
|
|
(slice(None), 10),
|
|
|
|
(slice(None, "viper"), 20),
|
|
|
|
(slice("viper", None), 30),
|
|
|
|
]:
|
|
|
|
with self.subTest(key=key, value=value):
|
|
|
|
pser.loc[key] = value
|
|
|
|
kser.loc[key] = value
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(ksery, psery)
|
|
|
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kser.loc["viper"] = -kser
|
|
|
|
|
|
|
|
# multiindex
|
|
|
|
pser = pd.Series(
|
|
|
|
[1, 2, 3],
|
|
|
|
index=pd.MultiIndex.from_tuples([("x", "cobra"), ("x", "viper"), ("y", "sidewinder")]),
|
|
|
|
)
|
|
|
|
kser = ps.from_pandas(pser)
|
|
|
|
|
|
|
|
pser.loc["x"] = pser * 10
|
|
|
|
kser.loc["x"] = kser * 10
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
|
|
|
|
pser.loc["y"] = pser * 10
|
|
|
|
kser.loc["y"] = kser * 10
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
|
|
|
|
if LooseVersion(pd.__version__) < LooseVersion("1.0"):
|
|
|
|
# TODO: seems like a pandas' bug in pandas>=1.0.0?
|
|
|
|
pser.loc[("x", "viper"):"y"] = pser * 20
|
|
|
|
kser.loc[("x", "viper"):"y"] = kser * 20
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
|
|
|
|
def test_series_iloc_setitem(self):
|
|
|
|
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pser = pdf.x
|
|
|
|
psery = pdf.y
|
|
|
|
kser = kdf.x
|
|
|
|
ksery = kdf.y
|
|
|
|
|
|
|
|
piloc = pser.iloc
|
|
|
|
kiloc = kser.iloc
|
|
|
|
|
|
|
|
pser1 = pser + 1
|
|
|
|
kser1 = kser + 1
|
|
|
|
|
|
|
|
for key, value in [
|
|
|
|
([1, 2], 10),
|
|
|
|
(1, 50),
|
|
|
|
(slice(None), 10),
|
|
|
|
(slice(None, 1), 20),
|
|
|
|
(slice(1, None), 30),
|
|
|
|
]:
|
|
|
|
with self.subTest(key=key, value=value):
|
|
|
|
pser.iloc[key] = value
|
|
|
|
kser.iloc[key] = value
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(ksery, psery)
|
|
|
|
|
|
|
|
piloc[key] = -value
|
|
|
|
kiloc[key] = -value
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(ksery, psery)
|
|
|
|
|
|
|
|
pser1.iloc[key] = value
|
|
|
|
kser1.iloc[key] = value
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
self.assert_eq(ksery, psery)
|
|
|
|
|
|
|
|
with self.assertRaises(ValueError):
|
|
|
|
kser.iloc[1] = -kser
|
|
|
|
|
|
|
|
pser = pd.Index([1, 2, 3]).to_series()
|
|
|
|
kser = ps.Index([1, 2, 3]).to_series()
|
|
|
|
|
|
|
|
pser1 = pser + 1
|
|
|
|
kser1 = kser + 1
|
|
|
|
|
|
|
|
pser.iloc[0] = 10
|
|
|
|
kser.iloc[0] = 10
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
|
|
|
|
pser1.iloc[0] = 20
|
|
|
|
kser1.iloc[0] = 20
|
|
|
|
self.assert_eq(kser1, pser1)
|
|
|
|
|
|
|
|
pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
pser = pdf.a
|
|
|
|
kser = kdf.a
|
|
|
|
|
|
|
|
pser.iloc[[0, 1, 2]] = -pdf.b
|
|
|
|
kser.iloc[[0, 1, 2]] = -kdf.b
|
|
|
|
self.assert_eq(kser, pser)
|
|
|
|
self.assert_eq(kdf, pdf)
|
|
|
|
|
2021-04-16 04:42:03 -04:00
|
|
|
with self.assertRaisesRegex(ValueError, "setting an array element with a sequence."):
|
2021-04-14 22:45:15 -04:00
|
|
|
kser.iloc[1] = kdf[["b"]]
|
|
|
|
|
|
|
|
def test_iloc_raises(self):
|
|
|
|
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"):
|
|
|
|
kdf.iloc[[0, 1], [0, 1], [1, 2]]
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(SparkPandasIndexingError, "Too many indexers"):
|
|
|
|
kdf.A.iloc[[0, 1], [0, 1]]
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"):
|
|
|
|
kdf.iloc[:"b", :]
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(TypeError, "cannot do slice indexing with these indexers"):
|
|
|
|
kdf.iloc[:, :"b"]
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(TypeError, "cannot perform reduce with flexible type"):
|
|
|
|
kdf.iloc[:, ["A"]]
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(ValueError, "Location based indexing can only have"):
|
|
|
|
kdf.iloc[:, "A"]
|
|
|
|
|
|
|
|
with self.assertRaisesRegex(IndexError, "out of range"):
|
|
|
|
kdf.iloc[:, [5, 6]]
|
|
|
|
|
|
|
|
def test_index_operator_datetime(self):
|
|
|
|
dates = pd.date_range("20130101", periods=6)
|
|
|
|
pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# Positional iloc search
|
|
|
|
self.assert_eq(kdf[:4], pdf[:4], almost=True)
|
|
|
|
self.assert_eq(kdf[:3], pdf[:3], almost=True)
|
|
|
|
self.assert_eq(kdf[3:], pdf[3:], almost=True)
|
|
|
|
self.assert_eq(kdf[2:], pdf[2:], almost=True)
|
|
|
|
self.assert_eq(kdf[2:3], pdf[2:3], almost=True)
|
|
|
|
self.assert_eq(kdf[2:-1], pdf[2:-1], almost=True)
|
|
|
|
self.assert_eq(kdf[10:3], pdf[10:3], almost=True)
|
|
|
|
|
|
|
|
# Index loc search
|
|
|
|
self.assert_eq(kdf.A[4], pdf.A[4])
|
|
|
|
self.assert_eq(kdf.A[3], pdf.A[3])
|
|
|
|
|
|
|
|
# Positional iloc search
|
|
|
|
self.assert_eq(kdf.A[:4], pdf.A[:4], almost=True)
|
|
|
|
self.assert_eq(kdf.A[:3], pdf.A[:3], almost=True)
|
|
|
|
self.assert_eq(kdf.A[3:], pdf.A[3:], almost=True)
|
|
|
|
self.assert_eq(kdf.A[2:], pdf.A[2:], almost=True)
|
|
|
|
self.assert_eq(kdf.A[2:3], pdf.A[2:3], almost=True)
|
|
|
|
self.assert_eq(kdf.A[2:-1], pdf.A[2:-1], almost=True)
|
|
|
|
self.assert_eq(kdf.A[10:3], pdf.A[10:3], almost=True)
|
|
|
|
|
|
|
|
dt1 = datetime.datetime.strptime("2013-01-02", "%Y-%m-%d")
|
|
|
|
dt2 = datetime.datetime.strptime("2013-01-04", "%Y-%m-%d")
|
|
|
|
|
|
|
|
# Index loc search
|
|
|
|
self.assert_eq(kdf[:dt2], pdf[:dt2], almost=True)
|
|
|
|
self.assert_eq(kdf[dt1:], pdf[dt1:], almost=True)
|
|
|
|
self.assert_eq(kdf[dt1:dt2], pdf[dt1:dt2], almost=True)
|
|
|
|
self.assert_eq(kdf.A[dt2], pdf.A[dt2], almost=True)
|
|
|
|
self.assert_eq(kdf.A[:dt2], pdf.A[:dt2], almost=True)
|
|
|
|
self.assert_eq(kdf.A[dt1:], pdf.A[dt1:], almost=True)
|
|
|
|
self.assert_eq(kdf.A[dt1:dt2], pdf.A[dt1:dt2], almost=True)
|
|
|
|
|
|
|
|
def test_index_operator_int(self):
|
|
|
|
pdf = pd.DataFrame(np.random.randn(6, 4), index=[1, 3, 5, 7, 9, 11], columns=list("ABCD"))
|
|
|
|
kdf = ps.from_pandas(pdf)
|
|
|
|
|
|
|
|
# Positional iloc search
|
|
|
|
self.assert_eq(kdf[:4], pdf[:4])
|
|
|
|
self.assert_eq(kdf[:3], pdf[:3])
|
|
|
|
self.assert_eq(kdf[3:], pdf[3:])
|
|
|
|
self.assert_eq(kdf[2:], pdf[2:])
|
|
|
|
self.assert_eq(kdf[2:3], pdf[2:3])
|
|
|
|
self.assert_eq(kdf[2:-1], pdf[2:-1])
|
|
|
|
self.assert_eq(kdf[10:3], pdf[10:3])
|
|
|
|
|
|
|
|
# Index loc search
|
|
|
|
self.assert_eq(kdf.A[5], pdf.A[5])
|
|
|
|
self.assert_eq(kdf.A[3], pdf.A[3])
|
|
|
|
with self.assertRaisesRegex(
|
|
|
|
NotImplementedError, "Duplicated row selection is not currently supported"
|
|
|
|
):
|
|
|
|
kdf.iloc[[1, 1]]
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
from pyspark.pandas.tests.test_indexing import * # noqa: F401
|
|
|
|
|
|
|
|
try:
|
|
|
|
import xmlrunner # type: ignore[import]
|
|
|
|
testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
|
|
|
|
except ImportError:
|
|
|
|
testRunner = None
|
|
|
|
unittest.main(testRunner=testRunner, verbosity=2)
|