2021-04-15 19:53:30 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
|
|
|
import inspect
|
|
|
|
import unittest
|
|
|
|
from distutils.version import LooseVersion
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
import pyspark.pandas as ps
|
|
|
|
from pyspark.pandas.exceptions import PandasNotImplementedError
|
|
|
|
from pyspark.pandas.missing.indexes import (
|
|
|
|
MissingPandasLikeCategoricalIndex,
|
|
|
|
MissingPandasLikeDatetimeIndex,
|
|
|
|
MissingPandasLikeIndex,
|
|
|
|
MissingPandasLikeMultiIndex,
|
|
|
|
)
|
2021-04-22 16:07:35 -04:00
|
|
|
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
|
2021-04-22 16:07:35 -04:00
|
|
|
class IndexesTest(PandasOnSparkTestCase, TestUtils):
|
2021-04-15 19:53:30 -04:00
|
|
|
@property
def pdf(self):
    # Shared pandas fixture: two integer columns over an integer index that is
    # non-contiguous and ends with a repeated label (9 appears three times).
    column_data = {
        "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "b": [4, 5, 6, 3, 2, 1, 0, 0, 0],
    }
    row_labels = [0, 1, 3, 5, 6, 8, 9, 9, 9]
    return pd.DataFrame(column_data, index=row_labels)
|
|
|
|
|
|
|
|
@property
def psdf(self):
    # pandas-on-Spark counterpart of the `pdf` fixture, rebuilt on every access.
    pandas_frame = self.pdf
    return ps.from_pandas(pandas_frame)
|
|
|
|
|
|
|
|
def test_index_basic(self):
    # Convert frames carrying several kinds of index (int64, int32, float64,
    # float32, string, datetime, categorical, two-level) and check that the
    # pandas-on-Spark index equals the pandas one and has the same class name.
    alphabet = "abcdefghij"
    sample_frames = [
        pd.DataFrame(np.random.randn(10, 5), index=np.random.randint(100, size=10)),
        pd.DataFrame(
            np.random.randn(10, 5),
            index=np.random.randint(100, size=10).astype(np.int32),
        ),
        pd.DataFrame(np.random.randn(10, 5), index=np.random.randn(10)),
        pd.DataFrame(np.random.randn(10, 5), index=np.random.randn(10).astype(np.float32)),
        pd.DataFrame(np.random.randn(10, 5), index=list(alphabet)),
        pd.DataFrame(
            np.random.randn(10, 5),
            index=pd.date_range("2011-01-01", freq="D", periods=10),
        ),
        pd.DataFrame(np.random.randn(10, 5), index=pd.Categorical(list(alphabet))),
        pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")).set_index(["a", "b"]),
    ]

    for pandas_frame in sample_frames:
        spark_frame = ps.from_pandas(pandas_frame)
        self.assert_eq(spark_frame.index, pandas_frame.index)
        self.assert_eq(
            type(spark_frame.index).__name__,
            type(pandas_frame.index).__name__,
        )
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_from_series(self):
    # Index constructors should accept a pandas-on-Spark Series the same way
    # the pandas constructors accept a pandas Series.
    int_pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30])
    int_psser = ps.from_pandas(int_pser)

    self.assert_eq(ps.Index(int_psser), pd.Index(int_pser))
    self.assert_eq(ps.Index(int_psser, dtype="float"), pd.Index(int_pser, dtype="float"))
    self.assert_eq(ps.Index(int_psser, name="x"), pd.Index(int_pser, name="x"))

    if LooseVersion(pd.__version__) < LooseVersion("1.1"):
        # For pandas older than 1.1 the expected index is renamed to "a"
        # (presumably those versions do not carry the Series name over).
        self.assert_eq(ps.Int64Index(int_psser), pd.Int64Index(int_pser).rename("a"))
        self.assert_eq(ps.Float64Index(int_psser), pd.Float64Index(int_pser).rename("a"))
    else:
        self.assert_eq(ps.Int64Index(int_psser), pd.Int64Index(int_pser))
        self.assert_eq(ps.Float64Index(int_psser), pd.Float64Index(int_pser))

    # Same checks for a datetime-valued Series.
    date_pser = pd.Series(
        [datetime(2021, 3, 1), datetime(2021, 3, 2)], name="x", index=[10, 20]
    )
    date_psser = ps.from_pandas(date_pser)

    self.assert_eq(ps.Index(date_psser), pd.Index(date_pser))
    self.assert_eq(ps.DatetimeIndex(date_psser), pd.DatetimeIndex(date_pser))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_from_index(self):
    # Index constructors should accept an existing pandas-on-Spark Index.
    int_pidx = pd.Index([1, 2, 3], name="a")
    int_psidx = ps.from_pandas(int_pidx)

    self.assert_eq(ps.Index(int_psidx), pd.Index(int_pidx))
    self.assert_eq(ps.Index(int_psidx, dtype="float"), pd.Index(int_pidx, dtype="float"))
    self.assert_eq(ps.Index(int_psidx, name="x"), pd.Index(int_pidx, name="x"))

    self.assert_eq(ps.Int64Index(int_psidx), pd.Int64Index(int_pidx))
    self.assert_eq(ps.Float64Index(int_psidx), pd.Float64Index(int_pidx))

    # Same checks starting from a DatetimeIndex.
    date_pidx = pd.DatetimeIndex(["2021-03-01", "2021-03-02"])
    date_psidx = ps.from_pandas(date_pidx)

    self.assert_eq(ps.Index(date_psidx), pd.Index(date_pidx))
    self.assert_eq(ps.DatetimeIndex(date_psidx), pd.DatetimeIndex(date_pidx))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_getattr(self):
    # Looking up an unknown attribute must raise AttributeError naming both
    # the index class and the attribute, for a plain and a datetime index.
    missing_attr = "databricks"
    pattern = "'.*Index' object has no attribute '{}'".format(missing_attr)

    plain_index = self.psdf.index
    with self.assertRaisesRegex(AttributeError, pattern):
        plain_index.__getattr__(missing_attr)

    with self.assertRaisesRegex(AttributeError, pattern):
        dates = pd.date_range("2011-01-01", freq="D", periods=10)
        ps.from_pandas(dates).__getattr__(missing_attr)
|
|
|
|
|
|
|
|
def test_multi_index_getattr(self):
    # Unknown attribute lookups on a MultiIndex must raise AttributeError
    # that names the MultiIndex class and the attribute.
    levels = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    pandas_midx = pd.MultiIndex.from_arrays(levels, names=("number", "color"))
    pandas_frame = pd.DataFrame(np.random.randn(4, 5), pandas_midx)
    spark_midx = ps.from_pandas(pandas_frame).index

    missing_attr = "databricks"
    pattern = "'MultiIndex' object has no attribute '{}'".format(missing_attr)
    with self.assertRaisesRegex(AttributeError, pattern):
        spark_midx.__getattr__(missing_attr)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_to_series(self):
    # Index.to_series must match pandas for unnamed, named and tuple-named
    # indexes, for a derived index, and for a two-level index.
    def compare(spark_index, pandas_index, **kwargs):
        self.assert_eq(spark_index.to_series(**kwargs), pandas_index.to_series(**kwargs))

    pandas_index = self.pdf.index
    spark_index = self.psdf.index

    compare(spark_index, pandas_index)
    compare(spark_index, pandas_index, name="a")

    # With name
    pandas_index.name = "Koalas"
    spark_index.name = "Koalas"
    compare(spark_index, pandas_index)
    compare(spark_index, pandas_index, name=("x", "a"))

    # With tupled name
    pandas_index.name = ("x", "a")
    spark_index.name = ("x", "a")
    compare(spark_index, pandas_index)
    compare(spark_index, pandas_index, name="a")

    # Index produced by arithmetic on the original one
    compare(spark_index + 1, pandas_index + 1)

    # Two-level index; compared with the Arrow conf switched off.
    pandas_midx = self.pdf.set_index("b", append=True).index
    spark_midx = self.psdf.set_index("b", append=True).index

    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        compare(spark_midx, pandas_midx)
        compare(spark_midx, pandas_midx, name="a")

    # A list is not hashable and must be rejected as a Series name.
    unhashable_message = "Series.name must be a hashable type"
    with self.assertRaisesRegex(TypeError, unhashable_message):
        spark_midx.to_series(name=["x", "a"])
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_to_frame(self):
    # Index.to_frame must match pandas for a single-level and a two-level
    # index, with and without `index=False` and custom `name` values.
    # The `name` argument is added in pandas 0.24.
    name_supported = LooseVersion(pd.__version__) >= LooseVersion("0.24")

    # ---- single-level index ----
    pandas_index = self.pdf.index
    spark_index = self.psdf.index

    self.assert_eq(spark_index.to_frame(), pandas_index.to_frame())
    self.assert_eq(spark_index.to_frame(index=False), pandas_index.to_frame(index=False))

    pandas_index.name = "a"
    spark_index.name = "a"

    self.assert_eq(spark_index.to_frame(), pandas_index.to_frame())
    self.assert_eq(spark_index.to_frame(index=False), pandas_index.to_frame(index=False))

    if name_supported:
        self.assert_eq(spark_index.to_frame(name="x"), pandas_index.to_frame(name="x"))
        self.assert_eq(
            spark_index.to_frame(index=False, name="x"),
            pandas_index.to_frame(index=False, name="x"),
        )

        self.assertRaises(TypeError, lambda: spark_index.to_frame(name=["x"]))

        # non-string name
        self.assert_eq(spark_index.to_frame(name=10), pandas_index.to_frame(name=10))
        self.assert_eq(
            spark_index.to_frame(name=("x", 10)),
            pandas_index.to_frame(name=("x", 10)),
        )

    # ---- two-level index ----
    pandas_midx = self.pdf.set_index("b", append=True).index
    spark_midx = self.psdf.set_index("b", append=True).index

    self.assert_eq(spark_midx.to_frame(), pandas_midx.to_frame())
    self.assert_eq(spark_midx.to_frame(index=False), pandas_midx.to_frame(index=False))

    if name_supported:
        self.assert_eq(
            spark_midx.to_frame(name=["x", "y"]),
            pandas_midx.to_frame(name=["x", "y"]),
        )
        self.assert_eq(
            spark_midx.to_frame(name=("x", "y")),
            pandas_midx.to_frame(name=("x", "y")),
        )
        self.assert_eq(
            spark_midx.to_frame(index=False, name=["x", "y"]),
            pandas_midx.to_frame(index=False, name=["x", "y"]),
        )

        self.assertRaises(TypeError, lambda: spark_midx.to_frame(name="x"))
        self.assertRaises(ValueError, lambda: spark_midx.to_frame(name=["x"]))

        # non-string names
        self.assert_eq(
            spark_midx.to_frame(name=[10, 20]),
            pandas_midx.to_frame(name=[10, 20]),
        )
        self.assert_eq(
            spark_midx.to_frame(name=("x", 10)),
            pandas_midx.to_frame(name=("x", 10)),
        )
        self.assert_eq(
            spark_midx.to_frame(name=[("x", 10), ("y", 20)]),
            pandas_midx.to_frame(name=[("x", 10), ("y", 20)]),
        )
|
|
|
|
|
|
|
|
def test_index_names(self):
    # `name` / `names` on a single-level index: defaults, renaming through the
    # index object (checked on the index, the frame and a column Series),
    # and rejection of invalid values.
    self.assertIsNone(self.psdf.index.name)

    named_index = pd.Index(list(range(10)), name="x")
    pandas_frame = pd.DataFrame(
        np.random.randn(10, 5), index=named_index, columns=list("abcde")
    )
    spark_frame = ps.from_pandas(pandas_frame)

    pandas_column = pandas_frame.a
    spark_column = spark_frame.a

    self.assertEqual(spark_frame.index.name, pandas_frame.index.name)
    self.assertEqual(spark_frame.index.names, pandas_frame.index.names)

    pandas_index = pandas_frame.index
    spark_index = spark_frame.index

    # Rename, then reset to None; after each step every view must agree.
    for new_name in ("renamed", None):
        pandas_index.name = new_name
        spark_index.name = new_name
        self.assertEqual(spark_index.name, pandas_index.name)
        self.assertEqual(spark_index.names, pandas_index.names)
        self.assert_eq(spark_index, pandas_index)
        self.assertEqual(spark_frame.index.name, pandas_frame.index.name)
        self.assertEqual(spark_frame.index.names, pandas_frame.index.names)
        self.assertEqual(spark_column.index.names, pandas_column.index.names)

    # `names` must be list-like ...
    with self.assertRaisesRegex(ValueError, "Names must be a list-like"):
        spark_index.names = "hi"

    # ... and its length must equal the number of index levels.
    too_many_names = ["0", "1"]
    length_message = "Length of new names must be {}, got {}".format(
        spark_frame._internal.index_level, len(too_many_names)
    )
    with self.assertRaisesRegex(ValueError, length_message):
        spark_index.names = too_many_names

    # Unhashable names are rejected by the constructor and by the setter.
    unhashable_message = "Index.name must be a hashable type"
    with self.assertRaisesRegex(TypeError, unhashable_message):
        ps.Index([1, 2, 3], name=["0", "1"])
    with self.assertRaisesRegex(TypeError, unhashable_message):
        spark_index.name = ["renamed"]
    with self.assertRaisesRegex(TypeError, unhashable_message):
        spark_index.name = ["0", "1"]
    with self.assertRaisesRegex(TypeError, unhashable_message):
        ps.Index([(1, 2), (3, 4)], names=["a", ["b"]])
|
|
|
|
|
|
|
|
def test_multi_index_names(self):
    # `names` on a MultiIndex can be read and reassigned (including None
    # entries); the singular `name` attribute is not implemented.
    levels = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    pandas_frame = pd.DataFrame(
        np.random.randn(4, 5),
        pd.MultiIndex.from_arrays(levels, names=("number", "color")),
    )
    spark_frame = ps.from_pandas(pandas_frame)

    self.assertEqual(spark_frame.index.names, pandas_frame.index.names)

    pandas_midx = pandas_frame.index
    spark_midx = spark_frame.index

    pandas_midx.names = ["renamed_number", "renamed_color"]
    spark_midx.names = ["renamed_number", "renamed_color"]
    self.assertEqual(spark_midx.names, pandas_midx.names)

    pandas_midx.names = ["renamed_number", None]
    spark_midx.names = ["renamed_number", None]
    self.assertEqual(spark_midx.names, pandas_midx.names)
    self.assert_eq(spark_midx, pandas_midx)

    # Both reading and writing `name` must raise.
    with self.assertRaises(PandasNotImplementedError):
        spark_midx.name
    with self.assertRaises(PandasNotImplementedError):
        spark_midx.name = "renamed"
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_rename(self):
    # Index.rename: the non-inplace form returns a renamed copy and leaves the
    # frame's index names in step with pandas; the inplace form is mirrored
    # on both sides; list names are rejected.
    source_index = pd.Index(list(range(10)), name="x")
    pandas_frame = pd.DataFrame(np.random.randn(10, 5), index=source_index)
    spark_frame = ps.from_pandas(pandas_frame)

    pandas_index = pandas_frame.index
    spark_index = spark_frame.index

    self.assert_eq(spark_index.rename("y"), pandas_index.rename("y"))
    self.assert_eq(spark_frame.index.names, pandas_frame.index.names)

    # non-string names
    self.assert_eq(spark_index.rename(0), pandas_index.rename(0))
    self.assert_eq(spark_index.rename(("y", 0)), pandas_index.rename(("y", 0)))

    # in-place rename
    spark_index.rename("z", inplace=True)
    pandas_index.rename("z", inplace=True)

    self.assert_eq(spark_index, pandas_index)
    self.assert_eq(spark_frame.index.names, pandas_frame.index.names)

    # renaming to None
    self.assert_eq(spark_index.rename(None), pandas_index.rename(None))
    self.assert_eq(spark_frame.index.names, pandas_frame.index.names)

    # a list is not a valid name for a single-level index
    self.assertRaises(TypeError, lambda: spark_index.rename(["x", "y"]))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multi_index_rename(self):
    # MultiIndex.rename: takes one name per level, supports inplace, and
    # rejects scalars, None, and lists of the wrong length.
    levels = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    pandas_frame = pd.DataFrame(
        np.random.randn(4, 5),
        pd.MultiIndex.from_arrays(levels, names=("number", "color")),
    )
    spark_frame = ps.from_pandas(pandas_frame)

    pandas_midx = pandas_frame.index
    spark_midx = spark_frame.index

    self.assert_eq(spark_midx.rename(["n", "c"]), pandas_midx.rename(["n", "c"]))
    self.assert_eq(spark_frame.index.names, pandas_frame.index.names)

    # non-string names
    self.assert_eq(spark_midx.rename([0, 1]), pandas_midx.rename([0, 1]))
    self.assert_eq(
        spark_midx.rename([("x", "a"), ("y", "b")]),
        pandas_midx.rename([("x", "a"), ("y", "b")]),
    )

    # in-place rename
    spark_midx.rename(["num", "col"], inplace=True)
    pandas_midx.rename(["num", "col"], inplace=True)

    self.assert_eq(spark_midx, pandas_midx)
    self.assert_eq(spark_frame.index.names, pandas_frame.index.names)

    # clearing every level name
    self.assert_eq(spark_midx.rename([None, None]), pandas_midx.rename([None, None]))
    self.assert_eq(spark_frame.index.names, pandas_frame.index.names)

    # invalid arguments
    self.assertRaises(TypeError, lambda: spark_midx.rename("number"))
    self.assertRaises(TypeError, lambda: spark_midx.rename(None))
    self.assertRaises(ValueError, lambda: spark_midx.rename(["number"]))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multi_index_levshape(self):
    # `levshape` of a three-level MultiIndex must equal the pandas value.
    entries = [("a", "x", 1), ("b", "y", 2)]
    pandas_midx = pd.MultiIndex.from_tuples(entries)
    spark_midx = ps.from_pandas(pandas_midx)
    self.assertEqual(pandas_midx.levshape, spark_midx.levshape)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_unique(self):
    # Index.unique must return each label once, with or without `level=0`,
    # and reject levels that do not exist on a single-level index.
    psidx = self.psdf.index

    # here the output is different than pandas in terms of order, so the
    # results are sorted before being compared with the expected labels
    expected = [0, 1, 3, 5, 6, 8, 9]

    self.assert_eq(expected, sorted(psidx.unique().to_pandas()))
    self.assert_eq(expected, sorted(psidx.unique(level=0).to_pandas()))

    expected = [1, 2, 4, 6, 7, 9, 10]
    self.assert_eq(expected, sorted((psidx + 1).unique().to_pandas()))

    # The patterns are regular expressions, not globs: the former trailing
    # `*` only made the preceding token optional, and an unescaped `(hi)`
    # was a group rather than literal parentheses. Match the text exactly.
    with self.assertRaisesRegex(IndexError, r"Too many levels"):
        psidx.unique(level=1)

    with self.assertRaisesRegex(KeyError, r"Requested level \(hi\)"):
        psidx.unique(level="hi")
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multi_index_copy(self):
    # Copying a MultiIndex must give an index equal to the pandas copy.
    levels = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    pandas_midx = pd.MultiIndex.from_arrays(levels, names=("number", "color"))
    pandas_frame = pd.DataFrame(np.random.randn(4, 5), pandas_midx)
    spark_frame = ps.from_pandas(pandas_frame)

    self.assert_eq(spark_frame.index.copy(), pandas_frame.index.copy())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_drop_duplicates(self):
    # drop_duplicates must keep the same labels as pandas; results are sorted
    # before comparison.
    def deduplicated(index):
        return index.drop_duplicates().sort_values()

    pandas_index = pd.Index([4, 2, 4, 1, 4, 3])
    spark_index = ps.from_pandas(pandas_index)

    self.assert_eq(deduplicated(spark_index), deduplicated(pandas_index))
    self.assert_eq(deduplicated(spark_index + 1), deduplicated(pandas_index + 1))
|
|
|
|
|
|
|
|
def test_dropna(self):
    # dropna must remove NaN labels exactly as pandas does, both on the
    # original index and on one derived by arithmetic.
    labels = [np.nan, 2, 4, 1, np.nan, 3]
    pandas_index = pd.Index(labels)
    spark_index = ps.from_pandas(pandas_index)

    self.assert_eq(spark_index.dropna(), pandas_index.dropna())
    self.assert_eq((spark_index + 1).dropna(), (pandas_index + 1).dropna())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_symmetric_difference(self):
    # symmetric_difference must match pandas for Index/Index and
    # MultiIndex/MultiIndex pairs (results sorted before comparison), and
    # must refuse an Index/MultiIndex mix.
    pidx1 = pd.Index([1, 2, 3, 4])
    pidx2 = pd.Index([2, 3, 4, 5])
    psidx1 = ps.from_pandas(pidx1)
    psidx2 = ps.from_pandas(pidx2)

    self.assert_eq(
        psidx1.symmetric_difference(psidx2).sort_values(),
        pidx1.symmetric_difference(pidx2).sort_values(),
    )
    self.assert_eq(
        (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
        (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
    )

    pmidx1 = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
    )
    pmidx2 = pd.MultiIndex(
        [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
    )
    psmidx1 = ps.from_pandas(pmidx1)
    psmidx2 = ps.from_pandas(pmidx2)

    self.assert_eq(
        psmidx1.symmetric_difference(psmidx2).sort_values(),
        pmidx1.symmetric_difference(pmidx2).sort_values(),
    )

    idx = ps.Index(["a", "b", "c"])
    midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])

    # The pattern is a regular expression: the former trailing `*` only made
    # the final "t" optional, so it is dropped to match the literal prefix.
    with self.assertRaisesRegex(NotImplementedError, "Doesn't support"):
        idx.symmetric_difference(midx)
|
|
|
|
|
|
|
|
def test_multi_index_symmetric_difference(self):
    # MultiIndex.symmetric_difference against an equal MultiIndex must match
    # pandas, and must refuse a single-level Index as the other operand.
    idx = ps.Index(["a", "b", "c"])
    midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    midx_ = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])

    self.assert_eq(
        midx.symmetric_difference(midx_),
        midx.to_pandas().symmetric_difference(midx_.to_pandas()),
    )

    # The pattern is a regular expression: the former trailing `*` only made
    # the final "t" optional, so it is dropped to match the literal prefix.
    with self.assertRaisesRegex(NotImplementedError, "Doesn't support"):
        midx.symmetric_difference(idx)
|
|
|
|
|
|
|
|
def test_missing(self):
    # Every placeholder registered on the Missing* index classes must raise
    # PandasNotImplementedError with the matching message when reached through
    # a real index of the corresponding kind.
    psdf = ps.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
            "c": pd.date_range("2011-01-01", freq="D", periods=3),
            "d": pd.Categorical(["a", "b", "c"]),
        }
    )

    # (placeholder class, factory for an index of that kind,
    #  whether deprecated *properties* are checked for it)
    targets = [
        (MissingPandasLikeIndex, lambda: psdf.set_index("a").index, True),
        (MissingPandasLikeMultiIndex, lambda: psdf.set_index(["a", "b"]).index, True),
        (MissingPandasLikeDatetimeIndex, lambda: psdf.set_index("c").index, False),
        (MissingPandasLikeCategoricalIndex, lambda: psdf.set_index("d").index, False),
    ]

    unsupported_method = "method.*Index.*{}.*not implemented( yet\\.|\\. .+)"
    deprecated_method = "method.*Index.*{}.*is deprecated"
    unsupported_prop = "property.*Index.*{}.*not implemented( yet\\.|\\. .+)"
    deprecated_prop = "property.*Index.*{}.*is deprecated"

    # Functions: Index, MultiIndex, DatetimeIndex, CategoricalIndex
    method_checks = (
        ("unsupported_function", unsupported_method),
        ("deprecated_function", deprecated_method),
    )
    for missing_cls, make_index, _ in targets:
        functions = inspect.getmembers(missing_cls, inspect.isfunction)
        for marker, template in method_checks:
            for name, func in functions:
                if func.__name__ != marker:
                    continue
                with self.assertRaisesRegex(
                    PandasNotImplementedError, template.format(name)
                ):
                    getattr(make_index(), name)()

    # Properties: Index, MultiIndex, DatetimeIndex, CategoricalIndex
    for missing_cls, make_index, check_deprecated in targets:
        properties = inspect.getmembers(missing_cls, lambda o: isinstance(o, property))
        property_checks = [("unsupported_property", unsupported_prop)]
        if check_deprecated:
            property_checks.append(("deprecated_property", deprecated_prop))
        for marker, template in property_checks:
            for name, prop in properties:
                if prop.fget.__name__ != marker:
                    continue
                with self.assertRaisesRegex(
                    PandasNotImplementedError, template.format(name)
                ):
                    getattr(make_index(), name)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_has_duplicates(self):
    # `has_duplicates` on single-level indexes, with and without a name.
    # Each case: (index labels, index name, expected has_duplicates)
    cases = [
        (("a", "b", "c"), None, False),
        (("a", "a", "c"), "ks", True),
        ((1, 3, 3), "ks", True),
        ((1, 2, 3), None, False),
    ]

    for labels, index_name, expected in cases:
        pandas_frame = pd.DataFrame(
            {"a": [1, 2, 3]}, index=pd.Index(labels, name=index_name)
        )
        spark_frame = ps.from_pandas(pandas_frame)

        self.assertEqual(spark_frame.index.has_duplicates, expected)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_has_duplicates(self):
    # MultiIndex.has_duplicates is True only when a whole row of level values repeats.
    # Each case is (list of level arrays, expected has_duplicates).
    cases = [
        ([list("abc"), list("edf")], False),
        ([list("aac"), list("edf")], False),
        ([list("aac"), list("eef")], True),
        ([[1, 4, 4], [4, 6, 6]], True),
    ]

    for level_arrays, expected in cases:
        psdf = ps.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, index=level_arrays))

        self.assertEqual(psdf.index.has_duplicates, expected)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multi_index_not_supported(self):
    # any() / all() on a two-level index must raise TypeError with a matching message.
    psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    expectations = (
        ("any", "cannot perform any with this index type"),
        ("all", "cannot perform all with this index type"),
    )
    for method_name, message in expectations:
        with self.assertRaisesRegex(TypeError, message):
            getattr(psdf.set_index(["a", "b"]).index, method_name)()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_nlevels(self):
    # A single-level index reports nlevels == 1.
    pandas_frame = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"]))
    spark_index = ps.from_pandas(pandas_frame).index

    self.assertEqual(spark_index.nlevels, 1)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_nlevel(self):
    # An index built from two arrays reports nlevels == 2.
    pandas_frame = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")])
    spark_index = ps.from_pandas(pandas_frame).index

    self.assertEqual(spark_index.nlevels, 2)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_from_arrays(self):
    # ps.MultiIndex.from_arrays must build the same index as pandas from the same arrays.
    arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]]
    expected = pd.MultiIndex.from_arrays(arrays)
    actual = ps.MultiIndex.from_arrays(arrays)

    self.assert_eq(expected, actual)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_swaplevel(self):
    # swaplevel() must match pandas for unnamed, fully named and partially named
    # levels, and must reject unknown level positions and level names.

    # Unnamed levels.
    pmidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]])
    psmidx = ps.from_pandas(pmidx)
    self.assert_eq(pmidx.swaplevel(0, 1), psmidx.swaplevel(0, 1))

    # Every level named.
    pmidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"])
    psmidx = ps.from_pandas(pmidx)
    self.assert_eq(pmidx.swaplevel(0, 1), psmidx.swaplevel(0, 1))

    # Only the first level named; levels addressed by position, negative position or name.
    pmidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None])
    psmidx = ps.from_pandas(pmidx)
    for first, second in [(-2, -1), (0, 1), ("word", 1)]:
        self.assert_eq(pmidx.swaplevel(first, second), psmidx.swaplevel(first, second))

    # Positions outside the two available levels.
    for first, second in [(-3, "word"), (0, 2), (0, -3)]:
        with self.assertRaisesRegex(IndexError, "Too many levels: Index"):
            psmidx.swaplevel(first, second)

    # A level name that does not exist.
    with self.assertRaisesRegex(KeyError, "Level work not found"):
        psmidx.swaplevel(0, "work")
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_droplevel(self):
    # droplevel() must reject invalid requests and match pandas for valid ones,
    # with both string and non-string level names.
    pmidx = pd.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "y", 2)], names=["level1", "level2", "level3"]
    )
    psmidx = ps.from_pandas(pmidx)

    # Each entry is (expected exception, message pattern, level argument).
    invalid_requests = [
        (IndexError, "Too many levels: Index has only 3 levels, not 5", 4),
        (KeyError, "Level level4 not found", "level4"),
        (KeyError, "Level.*level3.*level4.*not found", [("level3", "level4")]),
        (
            ValueError,
            "Cannot remove 4 levels from an index with 3 levels: at least one "
            "level must be left.",
            [0, 0, 1, 2],
        ),
        (
            ValueError,
            "Cannot remove 3 levels from an index with 3 levels: at least one "
            "level must be left.",
            [0, 1, 2],
        ),
    ]
    for error_type, pattern, level in invalid_requests:
        with self.assertRaisesRegex(error_type, pattern):
            psmidx.droplevel(level)

    # Valid requests: by position, by list/tuple, and mixing positions with names.
    for level in (0, [0, 1], (0, 1), [0, "level2"], (0, "level2")):
        self.assert_eq(pmidx.droplevel(level), psmidx.droplevel(level))

    # non-string names
    pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)], names=[1.0, 2.0, 3.0])
    psmidx = ps.from_pandas(pmidx)
    for level in (1.0, [0, 2.0]):
        self.assert_eq(pmidx.droplevel(level), psmidx.droplevel(level))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_fillna(self):
    # fillna() with a scalar must match pandas (approximately), with or without
    # an index name; a list fill value must be rejected.
    pandas_index = pd.Index([1, 2, None])
    spark_index = ps.from_pandas(pandas_index)

    expected = pandas_index.fillna(0)
    actual = spark_index.fillna(0)
    self.assert_eq(expected, actual, almost=True)

    expected_named = pandas_index.rename("name").fillna(0)
    actual_named = spark_index.rename("name").fillna(0)
    self.assert_eq(expected_named, actual_named, almost=True)

    with self.assertRaisesRegex(TypeError, "Unsupported type list"):
        spark_index.fillna([1, 2])
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_drop(self):
    # drop() with a scalar label, a list of labels, and on a derived (shifted) index.
    pandas_index = pd.Index([1, 2, 3])
    spark_index = ps.from_pandas(pandas_index)

    for labels in (1, [1, 2]):
        self.assert_eq(pandas_index.drop(labels), spark_index.drop(labels))

    shifted_pandas = pandas_index + 1
    shifted_spark = spark_index + 1
    self.assert_eq(shifted_pandas.drop([2, 3]), shifted_spark.drop([2, 3]))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_drop(self):
    # MultiIndex.drop() by label, by level position and by level name, plus the
    # error cases for an unknown or ambiguous level.
    pmidx = pd.MultiIndex.from_tuples(
        [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"]
    )
    psmidx = ps.from_pandas(pmidx)

    # Each entry is (labels, extra keyword arguments).
    drop_requests = [
        ("a", {}),
        (["a", "b"], {}),
        (["x", "y"], {"level": 1}),
        (["x", "y"], {"level": "level2"}),
    ]
    for labels, kwargs in drop_requests:
        self.assert_eq(pmidx.drop(labels, **kwargs), psmidx.drop(labels, **kwargs))

    # Renamed levels are addressable by their new names.
    pmidx.names = ["lv1", "lv2"]
    psmidx.names = ["lv1", "lv2"]
    self.assert_eq(pmidx.drop(["x", "y"], level="lv2"), psmidx.drop(["x", "y"], level="lv2"))

    # Level position out of range, then a level name that does not exist.
    with self.assertRaises(IndexError):
        psmidx.drop(["a", "b"], level=2)
    with self.assertRaises(KeyError):
        psmidx.drop(["a", "b"], level="level")

    # Two levels sharing one name: dropping by that name is expected to fail.
    psmidx.names = ["lv", "lv"]
    with self.assertRaises(ValueError):
        psmidx.drop(["x", "y"], level="lv")
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_sort_values(self):
    # sort_values() in both directions for an unnamed Index, a named Index and a
    # named MultiIndex.
    def check_both_orders(pandas_index, spark_index):
        self.assert_eq(pandas_index.sort_values(), spark_index.sort_values())
        self.assert_eq(
            pandas_index.sort_values(ascending=False), spark_index.sort_values(ascending=False)
        )

    pidx = pd.Index([-10, -100, 200, 100])
    psidx = ps.from_pandas(pidx)
    check_both_orders(pidx, psidx)

    pidx.name = "koalas"
    psidx.name = "koalas"
    check_both_orders(pidx, psidx)

    pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    psmidx = ps.from_pandas(pmidx)

    pmidx.names = ["hello", "koalas", "goodbye"]
    psmidx.names = ["hello", "koalas", "goodbye"]
    check_both_orders(pmidx, psmidx)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_drop_duplicates(self):
    # drop_duplicates() for an Index and a MultiIndex; results are sorted before
    # the comparison.
    pandas_indexes = [
        pd.Index([1, 1, 2]),
        pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]),
    ]

    for pandas_index in pandas_indexes:
        spark_index = ps.from_pandas(pandas_index)
        expected = pandas_index.drop_duplicates().sort_values()
        actual = spark_index.drop_duplicates().sort_values()
        self.assert_eq(expected, actual)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_sort(self):
    # sort() must raise TypeError for both Index and MultiIndex, pointing the
    # caller to sort_values.
    single_level = ps.Index([1, 2, 3, 4, 5])
    multi_level = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])

    for index in (single_level, multi_level):
        with self.assertRaisesRegex(
            TypeError, "cannot sort an Index object in-place, use sort_values instead"
        ):
            index.sort()
|
|
|
|
|
|
|
|
def test_multiindex_isna(self):
    # isna/isnull and notna/notnull must raise NotImplementedError on a MultiIndex.
    psmidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])

    # Each entry is (method to call, expected message).
    expectations = [
        ("isna", "isna is not defined for MultiIndex"),
        ("isnull", "isna is not defined for MultiIndex"),
        ("notna", "notna is not defined for MultiIndex"),
        ("notnull", "notna is not defined for MultiIndex"),
    ]
    for method_name, message in expectations:
        with self.assertRaisesRegex(NotImplementedError, message):
            getattr(psmidx, method_name)()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_nunique(self):
    # nunique() on an index that contains a missing value must match pandas.
    # NOTE(review): dropna=True appears to be the default, so both calls below
    # likely exercise the same path and dropna=False is not covered here --
    # confirm whether that is intended.
    pandas_index = pd.Index([1, 1, 2, None])
    spark_index = ps.from_pandas(pandas_index)

    for kwargs in ({}, {"dropna": True}):
        self.assert_eq(pandas_index.nunique(**kwargs), spark_index.nunique(**kwargs))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_nunique(self):
    # NOTE(review): despite its name, this test calls notnull() rather than
    # nunique(); it looks like a copy of the notnull check in
    # test_multiindex_isna. Confirm the intended MultiIndex.nunique() behaviour
    # before changing the assertion.
    psmidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    expected_message = "notna is not defined for MultiIndex"

    with self.assertRaisesRegex(NotImplementedError, expected_message):
        psmidx.notnull()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_rename(self):
    # rename() applied twice in a row must keep pandas and pandas-on-Spark in sync.
    pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    psmidx = ps.from_pandas(pmidx)

    for new_names in (list("ABC"), ["my", "name", "is"]):
        pmidx = pmidx.rename(new_names)
        psmidx = psmidx.rename(new_names)
        self.assert_eq(pmidx, psmidx)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_set_names(self):
    # set_names() for all levels at once and for one level at a time, both as a
    # returned copy and in place.
    pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    psmidx = ps.from_pandas(pmidx)

    # All levels, returning a new index.
    pmidx = pmidx.set_names(["set", "new", "names"])
    psmidx = psmidx.set_names(["set", "new", "names"])
    self.assert_eq(pmidx, psmidx)

    # All levels, in place.
    pmidx.set_names(["set", "new", "names"], inplace=True)
    psmidx.set_names(["set", "new", "names"], inplace=True)
    self.assert_eq(pmidx, psmidx)

    per_level_names = [("first", 0), ("second", 1), ("third", 2)]

    # One level at a time, returning a new index.
    for level_name, level in per_level_names:
        pmidx = pmidx.set_names(level_name, level=level)
        psmidx = psmidx.set_names(level_name, level=level)
        self.assert_eq(pmidx, psmidx)

    # One level at a time, in place.
    for level_name, level in per_level_names:
        pmidx.set_names(level_name, level=level, inplace=True)
        psmidx.set_names(level_name, level=level, inplace=True)
        self.assert_eq(pmidx, psmidx)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_from_tuples(self):
    # ps.MultiIndex.from_tuples must build the same index as pandas from the same tuples.
    tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]
    expected = pd.MultiIndex.from_tuples(tuples)
    actual = ps.MultiIndex.from_tuples(tuples)

    self.assert_eq(expected, actual)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_from_product(self):
    # ps.MultiIndex.from_product must build the same index as pandas from the same iterables.
    iterables = [[0, 1, 2], ["green", "purple"]]
    expected = pd.MultiIndex.from_product(iterables)
    actual = ps.MultiIndex.from_product(iterables)

    self.assert_eq(expected, actual)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_tuple_column_name(self):
    # A frame whose appended index level comes from a tuple-named column must
    # survive the conversion to pandas-on-Spark unchanged.
    columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
    pandas_frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=columns)
    pandas_frame.set_index(("a", "x"), append=True, inplace=True)

    spark_frame = ps.from_pandas(pandas_frame)
    self.assert_eq(pandas_frame, spark_frame)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_len(self):
    # len() must agree with pandas for a large Index and for a small MultiIndex.
    pandas_index = pd.Index(range(10000))
    spark_index = ps.from_pandas(pandas_index)
    self.assert_eq(len(pandas_index), len(spark_index))

    tuples = [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]
    pandas_midx = pd.MultiIndex.from_tuples(tuples)
    spark_midx = ps.MultiIndex.from_tuples(tuples)
    self.assert_eq(len(pandas_midx), len(spark_midx))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_delete(self):
    # delete() must match pandas for valid positions (results are sorted before
    # comparing) and raise IndexError for positions outside the index.
    pidx = pd.Index([10, 9, 8, 7, 6, 7, 8, 9, 10])
    psidx = ps.from_pandas(pidx)

    for loc in (8, -9, [-9, 0, 8]):
        self.assert_eq(pidx.delete(loc).sort_values(), psidx.delete(loc).sort_values())

    # Each entry is (position argument, expected message).
    index_failures = [
        ([0, 9], "index 9 is out of bounds for axis 0 with size 9"),
        ([-10, 0], "index -10 is out of bounds for axis 0 with size 9"),
        (9, "index 9 is out of bounds for axis 0 with size 9"),
        (-10, "index -10 is out of bounds for axis 0 with size 9"),
    ]
    for loc, message in index_failures:
        with self.assertRaisesRegex(IndexError, message):
            psidx.delete(loc)

    # MultiIndex
    tuples = [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]
    pmidx = pd.MultiIndex.from_tuples(tuples)
    psmidx = ps.MultiIndex.from_tuples(tuples)

    for loc in (2, -3, [-3, 0, 2]):
        self.assert_eq(pmidx.delete(loc).sort_values(), psmidx.delete(loc).sort_values())

    multiindex_failures = [
        ([0, 3], "index 3 is out of bounds for axis 0 with size 3"),
        ([-4, 0], "index -4 is out of bounds for axis 0 with size 3"),
        (3, "index 3 is out of bounds for axis 0 with size 3"),
        (-4, "index -4 is out of bounds for axis 0 with size 3"),
    ]
    for loc, message in multiindex_failures:
        with self.assertRaisesRegex(IndexError, message):
            psmidx.delete(loc)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_append(self):
    # append() must match pandas for plain, named and DataFrame-derived indexes
    # and for MultiIndex, in both operand orders; mixing Index with MultiIndex
    # must raise NotImplementedError.

    # Index
    pidx = pd.Index(range(10000))
    psidx = ps.from_pandas(pidx)

    self.assert_eq(pidx.append(pidx), psidx.append(psidx))

    # Index with name
    pidx1 = pd.Index(range(10000), name="a")
    pidx2 = pd.Index(range(10000), name="b")
    psidx1 = ps.from_pandas(pidx1)
    psidx2 = ps.from_pandas(pidx2)

    self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2))

    self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1))

    # Index from DataFrame
    pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"])
    pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}, index=["x", "y", "z"])
    psdf1 = ps.from_pandas(pdf1)
    psdf2 = ps.from_pandas(pdf2)

    pidx1 = pdf1.set_index("a").index
    pidx2 = pdf2.set_index("d").index
    psidx1 = psdf1.set_index("a").index
    psidx2 = psdf2.set_index("d").index

    self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2))

    self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1))

    # Index from DataFrame with MultiIndex columns
    pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]})
    pdf1.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
    pdf2.columns = pd.MultiIndex.from_tuples([("a", "x"), ("d", "y")])
    psdf1 = ps.from_pandas(pdf1)
    psdf2 = ps.from_pandas(pdf2)

    pidx1 = pdf1.set_index(("a", "x")).index
    pidx2 = pdf2.set_index(("d", "y")).index
    psidx1 = psdf1.set_index(("a", "x")).index
    psidx2 = psdf2.set_index(("d", "y")).index

    self.assert_eq(pidx1.append(pidx2), psidx1.append(psidx2))

    self.assert_eq(pidx2.append(pidx1), psidx2.append(psidx1))

    # MultiIndex
    pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    psmidx = ps.from_pandas(pmidx)

    self.assert_eq(pmidx.append(pmidx), psmidx.append(psmidx))

    # MultiIndex with names
    pmidx1 = pd.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["x", "y", "z"]
    )
    pmidx2 = pd.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=["p", "q", "r"]
    )
    psmidx1 = ps.from_pandas(pmidx1)
    psmidx2 = ps.from_pandas(pmidx2)

    self.assert_eq(pmidx1.append(pmidx2), psmidx1.append(psmidx2))

    self.assert_eq(pmidx2.append(pmidx1), psmidx2.append(psmidx1))

    # Resulting level names are checked explicitly for both operand orders.
    # The second check previously repeated the first one verbatim, leaving the
    # reversed order unchecked.
    self.assert_eq(pmidx1.append(pmidx2).names, psmidx1.append(psmidx2).names)

    self.assert_eq(pmidx2.append(pmidx1).names, psmidx2.append(psmidx1).names)

    # Index & MultiIndex currently is not supported
    # psidx is still the unnamed range index from the first section; psmidx is
    # the unnamed MultiIndex built above.
    expected_error_message = r"append\(\) between Index & MultiIndex currently is not supported"
    with self.assertRaisesRegex(NotImplementedError, expected_error_message):
        psidx.append(psmidx)
    with self.assertRaisesRegex(NotImplementedError, expected_error_message):
        psmidx.append(psidx)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_argmin(self):
    # argmin() must match pandas on an Index with repeated minima and must be
    # rejected on a MultiIndex.
    values = [100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]
    pandas_index = pd.Index(values)
    spark_index = ps.from_pandas(pandas_index)

    self.assert_eq(pandas_index.argmin(), spark_index.argmin())

    # MultiIndex
    spark_midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    expected_message = "reduction operation 'argmin' not allowed for this dtype"
    with self.assertRaisesRegex(TypeError, expected_message):
        spark_midx.argmin()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_argmax(self):
    # argmax() must match pandas on an Index with repeated maxima and must be
    # rejected on a MultiIndex.
    values = [100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]
    pandas_index = pd.Index(values)
    spark_index = ps.from_pandas(pandas_index)

    self.assert_eq(pandas_index.argmax(), spark_index.argmax())

    # MultiIndex
    spark_midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    expected_message = "reduction operation 'argmax' not allowed for this dtype"
    with self.assertRaisesRegex(TypeError, expected_message):
        spark_midx.argmax()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_min(self):
    # min() must match pandas for an integer Index, a MultiIndex and a DatetimeIndex.
    pandas_indexes = [
        pd.Index([3, 2, 1]),
        # MultiIndex
        pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]),
        pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]),
    ]

    for pandas_index in pandas_indexes:
        spark_index = ps.from_pandas(pandas_index)
        self.assert_eq(pandas_index.min(), spark_index.min())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_max(self):
    # max() must match pandas for an integer Index, a MultiIndex and a DatetimeIndex.
    pandas_indexes = [
        pd.Index([3, 2, 1]),
        # MultiIndex
        pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]),
        pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]),
    ]

    for pandas_index in pandas_indexes:
        spark_index = ps.from_pandas(pandas_index)
        self.assert_eq(pandas_index.max(), spark_index.max())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_monotonic(self):
    """Test ``is_monotonic_increasing`` / ``is_monotonic_decreasing`` for MultiIndex.

    The test has three parts:

    1. Data whose result is compared directly against pandas.
    2. Data containing nulls: pinned to ``False`` on pandas < 1.1.4, compared
       against pandas otherwise.
    3. Data in which a whole level is null: built with
       ``MultiIndex.from_tuples`` on pandas < 1.1.0, and through a
       pandas-on-Spark DataFrame otherwise.
    """
    datas = []

    # increasing / decreasing ordered each index level with string
    datas.append([("w", "a"), ("x", "b"), ("y", "c"), ("z", "d")])
    datas.append([("w", "d"), ("x", "c"), ("y", "b"), ("z", "a")])
    datas.append([("z", "a"), ("y", "b"), ("x", "c"), ("w", "d")])
    datas.append([("z", "d"), ("y", "c"), ("x", "b"), ("w", "a")])
    # mixed order each index level with string
    datas.append([("z", "a"), ("x", "b"), ("y", "c"), ("w", "d")])
    datas.append([("z", "a"), ("y", "c"), ("x", "b"), ("w", "d")])

    # increasing / decreasing ordered each index level with integer
    datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
    datas.append([(1, 500), (2, 400), (3, 300), (4, 200), (5, 100)])
    datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, 500)])
    datas.append([(5, 500), (4, 400), (3, 300), (2, 200), (1, 100)])
    # mixed order each index level with integer
    datas.append([(1, 500), (3, 400), (2, 300), (4, 200), (5, 100)])
    datas.append([(1, 100), (2, 300), (3, 200), (4, 400), (5, 500)])

    # integer / negative mixed tests
    datas.append([("a", -500), ("b", -400), ("c", -300), ("d", -200), ("e", -100)])
    datas.append([("e", -500), ("d", -400), ("c", -300), ("b", -200), ("a", -100)])
    datas.append([(-5, "a"), (-4, "b"), (-3, "c"), (-2, "d"), (-1, "e")])
    datas.append([(-5, "e"), (-4, "d"), (-3, "c"), (-2, "b"), (-1, "a")])
    datas.append([(-5, "e"), (-3, "d"), (-2, "c"), (-4, "b"), (-1, "a")])
    datas.append([(-5, "e"), (-4, "c"), (-3, "b"), (-2, "d"), (-1, "a")])

    # boolean type tests
    datas.append([(True, True), (True, True)])
    datas.append([(True, True), (True, False)])
    datas.append([(True, False), (True, True)])
    datas.append([(False, True), (False, True)])
    datas.append([(False, True), (False, False)])
    datas.append([(False, False), (False, True)])
    datas.append([(True, True), (False, True)])
    datas.append([(True, True), (False, False)])
    datas.append([(True, False), (False, True)])
    datas.append([(False, True), (True, True)])
    datas.append([(False, True), (True, False)])
    datas.append([(False, False), (True, True)])

    # duplicated index value tests
    datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])
    datas.append([("x", "d"), ("y", "b"), ("y", "c"), ("z", "a")])

    # more depth tests
    datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", "q"), ("z", "a", "r")])
    datas.append([("x", "d", "o"), ("y", "c", "q"), ("y", "c", "p"), ("z", "a", "r")])

    # None type tests (None type is treated as False from pandas >= 1.1.4)
    # Refer https://github.com/pandas-dev/pandas/issues/37220
    datas.append([(1, 100), (2, 200), (None, 300), (4, 400), (5, 500)])
    datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
    datas.append([("x", "d"), ("y", "c"), ("y", None), ("z", "a")])
    datas.append([("x", "d"), ("y", "c"), ("y", "b"), (None, "a")])
    datas.append([("x", "d"), ("y", "b"), ("y", "c"), (None, "a")])
    datas.append([("x", "d", "o"), ("y", "c", "p"), ("y", "c", None), ("z", "a", "r")])

    for data in datas:
        with self.subTest(data=data):
            pmidx = pd.MultiIndex.from_tuples(data)
            psmidx = ps.from_pandas(pmidx)
            self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
            self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

    # datas below return different result depends on pandas version.
    # Because the behavior of handling null values is changed in pandas >= 1.1.4.
    # On pandas < 1.1.4 the expected value is pinned to `False`; on newer pandas
    # the pandas result itself is the reference.
    datas = []
    datas.append([(1, 100), (2, 200), (3, None), (4, 400), (5, 500)])
    datas.append([(1, None), (2, 200), (3, 300), (4, 400), (5, 500)])
    datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (5, None)])
    datas.append([(False, None), (True, True)])
    datas.append([(None, False), (True, True)])
    datas.append([(False, False), (True, None)])
    datas.append([(False, False), (None, True)])
    datas.append([("x", "d"), ("y", None), ("y", None), ("z", "a")])
    datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", None), ("z", "a", "r")])
    datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, 500)])
    datas.append([(1, 100), (2, 200), (3, 300), (4, 400), (None, None)])
    datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
    datas.append([(5, None), (4, 200), (3, 300), (2, 400), (1, 500)])
    datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
    datas.append([(True, None), (True, True)])
    datas.append([(None, True), (True, True)])
    datas.append([(True, True), (None, True)])
    datas.append([(True, True), (True, None)])
    datas.append([(None, 100), (2, 200), (3, 300), (4, 400), (5, 500)])
    datas.append([(None, None), (2, 200), (3, 300), (4, 400), (5, 500)])
    datas.append([("x", "d"), ("y", None), ("y", "c"), ("z", "a")])
    datas.append([("x", "d", "o"), ("y", "c", None), ("y", "c", "q"), ("z", "a", "r")])

    for data in datas:
        with self.subTest(data=data):
            pmidx = pd.MultiIndex.from_tuples(data)
            psmidx = ps.from_pandas(pmidx)
            if LooseVersion(pd.__version__) < LooseVersion("1.1.4"):
                self.assert_eq(psmidx.is_monotonic_increasing, False)
                self.assert_eq(psmidx.is_monotonic_decreasing, False)
            else:
                self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
                self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

    # The datas below are tested another way since they cannot be an arguments for
    # `MultiIndex.from_tuples` in pandas >= 1.1.0.
    # Refer https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for detail.
    if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
        null_level_datas = [
            [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)],
            [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")],
            [(None, None), (None, None), (None, None), (None, None), (None, None)],
            [(None, None)],
        ]
        for data in null_level_datas:
            pmidx = pd.MultiIndex.from_tuples(data)
            psmidx = ps.from_pandas(pmidx)
            self.assert_eq(psmidx.is_monotonic_increasing, False)
            self.assert_eq(psmidx.is_monotonic_decreasing, False)
    else:
        # Each case builds a frame with placeholder values, overwrites the null
        # column(s) with None and takes the resulting MultiIndex.

        # For [(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)]
        psdf = ps.DataFrame({"a": [-5, -4, -3, -2, -1], "b": [1, 1, 1, 1, 1]})
        psdf["b"] = None
        psmidx = psdf.set_index(["a", "b"]).index
        pmidx = psmidx.to_pandas()
        self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
        self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

        # For [(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")]
        psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["e", "c", "b", "d", "a"]})
        psdf["a"] = None
        psmidx = psdf.set_index(["a", "b"]).index
        pmidx = psmidx.to_pandas()
        self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
        self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

        # For [(None, None), (None, None), (None, None), (None, None), (None, None)]
        psdf = ps.DataFrame({"a": [1, 1, 1, 1, 1], "b": [1, 1, 1, 1, 1]})
        psdf["a"] = None
        psdf["b"] = None
        psmidx = psdf.set_index(["a", "b"]).index
        pmidx = psmidx.to_pandas()
        self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
        self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)

        # For [(None, None)]
        psdf = ps.DataFrame({"a": [1], "b": [1]})
        psdf["a"] = None
        psdf["b"] = None
        psmidx = psdf.set_index(["a", "b"]).index
        pmidx = psmidx.to_pandas()
        self.assert_eq(psmidx.is_monotonic_increasing, pmidx.is_monotonic_increasing)
        self.assert_eq(psmidx.is_monotonic_decreasing, pmidx.is_monotonic_decreasing)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_difference(self):
    """Check ``difference`` for Index and MultiIndex against pandas, plus its error cases."""
    # Index
    left_pd = pd.Index([1, 2, 3, 4], name="koalas")
    right_pd = pd.Index([3, 4, 5, 6], name="koalas")
    left_ps = ps.from_pandas(left_pd)
    right_ps = ps.from_pandas(right_pd)

    self.assert_eq(
        left_ps.difference(right_ps).sort_values(), left_pd.difference(right_pd).sort_values()
    )
    # The same values given as a list, a tuple, a set and a dict.
    for other in ([3, 4, 5, 6], (3, 4, 5, 6), {3, 4, 5, 6}, {3: 1, 4: 2, 5: 3, 6: 4}):
        actual = left_ps.difference(other).sort_values()
        expected = left_pd.difference(other).sort_values()
        self.assert_eq(actual, expected)

    # Exceptions for Index
    for invalid in ("1234", 1234, 12.34, None, np.nan):
        with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
            left_ps.difference(invalid)
    sort_message = "The 'sort' keyword only takes the values of None or True; 1 was passed."
    with self.assertRaisesRegex(ValueError, sort_message):
        left_ps.difference(right_ps, sort=1)

    # MultiIndex
    level_names = ["hello", "koalas", "world"]
    left_pd = pd.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=level_names
    )
    right_pd = pd.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "z", 2), ("k", "z", 3)], names=level_names
    )
    left_ps = ps.from_pandas(left_pd)
    right_ps = ps.from_pandas(right_pd)

    self.assert_eq(
        left_ps.difference(right_ps).sort_values(), left_pd.difference(right_pd).sort_values()
    )
    # A set of tuples and a dict keyed by a tuple.
    for other in ({("a", "x", 1)}, {("a", "x", 1): [1, 2, 3]}):
        actual = left_ps.difference(other).sort_values()
        expected = left_pd.difference(other).sort_values()
        self.assert_eq(actual, expected)

    # Exceptions for MultiIndex
    with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
        left_ps.difference(["b", "z", "2"])
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_repeat(self):
    """Check ``repeat`` for Index and MultiIndex against pandas, plus invalid arguments."""
    pandas_index = pd.Index(["a", "b", "c"])
    spark_index = ps.from_pandas(pandas_index)

    for count in (3, 0):
        self.assert_eq(
            spark_index.repeat(count).sort_values(), pandas_index.repeat(count).sort_values()
        )
    # Repeat the result of an arithmetic expression on the index as well.
    self.assert_eq(
        (spark_index + "x").repeat(3).sort_values(),
        (pandas_index + "x").repeat(3).sort_values(),
    )

    with self.assertRaises(ValueError):
        spark_index.repeat(-1)
    with self.assertRaises(TypeError):
        spark_index.repeat("abc")

    pandas_midx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
    spark_midx = ps.from_pandas(pandas_midx)

    self.assert_eq(spark_midx.repeat(3).sort_values(), pandas_midx.repeat(3).sort_values())
    # The empty result is compared with almost=True.
    self.assert_eq(
        spark_midx.repeat(0).sort_values(), pandas_midx.repeat(0).sort_values(), almost=True
    )

    with self.assertRaises(ValueError):
        spark_midx.repeat(-1)
    with self.assertRaises(TypeError):
        spark_midx.repeat("abc")
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_unique(self):
    """Check that ``unique`` matches pandas for an Index and a MultiIndex with duplicates.

    Results are sorted on both sides before comparison.
    """
    pidx = pd.Index(["a", "b", "a"])
    psidx = ps.from_pandas(pidx)

    self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())

    pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")])
    psmidx = ps.from_pandas(pmidx)

    self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_asof(self):
    """Check ``asof`` on increasing and decreasing indexes, plus its error cases.

    Some labels are compared through ``repr()`` instead of by value; the last
    group shows the expected value there is ``pd.NaT`` (presumably a no-match
    result that does not compare equal to itself -- confirm).
    """
    # Increasing values
    increasing = ["2013-12-31", "2014-01-02", "2014-01-03"]
    pidx = pd.Index(increasing)
    psidx = ps.from_pandas(pidx)

    for label in ("2014-01-01", "2014-01-02"):
        self.assert_eq(psidx.asof(label), pidx.asof(label))
    self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02")))
    self.assert_eq(psidx.asof("2014-01-04"), pidx.asof("2014-01-04"))

    pidx = pd.DatetimeIndex(increasing)
    psidx = ps.from_pandas(pidx)

    for label in ("2014-01-01", "2014-01-02"):
        self.assert_eq(psidx.asof(label), pidx.asof(label))
    self.assert_eq(repr(psidx.asof("1999-01-02")), repr(pidx.asof("1999-01-02")))

    # Decreasing values
    decreasing = ["2014-01-03", "2014-01-02", "2013-12-31"]
    pidx = pd.Index(decreasing)
    psidx = ps.from_pandas(pidx)

    for label in ("2014-01-01", "2014-01-02", "1999-01-02"):
        self.assert_eq(psidx.asof(label), pidx.asof(label))
    self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pidx.asof("2015-01-02")))

    pidx = pd.DatetimeIndex(decreasing)
    psidx = ps.from_pandas(pidx)

    # TODO: a pandas bug?
    # The direct comparisons against pidx.asof(...) for the four labels below are
    # disabled; the expected values are hard-coded instead.
    hard_coded = [
        ("2014-01-01", pd.Timestamp("2014-01-02 00:00:00")),
        ("2014-01-02", pd.Timestamp("2014-01-02 00:00:00")),
        ("1999-01-02", pd.Timestamp("2013-12-31 00:00:00")),
    ]
    for label, expected in hard_coded:
        self.assert_eq(psidx.asof(label), expected)
    self.assert_eq(repr(psidx.asof("2015-01-02")), repr(pd.NaT))

    # Not increasing, neither decreasing (ValueError)
    psidx = ps.Index(["2013-12-31", "2015-01-02", "2014-01-03"])
    with self.assertRaises(ValueError):
        psidx.asof("2013-12-31")

    psmidx = ps.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")])
    with self.assertRaises(NotImplementedError):
        psmidx.asof(("a", "b"))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_union(self):
    """Check ``union`` for Index and MultiIndex against pandas.

    Covers unions with another index, a list and a Series, with the default
    sort and with ``sort=False`` (results sorted before comparison), indexes
    holding duplicated values, and the unsupported argument combinations.
    """
    # Index
    pidx1 = pd.Index([1, 2, 3, 4])
    pidx2 = pd.Index([3, 4, 5, 6])
    psidx1 = ps.from_pandas(pidx1)
    psidx2 = ps.from_pandas(pidx2)

    self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
    self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
    self.assert_eq(psidx1.union([3, 4, 5, 6]), pidx1.union([3, 4, 5, 6]), almost=True)
    self.assert_eq(psidx2.union([1, 2, 3, 4]), pidx2.union([1, 2, 3, 4]), almost=True)
    self.assert_eq(
        psidx1.union(ps.Series([3, 4, 5, 6])), pidx1.union(pd.Series([3, 4, 5, 6])), almost=True
    )
    self.assert_eq(
        psidx2.union(ps.Series([1, 2, 3, 4])), pidx2.union(pd.Series([1, 2, 3, 4])), almost=True
    )

    # Testing if the result is correct after sort=False.
    # The `sort` argument is added in pandas 0.24.
    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        self.assert_eq(
            psidx1.union(psidx2, sort=False).sort_values(),
            pidx1.union(pidx2, sort=False).sort_values(),
        )
        self.assert_eq(
            psidx2.union(psidx1, sort=False).sort_values(),
            pidx2.union(pidx1, sort=False).sort_values(),
        )
        self.assert_eq(
            psidx1.union([3, 4, 5, 6], sort=False).sort_values(),
            pidx1.union([3, 4, 5, 6], sort=False).sort_values(),
            almost=True,
        )
        self.assert_eq(
            psidx2.union([1, 2, 3, 4], sort=False).sort_values(),
            pidx2.union([1, 2, 3, 4], sort=False).sort_values(),
            almost=True,
        )
        self.assert_eq(
            psidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(),
            pidx1.union(pd.Series([3, 4, 5, 6]), sort=False).sort_values(),
            almost=True,
        )
        self.assert_eq(
            psidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(),
            pidx2.union(pd.Series([1, 2, 3, 4]), sort=False).sort_values(),
            almost=True,
        )

    # Duplicated values for Index is supported in pandas >= 1.0.0
    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        pidx1 = pd.Index([1, 2, 3, 4, 3, 4, 3, 4])
        pidx2 = pd.Index([3, 4, 3, 4, 5, 6])
        psidx1 = ps.from_pandas(pidx1)
        psidx2 = ps.from_pandas(pidx2)

        self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
        self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
        # Both sides must receive the same `other` values (those of pidx2) so
        # that the comparison is between like inputs.
        self.assert_eq(
            psidx1.union([3, 4, 3, 4, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True
        )
        self.assert_eq(
            psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
            pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
            almost=True,
        )
        self.assert_eq(
            psidx1.union(ps.Series([3, 4, 3, 4, 5, 6])),
            pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])),
            almost=True,
        )
        self.assert_eq(
            psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
            pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
            almost=True,
        )

    # MultiIndex
    pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")])
    pmidx2 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")])
    pmidx3 = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)])
    pmidx4 = pd.MultiIndex.from_tuples([(1, 3), (1, 4), (1, 5), (1, 6)])
    psmidx1 = ps.from_pandas(pmidx1)
    psmidx2 = ps.from_pandas(pmidx2)
    psmidx3 = ps.from_pandas(pmidx3)
    psmidx4 = ps.from_pandas(pmidx4)

    self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
    self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
    self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
    self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
    self.assert_eq(
        psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
        pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
    )
    self.assert_eq(
        psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
        pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
    )
    self.assert_eq(
        psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
        pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
    )
    self.assert_eq(
        psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
        pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
    )

    # Testing if the result is correct after sort=False.
    # The `sort` argument is added in pandas 0.24.
    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        self.assert_eq(
            psmidx1.union(psmidx2, sort=False).sort_values(),
            pmidx1.union(pmidx2, sort=False).sort_values(),
        )
        self.assert_eq(
            psmidx2.union(psmidx1, sort=False).sort_values(),
            pmidx2.union(pmidx1, sort=False).sort_values(),
        )
        self.assert_eq(
            psmidx3.union(psmidx4, sort=False).sort_values(),
            pmidx3.union(pmidx4, sort=False).sort_values(),
        )
        self.assert_eq(
            psmidx4.union(psmidx3, sort=False).sort_values(),
            pmidx4.union(pmidx3, sort=False).sort_values(),
        )
        self.assert_eq(
            psmidx1.union(
                [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False
            ).sort_values(),
            pmidx1.union(
                [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False
            ).sort_values(),
        )
        self.assert_eq(
            psmidx2.union(
                [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False
            ).sort_values(),
            pmidx2.union(
                [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False
            ).sort_values(),
        )
        self.assert_eq(
            psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(),
            pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(),
        )
        self.assert_eq(
            psmidx4.union(
                [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False
            ).sort_values(),
            pmidx4.union(
                [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False
            ).sort_values(),
        )

    # Unsupported combinations of Index / MultiIndex / other inputs.
    self.assertRaises(NotImplementedError, lambda: psidx1.union(psmidx1))
    self.assertRaises(TypeError, lambda: psmidx1.union(psidx1))
    self.assertRaises(TypeError, lambda: psmidx1.union(["x", "a"]))
    self.assertRaises(ValueError, lambda: psidx1.union(ps.range(2)))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_take(self):
    """Check ``take`` for Index and MultiIndex against pandas, plus invalid index types."""
    # Index
    pandas_index = pd.Index([100, 200, 300, 400, 500], name="Koalas")
    spark_index = ps.from_pandas(pandas_index)

    # Positive and negative positions, as a list and as the matching range.
    for positions in ([0, 2, 4], range(0, 5, 2), [-4, -2, 0], range(-4, 1, 2)):
        actual = spark_index.take(positions).sort_values()
        expected = pandas_index.take(positions).sort_values()
        self.assert_eq(actual, expected)

    # MultiIndex
    pandas_midx = pd.MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"]
    )
    spark_midx = ps.from_pandas(pandas_midx)

    for positions in ([0, 2], range(0, 4, 2), [-2, 0], range(-2, 1, 2)):
        actual = spark_midx.take(positions).sort_values()
        expected = pandas_midx.take(positions).sort_values()
        self.assert_eq(actual, expected)

    # Checking the type of indices.
    for target in (spark_index, spark_midx):
        for invalid in (1, "1", {1, 2}, {1: None, 2: None}):
            with self.assertRaises(TypeError):
                target.take(invalid)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_get_level_values(self):
    """Check ``get_level_values`` on a single-level Index, by position and by name."""
    pandas_index = pd.Index([1, 2, 3], name="ks")
    spark_index = ps.from_pandas(pandas_index)

    for level in (0, "ks"):
        actual = spark_index.get_level_values(level)
        expected = pandas_index.get_level_values(level)
        self.assert_eq(actual, expected)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_get_level_values(self):
    """Check ``get_level_values`` on a two-level MultiIndex, by position and by name."""
    pandas_midx = pd.MultiIndex.from_tuples(
        [("a", "d"), ("b", "e"), ("c", "f")], names=["level_1", "level_2"]
    )
    spark_midx = ps.from_pandas(pandas_midx)

    for level in (0, 1, "level_1", "level_2"):
        actual = spark_midx.get_level_values(level)
        expected = pandas_midx.get_level_values(level)
        self.assert_eq(actual, expected)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_get_level_number(self):
    """Check ``_get_level_number`` for valid levels and for each rejected kind of level."""
    # name of two levels are the same, which is None
    unnamed = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")])
    duplicate_message = "The name None occurs multiple times, use a level number"
    with self.assertRaisesRegex(ValueError, duplicate_message):
        unnamed.index._get_level_number(None)

    named_levels = pd.MultiIndex.from_arrays((list("abc"), list("def")))
    named_levels.names = ["level_1", "level_2"]
    psdf = ps.DataFrame({"a": [1, 2, 3]}, index=named_levels)

    # level is not int and not in the level name list
    with self.assertRaisesRegex(KeyError, "Level lv_3 not found"):
        psdf.index._get_level_number("lv_3")

    # level is int, but an invalid negative number, then an invalid positive number
    for out_of_range in (-3, 3):
        with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"):
            psdf.index._get_level_number(out_of_range)

    # Correct and valid inputs in numbers, followed by valid inputs as level names
    expectations = [
        (-2, 0),
        (-1, 1),
        (0, 0),
        (1, 1),
        ("level_1", 0),
        ("level_2", 1),
    ]
    for level, expected in expectations:
        self.assertEqual(expected, psdf.index._get_level_number(level))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_holds_integer(self):
    # `holds_integer()` must agree with pandas for each kind of index below.
    pandas_indexes = [
        pd.Index([1, 2, 3, 4]),
        pd.Index([1.1, 2.2, 3.3, 4.4]),
        pd.Index(["A", "B", "C", "D"]),
        # MultiIndex
        pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")]),
        pd.MultiIndex.from_tuples([(10, 1), (10, 2), (20, 1)]),
    ]
    for expected in pandas_indexes:
        actual = ps.from_pandas(expected)
        self.assert_eq(expected.holds_integer(), actual.holds_integer())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_abs(self):
    # The builtin abs() and np.abs() must match pandas on a flat index;
    # abs() on a MultiIndex must raise TypeError.
    expected = pd.Index([-2, -1, 0, 1])
    actual = ps.from_pandas(expected)
    for absolute in (abs, np.abs):
        self.assert_eq(absolute(expected), absolute(actual))

    psmidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"])
    with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"):
        abs(psmidx)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_hasnans(self):
    # `hasnans` must agree with pandas for boolean and timestamp data, with and
    # without a missing value, and must raise NotImplementedError for a MultiIndex.

    # BooleanType
    pidx = pd.Index([True, False, True, True])
    psidx = ps.from_pandas(pidx)
    self.assert_eq(pidx.hasnans, psidx.hasnans)

    pidx = pd.Index([True, False, np.nan, True])
    psidx = ps.from_pandas(pidx)
    self.assert_eq(pidx.hasnans, psidx.hasnans)

    # TimestampType (checked through Series.hasnans rather than an Index)
    pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)])
    psser = ps.from_pandas(pser)
    self.assert_eq(pser.hasnans, psser.hasnans)

    pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")])
    psser = ps.from_pandas(pser)
    self.assert_eq(pser.hasnans, psser.hasnans)

    # Not supported for MultiIndex
    psmidx = ps.Index([("a", 1), ("b", 2)])
    # `hasnans` is read as an attribute everywhere above, so access it the same
    # way here; the attribute access itself is what is expected to raise.
    self.assertRaises(NotImplementedError, lambda: psmidx.hasnans)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_intersection(self):
    # Compare `intersection` against pandas for Index, MultiIndex, Series and plain
    # container arguments. pandas-on-Spark results are sorted before comparing.
    # When pandas is older than 1.2.0, several cases are checked against
    # hard-coded expectations instead of the pandas result.
    legacy_pandas = LooseVersion(pd.__version__) < LooseVersion("1.2.0")

    pidx = pd.Index([1, 2, 3, 4], name="Koalas")
    psidx = ps.from_pandas(pidx)

    def check(pother, psother, **kwargs):
        # Compare on the index itself, then on the index shifted by one.
        self.assert_eq(
            pidx.intersection(pother), psidx.intersection(psother).sort_values(), **kwargs
        )
        self.assert_eq(
            (pidx + 1).intersection(pother),
            (psidx + 1).intersection(psother).sort_values(),
            **kwargs,
        )

    def check_legacy(psother):
        # Fixed expectations used in place of the pandas result on old pandas.
        self.assert_eq(psidx.intersection(psother).sort_values(), ps.Index([3, 4], name="Koalas"))
        self.assert_eq(
            (psidx + 1).intersection(psother).sort_values(), ps.Index([3, 4, 5], name="Koalas")
        )

    # other = Index: same name first, then a different name
    for other_name in ("Koalas", "Databricks"):
        pother = pd.Index([3, 4, 5, 6], name=other_name)
        check(pother, ps.from_pandas(pother))

    # other = Index obtained from a DataFrame
    pidx_other_from_frame = pd.DataFrame({"a": [3, 4, 5, 6]}).set_index("a").index
    psidx_other_from_frame = ps.from_pandas(pidx_other_from_frame)
    check(pidx_other_from_frame, psidx_other_from_frame)

    # other = MultiIndex
    pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    psmidx = ps.from_pandas(pmidx)
    if legacy_pandas:
        for psbase in (psidx, psidx + 1):
            self.assert_eq(
                psbase.intersection(psmidx).sort_values(),
                psidx._psdf.head(0).index.rename(None),
                almost=True,
            )
    else:
        check(pmidx, psmidx, almost=True)

    # other = Series: unnamed first, then named
    for series_name in (None, "Databricks"):
        pser = pd.Series([3, 4, 5, 6], name=series_name)
        psser = ps.from_pandas(pser)
        if legacy_pandas:
            check_legacy(psser)
        else:
            check(pser, psser)

    # other = list, tuple, dict
    for other in ([3, 4, 5, 6], (3, 4, 5, 6), {3: None, 4: None, 5: None, 6: None}):
        if legacy_pandas:
            check_legacy(other)
        else:
            check(other, other)

    # MultiIndex / other = Index
    self.assert_eq(
        pmidx.intersection(pidx), psmidx.intersection(psidx).sort_values(), almost=True
    )
    self.assert_eq(
        pmidx.intersection(pidx_other_from_frame),
        psmidx.intersection(psidx_other_from_frame).sort_values(),
        almost=True,
    )

    # MultiIndex / other = MultiIndex
    pmidx_other = pd.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
    psmidx_other = ps.from_pandas(pmidx_other)
    self.assert_eq(
        pmidx.intersection(pmidx_other), psmidx.intersection(psmidx_other).sort_values()
    )

    # MultiIndex / other = list, tuple, dict
    tuple_containers = (
        [("c", "z"), ("d", "w")],
        (("c", "z"), ("d", "w")),
        {("c", "z"): None, ("d", "w"): None},
    )
    for other in tuple_containers:
        self.assert_eq(pmidx.intersection(other), psmidx.intersection(other).sort_values())

    # Invalid arguments: (exception type, message pattern, call that must raise)
    multi_msg = "other must be a MultiIndex or a list of tuples"
    one_dim_msg = "Index data must be 1-dimensional"
    failing_calls = [
        (TypeError, "Input must be Index or array-like", lambda: psidx.intersection(4)),
        (TypeError, multi_msg, lambda: psmidx.intersection(4)),
        (TypeError, multi_msg, lambda: psmidx.intersection(ps.Series([3, 4, 5, 6]))),
        (
            ValueError,
            one_dim_msg,
            lambda: psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})),
        ),
        (
            ValueError,
            one_dim_msg,
            lambda: psmidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})),
        ),
    ]
    for exc_type, pattern, call in failing_calls:
        with self.assertRaisesRegex(exc_type, pattern):
            call()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_item(self):
    # `item()` on a one-element Index or MultiIndex must return the same value as
    # pandas; on an index with more than one element it must raise ValueError.
    pidx = pd.Index([10])
    psidx = ps.from_pandas(pidx)

    self.assert_eq(pidx.item(), psidx.item())

    # with timestamp
    pidx = pd.Index([datetime(1990, 3, 9)])
    psidx = ps.from_pandas(pidx)

    self.assert_eq(pidx.item(), psidx.item())

    # MultiIndex
    pmidx = pd.MultiIndex.from_tuples([("a", "x")])
    psmidx = ps.from_pandas(pmidx)

    self.assert_eq(pmidx.item(), psmidx.item())

    # MultiIndex with timestamp
    pmidx = pd.MultiIndex.from_tuples([(datetime(1990, 3, 9), datetime(2019, 8, 15))])
    psmidx = ps.from_pandas(pmidx)

    # Assert on the MultiIndex pair built for this case, not on the flat
    # `pidx`/`psidx` pair left over from the timestamp case above.
    self.assert_eq(pmidx.item(), psmidx.item())

    err_msg = "can only convert an array of size 1 to a Python scalar"
    with self.assertRaisesRegex(ValueError, err_msg):
        ps.Index([10, 20]).item()
    with self.assertRaisesRegex(ValueError, err_msg):
        ps.MultiIndex.from_tuples([("a", "x"), ("b", "y")]).item()
|
|
|
|
|
|
|
|
def test_inferred_type(self):
    # `inferred_type` must agree with pandas for each kind of index below.
    pandas_indexes = [
        pd.Index([1, 2, 3]),  # Integer
        pd.Index([1.0, 2.0, 3.0]),  # Floating
        pd.Index(["a", "b", "c"]),  # String
        pd.Index([True, False, True, False]),  # Boolean
        pd.MultiIndex.from_tuples([("a", "x")]),  # MultiIndex
    ]
    for expected in pandas_indexes:
        actual = ps.from_pandas(expected)
        self.assert_eq(expected.inferred_type, actual.inferred_type)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multi_index_from_index(self):
    # `ps.Index` given a list of tuples must produce a `ps.MultiIndex` equal to
    # what `pd.Index` builds, both without and with explicit `names`.
    tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]

    for extra_kwargs in ({}, {"names": ["Hello", "Koalas"]}):
        expected = pd.Index(tuples, **extra_kwargs)
        actual = ps.Index(tuples, **extra_kwargs)

        self.assertTrue(isinstance(actual, ps.MultiIndex))
        self.assert_eq(expected, actual)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
@unittest.skipIf(
    LooseVersion(pd.__version__) < LooseVersion("0.24"),
    "MultiIndex.from_frame is new in pandas 0.24",
)
def test_multiindex_from_frame(self):
    # `ps.MultiIndex.from_frame` must match `pd.MultiIndex.from_frame` for plain
    # and MultiIndex columns and for several kinds of `names`; invalid input
    # must raise TypeError.
    rows = [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]]

    pdf = pd.DataFrame(rows, columns=["a", "b"])
    psdf = ps.from_pandas(pdf)
    self.assert_eq(pd.MultiIndex.from_frame(pdf), ps.MultiIndex.from_frame(psdf))

    # Specify `names`, first as a list and then as a tuple
    for level_names in (["state", "observation"], ("state", "observation")):
        self.assert_eq(
            pd.MultiIndex.from_frame(pdf, names=level_names),
            ps.MultiIndex.from_frame(psdf, names=level_names),
        )

    # MultiIndex columns
    pdf.columns = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x")])
    psdf = ps.from_pandas(pdf)
    self.assert_eq(pd.MultiIndex.from_frame(pdf), ps.MultiIndex.from_frame(psdf))

    # tuples for names
    tuple_names = [("a", "w"), ("b", "x")]
    self.assert_eq(
        pd.MultiIndex.from_frame(pdf, names=tuple_names),
        ps.MultiIndex.from_frame(psdf, names=tuple_names),
    )

    # A plain dict is not a DataFrame.
    with self.assertRaisesRegex(TypeError, "Input must be a DataFrame"):
        ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # A bare string for `names` is rejected.
    with self.assertRaises(TypeError):
        ps.MultiIndex.from_frame(psdf, names="ab")

    # non-string names
    for odd_names in ([0, 1], [("x", 0), ("y", 1)]):
        self.assert_eq(
            ps.MultiIndex.from_frame(psdf, names=odd_names),
            pd.MultiIndex.from_frame(pdf, names=odd_names),
        )

    # Frame built without explicit column labels
    pdf = pd.DataFrame(rows)
    psdf = ps.from_pandas(pdf)
    self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_is_type_compatible(self):
    # `is_type_compatible(kind)` must agree with pandas for every combination of
    # the index kinds and kind strings below.
    kinds = ["integer", "floating", "string", "boolean"]
    pandas_indexes = [
        pd.Index([1, 2, 3]),  # Integer
        pd.Index([1.0, 2.0, 3.0]),  # Floating
        pd.Index(["a", "b", "c"]),  # String
        pd.Index([True, False, True, False]),  # Boolean
        pd.MultiIndex.from_tuples([("a", "x")]),  # MultiIndex
    ]
    for expected in pandas_indexes:
        actual = ps.from_pandas(expected)
        for kind in kinds:
            self.assert_eq(expected.is_type_compatible(kind), actual.is_type_compatible(kind))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_asi8(self):
    # `asi8` must agree with pandas for each kind of index below.

    # Integer, as-is and after casting to several integer widths
    int_pidx = pd.Index([1, 2, 3])
    int_psidx = ps.from_pandas(int_pidx)
    self.assert_eq(int_pidx.asi8, int_psidx.asi8)
    for width in ("int", "int16", "int8"):
        self.assert_eq(int_pidx.astype(width).asi8, int_psidx.astype(width).asi8)

    remaining = [
        pd.Index([1, 2, None, 4, 5]),  # Integer with missing value
        pd.date_range(end="1/1/2018", periods=3),  # Datetime
        pd.Index([1.0, 2.0, 3.0]),  # Floating
        pd.Index(["a", "b", "c"]),  # String
        pd.Index([True, False, True, False]),  # Boolean
        pd.MultiIndex.from_tuples([(1, 2)]),  # MultiIndex
    ]
    for expected in remaining:
        actual = ps.from_pandas(expected)
        self.assert_eq(expected.asi8, actual.asi8)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_is_unique(self):
    # Each case is (index values, index name, expected `is_unique`).
    cases = [
        (("a", "b", "c"), None, True),
        (("a", "a", "c"), "ks", False),
        ((1, 3, 3), "ks", False),
        ((1, 2, 3), None, True),
    ]
    for values, name, unique in cases:
        pandas_frame = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(values, name=name))
        spark_frame = ps.from_pandas(pandas_frame)

        self.assertEqual(spark_frame.index.is_unique, unique)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_multiindex_is_unique(self):
    # Each case is (list of level arrays, expected `is_unique`).
    cases = [
        ([list("abc"), list("edf")], True),
        ([list("aac"), list("edf")], True),
        ([list("aac"), list("eef")], False),
        ([[1, 4, 4], [4, 6, 6]], False),
    ]
    for level_arrays, unique in cases:
        pandas_frame = pd.DataFrame({"a": [1, 2, 3]}, index=level_arrays)
        spark_frame = ps.from_pandas(pandas_frame)

        self.assertEqual(spark_frame.index.is_unique, unique)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_view(self):
    # `view()` must agree with pandas for a named flat index and a MultiIndex.
    pandas_indexes = [
        pd.Index([1, 2, 3, 4], name="Koalas"),
        # MultiIndex
        pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]),
    ]
    for expected in pandas_indexes:
        actual = ps.from_pandas(expected)

        self.assert_eq(expected.view(), actual.view())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_insert(self):
    # `insert(loc, item)` must agree with pandas for flat indexes of several
    # dtypes and for a MultiIndex; an out-of-bounds MultiIndex insert must raise.

    # Each flat case is (pandas index, item to insert).
    flat_cases = [
        (pd.Index([1, 2, 3], name="Koalas"), 100),  # Integer
        (pd.Index([1.0, 2.0, 3.0], name="Koalas"), 100.0),  # Floating
        (pd.Index(["a", "b", "c"], name="Koalas"), "x"),  # String
        (pd.Index([True, False, True, False], name="Koalas"), True),  # Boolean
    ]
    for pidx, item in flat_cases:
        psidx = ps.from_pandas(pidx)
        for loc in (1, -1, 100, -100):
            self.assert_eq(pidx.insert(loc, item), psidx.insert(loc, item))

    # MultiIndex
    pmidx = pd.MultiIndex.from_tuples(
        [("a", "x"), ("b", "y"), ("c", "z")], names=["Hello", "Koalas"]
    )
    psmidx = ps.from_pandas(pmidx)
    for loc in (2, -1):
        self.assert_eq(pmidx.insert(loc, ("h", "j")), psmidx.insert(loc, ("h", "j")))

    out_of_bounds_msg = "index 4 is out of bounds for axis 0 with size 3"
    with self.assertRaisesRegex(IndexError, out_of_bounds_msg):
        psmidx.insert(4, ("b", "y"))
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_astype(self):
    # `astype` must agree with pandas across integer, float, bool and string
    # target dtypes, handle indexes containing missing values, and raise
    # TypeError for an unknown dtype name.
    pidx = pd.Index([10, 20, 15, 30, 45], name="x")
    psidx = ps.Index(pidx)

    # The removed NumPy aliases are avoided: `np.int` and `np.float` were plain
    # aliases of the builtins (the `int` entry already covers `np.int`), and
    # `np.unicode_` was an alias of `np.str_`.
    target_dtypes = [
        int,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.byte,
        "int",
        "int8",
        "int16",
        "int32",
        "int64",
        "b",
        "byte",
        "i",
        "long",
        "short",
        float,
        np.float32,
        np.float64,
        "float",
        "float32",
        "float64",
        "double",
        "f",
        bool,
        "bool",
        "?",
        np.str_,
        "str",
        "U",
    ]
    for dtype in target_dtypes:
        self.assert_eq(psidx.astype(dtype), pidx.astype(dtype))

    # NOTE(review): this index is built but nothing is asserted on it; it is
    # kept so construction from an index with a missing value still runs.
    pidx = pd.Index([10, 20, 15, 30, 45, None], name="x")
    psidx = ps.Index(pidx)

    pidx = pd.Index(["hi", "hi ", " ", " \t", "", None], name="x")
    psidx = ps.Index(pidx)

    self.assert_eq(psidx.astype(bool), pidx.astype(bool))
    self.assert_eq(psidx.astype(str).to_numpy(), ["hi", "hi ", " ", " \t", "", "None"])

    pidx = pd.Index([True, False, None], name="x")
    psidx = ps.Index(pidx)

    self.assert_eq(psidx.astype(bool), pidx.astype(bool))

    pidx = pd.Index(["2020-10-27"], name="x")
    psidx = ps.Index(pidx)

    self.assert_eq(psidx.astype("datetime64[ns]"), pidx.astype("datetime64[ns]"))

    # "int63" is not a valid dtype name.
    with self.assertRaisesRegex(TypeError, "not understood"):
        psidx.astype("int63")
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_to_list(self):
    # `tolist()` must agree with pandas for a flat index and a MultiIndex.
    pandas_indexes = [
        # Index
        pd.Index([1, 2, 3, 4, 5]),
        # MultiIndex
        pd.MultiIndex.from_tuples([(1, "red"), (1, "blue"), (2, "red"), (2, "green")]),
    ]
    converted = [(ps.from_pandas(expected), expected) for expected in pandas_indexes]

    for actual, expected in converted:
        self.assert_eq(actual.tolist(), expected.tolist())
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_index_ops(self):
    # Arithmetic on a single index, and across two levels of one MultiIndex,
    # must agree with pandas.

    # Unnamed index first, then the same values with a name.
    for index_name in (None, "a"):
        pidx = pd.Index([1, 2, 3, 4, 5], name=index_name)
        psidx = ps.from_pandas(pidx)

        self.assert_eq(psidx * 100 + psidx * 10 + psidx, pidx * 100 + pidx * 10 + pidx)

    pdf = pd.DataFrame(
        index=pd.MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6)], names=["a", "b"])
    )
    psdf = ps.from_pandas(pdf)

    plevel0, plevel1 = (pdf.index.get_level_values(level) for level in (0, 1))
    pslevel0 = psdf.index.get_level_values(0)
    pslevel1 = psdf.index.get_level_values(1)

    expected = plevel0 * 10 + plevel1
    if LooseVersion(pd.__version__) < LooseVersion("1.0"):
        # On pandas older than 1.0 the pandas result is compared with its name cleared.
        expected = expected.rename(None)
    self.assert_eq(pslevel0 * 10 + pslevel1, expected)
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
def test_factorize(self):
    # `factorize()` on a flat index is compared with pandas' sorted factorization;
    # on a MultiIndex it must raise PandasNotImplementedError.
    pandas_index = pd.Index(["a", "b", "a", "b"])
    spark_index = ps.from_pandas(pandas_index)

    expected_codes, expected_uniques = pandas_index.factorize(sort=True)
    actual_codes, actual_uniques = spark_index.factorize()

    self.assert_eq(expected_codes.tolist(), actual_codes.to_list())
    self.assert_eq(expected_uniques, actual_uniques)

    psmidx = ps.from_pandas(pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]))
    with self.assertRaises(PandasNotImplementedError):
        psmidx.factorize()
|
2021-04-15 19:53:30 -04:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run this module's tests, writing XML reports when
    # `xmlrunner` can be imported and using unittest's default runner otherwise.
    from pyspark.pandas.tests.indexes.test_base import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # No xmlrunner available; unittest.main accepts None for its default runner.
        runner = None
    unittest.main(testRunner=runner, verbosity=2)
|