6b912e4179
### What changes were proposed in this pull request? Some test and function names still refer to Koalas. This PR renames them to match pandas-on-Spark: - kdf -> psdf - kser -> psser - kidx -> psidx - kmidx -> psmidx - to_koalas() -> to_pandas_on_spark() ### Why are the changes needed? The name Koalas is no longer used in PySpark. ### Does this PR introduce _any_ user-facing change? Yes: the `to_koalas()` function is renamed to `to_pandas_on_spark()`. ### How was this patch tested? Tested manually in a local environment; after the renaming, each changed name was checked one by one. Closes #32516 from itholic/SPARK-35364. Authored-by: itholic <haejoon.lee@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
5566 lines
217 KiB
Python
5566 lines
217 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from datetime import datetime
|
|
from distutils.version import LooseVersion
|
|
import inspect
|
|
import sys
|
|
import unittest
|
|
from io import StringIO
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pandas.tseries.offsets import DateOffset
|
|
from pyspark import StorageLevel
|
|
from pyspark.ml.linalg import SparseVector
|
|
from pyspark.sql import functions as F
|
|
|
|
from pyspark import pandas as ps
|
|
from pyspark.pandas.config import option_context
|
|
from pyspark.pandas.exceptions import PandasNotImplementedError
|
|
from pyspark.pandas.frame import CachedDataFrame
|
|
from pyspark.pandas.missing.frame import _MissingPandasLikeDataFrame
|
|
from pyspark.pandas.typedef.typehints import (
|
|
extension_dtypes,
|
|
extension_dtypes_available,
|
|
extension_float_dtypes_available,
|
|
extension_object_dtypes_available,
|
|
)
|
|
from pyspark.testing.pandasutils import (
|
|
have_tabulate,
|
|
PandasOnSparkTestCase,
|
|
SPARK_CONF_ARROW_ENABLED,
|
|
tabulate_requirement_message,
|
|
)
|
|
from pyspark.testing.sqlutils import SQLTestUtils
|
|
from pyspark.pandas.utils import name_like_string
|
|
|
|
|
|
class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
|
|
@property
def pdf(self):
    """Return a new nine-row pandas DataFrame with columns "a" and "b".

    The index is drawn from ``np.random.rand`` on every access, so two
    accesses yield frames with different indexes.
    """
    return pd.DataFrame(
        {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
        index=np.random.rand(9),
    )
|
|
|
|
@property
def psdf(self):
    """Return the pandas-on-Spark DataFrame built from a fresh ``self.pdf``."""
    return ps.from_pandas(self.pdf)
|
|
|
|
@property
def df_pair(self):
    """Return ``(pdf, psdf)`` where ``psdf`` is converted from that same ``pdf``.

    Reading ``self.pdf`` once keeps both frames on the same random index.
    """
    pdf = self.pdf
    psdf = ps.from_pandas(pdf)
    return pdf, psdf
|
|
|
|
def test_dataframe(self):
    """Check basic column access, filtering and reductions against pandas."""
    pdf, psdf = self.df_pair

    self.assert_eq(psdf["a"] + 1, pdf["a"] + 1)

    self.assert_eq(psdf.columns, pd.Index(["a", "b"]))

    self.assert_eq(psdf[psdf["b"] > 2], pdf[pdf["b"] > 2])
    self.assert_eq(-psdf[psdf["b"] > 2], -pdf[pdf["b"] > 2])
    self.assert_eq(psdf[["a", "b"]], pdf[["a", "b"]])
    self.assert_eq(psdf.a, pdf.a)
    self.assert_eq(psdf.b.mean(), pdf.b.mean())
    self.assert_eq(psdf.b.var(), pdf.b.var())
    self.assert_eq(psdf.b.std(), pdf.b.std())

    pdf, psdf = self.df_pair
    self.assert_eq(psdf[["a", "b"]], pdf[["a", "b"]])

    self.assertEqual(psdf.a.notnull().rename("x").name, "x")

    # check ps.DataFrame(ps.Series)
    pser = pd.Series([1, 2, 3], name="x", index=np.random.rand(3))
    psser = ps.from_pandas(pser)
    self.assert_eq(pd.DataFrame(pser), ps.DataFrame(psser))

    # check psdf[pd.Index]
    pdf, psdf = self.df_pair
    column_mask = pdf.columns.isin(["a", "b"])
    index_cols = pdf.columns[column_mask]
    self.assert_eq(psdf[index_cols], pdf[index_cols])
|
|
|
|
def _check_extension(self, psdf, pdf):
    """Assert that ``psdf`` equals ``pdf``, with a special case for some pandas versions.

    For pandas in [1.1, 1.2.2) the comparison uses ``check_exact=False`` and
    every dtype of ``psdf`` is additionally required to be one of
    ``extension_dtypes``; otherwise a plain ``assert_eq`` is used.
    """
    if LooseVersion("1.1") <= LooseVersion(pd.__version__) < LooseVersion("1.2.2"):
        self.assert_eq(psdf, pdf, check_exact=False)
        for dtype in psdf.dtypes:
            # assertIsInstance reports the offending dtype on failure.
            self.assertIsInstance(dtype, extension_dtypes)
    else:
        self.assert_eq(psdf, pdf)
|
|
|
|
@unittest.skipIf(not extension_dtypes_available, "pandas extension dtypes are not available")
def test_extension_dtypes(self):
    """Check nullable integer extension dtypes, alone and under addition."""
    pdf = pd.DataFrame(
        {
            "a": pd.Series([1, 2, None, 4], dtype="Int8"),
            "b": pd.Series([1, None, None, 4], dtype="Int16"),
            "c": pd.Series([1, 2, None, None], dtype="Int32"),
            "d": pd.Series([None, 2, None, 4], dtype="Int64"),
        }
    )
    psdf = ps.from_pandas(pdf)

    self._check_extension(psdf, pdf)
    self._check_extension(psdf + F.lit(1).cast("byte"), pdf + 1)
    self._check_extension(psdf + psdf, pdf + pdf)
|
|
|
|
@unittest.skipIf(not extension_dtypes_available, "pandas extension dtypes are not available")
def test_astype_extension_dtypes(self):
    """Check ``astype`` to nullable integer extension dtypes."""
    pdf = pd.DataFrame(
        {
            "a": [1, 2, None, 4],
            "b": [1, None, None, 4],
            "c": [1, 2, None, None],
            "d": [None, 2, None, 4],
        }
    )
    psdf = ps.from_pandas(pdf)

    astype = {"a": "Int8", "b": "Int16", "c": "Int32", "d": "Int64"}

    self._check_extension(psdf.astype(astype), pdf.astype(astype))
|
|
|
|
@unittest.skipIf(
    not extension_object_dtypes_available, "pandas extension object dtypes are not available"
)
def test_extension_object_dtypes(self):
    """Check the "string" and "boolean" extension dtypes round-trip."""
    pdf = pd.DataFrame(
        {
            "a": pd.Series(["a", "b", None, "c"], dtype="string"),
            "b": pd.Series([True, None, False, True], dtype="boolean"),
        }
    )
    psdf = ps.from_pandas(pdf)

    self._check_extension(psdf, pdf)
|
|
|
|
@unittest.skipIf(
    not extension_object_dtypes_available, "pandas extension object dtypes are not available"
)
def test_astype_extension_object_dtypes(self):
    """Check ``astype`` to the "string" and "boolean" extension dtypes."""
    pdf = pd.DataFrame({"a": ["a", "b", None, "c"], "b": [True, None, False, True]})
    psdf = ps.from_pandas(pdf)

    astype = {"a": "string", "b": "boolean"}

    self._check_extension(psdf.astype(astype), pdf.astype(astype))
|
|
|
|
@unittest.skipIf(
    not extension_float_dtypes_available, "pandas extension float dtypes are not available"
)
def test_extension_float_dtypes(self):
    """Check nullable float extension dtypes, alone and under addition."""
    pdf = pd.DataFrame(
        {
            "a": pd.Series([1.0, 2.0, None, 4.0], dtype="Float32"),
            "b": pd.Series([1.0, None, 3.0, 4.0], dtype="Float64"),
        }
    )
    psdf = ps.from_pandas(pdf)

    self._check_extension(psdf, pdf)
    self._check_extension(psdf + 1, pdf + 1)
    self._check_extension(psdf + psdf, pdf + pdf)
|
|
|
|
@unittest.skipIf(
    not extension_float_dtypes_available, "pandas extension float dtypes are not available"
)
def test_astype_extension_float_dtypes(self):
    """Check ``astype`` to nullable float extension dtypes."""
    pdf = pd.DataFrame({"a": [1.0, 2.0, None, 4.0], "b": [1.0, None, 3.0, 4.0]})
    psdf = ps.from_pandas(pdf)

    astype = {"a": "Float32", "b": "Float64"}

    self._check_extension(psdf.astype(astype), pdf.astype(astype))
|
|
|
|
def test_insert(self):
    """Check ``DataFrame.insert`` for flat and MultiIndex columns, including error cases."""
    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    psdf = ps.from_pandas(pdf)

    psdf.insert(1, "b", 10)
    pdf.insert(1, "b", 10)
    self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True)
    psdf.insert(2, "c", 0.1)
    pdf.insert(2, "c", 0.1)
    self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True)
    psdf.insert(3, "d", psdf.b + 1)
    pdf.insert(3, "d", pdf.b + 1)
    self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True)

    psser = ps.Series([4, 5, 6])
    self.assertRaises(ValueError, lambda: psdf.insert(0, "y", psser))
    self.assertRaisesRegex(
        ValueError, "cannot insert b, already exists", lambda: psdf.insert(1, "b", 10)
    )
    self.assertRaisesRegex(
        TypeError,
        '"column" should be a scalar value or tuple that contains scalar values',
        lambda: psdf.insert(0, list("abc"), psser),
    )
    self.assertRaises(ValueError, lambda: psdf.insert(0, "e", [7, 8, 9, 10]))
    self.assertRaises(ValueError, lambda: psdf.insert(0, "f", ps.Series([7, 8])))
    self.assertRaises(AssertionError, lambda: psdf.insert(100, "y", psser))
    self.assertRaises(AssertionError, lambda: psdf.insert(1, "y", psser, allow_duplicates=True))

    #
    # DataFrame with MultiIndex as columns
    #
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    psdf = ps.from_pandas(pdf)

    psdf.insert(1, "b", 10)
    pdf.insert(1, "b", 10)
    self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True)
    psdf.insert(2, "c", 0.1)
    pdf.insert(2, "c", 0.1)
    self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True)
    psdf.insert(3, "d", psdf.b + 1)
    pdf.insert(3, "d", pdf.b + 1)
    self.assert_eq(psdf.sort_index(), pdf.sort_index(), almost=True)

    self.assertRaisesRegex(
        ValueError, "cannot insert d, already exists", lambda: psdf.insert(4, "d", 11)
    )
    self.assertRaisesRegex(
        ValueError,
        '"column" must have length equal to number of column levels.',
        lambda: psdf.insert(4, ("e",), 11),
    )
|
|
|
|
def test_inplace(self):
    """Check that an in-place column assignment is visible through an earlier Series handle."""
    pdf, psdf = self.df_pair

    # Take the Series before mutating so the final check covers the old handles.
    pser = pdf.a
    psser = psdf.a

    pdf["a"] = pdf["a"] + 10
    psdf["a"] = psdf["a"] + 10

    self.assert_eq(psdf, pdf)
    self.assert_eq(psser, pser)
|
|
|
|
def test_assign_list(self):
    """Check assigning a list as a new column, and the length-mismatch error."""
    pdf, psdf = self.df_pair

    pser = pdf.a
    psser = psdf.a

    pdf["x"] = [10, 20, 30, 40, 50, 60, 70, 80, 90]
    psdf["x"] = [10, 20, 30, 40, 50, 60, 70, 80, 90]

    self.assert_eq(psdf.sort_index(), pdf.sort_index())
    self.assert_eq(psser, pser)

    # Eight values for a nine-row frame must be rejected.
    with self.assertRaisesRegex(ValueError, "Length of values does not match length of index"):
        psdf["z"] = [10, 20, 30, 40, 50, 60, 70, 80]
|
|
|
|
def test_dataframe_multiindex_columns(self):
    """Check item, attribute and tuple access on three-level MultiIndex columns."""
    pdf = pd.DataFrame(
        {
            ("x", "a", "1"): [1, 2, 3],
            ("x", "b", "2"): [4, 5, 6],
            ("y.z", "c.d", "3"): [7, 8, 9],
            ("x", "b", "4"): [10, 11, 12],
        },
        index=np.random.rand(3),
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf["x"], pdf["x"])
    self.assert_eq(psdf["y.z"], pdf["y.z"])
    self.assert_eq(psdf["x"]["b"], pdf["x"]["b"])
    self.assert_eq(psdf["x"]["b"]["2"], pdf["x"]["b"]["2"])

    self.assert_eq(psdf.x, pdf.x)
    self.assert_eq(psdf.x.b, pdf.x.b)
    self.assert_eq(psdf.x.b["2"], pdf.x.b["2"])

    self.assertRaises(KeyError, lambda: psdf["z"])
    self.assertRaises(AttributeError, lambda: psdf.z)

    self.assert_eq(psdf[("x",)], pdf[("x",)])
    self.assert_eq(psdf[("x", "a")], pdf[("x", "a")])
    self.assert_eq(psdf[("x", "a", "1")], pdf[("x", "a", "1")])
|
|
|
|
def test_dataframe_column_level_name(self):
    """Check that a named column Index keeps its name through conversion."""
    column = pd.Index(["A", "B", "C"], name="X")
    pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=column, index=np.random.rand(2))
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf.columns.names, pdf.columns.names)
    self.assert_eq(psdf.to_pandas().columns.names, pdf.columns.names)
|
|
|
|
def test_dataframe_multiindex_names_level(self):
    """Check that MultiIndex column level names survive conversion and partial selection."""
    columns = pd.MultiIndex.from_tuples(
        [("X", "A", "Z"), ("X", "B", "Z"), ("Y", "C", "Z"), ("Y", "D", "Z")],
        names=["lvl_1", "lvl_2", "lv_3"],
    )
    pdf = pd.DataFrame(
        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20]],
        columns=columns,
        index=np.random.rand(5),
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.columns.names, pdf.columns.names)
    self.assert_eq(psdf.to_pandas().columns.names, pdf.columns.names)

    psdf1 = ps.from_pandas(pdf)
    self.assert_eq(psdf1.columns.names, pdf.columns.names)

    # A single level name for three-level column labels is expected to be rejected.
    self.assertRaises(
        AssertionError,
        lambda: ps.DataFrame(psdf1._internal.copy(column_label_names=("level",))),
    )

    self.assert_eq(psdf["X"], pdf["X"])
    self.assert_eq(psdf["X"].columns.names, pdf["X"].columns.names)
    self.assert_eq(psdf["X"].to_pandas().columns.names, pdf["X"].columns.names)
    self.assert_eq(psdf["X"]["A"], pdf["X"]["A"])
    self.assert_eq(psdf["X"]["A"].columns.names, pdf["X"]["A"].columns.names)
    self.assert_eq(psdf["X"]["A"].to_pandas().columns.names, pdf["X"]["A"].columns.names)
    self.assert_eq(psdf[("X", "A")], pdf[("X", "A")])
    self.assert_eq(psdf[("X", "A")].columns.names, pdf[("X", "A")].columns.names)
    self.assert_eq(psdf[("X", "A")].to_pandas().columns.names, pdf[("X", "A")].columns.names)
    self.assert_eq(psdf[("X", "A", "Z")], pdf[("X", "A", "Z")])
|
|
|
|
def test_itertuples(self):
    """Check that ``itertuples`` yields the same tuples as pandas for several frame shapes."""
    pdf = pd.DataFrame({"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"])
    psdf = ps.from_pandas(pdf)

    for ptuple, pstuple in zip(
        pdf.itertuples(index=False, name="Animal"), psdf.itertuples(index=False, name="Animal")
    ):
        self.assert_eq(ptuple, pstuple)
    for ptuple, pstuple in zip(pdf.itertuples(name=None), psdf.itertuples(name=None)):
        self.assert_eq(ptuple, pstuple)

    pdf.index = pd.MultiIndex.from_arrays(
        [[1, 2], ["black", "brown"]], names=("count", "color")
    )
    psdf = ps.from_pandas(pdf)
    for ptuple, pstuple in zip(pdf.itertuples(name="Animal"), psdf.itertuples(name="Animal")):
        self.assert_eq(ptuple, pstuple)

    pdf.columns = pd.MultiIndex.from_arrays(
        [["CA", "WA"], ["age", "children"]], names=("origin", "info")
    )
    psdf = ps.from_pandas(pdf)
    for ptuple, pstuple in zip(pdf.itertuples(name="Animal"), psdf.itertuples(name="Animal")):
        self.assert_eq(ptuple, pstuple)

    pdf = pd.DataFrame([1, 2, 3])
    psdf = ps.from_pandas(pdf)
    for ptuple, pstuple in zip(
        (pdf + 1).itertuples(name="num"), (psdf + 1).itertuples(name="num")
    ):
        self.assert_eq(ptuple, pstuple)

    # DataFrames with a large number of columns (>254)
    pdf = pd.DataFrame(np.random.random((1, 255)))
    psdf = ps.from_pandas(pdf)
    for ptuple, pstuple in zip(pdf.itertuples(name="num"), psdf.itertuples(name="num")):
        self.assert_eq(ptuple, pstuple)
|
|
|
|
def test_iterrows(self):
    """Check that ``iterrows`` yields matching (key, row) pairs for MultiIndex columns."""
    pdf = pd.DataFrame(
        {
            ("x", "a", "1"): [1, 2, 3],
            ("x", "b", "2"): [4, 5, 6],
            ("y.z", "c.d", "3"): [7, 8, 9],
            ("x", "b", "4"): [10, 11, 12],
        },
        index=np.random.rand(3),
    )
    psdf = ps.from_pandas(pdf)

    for (pdf_k, pdf_v), (psdf_k, psdf_v) in zip(pdf.iterrows(), psdf.iterrows()):
        self.assert_eq(pdf_k, psdf_k)
        self.assert_eq(pdf_v, psdf_v)
|
|
|
|
def test_reset_index(self):
    """Check ``reset_index`` with and without ``drop``, a name clash, and ``inplace``."""
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.reset_index(), pdf.reset_index())
    self.assert_eq(psdf.reset_index().index, pdf.reset_index().index)
    self.assert_eq(psdf.reset_index(drop=True), pdf.reset_index(drop=True))

    # Name the index like an existing column so that reset_index() collides.
    pdf.index.name = "a"
    psdf.index.name = "a"

    with self.assertRaisesRegex(ValueError, "cannot insert a, already exists"):
        psdf.reset_index()

    self.assert_eq(psdf.reset_index(drop=True), pdf.reset_index(drop=True))

    # inplace
    pser = pdf.a
    psser = psdf.a
    pdf.reset_index(drop=True, inplace=True)
    psdf.reset_index(drop=True, inplace=True)
    self.assert_eq(psdf, pdf)
    self.assert_eq(psser, pser)
|
|
|
|
def test_reset_index_with_default_index_types(self):
    """Check ``reset_index`` under each ``compute.default_index_type`` option value."""
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)

    with ps.option_context("compute.default_index_type", "sequence"):
        self.assert_eq(psdf.reset_index(), pdf.reset_index())

    with ps.option_context("compute.default_index_type", "distributed-sequence"):
        self.assert_eq(psdf.reset_index(), pdf.reset_index())

    with ps.option_context("compute.default_index_type", "distributed"):
        # the index is different.
        self.assert_eq(psdf.reset_index().to_pandas().reset_index(drop=True), pdf.reset_index())
|
|
|
|
def test_reset_index_with_multiindex_columns(self):
    """Check ``reset_index`` options (level, col_level, col_fill) with MultiIndex columns."""
    index = pd.MultiIndex.from_tuples(
        [("bird", "falcon"), ("bird", "parrot"), ("mammal", "lion"), ("mammal", "monkey")],
        names=["class", "name"],
    )
    columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")])
    pdf = pd.DataFrame(
        [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")],
        index=index,
        columns=columns,
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf.reset_index(), pdf.reset_index())
    self.assert_eq(psdf.reset_index(level="class"), pdf.reset_index(level="class"))
    self.assert_eq(
        psdf.reset_index(level="class", col_level=1),
        pdf.reset_index(level="class", col_level=1),
    )
    self.assert_eq(
        psdf.reset_index(level="class", col_level=1, col_fill="species"),
        pdf.reset_index(level="class", col_level=1, col_fill="species"),
    )
    self.assert_eq(
        psdf.reset_index(level="class", col_level=1, col_fill="genus"),
        pdf.reset_index(level="class", col_level=1, col_fill="genus"),
    )

    with self.assertRaisesRegex(IndexError, "Index has only 2 levels, not 3"):
        psdf.reset_index(col_level=2)

    # Tuple index names must match the number of column levels.
    pdf.index.names = [("x", "class"), ("y", "name")]
    psdf.index.names = [("x", "class"), ("y", "name")]

    self.assert_eq(psdf.reset_index(), pdf.reset_index())

    with self.assertRaisesRegex(ValueError, "Item must have length equal to number of levels."):
        psdf.reset_index(col_level=1)
|
|
|
|
def test_index_to_frame_reset_index(self):
    """Check ``reset_index`` on frames produced by ``Index.to_frame``."""

    def check(psdf, pdf):
        # Compare non-mutating resets first, then the in-place variant.
        self.assert_eq(psdf.reset_index(), pdf.reset_index())
        self.assert_eq(psdf.reset_index(drop=True), pdf.reset_index(drop=True))

        pdf.reset_index(drop=True, inplace=True)
        psdf.reset_index(drop=True, inplace=True)
        self.assert_eq(psdf, pdf)

    pdf, psdf = self.df_pair
    check(psdf.index.to_frame(), pdf.index.to_frame())
    check(psdf.index.to_frame(index=False), pdf.index.to_frame(index=False))

    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        # The `name` argument is added in pandas 0.24.
        check(psdf.index.to_frame(name="a"), pdf.index.to_frame(name="a"))
        check(
            psdf.index.to_frame(index=False, name="a"),
            pdf.index.to_frame(index=False, name="a"),
        )
        check(psdf.index.to_frame(name=("x", "a")), pdf.index.to_frame(name=("x", "a")))
        check(
            psdf.index.to_frame(index=False, name=("x", "a")),
            pdf.index.to_frame(index=False, name=("x", "a")),
        )
|
|
|
|
def test_multiindex_column_access(self):
    """Check column access on four-level MultiIndex columns that contain empty-string labels."""
    columns = pd.MultiIndex.from_tuples(
        [
            ("a", "", "", "b"),
            ("c", "", "d", ""),
            ("e", "", "f", ""),
            ("e", "g", "", ""),
            ("", "", "", "h"),
            ("i", "", "", ""),
        ]
    )

    pdf = pd.DataFrame(
        [
            (1, "a", "x", 10, 100, 1000),
            (2, "b", "y", 20, 200, 2000),
            (3, "c", "z", 30, 300, 3000),
        ],
        columns=columns,
        index=np.random.rand(3),
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf["a"], pdf["a"])
    self.assert_eq(psdf["a"]["b"], pdf["a"]["b"])
    self.assert_eq(psdf["c"], pdf["c"])
    self.assert_eq(psdf["c"]["d"], pdf["c"]["d"])
    self.assert_eq(psdf["e"], pdf["e"])
    self.assert_eq(psdf["e"][""]["f"], pdf["e"][""]["f"])
    self.assert_eq(psdf["e"]["g"], pdf["e"]["g"])
    self.assert_eq(psdf[""], pdf[""])
    self.assert_eq(psdf[""]["h"], pdf[""]["h"])
    self.assert_eq(psdf["i"], pdf["i"])

    self.assert_eq(psdf[["a", "e"]], pdf[["a", "e"]])
    self.assert_eq(psdf[["e", "a"]], pdf[["e", "a"]])

    self.assert_eq(psdf[("a",)], pdf[("a",)])
    self.assert_eq(psdf[("e", "g")], pdf[("e", "g")])
    # NOTE(review): this single-element tuple lookup is disabled in the original;
    # the reason is not visible here.
    # self.assert_eq(psdf[("i",)], pdf[("i",)])
    self.assert_eq(psdf[("i", "")], pdf[("i", "")])

    self.assertRaises(KeyError, lambda: psdf[("a", "b")])
|
|
|
|
def test_repr_cache_invalidation(self):
    """Check that ``__repr__`` reflects a column added after an earlier ``__repr__`` call."""
    # If there is any cache, inplace operations should invalidate it.
    df = ps.range(10)
    df.__repr__()
    df["a"] = df["id"]
    self.assertEqual(df.__repr__(), df.to_pandas().__repr__())
|
|
|
|
def test_repr_html_cache_invalidation(self):
    """Check that ``_repr_html_`` reflects a column added after an earlier ``_repr_html_`` call."""
    # If there is any cache, inplace operations should invalidate it.
    df = ps.range(10)
    df._repr_html_()
    df["a"] = df["id"]
    self.assertEqual(df._repr_html_(), df.to_pandas()._repr_html_())
|
|
|
|
def test_empty_dataframe(self):
    """Check converting an empty typed DataFrame, with the default conf and with Arrow disabled."""
    pdf = pd.DataFrame({"a": pd.Series([], dtype="i1"), "b": pd.Series([], dtype="str")})

    psdf = ps.from_pandas(pdf)
    self.assert_eq(psdf, pdf)

    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        psdf = ps.from_pandas(pdf)
        self.assert_eq(psdf, pdf)
|
|
|
|
def test_all_null_dataframe(self):
    """Check frames whose compared rows are all null, with and without Arrow."""
    pdf = pd.DataFrame(
        {
            "a": [None, None, None, "a"],
            "b": [None, None, None, 1],
            "c": [None, None, None] + list(np.arange(1, 2).astype("i1")),
            "d": [None, None, None, 1.0],
            "e": [None, None, None, True],
            "f": [None, None, None] + list(pd.date_range("20130101", periods=1)),
        },
    )
    psdf = ps.from_pandas(pdf)

    # Dropping the last row leaves only the null rows in each column.
    self.assert_eq(psdf.iloc[:-1], pdf.iloc[:-1])

    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        self.assert_eq(psdf.iloc[:-1], pdf.iloc[:-1])

    pdf = pd.DataFrame(
        {
            "a": pd.Series([None, None, None], dtype="float64"),
            "b": pd.Series([None, None, None], dtype="str"),
        },
    )

    psdf = ps.from_pandas(pdf)
    self.assert_eq(psdf, pdf)

    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        psdf = ps.from_pandas(pdf)
        self.assert_eq(psdf, pdf)
|
|
|
|
def test_nullable_object(self):
    """Check columns that mix values with ``np.nan`` and ``None``, with and without Arrow."""
    pdf = pd.DataFrame(
        {
            "a": list("abc") + [np.nan, None],
            "b": list(range(1, 4)) + [np.nan, None],
            "c": list(np.arange(3, 6).astype("i1")) + [np.nan, None],
            "d": list(np.arange(4.0, 7.0, dtype="float64")) + [np.nan, None],
            "e": [True, False, True, np.nan, None],
            "f": list(pd.date_range("20130101", periods=3)) + [np.nan, None],
        },
        index=np.random.rand(5),
    )

    psdf = ps.from_pandas(pdf)
    self.assert_eq(psdf, pdf)

    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        psdf = ps.from_pandas(pdf)
        self.assert_eq(psdf, pdf)
|
|
|
|
def test_assign(self):
    """Check scalar column assignment and ``assign`` for flat and MultiIndex columns."""
    pdf, psdf = self.df_pair

    psdf["w"] = 1.0
    pdf["w"] = 1.0

    self.assert_eq(psdf, pdf)

    psdf.w = 10.0
    pdf.w = 10.0

    self.assert_eq(psdf, pdf)

    psdf[1] = 1.0
    pdf[1] = 1.0

    self.assert_eq(psdf, pdf)

    psdf = psdf.assign(a=psdf["a"] * 2)
    pdf = pdf.assign(a=pdf["a"] * 2)

    self.assert_eq(psdf, pdf)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "w"), ("y", "v")])
    pdf.columns = columns
    psdf.columns = columns

    psdf[("a", "c")] = "def"
    pdf[("a", "c")] = "def"

    self.assert_eq(psdf, pdf)

    psdf = psdf.assign(Z="ZZ")
    pdf = pdf.assign(Z="ZZ")

    self.assert_eq(psdf, pdf)

    psdf["x"] = "ghi"
    pdf["x"] = "ghi"

    self.assert_eq(psdf, pdf)
|
|
|
|
def test_head(self):
    """Check ``head`` with positive, zero and negative row counts."""
    pdf, psdf = self.df_pair

    self.assert_eq(psdf.head(2), pdf.head(2))
    self.assert_eq(psdf.head(3), pdf.head(3))
    self.assert_eq(psdf.head(0), pdf.head(0))
    self.assert_eq(psdf.head(-3), pdf.head(-3))
    # -10 exceeds the nine rows of the fixture frame.
    self.assert_eq(psdf.head(-10), pdf.head(-10))
|
|
|
|
def test_attributes(self):
    """Check which column names appear in ``dir()`` and that unknown attributes raise."""
    psdf = self.psdf

    self.assertIn("a", dir(psdf))
    self.assertNotIn("foo", dir(psdf))
    self.assertRaises(AttributeError, lambda: psdf.foo)

    psdf = ps.DataFrame({"a b c": [1, 2, 3]})
    self.assertNotIn("a b c", dir(psdf))
    psdf = ps.DataFrame({"a": [1, 2], 5: [1, 2]})
    self.assertIn("a", dir(psdf))
    self.assertNotIn(5, dir(psdf))
|
|
|
|
def test_column_names(self):
    """Check column labels and the resulting Series names after arithmetic and ``rename``."""
    pdf, psdf = self.df_pair

    self.assert_eq(psdf.columns, pdf.columns)
    self.assert_eq(psdf[["b", "a"]].columns, pdf[["b", "a"]].columns)
    self.assert_eq(psdf["a"].name, pdf["a"].name)
    self.assert_eq((psdf["a"] + 1).name, (pdf["a"] + 1).name)

    self.assert_eq((psdf.a + psdf.b).name, (pdf.a + pdf.b).name)
    self.assert_eq((psdf.a + psdf.b.rename("a")).name, (pdf.a + pdf.b.rename("a")).name)
    self.assert_eq((psdf.a + psdf.b.rename()).name, (pdf.a + pdf.b.rename()).name)
    self.assert_eq((psdf.a.rename() + psdf.b).name, (pdf.a.rename() + pdf.b).name)
    self.assert_eq(
        (psdf.a.rename() + psdf.b.rename()).name, (pdf.a.rename() + pdf.b.rename()).name
    )
|
|
|
|
def test_rename_columns(self):
    """Check assigning to ``columns`` for flat and MultiIndex labels, including Spark column names."""
    pdf = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)
    )
    psdf = ps.from_pandas(pdf)

    psdf.columns = ["x", "y"]
    pdf.columns = ["x", "y"]
    self.assert_eq(psdf.columns, pd.Index(["x", "y"]))
    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf._internal.data_spark_column_names, ["x", "y"])
    self.assert_eq(psdf.to_spark().columns, ["x", "y"])
    self.assert_eq(psdf.to_spark(index_col="index").columns, ["index", "x", "y"])

    columns = pdf.columns
    columns.name = "lvl_1"

    psdf.columns = columns
    self.assert_eq(psdf.columns.names, ["lvl_1"])
    self.assert_eq(psdf, pdf)

    msg = "Length mismatch: Expected axis has 2 elements, new values have 4 elements"
    with self.assertRaisesRegex(ValueError, msg):
        psdf.columns = [1, 2, 3, 4]

    # Multi-index columns
    pdf = pd.DataFrame(
        {("A", "0"): [1, 2, 2, 3], ("B", "1"): [1, 2, 3, 4]}, index=np.random.rand(4)
    )
    psdf = ps.from_pandas(pdf)

    columns = pdf.columns
    self.assert_eq(psdf.columns, columns)
    self.assert_eq(psdf, pdf)

    pdf.columns = ["x", "y"]
    psdf.columns = ["x", "y"]
    self.assert_eq(psdf.columns, pd.Index(["x", "y"]))
    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf._internal.data_spark_column_names, ["x", "y"])
    self.assert_eq(psdf.to_spark().columns, ["x", "y"])
    self.assert_eq(psdf.to_spark(index_col="index").columns, ["index", "x", "y"])

    # Restore the saved MultiIndex labels.
    pdf.columns = columns
    psdf.columns = columns
    self.assert_eq(psdf.columns, columns)
    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf._internal.data_spark_column_names, ["(A, 0)", "(B, 1)"])
    self.assert_eq(psdf.to_spark().columns, ["(A, 0)", "(B, 1)"])
    self.assert_eq(psdf.to_spark(index_col="index").columns, ["index", "(A, 0)", "(B, 1)"])

    columns.names = ["lvl_1", "lvl_2"]

    psdf.columns = columns
    self.assert_eq(psdf.columns.names, ["lvl_1", "lvl_2"])
    self.assert_eq(psdf, pdf)
    self.assert_eq(psdf._internal.data_spark_column_names, ["(A, 0)", "(B, 1)"])
    self.assert_eq(psdf.to_spark().columns, ["(A, 0)", "(B, 1)"])
    self.assert_eq(psdf.to_spark(index_col="index").columns, ["index", "(A, 0)", "(B, 1)"])
|
|
|
|
def test_rename_dataframe(self):
    """Check ``DataFrame.rename`` with dict and function mappers on columns and index."""
    pdf1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    psdf1 = ps.from_pandas(pdf1)

    self.assert_eq(
        psdf1.rename(columns={"A": "a", "B": "b"}), pdf1.rename(columns={"A": "a", "B": "b"})
    )

    result_psdf = psdf1.rename(index={1: 10, 2: 20})
    result_pdf = pdf1.rename(index={1: 10, 2: 20})
    self.assert_eq(result_psdf, result_pdf)

    # inplace
    pser = result_pdf.A
    psser = result_psdf.A
    result_psdf.rename(index={10: 100, 20: 200}, inplace=True)
    result_pdf.rename(index={10: 100, 20: 200}, inplace=True)
    self.assert_eq(result_psdf, result_pdf)
    self.assert_eq(psser, pser)

    def str_lower(s) -> str:
        return str.lower(s)

    self.assert_eq(
        psdf1.rename(str_lower, axis="columns"), pdf1.rename(str_lower, axis="columns")
    )

    def mul10(x) -> int:
        return x * 10

    self.assert_eq(psdf1.rename(mul10, axis="index"), pdf1.rename(mul10, axis="index"))

    self.assert_eq(
        psdf1.rename(columns=str_lower, index={1: 10, 2: 20}),
        pdf1.rename(columns=str_lower, index={1: 10, 2: 20}),
    )

    # MultiIndex used first as column labels (pdf2), then as row index (pdf3).
    idx = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Y", "D")])
    pdf2 = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx)
    psdf2 = ps.from_pandas(pdf2)

    self.assert_eq(psdf2.rename(columns=str_lower), pdf2.rename(columns=str_lower))

    self.assert_eq(
        psdf2.rename(columns=str_lower, level=0), pdf2.rename(columns=str_lower, level=0)
    )
    self.assert_eq(
        psdf2.rename(columns=str_lower, level=1), pdf2.rename(columns=str_lower, level=1)
    )

    pdf3 = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list("ab"))
    psdf3 = ps.from_pandas(pdf3)

    self.assert_eq(psdf3.rename(index=str_lower), pdf3.rename(index=str_lower))
    self.assert_eq(
        psdf3.rename(index=str_lower, level=0), pdf3.rename(index=str_lower, level=0)
    )
    self.assert_eq(
        psdf3.rename(index=str_lower, level=1), pdf3.rename(index=str_lower, level=1)
    )

    pdf4 = pdf2 + 1
    psdf4 = psdf2 + 1
    self.assert_eq(psdf4.rename(columns=str_lower), pdf4.rename(columns=str_lower))

    pdf5 = pdf3 + 1
    psdf5 = psdf3 + 1
    self.assert_eq(psdf5.rename(index=str_lower), pdf5.rename(index=str_lower))
|
|
|
|
def test_rename_axis(self):
    # Compare DataFrame.rename_axis between pandas and pandas-on-Spark, first on a
    # frame with single-level axes and then on a frame with MultiIndex axes.
    # Covered: positional mappers per axis, in-place renaming, rejected arguments,
    # and the index=/columns= keyword mappers (dicts and callables).
    keyword_mappers_supported = LooseVersion(pd.__version__) >= LooseVersion("0.24.0")

    row_labels = pd.Index(["A", "B", "C"], name="index")
    col_labels = pd.Index(["numbers", "values"], name="cols")
    pdf = pd.DataFrame(
        [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], index=row_labels, columns=col_labels
    )
    psdf = ps.from_pandas(pdf)

    # A bare name and a one-element list are both exercised on every axis spelling.
    single_level_targets = [
        (0, "index2"),
        ("index", "index2"),
        (1, "cols2"),
        ("columns", "cols2"),
    ]
    for axis, new_name in single_level_targets:
        for mapper in (new_name, [new_name]):
            self.assert_eq(
                pdf.rename_axis(mapper, axis=axis).sort_index(),
                psdf.rename_axis(mapper, axis=axis).sort_index(),
            )

    # In-place renaming on copies of both frames.
    pdf_inplace = pdf.copy()
    psdf_inplace = psdf.copy()
    pdf_inplace.rename_axis("index2", axis="index", inplace=True)
    psdf_inplace.rename_axis("index2", axis="index", inplace=True)
    self.assert_eq(pdf_inplace.sort_index(), psdf_inplace.sort_index())

    rejected_calls = [
        (ValueError, lambda: psdf.rename_axis(["index2", "index3"], axis=0)),
        (ValueError, lambda: psdf.rename_axis(["cols2", "cols3"], axis=1)),
        (TypeError, lambda: psdf.rename_axis(mapper=["index2"], index=["index3"])),
    ]
    for error, call in rejected_calls:
        self.assertRaises(error, call)

    # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0
    if keyword_mappers_supported:
        keyword_cases = [
            {"index": {"index": "index2"}, "columns": {"cols": "cols2"}},
            {"index": {"missing": "index2"}, "columns": {"missing": "cols2"}},
            {"index": str.upper, "columns": str.upper},
        ]
        for kwargs in keyword_cases:
            self.assert_eq(
                pdf.rename_axis(**kwargs).sort_index(),
                psdf.rename_axis(**kwargs).sort_index(),
            )
    else:
        # Older pandas: set the expected axis names by hand.  ``expected`` is the
        # same object as ``pdf`` here, so the names are written onto ``pdf`` itself.
        expected = pdf
        legacy_cases = [
            ("index2", "cols2", {"index": {"index": "index2"}, "columns": {"cols": "cols2"}}),
            (
                "index",
                "cols",
                {"index": {"missing": "index2"}, "columns": {"missing": "cols2"}},
            ),
            ("INDEX", "COLS", {"index": str.upper, "columns": str.upper}),
        ]
        for index_name, columns_name, kwargs in legacy_cases:
            expected.index.name = index_name
            expected.columns.name = columns_name
            result = psdf.rename_axis(**kwargs).sort_index()
            self.assert_eq(expected, result)

    # Same checks again with two-level row and column indexes.
    row_labels = pd.MultiIndex.from_tuples(
        [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
    )
    col_labels = pd.MultiIndex.from_tuples(
        [("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
    )
    pdf = pd.DataFrame(
        [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], index=row_labels, columns=col_labels
    )
    psdf = ps.from_pandas(pdf)

    multi_level_targets = [
        (0, ["index3", "index4"]),
        ("index", ["index3", "index4"]),
        (1, ["cols3", "cols4"]),
        ("columns", ["cols3", "cols4"]),
    ]
    for axis, new_names in multi_level_targets:
        self.assert_eq(
            pdf.rename_axis(new_names, axis=axis).sort_index(),
            psdf.rename_axis(new_names, axis=axis).sort_index(),
        )

    # Three names for a two-level axis must be rejected.
    with self.assertRaises(ValueError):
        psdf.rename_axis(["index3", "index4", "index5"], axis=0)
    with self.assertRaises(ValueError):
        psdf.rename_axis(["cols3", "cols4", "cols5"], axis=1)

    # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0
    if keyword_mappers_supported:
        keyword_cases = [
            {"index": {"index1": "index3"}, "columns": {"cols1": "cols3"}},
            {"index": {"missing": "index3"}, "columns": {"missing": "cols3"}},
            {
                "index": {"index1": "index3", "index2": "index4"},
                "columns": {"cols1": "cols3", "cols2": "cols4"},
            },
            {"index": str.upper, "columns": str.upper},
        ]
        for kwargs in keyword_cases:
            self.assert_eq(
                pdf.rename_axis(**kwargs).sort_index(),
                psdf.rename_axis(**kwargs).sort_index(),
            )
    else:
        # As above, ``expected`` aliases ``pdf``.
        expected = pdf
        legacy_cases = [
            (
                ["index3", "index2"],
                ["cols3", "cols2"],
                {"index": {"index1": "index3"}, "columns": {"cols1": "cols3"}},
            ),
            (
                ["index1", "index2"],
                ["cols1", "cols2"],
                {"index": {"missing": "index2"}, "columns": {"missing": "cols2"}},
            ),
            (
                ["index3", "index4"],
                ["cols3", "cols4"],
                {
                    "index": {"index1": "index3", "index2": "index4"},
                    "columns": {"cols1": "cols3", "cols2": "cols4"},
                },
            ),
            (
                ["INDEX1", "INDEX2"],
                ["COLS1", "COLS2"],
                {"index": str.upper, "columns": str.upper},
            ),
        ]
        for index_names, columns_names, kwargs in legacy_cases:
            expected.index.names = index_names
            expected.columns.names = columns_names
            result = psdf.rename_axis(**kwargs).sort_index()
            self.assert_eq(expected, result)
|
|
|
|
def test_dot_in_column_name(self):
    # A Spark column whose name contains a dot ("a.b") must be selectable by that
    # exact name and compare equal to a Series carrying the same name.
    spark_frame = ps.range(1)._internal.spark_frame.selectExpr("1L as `a.b`")
    actual = ps.DataFrame(spark_frame)["a.b"]
    expected = ps.Series([1], name="a.b")
    self.assert_eq(actual, expected)
|
|
|
|
def test_aggregate(self):
    # Compare DataFrame.agg between pandas and pandas-on-Spark for string labels,
    # MultiIndex labels and non-string labels, then compare min()/max() on a
    # timestamp column.

    def check_agg(psdf, pdf, labels):
        # List-style aggregation over all of ``labels``, then dict-style
        # aggregation over the first two labels only.  Results are re-ordered by
        # explicit column selection before comparing (TODO?: fix column order).
        first, second = labels[0], labels[1]
        self.assert_eq(
            psdf.agg(["sum", "min"])[labels].sort_index(),
            pdf.agg(["sum", "min"])[labels].sort_index(),
        )
        self.assert_eq(
            psdf.agg({first: ["sum", "min"], second: ["min", "max"]})[
                [first, second]
            ].sort_index(),
            pdf.agg({first: ["sum", "min"], second: ["min", "max"]})[
                [first, second]
            ].sort_index(),
        )

    pdf = pd.DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]], columns=["A", "B", "C"]
    )
    psdf = ps.from_pandas(pdf)

    check_agg(psdf, pdf, ["A", "B", "C"])

    # "X" is not a column of the frame.
    with self.assertRaises(KeyError):
        psdf.agg({"A": ["sum", "min"], "X": ["min", "max"]})

    # multi-index columns
    multi_labels = [("X", "A"), ("X", "B"), ("Y", "C")]
    columns = pd.MultiIndex.from_tuples(multi_labels)
    pdf.columns = columns
    psdf.columns = columns

    check_agg(psdf, pdf, [("X", "A"), ("X", "B"), ("Y", "C")])

    # Plain string keys are rejected once the columns are a MultiIndex.
    with self.assertRaises(TypeError):
        psdf.agg({"X": ["sum", "min"], "Y": ["min", "max"]})

    # non-string names
    pdf = pd.DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]], columns=[10, 20, 30]
    )
    psdf = ps.from_pandas(pdf)

    check_agg(psdf, pdf, [10, 20, 30])

    columns = pd.MultiIndex.from_tuples([("X", 10), ("X", 20), ("Y", 30)])
    pdf.columns = columns
    psdf.columns = columns

    check_agg(psdf, pdf, [("X", 10), ("X", 20), ("Y", 30)])

    # Timestamp column: min and max must match.
    timestamps = [datetime(2019, 2, 2, 0, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0, 0)]
    pdf = pd.DataFrame(timestamps, columns=["timestamp"])
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.timestamp.min(), pdf.timestamp.min())
    self.assert_eq(psdf.timestamp.max(), pdf.timestamp.max())
|
|
|
|
def test_droplevel(self):
    # Compare DataFrame.droplevel between pandas and pandas-on-Spark on both axes,
    # for string level names, tuple level names and float level names, plus the
    # arguments that pandas-on-Spark must reject.
    # droplevel is new in pandas 0.24.0
    pandas_has_droplevel = LooseVersion(pd.__version__) >= LooseVersion("0.24.0")

    pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
    pdf = pdf.set_index([0, 1]).rename_axis(["a", "b"])
    pdf.columns = pd.MultiIndex.from_tuples(
        [("c", "e"), ("d", "f")], names=["level_1", "level_2"]
    )
    psdf = ps.from_pandas(pdf)

    rejected_index_levels = [
        (ValueError, ["a", "b"]),
        (ValueError, [1, 1, 1, 1, 1]),
        (IndexError, 2),
        (IndexError, -3),
        (KeyError, {"a"}),
        (KeyError, {"a": 1}),
    ]
    for error, level in rejected_index_levels:
        with self.assertRaises(error):
            psdf.droplevel(level)

    rejected_column_levels = [
        (ValueError, ["level_1", "level_2"]),
        (IndexError, 2),
        (IndexError, -3),
        (KeyError, {"level_1"}),
        (KeyError, {"level_1": 1}),
    ]
    for error, level in rejected_column_levels:
        with self.assertRaises(error):
            psdf.droplevel(level, axis=1)

    if pandas_has_droplevel:
        for level in ("a", ["a"], ("a",), 0, -1):
            self.assert_eq(pdf.droplevel(level), psdf.droplevel(level))

        for level in ("level_1", ["level_1"], ("level_1",), 0, -1):
            self.assert_eq(pdf.droplevel(level, axis=1), psdf.droplevel(level, axis=1))
    else:
        # Build the expected frames through Index.droplevel instead.
        expected = pdf.copy()
        expected.index = expected.index.droplevel("a")

        for level in ("a", ["a"], ("a",), 0):
            self.assert_eq(expected, psdf.droplevel(level))

        expected = pdf.copy()
        expected.index = expected.index.droplevel(-1)

        self.assert_eq(expected, psdf.droplevel(-1))

        expected = pdf.copy()
        expected.columns = expected.columns.droplevel("level_1")

        for level in ("level_1", ["level_1"], ("level_1",), 0):
            self.assert_eq(expected, psdf.droplevel(level, axis=1))

        expected = pdf.copy()
        expected.columns = expected.columns.droplevel(-1)

        self.assert_eq(expected, psdf.droplevel(-1, axis=1))

    # Tupled names
    pdf.columns.names = [("level", 1), ("level", 2)]
    pdf.index.names = [("a", 10), ("x", 20)]
    psdf = ps.from_pandas(pdf)

    for level in ("a", ("a", 10)):
        with self.assertRaises(KeyError):
            psdf.droplevel(level)

    if pandas_has_droplevel:
        self.assert_eq(pdf.droplevel([("a", 10)]), psdf.droplevel([("a", 10)]))
        self.assert_eq(
            pdf.droplevel([("level", 1)], axis=1), psdf.droplevel([("level", 1)], axis=1)
        )
    else:
        expected = pdf.copy()
        expected.index = expected.index.droplevel([("a", 10)])

        self.assert_eq(expected, psdf.droplevel([("a", 10)]))

        expected = pdf.copy()
        expected.columns = expected.columns.droplevel([("level", 1)])

        self.assert_eq(expected, psdf.droplevel([("level", 1)], axis=1))

    # non-string names
    pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
    pdf = pdf.set_index([0, 1]).rename_axis([10.0, 20.0])
    pdf.columns = pd.MultiIndex.from_tuples([("c", "e"), ("d", "f")], names=[100.0, 200.0])
    psdf = ps.from_pandas(pdf)

    if pandas_has_droplevel:
        for level in (10.0, [10.0], (10.0,), 0, -1):
            self.assert_eq(pdf.droplevel(level), psdf.droplevel(level))
        for level in (100.0, 0):
            self.assert_eq(pdf.droplevel(level, axis=1), psdf.droplevel(level, axis=1))
    else:
        expected = pdf.copy()
        expected.index = expected.index.droplevel(10.0)

        for level in (10.0, [10.0], (10.0,), 0):
            self.assert_eq(expected, psdf.droplevel(level))

        expected = pdf.copy()
        expected.index = expected.index.droplevel(-1)
        self.assert_eq(expected, psdf.droplevel(-1))

        expected = pdf.copy()
        expected.columns = expected.columns.droplevel(100.0)

        for level in (100.0, 0):
            self.assert_eq(expected, psdf.droplevel(level, axis=1))
|
|
|
|
def test_drop(self):
    # Compare DataFrame.drop between pandas and pandas-on-Spark for string,
    # MultiIndex and integer column labels, and check the error cases.
    pdf = pd.DataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}, index=np.random.rand(2))
    psdf = ps.from_pandas(pdf)

    # Assert 'labels' or 'columns' parameter is set
    self.assertRaisesRegex(
        ValueError,
        "Need to specify at least one of 'labels' or 'columns'",
        lambda: psdf.drop(),
    )
    # Assert axis cannot be 0
    self.assertRaisesRegex(
        NotImplementedError,
        "Drop currently only works for axis=1",
        lambda: psdf.drop("x", axis=0),
    )

    # Assert using a str for 'labels' works
    without_x = pdf.drop("x", axis=1)
    self.assert_eq(psdf.drop("x", axis=1), without_x)
    # Assert axis is 1 by default
    self.assert_eq(psdf.drop("x"), pdf.drop("x", axis=1))
    # Assert using a list for 'labels' works
    self.assert_eq(psdf.drop(["y", "z"], axis=1), pdf.drop(["y", "z"], axis=1))
    # Assert using 'columns' instead of 'labels' produces the same results
    for dropped in ("x", ["y", "z"]):
        self.assert_eq(psdf.drop(columns=dropped), pdf.drop(columns=dropped))

    # Assert 'labels' being used when both 'labels' and 'columns' are specified
    # TODO: should throw an error?
    expected_output = pd.DataFrame({"y": [3, 4], "z": [5, 6]}, index=psdf.index.to_pandas())
    actual_output = psdf.drop(labels=["x"], columns=["y"])
    self.assert_eq(actual_output, expected_output)

    # Two-level column labels.
    pdf.columns = pd.MultiIndex.from_tuples([(1, "x"), (1, "y"), (2, "z")])
    psdf = ps.from_pandas(pdf)

    for dropped in (1, (1, "x"), [(1, "x"), 2]):
        self.assert_eq(psdf.drop(columns=dropped), pdf.drop(columns=dropped))

    for missing in (3, (1, "z")):
        with self.assertRaises(KeyError):
            psdf.drop(columns=missing)

    # non-string names
    pdf = pd.DataFrame({10: [1, 2], 20: [3, 4], 30: [5, 6]}, index=np.random.rand(2))
    psdf = ps.from_pandas(pdf)

    for dropped in (10, [20, 30]):
        self.assert_eq(psdf.drop(dropped), pdf.drop(dropped, axis=1))
|
|
|
|
def _test_dropna(self, pdf, axis):
    # Shared body for the dropna tests: compares DataFrame.dropna between ``pdf``
    # and its pandas-on-Spark conversion along ``axis`` for the how/subset/thresh
    # options, checks in-place dropping, then repeats the comparisons after
    # assigning MultiIndex labels.  Note that ``pdf`` is modified in place (its
    # columns or index are replaced) in the second half.
    psdf = ps.from_pandas(pdf)

    def compare(psdf, pdf, ps_kwargs, pd_kwargs=None):
        # ``pd_kwargs`` defaults to ``ps_kwargs`` when both sides take the same options.
        if pd_kwargs is None:
            pd_kwargs = ps_kwargs
        self.assert_eq(psdf.dropna(axis=axis, **ps_kwargs), pdf.dropna(axis=axis, **pd_kwargs))

    flat_cases = [
        ({}, None),
        ({"how": "all"}, None),
        ({"subset": ["x"]}, None),
        # A bare label on the pandas-on-Spark side vs. a list on the pandas side.
        ({"subset": "x"}, {"subset": ["x"]}),
        ({"subset": ["y", "z"]}, None),
        ({"subset": ["y", "z"], "how": "all"}, None),
        ({"thresh": 2}, None),
        ({"thresh": 1, "subset": ["y", "z"]}, None),
    ]
    for ps_kwargs, pd_kwargs in flat_cases:
        compare(psdf, pdf, ps_kwargs, pd_kwargs)

    # In-place dropna on copies; a Series taken from each copy beforehand is
    # compared afterwards as well.
    pdf_copy = pdf.copy()
    psdf_copy = psdf.copy()
    pser = pdf_copy[pdf_copy.columns[0]]
    psser = psdf_copy[psdf_copy.columns[0]]
    pdf_copy.dropna(inplace=True)
    psdf_copy.dropna(inplace=True)
    self.assert_eq(psdf_copy, pdf_copy)
    self.assert_eq(psser, pser)

    # multi-index
    multi_labels = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
    if axis == 0:
        pdf.columns = multi_labels
    else:
        pdf.index = multi_labels
    psdf = ps.from_pandas(pdf)

    multi_cases = [
        ({}, None),
        ({"how": "all"}, None),
        ({"subset": [("a", "x")]}, None),
        # A bare tuple on the pandas-on-Spark side vs. a list on the pandas side.
        ({"subset": ("a", "x")}, {"subset": [("a", "x")]}),
        ({"subset": [("a", "y"), ("b", "z")]}, None),
        ({"subset": [("a", "y"), ("b", "z")], "how": "all"}, None),
        ({"thresh": 2}, None),
        ({"thresh": 1, "subset": [("a", "y"), ("b", "z")]}, None),
    ]
    for ps_kwargs, pd_kwargs in multi_cases:
        compare(psdf, pdf, ps_kwargs, pd_kwargs)
|
|
|
|
def test_dropna_axis_index(self):
    # Row-wise dropna: run the shared checks with axis=0, then cover a frame that
    # has an index but no columns, and finally the invalid-argument errors.
    data = {
        "x": [np.nan, 2, 3, 4, np.nan, 6],
        "y": [1, 2, np.nan, 4, np.nan, np.nan],
        "z": [1, 2, 3, 4, np.nan, np.nan],
    }
    pdf = pd.DataFrame(data, index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    self._test_dropna(pdf, axis=0)

    # empty
    pdf = pd.DataFrame(index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    for kwargs in ({}, {"how": "all"}, {"thresh": 0}, {"thresh": 1}):
        self.assert_eq(psdf.dropna(**kwargs), pdf.dropna(**kwargs))

    self.assertRaisesRegex(ValueError, "No axis named foo", lambda: psdf.dropna(axis="foo"))

    with self.assertRaises(KeyError):
        psdf.dropna(subset="1")
    self.assertRaisesRegex(ValueError, "invalid how option: 1", lambda: psdf.dropna(how=1))
    self.assertRaisesRegex(
        TypeError, "must specify how or thresh", lambda: psdf.dropna(how=None)
    )
|
|
|
|
def test_dropna_axis_column(self):
    # Column-wise dropna: the fixture is transposed so that "x", "y", "z" become
    # row labels, then the shared checks run with axis=1.  A frame with columns
    # but no rows is covered afterwards.
    data = {
        "x": [np.nan, 2, 3, 4, np.nan, 6],
        "y": [1, 2, np.nan, 4, np.nan, np.nan],
        "z": [1, 2, 3, 4, np.nan, np.nan],
    }
    string_index = [str(r) for r in np.random.rand(6)]
    pdf = pd.DataFrame(data, index=string_index).T

    self._test_dropna(pdf, axis=1)

    # empty
    pdf = pd.DataFrame({"x": [], "y": [], "z": []})
    psdf = ps.from_pandas(pdf)

    for kwargs in ({}, {"how": "all"}, {"thresh": 0}, {"thresh": 1}):
        self.assert_eq(psdf.dropna(axis=1, **kwargs), pdf.dropna(axis=1, **kwargs))
|
|
|
|
def test_dtype(self):
    # dtypes reported by pandas-on-Spark must equal the pandas dtypes for string,
    # integer, int8, float64, boolean and datetime columns, both with flat and
    # with MultiIndex column labels.
    data = {
        "a": list("abc"),
        "b": list(range(1, 4)),
        "c": np.arange(3, 6).astype("i1"),
        "d": np.arange(4.0, 7.0, dtype="float64"),
        "e": [True, False, True],
        "f": pd.date_range("20130101", periods=3),
    }
    pdf = pd.DataFrame(data, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)
    self.assert_eq(psdf, pdf)
    dtypes_match = (psdf.dtypes == pdf.dtypes).all()
    self.assertTrue(dtypes_match)

    # multi-index columns
    label_pairs = zip(list("xxxyyz"), list("abcdef"))
    multi_columns = pd.MultiIndex.from_tuples(label_pairs)
    pdf.columns = multi_columns
    psdf.columns = multi_columns
    dtypes_match = (psdf.dtypes == pdf.dtypes).all()
    self.assertTrue(dtypes_match)
|
|
|
|
def test_fillna(self):
    # Compare DataFrame.fillna between pandas and pandas-on-Spark: scalar, dict and
    # Series fill values, ffill/bfill with and without limit, in-place filling,
    # MultiIndex rows and columns, and the rejected argument combinations.

    def check_value(psdf, pdf, value):
        self.assert_eq(psdf.fillna(value), pdf.fillna(value))

    def check_methods(psdf, pdf, cases):
        # The pandas result is passed first here, as in each method-based comparison.
        for kwargs in cases:
            self.assert_eq(pdf.fillna(**kwargs), psdf.fillna(**kwargs))

    all_method_cases = [
        {"method": "ffill"},
        {"method": "ffill", "limit": 2},
        {"method": "bfill"},
        {"method": "bfill", "limit": 2},
    ]

    pdf = pd.DataFrame(
        {
            "x": [np.nan, 2, 3, 4, np.nan, 6],
            "y": [1, 2, np.nan, 4, np.nan, np.nan],
            "z": [1, 2, 3, 4, np.nan, np.nan],
        },
        index=np.random.rand(6),
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf, pdf)
    check_value(psdf, pdf, -1)
    check_value(psdf, pdf, {"x": -1, "y": -2, "z": -5})
    check_methods(psdf, pdf, all_method_cases)

    pdf = pdf.set_index(["x", "y"])
    psdf = ps.from_pandas(pdf)
    # check multi index
    check_value(psdf, pdf, -1)
    check_methods(psdf, pdf, [{"method": "bfill"}, {"method": "ffill"}])

    # In-place fill; the Series taken beforehand are compared afterwards too.
    pser = pdf.z
    psser = psdf.z
    pdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
    psdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
    self.assert_eq(psdf, pdf)
    self.assert_eq(psser, pser)

    s_nan = pd.Series([-1, -2, -5], index=["x", "y", "z"], dtype=int)
    check_value(psdf, pdf, s_nan)

    rejected_calls = [
        (NotImplementedError, "fillna currently only", lambda: psdf.fillna(-1, axis=1)),
        (
            NotImplementedError,
            "fillna currently only",
            lambda: psdf.fillna(-1, axis="columns"),
        ),
        (
            ValueError,
            "limit parameter for value is not support now",
            lambda: psdf.fillna(-1, limit=1),
        ),
        (
            TypeError,
            "Unsupported.*DataFrame",
            lambda: psdf.fillna(pd.DataFrame({"x": [-1], "y": [-1], "z": [-1]})),
        ),
        (
            TypeError,
            "Unsupported.*int64",
            lambda: psdf.fillna({"x": np.int64(-6), "y": np.int64(-4), "z": -5}),
        ),
        (
            ValueError,
            "Expecting 'pad', 'ffill', 'backfill' or 'bfill'.",
            lambda: psdf.fillna(method="xxx"),
        ),
        (
            ValueError,
            "Must specify a fillna 'value' or 'method' parameter.",
            lambda: psdf.fillna(),
        ),
    ]
    for error, pattern, call in rejected_calls:
        with self.assertRaisesRegex(error, pattern):
            call()

    # multi-index columns
    pdf = pd.DataFrame(
        {
            ("x", "a"): [np.nan, 2, 3, 4, np.nan, 6],
            ("x", "b"): [1, 2, np.nan, 4, np.nan, np.nan],
            ("y", "c"): [1, 2, 3, 4, np.nan, np.nan],
        },
        index=np.random.rand(6),
    )
    psdf = ps.from_pandas(pdf)

    check_value(psdf, pdf, -1)
    check_value(psdf, pdf, {("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5})
    check_methods(psdf, pdf, all_method_cases)

    # A key naming only the first column level.
    check_value(psdf, pdf, {"x": -1})

    if sys.version_info >= (3, 6):
        # flaky in Python 3.5.
        check_value(psdf, pdf, {"x": -1, ("x", "b"): -2})
        check_value(psdf, pdf, {("x", "b"): -2, "x": -1})

    # check multi index
    pdf = pdf.set_index([("x", "a"), ("x", "b")])
    psdf = ps.from_pandas(pdf)
    check_value(psdf, pdf, -1)
    check_value(psdf, pdf, {("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5})
|
|
|
|
def test_isnull(self):
    # notnull() and isnull() must agree with pandas on a frame that mixes a
    # numeric column containing None with a string column.
    data = {"x": [1, 2, 3, 4, None, 6], "y": list("abdabd")}
    pdf = pd.DataFrame(data, index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    for method_name in ("notnull", "isnull"):
        self.assert_eq(getattr(psdf, method_name)(), getattr(pdf, method_name)())
|
|
|
|
def test_to_datetime(self):
    # ps.to_datetime on a year/month/day frame must match pd.to_datetime.
    date_parts = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}
    pdf = pd.DataFrame(date_parts, index=np.random.rand(2))
    psdf = ps.from_pandas(pdf)

    expected = pd.to_datetime(pdf)
    actual = ps.to_datetime(psdf)
    self.assert_eq(expected, actual)
|
|
|
|
def test_nunique(self):
    # Compare DataFrame.nunique with pandas (NaN dropped and NaN counted), check
    # the approximate-count mode against fixed expected values, and check that
    # axis=1 is rejected.
    pdf = pd.DataFrame({"A": [1, 2, 3], "B": [np.nan, 3, np.nan]}, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)

    # NaNs are dropped by default; dropna=False includes them.
    exact_cases = ({}, {"dropna": False})
    for kwargs in exact_cases:
        self.assert_eq(psdf.nunique(**kwargs), pdf.nunique(**kwargs))

    # Assert approximate counts
    approx_default = ps.DataFrame({"A": range(100)}).nunique(approx=True)
    self.assert_eq(approx_default, pd.Series([103], index=["A"]))
    approx_tight = ps.DataFrame({"A": range(100)}).nunique(approx=True, rsd=0.01)
    self.assert_eq(approx_tight, pd.Series([100], index=["A"]))

    # Assert unsupported axis value yet
    self.assertRaisesRegex(
        NotImplementedError,
        'axis should be either 0 or "index" currently.',
        lambda: psdf.nunique(axis=1),
    )

    # multi-index columns
    multi_columns = pd.MultiIndex.from_tuples([("X", "A"), ("Y", "B")], names=["1", "2"])
    pdf.columns = multi_columns
    psdf.columns = multi_columns

    for kwargs in exact_cases:
        self.assert_eq(psdf.nunique(**kwargs), pdf.nunique(**kwargs))
|
|
|
|
def test_sort_values(self):
    # Compare DataFrame.sort_values between pandas and pandas-on-Spark for single
    # and multiple sort keys, the ascending/na_position options, in-place sorting,
    # MultiIndex columns and integer column labels.

    def check_sorted(psdf, pdf, by, **options):
        self.assert_eq(psdf.sort_values(by, **options), pdf.sort_values(by, **options))

    pdf = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)
    )
    psdf = ps.from_pandas(pdf)
    check_sorted(psdf, pdf, "b")
    check_sorted(psdf, pdf, ["b", "a"])
    check_sorted(psdf, pdf, ["b", "a"], ascending=[False, True])

    # One ``ascending`` flag for two sort keys is rejected.
    with self.assertRaises(ValueError):
        psdf.sort_values(["b", "a"], ascending=[False])

    check_sorted(psdf, pdf, ["b", "a"], na_position="first")

    with self.assertRaises(ValueError):
        psdf.sort_values(["b", "a"], na_position="invalid")

    # In-place sort: compare the return values of both calls, the frames, and the
    # Series taken before sorting.
    pser_a = pdf.a
    psser_a = psdf.a
    ps_returned = psdf.sort_values("b", inplace=True)
    pd_returned = pdf.sort_values("b", inplace=True)
    self.assert_eq(ps_returned, pd_returned)
    self.assert_eq(psdf, pdf)
    self.assert_eq(psser_a, pser_a)

    # multi-index columns
    pdf = pd.DataFrame(
        {("X", 10): [1, 2, 3, 4, 5, None, 7], ("X", 20): [7, 6, 5, 4, 3, 2, 1]},
        index=np.random.rand(7),
    )
    psdf = ps.from_pandas(pdf)

    check_sorted(psdf, pdf, ("X", 20))
    check_sorted(psdf, pdf, [("X", 20), ("X", 10)])

    with self.assertRaisesRegex(
        ValueError, "For a multi-index, the label must be a tuple with elements"
    ):
        psdf.sort_values(["X"])

    # non-string names
    pdf = pd.DataFrame(
        {10: [1, 2, 3, 4, 5, None, 7], 20: [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)
    )
    psdf = ps.from_pandas(pdf)

    check_sorted(psdf, pdf, 20)
    check_sorted(psdf, pdf, [20, 10])
|
|
|
|
def test_sort_index(self):
    # Compare DataFrame.sort_index between pandas and pandas-on-Spark, including a
    # NaN index label, in-place sorting, MultiIndex rows and MultiIndex columns.
    pdf = pd.DataFrame(
        {"A": [2, 1, np.nan], "B": [np.nan, 0, np.nan]}, index=["b", "a", np.nan]
    )
    psdf = ps.from_pandas(pdf)

    # Assert invalid parameters
    rejected_options = [
        (NotImplementedError, {"axis": 1}),
        (NotImplementedError, {"kind": "mergesort"}),
        (ValueError, {"na_position": "invalid"}),
    ]
    for error, options in rejected_options:
        with self.assertRaises(error):
            psdf.sort_index(**options)

    # Default behavior, descending order, and NA index labels first.
    for options in ({}, {"ascending": False}, {"na_position": "first"}):
        self.assert_eq(psdf.sort_index(**options), pdf.sort_index(**options))

    # Assert sorting inplace
    pser_a = pdf.A
    psser_a = psdf.A
    ps_returned = psdf.sort_index(inplace=True)
    pd_returned = pdf.sort_index(inplace=True)
    self.assertEqual(ps_returned, pd_returned)
    self.assert_eq(psdf, pdf)
    self.assert_eq(psser_a, pser_a)

    # Assert multi-indices
    pdf = pd.DataFrame(
        {"A": range(4), "B": range(4)[::-1]}, index=[["b", "b", "a", "a"], [1, 0, 1, 0]]
    )
    psdf = ps.from_pandas(pdf)
    self.assert_eq(psdf.sort_index(), pdf.sort_index())
    self.assert_eq(psdf.sort_index(level=[1, 0]), pdf.sort_index(level=[1, 0]))
    self.assert_eq(psdf.reset_index().sort_index(), pdf.reset_index().sort_index())

    # Assert with multi-index columns
    multi_columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
    pdf.columns = multi_columns
    psdf.columns = multi_columns

    self.assert_eq(psdf.sort_index(), pdf.sort_index())
|
|
|
|
def test_swaplevel(self):
    # Compare DataFrame.swaplevel between pandas and pandas-on-Spark for row
    # MultiIndexes with two and three levels and for a three-level column
    # MultiIndex, addressing levels by position and by name, then check the
    # error conditions.

    def check_swaps(pdf, psdf, level_pairs, **options):
        # An empty tuple in ``level_pairs`` means "call swaplevel with default levels".
        for levels in level_pairs:
            self.assert_eq(
                pdf.swaplevel(*levels, **options), psdf.swaplevel(*levels, **options)
            )

    # MultiIndex with two levels
    arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
    pdf = pd.DataFrame({"x1": ["a", "b", "c", "d"], "x2": ["a", "b", "c", "d"]}, index=pidx)
    psdf = ps.from_pandas(pdf)
    check_swaps(pdf, psdf, [(), (0, 1), (1, 1), ("number", "color")])

    # MultiIndex with more than two levels
    three_level_pairs = [
        (),
        (0, 1),
        (0, 2),
        (1, 2),
        (1, 1),
        (-1, -2),
        ("number", "color"),
        ("number", "size"),
        ("color", "size"),
    ]
    arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"], ["l", "m", "s", "xs"]]
    pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color", "size"))
    pdf = pd.DataFrame({"x1": ["a", "b", "c", "d"], "x2": ["a", "b", "c", "d"]}, index=pidx)
    psdf = ps.from_pandas(pdf)
    check_swaps(pdf, psdf, three_level_pairs)
    check_swaps(pdf, psdf, [("color", "size")], axis="index")
    check_swaps(pdf, psdf, [("color", "size")], axis=0)

    # The same three-level index used as column labels.
    pdf = pd.DataFrame(
        {
            "x1": ["a", "b", "c", "d"],
            "x2": ["a", "b", "c", "d"],
            "x3": ["a", "b", "c", "d"],
            "x4": ["a", "b", "c", "d"],
        }
    )
    pidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color", "size"))
    pdf.columns = pidx
    psdf = ps.from_pandas(pdf)
    check_swaps(pdf, psdf, three_level_pairs, axis=1)
    check_swaps(pdf, psdf, [("color", "size")], axis="columns")

    # Error conditions
    with self.assertRaises(AssertionError):
        ps.DataFrame([1, 2]).swaplevel()
    with self.assertRaises(IndexError):
        psdf.swaplevel(0, 9, axis=1)
    with self.assertRaises(KeyError):
        psdf.swaplevel("not_number", "color", axis=1)
    with self.assertRaises(ValueError):
        psdf.swaplevel(axis=2)
|
|
|
|
def test_swapaxes(self):
    # Compare ``swapaxes`` on a pandas-on-Spark frame with the same call on pandas,
    # then check the argument combinations that are expected to raise.
    pdf = pd.DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["x", "y", "z"],
        columns=["a", "b", "c"],
    )
    psdf = ps.from_pandas(pdf)

    # The two axes, spelled positionally and by name, in both orders.
    for axes in ((0, 1), (1, 0), ("index", "columns"), ("columns", "index")):
        self.assert_eq(psdf.swapaxes(*axes), pdf.swapaxes(*axes))
    # Same call on a frame produced by an arithmetic expression.
    self.assert_eq((psdf + 1).swapaxes(0, 1), (pdf + 1).swapaxes(0, 1))

    with self.assertRaises(AssertionError):
        psdf.swapaxes(0, 1, copy=False)
    with self.assertRaises(ValueError):
        psdf.swapaxes(0, -1)
|
|
|
|
def test_nlargest(self):
    # ``nlargest`` is compared with pandas for a single sort column and for a list
    # of sort columns; column "a" contains a missing value.
    data = {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}
    pdf = pd.DataFrame(data, index=np.random.rand(7))
    psdf = ps.from_pandas(pdf)

    for columns in ("a", ["a", "b"]):
        self.assert_eq(psdf.nlargest(n=5, columns=columns), pdf.nlargest(5, columns=columns))
|
|
|
|
def test_nsmallest(self):
    # ``nsmallest`` is compared with pandas for a single sort column and for a list
    # of sort columns; column "a" contains a missing value.
    data = {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}
    pdf = pd.DataFrame(data, index=np.random.rand(7))
    psdf = ps.from_pandas(pdf)

    for columns in ("a", ["a", "b"]):
        self.assert_eq(
            psdf.nsmallest(n=5, columns=columns), pdf.nsmallest(5, columns=columns)
        )
|
|
|
|
def test_xs(self):
    # ``xs`` on a multi-index frame is compared with pandas for full/partial keys
    # and explicit levels, followed by the calls that are expected to raise.
    d = {
        "num_legs": [4, 4, 2, 2],
        "num_wings": [0, 0, 2, 2],
        "class": ["mammal", "mammal", "mammal", "bird"],
        "animal": ["cat", "dog", "bat", "penguin"],
        "locomotion": ["walks", "walks", "flies", "walks"],
    }
    pdf = pd.DataFrame(data=d).set_index(["class", "animal", "locomotion"])
    psdf = ps.from_pandas(pdf)

    full_key = ("mammal", "dog", "walks")
    for key in ("mammal", ("mammal",), full_key):
        self.assert_eq(psdf.xs(key), pdf.xs(key))
    # Concatenating the frame with itself duplicates every index entry.
    self.assert_eq(ps.concat([psdf, psdf]).xs(full_key), pd.concat([pdf, pdf]).xs(full_key))
    for key, level in (("cat", 1), ("flies", 2), ("mammal", -3)):
        self.assert_eq(psdf.xs(key, level=level), pdf.xs(key, level=level))

    with self.assertRaisesRegex(
        NotImplementedError, 'axis should be either 0 or "index" currently.'
    ):
        psdf.xs("num_wings", axis=1)
    with self.assertRaises(KeyError):
        psdf.xs(("mammal", "dog", "walk"))
    with self.assertRaisesRegex(KeyError, r"'Key length \(4\) exceeds index depth \(3\)'"):
        psdf.xs(("mammal", "dog", "walks", "foo"))

    # Levels outside the three index levels.
    for bad_level in (-4, 3):
        with self.assertRaises(IndexError):
            psdf.xs("foo", level=bad_level)

    with self.assertRaises(KeyError):
        psdf.xs(("dog", "walks"), level=1)

    # non-string names
    pdf = pd.DataFrame(data=d).set_index(["class", "animal", "num_legs", "num_wings"])
    psdf = ps.from_pandas(pdf)

    # First the frames as they are, then the frames with "a" added to them.
    for variant in (lambda frame: frame, lambda frame: frame + "a"):
        self.assert_eq(
            variant(psdf).xs(("mammal", "dog", 4)), variant(pdf).xs(("mammal", "dog", 4))
        )
        self.assert_eq(variant(psdf).xs(2, level=2), variant(pdf).xs(2, level=2))
|
|
|
|
def test_missing(self):
    # Every function and property found on ``_MissingPandasLikeDataFrame`` whose
    # implementation is named ``unsupported_*`` / ``deprecated_*`` must raise
    # ``PandasNotImplementedError`` with a message that mentions its name.
    psdf = self.psdf

    def names_tagged(members, tag, unwrap=lambda member: member):
        # Names of the members whose (unwrapped) object has ``__name__ == tag``.
        return [name for name, member in members if unwrap(member).__name__ == tag]

    missing_functions = inspect.getmembers(_MissingPandasLikeDataFrame, inspect.isfunction)

    for name in names_tagged(missing_functions, "unsupported_function"):
        pattern = "method.*DataFrame.*{}.*not implemented( yet\\.|\\. .+)".format(name)
        with self.assertRaisesRegex(PandasNotImplementedError, pattern):
            getattr(psdf, name)()

    for name in names_tagged(missing_functions, "deprecated_function"):
        pattern = "method.*DataFrame.*{}.*is deprecated".format(name)
        with self.assertRaisesRegex(PandasNotImplementedError, pattern):
            getattr(psdf, name)()

    missing_properties = inspect.getmembers(
        _MissingPandasLikeDataFrame, lambda o: isinstance(o, property)
    )

    def getter(prop):
        # For properties the tag is carried by the getter function.
        return prop.fget

    for name in names_tagged(missing_properties, "unsupported_property", getter):
        pattern = "property.*DataFrame.*{}.*not implemented( yet\\.|\\. .+)".format(name)
        with self.assertRaisesRegex(PandasNotImplementedError, pattern):
            getattr(psdf, name)

    for name in names_tagged(missing_properties, "deprecated_property", getter):
        pattern = "property.*DataFrame.*{}.*is deprecated".format(name)
        with self.assertRaisesRegex(PandasNotImplementedError, pattern):
            getattr(psdf, name)
|
|
|
|
def test_to_numpy(self):
    # ``to_numpy`` on the pandas-on-Spark frame is compared with ``values`` of the
    # pandas frame it was built from (mixed integer and string columns).
    data = {
        "a": [4, 2, 3, 4, 8, 6],
        "b": [1, 2, 9, 4, 2, 4],
        "c": ["one", "three", "six", "seven", "one", "5"],
    }
    pdf = pd.DataFrame(data, index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.to_numpy(), pdf.values)
|
|
|
|
def test_to_pandas(self):
    # Both spellings of the conversion are compared with the pandas frame of the
    # shared ``df_pair`` fixture.
    pdf, psdf = self.df_pair
    for method_name in ("toPandas", "to_pandas"):
        self.assert_eq(getattr(psdf, method_name)(), pdf)
|
|
|
|
def test_isin(self):
    # ``isin`` is compared with pandas for list and per-column dict arguments, then
    # the argument types that are expected to raise are exercised.
    data = {
        "a": [4, 2, 3, 4, 8, 6],
        "b": [1, 2, 9, 4, 2, 4],
        "c": ["one", "three", "six", "seven", "one", "5"],
    }
    pdf = pd.DataFrame(data, index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    values = [4, "six"]
    self.assert_eq(psdf.isin(values), pdf.isin(values))
    # Seems like pandas has a bug when passing `np.array` as parameter
    self.assert_eq(psdf.isin(np.array(values)), pdf.isin(values))

    for per_column in (
        {"a": [2, 8], "c": ["three", "one"]},
        {"a": np.array([2, 8]), "c": ["three", "one"]},
    ):
        self.assert_eq(psdf.isin(per_column), pdf.isin(per_column))

    # A dict key that is not a column of the frame.
    with self.assertRaisesRegex(AttributeError, "'DataFrame' object has no attribute {'e'}"):
        psdf.isin({"e": [5, 7], "a": [1, 6]})

    with self.assertRaisesRegex(NotImplementedError, "DataFrame and Series are not supported"):
        psdf.isin(pdf)

    with self.assertRaisesRegex(
        TypeError, "Values should be iterable, Series, DataFrame or dict."
    ):
        psdf.isin(1)
|
|
|
|
def test_merge(self):
    # ``merge`` is compared with pandas over key selections, join types, suffixes,
    # a Series on the right-hand side, multi-index columns and non-string names.
    left_pdf = pd.DataFrame(
        {
            "lkey": ["foo", "bar", "baz", "foo", "bar", "l"],
            "value": [1, 2, 3, 5, 6, 7],
            "x": list("abcdef"),
        },
        columns=["lkey", "value", "x"],
    )
    right_pdf = pd.DataFrame(
        {
            "rkey": ["baz", "foo", "bar", "baz", "foo", "r"],
            "value": [4, 5, 6, 7, 8, 9],
            "y": list("efghij"),
        },
        columns=["rkey", "value", "y"],
    )
    right_pser = pd.Series(list("defghi"), name="x", index=[5, 6, 7, 8, 9, 10])

    left_psdf = ps.from_pandas(left_pdf)
    right_psdf = ps.from_pandas(right_pdf)
    right_psser = ps.from_pandas(right_pser)

    def normalized(frame):
        # Sort on every column and discard the index before comparing.
        return frame.sort_values(by=list(frame.columns)).reset_index(drop=True)

    def check(op, right_psdf=right_psdf, right_pdf=right_pdf):
        # Apply ``op`` to the pandas-on-Spark pair first, then to the pandas pair.
        ps_res = normalized(op(left_psdf, right_psdf).to_pandas())
        p_res = normalized(op(left_pdf, right_pdf))
        self.assert_eq(ps_res, p_res)

    for op in (
        lambda left, right: left.merge(right),
        lambda left, right: left.merge(right, on="value"),
        lambda left, right: left.merge(right, left_on="lkey", right_on="rkey"),
        lambda left, right: left.set_index("lkey").merge(right.set_index("rkey")),
        lambda left, right: left.set_index("lkey").merge(
            right, left_index=True, right_on="rkey"
        ),
        lambda left, right: left.merge(
            right.set_index("rkey"), left_on="lkey", right_index=True
        ),
        lambda left, right: left.set_index("lkey").merge(
            right.set_index("rkey"), left_index=True, right_index=True
        ),
    ):
        check(op)

    # MultiIndex
    for op in (
        lambda left, right: left.merge(
            right, left_on=["lkey", "value"], right_on=["rkey", "value"]
        ),
        lambda left, right: left.set_index(["lkey", "value"]).merge(
            right, left_index=True, right_on=["rkey", "value"]
        ),
        lambda left, right: left.merge(
            right.set_index(["rkey", "value"]), left_on=["lkey", "value"], right_index=True
        ),
    ):
        check(op)
    # TODO: when both left_index=True and right_index=True with multi-index
    # check(lambda left, right: left.set_index(['lkey', 'value']).merge(
    #     right.set_index(['rkey', 'value']), left_index=True, right_index=True))

    # join types
    join_types = ("inner", "left", "right", "outer")
    for how in join_types:
        # Each lambda is called inside ``check`` before ``how`` moves on.
        check(lambda left, right: left.merge(right, on="value", how=how))
        check(lambda left, right: left.merge(right, left_on="lkey", right_on="rkey", how=how))

    # suffix
    check(
        lambda left, right: left.merge(
            right, left_on="lkey", right_on="rkey", suffixes=["_left", "_right"]
        )
    )

    # Test Series on the right
    # pd.DataFrame.merge with Series is implemented since version 0.24.0
    if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
        for op in (
            lambda left, right: left.merge(right),
            lambda left, right: left.merge(right, left_on="x", right_on="x"),
            lambda left, right: left.set_index("x").merge(right, left_index=True, right_on="x"),
        ):
            check(op, right_psser, right_pser)

        # Test join types with Series
        for how in join_types:
            check(lambda left, right: left.merge(right, how=how), right_psser, right_pser)
            check(
                lambda left, right: left.merge(right, left_on="x", right_on="x", how=how),
                right_psser,
                right_pser,
            )

        # suffix with Series
        check(
            lambda left, right: left.merge(
                right,
                suffixes=["_left", "_right"],
                how="outer",
                left_index=True,
                right_index=True,
            ),
            right_psser,
            right_pser,
        )

    # multi-index columns
    # The frames are relabelled in place, so ``check`` keeps seeing the same objects.
    left_columns = pd.MultiIndex.from_tuples([(10, "lkey"), (10, "value"), (20, "x")])
    left_pdf.columns = left_columns
    left_psdf.columns = left_columns

    right_columns = pd.MultiIndex.from_tuples([(10, "rkey"), (10, "value"), (30, "y")])
    right_pdf.columns = right_columns
    right_psdf.columns = right_columns

    for op in (
        lambda left, right: left.merge(right),
        lambda left, right: left.merge(right, on=[(10, "value")]),
        lambda left, right: (left.set_index((10, "lkey")).merge(right.set_index((10, "rkey")))),
        lambda left, right: (
            left.set_index((10, "lkey")).merge(
                right.set_index((10, "rkey")), left_index=True, right_index=True
            )
        ),
    ):
        check(op)
    # TODO: when both left_index=True and right_index=True with multi-index columns
    # check(lambda left, right: left.merge(right,
    #                                      left_on=[('a', 'lkey')], right_on=[('a', 'rkey')]))
    # check(lambda left, right: (left.set_index(('a', 'lkey'))
    #                            .merge(right, left_index=True, right_on=[('a', 'rkey')])))

    # non-string names
    left_pdf.columns = [10, 100, 1000]
    left_psdf.columns = [10, 100, 1000]

    right_pdf.columns = [20, 100, 2000]
    right_psdf.columns = [20, 100, 2000]

    for op in (
        lambda left, right: left.merge(right),
        lambda left, right: left.merge(right, on=[100]),
        lambda left, right: (left.set_index(10).merge(right.set_index(20))),
        lambda left, right: (
            left.set_index(10).merge(right.set_index(20), left_index=True, right_index=True)
        ),
    ):
        check(op)
|
|
|
|
def test_merge_same_anchor(self):
    # Same merges as the basic merge test, but both operands are column selections
    # taken from one and the same frame.
    pdf = pd.DataFrame(
        {
            "lkey": ["foo", "bar", "baz", "foo", "bar", "l"],
            "rkey": ["baz", "foo", "bar", "baz", "foo", "r"],
            "value": [1, 1, 3, 5, 6, 7],
            "x": list("abcdef"),
            "y": list("efghij"),
        },
        columns=["lkey", "rkey", "value", "x", "y"],
    )
    psdf = ps.from_pandas(pdf)

    left_cols = ["lkey", "value", "x"]
    right_cols = ["rkey", "value", "y"]
    left_pdf = pdf[left_cols]
    right_pdf = pdf[right_cols]
    left_psdf = psdf[left_cols]
    right_psdf = psdf[right_cols]

    def normalized(frame):
        # Sort on every column and discard the index before comparing.
        return frame.sort_values(by=list(frame.columns)).reset_index(drop=True)

    def check(op, right_psdf=right_psdf, right_pdf=right_pdf):
        # Apply ``op`` to the pandas-on-Spark pair first, then to the pandas pair.
        ps_res = normalized(op(left_psdf, right_psdf).to_pandas())
        p_res = normalized(op(left_pdf, right_pdf))
        self.assert_eq(ps_res, p_res)

    for op in (
        lambda left, right: left.merge(right),
        lambda left, right: left.merge(right, on="value"),
        lambda left, right: left.merge(right, left_on="lkey", right_on="rkey"),
        lambda left, right: left.set_index("lkey").merge(right.set_index("rkey")),
        lambda left, right: left.set_index("lkey").merge(
            right, left_index=True, right_on="rkey"
        ),
        lambda left, right: left.merge(
            right.set_index("rkey"), left_on="lkey", right_index=True
        ),
        lambda left, right: left.set_index("lkey").merge(
            right.set_index("rkey"), left_index=True, right_index=True
        ),
    ):
        check(op)
|
|
|
|
def test_merge_retains_indices(self):
    # The merge result, index included, is compared with pandas for every
    # combination of index/column keys on the two sides.
    left_pdf = pd.DataFrame({"A": [0, 1]})
    right_pdf = pd.DataFrame({"B": [1, 2]}, index=[1, 2])
    left_psdf = ps.from_pandas(left_pdf)
    right_psdf = ps.from_pandas(right_pdf)

    key_choices = (
        {"left_index": True, "right_index": True},
        {"left_on": "A", "right_index": True},
        {"left_index": True, "right_on": "B"},
        {"left_on": "A", "right_on": "B"},
    )
    for keys in key_choices:
        self.assert_eq(
            left_psdf.merge(right_psdf, **keys),
            left_pdf.merge(right_pdf, **keys),
        )
|
|
|
|
def test_merge_how_parameter(self):
    # Index-on-index merges are compared with pandas, first without ``how`` and then
    # with "left", "right" and "outer"; rows are sorted before comparing.
    left_pdf = pd.DataFrame({"A": [1, 2]})
    right_pdf = pd.DataFrame({"B": ["x", "y"]}, index=[1, 2])
    left_psdf = ps.from_pandas(left_pdf)
    right_psdf = ps.from_pandas(right_pdf)

    def normalized(frame):
        # Sort on every column and discard the index.
        return frame.sort_values(by=list(frame.columns)).reset_index(drop=True)

    # The empty dict is the call that leaves ``how`` unspecified.
    for how_kwargs in ({}, {"how": "left"}, {"how": "right"}, {"how": "outer"}):
        psdf = left_psdf.merge(right_psdf, left_index=True, right_index=True, **how_kwargs)
        pdf = left_pdf.merge(right_pdf, left_index=True, right_index=True, **how_kwargs)
        self.assert_eq(normalized(psdf), normalized(pdf))
|
|
|
|
def test_merge_raises(self):
    # Invalid ``merge`` argument combinations must raise with the expected message.
    left = ps.DataFrame(
        {"value": [1, 2, 3, 5, 6], "x": list("abcde")},
        columns=["value", "x"],
        index=["foo", "bar", "baz", "foo", "bar"],
    )
    right = ps.DataFrame(
        {"value": [4, 5, 6, 7, 8], "y": list("fghij")},
        columns=["value", "y"],
        index=["baz", "foo", "bar", "baz", "foo"],
    )

    def expect_error(exc_type, pattern, **merge_kwargs):
        # ``left.merge(right, ...)`` must raise ``exc_type`` matching ``pattern``.
        with self.assertRaisesRegex(exc_type, pattern):
            left.merge(right, **merge_kwargs)

    with self.assertRaisesRegex(ValueError, "No common columns to perform merge on"):
        left[["x"]].merge(right[["y"]])

    expect_error(ValueError, "not a combination of both", on="value", left_on="x")

    need_right = "Must pass right_on or right_index=True"
    expect_error(ValueError, need_right, left_on="x")
    expect_error(ValueError, need_right, left_index=True)

    need_left = "Must pass left_on or left_index=True"
    expect_error(ValueError, need_left, right_on="y")
    expect_error(ValueError, need_left, right_index=True)

    key_count = "len\\(left_keys\\) must equal len\\(right_keys\\)"
    expect_error(ValueError, key_count, left_on="value", right_on=["value", "y"])
    expect_error(ValueError, key_count, left_on=["value", "x"], right_on="value")

    # NOTE(review): as a regular expression this pattern is a single character
    # class, so it matches any message containing one of the listed characters.
    expect_error(
        ValueError,
        "['inner', 'left', 'right', 'full', 'outer']",
        left_index=True,
        right_index=True,
        how="foo",
    )

    expect_error(KeyError, "id", on="id")
|
|
|
|
def test_append(self):
    # ``append`` is compared with pandas for single- and multi-index frames, with
    # and without ``ignore_index`` / ``verify_integrity``, plus the failing calls.
    #
    # NOTE(review): ``assertRaises(..., msg=...)`` only sets the text reported when
    # the assertion fails; the raised exception's message is not checked.
    pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"))
    psdf = ps.from_pandas(pdf)
    other_pdf = pd.DataFrame([[3, 4], [5, 6]], columns=list("BC"), index=[2, 3])
    other_psdf = ps.from_pandas(other_pdf)

    def same_append(ps_left, ps_right, p_left, p_right, **kwargs):
        # Append on the pandas-on-Spark pair and on the pandas pair, then compare.
        self.assert_eq(ps_left.append(ps_right, **kwargs), p_left.append(p_right, **kwargs))

    same_append(psdf, psdf, pdf, pdf)
    same_append(psdf, psdf, pdf, pdf, ignore_index=True)

    # Assert DataFrames with non-matching columns
    same_append(psdf, other_psdf, pdf, other_pdf)

    # Assert appending a Series fails
    with self.assertRaises(
        TypeError, msg="DataFrames.append() does not support appending Series to DataFrames"
    ):
        psdf.append(psdf["A"])

    # Assert using the sort parameter raises an exception
    with self.assertRaises(
        NotImplementedError, msg="The 'sort' parameter is currently not supported"
    ):
        psdf.append(psdf, sort=True)

    # Assert using 'verify_integrity' only raises an exception for overlapping indices
    same_append(psdf, other_psdf, pdf, other_pdf, verify_integrity=True)
    overlap_msg = "Indices have overlapping values"
    with self.assertRaises(ValueError, msg=overlap_msg):
        psdf.append(psdf, verify_integrity=True)

    # Skip integrity verification when ignore_index=True
    same_append(psdf, psdf, pdf, pdf, ignore_index=True, verify_integrity=True)

    # Assert appending multi-index DataFrames
    multi_index_pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[[2, 3], [4, 5]])
    multi_index_psdf = ps.from_pandas(multi_index_pdf)
    other_multi_index_pdf = pd.DataFrame(
        [[5, 6], [7, 8]], columns=list("AB"), index=[[2, 3], [6, 7]]
    )
    other_multi_index_psdf = ps.from_pandas(other_multi_index_pdf)

    same_append(multi_index_psdf, multi_index_psdf, multi_index_pdf, multi_index_pdf)

    # Assert DataFrames with non-matching columns
    same_append(
        multi_index_psdf, other_multi_index_psdf, multi_index_pdf, other_multi_index_pdf
    )

    # Assert using 'verify_integrity' only raises an exception for overlapping indices
    same_append(
        multi_index_psdf,
        other_multi_index_psdf,
        multi_index_pdf,
        other_multi_index_pdf,
        verify_integrity=True,
    )
    with self.assertRaises(ValueError, msg=overlap_msg):
        multi_index_psdf.append(multi_index_psdf, verify_integrity=True)

    # Skip integrity verification when ignore_index=True
    same_append(
        multi_index_psdf,
        multi_index_psdf,
        multi_index_pdf,
        multi_index_pdf,
        ignore_index=True,
        verify_integrity=True,
    )

    # Assert trying to append DataFrames with different index levels
    with self.assertRaises(
        ValueError, msg="Both DataFrames have to have the same number of index levels"
    ):
        psdf.append(multi_index_psdf)

    # Skip index level check when ignore_index=True
    same_append(psdf, multi_index_psdf, pdf, multi_index_pdf, ignore_index=True)

    # Repeat the basic append with multi-index columns.
    columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y")])
    pdf.columns = columns
    psdf.columns = columns

    same_append(psdf, psdf, pdf, pdf)
|
|
|
|
def test_clip(self):
    # ``clip`` is compared with pandas for every combination of bounds, on a single
    # column, and finally on a string column.
    pdf = pd.DataFrame(
        {"A": [0, 2, 4], "B": [4, 2, 0], "X": [-1, 10, 0]}, index=np.random.rand(3)
    )
    psdf = ps.from_pandas(pdf)

    # Assert list-like values are not accepted for 'lower' and 'upper'
    # NOTE(review): ``msg=`` is only the failure text; the exception message itself
    # is not checked by ``assertRaises``.
    msg = "List-like value are not supported for 'lower' and 'upper' at the moment"
    for bound in ("lower", "upper"):
        with self.assertRaises(TypeError, msg=msg):
            psdf.clip(**{bound: [1]})

    # In order: no bounds, lower only, upper only, lower and upper.
    for args, kwargs in (((), {}), ((1,), {}), ((), {"upper": 3}), ((1, 3), {})):
        self.assert_eq(psdf.clip(*args, **kwargs), pdf.clip(*args, **kwargs))

    # Store a clipped column back into both frames and compare them as a whole.
    pdf["clip"] = pdf.A.clip(lower=1, upper=3)
    psdf["clip"] = psdf.A.clip(lower=1, upper=3)
    self.assert_eq(psdf, pdf)

    # Assert behavior on string values
    # The clipped frame is expected to equal the frame itself.
    str_psdf = ps.DataFrame({"A": ["a", "b", "c"]}, index=np.random.rand(3))
    self.assert_eq(str_psdf.clip(1, 3), str_psdf)
|
|
|
|
def test_binary_operators(self):
    # Adding a frame to its copy is compared with pandas; ``add`` with a separately
    # created frame or with a column of one is expected to raise.
    pdf = pd.DataFrame(
        {"A": [0, 2, 4], "B": [4, 2, 0], "X": [-1, 10, 0]}, index=np.random.rand(3)
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf + psdf.copy(), pdf + pdf.copy())

    with self.assertRaisesRegex(ValueError, "it comes from a different dataframe"):
        ps.range(10).add(ps.range(10))

    with self.assertRaisesRegex(TypeError, "add with a sequence is currently not supported"):
        ps.range(10).add(ps.range(10).id)
|
|
|
|
def test_binary_operator_add(self):
    # ``+`` between columns: string+string and int+int are compared with pandas,
    # while mixing a string column with numbers (or an int column with a string
    # literal) is expected to raise.
    # Positive
    pdf = pd.DataFrame({"a": ["x"], "b": ["y"], "c": [1], "d": [2]})
    psdf = ps.from_pandas(pdf)

    for lhs, rhs in (("a", "b"), ("c", "d")):
        self.assert_eq(psdf[lhs] + psdf[rhs], pdf[lhs] + pdf[rhs])

    # Negative
    ps_err_msg = "string addition can only be applied to string series or literals"

    failing_ops = (
        lambda: psdf["a"] + psdf["c"],
        lambda: psdf["c"] + psdf["a"],
        lambda: psdf["c"] + "literal",
        lambda: "literal" + psdf["c"],
        lambda: 1 + psdf["a"],
        lambda: psdf["a"] + 1,
    )
    for op in failing_ops:
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()
|
|
|
|
def test_binary_operator_sub(self):
    # ``-`` between int columns is compared with pandas; any operand that is a
    # string column or string literal is expected to raise.
    # Positive
    pdf = pd.DataFrame({"a": [2], "b": [1]})
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf["a"] - psdf["b"], pdf["a"] - pdf["b"])

    # Negative
    psdf = ps.DataFrame({"a": ["x"], "b": [1]})
    ps_err_msg = "subtraction can not be applied to string series or literals"

    failing_ops = (
        lambda: psdf["a"] - psdf["b"],
        lambda: psdf["b"] - psdf["a"],
        lambda: psdf["b"] - "literal",
        lambda: "literal" - psdf["b"],
        lambda: 1 - psdf["a"],
        lambda: psdf["a"] - 1,
    )
    # These run before ``psdf`` is rebound below, so they see the mixed frame.
    for op in failing_ops:
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()

    # Both operands are string columns.
    psdf = ps.DataFrame({"a": ["x"], "b": ["y"]})
    with self.assertRaisesRegex(TypeError, ps_err_msg):
        psdf["a"] - psdf["b"]
|
|
|
|
def test_binary_operator_truediv(self):
    # ``/`` between int columns is compared with pandas; any operand that is a
    # string column or string literal is expected to raise.
    # Positive
    pdf = pd.DataFrame({"a": [3], "b": [2]})
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf["a"] / psdf["b"], pdf["a"] / pdf["b"])

    # Negative
    psdf = ps.DataFrame({"a": ["x"], "b": [1]})
    ps_err_msg = "division can not be applied on string series or literals"

    failing_ops = (
        lambda: psdf["a"] / psdf["b"],
        lambda: psdf["b"] / psdf["a"],
        lambda: psdf["b"] / "literal",
        lambda: "literal" / psdf["b"],
        lambda: 1 / psdf["a"],
    )
    for op in failing_ops:
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()
|
|
|
|
def test_binary_operator_floordiv(self):
    # ``//`` with a string column or string literal on either side must raise.
    psdf = ps.DataFrame({"a": ["x"], "b": [1]})
    ps_err_msg = "division can not be applied on string series or literals"

    failing_ops = (
        lambda: psdf["a"] // psdf["b"],
        lambda: psdf["b"] // psdf["a"],
        lambda: psdf["b"] // "literal",
        lambda: "literal" // psdf["b"],
        lambda: 1 // psdf["a"],
    )
    for op in failing_ops:
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()
|
|
|
|
def test_binary_operator_mod(self):
    # ``%`` between int columns is compared with pandas; any operand that is a
    # string column or string literal is expected to raise.
    # Positive
    pdf = pd.DataFrame({"a": [3], "b": [2]})
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf["a"] % psdf["b"], pdf["a"] % pdf["b"])

    # Negative
    psdf = ps.DataFrame({"a": ["x"], "b": [1]})
    ps_err_msg = "modulo can not be applied on string series or literals"

    failing_ops = (
        lambda: psdf["a"] % psdf["b"],
        lambda: psdf["b"] % psdf["a"],
        lambda: psdf["b"] % "literal",
        lambda: 1 % psdf["a"],
    )
    for op in failing_ops:
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()
|
|
|
|
def test_binary_operator_multiply(self):
    # ``*`` is compared with pandas for int*int, string*int and multiplication by
    # the int literal 2; the remaining string combinations are expected to raise.
    # Positive
    pdf = pd.DataFrame({"a": ["x", "y"], "b": [1, 2], "c": [3, 4]})
    psdf = ps.from_pandas(pdf)

    for lhs, rhs in (("b", "c"), ("c", "b"), ("a", "b"), ("b", "a")):
        self.assert_eq(psdf[lhs] * psdf[rhs], pdf[lhs] * pdf[rhs])
    for column in ("a", "b"):
        self.assert_eq(psdf[column] * 2, pdf[column] * 2)
    for column in ("a", "b"):
        self.assert_eq(2 * psdf[column], 2 * pdf[column])

    # Negative
    psdf = ps.DataFrame({"a": ["x"], "b": [2]})

    ps_err_msg = "multiplication can not be applied to a string literal"
    for op in (
        lambda: psdf["b"] * "literal",
        lambda: "literal" * psdf["b"],
        lambda: psdf["a"] * "literal",
    ):
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()

    ps_err_msg = "a string series can only be multiplied to an int series or literal"
    for op in (
        lambda: psdf["a"] * psdf["a"],
        lambda: psdf["a"] * 0.1,
        lambda: 0.1 * psdf["a"],
        lambda: "literal" * psdf["a"],
    ):
        with self.assertRaisesRegex(TypeError, ps_err_msg):
            op()
|
|
|
|
def test_sample(self):
    # Smoke test for ``sample`` on a frame and on one of its columns, plus the two
    # calls that are expected to raise.
    pdf = pd.DataFrame({"A": [0, 2, 4]})
    psdf = ps.from_pandas(pdf)

    # Make sure the tests run, but we can't check the result because they are non-deterministic.
    for kwargs in (
        {"frac": 0.1},
        {"frac": 0.2, "replace": True},
        {"frac": 0.2, "random_state": 5},
    ):
        psdf.sample(**kwargs)
    for kwargs in (
        {"frac": 0.2},
        {"frac": 0.2, "replace": True},
        {"frac": 0.2, "random_state": 5},
    ):
        psdf["A"].sample(**kwargs)

    # No arguments at all.
    with self.assertRaises(ValueError):
        psdf.sample()
    # A row count instead of a fraction.
    with self.assertRaises(NotImplementedError):
        psdf.sample(n=1)
|
|
|
|
def test_add_prefix(self):
    # ``add_prefix`` is compared with pandas, first with flat column labels and
    # then with multi-index column labels.
    pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}, index=np.random.rand(4))
    psdf = ps.from_pandas(pdf)

    def compare():
        self.assert_eq(pdf.add_prefix("col_"), psdf.add_prefix("col_"))

    compare()

    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
    pdf.columns = columns
    psdf.columns = columns
    compare()
|
|
|
|
def test_add_suffix(self):
    # ``add_suffix`` is compared with pandas, first with flat column labels and
    # then with multi-index column labels.
    pdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}, index=np.random.rand(4))
    psdf = ps.from_pandas(pdf)

    def compare():
        self.assert_eq(pdf.add_suffix("first_series"), psdf.add_suffix("first_series"))

    compare()

    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
    pdf.columns = columns
    psdf.columns = columns
    compare()
|
|
|
|
def test_join(self):
    # ``join`` is compared with pandas for plain frames, the ``on`` parameter,
    # multi-index columns and a multi-index, plus the calls expected to raise.
    def sorted_join(left, right, **kwargs):
        # Join, then sort the result in place on all of its columns.
        joined = left.join(right, **kwargs)
        joined.sort_values(by=list(joined.columns), inplace=True)
        return joined

    suffixes = {"lsuffix": "_left", "rsuffix": "_right"}

    # check basic function
    pdf1 = pd.DataFrame(
        {"key": ["K0", "K1", "K2", "K3"], "A": ["A0", "A1", "A2", "A3"]}, columns=["key", "A"]
    )
    pdf2 = pd.DataFrame(
        {"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}, columns=["key", "B"]
    )
    psdf1 = ps.from_pandas(pdf1)
    psdf2 = ps.from_pandas(pdf2)

    join_pdf = sorted_join(pdf1, pdf2, **suffixes)
    join_psdf = sorted_join(psdf1, psdf2, **suffixes)
    self.assert_eq(join_pdf, join_psdf)

    overlap = "columns overlap but no suffix specified"
    # join with duplicated columns in Series
    with self.assertRaisesRegex(ValueError, overlap):
        psser1 = ps.Series(["A1", "A5"], index=[1, 2], name="A")
        psdf1.join(psser1, how="outer")
    # join with duplicated columns in DataFrame
    with self.assertRaisesRegex(ValueError, overlap):
        psdf1.join(psdf2, how="outer")

    # check `on` parameter
    join_pdf = sorted_join(pdf1, pdf2.set_index("key"), on="key", **suffixes)
    join_psdf = sorted_join(psdf1, psdf2.set_index("key"), on="key", **suffixes)
    self.assert_eq(join_pdf.reset_index(drop=True), join_psdf.reset_index(drop=True))

    join_pdf = sorted_join(
        pdf1.set_index("key"), pdf2.set_index("key"), on="key", **suffixes
    )
    join_psdf = sorted_join(
        psdf1.set_index("key"), psdf2.set_index("key"), on="key", **suffixes
    )
    self.assert_eq(join_pdf.reset_index(drop=True), join_psdf.reset_index(drop=True))

    # multi-index columns
    columns1 = pd.MultiIndex.from_tuples([("x", "key"), ("Y", "A")])
    columns2 = pd.MultiIndex.from_tuples([("x", "key"), ("Y", "B")])
    pdf1.columns = columns1
    pdf2.columns = columns2
    psdf1.columns = columns1
    psdf2.columns = columns2

    join_pdf = sorted_join(pdf1, pdf2, **suffixes)
    join_psdf = sorted_join(psdf1, psdf2, **suffixes)
    self.assert_eq(join_pdf, join_psdf)

    # check `on` parameter
    key = ("x", "key")
    join_pdf = sorted_join(pdf1, pdf2.set_index(key), on=[key], **suffixes)
    join_psdf = sorted_join(psdf1, psdf2.set_index(key), on=[key], **suffixes)
    self.assert_eq(join_pdf.reset_index(drop=True), join_psdf.reset_index(drop=True))

    join_pdf = sorted_join(pdf1.set_index(key), pdf2.set_index(key), on=[key], **suffixes)
    join_psdf = sorted_join(psdf1.set_index(key), psdf2.set_index(key), on=[key], **suffixes)
    self.assert_eq(join_pdf.reset_index(drop=True), join_psdf.reset_index(drop=True))

    # multi-index
    level_names = ["index1", "index2"]
    midx1 = pd.MultiIndex.from_tuples(
        [("w", "a"), ("x", "b"), ("y", "c"), ("z", "d")], names=level_names
    )
    midx2 = pd.MultiIndex.from_tuples(
        [("w", "a"), ("x", "b"), ("y", "c")], names=level_names
    )
    pdf1.index = midx1
    pdf2.index = midx2
    # Rebuild the pandas-on-Spark frames from the re-indexed pandas frames.
    psdf1 = ps.from_pandas(pdf1)
    psdf2 = ps.from_pandas(pdf2)

    join_pdf = sorted_join(pdf1, pdf2, on=["index1", "index2"], rsuffix="_right")
    join_psdf = sorted_join(psdf1, psdf2, on=["index1", "index2"], rsuffix="_right")
    self.assert_eq(join_pdf, join_psdf)

    # ``on`` names a single level while the right frame has two index levels.
    with self.assertRaisesRegex(
        ValueError, r'len\(left_on\) must equal the number of levels in the index of "right"'
    ):
        psdf1.join(psdf2, on=["index1"], rsuffix="_right")
|
|
|
|
def test_replace(self):
    # Compare DataFrame.replace with pandas for scalar, list, tuple and dict
    # arguments, and verify that unsupported arguments are rejected.
    pdf = pd.DataFrame(
        {
            "name": ["Ironman", "Captain America", "Thor", "Hulk"],
            "weapon": ["Mark-45", "Shield", "Mjolnir", "Smash"],
        },
        index=np.random.rand(4),
    )
    psdf = ps.from_pandas(pdf)

    # (expected exception, message pattern, keyword arguments passed to replace)
    rejected_calls = [
        (
            NotImplementedError,
            "replace currently works only for method='pad",
            {"method": "bfill"},
        ),
        (NotImplementedError, "replace currently works only when limit=None", {"limit": 10}),
        (NotImplementedError, "replace currently doesn't supports regex", {"regex": ""}),
        (
            ValueError,
            "Length of to_replace and value must be same",
            {"to_replace": ["Ironman"], "value": ["Spiderman", "Doctor Strange"]},
        ),
    ]
    for error_type, pattern, kwargs in rejected_calls:
        with self.assertRaisesRegex(error_type, pattern):
            psdf.replace(**kwargs)

    # Each case is a callable so both frames receive freshly built arguments.
    string_cases = [
        lambda df: df.replace("Ironman", "Spiderman"),
        lambda df: df.replace(["Ironman", "Captain America"], ["Rescue", "Hawkeye"]),
        lambda df: df.replace(("Ironman", "Captain America"), ("Rescue", "Hawkeye")),
    ]
    for do_replace in string_cases:
        self.assert_eq(do_replace(psdf), do_replace(pdf))

    # inplace: Series taken before the call are compared after it as well
    pser = pdf.name
    psser = psdf.name
    pdf.replace("Ironman", "Spiderman", inplace=True)
    psdf.replace("Ironman", "Spiderman", inplace=True)
    self.assert_eq(psdf, pdf)
    self.assert_eq(psser, pser)

    pdf = pd.DataFrame(
        {"A": [0, 1, 2, 3, np.nan], "B": [5, 6, 7, 8, np.nan], "C": ["a", "b", "c", "d", None]},
        index=np.random.rand(5),
    )
    psdf = ps.from_pandas(pdf)

    flat_column_cases = [
        lambda df: df.replace([0, 1, 2, 3, 5, 6], 4),
        lambda df: df.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]),
        lambda df: df.replace({0: 10, 1: 100, 7: 200}),
        lambda df: df.replace({"A": [0, np.nan], "B": [5, np.nan]}, 100),
        lambda df: df.replace({"A": {0: 100, 4: 400, np.nan: 700}}),
        lambda df: df.replace({"X": {0: 100, 4: 400, np.nan: 700}}),
        lambda df: df.replace({"C": ["a", None]}, "e"),
    ]
    for do_replace in flat_column_cases:
        self.assert_eq(do_replace(psdf), do_replace(pdf))

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])
    pdf.columns = columns
    psdf.columns = columns

    multi_column_cases = [
        lambda df: df.replace([0, 1, 2, 3, 5, 6], 4),
        lambda df: df.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]),
        lambda df: df.replace({0: 10, 1: 100, 7: 200}),
        lambda df: df.replace({("X", "A"): [0, np.nan], ("X", "B"): 5}, 100),
        lambda df: df.replace({("X", "A"): {0: 100, 4: 400, np.nan: 700}}),
        lambda df: df.replace({("X", "B"): {0: 100, 4: 400, np.nan: 700}}),
        lambda df: df.replace({("Y", "C"): ["a", None]}, "e"),
    ]
    for do_replace in multi_column_cases:
        self.assert_eq(do_replace(psdf), do_replace(pdf))
|
|
|
|
def test_update(self):
    # Compare DataFrame.update with pandas, with and without overwrite, for flat
    # and multi-index columns.

    def build_frames(left_columns=None, right_columns=None):
        # Build fresh pandas / pandas-on-Spark frame pairs; optionally relabel columns.
        pandas_left = pd.DataFrame(
            {"A": ["1", "2", "3", "4"], "B": ["100", "200", np.nan, np.nan]}, columns=["A", "B"]
        )
        pandas_right = pd.DataFrame(
            {"B": ["x", np.nan, "y", np.nan], "C": ["100", "200", "300", "400"]},
            columns=["B", "C"],
        )

        spark_left = ps.DataFrame(
            {"A": ["1", "2", "3", "4"], "B": ["100", "200", None, None]}, columns=["A", "B"]
        )
        spark_right = ps.DataFrame(
            {"B": ["x", None, "y", None], "C": ["100", "200", "300", "400"]}, columns=["B", "C"]
        )
        if left_columns is not None:
            pandas_left.columns = left_columns
            spark_left.columns = left_columns
        if right_columns is not None:
            pandas_right.columns = right_columns
            spark_right.columns = right_columns
        return spark_left, pandas_left, spark_right, pandas_right

    def update_and_compare(sort_keys, left_columns=None, right_columns=None, **update_kwargs):
        # Update both left frames from their right frames and compare the sorted results.
        spark_left, pandas_left, spark_right, pandas_right = build_frames(
            left_columns=left_columns, right_columns=right_columns
        )
        pandas_left.update(pandas_right, **update_kwargs)
        spark_left.update(spark_right, **update_kwargs)
        self.assert_eq(
            pandas_left.sort_values(by=list(sort_keys)),
            spark_left.sort_values(by=list(sort_keys)),
        )
        return spark_left, spark_right

    # check base function; the Series taken before the update is compared afterwards too
    left_psdf, left_pdf, right_psdf, right_pdf = build_frames()
    pser = left_pdf.B
    psser = left_psdf.B
    left_pdf.update(right_pdf)
    left_psdf.update(right_psdf)
    self.assert_eq(left_pdf.sort_values(by=["A", "B"]), left_psdf.sort_values(by=["A", "B"]))
    self.assert_eq(psser.sort_index(), pser.sort_index())

    left_psdf, right_psdf = update_and_compare(["A", "B"], overwrite=False)

    with self.assertRaises(NotImplementedError):
        left_psdf.update(right_psdf, join="right")

    # multi-index columns
    multi_keys = [("X", "A"), ("X", "B")]
    left_columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")])
    right_columns = pd.MultiIndex.from_tuples([("X", "B"), ("Y", "C")])

    update_and_compare(multi_keys, left_columns, right_columns)
    update_and_compare(multi_keys, left_columns, right_columns, overwrite=False)

    right_columns = pd.MultiIndex.from_tuples([("Y", "B"), ("Y", "C")])
    update_and_compare(multi_keys, left_columns, right_columns)
|
|
|
|
def test_pivot_table_dtypes(self):
    # Compare the dtypes of a pivot_table result with those produced by pandas.
    source = {
        "a": [4, 2, 3, 4, 8, 6],
        "b": [1, 2, 2, 4, 2, 4],
        "e": [1, 2, 2, 4, 2, 4],
        "c": [1, 2, 9, 4, 7, 4],
    }
    pdf = pd.DataFrame(source, index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    def pivoted_dtypes(frame):
        # Skip columns comparison by reset_index
        pivoted = frame.pivot_table(
            index=["c"], columns="a", values=["b"], aggfunc={"b": "mean"}
        )
        return pivoted.dtypes.reset_index(drop=True)

    res_df = pivoted_dtypes(psdf)
    exp_df = pivoted_dtypes(pdf)
    self.assert_eq(res_df, exp_df)

    # Results don't have the same column's name

    # Todo: self.assert_eq(psdf.pivot_table(columns="a", values="b").dtypes,
    #  pdf.pivot_table(columns="a", values="b").dtypes)

    # Todo: self.assert_eq(psdf.pivot_table(index=['c'], columns="a", values="b").dtypes,
    #  pdf.pivot_table(index=['c'], columns="a", values="b").dtypes)

    # Todo: self.assert_eq(psdf.pivot_table(index=['e', 'c'], columns="a", values="b").dtypes,
    #  pdf.pivot_table(index=['e', 'c'], columns="a", values="b").dtypes)

    # Todo: self.assert_eq(psdf.pivot_table(index=['e', 'c'],
    #  columns="a", values="b", fill_value=999).dtypes, pdf.pivot_table(index=['e', 'c'],
    #  columns="a", values="b", fill_value=999).dtypes)
|
|
|
|
def test_pivot_table(self):
    # Compare pivot_table results with pandas for flat and multi-index columns.
    pdf = pd.DataFrame(
        {
            "a": [4, 2, 3, 4, 8, 6],
            "b": [1, 2, 2, 4, 2, 4],
            "e": [10, 20, 20, 40, 20, 40],
            "c": [1, 2, 9, 4, 7, 4],
            "d": [-1, -2, -3, -4, -5, -6],
        },
        index=np.random.rand(6),
    )
    psdf = ps.from_pandas(pdf)

    # Checking if both DataFrames have the same results.
    # Each case is a callable so both frames receive freshly built arguments.
    flat_cases = [
        lambda df: df.pivot_table(columns="a", values="b"),
        lambda df: df.pivot_table(index=["c"], columns="a", values="b"),
        lambda df: df.pivot_table(index=["c"], columns="a", values="b", aggfunc="sum"),
        lambda df: df.pivot_table(index=["c"], columns="a", values=["b"], aggfunc="sum"),
        lambda df: df.pivot_table(index=["c"], columns="a", values=["b", "e"], aggfunc="sum"),
        lambda df: df.pivot_table(
            index=["c"], columns="a", values=["b", "e", "d"], aggfunc="sum"
        ),
        lambda df: df.pivot_table(
            index=["c"], columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"}
        ),
        lambda df: df.pivot_table(index=["e", "c"], columns="a", values="b"),
        lambda df: df.pivot_table(index=["e", "c"], columns="a", values="b", fill_value=999),
    ]
    for pivot in flat_cases:
        self.assert_eq(pivot(psdf).sort_index(), pivot(pdf).sort_index(), almost=True)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("y", "e"), ("z", "c"), ("w", "d")]
    )
    pdf.columns = columns
    psdf.columns = columns

    # (pandas-on-Spark kwargs, pandas kwargs): the two sides pass `columns`
    # (and, in the first case, `values`) in different forms.
    multi_cases = [
        (
            {"columns": ("x", "a"), "values": ("x", "b")},
            {"columns": [("x", "a")], "values": [("x", "b")]},
        ),
        (
            {"index": [("z", "c")], "columns": ("x", "a"), "values": [("x", "b")]},
            {"index": [("z", "c")], "columns": [("x", "a")], "values": [("x", "b")]},
        ),
        (
            {"index": [("z", "c")], "columns": ("x", "a"), "values": [("x", "b"), ("y", "e")]},
            {
                "index": [("z", "c")],
                "columns": [("x", "a")],
                "values": [("x", "b"), ("y", "e")],
            },
        ),
        (
            {
                "index": [("z", "c")],
                "columns": ("x", "a"),
                "values": [("x", "b"), ("y", "e"), ("w", "d")],
            },
            {
                "index": [("z", "c")],
                "columns": [("x", "a")],
                "values": [("x", "b"), ("y", "e"), ("w", "d")],
            },
        ),
        (
            {
                "index": [("z", "c")],
                "columns": ("x", "a"),
                "values": [("x", "b"), ("y", "e")],
                "aggfunc": {("x", "b"): "mean", ("y", "e"): "sum"},
            },
            {
                "index": [("z", "c")],
                "columns": [("x", "a")],
                "values": [("x", "b"), ("y", "e")],
                "aggfunc": {("x", "b"): "mean", ("y", "e"): "sum"},
            },
        ),
    ]
    for spark_kwargs, pandas_kwargs in multi_cases:
        self.assert_eq(
            psdf.pivot_table(**spark_kwargs).sort_index(),
            pdf.pivot_table(**pandas_kwargs).sort_index(),
            almost=True,
        )
|
|
|
|
def test_pivot_table_and_index(self):
    # https://github.com/databricks/koalas/issues/805
    # Compare the pivoted frame, its index, and the index repr with pandas.
    records = {
        "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": [
            "small",
            "large",
            "large",
            "small",
            "small",
            "large",
            "small",
            "small",
            "large",
        ],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
    }
    pdf = pd.DataFrame(records, columns=["A", "B", "C", "D", "E"], index=np.random.rand(9))
    psdf = ps.from_pandas(pdf)

    def pivot_sum_of_d(frame):
        # Sum of "D" per ("A", "B") row and "C" column, zero-filled, sorted by index.
        pivoted = frame.pivot_table(
            values="D", index=["A", "B"], columns="C", aggfunc="sum", fill_value=0
        )
        return pivoted.sort_index()

    ptable = pivot_sum_of_d(pdf)
    pstable = pivot_sum_of_d(psdf)

    self.assert_eq(pstable, ptable)
    self.assert_eq(pstable.index, ptable.index)
    self.assert_eq(repr(pstable.index), repr(ptable.index))
|
|
|
|
def test_stack(self):
    # Compare DataFrame.stack with pandas for single-level and multi-level columns.

    def assert_stack_matches(pandas_frame):
        # Convert, stack both sides, compare sorted results; return the converted frame.
        spark_frame = ps.from_pandas(pandas_frame)
        self.assert_eq(spark_frame.stack().sort_index(), pandas_frame.stack().sort_index())
        return spark_frame

    pdf_single_level_cols = pd.DataFrame(
        [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"]
    )
    assert_stack_matches(pdf_single_level_cols)

    multicol1 = pd.MultiIndex.from_tuples(
        [("weight", "kg"), ("weight", "pounds")], names=["x", "y"]
    )
    pdf_multi_level_cols1 = pd.DataFrame(
        [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1
    )
    assert_stack_matches(pdf_multi_level_cols1)

    multicol2 = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
    pdf_multi_level_cols2 = pd.DataFrame(
        [[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=multicol2
    )
    assert_stack_matches(pdf_multi_level_cols2)

    pdf = pd.DataFrame(
        {
            ("y", "c"): [True, True],
            ("x", "b"): [False, False],
            ("x", "c"): [True, False],
            ("y", "a"): [False, True],
        }
    )
    psdf = assert_stack_matches(pdf)

    # Stacking a frame with no columns selected.
    self.assert_eq(psdf[[]].stack().sort_index(), pdf[[]].stack().sort_index(), almost=True)
|
|
|
|
def test_unstack(self):
    # Compare DataFrame.unstack with pandas on a two-level row index.
    row_index = pd.MultiIndex.from_tuples([("rg1", "x"), ("rg1", "y"), ("rg2", "z")])
    pdf = pd.DataFrame(np.random.randn(3, 3), index=row_index)
    psdf = ps.from_pandas(pdf)

    actual = psdf.unstack().sort_index()
    expected = pdf.unstack().sort_index()
    self.assert_eq(actual, expected, almost=True)
|
|
|
|
def test_pivot_errors(self):
    # DataFrame.pivot must raise ValueError when `columns` or `values` is missing.
    psdf = ps.range(10)

    incomplete_calls = [
        ("columns should be set", {"index": "id"}),
        ("values should be set", {"index": "id", "columns": "id"}),
    ]
    for pattern, kwargs in incomplete_calls:
        with self.assertRaisesRegex(ValueError, pattern):
            psdf.pivot(**kwargs)
|
|
|
|
def test_pivot_table_errors(self):
    # Verify the exception type and message raised by pivot_table for invalid arguments.
    pdf = pd.DataFrame(
        {
            "a": [4, 2, 3, 4, 8, 6],
            "b": [1, 2, 2, 4, 2, 4],
            "e": [1, 2, 2, 4, 2, 4],
            "c": [1, 2, 9, 4, 7, 4],
        },
        index=np.random.rand(6),
    )
    psdf = ps.from_pandas(pdf)

    with self.assertRaises(KeyError):
        psdf.pivot_table(index=["c"], columns="a", values=5)

    # (expected exception, message pattern, keyword arguments passed to pivot_table)
    numeric_frame_cases = [
        (
            TypeError,
            "index should be a None or a list of columns.",
            {"index": "c", "columns": "a", "values": "b"},
        ),
        (
            NotImplementedError,
            "pivot_table doesn't support aggfunc as dict and without index.",
            {"columns": "a", "values": ["b", "e"], "aggfunc": {"b": "mean", "e": "sum"}},
        ),
        (
            TypeError,
            "columns should be one column name.",
            {"columns": ["a"], "values": ["b"], "aggfunc": {"b": "mean", "e": "sum"}},
        ),
        (
            ValueError,
            "Columns in aggfunc must be the same as values.",
            {
                "index": ["e", "c"],
                "columns": "a",
                "values": "b",
                "aggfunc": {"b": "mean", "e": "sum"},
            },
        ),
        (
            NotImplementedError,
            "values can't be a list without index.",
            {"columns": "a", "values": ["b", "e"]},
        ),
        (
            ValueError,
            "Wrong columns A.",
            {
                "index": ["c"],
                "columns": "A",
                "values": ["b", "e"],
                "aggfunc": {"b": "mean", "e": "sum"},
            },
        ),
    ]
    for error_type, msg, kwargs in numeric_frame_cases:
        with self.assertRaisesRegex(error_type, msg):
            psdf.pivot_table(**kwargs)

    psdf = ps.DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
            "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
            "C": [
                "small",
                "large",
                "large",
                "small",
                "small",
                "large",
                "small",
                "small",
                "large",
            ],
            "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
            "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
        },
        columns=["A", "B", "C", "D", "E"],
        index=np.random.rand(9),
    )

    # Column "B" holds strings, so using it as a value column must be rejected.
    msg = "values should be a numeric type."
    string_value_cases = [
        {
            "index": ["C"],
            "columns": "A",
            "values": ["B", "E"],
            "aggfunc": {"B": "mean", "E": "sum"},
        },
        {"index": ["C"], "columns": "A", "values": "B", "aggfunc": {"B": "mean"}},
    ]
    for kwargs in string_value_cases:
        with self.assertRaisesRegex(TypeError, msg):
            psdf.pivot_table(**kwargs)
|
|
|
|
def test_transpose(self):
    # Compare DataFrame.transpose with pandas, both with the default
    # "compute.max_rows" option and with it set to None.
    # TODO: what if with random index?

    def sorted_transpose(frame):
        return frame.transpose().sort_index()

    pdf1 = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}, columns=["col1", "col2"])
    psdf1 = ps.from_pandas(pdf1)

    pdf2 = pd.DataFrame(
        data={"score": [9, 8], "kids": [0, 0], "age": [12, 22]},
        columns=["score", "kids", "age"],
    )
    psdf2 = ps.from_pandas(pdf2)

    flat_pairs = [(pdf1, psdf1), (pdf2, psdf2)]

    for pandas_frame, spark_frame in flat_pairs:
        self.assert_eq(sorted_transpose(pandas_frame), sorted_transpose(spark_frame))

    with option_context("compute.max_rows", None):
        for pandas_frame, spark_frame in flat_pairs:
            self.assert_eq(sorted_transpose(pandas_frame), sorted_transpose(spark_frame))

    # Multi-level columns and multi-level row index.
    pdf3 = pd.DataFrame(
        {
            ("cg1", "a"): [1, 2, 3],
            ("cg1", "b"): [4, 5, 6],
            ("cg2", "c"): [7, 8, 9],
            ("cg3", "d"): [9, 9, 9],
        },
        index=pd.MultiIndex.from_tuples([("rg1", "x"), ("rg1", "y"), ("rg2", "z")]),
    )
    psdf3 = ps.from_pandas(pdf3)

    self.assert_eq(sorted_transpose(pdf3), sorted_transpose(psdf3))

    with option_context("compute.max_rows", None):
        self.assert_eq(sorted_transpose(pdf3), sorted_transpose(psdf3))
|
|
|
|
def _test_cummin(self, pdf, psdf):
    # Compare cummin (default and skipna=False) and the sum of cummin on both frames.
    for kwargs in ({}, {"skipna": False}):
        self.assert_eq(pdf.cummin(**kwargs), psdf.cummin(**kwargs))
    expected_total = pdf.cummin().sum()
    actual_total = psdf.cummin().sum()
    self.assert_eq(expected_total, actual_total)
|
|
|
|
def test_cummin(self):
    # Run the cummin checks on a two-column float frame containing one missing value.
    rows = [[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]]
    pdf = pd.DataFrame(rows, columns=list("AB"), index=np.random.rand(5))
    self._test_cummin(pdf, ps.from_pandas(pdf))
|
|
|
|
def test_cummin_multiindex_columns(self):
    # Run the cummin checks on a frame with two-level columns and one missing value.
    outer = np.array(["A", "A", "B", "B"])
    inner = np.array(["one", "two", "one", "two"])
    pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "C", "B"], columns=[outer, inner])
    pdf.at["C", ("A", "two")] = None
    self._test_cummin(pdf, ps.from_pandas(pdf))
|
|
|
|
def _test_cummax(self, pdf, psdf):
    # Compare cummax (default and skipna=False) and the sum of cummax on both frames.
    for kwargs in ({}, {"skipna": False}):
        self.assert_eq(pdf.cummax(**kwargs), psdf.cummax(**kwargs))
    expected_total = pdf.cummax().sum()
    actual_total = psdf.cummax().sum()
    self.assert_eq(expected_total, actual_total)
|
|
|
|
def test_cummax(self):
    # Run the cummax checks on a two-column float frame containing one missing value.
    rows = [[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]]
    pdf = pd.DataFrame(rows, columns=list("AB"), index=np.random.rand(5))
    self._test_cummax(pdf, ps.from_pandas(pdf))
|
|
|
|
def test_cummax_multiindex_columns(self):
    # Run the cummax checks on a frame with two-level columns and one missing value.
    outer = np.array(["A", "A", "B", "B"])
    inner = np.array(["one", "two", "one", "two"])
    pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "C", "B"], columns=[outer, inner])
    pdf.at["C", ("A", "two")] = None
    self._test_cummax(pdf, ps.from_pandas(pdf))
|
|
|
|
def _test_cumsum(self, pdf, psdf):
    # Compare cumsum (default and skipna=False) and the sum of cumsum on both frames.
    for kwargs in ({}, {"skipna": False}):
        self.assert_eq(pdf.cumsum(**kwargs), psdf.cumsum(**kwargs))
    expected_total = pdf.cumsum().sum()
    actual_total = psdf.cumsum().sum()
    self.assert_eq(expected_total, actual_total)
|
|
|
|
def test_cumsum(self):
    # Run the cumsum checks on a two-column float frame containing one missing value.
    rows = [[2.0, 1.0], [5, None], [1.0, 0.0], [2.0, 4.0], [4.0, 9.0]]
    pdf = pd.DataFrame(rows, columns=list("AB"), index=np.random.rand(5))
    self._test_cumsum(pdf, ps.from_pandas(pdf))
|
|
|
|
def test_cumsum_multiindex_columns(self):
    # Run the cumsum checks on a frame with two-level columns and one missing value.
    outer = np.array(["A", "A", "B", "B"])
    inner = np.array(["one", "two", "one", "two"])
    pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "C", "B"], columns=[outer, inner])
    pdf.at["C", ("A", "two")] = None
    self._test_cumsum(pdf, ps.from_pandas(pdf))
|
|
|
|
def _test_cumprod(self, pdf, psdf):
    # Compare cumprod (default and skipna=False) and the sum of cumprod on both
    # frames; every comparison passes almost=True to assert_eq.
    for kwargs in ({}, {"skipna": False}):
        self.assert_eq(pdf.cumprod(**kwargs), psdf.cumprod(**kwargs), almost=True)
    expected_total = pdf.cumprod().sum()
    actual_total = psdf.cumprod().sum()
    self.assert_eq(expected_total, actual_total, almost=True)
|
|
|
|
def test_cumprod(self):
    # Run the cumprod checks on a frame mixing floats, ints, negatives, zero and a
    # missing value.
    rows = [[2.0, 1.0, 1], [5, None, 2], [1.0, -1.0, -3], [2.0, 0, 4], [4.0, 9.0, 5]]
    pdf = pd.DataFrame(rows, columns=list("ABC"), index=np.random.rand(5))
    self._test_cumprod(pdf, ps.from_pandas(pdf))
|
|
|
|
def test_cumprod_multiindex_columns(self):
    # Run the cumprod checks on a frame with two-level columns and one missing value.
    outer = np.array(["A", "A", "B", "B"])
    inner = np.array(["one", "two", "one", "two"])
    pdf = pd.DataFrame(np.random.rand(3, 4), index=["A", "C", "B"], columns=[outer, inner])
    pdf.at["C", ("A", "two")] = None
    self._test_cumprod(pdf, ps.from_pandas(pdf))
|
|
|
|
def test_drop_duplicates(self):
    # Compare DataFrame.drop_duplicates with pandas for every `keep` mode, with
    # and without `inplace`, on flat, multi-index and non-string column labels.
    keep_options = ["first", "last", False]

    pdf = pd.DataFrame(
        {"a": [1, 2, 2, 2, 3], "b": ["a", "a", "a", "c", "d"]}, index=np.random.rand(5)
    )
    psdf = ps.from_pandas(pdf)

    # inplace is False
    for keep in keep_options:
        with self.subTest(keep=keep):
            self.assert_eq(
                pdf.drop_duplicates(keep=keep).sort_index(),
                psdf.drop_duplicates(keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.drop_duplicates("a", keep=keep).sort_index(),
                psdf.drop_duplicates("a", keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.drop_duplicates(["a", "b"], keep=keep).sort_index(),
                psdf.drop_duplicates(["a", "b"], keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.set_index("a", append=True).drop_duplicates(keep=keep).sort_index(),
                psdf.set_index("a", append=True).drop_duplicates(keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.set_index("a", append=True).drop_duplicates("b", keep=keep).sort_index(),
                psdf.set_index("a", append=True).drop_duplicates("b", keep=keep).sort_index(),
            )

    columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
    pdf.columns = columns
    psdf.columns = columns

    # inplace is False
    for keep in keep_options:
        with self.subTest("multi-index columns", keep=keep):
            self.assert_eq(
                pdf.drop_duplicates(keep=keep).sort_index(),
                psdf.drop_duplicates(keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.drop_duplicates(("x", "a"), keep=keep).sort_index(),
                psdf.drop_duplicates(("x", "a"), keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.drop_duplicates([("x", "a"), ("y", "b")], keep=keep).sort_index(),
                psdf.drop_duplicates([("x", "a"), ("y", "b")], keep=keep).sort_index(),
            )

    # inplace is True
    subset_list = [None, "a", ["a", "b"]]
    for subset in subset_list:
        # Fresh frames per subset because the in-place call mutates them.
        pdf = pd.DataFrame(
            {"a": [1, 2, 2, 2, 3], "b": ["a", "a", "a", "c", "d"]}, index=np.random.rand(5)
        )
        psdf = ps.from_pandas(pdf)
        # Series taken before the in-place call are compared after it as well.
        pser = pdf.a
        psser = psdf.a
        pdf.drop_duplicates(subset=subset, inplace=True)
        psdf.drop_duplicates(subset=subset, inplace=True)
        self.assert_eq(psdf.sort_index(), pdf.sort_index())
        self.assert_eq(psser.sort_index(), pser.sort_index())

    # multi-index columns, inplace is True
    subset_list = [None, ("x", "a"), [("x", "a"), ("y", "b")]]
    for subset in subset_list:
        pdf = pd.DataFrame(
            {"a": [1, 2, 2, 2, 3], "b": ["a", "a", "a", "c", "d"]}, index=np.random.rand(5)
        )
        psdf = ps.from_pandas(pdf)
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
        pdf.columns = columns
        psdf.columns = columns
        pser = pdf[("x", "a")]
        psser = psdf[("x", "a")]
        pdf.drop_duplicates(subset=subset, inplace=True)
        psdf.drop_duplicates(subset=subset, inplace=True)
        self.assert_eq(psdf.sort_index(), pdf.sort_index())
        self.assert_eq(psser.sort_index(), pser.sort_index())

    # non-string names
    pdf = pd.DataFrame(
        {10: [1, 2, 2, 2, 3], 20: ["a", "a", "a", "c", "d"]}, index=np.random.rand(5)
    )
    psdf = ps.from_pandas(pdf)

    # Iterate explicitly over every keep mode. These assertions previously reused
    # the `keep` variable leaked from the loop above, so only keep=False was tested.
    for keep in keep_options:
        with self.subTest("non-string names", keep=keep):
            self.assert_eq(
                pdf.drop_duplicates(10, keep=keep).sort_index(),
                psdf.drop_duplicates(10, keep=keep).sort_index(),
            )
            self.assert_eq(
                pdf.drop_duplicates([10, 20], keep=keep).sort_index(),
                psdf.drop_duplicates([10, 20], keep=keep).sort_index(),
            )
|
|
|
|
def test_reindex(self):
    # Compare DataFrame.reindex with pandas for single and multi-level row
    # indexes and columns, and verify that invalid arguments are rejected.

    def reindexed(frame, **kwargs):
        # Reindex and sort by index, as every comparison below does.
        return frame.reindex(**kwargs).sort_index()

    def compare_on_index(frames, indexes, fill_values):
        # Reindex the pandas frame on the pandas index and the pandas-on-Spark
        # frame on its converted index, once per fill value.
        pandas_frame, spark_frame = frames
        pandas_index, spark_index = indexes
        for fill_value in fill_values:
            self.assert_eq(
                reindexed(pandas_frame, index=pandas_index, fill_value=fill_value),
                reindexed(spark_frame, index=spark_index, fill_value=fill_value),
            )

    index = pd.Index(["A", "B", "C", "D", "E"])
    columns = pd.Index(["numbers"])
    pdf = pd.DataFrame([1.0, 2.0, 3.0, 4.0, None], index=index, columns=columns)
    psdf = ps.from_pandas(pdf)

    columns2 = pd.Index(["numbers", "2", "3"], name="cols2")
    self.assert_eq(reindexed(pdf, columns=columns2), reindexed(psdf, columns=columns2))

    columns = pd.Index(["numbers"], name="cols")
    pdf.columns = columns
    psdf.columns = columns

    # Each case is a callable so both frames receive freshly built arguments.
    literal_cases = [
        lambda df: df.reindex(["A", "B", "C"], columns=["numbers", "2", "3"]),
        lambda df: df.reindex(["A", "B", "C"], index=["numbers", "2", "3"]),
        lambda df: df.reindex(index=["A", "B"]),
        lambda df: df.reindex(index=["A", "B", "2", "3"]),
        lambda df: df.reindex(index=["A", "E", "2", "3"], fill_value=0),
        lambda df: df.reindex(columns=["numbers"]),
        # Using float as fill_value to avoid int64/32 clash
        lambda df: df.reindex(columns=["numbers", "2", "3"], fill_value=0.0),
    ]
    for do_reindex in literal_cases:
        self.assert_eq(do_reindex(pdf).sort_index(), do_reindex(psdf).sort_index())

    columns2 = pd.Index(["numbers", "2", "3"])
    self.assert_eq(reindexed(pdf, columns=columns2), reindexed(psdf, columns=columns2))

    columns2 = pd.Index(["numbers", "2", "3"], name="cols2")
    self.assert_eq(reindexed(pdf, columns=columns2), reindexed(psdf, columns=columns2))

    # Reindexing single Index on single Index
    pindex2 = pd.Index(["A", "C", "D", "E", "0"], name="index2")
    psidx2 = ps.from_pandas(pindex2)
    compare_on_index((pdf, psdf), (pindex2, psidx2), [None, 0])

    pindex2 = pd.DataFrame({"index2": ["A", "C", "D", "E", "0"]}).set_index("index2").index
    psidx2 = ps.from_pandas(pindex2)
    compare_on_index((pdf, psdf), (pindex2, psidx2), [None, 0])

    # Reindexing MultiIndex on single Index
    pindex = pd.MultiIndex.from_tuples(
        [("A", "B"), ("C", "D"), ("F", "G")], names=["name1", "name2"]
    )
    psidx = ps.from_pandas(pindex)
    compare_on_index((pdf, psdf), (pindex, psidx), [0.0])

    with self.assertRaises(TypeError):
        psdf.reindex(columns=["numbers", "2", "3"], axis=1)
    with self.assertRaises(TypeError):
        psdf.reindex(columns=["numbers", "2", "3"], axis=2)
    with self.assertRaises(TypeError):
        psdf.reindex(index=["A", "B", "C"], axis=1)
    with self.assertRaises(TypeError):
        psdf.reindex(index=123)

    # Reindexing MultiIndex on MultiIndex
    pdf = pd.DataFrame({"numbers": [1.0, 2.0, None]}, index=pindex)
    psdf = ps.from_pandas(pdf)
    pindex2 = pd.MultiIndex.from_tuples(
        [("A", "G"), ("C", "D"), ("I", "J")], names=["name1", "name2"]
    )
    psidx2 = ps.from_pandas(pindex2)
    compare_on_index((pdf, psdf), (pindex2, psidx2), [None, 0.0])

    level_frame = pd.DataFrame(
        {"index_level_1": ["A", "C", "I"], "index_level_2": ["G", "D", "J"]}
    )
    pindex2 = level_frame.set_index(["index_level_1", "index_level_2"]).index
    psidx2 = ps.from_pandas(pindex2)
    compare_on_index((pdf, psdf), (pindex2, psidx2), [None, 0.0])

    columns = pd.MultiIndex.from_tuples([("X", "numbers")], names=["cols1", "cols2"])
    pdf.columns = columns
    psdf.columns = columns

    # Reindexing MultiIndex index on MultiIndex columns and MultiIndex index
    compare_on_index((pdf, psdf), (pindex2, psidx2), [None, 0.0])

    index = pd.Index(["A", "B", "C", "D", "E"])
    pdf = pd.DataFrame(data=[1.0, 2.0, 3.0, 4.0, None], index=index, columns=columns)
    psdf = ps.from_pandas(pdf)
    pindex2 = pd.Index(["A", "C", "D", "E", "0"], name="index2")
    psidx2 = ps.from_pandas(pindex2)

    # Reindexing single Index on MultiIndex columns and single Index
    compare_on_index((pdf, psdf), (pindex2, psidx2), [None, 0.0])

    for fill_value in [None, 0.0]:
        self.assert_eq(
            reindexed(
                pdf, columns=[("X", "numbers"), ("Y", "2"), ("Y", "3")], fill_value=fill_value
            ),
            reindexed(
                psdf, columns=[("X", "numbers"), ("Y", "2"), ("Y", "3")], fill_value=fill_value
            ),
        )

    columns2 = pd.MultiIndex.from_tuples(
        [("X", "numbers"), ("Y", "2"), ("Y", "3")], names=["cols3", "cols4"]
    )
    self.assert_eq(reindexed(pdf, columns=columns2), reindexed(psdf, columns=columns2))

    with self.assertRaises(TypeError):
        psdf.reindex(columns=["X"])
    with self.assertRaises(ValueError):
        psdf.reindex(columns=[("X",)])
|
|
|
|
def test_reindex_like(self):
    # Compare DataFrame.reindex_like between pandas and pandas-on-Spark for
    # single-Index and MultiIndex targets.
    values = [[1.0, 2.0], [3.0, None], [None, 4.0]]
    pdf = pd.DataFrame(
        data=values,
        index=pd.Index(["A", "B", "C"], name="index"),
        columns=pd.Index(["numbers", "values"], name="cols"),
    )
    psdf = ps.from_pandas(pdf)

    # Reindexing single Index on single Index
    other_values = [[5.0, None], [6.0, 7.0], [8.0, None]]
    pdf2 = pd.DataFrame(
        data=other_values,
        index=pd.Index(["A", "C", "D"], name="index2"),
        columns=pd.Index(["numbers", "F"], name="cols2"),
    )
    psdf2 = ps.from_pandas(pdf2)

    from_pandas = pdf.reindex_like(pdf2).sort_index()
    from_spark = psdf.reindex_like(psdf2).sort_index()
    self.assert_eq(from_pandas, from_spark)

    pdf2 = pd.DataFrame({"index_level_1": ["A", "C", "I"]})
    psdf2 = ps.from_pandas(pdf2)

    from_pandas = pdf.reindex_like(pdf2.set_index(["index_level_1"])).sort_index()
    from_spark = psdf.reindex_like(psdf2.set_index(["index_level_1"])).sort_index()
    self.assert_eq(from_pandas, from_spark)

    # Reindexing MultiIndex on single Index
    multi_index2 = pd.MultiIndex.from_tuples(
        [("A", "G"), ("C", "D"), ("I", "J")], names=["name3", "name4"]
    )
    pdf2 = pd.DataFrame(data=other_values, index=multi_index2)
    psdf2 = ps.from_pandas(pdf2)

    from_pandas = pdf.reindex_like(pdf2).sort_index()
    from_spark = psdf.reindex_like(psdf2).sort_index()
    self.assert_eq(from_pandas, from_spark)

    # Passing an index instead of a frame, or a frame whose index has a
    # different number of levels, must be rejected.
    with self.assertRaises(TypeError):
        psdf.reindex_like(multi_index2)
    with self.assertRaises(AssertionError):
        psdf2.reindex_like(psdf)

    # Reindexing MultiIndex on MultiIndex
    multi_columns2 = pd.MultiIndex.from_tuples(
        [("numbers", "third"), ("values", "second")], names=["cols3", "cols4"]
    )
    pdf2.columns = multi_columns2
    psdf2.columns = multi_columns2

    pdf = pd.DataFrame(
        data=values,
        index=pd.MultiIndex.from_tuples(
            [("A", "B"), ("C", "D"), ("E", "F")], names=["name1", "name2"]
        ),
        columns=pd.MultiIndex.from_tuples(
            [("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
        ),
    )
    psdf = ps.from_pandas(pdf)

    from_pandas = pdf.reindex_like(pdf2).sort_index()
    from_spark = psdf.reindex_like(psdf2).sort_index()
    self.assert_eq(from_pandas, from_spark)
|
|
|
|
def test_melt(self):
    # Compare DataFrame.melt between pandas and pandas-on-Spark for plain,
    # multi-level, and non-string column labels.
    pdf = pd.DataFrame(
        {"A": [1, 3, 5], "B": [2, 4, 6], "C": [7, 8, 9]}, index=np.random.rand(3)
    )
    psdf = ps.from_pandas(pdf)

    def check_melt(sort_by, normalize_names=False, **melt_kwargs):
        # Melt both frames with identical arguments, sort them the same way and
        # compare. Only the pandas-on-Spark side gets its index reset.
        actual = psdf.melt(**melt_kwargs).sort_values(sort_by).reset_index(drop=True)
        expected = pdf.melt(**melt_kwargs).sort_values(sort_by)
        if normalize_names:
            # The pandas column labels are mapped through name_like_string
            # before the comparison.
            expected = expected.rename(columns=name_like_string)
        self.assert_eq(actual, expected)

    default_order = ["variable", "value"]

    check_melt(default_order)
    check_melt(default_order, id_vars="A")
    check_melt(default_order, id_vars=["A", "B"])
    check_melt(default_order, id_vars=("A", "B"))
    check_melt(default_order, id_vars=["A"], value_vars=["C"])
    check_melt(
        ["myVarname", "myValname"],
        id_vars=["A"],
        value_vars=["B"],
        var_name="myVarname",
        value_name="myValname",
    )
    check_melt(default_order, value_vars=("A", "B"))

    for missing in ({"id_vars": "Z"}, {"value_vars": "Z"}):
        with self.assertRaises(KeyError):
            psdf.melt(**missing)

    # multi-index columns
    pandas_version = LooseVersion(pd.__version__)
    if LooseVersion("0.24") <= pandas_version < LooseVersion("1.0.0"):
        # pandas >=0.24,<1.0 doesn't support mixed int/str columns in melt.
        # see: https://github.com/pandas-dev/pandas/pull/29792
        TEN, TWENTY = "10", "20"
    else:
        TEN, TWENTY = 10.0, 20.0

    columns = pd.MultiIndex.from_tuples([(TEN, "A"), (TEN, "B"), (TWENTY, "C")])
    pdf.columns = columns
    psdf.columns = columns

    level_order = ["variable_0", "variable_1", "value"]

    check_melt(level_order)
    check_melt(level_order, normalize_names=True, id_vars=[(TEN, "A")])
    check_melt(
        level_order, normalize_names=True, id_vars=[(TEN, "A")], value_vars=[(TWENTY, "C")]
    )
    check_melt(
        ["myV1", "myV2", "myValname"],
        normalize_names=True,
        id_vars=[(TEN, "A")],
        value_vars=[(TEN, "B")],
        var_name=["myV1", "myV2"],
        value_name="myValname",
    )

    columns.names = ["v0", "v1"]
    pdf.columns = columns
    psdf.columns = columns

    check_melt(["v0", "v1", "value"])

    invalid_calls = [
        (ValueError, {"id_vars": (TEN, "A")}),
        (ValueError, {"value_vars": (TEN, "A")}),
        (KeyError, {"id_vars": [TEN]}),
        (KeyError, {"id_vars": [(TWENTY, "A")]}),
        (KeyError, {"value_vars": [TWENTY]}),
        (KeyError, {"value_vars": [(TWENTY, "A")]}),
    ]
    for expected_error, bad_kwargs in invalid_calls:
        with self.assertRaises(expected_error):
            psdf.melt(**bad_kwargs)

    # non-string names
    pdf.columns = [10.0, 20.0, 30.0]
    psdf.columns = [10.0, 20.0, 30.0]

    check_melt(default_order)
    check_melt(default_order, id_vars=10.0)
    check_melt(default_order, id_vars=[10.0, 20.0])
    check_melt(default_order, id_vars=(10.0, 20.0))
    check_melt(default_order, id_vars=[10.0], value_vars=[30.0])
    check_melt(default_order, value_vars=(10.0, 20.0))
|
|
|
|
def test_all(self):
    # DataFrame.all must match pandas for flat, multi-level and named
    # multi-level columns; axis=1 is expected to be rejected.
    data = {
        "col1": [False, False, False],
        "col2": [True, False, False],
        "col3": [0, 0, 1],
        "col4": [0, 1, 2],
        "col5": [False, False, None],
        "col6": [True, False, None],
    }
    pdf = pd.DataFrame(data, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.all(), pdf.all())

    columns = pd.MultiIndex.from_tuples(
        [
            ("a", "col1"),
            ("a", "col2"),
            ("a", "col3"),
            ("b", "col4"),
            ("b", "col5"),
            ("c", "col6"),
        ]
    )
    # First pass: unnamed levels; second pass: levels named "X" and "Y".
    for level_names in (None, ["X", "Y"]):
        if level_names is not None:
            columns.names = level_names
        pdf.columns = columns
        psdf.columns = columns

        self.assert_eq(psdf.all(), pdf.all())

    unsupported_axis = 'axis should be either 0 or "index" currently.'
    with self.assertRaisesRegex(NotImplementedError, unsupported_axis):
        psdf.all(axis=1)
|
|
|
|
def test_any(self):
    # DataFrame.any must match pandas for flat, multi-level and named
    # multi-level columns; axis=1 is expected to be rejected.
    data = {
        "col1": [False, False, False],
        "col2": [True, False, False],
        "col3": [0, 0, 1],
        "col4": [0, 1, 2],
        "col5": [False, False, None],
        "col6": [True, False, None],
    }
    pdf = pd.DataFrame(data, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.any(), pdf.any())

    columns = pd.MultiIndex.from_tuples(
        [
            ("a", "col1"),
            ("a", "col2"),
            ("a", "col3"),
            ("b", "col4"),
            ("b", "col5"),
            ("c", "col6"),
        ]
    )
    # First pass: unnamed levels; second pass: levels named "X" and "Y".
    for level_names in (None, ["X", "Y"]):
        if level_names is not None:
            columns.names = level_names
        pdf.columns = columns
        psdf.columns = columns

        self.assert_eq(psdf.any(), pdf.any())

    unsupported_axis = 'axis should be either 0 or "index" currently.'
    with self.assertRaisesRegex(NotImplementedError, unsupported_axis):
        psdf.any(axis=1)
|
|
|
|
def test_rank(self):
    # Compare DataFrame.rank between pandas and pandas-on-Spark for every
    # supported method, and check that an unknown method is rejected.
    pdf = pd.DataFrame(
        data={"col1": [1, 2, 3, 1], "col2": [3, 4, 3, 1]},
        columns=["col1", "col2"],
        index=np.random.rand(4),
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(pdf.rank().sort_index(), psdf.rank().sort_index())
    self.assert_eq(pdf.rank().sum(), psdf.rank().sum())

    rank_variants = (
        {"ascending": False},
        {"method": "min"},
        {"method": "max"},
        {"method": "first"},
        {"method": "dense"},
    )
    for rank_kwargs in rank_variants:
        expected = pdf.rank(**rank_kwargs).sort_index()
        actual = psdf.rank(**rank_kwargs).sort_index()
        self.assert_eq(expected, actual)

    with self.assertRaisesRegex(
        ValueError, "method must be one of 'average', 'min', 'max', 'first', 'dense'"
    ):
        psdf.rank(method="nothing")

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "col1"), ("y", "col2")])
    pdf.columns = columns
    psdf.columns = columns
    self.assert_eq(pdf.rank().sort_index(), psdf.rank().sort_index())
|
|
|
|
def test_round(self):
    # Compare DataFrame.round between pandas and pandas-on-Spark for int,
    # dict and Series `decimals`, across several kinds of column labels.
    pdf = pd.DataFrame(
        {
            "A": [0.028208, 0.038683, 0.877076],
            "B": [0.992815, 0.645646, 0.149370],
            "C": [0.173891, 0.577595, 0.491027],
        },
        columns=["A", "B", "C"],
        index=np.random.rand(3),
    )
    psdf = ps.from_pandas(pdf)

    pser = pd.Series([1, 0, 2], index=["A", "B", "C"])
    psser = ps.Series([1, 0, 2], index=["A", "B", "C"])

    # "D" is not a column of the frame; both libraries must treat it alike.
    for decimals in (2, {"A": 1, "C": 2}, {"A": 1, "D": 2}):
        self.assert_eq(pdf.round(decimals), psdf.round(decimals))
    self.assert_eq(pdf.round(pser), psdf.round(psser))

    with self.assertRaisesRegex(
        TypeError, "decimals must be an integer, a dict-like or a Series"
    ):
        psdf.round(1.5)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])
    pdf.columns = columns
    psdf.columns = columns
    pser = pd.Series([1, 0, 2], index=columns)
    psser = ps.Series([1, 0, 2], index=columns)

    multi_level_decimals = (
        2,
        {("X", "A"): 1, ("Y", "C"): 2},
        {("X", "A"): 1, "Y": 2},
    )
    for decimals in multi_level_decimals:
        self.assert_eq(pdf.round(decimals), psdf.round(decimals))
    self.assert_eq(pdf.round(pser), psdf.round(psser))

    # non-string names
    pdf = pd.DataFrame(
        {
            10: [0.028208, 0.038683, 0.877076],
            20: [0.992815, 0.645646, 0.149370],
            30: [0.173891, 0.577595, 0.491027],
        },
        index=np.random.rand(3),
    )
    psdf = ps.from_pandas(pdf)

    self.assert_eq(pdf.round({10: 1, 30: 2}), psdf.round({10: 1, 30: 2}))
|
|
|
|
def test_shift(self):
    # Compare DataFrame.shift between pandas and pandas-on-Spark, including
    # `fill_value` and multi-level columns.
    pdf = pd.DataFrame(
        {
            "Col1": [10, 20, 15, 30, 45],
            "Col2": [13, 23, 18, 33, 48],
            "Col3": [17, 27, 22, 37, 52],
        },
        index=np.random.rand(5),
    )
    psdf = ps.from_pandas(pdf)

    def check_basic_shifts():
        # Shared by the flat-column and the multi-index-column sections.
        self.assert_eq(pdf.shift(3), psdf.shift(3))
        self.assert_eq(pdf.shift().shift(-1), psdf.shift().shift(-1))

    check_basic_shifts()
    self.assert_eq(pdf.shift().sum().astype(int), psdf.shift().sum())

    # The expected frame is spelled out by hand because pandas 0.23 does not
    # support the `fill_value` argument.
    expected_filled = pd.DataFrame(
        {"Col1": [0, 0, 0, 10, 20], "Col2": [0, 0, 0, 13, 23], "Col3": [0, 0, 0, 17, 27]},
        index=pdf.index,
    )
    self.assert_eq(expected_filled, psdf.shift(periods=3, fill_value=0))

    with self.assertRaisesRegex(TypeError, "should be an int"):
        psdf.shift(1.5)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "Col1"), ("x", "Col2"), ("y", "Col3")])
    pdf.columns = columns
    psdf.columns = columns
    check_basic_shifts()
|
|
|
|
def test_diff(self):
    # Compare DataFrame.diff between pandas and pandas-on-Spark and check the
    # argument validation on the pandas-on-Spark side.
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 2, 3, 5, 8], "c": [1, 4, 9, 16, 25, 36]}
    pdf = pd.DataFrame(data, index=np.random.rand(6))
    psdf = ps.from_pandas(pdf)

    self.assert_eq(pdf.diff(), psdf.diff())
    self.assert_eq(pdf.diff().diff(-1), psdf.diff().diff(-1))
    self.assert_eq(pdf.diff().sum().astype(int), psdf.diff().sum())

    with self.assertRaisesRegex(TypeError, "should be an int"):
        psdf.diff(1.5)
    with self.assertRaisesRegex(
        NotImplementedError, 'axis should be either 0 or "index" currently.'
    ):
        psdf.diff(axis=1)

    # multi-index columns
    multi_columns = pd.MultiIndex.from_tuples([("x", "Col1"), ("x", "Col2"), ("y", "Col3")])
    pdf.columns = multi_columns
    psdf.columns = multi_columns

    self.assert_eq(pdf.diff(), psdf.diff())
|
|
|
|
def test_duplicated(self):
    # Compare DataFrame.duplicated between pandas and pandas-on-Spark for
    # different `keep`/`subset` arguments, index shapes and column labels.
    pdf = pd.DataFrame(
        {"a": [1, 1, 2, 3], "b": [1, 1, 1, 4], "c": [1, 1, 1, 5]}, index=np.random.rand(4)
    )
    psdf = ps.from_pandas(pdf)

    def check_duplicated(pandas_frame, spark_frame, **kwargs):
        # Row order is normalised with sort_index() before comparing.
        expected = pandas_frame.duplicated(**kwargs).sort_index()
        actual = spark_frame.duplicated(**kwargs).sort_index()
        self.assert_eq(expected, actual)

    for kwargs in ({}, {"keep": "last"}, {"keep": False}, {"subset": "b"}, {"subset": ["b"]}):
        check_duplicated(pdf, psdf, **kwargs)

    # The string "false" is not the boolean False and must be rejected.
    with self.assertRaisesRegex(ValueError, "'keep' only supports 'first', 'last' and False"):
        psdf.duplicated(keep="false")
    with self.assertRaisesRegex(KeyError, "'d'"):
        psdf.duplicated(subset=["d"])

    pdf.index.name = "x"
    psdf.index.name = "x"
    check_duplicated(pdf, psdf)

    # multi-index
    for kwargs in ({}, {"keep": False}, {"subset": ["b"]}):
        check_duplicated(
            pdf.set_index("a", append=True), psdf.set_index("a", append=True), **kwargs
        )

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
    pdf.columns = columns
    psdf.columns = columns
    for kwargs in ({}, {"subset": ("x", "b")}, {"subset": [("x", "b")]}):
        check_duplicated(pdf, psdf, **kwargs)

    # non-string names
    pdf = pd.DataFrame(
        {10: [1, 1, 2, 3], 20: [1, 1, 1, 4], 30: [1, 1, 1, 5]}, index=np.random.rand(4)
    )
    psdf = ps.from_pandas(pdf)

    for kwargs in ({}, {"subset": 10}):
        check_duplicated(pdf, psdf, **kwargs)
|
|
|
|
def test_ffill(self):
    # Compare DataFrame.ffill between pandas and pandas-on-Spark, including
    # the in-place variant and a Series taken before the in-place fill.
    nan = np.nan
    idx = np.random.rand(6)
    pdf = pd.DataFrame(
        {
            "x": [nan, 2, 3, 4, nan, 6],
            "y": [1, 2, nan, 4, nan, nan],
            "z": [1, 2, 3, 4, nan, nan],
        },
        index=idx,
    )
    psdf = ps.from_pandas(pdf)

    for fill_kwargs in ({}, {"limit": 1}):
        self.assert_eq(psdf.ffill(**fill_kwargs), pdf.ffill(**fill_kwargs))

    # Grab the column objects first; they are compared again after the
    # frames have been filled in place.
    pser = pdf.y
    psser = psdf.y

    psdf.ffill(inplace=True)
    pdf.ffill(inplace=True)

    self.assert_eq(psdf, pdf)
    self.assert_eq(psser, pser)
    # Row 2 of "y" was NaN in the input data.
    probe = idx[2]
    self.assert_eq(psser[probe], pser[probe])
|
|
|
|
def test_bfill(self):
    # Compare DataFrame.bfill between pandas and pandas-on-Spark, including
    # the in-place variant and a Series taken before the in-place fill.
    nan = np.nan
    idx = np.random.rand(6)
    pdf = pd.DataFrame(
        {
            "x": [nan, 2, 3, 4, nan, 6],
            "y": [1, 2, nan, 4, nan, nan],
            "z": [1, 2, 3, 4, nan, nan],
        },
        index=idx,
    )
    psdf = ps.from_pandas(pdf)

    for fill_kwargs in ({}, {"limit": 1}):
        self.assert_eq(psdf.bfill(**fill_kwargs), pdf.bfill(**fill_kwargs))

    # Grab the column objects first; they are compared again after the
    # frames have been filled in place.
    pser = pdf.x
    psser = psdf.x

    psdf.bfill(inplace=True)
    pdf.bfill(inplace=True)

    self.assert_eq(psdf, pdf)
    self.assert_eq(psser, pser)
    # Row 0 of "x" was NaN in the input data.
    probe = idx[0]
    self.assert_eq(psser[probe], pser[probe])
|
|
|
|
def test_filter(self):
    # Compare DataFrame.filter between pandas and pandas-on-Spark for the
    # `items`, `like` and `regex` selectors on both axes, and check the
    # argument validation on the pandas-on-Spark side.
    pdf = pd.DataFrame(
        {
            "aa": ["aa", "bd", "bc", "ab", "ce"],
            "ba": [1, 2, 3, 4, 5],
            "cb": [1.0, 2.0, 3.0, 4.0, 5.0],
            "db": [1.0, np.nan, 3.0, np.nan, 5.0],
        }
    ).set_index("aa")
    psdf = ps.from_pandas(pdf)

    def check_filter(sort=False, **filter_kwargs):
        # Uses whatever `psdf`/`pdf` are bound to at call time; both names are
        # rebound further down in this test.
        actual = psdf.filter(**filter_kwargs)
        expected = pdf.filter(**filter_kwargs)
        if sort:
            actual = actual.sort_index()
            expected = expected.sort_index()
        self.assert_eq(actual, expected)

    check_filter(sort=True, items=["ab", "aa"], axis=0)
    check_filter(sort=True, items=["ba", "db"], axis=1)

    check_filter(like="b", axis="index")
    check_filter(like="c", axis="columns")

    check_filter(regex="b.*", axis="index")
    check_filter(regex="b.*", axis="columns")

    pdf = pdf.set_index("ba", append=True)
    psdf = ps.from_pandas(pdf)

    check_filter(sort=True, items=[("aa", 1), ("bd", 2)], axis=0)

    with self.assertRaisesRegex(TypeError, "Unsupported type list"):
        psdf.filter(items=[["aa", 1], ("bd", 2)], axis=0)

    with self.assertRaisesRegex(ValueError, "The item should not be empty."):
        psdf.filter(items=[(), ("bd", 2)], axis=0)

    check_filter(like="b", axis=0)

    check_filter(regex="b.*", axis=0)

    invalid_calls = [
        (ValueError, "items should be a list-like object", {"items": "b"}),
        (ValueError, "No axis named", {"regex": "b.*", "axis": 123}),
        (TypeError, "Must pass either `items`, `like`", {}),
        (TypeError, "mutually exclusive", {"regex": "b.*", "like": "aaa"}),
    ]
    for expected_error, message, bad_kwargs in invalid_calls:
        with self.assertRaisesRegex(expected_error, message):
            psdf.filter(**bad_kwargs)

    # multi-index columns
    pdf = pd.DataFrame(
        {
            ("x", "aa"): ["aa", "ab", "bc", "bd", "ce"],
            ("x", "ba"): [1, 2, 3, 4, 5],
            ("y", "cb"): [1.0, 2.0, 3.0, 4.0, 5.0],
            ("z", "db"): [1.0, np.nan, 3.0, np.nan, 5.0],
        }
    ).set_index(("x", "aa"))
    psdf = ps.from_pandas(pdf)

    check_filter(sort=True, items=["ab", "aa"], axis=0)
    check_filter(sort=True, items=[("x", "ba"), ("z", "db")], axis=1)

    check_filter(like="b", axis="index")
    check_filter(like="c", axis="columns")

    check_filter(regex="b.*", axis="index")
    check_filter(regex="b.*", axis="columns")
|
|
|
|
def test_pipe(self):
    # pipe() must refuse a (callable, keyword) target whose keyword is also
    # passed explicitly as a keyword argument.
    data = {"category": ["A", "A", "B"], "col1": [1, 2, 3], "col2": [4, 5, 6]}
    psdf = ps.DataFrame(data, columns=["category", "col1", "col2"])

    with self.assertRaisesRegex(
        ValueError, "arg is both the pipe target and a keyword argument"
    ):
        psdf.pipe((lambda x: x, "arg"), arg="1")
|
|
|
|
def test_transform(self):
    # Compare DataFrame.transform between pandas and pandas-on-Spark, both
    # with the default options and with "compute.shortcut_limit" set to 500
    # (smaller than the 600-row frame).
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6] * 100,
            "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
            "c": [1, 4, 9, 16, 25, 36] * 100,
        },
        columns=["a", "b", "c"],
        index=np.random.rand(600),
    )
    psdf = ps.DataFrame(pdf)

    def check_plus_one():
        actual = psdf.transform(lambda x: x + 1).sort_index()
        expected = pdf.transform(lambda x: x + 1).sort_index()
        self.assert_eq(actual, expected)

    def check_plus_keyword(increment):
        # The lambda's second parameter must stay named `y`: it is passed by keyword.
        actual = psdf.transform(lambda x, y: x + y, y=increment).sort_index()
        expected = pdf.transform(lambda x, y: x + y, y=increment).sort_index()
        self.assert_eq(actual, expected)

    check_plus_one()
    check_plus_keyword(2)
    with option_context("compute.shortcut_limit", 500):
        check_plus_one()
        check_plus_keyword(1)

    with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
        psdf.transform(1)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
    pdf.columns = columns
    psdf.columns = columns

    check_plus_one()
    with option_context("compute.shortcut_limit", 500):
        check_plus_one()
|
|
|
|
def test_apply(self):
    # Compare DataFrame.apply between pandas and pandas-on-Spark along both
    # axes, with and without "compute.shortcut_limit" set to 500, and check
    # the validation of type-hinted functions.
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6] * 100,
            "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
            "c": [1, 4, 9, 16, 25, 36] * 100,
        },
        columns=["a", "b", "c"],
        index=np.random.rand(600),
    )
    psdf = ps.DataFrame(pdf)

    def check_column_apply(include_extra_args):
        # Default-axis application; optionally also forwards an extra argument
        # positionally (`args`) and by keyword (`b`).
        self.assert_eq(
            psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
        )
        if not include_extra_args:
            return
        self.assert_eq(
            psdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
            pdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
        )
        self.assert_eq(
            psdf.apply(lambda x, b: x + b, b=1).sort_index(),
            pdf.apply(lambda x, b: x + b, b=1).sort_index(),
        )

    def check_row_apply(include_extra_args):
        # axis=1 application returning one scalar per row.
        self.assert_eq(
            psdf.apply(lambda x: len(x), axis=1).sort_index(),
            pdf.apply(lambda x: len(x), axis=1).sort_index(),
        )
        if not include_extra_args:
            return
        self.assert_eq(
            psdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
            pdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
        )

    check_column_apply(include_extra_args=True)
    with option_context("compute.shortcut_limit", 500):
        check_column_apply(include_extra_args=True)

    # returning a Series
    check_row_apply(include_extra_args=True)
    with option_context("compute.shortcut_limit", 500):
        check_row_apply(include_extra_args=True)

    with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
        psdf.apply(1)

    # A DataFrame return hint combined with axis=0 must be rejected.
    with self.assertRaisesRegex(TypeError, "The given function.*1 or 'column'; however"):

        def f1(_) -> ps.DataFrame[int]:
            pass

        psdf.apply(f1, axis=0)

    # A Series return hint combined with axis=1 must be rejected.
    with self.assertRaisesRegex(TypeError, "The given function.*0 or 'index'; however"):

        def f2(_) -> ps.Series[int]:
            pass

        psdf.apply(f2, axis=1)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
    pdf.columns = columns
    psdf.columns = columns

    check_column_apply(include_extra_args=False)
    with option_context("compute.shortcut_limit", 500):
        check_column_apply(include_extra_args=False)

    # returning a Series
    check_row_apply(include_extra_args=False)
    with option_context("compute.shortcut_limit", 500):
        check_row_apply(include_extra_args=False)
|
|
|
|
def test_apply_batch(self):
    # Check apply_batch (the frame-level alias and the `koalas` accessor)
    # against the equivalent plain pandas expression.
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6] * 100,
            "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
            "c": [1, 4, 9, 16, 25, 36] * 100,
        },
        columns=["a", "b", "c"],
        index=np.random.rand(600),
    )
    psdf = ps.DataFrame(pdf)

    def expected_plus_one():
        # Recomputed on every call so it follows later changes to pdf.columns.
        return (pdf + 1).sort_index()

    # One to test alias.
    self.assert_eq(psdf.apply_batch(lambda batch: batch + 1).sort_index(), expected_plus_one())
    self.assert_eq(
        psdf.koalas.apply_batch(lambda batch, a: batch + a, args=(1,)).sort_index(),
        expected_plus_one(),
    )
    with option_context("compute.shortcut_limit", 500):
        self.assert_eq(
            psdf.koalas.apply_batch(lambda batch: batch + 1).sort_index(), expected_plus_one()
        )
        # `b` is passed by keyword, so the lambda parameter keeps that name.
        self.assert_eq(
            psdf.koalas.apply_batch(lambda batch, b: batch + b, b=1).sort_index(),
            expected_plus_one(),
        )

    with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
        psdf.koalas.apply_batch(1)

    # A Series return hint is not accepted by apply_batch.
    with self.assertRaisesRegex(TypeError, "The given function.*frame as its type hints"):

        def f2(_) -> ps.Series[int]:
            pass

        psdf.koalas.apply_batch(f2)

    with self.assertRaisesRegex(ValueError, "The given function should return a frame"):
        psdf.koalas.apply_batch(lambda batch: 1)

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
    pdf.columns = columns
    psdf.columns = columns

    self.assert_eq(
        psdf.koalas.apply_batch(lambda x: x + 1).sort_index(), expected_plus_one()
    )
    with option_context("compute.shortcut_limit", 500):
        self.assert_eq(
            psdf.koalas.apply_batch(lambda x: x + 1).sort_index(), expected_plus_one()
        )
|
|
|
|
def test_transform_batch(self):
    # Check transform_batch (the frame-level alias and the `koalas` accessor)
    # for frame-returning and series-returning functions against the
    # equivalent plain pandas expressions.
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6] * 100,
            "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
            "c": [1, 4, 9, 16, 25, 36] * 100,
        },
        columns=["a", "b", "c"],
        index=np.random.rand(600),
    )
    psdf = ps.DataFrame(pdf)

    def expected_frame():
        # Recomputed on every call so it follows later changes to pdf.columns.
        return (pdf + 1).sort_index()

    # One to test alias.
    self.assert_eq(
        psdf.transform_batch(lambda batch: batch + 1).sort_index(), expected_frame()
    )
    self.assert_eq(
        psdf.koalas.transform_batch(lambda batch: batch.c + 1).sort_index(),
        (pdf.c + 1).sort_index(),
    )
    self.assert_eq(
        psdf.koalas.transform_batch(lambda batch, a: batch + a, 1).sort_index(),
        expected_frame(),
    )
    # `a` is passed by keyword here, so the lambda parameter keeps that name.
    self.assert_eq(
        psdf.koalas.transform_batch(lambda batch, a: batch.c + a, a=1).sort_index(),
        (pdf.c + 1).sort_index(),
    )

    with option_context("compute.shortcut_limit", 500):
        self.assert_eq(
            psdf.koalas.transform_batch(lambda batch: batch + 1).sort_index(),
            expected_frame(),
        )
        self.assert_eq(
            psdf.koalas.transform_batch(lambda batch: batch.b + 1).sort_index(),
            (pdf.b + 1).sort_index(),
        )
        self.assert_eq(
            psdf.koalas.transform_batch(lambda batch, a: batch + a, 1).sort_index(),
            expected_frame(),
        )
        self.assert_eq(
            psdf.koalas.transform_batch(lambda batch, a: batch.c + a, a=1).sort_index(),
            (pdf.c + 1).sort_index(),
        )

    with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
        psdf.koalas.transform_batch(1)

    with self.assertRaisesRegex(ValueError, "The given function should return a frame"):
        psdf.koalas.transform_batch(lambda batch: 1)

    with self.assertRaisesRegex(
        ValueError, "transform_batch cannot produce aggregated results"
    ):
        psdf.koalas.transform_batch(lambda batch: pd.Series(1))

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
    pdf.columns = columns
    psdf.columns = columns

    self.assert_eq(
        psdf.koalas.transform_batch(lambda x: x + 1).sort_index(), expected_frame()
    )
    with option_context("compute.shortcut_limit", 500):
        self.assert_eq(
            psdf.koalas.transform_batch(lambda x: x + 1).sort_index(), expected_frame()
        )
|
|
|
|
def test_transform_batch_same_anchor(self):
    # The result of transform_batch on a frame, or on one of its columns,
    # must be assignable back to that same frame as a new column.
    def expected_frame():
        # Fresh expected frame for each scenario: id = 0..9, d = id + 1.
        return pd.DataFrame(
            {"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]
        )

    # Frame-level call through the accessor, untyped lambda.
    psdf = ps.range(10)
    psdf["d"] = psdf.koalas.transform_batch(lambda batch: batch.id + 1)
    self.assert_eq(psdf, expected_frame())

    # Series-level call. One to test alias.
    psdf = ps.range(10)
    psdf["d"] = psdf.id.transform_batch(lambda ser: ser + 1)
    self.assert_eq(psdf, expected_frame())

    # Frame-level call with a return type hint.
    psdf = ps.range(10)

    def plus_one(batch) -> ps.Series[np.int64]:
        return batch.id + 1

    psdf["d"] = psdf.koalas.transform_batch(plus_one)
    self.assert_eq(psdf, expected_frame())

    # Series-level call through the accessor with a return type hint.
    psdf = ps.range(10)

    def plus_one(ser) -> ps.Series[np.int64]:
        return ser + 1

    psdf["d"] = psdf.id.koalas.transform_batch(plus_one)
    self.assert_eq(psdf, expected_frame())
|
|
|
|
def test_empty_timestamp(self):
    # Selecting with a mask that is never true (t != t) yields an empty frame;
    # both the frame and its dtypes must match pandas.
    stamps = [datetime(2019, 1, day, 0, 0, 0) for day in (1, 2, 3)]
    pdf = pd.DataFrame({"t": stamps}, index=np.random.rand(3))
    psdf = ps.from_pandas(pdf)

    empty_psdf = psdf[psdf["t"] != psdf["t"]]
    empty_pdf = pdf[pdf["t"] != pdf["t"]]

    self.assert_eq(empty_psdf, empty_pdf)
    self.assert_eq(empty_psdf.dtypes, empty_pdf.dtypes)
|
|
|
|
def test_to_spark(self):
    # to_spark() must reject invalid `index_col` arguments.
    psdf = ps.from_pandas(self.pdf)

    invalid_index_cols = [
        ("a", "'index_col' cannot be overlapped"),
        (["x", "y", "z"], "length of index columns.*1.*3"),
    ]
    for index_col, message in invalid_index_cols:
        with self.assertRaisesRegex(ValueError, message):
            psdf.to_spark(index_col=index_col)
|
|
|
|
def test_keys(self):
    # DataFrame.keys() must match pandas.
    rows = [[1, 2], [4, 5], [7, 8]]
    row_labels = ["cobra", "viper", "sidewinder"]
    column_labels = ["max_speed", "shield"]
    pdf = pd.DataFrame(rows, index=row_labels, columns=column_labels)
    psdf = ps.from_pandas(pdf)

    self.assert_eq(psdf.keys(), pdf.keys())
|
|
|
|
def test_quantile(self):
    # Compare DataFrame.quantile with pandas for a scalar and a list-like `q` on the
    # shared fixture, on an empty selection, with MultiIndex columns and on a
    # string-only frame; also check the errors raised for invalid arguments.
    pdf, psdf = self.df_pair
    median = 0.5
    quartiles = [0.25, 0.5, 0.75]

    for q in (median, quartiles):
        self.assert_eq(psdf.quantile(q), pdf.quantile(q))

    # Every row dropped.
    for q in (median, quartiles):
        self.assert_eq(psdf.loc[[]].quantile(q), pdf.loc[[]].quantile(q))

    # Invalid arguments: (exception, message regex, positional args, keyword args).
    rejected_calls = [
        (
            NotImplementedError,
            'axis should be either 0 or "index" currently.',
            (median,),
            {"axis": 1},
        ),
        (TypeError, "accuracy must be an integer; however", (), {"accuracy": "a"}),
        (TypeError, "q must be a float or an array of floats;", (), {"q": "a"}),
        (TypeError, "q must be a float or an array of floats;", (), {"q": ["a"]}),
    ]
    for error, message, args, kwargs in rejected_calls:
        with self.assertRaisesRegex(error, message):
            psdf.quantile(*args, **kwargs)

    for q in (median, quartiles):
        self.assert_eq(
            psdf.quantile(q, numeric_only=False), pdf.quantile(q, numeric_only=False)
        )

    # MultiIndex columns.
    columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
    pdf.columns = columns
    psdf.columns = columns

    for q in (median, quartiles):
        self.assert_eq(psdf.quantile(q), pdf.quantile(q))

    # A frame holding only strings.
    pdf = pd.DataFrame({"x": ["a", "b", "c"]})
    psdf = ps.from_pandas(pdf)

    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        for q in (median, quartiles):
            self.assert_eq(psdf.quantile(q), pdf.quantile(q))
    else:
        # For older pandas the expected empty results are spelled out by hand.
        self.assert_eq(psdf.quantile(median), pd.Series(name=0.5))
        self.assert_eq(psdf.quantile(quartiles), pd.DataFrame(index=[0.25, 0.5, 0.75]))

    for q in (median, quartiles):
        with self.assertRaisesRegex(
            TypeError, "Could not convert object \\(string\\) to numeric"
        ):
            psdf.quantile(q, numeric_only=False)
|
|
|
|
def test_pct_change(self):
    # `pct_change` on a frame with MultiIndex columns must match pandas
    # (compared with check_exact=False).
    data = {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [300, 200, 400, 200]}
    pdf = pd.DataFrame(data, index=np.random.rand(4))
    pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    psdf = ps.from_pandas(pdf)

    actual_shifted = psdf.pct_change(2)
    expected_shifted = pdf.pct_change(2)
    self.assert_eq(actual_shifted, expected_shifted, check_exact=False)

    actual_total = psdf.pct_change().sum()
    expected_total = pdf.pct_change().sum()
    self.assert_eq(actual_total, expected_total, check_exact=False)
|
|
|
|
def test_where(self):
    # A scalar condition must be rejected by `where` with a TypeError.
    psdf = ps.from_pandas(self.pdf)

    self.assertRaisesRegex(
        TypeError, "type of cond must be a DataFrame or Series", psdf.where, 1
    )
|
|
|
|
def test_mask(self):
    # A scalar condition must be rejected by `mask` with a TypeError.
    psdf = ps.from_pandas(self.pdf)

    self.assertRaisesRegex(
        TypeError, "type of cond must be a DataFrame or Series", psdf.mask, 1
    )
|
|
|
|
def test_query(self):
    # Compare DataFrame.query with pandas (copying and in-place), then check the
    # TypeErrors raised for non-string expressions, non-bool `inplace` values and
    # MultiIndex columns.
    pdf = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2), "C": range(10, 5, -1)})
    psdf = ps.from_pandas(pdf)

    exprs = ("A > B", "A < C", "C == B")
    for expr in exprs:
        self.assert_eq(psdf.query(expr), pdf.query(expr))

    # `inplace=True`: work on copies and also compare a Series taken before the query.
    for expr in exprs:
        psdf_copy = psdf.copy()
        pdf_copy = pdf.copy()

        pser = pdf_copy.A
        psser = psdf_copy.A
        pdf_copy.query(expr, inplace=True)
        psdf_copy.query(expr, inplace=True)

        self.assert_eq(psdf_copy, pdf_copy)
        self.assert_eq(psser, pser)

    # `expr` values that are not strings.
    for bad_expr in (1, 1.0, (exprs[0],), [exprs[0]]):
        type_name = type(bad_expr).__name__
        expected_message = "expr must be a string to be evaluated, {} given".format(
            type_name
        )
        with self.assertRaisesRegex(TypeError, expected_message):
            psdf.query(bad_expr)

    # `inplace` values that are not booleans.
    for bad_inplace in (1, 0, "True", "False"):
        type_name = type(bad_inplace).__name__
        expected_message = (
            'For argument "inplace" expected type bool, received type {}.'.format(
                type_name
            )
        )
        with self.assertRaisesRegex(TypeError, expected_message):
            psdf.query("a < b", inplace=bad_inplace)

    # MultiIndex columns are not supported.
    psdf.columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")])
    with self.assertRaisesRegex(TypeError, "Doesn't support for MultiIndex columns"):
        psdf.query("('A', 'Z') > ('B', 'X')")
|
|
|
|
def test_take(self):
    # Compare DataFrame.take with pandas along both axes, first with flat columns and
    # then with MultiIndex columns, and check that non-list-like indices are rejected.
    pdf = pd.DataFrame(
        {"A": range(0, 50000), "B": range(100000, 0, -2), "C": range(100000, 50000, -1)}
    )
    psdf = ps.from_pandas(pdf)

    row_positions = [
        [1, 2],
        [-1, -2],
        range(100, 110),
        range(-110, -100),
        [10, 100, 1000, 10000],
        [-10, -100, -1000, -10000],
    ]
    # NOTE: range(-1, -3) is an empty range, so that case selects no columns.
    # [-1, -2] appears twice to keep the same sequence of checks as before.
    column_positions = [
        [1, 2],
        [-1, -2],
        range(1, 3),
        range(-1, -3),
        [2, 1],
        [-1, -2],
    ]

    def check_both_axes():
        # axis=0 (default)
        for positions in row_positions:
            self.assert_eq(
                psdf.take(positions).sort_index(), pdf.take(positions).sort_index()
            )
        # axis=1
        for positions in column_positions:
            self.assert_eq(
                psdf.take(positions, axis=1).sort_index(),
                pdf.take(positions, axis=1).sort_index(),
            )

    check_both_axes()

    # Same checks with MultiIndex columns.
    columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")])
    psdf.columns = columns
    pdf.columns = columns
    check_both_axes()

    # Checking the type of indices.
    for bad_indices in (1, "1", {1, 2}, {1: None, 2: None}):
        with self.assertRaises(TypeError):
            psdf.take(bad_indices)
|
|
|
|
def test_axes(self):
    # `axes` must match pandas with flat columns and with MultiIndex columns.
    expected_frame = self.pdf
    actual_frame = ps.from_pandas(expected_frame)
    self.assert_eq(expected_frame.axes, actual_frame.axes)

    # multi-index columns
    multi_columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
    expected_frame.columns = multi_columns
    actual_frame.columns = multi_columns
    self.assert_eq(expected_frame.axes, actual_frame.axes)
|
|
|
|
def test_udt(self):
    # A frame holding a SparseVector value must round-trip through `from_pandas`.
    entries = {0: 0.1, 1: 1.1}
    vector = SparseVector(len(entries), entries)
    pdf = pd.DataFrame({"a": [vector], "b": [10]})

    self.assert_eq(ps.from_pandas(pdf), pdf)
|
|
|
|
def test_eval(self):
    # Compare DataFrame.eval with pandas for expressions, assignments and in-place
    # assignments; MultiIndex columns must raise TypeError.
    pdf = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})
    psdf = ps.from_pandas(pdf)

    expressions = [
        # operation between columns (returns Series)
        "A + B",
        "A + A",
        # assignment (returns DataFrame)
        "C = A + B",
        "A = A + A",
        # operation between scalars (returns scalar)
        "1 + 1",
        # complicated operations with assignment
        "B = A + B // (100 + 200) * (500 - B) - 10.5",
    ]
    for expression in expressions:
        self.assert_eq(pdf.eval(expression), psdf.eval(expression))

    # inplace=True (only supported for assignment)
    pdf.eval("C = A + B", inplace=True)
    psdf.eval("C = A + B", inplace=True)
    self.assert_eq(pdf, psdf)

    # Series taken before the second in-place assignment are compared after it.
    pser = pdf.A
    psser = psdf.A
    pdf.eval("A = B + C", inplace=True)
    psdf.eval("A = B + C", inplace=True)
    self.assert_eq(pdf, psdf)
    self.assert_eq(pser, psser)

    # multi-index columns are not supported
    psdf.columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b"), ("z", "c")])
    with self.assertRaises(TypeError):
        psdf.eval("x.a + y.b")
|
|
|
|
@unittest.skipIf(not have_tabulate, tabulate_requirement_message)
def test_to_markdown(self):
    # With pandas >= 1.0.0 the markdown output must equal pandas'; with older pandas
    # the pandas-on-Spark call is expected to raise NotImplementedError.
    pdf = pd.DataFrame(data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]})
    psdf = ps.from_pandas(pdf)

    pandas_has_to_markdown = LooseVersion(pd.__version__) >= LooseVersion("1.0.0")
    if pandas_has_to_markdown:
        self.assert_eq(pdf.to_markdown(), psdf.to_markdown())
    else:
        with self.assertRaises(NotImplementedError):
            psdf.to_markdown()
|
|
|
|
def test_cache(self):
    # `cache()` is used as a context manager: the yielded object must be a
    # CachedDataFrame whose storage level has the same repr as
    # StorageLevel(True, True, False, True).
    pdf = pd.DataFrame(
        [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)], columns=["dogs", "cats"]
    )
    psdf = ps.from_pandas(pdf)

    with psdf.cache() as cached_df:
        # assertIsInstance reports the actual type on failure, which comparing
        # `isinstance(...)` against True does not.
        self.assertIsInstance(cached_df, CachedDataFrame)
        self.assert_eq(
            repr(cached_df.storage_level), repr(StorageLevel(True, True, False, True))
        )
|
|
|
|
def test_persist(self):
    # `persist(level)` is used as a context manager for several storage levels: the
    # yielded object must be a CachedDataFrame reporting that level. A string in
    # place of a StorageLevel must raise TypeError.
    pdf = pd.DataFrame(
        [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)], columns=["dogs", "cats"]
    )
    psdf = ps.from_pandas(pdf)
    storage_levels = [
        StorageLevel.DISK_ONLY,
        StorageLevel.MEMORY_AND_DISK,
        StorageLevel.MEMORY_ONLY,
        StorageLevel.OFF_HEAP,
    ]

    for storage_level in storage_levels:
        with psdf.persist(storage_level) as cached_df:
            # assertIsInstance reports the actual type on failure, which comparing
            # `isinstance(...)` against True does not.
            self.assertIsInstance(cached_df, CachedDataFrame)
            self.assert_eq(repr(cached_df.storage_level), repr(storage_level))

    with self.assertRaises(TypeError):
        psdf.persist("DISK_ONLY")
|
|
|
|
def test_squeeze(self):
    # Compare DataFrame.squeeze with pandas for every accepted axis spelling, on frames
    # of different shapes, each with flat and with MultiIndex columns.
    axises = [None, 0, 1, "rows", "index", "columns"]

    def check_every_axis(expected_frame, actual_frame):
        for axis in axises:
            self.assert_eq(expected_frame.squeeze(axis), actual_frame.squeeze(axis))

    # Multiple columns
    pdf = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"], index=["x", "y"])
    psdf = ps.from_pandas(pdf)
    check_every_axis(pdf, psdf)
    # Multiple columns with MultiIndex columns
    columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X")])
    pdf.columns = columns
    psdf.columns = columns
    check_every_axis(pdf, psdf)

    # Single column with single value
    pdf = pd.DataFrame([[1]], columns=["a"], index=["x"])
    psdf = ps.from_pandas(pdf)
    check_every_axis(pdf, psdf)
    # Single column with single value with MultiIndex column
    columns = pd.MultiIndex.from_tuples([("A", "Z")])
    pdf.columns = columns
    psdf.columns = columns
    check_every_axis(pdf, psdf)

    # Single column with multiple values
    pdf = pd.DataFrame([1, 2, 3, 4], columns=["a"])
    psdf = ps.from_pandas(pdf)
    check_every_axis(pdf, psdf)
    # Single column with multiple values with MultiIndex column
    # (reuses the single-entry MultiIndex built above)
    pdf.columns = columns
    psdf.columns = columns
    check_every_axis(pdf, psdf)
|
|
|
|
def test_rfloordiv(self):
    # `rfloordiv(10)` must match pandas; for pandas in [0.24.0, 1.0.0) the expected
    # frame is hard-coded instead of being computed by pandas.
    labels = ["circle", "triangle", "rectangle"]
    column_names = ["angles", "degrees"]
    pdf = pd.DataFrame(
        {"angles": [0, 3, 4], "degrees": [360, 180, 360]},
        index=labels,
        columns=column_names,
    )
    psdf = ps.from_pandas(pdf)

    pandas_version = LooseVersion(pd.__version__)
    if LooseVersion("0.24.0") <= pandas_version < LooseVersion("1.0.0"):
        expected_result = pd.DataFrame(
            {"angles": [np.inf, 3.0, 2.0], "degrees": [0.0, 0.0, 0.0]},
            index=["circle", "triangle", "rectangle"],
            columns=["angles", "degrees"],
        )
    else:
        expected_result = pdf.rfloordiv(10)

    self.assert_eq(psdf.rfloordiv(10), expected_result)
|
|
|
|
def test_truncate(self):
    # Compare DataFrame.truncate with pandas along both axes, for flat and MultiIndex
    # columns, then check the ValueErrors for an unsorted index and reversed bounds.
    def letters():
        # Build the column data afresh for every frame created below.
        return {
            "A": ["a", "b", "c", "d", "e", "f", "g"],
            "B": ["h", "i", "j", "k", "l", "m", "n"],
            "C": ["o", "p", "q", "r", "s", "t", "u"],
        }

    pdf1 = pd.DataFrame(letters(), index=[-500, -20, -1, 0, 400, 550, 1000])
    psdf1 = ps.from_pandas(pdf1)
    pdf2 = pd.DataFrame(letters(), index=[1000, 550, 400, 0, -1, -20, -500])
    psdf2 = ps.from_pandas(pdf2)

    # The bug for the descending-index checks has been fixed in pandas 1.1.0.
    pandas_fixed = LooseVersion(pd.__version__) >= LooseVersion("1.1.0")

    def check_rows():
        for kwargs in ({}, {"before": -20}, {"after": 400}, {"copy": False}):
            self.assert_eq(psdf1.truncate(**kwargs), pdf1.truncate(**kwargs))
        self.assert_eq(
            psdf1.truncate(-20, 400, copy=False), pdf1.truncate(-20, 400, copy=False)
        )

    def check_descending(legacy_expected):
        if pandas_fixed:
            self.assert_eq(psdf2.truncate(0, 550), pdf2.truncate(0, 550))
            self.assert_eq(
                psdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False)
            )
        else:
            self.assert_eq(psdf2.truncate(0, 550), legacy_expected)
            self.assert_eq(psdf2.truncate(0, 550, copy=False), legacy_expected)

    def check_columns():
        # axis = 1
        for kwargs in ({}, {"before": "B"}, {"after": "A"}, {"copy": False}):
            self.assert_eq(
                psdf1.truncate(axis=1, **kwargs), pdf1.truncate(axis=1, **kwargs)
            )
        self.assert_eq(psdf2.truncate("B", "C", axis=1), pdf2.truncate("B", "C", axis=1))
        self.assert_eq(
            psdf1.truncate("B", "C", copy=False, axis=1),
            pdf1.truncate("B", "C", copy=False, axis=1),
        )

    # Flat columns.
    check_rows()
    expected_psdf = None
    if not pandas_fixed:
        expected_psdf = ps.DataFrame(
            {"A": ["b", "c", "d"], "B": ["i", "j", "k"], "C": ["p", "q", "r"]},
            index=[550, 400, 0],
        )
    check_descending(expected_psdf)
    check_columns()

    # MultiIndex columns
    columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "Z")])
    pdf1.columns = columns
    psdf1.columns = columns
    pdf2.columns = columns
    psdf2.columns = columns

    check_rows()
    if not pandas_fixed:
        expected_psdf.columns = columns
    check_descending(expected_psdf)
    check_columns()

    # Exceptions
    unsorted_psdf = ps.DataFrame(letters(), index=[-500, 100, 400, 0, -1, 550, -20])
    with self.assertRaisesRegex(ValueError, "truncate requires a sorted index"):
        unsorted_psdf.truncate()

    sorted_psdf = ps.DataFrame(letters(), index=[-500, -20, -1, 0, 400, 550, 1000])
    with self.assertRaisesRegex(ValueError, "Truncate: -20 must be after 400"):
        sorted_psdf.truncate(400, -20)
    with self.assertRaisesRegex(ValueError, "Truncate: B must be after C"):
        sorted_psdf.truncate("C", "B", axis=1)
|
|
|
|
def test_explode(self):
    # Compare DataFrame.explode with pandas on a named index, a MultiIndex and
    # MultiIndex columns. For pandas older than 0.25.0 the expected frames are built
    # by hand instead of calling pandas' `explode`.
    pandas_has_explode = LooseVersion(pd.__version__) >= LooseVersion("0.25.0")

    pdf = pd.DataFrame({"A": [[-1.0, np.nan], [0.0, np.inf], [1.0, -np.inf]], "B": 1})
    pdf.index.name = "index"
    pdf.columns.name = "columns"
    psdf = ps.from_pandas(pdf)

    if pandas_has_explode:
        exploded_a = pdf.explode("A")
        exploded_b = pdf.explode("B")
    else:
        exploded_a = pd.DataFrame(
            {"A": [-1, np.nan, 0, np.inf, 1, -np.inf], "B": [1, 1, 1, 1, 1, 1]},
            index=pd.Index([0, 0, 1, 1, 2, 2]),
        )
        exploded_a.index.name = "index"
        exploded_a.columns.name = "columns"
        exploded_b = pdf

    self.assert_eq(psdf.explode("A"), exploded_a, almost=True)
    self.assert_eq(repr(psdf.explode("B")), repr(exploded_b))
    self.assert_eq(psdf.explode("A").index.name, exploded_a.index.name)
    self.assert_eq(psdf.explode("A").columns.name, exploded_a.columns.name)

    with self.assertRaises(TypeError):
        psdf.explode(["A", "B"])

    # MultiIndex
    midx = pd.MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("y", "c")], names=["index1", "index2"]
    )
    pdf.index = midx
    psdf = ps.from_pandas(pdf)

    if pandas_has_explode:
        exploded_a = pdf.explode("A")
        exploded_b = pdf.explode("B")
    else:
        midx = pd.MultiIndex.from_tuples(
            [("x", "a"), ("x", "a"), ("x", "b"), ("x", "b"), ("y", "c"), ("y", "c")],
            names=["index1", "index2"],
        )
        exploded_a.index = midx
        exploded_b = pdf

    self.assert_eq(psdf.explode("A"), exploded_a, almost=True)
    self.assert_eq(repr(psdf.explode("B")), repr(exploded_b))
    self.assert_eq(psdf.explode("A").index.names, exploded_a.index.names)
    self.assert_eq(psdf.explode("A").columns.name, exploded_a.columns.name)

    with self.assertRaises(TypeError):
        psdf.explode(["A", "B"])

    # MultiIndex columns
    columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X")], names=["column1", "column2"])
    pdf.columns = columns
    psdf.columns = columns

    if pandas_has_explode:
        exploded_a = pdf.explode(("A", "Z"))
        exploded_b = pdf.explode(("B", "X"))
        exploded_inner = pdf.A.explode("Z")
    else:
        exploded_a.columns = columns
        exploded_b = pdf
        exploded_inner = pd.DataFrame({"Z": [-1, np.nan, 0, np.inf, 1, -np.inf]}, index=midx)
        exploded_inner.index.name = "index"
        exploded_inner.columns.name = "column2"

    self.assert_eq(psdf.explode(("A", "Z")), exploded_a, almost=True)
    self.assert_eq(repr(psdf.explode(("B", "X"))), repr(exploded_b))
    self.assert_eq(psdf.explode(("A", "Z")).index.names, exploded_a.index.names)
    self.assert_eq(psdf.explode(("A", "Z")).columns.names, exploded_a.columns.names)

    self.assert_eq(psdf.A.explode("Z"), exploded_inner, almost=True)

    with self.assertRaises(TypeError):
        psdf.explode(["A", "B"])
    with self.assertRaises(ValueError):
        psdf.explode("A")
|
|
|
|
def test_spark_schema(self):
    # `spark_schema(...)` must return the same value as `spark.schema(...)`,
    # both without arguments and with "index".
    column_order = ["a", "b", "c", "d", "e", "f"]
    psdf = ps.DataFrame(
        {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(3, 6).astype("i1"),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("20130101", periods=3),
        },
        columns=column_order,
    )

    for schema_args in ((), ("index",)):
        self.assertEqual(psdf.spark_schema(*schema_args), psdf.spark.schema(*schema_args))
|
|
|
|
def test_print_schema(self):
    # `print_schema()` must print the same text as `spark.print_schema()`.
    # Output is captured with contextlib.redirect_stdout, which restores sys.stdout
    # on exit even if the call raises, so the final assertion runs with the real
    # stdout in place.
    from contextlib import redirect_stdout

    psdf = ps.DataFrame(
        {"a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("i1")},
        columns=["a", "b", "c"],
    )

    def printed_by(printer):
        # Run `printer` with stdout redirected and return what it wrote, stripped.
        buffer = StringIO()
        with redirect_stdout(buffer):
            printer()
        return buffer.getvalue().strip()

    actual = printed_by(lambda: psdf.print_schema())
    expected = printed_by(lambda: psdf.spark.print_schema())

    self.assertEqual(actual, expected)
|
|
|
|
def test_explain_hint(self):
    # `explain()` on a merge that uses a "broadcast" hint must print the same text as
    # `spark.explain()`. Output is captured with contextlib.redirect_stdout, which
    # restores sys.stdout on exit even if the call raises, so the final assertion
    # runs with the real stdout in place.
    from contextlib import redirect_stdout

    psdf1 = ps.DataFrame(
        {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, columns=["lkey", "value"]
    )
    psdf2 = ps.DataFrame(
        {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, columns=["rkey", "value"]
    )
    merged = psdf1.merge(psdf2.hint("broadcast"), left_on="lkey", right_on="rkey")

    def printed_by(printer):
        # Run `printer` with stdout redirected and return what it wrote, stripped.
        buffer = StringIO()
        with redirect_stdout(buffer):
            printer()
        return buffer.getvalue().strip()

    actual = printed_by(lambda: merged.explain())
    expected = printed_by(lambda: merged.spark.explain())

    self.assertEqual(actual, expected)
|
|
|
|
def test_mad(self):
    # Compare DataFrame.mad with pandas along both axes for a mixed frame, the same
    # frame with MultiIndex columns, and a boolean frame; axis=2 must raise ValueError.
    def check_both_axes(expected_frame, actual_frame):
        self.assert_eq(actual_frame.mad(), expected_frame.mad())
        self.assert_eq(actual_frame.mad(axis=1), expected_frame.mad(axis=1))

    pdf = pd.DataFrame(
        {
            "A": [1, 2, None, 4, np.nan],
            "B": [-0.1, 0.2, -0.3, np.nan, 0.5],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    psdf = ps.from_pandas(pdf)
    check_both_axes(pdf, psdf)

    self.assertRaises(ValueError, lambda: psdf.mad(axis=2))

    # MultiIndex columns
    columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("A", "Z")])
    pdf.columns = columns
    psdf.columns = columns
    check_both_axes(pdf, psdf)

    # Boolean columns
    pdf = pd.DataFrame({"A": [True, True, False, False], "B": [True, False, False, True]})
    psdf = ps.from_pandas(pdf)
    check_both_axes(pdf, psdf)
|
|
|
|
def test_abs(self):
    # The builtin `abs` and `np.abs` must each give the same result as on pandas.
    pdf = pd.DataFrame({"a": [-2, -1, 0, 1]})
    psdf = ps.from_pandas(pdf)

    for absolute in (abs, np.abs):
        self.assert_eq(absolute(psdf), absolute(pdf))
|
|
|
|
def test_iteritems(self):
    # `iteritems()` must yield the same (name, Series) pairs as pandas, in order.
    pdf = pd.DataFrame(
        {"species": ["bear", "bear", "marsupial"], "population": [1864, 22000, 80000]},
        index=["panda", "polar", "koala"],
        columns=["species", "population"],
    )
    psdf = ps.from_pandas(pdf)

    for pandas_pair, ps_pair in zip(pdf.iteritems(), psdf.iteritems()):
        pandas_name, pandas_column = pandas_pair
        ps_name, ps_column = ps_pair
        self.assert_eq(pandas_name, ps_name)
        self.assert_eq(pandas_column, ps_column)
|
|
|
|
def test_tail(self):
    # Compare DataFrame.tail with pandas for the default and several explicit row
    # counts, on the frame itself and on `frame + 1`; a string count raises TypeError.
    pdf = pd.DataFrame({"x": range(1000)})
    psdf = ps.from_pandas(pdf)

    # Positional arguments for each `tail` call; () exercises the default.
    tail_arguments = [(), (10,), (-990,), (0,), (-1001,), (1001,)]

    for args in tail_arguments:
        self.assert_eq(pdf.tail(*args), psdf.tail(*args))
    for args in tail_arguments:
        self.assert_eq((pdf + 1).tail(*args), (psdf + 1).tail(*args))

    with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"):
        psdf.tail("10")
|
|
|
|
def test_last_valid_index(self):
    # `last_valid_index` must match pandas for a frame ending in a null row, for a
    # selection with no columns, with MultiIndex columns and for an empty frame.
    def check(expected_frame, actual_frame):
        self.assert_eq(expected_frame.last_valid_index(), actual_frame.last_valid_index())

    pdf = pd.DataFrame(
        {"a": [1, 2, 3, None], "b": [1.0, 2.0, 3.0, None], "c": [100, 200, 400, None]},
        index=["Q", "W", "E", "R"],
    )
    psdf = ps.from_pandas(pdf)
    check(pdf, psdf)
    check(pdf[[]], psdf[[]])

    # MultiIndex columns
    pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    psdf = ps.from_pandas(pdf)
    check(pdf, psdf)

    # Empty DataFrame
    pdf = pd.Series([]).to_frame()
    psdf = ps.Series([]).to_frame()
    check(pdf, psdf)
|
|
|
|
def test_last(self):
    # `last` must match pandas for a string offset and a DateOffset, and raise
    # TypeError when the index is not a DatetimeIndex.
    dates = pd.date_range("2018-04-09", periods=4, freq="2D")
    pdf = pd.DataFrame([1, 2, 3, 4], index=dates)
    psdf = ps.from_pandas(pdf)

    for offset in ("1D", DateOffset(days=1)):
        self.assert_eq(pdf.last(offset), psdf.last(offset))

    without_dates = ps.DataFrame([1, 2, 3, 4])
    with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
        without_dates.last("1D")
|
|
|
|
def test_first(self):
    # `first` must match pandas for a string offset and a DateOffset, and raise
    # TypeError when the index is not a DatetimeIndex.
    dates = pd.date_range("2018-04-09", periods=4, freq="2D")
    pdf = pd.DataFrame([1, 2, 3, 4], index=dates)
    psdf = ps.from_pandas(pdf)

    for offset in ("1D", DateOffset(days=1)):
        self.assert_eq(pdf.first(offset), psdf.first(offset))

    without_dates = ps.DataFrame([1, 2, 3, 4])
    with self.assertRaisesRegex(TypeError, "'first' only supports a DatetimeIndex"):
        without_dates.first("1D")
|
|
|
|
def test_first_valid_index(self):
    # `first_valid_index` must match pandas for a frame starting with a null row, for
    # a selection with no columns, with MultiIndex columns, for an empty frame and
    # with a datetime index.
    def check(expected_frame, actual_frame):
        self.assert_eq(expected_frame.first_valid_index(), actual_frame.first_valid_index())

    def leading_null_data():
        return {"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]}

    pdf = pd.DataFrame(leading_null_data(), index=["Q", "W", "E", "R"])
    psdf = ps.from_pandas(pdf)
    check(pdf, psdf)
    check(pdf[[]], psdf[[]])

    # MultiIndex columns
    pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    psdf = ps.from_pandas(pdf)
    check(pdf, psdf)

    # Empty DataFrame
    pdf = pd.Series([]).to_frame()
    psdf = ps.Series([]).to_frame()
    check(pdf, psdf)

    # Datetime index
    month_starts = [datetime(2021, month, 1) for month in (1, 2, 3, 4)]
    pdf = pd.DataFrame(leading_null_data(), index=month_starts)
    psdf = ps.from_pandas(pdf)
    check(pdf, psdf)
|
|
|
|
def test_product(self):
    # Compare DataFrame.prod with pandas for three frames (with numeric columns, with
    # no numeric columns, with an all-NaN column), each under four column labelings:
    # plain, named, MultiIndex and named MultiIndex.
    def check(pdf, **assert_kwargs):
        psdf = ps.from_pandas(pdf)
        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), **assert_kwargs)

    def check_labelings(pdf, column_tuples, plain_kwargs, relabeled_kwargs):
        # Plain columns
        check(pdf, **plain_kwargs)
        # Named columns
        pdf.columns.name = "Koalas"
        check(pdf, **relabeled_kwargs)
        # MultiIndex columns
        pdf.columns = pd.MultiIndex.from_tuples(column_tuples)
        check(pdf, **relabeled_kwargs)
        # Named MultiIndex columns
        pdf.columns.names = ["Hello", "Koalas"]
        check(pdf, **relabeled_kwargs)

    three_levels = [("a", "x"), ("b", "y"), ("c", "z")]
    two_levels = [("a", "x"), ("b", "y")]

    # Numeric columns plus one string column
    with_numbers = pd.DataFrame(
        {"A": [1, 2, 3, 4, 5], "B": [10, 20, 30, 40, 50], "C": ["a", "b", "c", "d", "e"]}
    )
    check_labelings(with_numbers, three_levels, {}, {})

    # No numeric columns
    only_strings = pd.DataFrame({"key": ["a", "b", "c"], "val": ["x", "y", "z"]})
    check_labelings(only_strings, two_levels, {}, {"almost": True})

    # All NaN columns
    with_all_nan = pd.DataFrame(
        {
            "A": [np.nan, np.nan, np.nan, np.nan, np.nan],
            "B": [10, 20, 30, 40, 50],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    check_labelings(
        with_all_nan, three_levels, {"check_exact": False}, {"check_exact": False}
    )
|
|
|
|
def test_from_dict(self):
    # `DataFrame.from_dict` must match pandas with default options, with an explicit
    # dtype, and with orient="index" plus column names.
    data = {"row_1": [3, 2, 1, 0], "row_2": [10, 20, 30, 40]}

    option_sets = [
        {},
        {"dtype": "int8"},
        {"orient": "index", "columns": ["A", "B", "C", "D"]},
    ]
    for options in option_sets:
        pdf = pd.DataFrame.from_dict(data, **options)
        psdf = ps.DataFrame.from_dict(data, **options)
        self.assert_eq(pdf, psdf)
|
|
|
|
def test_pad(self):
    # `pad` (copying and in-place) must match pandas from version 1.1 on; for older
    # pandas it is compared against a hand-written expected frame instead.
    column_order = ["A", "B", "C", "D"]
    pdf = pd.DataFrame(
        {
            "A": [None, 3, None, None],
            "B": [2, 4, None, 3],
            "C": [None, None, None, 1],
            "D": [0, 1, 5, 4],
        },
        columns=column_order,
    )
    psdf = ps.from_pandas(pdf)

    if LooseVersion(pd.__version__) < LooseVersion("1.1"):
        expected = ps.DataFrame(
            {
                "A": [None, 3, 3, 3],
                "B": [2.0, 4.0, 4.0, 3.0],
                "C": [None, None, None, 1],
                "D": [0, 1, 5, 4],
            },
            columns=["A", "B", "C", "D"],
        )
        self.assert_eq(expected, psdf.pad())

        # Test `inplace=True`
        psdf.pad(inplace=True)
        self.assert_eq(expected, psdf)
        return

    self.assert_eq(pdf.pad(), psdf.pad())

    # Test `inplace=True`
    pdf.pad(inplace=True)
    psdf.pad(inplace=True)
    self.assert_eq(pdf, psdf)
|
|
|
|
def test_backfill(self):
    # `backfill` (copying and in-place) must match pandas from version 1.1 on; for
    # older pandas it is compared against a hand-written expected frame instead.
    column_order = ["A", "B", "C", "D"]
    pdf = pd.DataFrame(
        {
            "A": [None, 3, None, None],
            "B": [2, 4, None, 3],
            "C": [None, None, None, 1],
            "D": [0, 1, 5, 4],
        },
        columns=column_order,
    )
    psdf = ps.from_pandas(pdf)

    if LooseVersion(pd.__version__) < LooseVersion("1.1"):
        expected = ps.DataFrame(
            {
                "A": [3.0, 3.0, None, None],
                "B": [2.0, 4.0, 3.0, 3.0],
                "C": [1.0, 1.0, 1.0, 1.0],
                "D": [0, 1, 5, 4],
            },
            columns=["A", "B", "C", "D"],
        )
        self.assert_eq(expected, psdf.backfill())

        # Test `inplace=True`
        psdf.backfill(inplace=True)
        self.assert_eq(expected, psdf)
        return

    self.assert_eq(pdf.backfill(), psdf.backfill())

    # Test `inplace=True`
    pdf.backfill(inplace=True)
    psdf.backfill(inplace=True)
    self.assert_eq(pdf, psdf)
|
|
|
|
def test_align(self):
    """Compare ``DataFrame.align`` on pandas-on-Spark with pandas.

    Exercises DataFrame/DataFrame alignment for every join mode and axis,
    DataFrame/Series alignment along ``axis=0``, the error cases, and
    alignment along ``axis=1`` of two frames whose indexes and columns
    only partly overlap.
    """
    join_modes = ["outer", "inner", "left", "right"]
    pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=[10, 20, 30])
    psdf1 = ps.from_pandas(pdf1)

    # Each pair selects the left and right operand; the same selectors are
    # applied to the pandas-on-Spark frame and to the pandas frame.
    frame_pairs = [
        (lambda df: df, lambda df: df[["b"]]),
        (lambda df: df[["a"]], lambda df: df[["b", "a"]]),
        (lambda df: df[["b", "a"]], lambda df: df[["a"]]),
    ]
    for join in join_modes:
        for axis in [None, 0, 1]:
            for pick_left, pick_right in frame_pairs:
                psdf_l, psdf_r = pick_left(psdf1).align(
                    pick_right(psdf1), join=join, axis=axis
                )
                pdf_l, pdf_r = pick_left(pdf1).align(pick_right(pdf1), join=join, axis=axis)
                self.assert_eq(psdf_l, pdf_l)
                self.assert_eq(psdf_r, pdf_r)

    # DataFrame aligned with a Series along axis=0.
    for pick_left in (lambda df: df, lambda df: df[["a"]]):
        psdf_l, psser_b = pick_left(psdf1).align(psdf1["b"], axis=0)
        pdf_l, pser_b = pick_left(pdf1).align(pdf1["b"], axis=0)
        self.assert_eq(psdf_l, pdf_l)
        self.assert_eq(psser_b, pser_b)

    # Calls that are expected to be rejected, with the error type asserted.
    rejected_calls = [
        (ValueError, lambda: psdf1.align(psdf1, join="unknown")),
        (ValueError, lambda: psdf1.align(psdf1["b"])),
        (NotImplementedError, lambda: psdf1.align(psdf1["b"], axis=1)),
    ]
    for error_type, call in rejected_calls:
        self.assertRaises(error_type, call)

    pdf2 = pd.DataFrame({"a": [4, 5, 6], "d": ["d", "e", "f"]}, index=[10, 11, 12])
    psdf2 = ps.from_pandas(pdf2)

    # Results are sorted by index on both sides before being compared.
    for join in join_modes:
        psdf_l, psdf_r = psdf1.align(psdf2, join=join, axis=1)
        pdf_l, pdf_r = pdf1.align(pdf2, join=join, axis=1)
        self.assert_eq(psdf_l.sort_index(), pdf_l.sort_index())
        self.assert_eq(psdf_r.sort_index(), pdf_r.sort_index())
|
|
|
|
def test_between_time(self):
    """Compare ``DataFrame.between_time`` on pandas-on-Spark with pandas.

    The same window ("0:15" to "0:45") is checked while the index name and
    the column labels of the pandas frame are changed step by step, then
    the ``axis=1`` and non-datetime-index error cases are asserted.
    """
    window_start, window_end = "0:15", "0:45"

    def check_against_pandas(frame):
        # Convert the current pandas frame, compare the index-sorted
        # selections, and return the converted frame for later use.
        converted = ps.from_pandas(frame)
        self.assert_eq(
            frame.between_time(window_start, window_end).sort_index(),
            converted.between_time(window_start, window_end).sort_index(),
        )
        return converted

    pdf = pd.DataFrame(
        {"A": [1, 2, 3, 4]},
        index=pd.date_range("2018-04-09", periods=4, freq="1D20min"),
    )
    check_against_pandas(pdf)

    pdf.index.name = "ts"
    check_against_pandas(pdf)

    # Column label is 'index'
    pdf.columns = pd.Index(["index"])
    check_against_pandas(pdf)

    # Both index name and column label are 'index'
    pdf.index.name = "index"
    check_against_pandas(pdf)

    # Index name is 'index', column label is ('X', 'A')
    pdf.columns = pd.MultiIndex.from_arrays([["X"], ["A"]])
    psdf = check_against_pandas(pdf)

    with self.assertRaisesRegex(
        NotImplementedError, "between_time currently only works for axis=0"
    ):
        psdf.between_time("0:15", "0:45", axis=1)

    # A frame built without a datetime index must be rejected.
    psdf = ps.DataFrame({"A": [1, 2, 3, 4]})
    with self.assertRaisesRegex(TypeError, "Index must be DatetimeIndex"):
        psdf.between_time("0:15", "0:45")
|
|
|
|
def test_at_time(self):
    """Compare ``DataFrame.at_time`` on pandas-on-Spark with pandas.

    The selection is checked while the index name and the column labels
    of the pandas frame are changed step by step. Afterwards the test
    asserts that ``asof=True`` and ``axis=1`` raise ``NotImplementedError``
    and that a frame without a datetime index raises ``TypeError``.
    """
    idx = pd.date_range("2018-04-09", periods=4, freq="1D20min")
    pdf = pd.DataFrame({"A": [1, 2, 3, 4]}, index=idx)
    psdf = ps.from_pandas(pdf)
    self.assert_eq(
        pdf.at_time("0:20").sort_index(),
        psdf.at_time("0:20").sort_index(),
    )

    # Index name is 'ts'
    pdf.index.name = "ts"
    psdf = ps.from_pandas(pdf)
    self.assert_eq(
        pdf.at_time("0:20").sort_index(),
        psdf.at_time("0:20").sort_index(),
    )

    # Index name is 'ts', column label is 'index'
    pdf.columns = pd.Index(["index"])
    psdf = ps.from_pandas(pdf)
    self.assert_eq(
        pdf.at_time("0:40").sort_index(),
        psdf.at_time("0:40").sort_index(),
    )

    # Both index name and column label are 'index'
    pdf.index.name = "index"
    psdf = ps.from_pandas(pdf)
    self.assert_eq(
        pdf.at_time("0:40").sort_index(),
        psdf.at_time("0:40").sort_index(),
    )

    # Index name is 'index', column label is ('X', 'A')
    pdf.columns = pd.MultiIndex.from_arrays([["X"], ["A"]])
    psdf = ps.from_pandas(pdf)
    self.assert_eq(
        pdf.at_time("0:40").sort_index(),
        psdf.at_time("0:40").sort_index(),
    )

    with self.assertRaisesRegex(NotImplementedError, "'asof' argument is not supported"):
        psdf.at_time("0:15", asof=True)

    with self.assertRaisesRegex(NotImplementedError, "at_time currently only works for axis=0"):
        psdf.at_time("0:15", axis=1)

    # A frame built without a datetime index must be rejected.
    psdf = ps.DataFrame({"A": [1, 2, 3, 4]})
    with self.assertRaisesRegex(TypeError, "Index must be DatetimeIndex"):
        psdf.at_time("0:15")
|
|
|
|
|
|
if __name__ == "__main__":
    from pyspark.pandas.tests.test_dataframe import *  # noqa: F401

    # xmlrunner is optional: when it cannot be imported the runner stays
    # None and unittest.main is called without a custom runner.
    testRunner = None
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
|