6497ac3585
### What changes were proposed in this pull request?

Adds more type annotations in the file `python/pyspark/pandas/frame.py` and fixes the mypy check failures.

### Why are the changes needed?

We should enable more disallow_untyped_defs mypy checks.

### Does this PR introduce _any_ user-facing change?

Yes. This PR adds more type annotations in pandas APIs on Spark module, which can impact interaction with development tools for users.

### How was this patch tested?

The mypy check with a new configuration and existing tests should pass.

Closes #33073 from ueshin/issues/SPARK-35471/disallow_untyped_defs_frame.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
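As a rough sketch of what enabling `disallow_untyped_defs` means for a module (hypothetical code, not the actual diff of this PR): once the flag is turned on for a `pyspark.pandas` module, mypy rejects any function that is missing parameter or return annotations, so previously untyped defs have to be annotated explicitly.

```python
# Hypothetical before/after -- the method and helper names are made up purely for illustration.

# Before: rejected by mypy once disallow_untyped_defs is enabled for the module.
def nlargest_value(self, column):
    return self._pick(column)

# After: parameters and the return type are annotated, so the stricter check passes.
def nlargest_value(self, column: str) -> float:
    return self._pick(column)
```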
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A wrapper class for Spark Column to behave similar to pandas Series.
"""
import datetime
import re
import inspect
import sys
from collections.abc import Mapping
from functools import partial, wraps, reduce
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    IO,
    Iterable,
    List,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
    Union,
    cast,
    no_type_check,
    overload,
    TYPE_CHECKING,
)

import numpy as np
import pandas as pd
from pandas.core.accessor import CachedAccessor
from pandas.io.formats.printing import pprint_thing
from pandas.api.types import is_list_like, is_hashable
from pandas.api.extensions import ExtensionDtype
from pandas.tseries.frequencies import DateOffset
from pyspark.sql import functions as F, Column, DataFrame as SparkDataFrame
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DataType,
    DoubleType,
    FloatType,
    IntegerType,
    IntegralType,
    LongType,
    NumericType,
    Row,
    StructType,
)
from pyspark.sql.window import Window

from pyspark import pandas as ps  # For running doctests and reference resolution in PyCharm.
from pyspark.pandas.accessors import PandasOnSparkSeriesMethods
from pyspark.pandas.categorical import CategoricalAccessor
from pyspark.pandas.config import get_option
from pyspark.pandas.base import IndexOpsMixin
from pyspark.pandas.exceptions import SparkPandasIndexingError
from pyspark.pandas.frame import DataFrame
from pyspark.pandas.generic import Frame
from pyspark.pandas.internal import (
    InternalField,
    InternalFrame,
    DEFAULT_SERIES_NAME,
    NATURAL_ORDER_COLUMN_NAME,
    SPARK_DEFAULT_INDEX_NAME,
    SPARK_DEFAULT_SERIES_NAME,
)
from pyspark.pandas.missing.series import MissingPandasLikeSeries
from pyspark.pandas.plot import PandasOnSparkPlotAccessor
from pyspark.pandas.ml import corr
from pyspark.pandas.utils import (
    combine_frames,
    is_name_like_tuple,
    is_name_like_value,
    name_like_string,
    same_anchor,
    scol_for,
    sql_conf,
    validate_arguments_and_invoke_function,
    validate_axis,
    validate_bool_kwarg,
    verify_temp_column_name,
    SPARK_CONF_ARROW_ENABLED,
)
from pyspark.pandas.datetimes import DatetimeMethods
from pyspark.pandas.spark.accessors import SparkSeriesMethods
from pyspark.pandas.strings import StringMethods
from pyspark.pandas.typedef import (
    infer_return_type,
    spark_type_to_pandas_dtype,
    Dtype,
    ScalarType,
    Scalar,
    SeriesType,
)

if TYPE_CHECKING:
    from pyspark.pandas.groupby import SeriesGroupBy  # noqa: F401 (SPARK-34943)
    from pyspark.pandas.indexes import Index  # noqa: F401 (SPARK-34943)

# This regular expression pattern is compiled and defined here to avoid compiling the same
# pattern every time it is used in __repr__ in Series.
# This pattern basically seeks the footer string from pandas'
REPR_PATTERN = re.compile(r"Length: (?P<length>[0-9]+)")

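# Illustrative example (not in the original source): on a truncated repr footer such as
# "Name: x, Length: 25, dtype: int64", REPR_PATTERN.search(...) captures "25" in the
# "length" group.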
_flex_doc_SERIES = """
Return {desc} of series and other, element-wise (binary operator `{op_name}`).

Equivalent to ``{equiv}``

Parameters
----------
other : Series or scalar value

Returns
-------
Series
    The result of the operation.

See Also
--------
Series.{reverse}

{series_examples}
"""

_add_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.add(df.b)
a    4.0
b    NaN
c    6.0
d    NaN
dtype: float64

>>> df.a.radd(df.b)
a    4.0
b    NaN
c    6.0
d    NaN
dtype: float64
"""

_sub_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.subtract(df.b)
a    0.0
b    NaN
c    2.0
d    NaN
dtype: float64

>>> df.a.rsub(df.b)
a    0.0
b    NaN
c   -2.0
d    NaN
dtype: float64
"""

_mul_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.multiply(df.b)
a    4.0
b    NaN
c    8.0
d    NaN
dtype: float64

>>> df.a.rmul(df.b)
a    4.0
b    NaN
c    8.0
d    NaN
dtype: float64
"""

_div_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.divide(df.b)
a    1.0
b    NaN
c    2.0
d    NaN
dtype: float64

>>> df.a.rdiv(df.b)
a    1.0
b    NaN
c    0.5
d    NaN
dtype: float64
"""

_pow_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.pow(df.b)
a     4.0
b     NaN
c    16.0
d     NaN
dtype: float64

>>> df.a.rpow(df.b)
a     4.0
b     NaN
c    16.0
d     NaN
dtype: float64
"""

_mod_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.mod(df.b)
a    0.0
b    NaN
c    0.0
d    NaN
dtype: float64

>>> df.a.rmod(df.b)
a    0.0
b    NaN
c    2.0
d    NaN
dtype: float64
"""

_floordiv_example_SERIES = """
Examples
--------
>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],
...                    'b': [2, np.nan, 2, np.nan]},
...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
>>> df
     a    b
a  2.0  2.0
b  2.0  NaN
c  4.0  2.0
d  NaN  NaN

>>> df.a.floordiv(df.b)
a    1.0
b    NaN
c    2.0
d    NaN
dtype: float64

>>> df.a.rfloordiv(df.b)
a    1.0
b    NaN
c    0.0
d    NaN
dtype: float64
"""

T = TypeVar("T")

# Needed to disambiguate Series.str and str type
str_type = str


def _create_type_for_series_type(param: Any) -> Type[SeriesType]:
    from pyspark.pandas.typedef import NameTypeHolder

    if isinstance(param, ExtensionDtype):
        new_class = type("NameType", (NameTypeHolder,), {})  # type: Type[NameTypeHolder]
        new_class.tpe = param
    else:
        new_class = param.type if isinstance(param, np.dtype) else param

    return SeriesType[new_class]  # type: ignore


if (3, 5) <= sys.version_info < (3, 7) and __name__ != "__main__":
    from typing import GenericMeta  # type: ignore

    old_getitem = GenericMeta.__getitem__  # type: ignore

    @no_type_check
    def new_getitem(self, params):
        if hasattr(self, "is_series"):
            return old_getitem(self, _create_type_for_series_type(params))
        else:
            return old_getitem(self, params)

    GenericMeta.__getitem__ = new_getitem  # type: ignore

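# Note: Series subclasses Generic[T] below so that it can be subscripted in return-type
# annotations. Together with the GenericMeta patch above (Python 3.5/3.6), this is what
# lets infer_return_type() in pyspark.pandas.typedef understand annotations on
# user-defined functions, for example (hypothetical function, shown only to illustrate
# the annotation style):
#
#     def doubled(s) -> ps.Series[np.float64]:
#         return s * 2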
class Series(Frame, IndexOpsMixin, Generic[T]):
    """
    pandas-on-Spark Series that corresponds to pandas Series logically. This holds Spark Column
    internally.

    :ivar _internal: an internal immutable Frame to manage metadata.
    :type _internal: InternalFrame
    :ivar _psdf: Parent's pandas-on-Spark DataFrame
    :type _psdf: ps.DataFrame

    Parameters
    ----------
    data : array-like, dict, or scalar value, pandas Series
        Contains data stored in Series
        If data is a dict, argument order is maintained for Python 3.6
        and later.
        Note that if `data` is a pandas Series, other arguments should not be used.
    index : array-like or Index (1d)
        Values must be hashable and have the same length as `data`.
        Non-unique index values are allowed. Will default to
        RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index
        sequence are used, the index will override the keys found in the
        dict.
    dtype : numpy.dtype or None
        If None, dtype will be inferred
    copy : boolean, default False
        Copy input data
    """

    @no_type_check
    def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False):
        assert data is not None

        if isinstance(data, DataFrame):
            assert dtype is None
            assert name is None
            assert not copy
            assert not fastpath

            self._anchor = data  # type: DataFrame
            self._col_label = index  # type: Tuple
        else:
            if isinstance(data, pd.Series):
                assert index is None
                assert dtype is None
                assert name is None
                assert not copy
                assert not fastpath
                s = data
            else:
                s = pd.Series(
                    data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
                )
            internal = InternalFrame.from_pandas(pd.DataFrame(s))
            if s.name is None:
                internal = internal.copy(column_labels=[None])
            anchor = DataFrame(internal)

            self._anchor = anchor
            self._col_label = anchor._internal.column_labels[0]
            object.__setattr__(anchor, "_psseries", {self._column_label: self})

    @property
    def _psdf(self) -> DataFrame:
        return self._anchor

    @property
    def _internal(self) -> InternalFrame:
        return self._psdf._internal.select_column(self._column_label)

    @property
    def _column_label(self) -> Optional[Tuple]:
        return self._col_label

    def _update_anchor(self, psdf: DataFrame) -> None:
        assert psdf._internal.column_labels == [self._column_label], (
            psdf._internal.column_labels,
            [self._column_label],
        )
        self._anchor = psdf
        object.__setattr__(psdf, "_psseries", {self._column_label: self})

    def _with_new_scol(self, scol: Column, *, field: Optional[InternalField] = None) -> "Series":
        """
        Copy pandas-on-Spark Series with the new Spark Column.

        :param scol: the new Spark Column
        :return: the copied Series
        """
        name = name_like_string(self._column_label)
        internal = self._internal.copy(
            data_spark_columns=[scol.alias(name)],
            data_fields=[
                field if field is None or field.struct_field is None else field.copy(name=name)
            ],
        )
        return first_series(DataFrame(internal))

    spark = CachedAccessor("spark", SparkSeriesMethods)

    @property
    def dtypes(self) -> Dtype:
        """Return the dtype object of the underlying data.

        >>> s = ps.Series(list('abc'))
        >>> s.dtype == s.dtypes
        True
        """
        return self.dtype

    @property
    def axes(self) -> List["Index"]:
        """
        Return a list of the row axis labels.

        Examples
        --------

        >>> psser = ps.Series([1, 2, 3])
        >>> psser.axes
        [Int64Index([0, 1, 2], dtype='int64')]
        """
        return [self.index]

    # Arithmetic Operators
    def add(self, other: Any) -> "Series":
        return self + other

    add.__doc__ = _flex_doc_SERIES.format(
        desc="Addition",
        op_name="+",
        equiv="series + other",
        reverse="radd",
        series_examples=_add_example_SERIES,
    )

    def radd(self, other: Any) -> "Series":
        return other + self

    radd.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Addition",
        op_name="+",
        equiv="other + series",
        reverse="add",
        series_examples=_add_example_SERIES,
    )

    def div(self, other: Any) -> "Series":
        return self / other

    div.__doc__ = _flex_doc_SERIES.format(
        desc="Floating division",
        op_name="/",
        equiv="series / other",
        reverse="rdiv",
        series_examples=_div_example_SERIES,
    )

    divide = div

    def rdiv(self, other: Any) -> "Series":
        return other / self

    rdiv.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Floating division",
        op_name="/",
        equiv="other / series",
        reverse="div",
        series_examples=_div_example_SERIES,
    )

    def truediv(self, other: Any) -> "Series":
        return self / other

    truediv.__doc__ = _flex_doc_SERIES.format(
        desc="Floating division",
        op_name="/",
        equiv="series / other",
        reverse="rtruediv",
        series_examples=_div_example_SERIES,
    )

    def rtruediv(self, other: Any) -> "Series":
        return other / self

    rtruediv.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Floating division",
        op_name="/",
        equiv="other / series",
        reverse="truediv",
        series_examples=_div_example_SERIES,
    )

    def mul(self, other: Any) -> "Series":
        return self * other

    mul.__doc__ = _flex_doc_SERIES.format(
        desc="Multiplication",
        op_name="*",
        equiv="series * other",
        reverse="rmul",
        series_examples=_mul_example_SERIES,
    )

    multiply = mul

    def rmul(self, other: Any) -> "Series":
        return other * self

    rmul.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Multiplication",
        op_name="*",
        equiv="other * series",
        reverse="mul",
        series_examples=_mul_example_SERIES,
    )

    def sub(self, other: Any) -> "Series":
        return self - other

    sub.__doc__ = _flex_doc_SERIES.format(
        desc="Subtraction",
        op_name="-",
        equiv="series - other",
        reverse="rsub",
        series_examples=_sub_example_SERIES,
    )

    subtract = sub

    def rsub(self, other: Any) -> "Series":
        return other - self

    rsub.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Subtraction",
        op_name="-",
        equiv="other - series",
        reverse="sub",
        series_examples=_sub_example_SERIES,
    )

    def mod(self, other: Any) -> "Series":
        return self % other

    mod.__doc__ = _flex_doc_SERIES.format(
        desc="Modulo",
        op_name="%",
        equiv="series % other",
        reverse="rmod",
        series_examples=_mod_example_SERIES,
    )

    def rmod(self, other: Any) -> "Series":
        return other % self

    rmod.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Modulo",
        op_name="%",
        equiv="other % series",
        reverse="mod",
        series_examples=_mod_example_SERIES,
    )

    def pow(self, other: Any) -> "Series":
        return self ** other

    pow.__doc__ = _flex_doc_SERIES.format(
        desc="Exponential power of series",
        op_name="**",
        equiv="series ** other",
        reverse="rpow",
        series_examples=_pow_example_SERIES,
    )

    def rpow(self, other: Any) -> "Series":
        return other ** self

    rpow.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Exponential power",
        op_name="**",
        equiv="other ** series",
        reverse="pow",
        series_examples=_pow_example_SERIES,
    )

    def floordiv(self, other: Any) -> "Series":
        return self // other

    floordiv.__doc__ = _flex_doc_SERIES.format(
        desc="Integer division",
        op_name="//",
        equiv="series // other",
        reverse="rfloordiv",
        series_examples=_floordiv_example_SERIES,
    )

    def rfloordiv(self, other: Any) -> "Series":
        return other // self

    rfloordiv.__doc__ = _flex_doc_SERIES.format(
        desc="Reverse Integer division",
        op_name="//",
        equiv="other // series",
        reverse="floordiv",
        series_examples=_floordiv_example_SERIES,
    )

    # create accessor for pandas-on-Spark specific methods.
    pandas_on_spark = CachedAccessor("pandas_on_spark", PandasOnSparkSeriesMethods)

    # keep the name "koalas" for backward compatibility.
    koalas = CachedAccessor("koalas", PandasOnSparkSeriesMethods)
|
||
|
||
# Comparison Operators
|
||
def eq(self, other: Any) -> bool:
|
||
"""
|
||
Compare if the current value is equal to the other.
|
||
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4],
|
||
... 'b': [1, np.nan, 1, np.nan]},
|
||
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
|
||
|
||
>>> df.a == 1
|
||
a True
|
||
b False
|
||
c False
|
||
d False
|
||
Name: a, dtype: bool
|
||
|
||
>>> df.b.eq(1)
|
||
a True
|
||
b False
|
||
c True
|
||
d False
|
||
Name: b, dtype: bool
|
||
"""
|
||
return self == other
|
||
|
||
equals = eq
|
||
|
||
def gt(self, other: Any) -> "Series":
|
||
"""
|
||
Compare if the current value is greater than the other.
|
||
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4],
|
||
... 'b': [1, np.nan, 1, np.nan]},
|
||
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
|
||
|
||
>>> df.a > 1
|
||
a False
|
||
b True
|
||
c True
|
||
d True
|
||
Name: a, dtype: bool
|
||
|
||
>>> df.b.gt(1)
|
||
a False
|
||
b False
|
||
c False
|
||
d False
|
||
Name: b, dtype: bool
|
||
"""
|
||
return self > other
|
||
|
||
def ge(self, other: Any) -> "Series":
|
||
"""
|
||
Compare if the current value is greater than or equal to the other.
|
||
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4],
|
||
... 'b': [1, np.nan, 1, np.nan]},
|
||
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
|
||
|
||
>>> df.a >= 2
|
||
a False
|
||
b True
|
||
c True
|
||
d True
|
||
Name: a, dtype: bool
|
||
|
||
>>> df.b.ge(2)
|
||
a False
|
||
b False
|
||
c False
|
||
d False
|
||
Name: b, dtype: bool
|
||
"""
|
||
return self >= other
|
||
|
||
def lt(self, other: Any) -> "Series":
|
||
"""
|
||
Compare if the current value is less than the other.
|
||
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4],
|
||
... 'b': [1, np.nan, 1, np.nan]},
|
||
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
|
||
|
||
>>> df.a < 1
|
||
a False
|
||
b False
|
||
c False
|
||
d False
|
||
Name: a, dtype: bool
|
||
|
||
>>> df.b.lt(2)
|
||
a True
|
||
b False
|
||
c True
|
||
d False
|
||
Name: b, dtype: bool
|
||
"""
|
||
return self < other
|
||
|
||
def le(self, other: Any) -> "Series":
|
||
"""
|
||
Compare if the current value is less than or equal to the other.
|
||
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4],
|
||
... 'b': [1, np.nan, 1, np.nan]},
|
||
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
|
||
|
||
>>> df.a <= 2
|
||
a True
|
||
b True
|
||
c False
|
||
d False
|
||
Name: a, dtype: bool
|
||
|
||
>>> df.b.le(2)
|
||
a True
|
||
b False
|
||
c True
|
||
d False
|
||
Name: b, dtype: bool
|
||
"""
|
||
return self <= other
|
||
|
||
def ne(self, other: Any) -> "Series":
|
||
"""
|
||
Compare if the current value is not equal to the other.
|
||
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4],
|
||
... 'b': [1, np.nan, 1, np.nan]},
|
||
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])
|
||
|
||
>>> df.a != 1
|
||
a False
|
||
b True
|
||
c True
|
||
d True
|
||
Name: a, dtype: bool
|
||
|
||
>>> df.b.ne(1)
|
||
a False
|
||
b True
|
||
c False
|
||
d True
|
||
Name: b, dtype: bool
|
||
"""
|
||
return self != other
|
||
|
||
def divmod(self, other: Any) -> Tuple["Series", "Series"]:
|
||
"""
|
||
Return Integer division and modulo of series and other, element-wise
|
||
(binary operator `divmod`).
|
||
|
||
Parameters
|
||
----------
|
||
other : Series or scalar value
|
||
|
||
Returns
|
||
-------
|
||
2-Tuple of Series
|
||
The result of the operation.
|
||
|
||
See Also
|
||
--------
|
||
Series.rdivmod
|
||
"""
|
||
return self.floordiv(other), self.mod(other)
|
||
|
||
def rdivmod(self, other: Any) -> Tuple["Series", "Series"]:
|
||
"""
|
||
Return Integer division and modulo of series and other, element-wise
|
||
(binary operator `rdivmod`).
|
||
|
||
Parameters
|
||
----------
|
||
other : Series or scalar value
|
||
|
||
Returns
|
||
-------
|
||
2-Tuple of Series
|
||
The result of the operation.
|
||
|
||
See Also
|
||
--------
|
||
Series.divmod
|
||
"""
|
||
return self.rfloordiv(other), self.rmod(other)
|
||
|
||
def between(self, left: Any, right: Any, inclusive: bool = True) -> "Series":
|
||
"""
|
||
Return boolean Series equivalent to left <= series <= right.
|
||
This function returns a boolean vector containing `True` wherever the
|
||
corresponding Series element is between the boundary values `left` and
|
||
`right`. NA values are treated as `False`.
|
||
|
||
Parameters
|
||
----------
|
||
left : scalar or list-like
|
||
Left boundary.
|
||
right : scalar or list-like
|
||
Right boundary.
|
||
inclusive : bool, default True
|
||
Include boundaries.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series representing whether each element is between left and
|
||
right (inclusive).
|
||
|
||
See Also
|
||
--------
|
||
Series.gt : Greater than of series and other.
|
||
Series.lt : Less than of series and other.
|
||
|
||
Notes
|
||
-----
|
||
This function is equivalent to ``(left <= ser) & (ser <= right)``
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([2, 0, 4, 8, np.nan])
|
||
|
||
Boundary values are included by default:
|
||
|
||
>>> s.between(1, 4)
|
||
0 True
|
||
1 False
|
||
2 True
|
||
3 False
|
||
4 False
|
||
dtype: bool
|
||
|
||
With `inclusive` set to ``False`` boundary values are excluded:
|
||
|
||
>>> s.between(1, 4, inclusive=False)
|
||
0 True
|
||
1 False
|
||
2 False
|
||
3 False
|
||
4 False
|
||
dtype: bool
|
||
|
||
`left` and `right` can be any scalar value:
|
||
|
||
>>> s = ps.Series(['Alice', 'Bob', 'Carol', 'Eve'])
|
||
>>> s.between('Anna', 'Daniel')
|
||
0 False
|
||
1 True
|
||
2 True
|
||
3 False
|
||
dtype: bool
|
||
"""
|
||
if inclusive:
|
||
lmask = self >= left
|
||
rmask = self <= right
|
||
else:
|
||
lmask = self > left
|
||
rmask = self < right
|
||
|
||
return lmask & rmask
|
||
|
||
# TODO: arg should support Series
|
||
# TODO: NaN and None
|
||
def map(self, arg: Union[Dict, Callable]) -> "Series":
|
||
"""
|
||
Map values of Series according to input correspondence.
|
||
|
||
        Used for substituting each value in a Series with another value,
        which may be derived from a function or a ``dict``.

        .. note:: make sure the size of the dictionary is not huge because it could
            degrade performance or throw an OutOfMemoryError due to a huge
            expression within Spark. Consider using a function as the input
            instead in this case.
|
||
|
||
Parameters
|
||
----------
|
||
arg : function or dict
|
||
Mapping correspondence.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Same index as caller.
|
||
|
||
See Also
|
||
--------
|
||
Series.apply : For applying more complex functions on a Series.
|
||
DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
|
||
|
||
Notes
|
||
-----
|
||
When ``arg`` is a dictionary, values in Series that are not in the
|
||
dictionary (as keys) are converted to ``None``. However, if the
|
||
dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
|
||
provides a method for default values), then this default is used
|
||
rather than ``None``.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(['cat', 'dog', None, 'rabbit'])
|
||
>>> s
|
||
0 cat
|
||
1 dog
|
||
2 None
|
||
3 rabbit
|
||
dtype: object
|
||
|
||
``map`` accepts a ``dict``. Values that are not found
|
||
in the ``dict`` are converted to ``None``, unless the dict has a default
|
||
value (e.g. ``defaultdict``):
|
||
|
||
>>> s.map({'cat': 'kitten', 'dog': 'puppy'})
|
||
0 kitten
|
||
1 puppy
|
||
2 None
|
||
3 None
|
||
dtype: object
|
||
|
||
It also accepts a function:
|
||
|
||
>>> def format(x) -> str:
|
||
... return 'I am a {}'.format(x)
|
||
|
||
>>> s.map(format)
|
||
0 I am a cat
|
||
1 I am a dog
|
||
2 I am a None
|
||
3 I am a rabbit
|
||
dtype: object
|
||
"""
|
||
if isinstance(arg, dict):
|
||
is_start = True
|
||
# In case dictionary is empty.
|
||
current = F.when(F.lit(False), F.lit(None).cast(self.spark.data_type))
|
||
|
||
for to_replace, value in arg.items():
|
||
if is_start:
|
||
current = F.when(self.spark.column == F.lit(to_replace), value)
|
||
is_start = False
|
||
else:
|
||
current = current.when(self.spark.column == F.lit(to_replace), value)
|
||
|
||
if hasattr(arg, "__missing__"):
|
||
tmp_val = arg[np._NoValue]
|
||
del arg[np._NoValue] # Remove in case it's set in defaultdict.
|
||
current = current.otherwise(F.lit(tmp_val))
|
||
else:
|
||
current = current.otherwise(F.lit(None).cast(self.spark.data_type))
|
||
return self._with_new_scol(current)
|
||
else:
|
||
return self.apply(arg)
|
||
|
||
@property
|
||
def shape(self) -> Tuple[int]:
|
||
"""Return a tuple of the shape of the underlying data."""
|
||
return (len(self),)
|
||
|
||
@property
|
||
def name(self) -> Union[Any, Tuple]:
|
||
"""Return name of the Series."""
|
||
name = self._column_label
|
||
if name is not None and len(name) == 1:
|
||
return name[0]
|
||
else:
|
||
return name
|
||
|
||
@name.setter
|
||
def name(self, name: Union[Any, Tuple]) -> None:
|
||
self.rename(name, inplace=True)
|
||
|
||
# TODO: Functionality and documentation should be matched. Currently, changing index labels
|
||
# taking dictionary and function to change index are not supported.
|
||
def rename(self, index: Optional[Union[Any, Tuple]] = None, **kwargs: Any) -> "Series":
|
||
"""
|
||
Alter Series name.
|
||
|
||
Parameters
|
||
----------
|
||
index : scalar
|
||
Scalar will alter the ``Series.name`` attribute.
|
||
|
||
inplace : bool, default False
|
||
Whether to return a new Series. If True then value of copy is
|
||
ignored.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with name altered.
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> s = ps.Series([1, 2, 3])
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 3
|
||
dtype: int64
|
||
|
||
>>> s.rename("my_name") # scalar, changes Series.name
|
||
0 1
|
||
1 2
|
||
2 3
|
||
Name: my_name, dtype: int64
|
||
"""
|
||
if index is None:
|
||
pass
|
||
elif not is_hashable(index):
|
||
raise TypeError("Series.name must be a hashable type")
|
||
elif not isinstance(index, tuple):
|
||
index = (index,)
|
||
name = name_like_string(index)
|
||
scol = self.spark.column.alias(name)
|
||
field = self._internal.data_fields[0].copy(name=name)
|
||
|
||
internal = self._internal.copy(
|
||
column_labels=[index],
|
||
data_spark_columns=[scol],
|
||
data_fields=[field],
|
||
column_label_names=None,
|
||
)
|
||
psdf = DataFrame(internal) # type: DataFrame
|
||
|
||
if kwargs.get("inplace", False):
|
||
self._col_label = index
|
||
self._update_anchor(psdf)
|
||
return self
|
||
else:
|
||
return first_series(psdf)
|
||
|
||
def rename_axis(
|
||
self, mapper: Optional[Any] = None, index: Optional[Any] = None, inplace: bool = False
|
||
) -> Optional["Series"]:
|
||
"""
|
||
Set the name of the axis for the index or columns.
|
||
|
||
Parameters
|
||
----------
|
||
mapper, index : scalar, list-like, dict-like or function, optional
|
||
A scalar, list-like, dict-like or functions transformations to
|
||
apply to the index values.
|
||
inplace : bool, default False
|
||
Modifies the object directly, instead of creating a new Series.
|
||
|
||
Returns
|
||
-------
|
||
Series, or None if `inplace` is True.
|
||
|
||
See Also
|
||
--------
|
||
Series.rename : Alter Series index labels or name.
|
||
DataFrame.rename : Alter DataFrame index labels or name.
|
||
Index.rename : Set new names on index.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(["dog", "cat", "monkey"], name="animal")
|
||
>>> s # doctest: +NORMALIZE_WHITESPACE
|
||
0 dog
|
||
1 cat
|
||
2 monkey
|
||
Name: animal, dtype: object
|
||
>>> s.rename_axis("index").sort_index() # doctest: +NORMALIZE_WHITESPACE
|
||
index
|
||
0 dog
|
||
1 cat
|
||
2 monkey
|
||
Name: animal, dtype: object
|
||
|
||
**MultiIndex**
|
||
|
||
>>> index = pd.MultiIndex.from_product([['mammal'],
|
||
... ['dog', 'cat', 'monkey']],
|
||
... names=['type', 'name'])
|
||
>>> s = ps.Series([4, 4, 2], index=index, name='num_legs')
|
||
>>> s # doctest: +NORMALIZE_WHITESPACE
|
||
type name
|
||
mammal dog 4
|
||
cat 4
|
||
monkey 2
|
||
Name: num_legs, dtype: int64
|
||
>>> s.rename_axis(index={'type': 'class'}).sort_index() # doctest: +NORMALIZE_WHITESPACE
|
||
class name
|
||
mammal cat 4
|
||
dog 4
|
||
monkey 2
|
||
Name: num_legs, dtype: int64
|
||
>>> s.rename_axis(index=str.upper).sort_index() # doctest: +NORMALIZE_WHITESPACE
|
||
TYPE NAME
|
||
mammal cat 4
|
||
dog 4
|
||
monkey 2
|
||
Name: num_legs, dtype: int64
|
||
"""
|
||
psdf = self.to_frame().rename_axis(mapper=mapper, index=index, inplace=False)
|
||
if inplace:
|
||
self._update_anchor(psdf)
|
||
return None
|
||
else:
|
||
return first_series(psdf)
|
||
|
||
@property
|
||
def index(self) -> "ps.Index":
|
||
"""The index (axis labels) Column of the Series.
|
||
|
||
See Also
|
||
--------
|
||
Index
|
||
"""
|
||
return self._psdf.index
|
||
|
||
@property
|
||
def is_unique(self) -> bool:
|
||
"""
|
||
Return boolean if values in the object are unique
|
||
|
||
Returns
|
||
-------
|
||
is_unique : boolean
|
||
|
||
>>> ps.Series([1, 2, 3]).is_unique
|
||
True
|
||
>>> ps.Series([1, 2, 2]).is_unique
|
||
False
|
||
>>> ps.Series([1, 2, 3, None]).is_unique
|
||
True
|
||
"""
|
||
scol = self.spark.column
|
||
|
||
# Here we check:
|
||
# 1. the distinct count without nulls and count without nulls for non-null values
|
||
# 2. count null values and see if null is a distinct value.
|
||
#
|
||
# This workaround is in order to calculate the distinct count including nulls in
|
||
# single pass. Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls.
|
||
return self._internal.spark_frame.select(
|
||
(F.count(scol) == F.countDistinct(scol))
|
||
& (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1)
|
||
).collect()[0][0]
|
||
|
||
def reset_index(
|
||
self,
|
||
level: Optional[Union[int, Any, Tuple, Sequence[Union[int, Any, Tuple]]]] = None,
|
||
drop: bool = False,
|
||
name: Optional[Union[Any, Tuple]] = None,
|
||
inplace: bool = False,
|
||
) -> Optional[Union["Series", DataFrame]]:
|
||
"""
|
||
Generate a new DataFrame or Series with the index reset.
|
||
|
||
This is useful when the index needs to be treated as a column,
|
||
or when the index is meaningless and needs to be reset
|
||
to the default before another operation.
|
||
|
||
Parameters
|
||
----------
|
||
level : int, str, tuple, or list, default optional
|
||
For a Series with a MultiIndex, only remove the specified levels from the index.
|
||
Removes all levels by default.
|
||
drop : bool, default False
|
||
Just reset the index, without inserting it as a column in the new DataFrame.
|
||
name : object, optional
|
||
The name to use for the column containing the original Series values.
|
||
Uses self.name by default. This argument is ignored when drop is True.
|
||
inplace : bool, default False
|
||
Modify the Series in place (do not create a new object).
|
||
|
||
Returns
|
||
-------
|
||
Series or DataFrame
|
||
When `drop` is False (the default), a DataFrame is returned.
|
||
The newly created columns will come first in the DataFrame,
|
||
followed by the original Series values.
|
||
When `drop` is True, a `Series` is returned.
|
||
In either case, if ``inplace=True``, no value is returned.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4], index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
|
||
|
||
Generate a DataFrame with default index.
|
||
|
||
>>> s.reset_index()
|
||
idx 0
|
||
0 a 1
|
||
1 b 2
|
||
2 c 3
|
||
3 d 4
|
||
|
||
To specify the name of the new column use `name`.
|
||
|
||
>>> s.reset_index(name='values')
|
||
idx values
|
||
0 a 1
|
||
1 b 2
|
||
2 c 3
|
||
3 d 4
|
||
|
||
To generate a new Series with the default set `drop` to True.
|
||
|
||
>>> s.reset_index(drop=True)
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
dtype: int64
|
||
|
||
To update the Series in place, without generating a new one
|
||
set `inplace` to True. Note that it also requires ``drop=True``.
|
||
|
||
>>> s.reset_index(inplace=True, drop=True)
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
dtype: int64
|
||
"""
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
if inplace and not drop:
|
||
raise TypeError("Cannot reset_index inplace on a Series to create a DataFrame")
|
||
|
||
if drop:
|
||
psdf = self._psdf[[self.name]]
|
||
else:
|
||
psser = self
|
||
if name is not None:
|
||
psser = psser.rename(name)
|
||
psdf = psser.to_frame()
|
||
psdf = psdf.reset_index(level=level, drop=drop)
|
||
if drop:
|
||
if inplace:
|
||
self._update_anchor(psdf)
|
||
return None
|
||
else:
|
||
return first_series(psdf)
|
||
else:
|
||
return psdf
|
||
|
||
def to_frame(self, name: Optional[Union[Any, Tuple]] = None) -> DataFrame:
|
||
"""
|
||
Convert Series to DataFrame.
|
||
|
||
Parameters
|
||
----------
|
||
name : object, default None
|
||
The passed name should substitute for the series name (if it has
|
||
one).
|
||
|
||
Returns
|
||
-------
|
||
DataFrame
|
||
DataFrame representation of Series.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(["a", "b", "c"])
|
||
>>> s.to_frame()
|
||
0
|
||
0 a
|
||
1 b
|
||
2 c
|
||
|
||
>>> s = ps.Series(["a", "b", "c"], name="vals")
|
||
>>> s.to_frame()
|
||
vals
|
||
0 a
|
||
1 b
|
||
2 c
|
||
"""
|
||
if name is not None:
|
||
renamed = self.rename(name)
|
||
elif self._column_label is None:
|
||
renamed = self.rename(DEFAULT_SERIES_NAME)
|
||
else:
|
||
renamed = self
|
||
return DataFrame(renamed._internal)
|
||
|
||
to_dataframe = to_frame
|
||
|
||
def to_string(
|
||
self,
|
||
buf: Optional[IO[str]] = None,
|
||
na_rep: str = "NaN",
|
||
float_format: Optional[Callable[[float], str]] = None,
|
||
header: bool = True,
|
||
index: bool = True,
|
||
length: bool = False,
|
||
dtype: bool = False,
|
||
name: bool = False,
|
||
max_rows: Optional[int] = None,
|
||
) -> Optional[str]:
|
||
"""
|
||
Render a string representation of the Series.
|
||
|
||
        .. note:: This method should only be used if the resulting pandas object is expected
            to be small, as all the data is loaded into the driver's memory. If the input
            is large, set the max_rows parameter.
|
||
|
||
Parameters
|
||
----------
|
||
buf : StringIO-like, optional
|
||
buffer to write to
|
||
na_rep : string, optional
|
||
string representation of NAN to use, default 'NaN'
|
||
float_format : one-parameter function, optional
|
||
formatter function to apply to columns' elements if they are floats
|
||
default None
|
||
header : boolean, default True
|
||
Add the Series header (index name)
|
||
index : bool, optional
|
||
Add index (row) labels, default True
|
||
length : boolean, default False
|
||
Add the Series length
|
||
dtype : boolean, default False
|
||
Add the Series dtype
|
||
name : boolean, default False
|
||
Add the Series name if not None
|
||
max_rows : int, optional
|
||
Maximum number of rows to show before truncating. If None, show
|
||
all.
|
||
|
||
Returns
|
||
-------
|
||
formatted : string (if not buffer passed)
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], columns=['dogs', 'cats'])
|
||
>>> print(df['dogs'].to_string())
|
||
0 0.2
|
||
1 0.0
|
||
2 0.6
|
||
3 0.2
|
||
|
||
>>> print(df['dogs'].to_string(max_rows=2))
|
||
0 0.2
|
||
1 0.0
|
||
"""
|
||
# Make sure locals() call is at the top of the function so we don't capture local variables.
|
||
args = locals()
|
||
if max_rows is not None:
|
||
psseries = self.head(max_rows)
|
||
else:
|
||
psseries = self
|
||
|
||
return validate_arguments_and_invoke_function(
|
||
psseries._to_internal_pandas(), self.to_string, pd.Series.to_string, args
|
||
)
|
||
|
||
def to_clipboard(self, excel: bool = True, sep: Optional[str] = None, **kwargs: Any) -> None:
|
||
# Docstring defined below by reusing DataFrame.to_clipboard's.
|
||
args = locals()
|
||
psseries = self
|
||
|
||
return validate_arguments_and_invoke_function(
|
||
psseries._to_internal_pandas(), self.to_clipboard, pd.Series.to_clipboard, args
|
||
)
|
||
|
||
to_clipboard.__doc__ = DataFrame.to_clipboard.__doc__
|
||
|
||
def to_dict(self, into: Type = dict) -> Mapping:
|
||
"""
|
||
Convert Series to {label -> value} dict or dict-like object.
|
||
|
||
.. note:: This method should only be used if the resulting pandas DataFrame is expected
|
||
to be small, as all the data is loaded into the driver's memory.
|
||
|
||
Parameters
|
||
----------
|
||
into : class, default dict
|
||
The collections.abc.Mapping subclass to use as the return
|
||
object. Can be the actual class or an empty
|
||
instance of the mapping type you want. If you want a
|
||
collections.defaultdict, you must pass it initialized.
|
||
|
||
Returns
|
||
-------
|
||
collections.abc.Mapping
|
||
Key-value representation of Series.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4])
|
||
>>> s_dict = s.to_dict()
|
||
>>> sorted(s_dict.items())
|
||
[(0, 1), (1, 2), (2, 3), (3, 4)]
|
||
|
||
>>> from collections import OrderedDict, defaultdict
|
||
>>> s.to_dict(OrderedDict)
|
||
OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
|
||
|
||
>>> dd = defaultdict(list)
|
||
>>> s.to_dict(dd) # doctest: +ELLIPSIS
|
||
defaultdict(<class 'list'>, {...})
|
||
"""
|
||
# Make sure locals() call is at the top of the function so we don't capture local variables.
|
||
args = locals()
|
||
psseries = self
|
||
return validate_arguments_and_invoke_function(
|
||
psseries._to_internal_pandas(), self.to_dict, pd.Series.to_dict, args
|
||
)
|
||
|
||
def to_latex(
|
||
self,
|
||
buf: Optional[IO[str]] = None,
|
||
columns: Optional[List[Union[Any, Tuple]]] = None,
|
||
col_space: Optional[int] = None,
|
||
header: bool = True,
|
||
index: bool = True,
|
||
na_rep: str = "NaN",
|
||
formatters: Optional[
|
||
Union[List[Callable[[Any], str]], Dict[Union[Any, Tuple], Callable[[Any], str]]]
|
||
] = None,
|
||
float_format: Optional[Callable[[float], str]] = None,
|
||
sparsify: Optional[bool] = None,
|
||
index_names: bool = True,
|
||
bold_rows: bool = False,
|
||
column_format: Optional[str] = None,
|
||
longtable: Optional[bool] = None,
|
||
escape: Optional[bool] = None,
|
||
encoding: Optional[str] = None,
|
||
decimal: str = ".",
|
||
multicolumn: Optional[bool] = None,
|
||
multicolumn_format: Optional[str] = None,
|
||
multirow: Optional[bool] = None,
|
||
) -> Optional[str]:
|
||
|
||
args = locals()
|
||
psseries = self
|
||
return validate_arguments_and_invoke_function(
|
||
psseries._to_internal_pandas(), self.to_latex, pd.Series.to_latex, args
|
||
)
|
||
|
||
to_latex.__doc__ = DataFrame.to_latex.__doc__
|
||
|
||
def to_pandas(self) -> pd.Series:
|
||
"""
|
||
Return a pandas Series.
|
||
|
||
.. note:: This method should only be used if the resulting pandas object is expected
|
||
to be small, as all the data is loaded into the driver's memory.
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], columns=['dogs', 'cats'])
|
||
>>> df['dogs'].to_pandas()
|
||
0 0.2
|
||
1 0.0
|
||
2 0.6
|
||
3 0.2
|
||
Name: dogs, dtype: float64
|
||
"""
|
||
return self._to_internal_pandas().copy()
|
||
|
||
def to_list(self) -> List:
|
||
"""
|
||
Return a list of the values.
|
||
|
||
These are each a scalar type, which is a Python scalar
|
||
(for str, int, float) or a pandas scalar
|
||
(for Timestamp/Timedelta/Interval/Period)
|
||
|
||
.. note:: This method should only be used if the resulting list is expected
|
||
to be small, as all the data is loaded into the driver's memory.
|
||
|
||
"""
|
||
return self._to_internal_pandas().tolist()
|
||
|
||
tolist = to_list
|
||
|
||
def drop_duplicates(self, keep: str = "first", inplace: bool = False) -> Optional["Series"]:
|
||
"""
|
||
Return Series with duplicate values removed.
|
||
|
||
Parameters
|
||
----------
|
||
keep : {'first', 'last', ``False``}, default 'first'
|
||
Method to handle dropping duplicates:
|
||
- 'first' : Drop duplicates except for the first occurrence.
|
||
- 'last' : Drop duplicates except for the last occurrence.
|
||
- ``False`` : Drop all duplicates.
|
||
inplace : bool, default ``False``
|
||
If ``True``, performs operation inplace and returns None.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with duplicates dropped.
|
||
|
||
Examples
|
||
--------
|
||
Generate a Series with duplicated entries.
|
||
|
||
>>> s = ps.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],
|
||
... name='animal')
|
||
>>> s.sort_index()
|
||
0 lama
|
||
1 cow
|
||
2 lama
|
||
3 beetle
|
||
4 lama
|
||
5 hippo
|
||
Name: animal, dtype: object
|
||
|
||
With the 'keep' parameter, the selection behaviour of duplicated values
|
||
can be changed. The value 'first' keeps the first occurrence for each
|
||
set of duplicated entries. The default value of keep is 'first'.
|
||
|
||
>>> s.drop_duplicates().sort_index()
|
||
0 lama
|
||
1 cow
|
||
3 beetle
|
||
5 hippo
|
||
Name: animal, dtype: object
|
||
|
||
The value 'last' for parameter 'keep' keeps the last occurrence for
|
||
each set of duplicated entries.
|
||
|
||
>>> s.drop_duplicates(keep='last').sort_index()
|
||
1 cow
|
||
3 beetle
|
||
4 lama
|
||
5 hippo
|
||
Name: animal, dtype: object
|
||
|
||
The value ``False`` for parameter 'keep' discards all sets of
|
||
duplicated entries. Setting the value of 'inplace' to ``True`` performs
|
||
the operation inplace and returns ``None``.
|
||
|
||
>>> s.drop_duplicates(keep=False, inplace=True)
|
||
>>> s.sort_index()
|
||
1 cow
|
||
3 beetle
|
||
5 hippo
|
||
Name: animal, dtype: object
|
||
"""
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
psdf = self._psdf[[self.name]].drop_duplicates(keep=keep)
|
||
|
||
if inplace:
|
||
self._update_anchor(psdf)
|
||
return None
|
||
else:
|
||
return first_series(psdf)
|
||
|
||
def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None) -> "Series":
|
||
"""
|
||
Conform Series to new index with optional filling logic, placing
|
||
NA/NaN in locations having no value in the previous index. A new object
|
||
is produced.
|
||
|
||
Parameters
|
||
----------
|
||
index: array-like, optional
|
||
New labels / index to conform to, should be specified using keywords.
|
||
Preferably an Index object to avoid duplicating data
|
||
fill_value : scalar, default np.NaN
|
||
Value to use for missing values. Defaults to NaN, but can be any
|
||
"compatible" value.
|
||
|
||
Returns
|
||
-------
|
||
Series with changed index.
|
||
|
||
See Also
|
||
--------
|
||
Series.reset_index : Remove row labels or move them to new columns.
|
||
|
||
Examples
|
||
--------
|
||
|
||
Create a series with some fictional data.
|
||
|
||
>>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
|
||
>>> ser = ps.Series([200, 200, 404, 404, 301],
|
||
... index=index, name='http_status')
|
||
>>> ser
|
||
Firefox 200
|
||
Chrome 200
|
||
Safari 404
|
||
IE10 404
|
||
Konqueror 301
|
||
Name: http_status, dtype: int64
|
||
|
||
Create a new index and reindex the Series. By default
|
||
values in the new index that do not have corresponding
|
||
records in the Series are assigned ``NaN``.
|
||
|
||
>>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
|
||
... 'Chrome']
|
||
>>> ser.reindex(new_index).sort_index()
|
||
Chrome 200.0
|
||
Comodo Dragon NaN
|
||
IE10 404.0
|
||
Iceweasel NaN
|
||
Safari 404.0
|
||
Name: http_status, dtype: float64
|
||
|
||
We can fill in the missing values by passing a value to
|
||
the keyword ``fill_value``.
|
||
|
||
>>> ser.reindex(new_index, fill_value=0).sort_index()
|
||
Chrome 200
|
||
Comodo Dragon 0
|
||
IE10 404
|
||
Iceweasel 0
|
||
Safari 404
|
||
Name: http_status, dtype: int64
|
||
|
||
To further illustrate the filling functionality in
|
||
``reindex``, we will create a Series with a
|
||
monotonically increasing index (for example, a sequence
|
||
of dates).
|
||
|
||
>>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
|
||
>>> ser2 = ps.Series([100, 101, np.nan, 100, 89, 88],
|
||
... name='prices', index=date_index)
|
||
>>> ser2.sort_index()
|
||
2010-01-01 100.0
|
||
2010-01-02 101.0
|
||
2010-01-03 NaN
|
||
2010-01-04 100.0
|
||
2010-01-05 89.0
|
||
2010-01-06 88.0
|
||
Name: prices, dtype: float64
|
||
|
||
Suppose we decide to expand the series to cover a wider
|
||
date range.
|
||
|
||
>>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
|
||
>>> ser2.reindex(date_index2).sort_index()
|
||
2009-12-29 NaN
|
||
2009-12-30 NaN
|
||
2009-12-31 NaN
|
||
2010-01-01 100.0
|
||
2010-01-02 101.0
|
||
2010-01-03 NaN
|
||
2010-01-04 100.0
|
||
2010-01-05 89.0
|
||
2010-01-06 88.0
|
||
2010-01-07 NaN
|
||
Name: prices, dtype: float64
|
||
"""
|
||
|
||
return first_series(self.to_frame().reindex(index=index, fill_value=fill_value)).rename(
|
||
self.name
|
||
)
|
||
|
||
def reindex_like(self, other: Union["Series", "DataFrame"]) -> "Series":
|
||
"""
|
||
Return a Series with matching indices as other object.
|
||
|
||
Conform the object to the same index on all axes. Places NA/NaN in locations
|
||
having no value in the previous index.
|
||
|
||
Parameters
|
||
----------
|
||
other : Series or DataFrame
|
||
Its row and column indices are used to define the new indices
|
||
of this object.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with changed indices on each axis.
|
||
|
||
See Also
|
||
--------
|
||
DataFrame.set_index : Set row labels.
|
||
DataFrame.reset_index : Remove row labels or move them to new columns.
|
||
DataFrame.reindex : Change to new indices or expand indices.
|
||
|
||
Notes
|
||
-----
|
||
Same as calling
|
||
``.reindex(index=other.index, ...)``.
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> s1 = ps.Series([24.3, 31.0, 22.0, 35.0],
|
||
... index=pd.date_range(start='2014-02-12',
|
||
... end='2014-02-15', freq='D'),
|
||
... name="temp_celsius")
|
||
>>> s1
|
||
2014-02-12 24.3
|
||
2014-02-13 31.0
|
||
2014-02-14 22.0
|
||
2014-02-15 35.0
|
||
Name: temp_celsius, dtype: float64
|
||
|
||
>>> s2 = ps.Series(["low", "low", "medium"],
|
||
... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
|
||
... '2014-02-15']),
|
||
... name="winspeed")
|
||
>>> s2
|
||
2014-02-12 low
|
||
2014-02-13 low
|
||
2014-02-15 medium
|
||
Name: winspeed, dtype: object
|
||
|
||
>>> s2.reindex_like(s1).sort_index()
|
||
2014-02-12 low
|
||
2014-02-13 low
|
||
2014-02-14 None
|
||
2014-02-15 medium
|
||
Name: winspeed, dtype: object
|
||
"""
|
||
if isinstance(other, (Series, DataFrame)):
|
||
return self.reindex(index=other.index)
|
||
else:
|
||
raise TypeError("other must be a pandas-on-Spark Series or DataFrame")
|
||
|
||
def fillna(
|
||
self,
|
||
value: Optional[Any] = None,
|
||
method: Optional[str] = None,
|
||
axis: Optional[Union[int, str]] = None,
|
||
inplace: bool = False,
|
||
limit: Optional[int] = None,
|
||
) -> Optional["Series"]:
|
||
"""Fill NA/NaN values.
|
||
|
||
        .. note:: the current implementation of the 'method' parameter in fillna uses Spark's Window
            without specifying a partition specification. This moves all data into
            a single partition on a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.
|
||
|
||
Parameters
|
||
----------
|
||
value : scalar, dict, Series
|
||
Value to use to fill holes. alternately a dict/Series of values
|
||
specifying which value to use for each column.
|
||
DataFrame is not supported.
|
||
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
|
||
Method to use for filling holes in reindexed Series pad / ffill: propagate last valid
|
||
observation forward to next valid backfill / bfill:
|
||
use NEXT valid observation to fill gap
|
||
axis : {0 or `index`}
|
||
1 and `columns` are not supported.
|
||
inplace : boolean, default False
|
||
Fill in place (do not create a new object)
|
||
limit : int, default None
|
||
If method is specified, this is the maximum number of consecutive NaN values to
|
||
forward/backward fill. In other words, if there is a gap with more than this number of
|
||
consecutive NaNs, it will only be partially filled. If method is not specified,
|
||
this is the maximum number of entries along the entire axis where NaNs will be filled.
|
||
Must be greater than 0 if not None
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with NA entries filled.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([np.nan, 2, 3, 4, np.nan, 6], name='x')
|
||
>>> s
|
||
0 NaN
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
4 NaN
|
||
5 6.0
|
||
Name: x, dtype: float64
|
||
|
||
Replace all NaN elements with 0s.
|
||
|
||
>>> s.fillna(0)
|
||
0 0.0
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
4 0.0
|
||
5 6.0
|
||
Name: x, dtype: float64
|
||
|
||
We can also propagate non-null values forward or backward.
|
||
|
||
>>> s.fillna(method='ffill')
|
||
0 NaN
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
4 4.0
|
||
5 6.0
|
||
Name: x, dtype: float64
|
||
|
||
>>> s = ps.Series([np.nan, 'a', 'b', 'c', np.nan], name='x')
|
||
>>> s.fillna(method='ffill')
|
||
0 None
|
||
1 a
|
||
2 b
|
||
3 c
|
||
4 c
|
||
Name: x, dtype: object
|
||
"""
|
||
psser = self._fillna(value=value, method=method, axis=axis, limit=limit)
|
||
|
||
if method is not None:
|
||
psser = DataFrame(psser._psdf._internal.resolved_copy)._psser_for(self._column_label)
|
||
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
if inplace:
|
||
self._psdf._update_internal_frame(psser._psdf._internal, requires_same_anchor=False)
|
||
return None
|
||
else:
|
||
return psser._with_new_scol(psser.spark.column) # TODO: dtype?
|
||
|
||
def _fillna(
|
||
self,
|
||
value: Optional[Any] = None,
|
||
method: Optional[str] = None,
|
||
axis: Optional[Union[int, str]] = None,
|
||
limit: Optional[int] = None,
|
||
part_cols: Sequence[Union[str, Column]] = (),
|
||
) -> "Series":
|
||
axis = validate_axis(axis)
|
||
if axis != 0:
|
||
raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")
|
||
if (value is None) and (method is None):
|
||
raise ValueError("Must specify a fillna 'value' or 'method' parameter.")
|
||
if (method is not None) and (method not in ["ffill", "pad", "backfill", "bfill"]):
|
||
raise ValueError("Expecting 'pad', 'ffill', 'backfill' or 'bfill'.")
|
||
|
||
scol = self.spark.column
|
||
|
||
if isinstance(self.spark.data_type, (FloatType, DoubleType)):
|
||
cond = scol.isNull() | F.isnan(scol)
|
||
else:
|
||
if not self.spark.nullable:
|
||
return self.copy()
|
||
cond = scol.isNull()
|
||
|
||
if value is not None:
|
||
if not isinstance(value, (float, int, str, bool)):
|
||
raise TypeError("Unsupported type %s" % type(value).__name__)
|
||
if limit is not None:
|
||
raise ValueError("limit parameter for value is not support now")
|
||
scol = F.when(cond, value).otherwise(scol)
|
||
else:
|
||
if method in ["ffill", "pad"]:
|
||
func = F.last
|
||
end = Window.currentRow - 1
|
||
if limit is not None:
|
||
begin = Window.currentRow - limit
|
||
else:
|
||
begin = Window.unboundedPreceding
|
||
elif method in ["bfill", "backfill"]:
|
||
func = F.first
|
||
begin = Window.currentRow + 1
|
||
if limit is not None:
|
||
end = Window.currentRow + limit
|
||
else:
|
||
end = Window.unboundedFollowing
|
||
|
||
window = (
|
||
Window.partitionBy(*part_cols)
|
||
.orderBy(NATURAL_ORDER_COLUMN_NAME)
|
||
.rowsBetween(begin, end)
|
||
)
|
||
scol = F.when(cond, func(scol, True).over(window)).otherwise(scol)
|
||
|
||
return DataFrame(
|
||
self._psdf._internal.with_new_spark_column(
|
||
self._column_label, scol.alias(name_like_string(self.name)) # TODO: dtype?
|
||
)
|
||
)._psser_for(self._column_label)
|
||
|
||
def dropna(
|
||
self, axis: Union[int, str] = 0, inplace: bool = False, **kwargs: Any
|
||
) -> Optional["Series"]:
|
||
"""
|
||
Return a new Series with missing values removed.
|
||
|
||
Parameters
|
||
----------
|
||
axis : {0 or 'index'}, default 0
|
||
There is only one axis to drop values from.
|
||
inplace : bool, default False
|
||
If True, do operation inplace and return None.
|
||
**kwargs
|
||
Not in use.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with NA entries dropped from it.
|
||
|
||
Examples
|
||
--------
|
||
>>> ser = ps.Series([1., 2., np.nan])
|
||
>>> ser
|
||
0 1.0
|
||
1 2.0
|
||
2 NaN
|
||
dtype: float64
|
||
|
||
Drop NA values from a Series.
|
||
|
||
>>> ser.dropna()
|
||
0 1.0
|
||
1 2.0
|
||
dtype: float64
|
||
|
||
Keep the Series with valid entries in the same variable.
|
||
|
||
>>> ser.dropna(inplace=True)
|
||
>>> ser
|
||
0 1.0
|
||
1 2.0
|
||
dtype: float64
|
||
"""
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
# TODO: last two examples from pandas produce different results.
|
||
psdf = self._psdf[[self.name]].dropna(axis=axis, inplace=False)
|
||
if inplace:
|
||
self._update_anchor(psdf)
|
||
return None
|
||
else:
|
||
return first_series(psdf)
|
||
|
||
def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None) -> "Series":
|
||
"""
|
||
Trim values at input threshold(s).
|
||
|
||
Assigns values outside boundary to boundary values.
|
||
|
||
Parameters
|
||
----------
|
||
lower : float or int, default None
|
||
Minimum threshold value. All values below this threshold will be set to it.
|
||
upper : float or int, default None
|
||
Maximum threshold value. All values above this threshold will be set to it.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with the values outside the clip boundaries replaced
|
||
|
||
Examples
|
||
--------
|
||
>>> ps.Series([0, 2, 4]).clip(1, 3)
|
||
0 1
|
||
1 2
|
||
2 3
|
||
dtype: int64
|
||
|
||
Notes
|
||
-----
|
||
One difference between this implementation and pandas is that running
|
||
`pd.Series(['a', 'b']).clip(0, 1)` will crash with "TypeError: '<=' not supported between
|
||
instances of 'str' and 'int'" while `ps.Series(['a', 'b']).clip(0, 1)` will output the
|
||
original Series, simply ignoring the incompatible types.
|
||
"""
|
||
if is_list_like(lower) or is_list_like(upper):
|
||
raise TypeError(
|
||
"List-like value are not supported for 'lower' and 'upper' at the " + "moment"
|
||
)
|
||
|
||
if lower is None and upper is None:
|
||
return self
|
||
|
||
if isinstance(self.spark.data_type, NumericType):
|
||
scol = self.spark.column
|
||
if lower is not None:
|
||
scol = F.when(scol < lower, lower).otherwise(scol)
|
||
if upper is not None:
|
||
scol = F.when(scol > upper, upper).otherwise(scol)
|
||
return self._with_new_scol(
|
||
scol.alias(self._internal.data_spark_column_names[0]),
|
||
field=self._internal.data_fields[0],
|
||
)
|
||
else:
|
||
return self
|
||
|
||
def drop(
|
||
self,
|
||
labels: Optional[Union[Any, Tuple, List[Union[Any, Tuple]]]] = None,
|
||
index: Optional[Union[Any, Tuple, List[Union[Any, Tuple]]]] = None,
|
||
level: Optional[int] = None,
|
||
) -> "Series":
|
||
"""
|
||
Return Series with specified index labels removed.
|
||
|
||
Remove elements of a Series based on specifying the index labels.
|
||
When using a multi-index, labels on different levels can be removed by specifying the level.
|
||
|
||
Parameters
|
||
----------
|
||
labels : single label or list-like
|
||
Index labels to drop.
|
||
index : None
|
||
Redundant for application on Series, but index can be used instead of labels.
|
||
level : int or level name, optional
|
||
For MultiIndex, level for which the labels will be removed.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with specified index labels removed.
|
||
|
||
See Also
|
||
--------
|
||
Series.dropna
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(data=np.arange(3), index=['A', 'B', 'C'])
|
||
>>> s
|
||
A 0
|
||
B 1
|
||
C 2
|
||
dtype: int64
|
||
|
||
Drop single label A
|
||
|
||
>>> s.drop('A')
|
||
B 1
|
||
C 2
|
||
dtype: int64
|
||
|
||
Drop labels B and C
|
||
|
||
>>> s.drop(labels=['B', 'C'])
|
||
A 0
|
||
dtype: int64
|
||
|
||
        Using 'index' rather than 'labels' returns exactly the same result.
|
||
|
||
>>> s.drop(index='A')
|
||
B 1
|
||
C 2
|
||
dtype: int64
|
||
|
||
>>> s.drop(index=['B', 'C'])
|
||
A 0
|
||
dtype: int64
|
||
|
||
Also support for MultiIndex
|
||
|
||
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
|
||
... ['speed', 'weight', 'length']],
|
||
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
||
>>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
|
||
... index=midx)
|
||
>>> s
|
||
lama speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.drop(labels='weight', level=1)
|
||
lama speed 45.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.drop(('lama', 'weight'))
|
||
lama speed 45.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.drop([('lama', 'speed'), ('falcon', 'weight')])
|
||
lama weight 200.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
length 0.3
|
||
dtype: float64
|
||
"""
|
||
return first_series(self._drop(labels=labels, index=index, level=level))
|
||
|
||
def _drop(
|
||
self,
|
||
labels: Optional[Union[Any, Tuple, List[Union[Any, Tuple]]]] = None,
|
||
index: Optional[Union[Any, Tuple, List[Union[Any, Tuple]]]] = None,
|
||
level: Optional[int] = None,
|
||
) -> DataFrame:
|
||
if labels is not None:
|
||
if index is not None:
|
||
raise ValueError("Cannot specify both 'labels' and 'index'")
|
||
return self._drop(index=labels, level=level)
|
||
if index is not None:
|
||
internal = self._internal
|
||
if level is None:
|
||
level = 0
|
||
if level >= internal.index_level:
|
||
raise ValueError("'level' should be less than the number of indexes")
|
||
|
||
if is_name_like_tuple(index): # type: ignore
|
||
index_list = [cast(Tuple, index)]
|
||
elif is_name_like_value(index):
|
||
index_list = [(index,)]
|
||
elif all(is_name_like_value(idxes, allow_tuple=False) for idxes in index):
|
||
index_list = [(idex,) for idex in index]
|
||
elif not all(is_name_like_tuple(idxes) for idxes in index):
|
||
raise ValueError(
|
||
"If the given index is a list, it "
|
||
"should only contains names as all tuples or all non tuples "
|
||
"that contain index names"
|
||
)
|
||
else:
|
||
index_list = cast(List[Tuple], index)
|
||
|
||
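            # For each label to drop, require every index level (starting at `level`) to match
            # the label's components; rows matching any of these conditions are filtered out.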
drop_index_scols = []
|
||
for idxes in index_list:
|
||
try:
|
||
index_scols = [
|
||
internal.index_spark_columns[lvl] == idx
|
||
for lvl, idx in enumerate(idxes, level)
|
||
]
|
||
except IndexError:
|
||
raise KeyError(
|
||
"Key length ({}) exceeds index depth ({})".format(
|
||
                            len(idxes), internal.index_level
|
||
)
|
||
)
|
||
drop_index_scols.append(reduce(lambda x, y: x & y, index_scols))
|
||
|
||
cond = ~reduce(lambda x, y: x | y, drop_index_scols)
|
||
|
||
return DataFrame(internal.with_filter(cond))
|
||
else:
|
||
raise ValueError("Need to specify at least one of 'labels' or 'index'")
|
||
|
||
def head(self, n: int = 5) -> "Series":
|
||
"""
|
||
Return the first n rows.
|
||
|
||
This function returns the first n rows for the object based on position.
|
||
It is useful for quickly testing if your object has the right type of data in it.
|
||
|
||
Parameters
|
||
----------
|
||
        n : int, default 5
|
||
|
||
Returns
|
||
-------
|
||
The first n rows of the caller object.
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion']})
|
||
>>> df.animal.head(2) # doctest: +NORMALIZE_WHITESPACE
|
||
0 alligator
|
||
1 bee
|
||
Name: animal, dtype: object
|
||
"""
|
||
return first_series(self.to_frame().head(n)).rename(self.name)
|
||
|
||
def last(self, offset: Union[str, DateOffset]) -> "Series":
|
||
"""
|
||
Select final periods of time series data based on a date offset.
|
||
|
||
When having a Series with dates as index, this function can
|
||
select the last few elements based on a date offset.
|
||
|
||
Parameters
|
||
----------
|
||
offset : str or DateOffset
|
||
The offset length of the data that will be selected. For instance,
|
||
'3D' will display all the rows having their index within the last 3 days.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
A subset of the caller.
|
||
|
||
Raises
|
||
------
|
||
TypeError
|
||
If the index is not a :class:`DatetimeIndex`
|
||
|
||
Examples
|
||
--------
|
||
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
|
||
>>> psser = ps.Series([1, 2, 3, 4], index=index)
|
||
>>> psser
|
||
2018-04-09 1
|
||
2018-04-11 2
|
||
2018-04-13 3
|
||
2018-04-15 4
|
||
dtype: int64
|
||
|
||
Get the rows for the last 3 days:
|
||
|
||
>>> psser.last('3D')
|
||
2018-04-13 3
|
||
2018-04-15 4
|
||
dtype: int64
|
||
|
||
Notice the data for 3 last calendar days were returned, not the last
|
||
3 observed days in the dataset, and therefore data for 2018-04-11 was
|
||
not returned.
|
||
"""
|
||
return first_series(self.to_frame().last(offset)).rename(self.name)
|
||
|
||
def first(self, offset: Union[str, DateOffset]) -> "Series":
|
||
"""
|
||
Select first periods of time series data based on a date offset.
|
||
|
||
When having a Series with dates as index, this function can
|
||
select the first few elements based on a date offset.
|
||
|
||
Parameters
|
||
----------
|
||
offset : str or DateOffset
|
||
The offset length of the data that will be selected. For instance,
|
||
'3D' will display all the rows having their index within the first 3 days.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
A subset of the caller.
|
||
|
||
Raises
|
||
------
|
||
TypeError
|
||
If the index is not a :class:`DatetimeIndex`
|
||
|
||
Examples
|
||
--------
|
||
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
|
||
>>> psser = ps.Series([1, 2, 3, 4], index=index)
|
||
>>> psser
|
||
2018-04-09 1
|
||
2018-04-11 2
|
||
2018-04-13 3
|
||
2018-04-15 4
|
||
dtype: int64
|
||
|
||
Get the rows for the first 3 days:
|
||
|
||
>>> psser.first('3D')
|
||
2018-04-09 1
|
||
2018-04-11 2
|
||
dtype: int64
|
||
|
||
Notice the data for 3 first calendar days were returned, not the first
|
||
3 observed days in the dataset, and therefore data for 2018-04-13 was
|
||
not returned.
|
||
"""
|
||
return first_series(self.to_frame().first(offset)).rename(self.name)
|
||
|
||
# TODO: Categorical type isn't supported (due to PySpark's limitation) and
|
||
# some doctests related with timestamps were not added.
|
||
def unique(self) -> "Series":
|
||
"""
|
||
Return unique values of Series object.
|
||
|
||
Uniques are returned in order of appearance. Hash table-based unique,
|
||
therefore does NOT sort.
|
||
|
||
.. note:: This method returns newly created Series whereas pandas returns
|
||
the unique values as a NumPy array.
|
||
|
||
Returns
|
||
-------
|
||
Returns the unique values as a Series.
|
||
|
||
See Also
|
||
--------
|
||
Index.unique
|
||
groupby.SeriesGroupBy.unique
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series([2, 1, 3, 3], name='A')
|
||
>>> psser.unique().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||
<BLANKLINE>
|
||
... 1
|
||
... 2
|
||
... 3
|
||
Name: A, dtype: int64
|
||
|
||
>>> ps.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
|
||
0 2016-01-01
|
||
dtype: datetime64[ns]
|
||
|
||
>>> psser.name = ('x', 'a')
|
||
>>> psser.unique().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||
<BLANKLINE>
|
||
... 1
|
||
... 2
|
||
... 3
|
||
Name: (x, a), dtype: int64
|
||
"""
|
||
sdf = self._internal.spark_frame.select(self.spark.column).distinct()
|
||
internal = InternalFrame(
|
||
spark_frame=sdf,
|
||
index_spark_columns=None,
|
||
column_labels=[self._column_label],
|
||
data_spark_columns=[scol_for(sdf, self._internal.data_spark_column_names[0])],
|
||
data_fields=[self._internal.data_fields[0]],
|
||
column_label_names=self._internal.column_label_names,
|
||
)
|
||
return first_series(DataFrame(internal))
|
||
|
||
def sort_values(
|
||
self, ascending: bool = True, inplace: bool = False, na_position: str = "last"
|
||
) -> Optional["Series"]:
|
||
"""
|
||
Sort by the values.
|
||
|
||
Sort a Series in ascending or descending order by some criterion.
|
||
|
||
Parameters
|
||
----------
|
||
ascending : bool or list of bool, default True
|
||
Sort ascending vs. descending. Specify list for multiple sort
|
||
orders. If this is a list of bools, must match the length of
|
||
the by.
|
||
inplace : bool, default False
|
||
if True, perform operation in-place
|
||
na_position : {'first', 'last'}, default 'last'
|
||
`first` puts NaNs at the beginning, `last` puts NaNs at the end
|
||
|
||
Returns
|
||
-------
|
||
sorted_obj : Series ordered by values.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([np.nan, 1, 3, 10, 5])
|
||
>>> s
|
||
0 NaN
|
||
1 1.0
|
||
2 3.0
|
||
3 10.0
|
||
4 5.0
|
||
dtype: float64
|
||
|
||
Sort values ascending order (default behaviour)
|
||
|
||
>>> s.sort_values(ascending=True)
|
||
1 1.0
|
||
2 3.0
|
||
4 5.0
|
||
3 10.0
|
||
0 NaN
|
||
dtype: float64
|
||
|
||
Sort values descending order
|
||
|
||
>>> s.sort_values(ascending=False)
|
||
3 10.0
|
||
4 5.0
|
||
2 3.0
|
||
1 1.0
|
||
0 NaN
|
||
dtype: float64
|
||
|
||
Sort values inplace
|
||
|
||
>>> s.sort_values(ascending=False, inplace=True)
|
||
>>> s
|
||
3 10.0
|
||
4 5.0
|
||
2 3.0
|
||
1 1.0
|
||
0 NaN
|
||
dtype: float64
|
||
|
||
Sort values putting NAs first
|
||
|
||
>>> s.sort_values(na_position='first')
|
||
0 NaN
|
||
1 1.0
|
||
2 3.0
|
||
4 5.0
|
||
3 10.0
|
||
dtype: float64
|
||
|
||
Sort a series of strings
|
||
|
||
>>> s = ps.Series(['z', 'b', 'd', 'a', 'c'])
|
||
>>> s
|
||
0 z
|
||
1 b
|
||
2 d
|
||
3 a
|
||
4 c
|
||
dtype: object
|
||
|
||
>>> s.sort_values()
|
||
3 a
|
||
1 b
|
||
4 c
|
||
2 d
|
||
0 z
|
||
dtype: object
|
||
"""
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
psdf = self._psdf[[self.name]]._sort(
|
||
by=[self.spark.column], ascending=ascending, na_position=na_position
|
||
)
|
||
|
||
if inplace:
|
||
self._update_anchor(psdf)
|
||
return None
|
||
else:
|
||
return first_series(psdf)
|
||
|
||
def sort_index(
|
||
self,
|
||
axis: int = 0,
|
||
level: Optional[Union[int, List[int]]] = None,
|
||
ascending: bool = True,
|
||
inplace: bool = False,
|
||
kind: str = None,
|
||
na_position: str = "last",
|
||
) -> Optional["Series"]:
|
||
"""
|
||
Sort object by labels (along an axis)
|
||
|
||
Parameters
|
||
----------
|
||
axis : index, columns to direct sorting. Currently, only axis = 0 is supported.
|
||
level : int or level name or list of ints or list of level names
|
||
if not None, sort on values in specified index level(s)
|
||
ascending : boolean, default True
|
||
Sort ascending vs. descending
|
||
inplace : bool, default False
|
||
if True, perform operation in-place
|
||
kind : str, default None
|
||
pandas-on-Spark does not allow specifying the sorting algorithm at the moment,
|
||
default None
|
||
        na_position : {'first', 'last'}, default 'last'
|
||
first puts NaNs at the beginning, last puts NaNs at the end. Not implemented for
|
||
MultiIndex.
|
||
|
||
Returns
|
||
-------
|
||
sorted_obj : Series
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan])
|
||
|
||
>>> df.sort_index()
|
||
a 1.0
|
||
b 2.0
|
||
NaN NaN
|
||
dtype: float64
|
||
|
||
>>> df.sort_index(ascending=False)
|
||
b 2.0
|
||
a 1.0
|
||
NaN NaN
|
||
dtype: float64
|
||
|
||
>>> df.sort_index(na_position='first')
|
||
NaN NaN
|
||
a 1.0
|
||
b 2.0
|
||
dtype: float64
|
||
|
||
>>> df.sort_index(inplace=True)
|
||
>>> df
|
||
a 1.0
|
||
b 2.0
|
||
NaN NaN
|
||
dtype: float64
|
||
|
||
>>> df = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], name='0')
|
||
|
||
>>> df.sort_index()
|
||
a 0 3
|
||
1 2
|
||
b 0 1
|
||
1 0
|
||
Name: 0, dtype: int64
|
||
|
||
>>> df.sort_index(level=1) # doctest: +SKIP
|
||
a 0 3
|
||
b 0 1
|
||
a 1 2
|
||
b 1 0
|
||
Name: 0, dtype: int64
|
||
|
||
>>> df.sort_index(level=[1, 0])
|
||
a 0 3
|
||
b 0 1
|
||
a 1 2
|
||
b 1 0
|
||
Name: 0, dtype: int64
|
||
"""
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
psdf = self._psdf[[self.name]].sort_index(
|
||
axis=axis, level=level, ascending=ascending, kind=kind, na_position=na_position
|
||
)
|
||
|
||
if inplace:
|
||
self._update_anchor(psdf)
|
||
return None
|
||
else:
|
||
return first_series(psdf)
|
||
|
||
def swaplevel(
|
||
self, i: Union[int, Any, Tuple] = -2, j: Union[int, Any, Tuple] = -1, copy: bool = True
|
||
) -> "Series":
|
||
"""
|
||
Swap levels i and j in a MultiIndex.
|
||
Default is to swap the two innermost levels of the index.
|
||
|
||
Parameters
|
||
----------
|
||
i, j : int, str
|
||
Level of the indices to be swapped. Can pass level name as string.
|
||
copy : bool, default True
|
||
Whether to copy underlying data. Must be True.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with levels swapped in MultiIndex.
|
||
|
||
Examples
|
||
--------
|
||
>>> midx = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names = ['word', 'number'])
|
||
>>> midx # doctest: +SKIP
|
||
MultiIndex([('a', 1),
|
||
('b', 2)],
|
||
names=['word', 'number'])
|
||
>>> psser = ps.Series(['x', 'y'], index=midx)
|
||
>>> psser
|
||
word number
|
||
a 1 x
|
||
b 2 y
|
||
dtype: object
|
||
>>> psser.swaplevel()
|
||
number word
|
||
1 a x
|
||
2 b y
|
||
dtype: object
|
||
>>> psser.swaplevel(0, 1)
|
||
number word
|
||
1 a x
|
||
2 b y
|
||
dtype: object
|
||
>>> psser.swaplevel('number', 'word')
|
||
number word
|
||
1 a x
|
||
2 b y
|
||
dtype: object
|
||
"""
|
||
assert copy is True
|
||
|
||
return first_series(self.to_frame().swaplevel(i, j, axis=0)).rename(self.name)
|
||
|
||
def swapaxes(self, i: Union[str, int], j: Union[str, int], copy: bool = True) -> "Series":
|
||
"""
|
||
Interchange axes and swap values axes appropriately.
|
||
|
||
Parameters
|
||
----------
|
||
i: {0 or 'index', 1 or 'columns'}. The axis to swap.
|
||
j: {0 or 'index', 1 or 'columns'}. The axis to swap.
|
||
copy : bool, default True.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series([1, 2, 3], index=["x", "y", "z"])
|
||
>>> psser
|
||
x 1
|
||
y 2
|
||
z 3
|
||
dtype: int64
|
||
>>>
|
||
>>> psser.swapaxes(0, 0)
|
||
x 1
|
||
y 2
|
||
z 3
|
||
dtype: int64
|
||
"""
|
||
assert copy is True
|
||
|
||
i = validate_axis(i)
|
||
j = validate_axis(j)
|
||
if not i == j == 0:
|
||
raise ValueError("Axis must be 0 for Series")
|
||
|
||
return self.copy()
|
||
|
||
def add_prefix(self, prefix: str) -> "Series":
|
||
"""
|
||
Prefix labels with string `prefix`.
|
||
|
||
For Series, the row labels are prefixed.
|
||
For DataFrame, the column labels are prefixed.
|
||
|
||
Parameters
|
||
----------
|
||
prefix : str
|
||
The string to add before each label.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
New Series with updated labels.
|
||
|
||
See Also
|
||
--------
|
||
Series.add_suffix: Suffix column labels with string `suffix`.
|
||
DataFrame.add_suffix: Suffix column labels with string `suffix`.
|
||
DataFrame.add_prefix: Prefix column labels with string `prefix`.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4])
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
dtype: int64
|
||
|
||
>>> s.add_prefix('item_')
|
||
item_0 1
|
||
item_1 2
|
||
item_2 3
|
||
item_3 4
|
||
dtype: int64
|
||
"""
|
||
assert isinstance(prefix, str)
|
||
internal = self._internal.resolved_copy
|
||
sdf = internal.spark_frame.select(
|
||
[
|
||
F.concat(F.lit(prefix), index_spark_column).alias(index_spark_column_name)
|
||
for index_spark_column, index_spark_column_name in zip(
|
||
internal.index_spark_columns, internal.index_spark_column_names
|
||
)
|
||
]
|
||
+ internal.data_spark_columns
|
||
)
|
||
return first_series(
|
||
DataFrame(internal.with_new_sdf(sdf, index_fields=([None] * internal.index_level)))
|
||
)
|
||
|
||
def add_suffix(self, suffix: str) -> "Series":
|
||
"""
|
||
Suffix labels with string suffix.
|
||
|
||
For Series, the row labels are suffixed.
|
||
For DataFrame, the column labels are suffixed.
|
||
|
||
Parameters
|
||
----------
|
||
suffix : str
|
||
The string to add after each label.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
New Series with updated labels.
|
||
|
||
See Also
|
||
--------
|
||
Series.add_prefix: Prefix row labels with string `prefix`.
|
||
DataFrame.add_prefix: Prefix column labels with string `prefix`.
|
||
DataFrame.add_suffix: Suffix column labels with string `suffix`.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4])
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
dtype: int64
|
||
|
||
>>> s.add_suffix('_item')
|
||
0_item 1
|
||
1_item 2
|
||
2_item 3
|
||
3_item 4
|
||
dtype: int64
|
||
"""
|
||
assert isinstance(suffix, str)
|
||
internal = self._internal.resolved_copy
|
||
sdf = internal.spark_frame.select(
|
||
[
|
||
F.concat(index_spark_column, F.lit(suffix)).alias(index_spark_column_name)
|
||
for index_spark_column, index_spark_column_name in zip(
|
||
internal.index_spark_columns, internal.index_spark_column_names
|
||
)
|
||
]
|
||
+ internal.data_spark_columns
|
||
)
|
||
return first_series(
|
||
DataFrame(internal.with_new_sdf(sdf, index_fields=([None] * internal.index_level)))
|
||
)
|
||
|
||
def corr(self, other: "Series", method: str = "pearson") -> float:
|
||
"""
|
||
Compute correlation with `other` Series, excluding missing values.
|
||
|
||
Parameters
|
||
----------
|
||
other : Series
|
||
method : {'pearson', 'spearman'}
|
||
* pearson : standard correlation coefficient
|
||
* spearman : Spearman rank correlation
|
||
|
||
Returns
|
||
-------
|
||
correlation : float
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.DataFrame({'s1': [.2, .0, .6, .2],
|
||
... 's2': [.3, .6, .0, .1]})
|
||
>>> s1 = df.s1
|
||
>>> s2 = df.s2
|
||
>>> s1.corr(s2, method='pearson') # doctest: +ELLIPSIS
|
||
-0.851064...
|
||
|
||
>>> s1.corr(s2, method='spearman') # doctest: +ELLIPSIS
|
||
-0.948683...
|
||
|
||
Notes
|
||
-----
|
||
There are behavior differences between pandas-on-Spark and pandas.
|
||
|
||
* the `method` argument only accepts 'pearson', 'spearman'
|
||
* the data should not contain NaNs. pandas-on-Spark will return an error.
|
||
* pandas-on-Spark doesn't support the following argument(s).
|
||
|
||
* `min_periods` argument is not supported
|
||
"""
|
||
# This implementation is suboptimal because it computes more than necessary,
|
||
# but it should be a start
|
||
columns = ["__corr_arg1__", "__corr_arg2__"]
|
||
psdf = self._psdf.assign(__corr_arg1__=self, __corr_arg2__=other)[columns]
|
||
psdf.columns = columns
|
||
c = corr(psdf, method=method)
|
||
return c.loc[tuple(columns)]
|
||
|
||
def nsmallest(self, n: int = 5) -> "Series":
|
||
"""
|
||
Return the smallest `n` elements.
|
||
|
||
Parameters
|
||
----------
|
||
n : int, default 5
|
||
Return this many ascending sorted values.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
The `n` smallest values in the Series, sorted in increasing order.
|
||
|
||
See Also
|
||
--------
|
||
Series.nlargest: Get the `n` largest elements.
|
||
Series.sort_values: Sort Series by values.
|
||
Series.head: Return the first `n` rows.
|
||
|
||
Notes
|
||
-----
|
||
Faster than ``.sort_values().head(n)`` for small `n` relative to
|
||
the size of the ``Series`` object.
|
||
In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer,
|
||
        the two would have the same performance.
|
||
|
||
Examples
|
||
--------
|
||
>>> data = [1, 2, 3, 4, np.nan ,6, 7, 8]
|
||
>>> s = ps.Series(data)
|
||
>>> s
|
||
0 1.0
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
4 NaN
|
||
5 6.0
|
||
6 7.0
|
||
7 8.0
|
||
dtype: float64
|
||
|
||
        The `n` smallest elements where ``n=5`` by default.
|
||
|
||
>>> s.nsmallest()
|
||
0 1.0
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
5 6.0
|
||
dtype: float64
|
||
|
||
>>> s.nsmallest(3)
|
||
0 1.0
|
||
1 2.0
|
||
2 3.0
|
||
dtype: float64
|
||
"""
|
||
return self.sort_values(ascending=True).head(n)
|
||
|
||
def nlargest(self, n: int = 5) -> "Series":
|
||
"""
|
||
Return the largest `n` elements.
|
||
|
||
Parameters
|
||
----------
|
||
n : int, default 5
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
The `n` largest values in the Series, sorted in decreasing order.
|
||
|
||
See Also
|
||
--------
|
||
Series.nsmallest: Get the `n` smallest elements.
|
||
Series.sort_values: Sort Series by values.
|
||
Series.head: Return the first `n` rows.
|
||
|
||
Notes
|
||
-----
|
||
Faster than ``.sort_values(ascending=False).head(n)`` for small `n`
|
||
relative to the size of the ``Series`` object.
|
||
|
||
In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer,
|
||
        the two would have the same performance.
|
||
|
||
Examples
|
||
--------
|
||
>>> data = [1, 2, 3, 4, np.nan ,6, 7, 8]
|
||
>>> s = ps.Series(data)
|
||
>>> s
|
||
0 1.0
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
4 NaN
|
||
5 6.0
|
||
6 7.0
|
||
7 8.0
|
||
dtype: float64
|
||
|
||
The `n` largest elements where ``n=5`` by default.
|
||
|
||
>>> s.nlargest()
|
||
7 8.0
|
||
6 7.0
|
||
5 6.0
|
||
3 4.0
|
||
2 3.0
|
||
dtype: float64
|
||
|
||
>>> s.nlargest(n=3)
|
||
7 8.0
|
||
6 7.0
|
||
5 6.0
|
||
dtype: float64
|
||
|
||
|
||
"""
|
||
return self.sort_values(ascending=False).head(n)
|
||
|
||
def append(
|
||
self, to_append: "Series", ignore_index: bool = False, verify_integrity: bool = False
|
||
) -> "Series":
|
||
"""
|
||
Concatenate two or more Series.
|
||
|
||
Parameters
|
||
----------
|
||
to_append : Series or list/tuple of Series
|
||
ignore_index : boolean, default False
|
||
If True, do not use the index labels.
|
||
verify_integrity : boolean, default False
|
||
If True, raise Exception on creating index with duplicates
|
||
|
||
Returns
|
||
-------
|
||
appended : Series
|
||
|
||
Examples
|
||
--------
|
||
>>> s1 = ps.Series([1, 2, 3])
|
||
>>> s2 = ps.Series([4, 5, 6])
|
||
>>> s3 = ps.Series([4, 5, 6], index=[3,4,5])
|
||
|
||
>>> s1.append(s2)
|
||
0 1
|
||
1 2
|
||
2 3
|
||
0 4
|
||
1 5
|
||
2 6
|
||
dtype: int64
|
||
|
||
>>> s1.append(s3)
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
4 5
|
||
5 6
|
||
dtype: int64
|
||
|
||
With ignore_index set to True:
|
||
|
||
>>> s1.append(s2, ignore_index=True)
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
4 5
|
||
5 6
|
||
dtype: int64
|
||
"""
|
||
return first_series(
|
||
self.to_frame().append(to_append.to_frame(), ignore_index, verify_integrity)
|
||
).rename(self.name)
|
||
|
||
def sample(
|
||
self,
|
||
n: Optional[int] = None,
|
||
frac: Optional[float] = None,
|
||
replace: bool = False,
|
||
random_state: Optional[int] = None,
|
||
) -> "Series":
|
||
return first_series(
|
||
self.to_frame().sample(n=n, frac=frac, replace=replace, random_state=random_state)
|
||
).rename(self.name)
|
||
|
||
sample.__doc__ = DataFrame.sample.__doc__
|
||
|
||
@no_type_check
|
||
def hist(self, bins=10, **kwds):
|
||
return self.plot.hist(bins, **kwds)
|
||
|
||
hist.__doc__ = PandasOnSparkPlotAccessor.hist.__doc__
|
||
|
||
def apply(self, func: Callable, args: Sequence[Any] = (), **kwds: Any) -> "Series":
|
||
"""
|
||
Invoke function on values of Series.
|
||
|
||
Can be a Python function that only works on the Series.
|
||
|
||
.. note:: this API executes the function once to infer the type which is
|
||
potentially expensive, for instance, when the dataset is created after
|
||
aggregations or sorting.
|
||
|
||
To avoid this, specify return type in ``func``, for instance, as below:
|
||
|
||
>>> def square(x) -> np.int32:
|
||
... return x ** 2
|
||
|
||
pandas-on-Spark uses return type hint and does not try to infer the type.
|
||
|
||
Parameters
|
||
----------
|
||
func : function
|
||
Python function to apply. Note that type hint for return type is required.
|
||
args : tuple
|
||
Positional arguments passed to func after the series value.
|
||
**kwds
|
||
Additional keyword arguments passed to func.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
|
||
See Also
|
||
--------
|
||
Series.aggregate : Only perform aggregating type operations.
|
||
Series.transform : Only perform transforming type operations.
|
||
DataFrame.apply : The equivalent function for DataFrame.
|
||
|
||
Examples
|
||
--------
|
||
Create a Series with typical summer temperatures for each city.
|
||
|
||
>>> s = ps.Series([20, 21, 12],
|
||
... index=['London', 'New York', 'Helsinki'])
|
||
>>> s
|
||
London 20
|
||
New York 21
|
||
Helsinki 12
|
||
dtype: int64
|
||
|
||
|
||
Square the values by defining a function and passing it as an
|
||
argument to ``apply()``.
|
||
|
||
>>> def square(x) -> np.int64:
|
||
... return x ** 2
|
||
>>> s.apply(square)
|
||
London 400
|
||
New York 441
|
||
Helsinki 144
|
||
dtype: int64
|
||
|
||
|
||
Define a custom function that needs additional positional
|
||
arguments and pass these additional arguments using the
|
||
``args`` keyword
|
||
|
||
>>> def subtract_custom_value(x, custom_value) -> np.int64:
|
||
... return x - custom_value
|
||
|
||
>>> s.apply(subtract_custom_value, args=(5,))
|
||
London 15
|
||
New York 16
|
||
Helsinki 7
|
||
dtype: int64
|
||
|
||
|
||
Define a custom function that takes keyword arguments
|
||
and pass these arguments to ``apply``
|
||
|
||
>>> def add_custom_values(x, **kwargs) -> np.int64:
|
||
... for month in kwargs:
|
||
... x += kwargs[month]
|
||
... return x
|
||
|
||
>>> s.apply(add_custom_values, june=30, july=20, august=25)
|
||
London 95
|
||
New York 96
|
||
Helsinki 87
|
||
dtype: int64
|
||
|
||
|
||
Use a function from the Numpy library
|
||
|
||
>>> def numpy_log(col) -> np.float64:
|
||
... return np.log(col)
|
||
>>> s.apply(numpy_log)
|
||
London 2.995732
|
||
New York 3.044522
|
||
Helsinki 2.484907
|
||
dtype: float64
|
||
|
||
|
||
You can omit the type hint and let pandas-on-Spark infer its type.
|
||
|
||
>>> s.apply(np.log)
|
||
London 2.995732
|
||
New York 3.044522
|
||
Helsinki 2.484907
|
||
dtype: float64
|
||
|
||
"""
|
||
assert callable(func), "the first argument should be a callable function."
|
||
try:
|
||
spec = inspect.getfullargspec(func)
|
||
return_sig = spec.annotations.get("return", None)
|
||
should_infer_schema = return_sig is None
|
||
except TypeError:
|
||
# Falls back to schema inference if it fails to get signature.
|
||
should_infer_schema = True
|
||
|
||
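        # Wrap `func` so it is applied to each pandas Series chunk via `_transform_batch` below.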
apply_each = wraps(func)(lambda s: s.apply(func, args=args, **kwds))
|
||
|
||
if should_infer_schema:
|
||
return self.pandas_on_spark._transform_batch(apply_each, None)
|
||
else:
|
||
sig_return = infer_return_type(func)
|
||
if not isinstance(sig_return, ScalarType):
|
||
raise ValueError(
|
||
"Expected the return type of this function to be of scalar type, "
|
||
"but found type {}".format(sig_return)
|
||
)
|
||
return_type = cast(ScalarType, sig_return)
|
||
return self.pandas_on_spark._transform_batch(apply_each, return_type)
|
||
|
||
# TODO: not all arguments are implemented comparing to pandas' for now.
|
||
def aggregate(self, func: Union[str, List[str]]) -> Union[Scalar, "Series"]:
|
||
"""Aggregate using one or more operations over the specified axis.
|
||
|
||
Parameters
|
||
----------
|
||
func : str or a list of str
|
||
function name(s) as string apply to series.
|
||
|
||
Returns
|
||
-------
|
||
scalar, Series
|
||
The return can be:
|
||
- scalar : when Series.agg is called with single function
|
||
- Series : when Series.agg is called with several functions
|
||
|
||
Notes
|
||
-----
|
||
`agg` is an alias for `aggregate`. Use the alias.
|
||
|
||
See Also
|
||
--------
|
||
Series.apply : Invoke function on a Series.
|
||
Series.transform : Only perform transforming type operations.
|
||
Series.groupby : Perform operations over groups.
|
||
DataFrame.aggregate : The equivalent function for DataFrame.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4])
|
||
>>> s.agg('min')
|
||
1
|
||
|
||
>>> s.agg(['min', 'max']).sort_index()
|
||
max 4
|
||
min 1
|
||
dtype: int64
|
||
"""
|
||
if isinstance(func, list):
|
||
return first_series(self.to_frame().aggregate(func)).rename(self.name)
|
||
elif isinstance(func, str):
|
||
return getattr(self, func)()
|
||
else:
|
||
raise TypeError("func must be a string or list of strings")
|
||
|
||
agg = aggregate
|
||
|
||
def transpose(self, *args: Any, **kwargs: Any) -> "Series":
|
||
"""
|
||
Return the transpose, which is by definition self.
|
||
|
||
Examples
|
||
--------
|
||
It returns the same object as the transpose of the given series object, which is by
|
||
definition self.
|
||
|
||
>>> s = ps.Series([1, 2, 3])
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 3
|
||
dtype: int64
|
||
|
||
>>> s.transpose()
|
||
0 1
|
||
1 2
|
||
2 3
|
||
dtype: int64
|
||
"""
|
||
return self.copy()
|
||
|
||
T = property(transpose)
|
||
|
||
def transform(
|
||
self,
|
||
func: Union[Callable, List[Callable]],
|
||
axis: Union[int, str] = 0,
|
||
*args: Any,
|
||
**kwargs: Any
|
||
) -> Union["Series", DataFrame]:
|
||
"""
|
||
Call ``func`` producing the same type as `self` with transformed values
|
||
and that has the same axis length as input.
|
||
|
||
.. note:: this API executes the function once to infer the type which is
|
||
potentially expensive, for instance, when the dataset is created after
|
||
aggregations or sorting.
|
||
|
||
To avoid this, specify return type in ``func``, for instance, as below:
|
||
|
||
>>> def square(x) -> np.int32:
|
||
... return x ** 2
|
||
|
||
pandas-on-Spark uses return type hint and does not try to infer the type.
|
||
|
||
Parameters
|
||
----------
|
||
func : function or list
|
||
A function or a list of functions to use for transforming the data.
|
||
axis : int, default 0 or 'index'
|
||
Can only be set to 0 at the moment.
|
||
*args
|
||
Positional arguments to pass to `func`.
|
||
**kwargs
|
||
Keyword arguments to pass to `func`.
|
||
|
||
Returns
|
||
-------
|
||
An instance of the same type with `self` that must have the same length as input.
|
||
|
||
See Also
|
||
--------
|
||
Series.aggregate : Only perform aggregating type operations.
|
||
Series.apply : Invoke function on Series.
|
||
DataFrame.transform : The equivalent function for DataFrame.
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> s = ps.Series(range(3))
|
||
>>> s
|
||
0 0
|
||
1 1
|
||
2 2
|
||
dtype: int64
|
||
|
||
>>> def sqrt(x) -> float:
|
||
... return np.sqrt(x)
|
||
>>> s.transform(sqrt)
|
||
0 0.000000
|
||
1 1.000000
|
||
2 1.414214
|
||
dtype: float64
|
||
|
||
Even though the resulting instance must have the same length as the
|
||
input, it is possible to provide several input functions:
|
||
|
||
>>> def exp(x) -> float:
|
||
... return np.exp(x)
|
||
>>> s.transform([sqrt, exp])
|
||
sqrt exp
|
||
0 0.000000 1.000000
|
||
1 1.000000 2.718282
|
||
2 1.414214 7.389056
|
||
|
||
You can omit the type hint and let pandas-on-Spark infer its type.
|
||
|
||
>>> s.transform([np.sqrt, np.exp])
|
||
sqrt exp
|
||
0 0.000000 1.000000
|
||
1 1.000000 2.718282
|
||
2 1.414214 7.389056
|
||
"""
|
||
axis = validate_axis(axis)
|
||
if axis != 0:
|
||
raise NotImplementedError('axis should be either 0 or "index" currently.')
|
||
|
||
if isinstance(func, list):
|
||
applied = []
|
||
for f in func:
|
||
applied.append(self.apply(f, args=args, **kwargs).rename(f.__name__))
|
||
|
||
internal = self._internal.with_new_columns(applied)
|
||
return DataFrame(internal)
|
||
else:
|
||
return self.apply(func, args=args, **kwargs)
|
||
|
||
def round(self, decimals: int = 0) -> "Series":
|
||
"""
|
||
Round each value in a Series to the given number of decimals.
|
||
|
||
Parameters
|
||
----------
|
||
decimals : int
|
||
Number of decimal places to round to (default: 0).
|
||
If decimals is negative, it specifies the number of
|
||
positions to the left of the decimal point.
|
||
|
||
Returns
|
||
-------
|
||
Series object
|
||
|
||
See Also
|
||
--------
|
||
DataFrame.round
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.Series([0.028208, 0.038683, 0.877076], name='x')
|
||
>>> df
|
||
0 0.028208
|
||
1 0.038683
|
||
2 0.877076
|
||
Name: x, dtype: float64
|
||
|
||
>>> df.round(2)
|
||
0 0.03
|
||
1 0.04
|
||
2 0.88
|
||
Name: x, dtype: float64
|
||
"""
|
||
if not isinstance(decimals, int):
|
||
raise TypeError("decimals must be an integer")
|
||
scol = F.round(self.spark.column, decimals)
|
||
return self._with_new_scol(scol) # TODO: dtype?
|
||
|
||
# TODO: add 'interpolation' parameter.
|
||
def quantile(
|
||
self, q: Union[float, Iterable[float]] = 0.5, accuracy: int = 10000
|
||
) -> Union[Scalar, "Series"]:
|
||
"""
|
||
Return value at the given quantile.
|
||
|
||
.. note:: Unlike pandas', the quantile in pandas-on-Spark is an approximated quantile
|
||
based upon approximate percentile computation because computing quantile across
|
||
a large dataset is extremely expensive.
|
||
|
||
Parameters
|
||
----------
|
||
q : float or array-like, default 0.5 (50% quantile)
|
||
0 <= q <= 1, the quantile(s) to compute.
|
||
accuracy : int, optional
|
||
Default accuracy of approximation. Larger value means better accuracy.
|
||
The relative error can be deduced by 1.0 / accuracy.
|
||
|
||
Returns
|
||
-------
|
||
float or Series
|
||
If the current object is a Series and ``q`` is an array, a Series will be
|
||
returned where the index is ``q`` and the values are the quantiles, otherwise
|
||
a float will be returned.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4, 5])
|
||
>>> s.quantile(.5)
|
||
3.0
|
||
|
||
>>> (s + 1).quantile(.5)
|
||
4.0
|
||
|
||
>>> s.quantile([.25, .5, .75])
|
||
0.25 2.0
|
||
0.50 3.0
|
||
0.75 4.0
|
||
dtype: float64
|
||
|
||
>>> (s + 1).quantile([.25, .5, .75])
|
||
0.25 3.0
|
||
0.50 4.0
|
||
0.75 5.0
|
||
dtype: float64
|
||
"""
|
||
if isinstance(q, Iterable):
|
||
return first_series(
|
||
self.to_frame().quantile(q=q, axis=0, numeric_only=False, accuracy=accuracy)
|
||
).rename(self.name)
|
||
else:
|
||
if not isinstance(accuracy, int):
|
||
raise TypeError(
|
||
"accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
|
||
)
|
||
|
||
if not isinstance(q, float):
|
||
raise TypeError(
|
||
"q must be a float or an array of floats; however, [%s] found." % type(q)
|
||
)
|
||
q_float = cast(float, q)
|
||
if q_float < 0.0 or q_float > 1.0:
|
||
raise ValueError("percentiles should all be in the interval [0, 1].")
|
||
|
||
def quantile(spark_column: Column, spark_type: DataType) -> Column:
|
||
if isinstance(spark_type, (BooleanType, NumericType)):
|
||
return F.percentile_approx(spark_column.cast(DoubleType()), q_float, accuracy)
|
||
else:
|
||
raise TypeError(
|
||
"Could not convert {} ({}) to numeric".format(
|
||
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
|
||
)
|
||
)
|
||
|
||
return self._reduce_for_stat_function(quantile, name="quantile")
|
||
|
||
# TODO: add axis, numeric_only, pct, na_option parameter
|
||
def rank(self, method: str = "average", ascending: bool = True) -> "Series":
|
||
"""
|
||
Compute numerical data ranks (1 through n) along axis. Equal values are
|
||
assigned a rank that is the average of the ranks of those values.
|
||
|
||
        .. note:: the current implementation of rank uses Spark's Window without
            specifying partition specification. This leads to moving all data into
            a single partition in a single machine and could cause serious
            performance degradation. Avoid this method against very large datasets.
|
||
|
||
Parameters
|
||
----------
|
||
method : {'average', 'min', 'max', 'first', 'dense'}
|
||
* average: average rank of group
|
||
* min: lowest rank in group
|
||
* max: highest rank in group
|
||
* first: ranks assigned in order they appear in the array
|
||
* dense: like 'min', but rank always increases by 1 between groups
|
||
ascending : boolean, default True
|
||
False for ranks by high (1) to low (N)
|
||
|
||
Returns
|
||
-------
|
||
ranks : same type as caller
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 2, 3], name='A')
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 2
|
||
3 3
|
||
Name: A, dtype: int64
|
||
|
||
>>> s.rank()
|
||
0 1.0
|
||
1 2.5
|
||
2 2.5
|
||
3 4.0
|
||
Name: A, dtype: float64
|
||
|
||
        If method is set to 'min', it uses the lowest rank in the group.
|
||
|
||
>>> s.rank(method='min')
|
||
0 1.0
|
||
1 2.0
|
||
2 2.0
|
||
3 4.0
|
||
Name: A, dtype: float64
|
||
|
||
        If method is set to 'max', it uses the highest rank in the group.
|
||
|
||
>>> s.rank(method='max')
|
||
0 1.0
|
||
1 3.0
|
||
2 3.0
|
||
3 4.0
|
||
Name: A, dtype: float64
|
||
|
||
        If method is set to 'first', ranks are assigned in the order the values appear, without grouping ties.
|
||
|
||
>>> s.rank(method='first')
|
||
0 1.0
|
||
1 2.0
|
||
2 3.0
|
||
3 4.0
|
||
Name: A, dtype: float64
|
||
|
||
        If method is set to 'dense', it leaves no gaps between group ranks.
|
||
|
||
>>> s.rank(method='dense')
|
||
0 1.0
|
||
1 2.0
|
||
2 2.0
|
||
3 3.0
|
||
Name: A, dtype: float64
|
||
"""
|
||
return self._rank(method, ascending).spark.analyzed
|
||
|
||
def _rank(
|
||
self,
|
||
method: str = "average",
|
||
ascending: bool = True,
|
||
*,
|
||
part_cols: Sequence[Union[str, Column]] = ()
|
||
) -> "Series":
|
||
if method not in ["average", "min", "max", "first", "dense"]:
|
||
msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
|
||
raise ValueError(msg)
|
||
|
||
if self._internal.index_level > 1:
|
||
raise ValueError("rank do not support index now")
|
||
|
||
if ascending:
|
||
asc_func = lambda scol: scol.asc()
|
||
else:
|
||
asc_func = lambda scol: scol.desc()
|
||
|
||
if method == "first":
|
||
window = (
|
||
Window.orderBy(
|
||
asc_func(self.spark.column),
|
||
asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
|
||
)
|
||
.partitionBy(*part_cols)
|
||
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
|
||
)
|
||
scol = F.row_number().over(window)
|
||
elif method == "dense":
|
||
window = (
|
||
Window.orderBy(asc_func(self.spark.column))
|
||
.partitionBy(*part_cols)
|
||
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
|
||
)
|
||
scol = F.dense_rank().over(window)
|
||
else:
|
||
if method == "average":
|
||
stat_func = F.mean
|
||
elif method == "min":
|
||
stat_func = F.min
|
||
elif method == "max":
|
||
stat_func = F.max
|
||
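            # window1 numbers rows by value order; window2 spans all rows sharing the same value,
            # so stat_func (mean/min/max) over those row numbers yields the tie-broken rank.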
window1 = (
|
||
Window.orderBy(asc_func(self.spark.column))
|
||
.partitionBy(*part_cols)
|
||
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
|
||
)
|
||
window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
|
||
Window.unboundedPreceding, Window.unboundedFollowing
|
||
)
|
||
scol = stat_func(F.row_number().over(window1)).over(window2)
|
||
psser = self._with_new_scol(scol)
|
||
return psser.astype(np.float64)
|
||
|
||
def filter(
|
||
self,
|
||
items: Optional[Sequence[Any]] = None,
|
||
like: Optional[str] = None,
|
||
regex: Optional[str] = None,
|
||
axis: Optional[Union[int, str]] = None,
|
||
) -> "Series":
|
||
axis = validate_axis(axis)
|
||
if axis == 1:
|
||
raise ValueError("Series does not support columns axis.")
|
||
return first_series(
|
||
self.to_frame().filter(items=items, like=like, regex=regex, axis=axis)
|
||
).rename(self.name)
|
||
|
||
filter.__doc__ = DataFrame.filter.__doc__
|
||
|
||
def describe(self, percentiles: Optional[List[float]] = None) -> "Series":
|
||
return first_series(self.to_frame().describe(percentiles)).rename(self.name)
|
||
|
||
describe.__doc__ = DataFrame.describe.__doc__
|
||
|
||
def diff(self, periods: int = 1) -> "Series":
|
||
"""
|
||
First discrete difference of element.
|
||
|
||
Calculates the difference of a Series element compared with another element in the
|
||
DataFrame (default is the element in the same column of the previous row).
|
||
|
||
        .. note:: the current implementation of diff uses Spark's Window without
            specifying partition specification. This leads to moving all data into
            a single partition in a single machine and could cause serious
            performance degradation. Avoid this method against very large datasets.
|
||
|
||
Parameters
|
||
----------
|
||
periods : int, default 1
|
||
Periods to shift for calculating difference, accepts negative values.
|
||
|
||
Returns
|
||
-------
|
||
diffed : Series
|
||
|
||
Examples
|
||
--------
|
||
>>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6],
|
||
... 'b': [1, 1, 2, 3, 5, 8],
|
||
... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])
|
||
>>> df
|
||
a b c
|
||
0 1 1 1
|
||
1 2 1 4
|
||
2 3 2 9
|
||
3 4 3 16
|
||
4 5 5 25
|
||
5 6 8 36
|
||
|
||
>>> df.b.diff()
|
||
0 NaN
|
||
1 0.0
|
||
2 1.0
|
||
3 1.0
|
||
4 2.0
|
||
5 3.0
|
||
Name: b, dtype: float64
|
||
|
||
Difference with previous value
|
||
|
||
>>> df.c.diff(periods=3)
|
||
0 NaN
|
||
1 NaN
|
||
2 NaN
|
||
3 15.0
|
||
4 21.0
|
||
5 27.0
|
||
Name: c, dtype: float64
|
||
|
||
Difference with following value
|
||
|
||
>>> df.c.diff(periods=-1)
|
||
0 -3.0
|
||
1 -5.0
|
||
2 -7.0
|
||
3 -9.0
|
||
4 -11.0
|
||
5 NaN
|
||
Name: c, dtype: float64
|
||
"""
|
||
return self._diff(periods).spark.analyzed
|
||
|
||
    def _diff(self, periods: int, *, part_cols: Sequence[Union[str, Column]] = ()) -> "Series":
        if not isinstance(periods, int):
            raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__)
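        # The difference is taken against the value `periods` rows earlier (later for negative
        # periods), obtained with F.lag over the natural row order.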
        window = (
            Window.partitionBy(*part_cols)
            .orderBy(NATURAL_ORDER_COLUMN_NAME)
            .rowsBetween(-periods, -periods)
        )
        scol = self.spark.column - F.lag(self.spark.column, periods).over(window)
        return self._with_new_scol(scol, field=self._internal.data_fields[0].copy(nullable=True))

def idxmax(self, skipna: bool = True) -> Union[Tuple, Any]:
|
||
"""
|
||
Return the row label of the maximum value.
|
||
|
||
If multiple values equal the maximum, the first row label with that
|
||
value is returned.
|
||
|
||
Parameters
|
||
----------
|
||
skipna : bool, default True
|
||
Exclude NA/null values. If the entire Series is NA, the result
|
||
will be NA.
|
||
|
||
Returns
|
||
-------
|
||
Index
|
||
Label of the maximum value.
|
||
|
||
Raises
|
||
------
|
||
ValueError
|
||
If the Series is empty.
|
||
|
||
See Also
|
||
--------
|
||
Series.idxmin : Return index *label* of the first occurrence
|
||
of minimum of values.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(data=[1, None, 4, 3, 5],
|
||
... index=['A', 'B', 'C', 'D', 'E'])
|
||
>>> s
|
||
A 1.0
|
||
B NaN
|
||
C 4.0
|
||
D 3.0
|
||
E 5.0
|
||
dtype: float64
|
||
|
||
>>> s.idxmax()
|
||
'E'
|
||
|
||
If `skipna` is False and there is an NA value in the data,
|
||
the function returns ``nan``.
|
||
|
||
>>> s.idxmax(skipna=False)
|
||
nan
|
||
|
||
In case of multi-index, you get a tuple:
|
||
|
||
>>> index = pd.MultiIndex.from_arrays([
|
||
... ['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']], names=('first', 'second'))
|
||
>>> s = ps.Series(data=[1, None, 4, 5], index=index)
|
||
>>> s
|
||
first second
|
||
a c 1.0
|
||
d NaN
|
||
b e 4.0
|
||
f 5.0
|
||
dtype: float64
|
||
|
||
>>> s.idxmax()
|
||
('b', 'f')
|
||
|
||
If multiple values equal the maximum, the first row label with that
|
||
value is returned.
|
||
|
||
>>> s = ps.Series([1, 100, 1, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])
|
||
>>> s
|
||
10 1
|
||
3 100
|
||
5 1
|
||
2 100
|
||
1 1
|
||
8 100
|
||
dtype: int64
|
||
|
||
>>> s.idxmax()
|
||
3
|
||
"""
|
||
sdf = self._internal.spark_frame
|
||
scol = self.spark.column
|
||
index_scols = self._internal.index_spark_columns
|
||
# desc_nulls_(last|first) is used via Py4J directly because
|
||
# it's not supported in Spark 2.3.
|
||
if skipna:
|
||
sdf = sdf.orderBy(Column(scol._jc.desc_nulls_last()), NATURAL_ORDER_COLUMN_NAME)
|
||
else:
|
||
sdf = sdf.orderBy(Column(scol._jc.desc_nulls_first()), NATURAL_ORDER_COLUMN_NAME)
|
||
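        # After sorting by value (nulls handled per `skipna`), the first row's index labels
        # identify the maximum.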
results = sdf.select([scol] + index_scols).take(1)
|
||
if len(results) == 0:
|
||
raise ValueError("attempt to get idxmin of an empty sequence")
|
||
if results[0][0] is None:
|
||
            # This will only happen when skipna is False because we
            # place nulls first.
|
||
return np.nan
|
||
values = list(results[0][1:])
|
||
if len(values) == 1:
|
||
return values[0]
|
||
else:
|
||
return tuple(values)
|
||
|
||
def idxmin(self, skipna: bool = True) -> Union[Tuple, Any]:
|
||
"""
|
||
Return the row label of the minimum value.
|
||
|
||
If multiple values equal the minimum, the first row label with that
|
||
value is returned.
|
||
|
||
Parameters
|
||
----------
|
||
skipna : bool, default True
|
||
Exclude NA/null values. If the entire Series is NA, the result
|
||
will be NA.
|
||
|
||
Returns
|
||
-------
|
||
Index
|
||
Label of the minimum value.
|
||
|
||
Raises
|
||
------
|
||
ValueError
|
||
If the Series is empty.
|
||
|
||
See Also
|
||
--------
|
||
Series.idxmax : Return index *label* of the first occurrence
|
||
of maximum of values.
|
||
|
||
Notes
|
||
-----
|
||
This method is the Series version of ``ndarray.argmin``. This method
|
||
returns the label of the minimum, while ``ndarray.argmin`` returns
|
||
the position. To get the position, use ``series.values.argmin()``.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(data=[1, None, 4, 0],
|
||
... index=['A', 'B', 'C', 'D'])
|
||
>>> s
|
||
A 1.0
|
||
B NaN
|
||
C 4.0
|
||
D 0.0
|
||
dtype: float64
|
||
|
||
>>> s.idxmin()
|
||
'D'
|
||
|
||
If `skipna` is False and there is an NA value in the data,
|
||
the function returns ``nan``.
|
||
|
||
>>> s.idxmin(skipna=False)
|
||
nan
|
||
|
||
In case of multi-index, you get a tuple:
|
||
|
||
>>> index = pd.MultiIndex.from_arrays([
|
||
... ['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']], names=('first', 'second'))
|
||
>>> s = ps.Series(data=[1, None, 4, 0], index=index)
|
||
>>> s
|
||
first second
|
||
a c 1.0
|
||
d NaN
|
||
b e 4.0
|
||
f 0.0
|
||
dtype: float64
|
||
|
||
>>> s.idxmin()
|
||
('b', 'f')
|
||
|
||
If multiple values equal the minimum, the first row label with that
|
||
value is returned.
|
||
|
||
>>> s = ps.Series([1, 100, 1, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])
|
||
>>> s
|
||
10 1
|
||
3 100
|
||
5 1
|
||
2 100
|
||
1 1
|
||
8 100
|
||
dtype: int64
|
||
|
||
>>> s.idxmin()
|
||
10
|
||
"""
|
||
sdf = self._internal.spark_frame
|
||
scol = self.spark.column
|
||
index_scols = self._internal.index_spark_columns
|
||
# asc_nulls_(last|first)is used via Py4J directly because
|
||
# it's not supported in Spark 2.3.
|
||
if skipna:
|
||
sdf = sdf.orderBy(Column(scol._jc.asc_nulls_last()), NATURAL_ORDER_COLUMN_NAME)
|
||
else:
|
||
sdf = sdf.orderBy(Column(scol._jc.asc_nulls_first()), NATURAL_ORDER_COLUMN_NAME)
|
||
results = sdf.select([scol] + index_scols).take(1)
|
||
if len(results) == 0:
|
||
raise ValueError("attempt to get idxmin of an empty sequence")
|
||
if results[0][0] is None:
|
||
            # This will only happen when skipna is False because we
            # place nulls first.
|
||
return np.nan
|
||
values = list(results[0][1:])
|
||
if len(values) == 1:
|
||
return values[0]
|
||
else:
|
||
return tuple(values)
|
||
|
||
def pop(self, item: Union[Any, Tuple]) -> Union["Series", Scalar]:
|
||
"""
|
||
Return item and drop from series.
|
||
|
||
Parameters
|
||
----------
|
||
item : label
|
||
Label of index to be popped.
|
||
|
||
Returns
|
||
-------
|
||
Value that is popped from series.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(data=np.arange(3), index=['A', 'B', 'C'])
|
||
>>> s
|
||
A 0
|
||
B 1
|
||
C 2
|
||
dtype: int64
|
||
|
||
>>> s.pop('A')
|
||
0
|
||
|
||
>>> s
|
||
B 1
|
||
C 2
|
||
dtype: int64
|
||
|
||
>>> s = ps.Series(data=np.arange(3), index=['A', 'A', 'C'])
|
||
>>> s
|
||
A 0
|
||
A 1
|
||
C 2
|
||
dtype: int64
|
||
|
||
>>> s.pop('A')
|
||
A 0
|
||
A 1
|
||
dtype: int64
|
||
|
||
>>> s
|
||
C 2
|
||
dtype: int64
|
||
|
||
Also support for MultiIndex
|
||
|
||
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
|
||
... ['speed', 'weight', 'length']],
|
||
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
||
>>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
|
||
... index=midx)
|
||
>>> s
|
||
lama speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.pop('lama')
|
||
speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
dtype: float64
|
||
|
||
>>> s
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
        Also supports a MultiIndex with several levels.
|
||
|
||
>>> midx = pd.MultiIndex([['a', 'b', 'c'],
|
||
... ['lama', 'cow', 'falcon'],
|
||
... ['speed', 'weight', 'length']],
|
||
... [[0, 0, 0, 0, 0, 0, 1, 1, 1],
|
||
... [0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 1, 2, 0, 1, 2, 0, 0, 2]]
|
||
... )
|
||
>>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
|
||
... index=midx)
|
||
>>> s
|
||
a lama speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
b falcon speed 320.0
|
||
speed 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.pop(('a', 'lama'))
|
||
speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
dtype: float64
|
||
|
||
>>> s
|
||
a cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
b falcon speed 320.0
|
||
speed 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.pop(('b', 'falcon', 'speed'))
|
||
(b, falcon, speed) 320.0
|
||
(b, falcon, speed) 1.0
|
||
dtype: float64
|
||
"""
|
||
if not is_name_like_value(item):
|
||
raise TypeError("'key' should be string or tuple that contains strings")
|
||
if not is_name_like_tuple(item):
|
||
item = (item,)
|
||
if self._internal.index_level < len(item):
|
||
raise KeyError(
|
||
"Key length ({}) exceeds index depth ({})".format(
|
||
len(item), self._internal.index_level
|
||
)
|
||
)
|
||
|
||
internal = self._internal
|
||
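        # Keep the index levels that remain after popping `item`, plus the data column,
        # for the rows matching `item`.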
scols = internal.index_spark_columns[len(item) :] + [self.spark.column]
|
||
rows = [internal.spark_columns[level] == index for level, index in enumerate(item)]
|
||
sdf = internal.spark_frame.filter(reduce(lambda x, y: x & y, rows)).select(scols)
|
||
|
||
psdf = self._drop(item)
|
||
self._update_anchor(psdf)
|
||
|
||
if self._internal.index_level == len(item):
|
||
            # If the popped selection contains a single row, return the scalar value only,
            # without wrapping it in a frame.
|
||
pdf = sdf.limit(2).toPandas()
|
||
length = len(pdf)
|
||
if length == 1:
|
||
return pdf[internal.data_spark_column_names[0]].iloc[0]
|
||
|
||
item_string = name_like_string(item)
|
||
sdf = sdf.withColumn(SPARK_DEFAULT_INDEX_NAME, F.lit(str(item_string)))
|
||
internal = InternalFrame(
|
||
spark_frame=sdf,
|
||
index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
|
||
column_labels=[self._column_label],
|
||
data_fields=[self._internal.data_fields[0]],
|
||
)
|
||
return first_series(DataFrame(internal))
|
||
else:
|
||
internal = internal.copy(
|
||
spark_frame=sdf,
|
||
index_spark_columns=[
|
||
scol_for(sdf, col) for col in internal.index_spark_column_names[len(item) :]
|
||
],
|
||
index_fields=internal.index_fields[len(item) :],
|
||
index_names=self._internal.index_names[len(item) :],
|
||
data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])],
|
||
)
|
||
return first_series(DataFrame(internal))
|
||
|
||
def copy(self, deep: bool = True) -> "Series":
|
||
"""
|
||
Make a copy of this object's indices and data.
|
||
|
||
Parameters
|
||
----------
|
||
deep : bool, default True
|
||
            This parameter is not supported but just a dummy parameter to match pandas.
|
||
|
||
Returns
|
||
-------
|
||
copy : Series
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2], index=["a", "b"])
|
||
>>> s
|
||
a 1
|
||
b 2
|
||
dtype: int64
|
||
>>> s_copy = s.copy()
|
||
>>> s_copy
|
||
a 1
|
||
b 2
|
||
dtype: int64
|
||
"""
|
||
return self._psdf.copy(deep=deep)._psser_for(self._column_label)
|
||
|
||
def mode(self, dropna: bool = True) -> "Series":
|
||
"""
|
||
Return the mode(s) of the dataset.
|
||
|
||
Always returns Series even if only one value is returned.
|
||
|
||
Parameters
|
||
----------
|
||
dropna : bool, default True
|
||
Don't consider counts of NaN/NaT.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Modes of the Series.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan])
|
||
>>> s
|
||
0 0.0
|
||
1 0.0
|
||
2 1.0
|
||
3 1.0
|
||
4 1.0
|
||
5 NaN
|
||
6 NaN
|
||
7 NaN
|
||
dtype: float64
|
||
|
||
>>> s.mode()
|
||
0 1.0
|
||
dtype: float64
|
||
|
||
If there are several same modes, all items are shown
|
||
|
||
>>> s = ps.Series([0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
|
||
... np.nan, np.nan, np.nan])
|
||
>>> s
|
||
0 0.0
|
||
1 0.0
|
||
2 1.0
|
||
3 1.0
|
||
4 1.0
|
||
5 2.0
|
||
6 2.0
|
||
7 2.0
|
||
8 3.0
|
||
9 3.0
|
||
10 3.0
|
||
11 NaN
|
||
12 NaN
|
||
13 NaN
|
||
dtype: float64
|
||
|
||
>>> s.mode().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||
<BLANKLINE>
|
||
... 1.0
|
||
... 2.0
|
||
... 3.0
|
||
dtype: float64
|
||
|
||
With 'dropna' set to 'False', we can also see NaN in the result
|
||
|
||
>>> s.mode(False).sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
|
||
<BLANKLINE>
|
||
... 1.0
|
||
... 2.0
|
||
... 3.0
|
||
... NaN
|
||
dtype: float64
|
||
"""
|
||
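        # The mode(s) are the values whose count equals the maximum count from value_counts.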
ser_count = self.value_counts(dropna=dropna, sort=False)
|
||
sdf_count = ser_count._internal.spark_frame
|
||
most_value = ser_count.max()
|
||
sdf_most_value = sdf_count.filter("count == {}".format(most_value))
|
||
sdf = sdf_most_value.select(
|
||
F.col(SPARK_DEFAULT_INDEX_NAME).alias(SPARK_DEFAULT_SERIES_NAME)
|
||
)
|
||
internal = InternalFrame(spark_frame=sdf, index_spark_columns=None, column_labels=[None])
|
||
|
||
return first_series(DataFrame(internal))
|
||
|
||
def keys(self) -> "ps.Index":
|
||
"""
|
||
Return alias for index.
|
||
|
||
Returns
|
||
-------
|
||
Index
|
||
Index of the Series.
|
||
|
||
Examples
|
||
--------
|
||
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
|
||
... ['speed', 'weight', 'length']],
|
||
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
||
>>> psser = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
|
||
|
||
>>> psser.keys() # doctest: +SKIP
|
||
MultiIndex([( 'lama', 'speed'),
|
||
( 'lama', 'weight'),
|
||
( 'lama', 'length'),
|
||
( 'cow', 'speed'),
|
||
( 'cow', 'weight'),
|
||
( 'cow', 'length'),
|
||
('falcon', 'speed'),
|
||
('falcon', 'weight'),
|
||
('falcon', 'length')],
|
||
)
|
||
"""
|
||
return self.index
|
||
|
||
# TODO: 'regex', 'method' parameter
|
||
def replace(
|
||
self,
|
||
to_replace: Optional[Union[Any, List, Tuple, Dict]] = None,
|
||
value: Optional[Union[List, Tuple]] = None,
|
||
regex: bool = False,
|
||
) -> "Series":
|
||
"""
|
||
Replace values given in to_replace with value.
|
||
Values of the Series are replaced with other values dynamically.
|
||
|
||
Parameters
|
||
----------
|
||
to_replace : str, list, tuple, dict, Series, int, float, or None
|
||
How to find the values that will be replaced.
|
||
* numeric, str:
|
||
|
||
- numeric: numeric values equal to to_replace will be replaced with value
|
||
- str: string exactly matching to_replace will be replaced with value
|
||
|
||
* list of str or numeric:
|
||
|
||
- if to_replace and value are both lists or tuples, they must be the same length.
|
||
- str and numeric rules apply as above.
|
||
|
||
* dict:
|
||
|
||
- Dicts can be used to specify different replacement values for different
|
||
existing values.
|
||
For example, {'a': 'b', 'y': 'z'} replaces the value ‘a’ with ‘b’ and ‘y’
|
||
with ‘z’. To use a dict in this way the value parameter should be None.
|
||
- For a DataFrame a dict can specify that different values should be replaced
|
||
in different columns. For example, {'a': 1, 'b': 'z'} looks for the value 1
|
||
in column ‘a’ and the value ‘z’ in column ‘b’ and replaces these values with
|
||
whatever is specified in value.
|
||
The value parameter should not be None in this case.
|
||
You can treat this as a special case of passing two lists except that you are
|
||
specifying the column to search in.
|
||
|
||
See the examples section for examples of each of these.
|
||
|
||
value : scalar, dict, list, tuple, str default None
|
||
Value to replace any values matching to_replace with.
|
||
For a DataFrame a dict of values can be used to specify which value to use
|
||
for each column (columns not in the dict will not be filled).
|
||
Regular expressions, strings and lists or dicts of such objects are also allowed.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Object after replacement.
|
||
|
||
Examples
|
||
--------
|
||
|
||
Scalar `to_replace` and `value`
|
||
|
||
>>> s = ps.Series([0, 1, 2, 3, 4])
|
||
>>> s
|
||
0 0
|
||
1 1
|
||
2 2
|
||
3 3
|
||
4 4
|
||
dtype: int64
|
||
|
||
>>> s.replace(0, 5)
|
||
0 5
|
||
1 1
|
||
2 2
|
||
3 3
|
||
4 4
|
||
dtype: int64
|
||
|
||
List-like `to_replace`
|
||
|
||
>>> s.replace([0, 4], 5000)
|
||
0 5000
|
||
1 1
|
||
2 2
|
||
3 3
|
||
4 5000
|
||
dtype: int64
|
||
|
||
>>> s.replace([1, 2, 3], [10, 20, 30])
|
||
0 0
|
||
1 10
|
||
2 20
|
||
3 30
|
||
4 4
|
||
dtype: int64
|
||
|
||
Dict-like `to_replace`
|
||
|
||
>>> s.replace({1: 1000, 2: 2000, 3: 3000, 4: 4000})
|
||
0 0
|
||
1 1000
|
||
2 2000
|
||
3 3000
|
||
4 4000
|
||
dtype: int64
|
||
|
||
Also support for MultiIndex
|
||
|
||
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
|
||
... ['speed', 'weight', 'length']],
|
||
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
||
>>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
|
||
... index=midx)
|
||
>>> s
|
||
lama speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.replace(45, 450)
|
||
lama speed 450.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.replace([45, 30, 320], 500)
|
||
lama speed 500.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 500.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 500.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
>>> s.replace({45: 450, 30: 300})
|
||
lama speed 450.0
|
||
weight 200.0
|
||
length 1.2
|
||
cow speed 300.0
|
||
weight 250.0
|
||
length 1.5
|
||
falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
"""
|
||
if to_replace is None:
|
||
return self.fillna(method="ffill")
|
||
if not isinstance(to_replace, (str, list, tuple, dict, int, float)):
|
||
raise TypeError("'to_replace' should be one of str, list, tuple, dict, int, float")
|
||
if regex:
|
||
raise NotImplementedError("replace currently not support for regex")
|
||
to_replace = list(to_replace) if isinstance(to_replace, tuple) else to_replace
|
||
value = list(value) if isinstance(value, tuple) else value
|
||
if isinstance(to_replace, list) and isinstance(value, list):
|
||
if not len(to_replace) == len(value):
|
||
raise ValueError(
|
||
"Replacement lists must match in length. Expecting {} got {}".format(
|
||
len(to_replace), len(value)
|
||
)
|
||
)
|
||
to_replace = {k: v for k, v in zip(to_replace, value)}
|
||
if isinstance(to_replace, dict):
|
||
is_start = True
|
||
if len(to_replace) == 0:
|
||
current = self.spark.column
|
||
else:
|
||
for to_replace_, value in to_replace.items():
|
||
cond = (
|
||
(F.isnan(self.spark.column) | self.spark.column.isNull())
|
||
if pd.isna(to_replace_)
|
||
else (self.spark.column == F.lit(to_replace_))
|
||
)
|
||
if is_start:
|
||
current = F.when(cond, value)
|
||
is_start = False
|
||
else:
|
||
current = current.when(cond, value)
|
||
current = current.otherwise(self.spark.column)
|
||
else:
|
||
cond = self.spark.column.isin(to_replace)
|
||
# to_replace may be a scalar
|
||
if np.array(pd.isna(to_replace)).any():
|
||
cond = cond | F.isnan(self.spark.column) | self.spark.column.isNull()
|
||
current = F.when(cond, value).otherwise(self.spark.column)
|
||
|
||
return self._with_new_scol(current) # TODO: dtype?
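        # Note on the logic above: the branches assemble one chained conditional column.
        # For instance, a hypothetical call s.replace({1: 10, 2: 20}) builds roughly
        #     F.when(col == 1, 10).when(col == 2, 20).otherwise(col)
        # while NaN keys are matched with F.isnan()/isNull() instead of an equality check.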
|
||
|
||
def update(self, other: "Series") -> None:
|
||
"""
|
||
Modify Series in place using non-NA values from passed Series. Aligns on index.
|
||
|
||
Parameters
|
||
----------
|
||
other : Series
|
||
|
||
Examples
|
||
--------
|
||
>>> from pyspark.pandas.config import set_option, reset_option
|
||
>>> set_option("compute.ops_on_diff_frames", True)
|
||
>>> s = ps.Series([1, 2, 3])
|
||
>>> s.update(ps.Series([4, 5, 6]))
|
||
>>> s.sort_index()
|
||
0 4
|
||
1 5
|
||
2 6
|
||
dtype: int64
|
||
|
||
>>> s = ps.Series(['a', 'b', 'c'])
|
||
>>> s.update(ps.Series(['d', 'e'], index=[0, 2]))
|
||
>>> s.sort_index()
|
||
0 d
|
||
1 b
|
||
2 e
|
||
dtype: object
|
||
|
||
>>> s = ps.Series([1, 2, 3])
|
||
>>> s.update(ps.Series([4, 5, 6, 7, 8]))
|
||
>>> s.sort_index()
|
||
0 4
|
||
1 5
|
||
2 6
|
||
dtype: int64
|
||
|
||
>>> s = ps.Series([1, 2, 3], index=[10, 11, 12])
|
||
>>> s
|
||
10 1
|
||
11 2
|
||
12 3
|
||
dtype: int64
|
||
|
||
>>> s.update(ps.Series([4, 5, 6]))
|
||
>>> s.sort_index()
|
||
10 1
|
||
11 2
|
||
12 3
|
||
dtype: int64
|
||
|
||
>>> s.update(ps.Series([4, 5, 6], index=[11, 12, 13]))
|
||
>>> s.sort_index()
|
||
10 1
|
||
11 4
|
||
12 5
|
||
dtype: int64
|
||
|
||
If ``other`` contains NaNs the corresponding values are not updated
|
||
in the original Series.
|
||
|
||
>>> s = ps.Series([1, 2, 3])
|
||
>>> s.update(ps.Series([4, np.nan, 6]))
|
||
>>> s.sort_index()
|
||
0 4.0
|
||
1 2.0
|
||
2 6.0
|
||
dtype: float64
|
||
|
||
>>> reset_option("compute.ops_on_diff_frames")
|
||
"""
|
||
if not isinstance(other, Series):
|
||
raise TypeError("'other' must be a Series")
|
||
|
||
combined = combine_frames(self._psdf, other._psdf, how="leftouter")
|
||
|
||
this_scol = combined["this"]._internal.spark_column_for(self._column_label)
|
||
that_scol = combined["that"]._internal.spark_column_for(other._column_label)
|
||
|
||
scol = (
|
||
F.when(that_scol.isNotNull(), that_scol)
|
||
.otherwise(this_scol)
|
||
.alias(self._psdf._internal.spark_column_name_for(self._column_label))
|
||
)
|
||
|
||
internal = combined["this"]._internal.with_new_spark_column(
|
||
self._column_label, scol # TODO: dtype?
|
||
)
|
||
|
||
self._psdf._update_internal_frame(internal.resolved_copy, requires_same_anchor=False)
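        # Conceptually, the update above is a left outer join of ``other`` onto ``self`` on
        # the index, followed by a coalesce: take ``other``'s value where it is not null,
        # otherwise keep the original value, then swap the combined column back in place.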
|
||
|
||
def where(self, cond: "Series", other: Any = np.nan) -> "Series":
|
||
"""
|
||
Replace values where the condition is False.
|
||
|
||
Parameters
|
||
----------
|
||
cond : boolean Series
|
||
Where cond is True, keep the original value. Where False,
|
||
replace with corresponding value from other.
|
||
other : scalar, Series
|
||
Entries where cond is False are replaced with corresponding value from other.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> from pyspark.pandas.config import set_option, reset_option
|
||
>>> set_option("compute.ops_on_diff_frames", True)
|
||
>>> s1 = ps.Series([0, 1, 2, 3, 4])
|
||
>>> s2 = ps.Series([100, 200, 300, 400, 500])
|
||
>>> s1.where(s1 > 0).sort_index()
|
||
0 NaN
|
||
1 1.0
|
||
2 2.0
|
||
3 3.0
|
||
4 4.0
|
||
dtype: float64
|
||
|
||
>>> s1.where(s1 > 1, 10).sort_index()
|
||
0 10
|
||
1 10
|
||
2 2
|
||
3 3
|
||
4 4
|
||
dtype: int64
|
||
|
||
>>> s1.where(s1 > 1, s1 + 100).sort_index()
|
||
0 100
|
||
1 101
|
||
2 2
|
||
3 3
|
||
4 4
|
||
dtype: int64
|
||
|
||
>>> s1.where(s1 > 1, s2).sort_index()
|
||
0 100
|
||
1 200
|
||
2 2
|
||
3 3
|
||
4 4
|
||
dtype: int64
|
||
|
||
>>> reset_option("compute.ops_on_diff_frames")
|
||
"""
|
||
assert isinstance(cond, Series)
|
||
|
||
# We should check the DataFrame from both `cond` and `other`.
|
||
should_try_ops_on_diff_frame = not same_anchor(cond, self) or (
|
||
isinstance(other, Series) and not same_anchor(other, self)
|
||
)
|
||
|
||
if should_try_ops_on_diff_frame:
|
||
# Try to perform it with 'compute.ops_on_diff_frame' option.
|
||
psdf = self.to_frame()
|
||
tmp_cond_col = verify_temp_column_name(psdf, "__tmp_cond_col__")
|
||
tmp_other_col = verify_temp_column_name(psdf, "__tmp_other_col__")
|
||
|
||
psdf[tmp_cond_col] = cond
|
||
psdf[tmp_other_col] = other
|
||
|
||
# above logic makes a Spark DataFrame looks like below:
|
||
# +-----------------+---+----------------+-----------------+
|
||
# |__index_level_0__| 0|__tmp_cond_col__|__tmp_other_col__|
|
||
# +-----------------+---+----------------+-----------------+
|
||
# | 0| 0| false| 100|
|
||
# | 1| 1| false| 200|
|
||
# | 3| 3| true| 400|
|
||
# | 2| 2| true| 300|
|
||
# | 4| 4| true| 500|
|
||
# +-----------------+---+----------------+-----------------+
|
||
condition = (
|
||
F.when(
|
||
psdf[tmp_cond_col].spark.column,
|
||
psdf._psser_for(psdf._internal.column_labels[0]).spark.column,
|
||
)
|
||
.otherwise(psdf[tmp_other_col].spark.column)
|
||
.alias(psdf._internal.data_spark_column_names[0])
|
||
)
|
||
|
||
internal = psdf._internal.with_new_columns(
|
||
[condition], column_labels=self._internal.column_labels
|
||
)
|
||
return first_series(DataFrame(internal))
|
||
else:
|
||
if isinstance(other, Series):
|
||
other = other.spark.column
|
||
condition = (
|
||
F.when(cond.spark.column, self.spark.column)
|
||
.otherwise(other)
|
||
.alias(self._internal.data_spark_column_names[0])
|
||
)
|
||
return self._with_new_scol(condition)
|
||
|
||
def mask(self, cond: "Series", other: Any = np.nan) -> "Series":
|
||
"""
|
||
Replace values where the condition is True.
|
||
|
||
Parameters
|
||
----------
|
||
cond : boolean Series
|
||
Where cond is False, keep the original value. Where True,
|
||
replace with corresponding value from other.
|
||
other : scalar, Series
|
||
Entries where cond is True are replaced with corresponding value from other.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> from pyspark.pandas.config import set_option, reset_option
|
||
>>> set_option("compute.ops_on_diff_frames", True)
|
||
>>> s1 = ps.Series([0, 1, 2, 3, 4])
|
||
>>> s2 = ps.Series([100, 200, 300, 400, 500])
|
||
>>> s1.mask(s1 > 0).sort_index()
|
||
0 0.0
|
||
1 NaN
|
||
2 NaN
|
||
3 NaN
|
||
4 NaN
|
||
dtype: float64
|
||
|
||
>>> s1.mask(s1 > 1, 10).sort_index()
|
||
0 0
|
||
1 1
|
||
2 10
|
||
3 10
|
||
4 10
|
||
dtype: int64
|
||
|
||
>>> s1.mask(s1 > 1, s1 + 100).sort_index()
|
||
0 0
|
||
1 1
|
||
2 102
|
||
3 103
|
||
4 104
|
||
dtype: int64
|
||
|
||
>>> s1.mask(s1 > 1, s2).sort_index()
|
||
0 0
|
||
1 1
|
||
2 300
|
||
3 400
|
||
4 500
|
||
dtype: int64
|
||
|
||
>>> reset_option("compute.ops_on_diff_frames")
|
||
"""
|
||
return self.where(cast(Series, ~cond), other)
|
||
|
||
def xs(self, key: Union[Any, Tuple], level: Optional[int] = None) -> "Series":
|
||
"""
|
||
Return cross-section from the Series.
|
||
|
||
This method takes a `key` argument to select data at a particular
|
||
level of a MultiIndex.
|
||
|
||
Parameters
|
||
----------
|
||
key : label or tuple of label
|
||
Label contained in the index, or partially in a MultiIndex.
|
||
level : object, defaults to first n levels (n=1 or len(key))
|
||
In case of a key partially contained in a MultiIndex, indicate
|
||
which levels are used. Levels can be referred by label or position.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Cross-section from the original Series
|
||
corresponding to the selected index levels.
|
||
|
||
Examples
|
||
--------
|
||
>>> midx = pd.MultiIndex([['a', 'b', 'c'],
|
||
... ['lama', 'cow', 'falcon'],
|
||
... ['speed', 'weight', 'length']],
|
||
... [[0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 0, 0, 1, 1, 1, 2, 2, 2],
|
||
... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
|
||
>>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
|
||
... index=midx)
|
||
>>> s
|
||
a lama speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
b cow speed 30.0
|
||
weight 250.0
|
||
length 1.5
|
||
c falcon speed 320.0
|
||
weight 1.0
|
||
length 0.3
|
||
dtype: float64
|
||
|
||
Get values at specified index
|
||
|
||
>>> s.xs('a')
|
||
lama speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
dtype: float64
|
||
|
||
Get values at several indexes
|
||
|
||
>>> s.xs(('a', 'lama'))
|
||
speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
dtype: float64
|
||
|
||
Get values at specified index and level
|
||
|
||
>>> s.xs('lama', level=1)
|
||
a speed 45.0
|
||
weight 200.0
|
||
length 1.2
|
||
dtype: float64
|
||
"""
|
||
if not isinstance(key, tuple):
|
||
key = (key,)
|
||
if level is None:
|
||
level = 0
|
||
|
||
internal = self._internal
|
||
scols = (
|
||
internal.index_spark_columns[:level]
|
||
+ internal.index_spark_columns[level + len(key) :]
|
||
+ [self.spark.column]
|
||
)
|
||
rows = [internal.spark_columns[lvl] == index for lvl, index in enumerate(key, level)]
|
||
sdf = internal.spark_frame.filter(reduce(lambda x, y: x & y, rows)).select(scols)
|
||
|
||
if internal.index_level == len(key):
|
||
# if spark_frame has one column and one data, return data only without frame
|
||
pdf = sdf.limit(2).toPandas()
|
||
length = len(pdf)
|
||
if length == 1:
|
||
return pdf[self._internal.data_spark_column_names[0]].iloc[0]
|
||
|
||
index_spark_column_names = (
|
||
internal.index_spark_column_names[:level]
|
||
+ internal.index_spark_column_names[level + len(key) :]
|
||
)
|
||
index_names = internal.index_names[:level] + internal.index_names[level + len(key) :]
|
||
index_fields = internal.index_fields[:level] + internal.index_fields[level + len(key) :]
|
||
|
||
internal = internal.copy(
|
||
spark_frame=sdf,
|
||
index_spark_columns=[scol_for(sdf, col) for col in index_spark_column_names],
|
||
index_names=index_names,
|
||
index_fields=index_fields,
|
||
data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])],
|
||
)
|
||
return first_series(DataFrame(internal))
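
    # For example (from the docstring above), s.xs(('a', 'lama')) keeps only the rows whose
    # first two index levels equal 'a' and 'lama', drops those two levels, and returns the
    # remaining single-level Series; a key addressing every level that matches a single row
    # is returned as a scalar instead.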
|
||
|
||
def pct_change(self, periods: int = 1) -> "Series":
|
||
"""
|
||
Percentage change between the current and a prior element.
|
||
|
||
        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all the data into a single
            partition on a single machine and could cause serious performance
            degradation. Avoid this method with very large datasets.
|
||
|
||
Parameters
|
||
----------
|
||
periods : int, default 1
|
||
Periods to shift for forming percent change.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> psser = ps.Series([90, 91, 85], index=[2, 4, 1])
|
||
>>> psser
|
||
2 90
|
||
4 91
|
||
1 85
|
||
dtype: int64
|
||
|
||
>>> psser.pct_change()
|
||
2 NaN
|
||
4 0.011111
|
||
1 -0.065934
|
||
dtype: float64
|
||
|
||
>>> psser.sort_index().pct_change()
|
||
1 NaN
|
||
2 0.058824
|
||
4 0.011111
|
||
dtype: float64
|
||
|
||
>>> psser.pct_change(periods=2)
|
||
2 NaN
|
||
4 NaN
|
||
1 -0.055556
|
||
dtype: float64
|
||
"""
|
||
scol = self.spark.column
|
||
|
||
window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-periods, -periods)
|
||
prev_row = F.lag(scol, periods).over(window)
|
||
|
||
return self._with_new_scol((scol - prev_row) / prev_row).spark.analyzed
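
    # The expression above computes, for each row, (x[i] - x[i - periods]) / x[i - periods];
    # e.g. from 90 to 91 this gives (91 - 90) / 90 = 0.0111..., matching the docstring example.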
|
||
|
||
def combine_first(self, other: "Series") -> "Series":
|
||
"""
|
||
Combine Series values, choosing the calling Series's values first.
|
||
|
||
Parameters
|
||
----------
|
||
other : Series
|
||
The value(s) to be combined with the `Series`.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
The result of combining the Series with the other object.
|
||
|
||
See Also
|
||
--------
|
||
Series.combine : Perform elementwise operation on two Series
|
||
using a given function.
|
||
|
||
Notes
|
||
-----
|
||
Result index will be the union of the two indexes.
|
||
|
||
Examples
|
||
--------
|
||
>>> s1 = ps.Series([1, np.nan])
|
||
>>> s2 = ps.Series([3, 4])
|
||
>>> with ps.option_context("compute.ops_on_diff_frames", True):
|
||
... s1.combine_first(s2)
|
||
0 1.0
|
||
1 4.0
|
||
dtype: float64
|
||
"""
|
||
if not isinstance(other, ps.Series):
|
||
raise TypeError("`combine_first` only allows `Series` for parameter `other`")
|
||
if same_anchor(self, other):
|
||
this = self.spark.column
|
||
that = other.spark.column
|
||
combined = self._psdf
|
||
else:
|
||
combined = combine_frames(self._psdf, other._psdf)
|
||
this = combined["this"]._internal.spark_column_for(self._column_label)
|
||
that = combined["that"]._internal.spark_column_for(other._column_label)
|
||
# If `self` has missing value, use value of `other`
|
||
cond = F.when(this.isNull(), that).otherwise(this)
|
||
# If `self` and `other` come from same frame, the anchor should be kept
|
||
if same_anchor(self, other):
|
||
return self._with_new_scol(cond) # TODO: dtype?
|
||
index_scols = combined._internal.index_spark_columns
|
||
sdf = combined._internal.spark_frame.select(
|
||
*index_scols, cond.alias(self._internal.data_spark_column_names[0])
|
||
).distinct()
|
||
internal = self._internal.with_new_sdf(
|
||
sdf, index_fields=combined._internal.index_fields, data_fields=[None] # TODO: dtype?
|
||
)
|
||
return first_series(DataFrame(internal))
|
||
|
||
def dot(self, other: Union["Series", DataFrame]) -> Union[Scalar, "Series"]:
|
||
"""
|
||
Compute the dot product between the Series and the columns of other.
|
||
|
||
This method computes the dot product between the Series and another
|
||
one, or the Series and each columns of a DataFrame.
|
||
|
||
It can also be called using `self @ other` in Python >= 3.5.
|
||
|
||
        .. note:: This API is slightly different from pandas when indexes from both Series
            are not aligned. Matching pandas' behavior would require reading the whole data,
            for example, for counting. pandas raises an exception; however, pandas-on-Spark
            just proceeds, permissively ignoring the mismatches by treating them as NaN.
|
||
|
||
>>> pdf1 = pd.Series([1, 2, 3], index=[0, 1, 2])
|
||
>>> pdf2 = pd.Series([1, 2, 3], index=[0, 1, 3])
|
||
>>> pdf1.dot(pdf2) # doctest: +SKIP
|
||
...
|
||
ValueError: matrices are not aligned
|
||
|
||
>>> psdf1 = ps.Series([1, 2, 3], index=[0, 1, 2])
|
||
>>> psdf2 = ps.Series([1, 2, 3], index=[0, 1, 3])
|
||
>>> psdf1.dot(psdf2) # doctest: +SKIP
|
||
5
|
||
|
||
Parameters
|
||
----------
|
||
other : Series, DataFrame.
|
||
The other object to compute the dot product with its columns.
|
||
|
||
Returns
|
||
-------
|
||
scalar, Series
|
||
Return the dot product of the Series and other if other is a
|
||
Series, the Series of the dot product of Series and each rows of
|
||
other if other is a DataFrame.
|
||
|
||
Notes
|
||
-----
|
||
The Series and other has to share the same index if other is a Series
|
||
or a DataFrame.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([0, 1, 2, 3])
|
||
|
||
>>> s.dot(s)
|
||
14
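
        The result is the sum of the elementwise products: ``0*0 + 1*1 + 2*2 + 3*3 = 14``.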
|
||
|
||
>>> s @ s
|
||
14
|
||
|
||
>>> psdf = ps.DataFrame({'x': [0, 1, 2, 3], 'y': [0, -1, -2, -3]})
|
||
>>> psdf
|
||
x y
|
||
0 0 0
|
||
1 1 -1
|
||
2 2 -2
|
||
3 3 -3
|
||
|
||
>>> with ps.option_context("compute.ops_on_diff_frames", True):
|
||
... s.dot(psdf)
|
||
...
|
||
x 14
|
||
y -14
|
||
dtype: int64
|
||
"""
|
||
if isinstance(other, DataFrame):
|
||
if not same_anchor(self, other):
|
||
if not self.index.sort_values().equals(other.index.sort_values()):
|
||
raise ValueError("matrices are not aligned")
|
||
|
||
other_copy = other.copy() # type: DataFrame
|
||
column_labels = other_copy._internal.column_labels
|
||
|
||
self_column_label = verify_temp_column_name(other_copy, "__self_column__")
|
||
other_copy[self_column_label] = self
|
||
self_psser = other_copy._psser_for(self_column_label)
|
||
|
||
product_pssers = [
|
||
cast(Series, other_copy._psser_for(label) * self_psser) for label in column_labels
|
||
]
|
||
|
||
dot_product_psser = DataFrame(
|
||
other_copy._internal.with_new_columns(product_pssers, column_labels=column_labels)
|
||
).sum()
|
||
|
||
return cast(Series, dot_product_psser).rename(self.name)
|
||
|
||
else:
|
||
assert isinstance(other, Series)
|
||
if not same_anchor(self, other):
|
||
if len(self.index) != len(other.index):
|
||
raise ValueError("matrices are not aligned")
|
||
return (self * other).sum()
|
||
|
||
def __matmul__(self, other: Union["Series", DataFrame]) -> Union[Scalar, "Series"]:
|
||
"""
|
||
Matrix multiplication using binary `@` operator in Python>=3.5.
|
||
"""
|
||
return self.dot(other)
|
||
|
||
def repeat(self, repeats: Union[int, "Series"]) -> "Series":
|
||
"""
|
||
Repeat elements of a Series.
|
||
|
||
Returns a new Series where each element of the current Series
|
||
is repeated consecutively a given number of times.
|
||
|
||
Parameters
|
||
----------
|
||
repeats : int or Series
|
||
The number of repetitions for each element. This should be a
|
||
non-negative integer. Repeating 0 times will return an empty
|
||
Series.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Newly created Series with repeated elements.
|
||
|
||
See Also
|
||
--------
|
||
Index.repeat : Equivalent function for Index.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(['a', 'b', 'c'])
|
||
>>> s
|
||
0 a
|
||
1 b
|
||
2 c
|
||
dtype: object
|
||
>>> s.repeat(2)
|
||
0 a
|
||
1 b
|
||
2 c
|
||
0 a
|
||
1 b
|
||
2 c
|
||
dtype: object
|
||
>>> ps.Series([1, 2, 3]).repeat(0)
|
||
Series([], dtype: int64)
|
||
"""
|
||
if not isinstance(repeats, (int, Series)):
|
||
raise TypeError(
|
||
"`repeats` argument must be integer or Series, but got {}".format(type(repeats))
|
||
)
|
||
|
||
if isinstance(repeats, Series):
|
||
if not same_anchor(self, repeats):
|
||
psdf = self.to_frame()
|
||
temp_repeats = verify_temp_column_name(psdf, "__temp_repeats__")
|
||
psdf[temp_repeats] = repeats
|
||
return (
|
||
psdf._psser_for(psdf._internal.column_labels[0])
|
||
.repeat(psdf[temp_repeats])
|
||
.rename(self.name)
|
||
)
|
||
else:
|
||
scol = F.explode(
|
||
F.array_repeat(self.spark.column, repeats.astype("int32").spark.column)
|
||
).alias(name_like_string(self.name))
|
||
sdf = self._internal.spark_frame.select(self._internal.index_spark_columns + [scol])
|
||
internal = self._internal.copy(
|
||
spark_frame=sdf,
|
||
index_spark_columns=[
|
||
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
||
],
|
||
data_spark_columns=[scol_for(sdf, name_like_string(self.name))],
|
||
)
|
||
return first_series(DataFrame(internal))
|
||
else:
|
||
if repeats < 0:
|
||
raise ValueError("negative dimensions are not allowed")
|
||
|
||
psdf = self._psdf[[self.name]]
|
||
if repeats == 0:
|
||
return first_series(DataFrame(psdf._internal.with_filter(F.lit(False))))
|
||
else:
|
||
return first_series(ps.concat([psdf] * repeats))
|
||
|
||
def asof(self, where: Union[Any, List]) -> Union[Scalar, "Series"]:
|
||
"""
|
||
Return the last row(s) without any NaNs before `where`.
|
||
|
||
The last row (for each element in `where`, if list) without any
|
||
NaN is taken.
|
||
|
||
If there is no good value, NaN is returned.
|
||
|
||
.. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
|
||
which can be expensive.
|
||
|
||
Parameters
|
||
----------
|
||
where : index or array-like of indices
|
||
|
||
Returns
|
||
-------
|
||
scalar or Series
|
||
|
||
The return can be:
|
||
|
||
* scalar : when `self` is a Series and `where` is a scalar
|
||
* Series: when `self` is a Series and `where` is an array-like
|
||
|
||
Return scalar or Series
|
||
|
||
Notes
|
||
-----
|
||
Indices are assumed to be sorted. Raises if this is not the case.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
|
||
>>> s
|
||
10 1.0
|
||
20 2.0
|
||
30 NaN
|
||
40 4.0
|
||
dtype: float64
|
||
|
||
A scalar `where`.
|
||
|
||
>>> s.asof(20)
|
||
2.0
|
||
|
||
For a sequence `where`, a Series is returned. The first value is
|
||
NaN, because the first element of `where` is before the first
|
||
index value.
|
||
|
||
>>> s.asof([5, 20]).sort_index()
|
||
5 NaN
|
||
20 2.0
|
||
dtype: float64
|
||
|
||
Missing values are not considered. The following is ``2.0``, not
|
||
NaN, even though NaN is at the index location for ``30``.
|
||
|
||
>>> s.asof(30)
|
||
2.0
|
||
"""
|
||
should_return_series = True
|
||
if isinstance(self.index, ps.MultiIndex):
|
||
raise ValueError("asof is not supported for a MultiIndex")
|
||
if isinstance(where, (ps.Index, ps.Series, DataFrame)):
|
||
raise ValueError("where cannot be an Index, Series or a DataFrame")
|
||
if not self.index.is_monotonic_increasing:
|
||
raise ValueError("asof requires a sorted index")
|
||
if not is_list_like(where):
|
||
should_return_series = False
|
||
where = [where]
|
||
index_scol = self._internal.index_spark_columns[0]
|
||
index_type = self._internal.spark_type_for(index_scol)
|
||
cond = [
|
||
F.max(F.when(index_scol <= F.lit(index).cast(index_type), self.spark.column))
|
||
for index in where
|
||
]
|
||
sdf = self._internal.spark_frame.select(cond)
|
||
if not should_return_series:
|
||
with sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
|
||
# Disable Arrow to keep row ordering.
|
||
result = cast(pd.DataFrame, sdf.limit(1).toPandas()).iloc[0, 0]
|
||
return result if result is not None else np.nan
|
||
|
||
# The data is expected to be small so it's fine to transpose/use default index.
|
||
with ps.option_context("compute.default_index_type", "distributed", "compute.max_rows", 1):
|
||
psdf = ps.DataFrame(sdf) # type: DataFrame
|
||
psdf.columns = pd.Index(where)
|
||
return first_series(psdf.transpose()).rename(self.name)
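
    # Each entry of ``where`` becomes one aggregate column above, computed with F.max over
    # the values whose index is at or before that entry; the single aggregated row is then
    # transposed back into a Series (or unpacked to a scalar when ``where`` was a scalar).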
|
||
|
||
def mad(self) -> float:
|
||
"""
|
||
Return the mean absolute deviation of values.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4])
|
||
>>> s
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
dtype: int64
|
||
|
||
>>> s.mad()
|
||
1.0
|
||
"""
|
||
|
||
sdf = self._internal.spark_frame
|
||
spark_column = self.spark.column
|
||
avg = unpack_scalar(sdf.select(F.avg(spark_column)))
|
||
mad = unpack_scalar(sdf.select(F.avg(F.abs(spark_column - avg))))
|
||
|
||
return mad
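
    # The two aggregations above follow the definition mad = mean(abs(x - mean(x))); e.g.
    # for [1, 2, 3, 4] the mean is 2.5 and mean(|x - 2.5|) = (1.5 + 0.5 + 0.5 + 1.5) / 4 = 1.0.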
|
||
|
||
def unstack(self, level: int = -1) -> DataFrame:
|
||
"""
|
||
Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
|
||
The level involved will automatically get sorted.
|
||
|
||
Notes
|
||
-----
|
||
        Unlike pandas, pandas-on-Spark doesn't check whether an index is duplicated,
        because checking for duplicated indexes requires scanning the whole data,
        which can be quite expensive.
|
||
|
||
Parameters
|
||
----------
|
||
level : int, str, or list of these, default last level
|
||
Level(s) to unstack, can pass level name.
|
||
|
||
Returns
|
||
-------
|
||
DataFrame
|
||
Unstacked Series.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series([1, 2, 3, 4],
|
||
... index=pd.MultiIndex.from_product([['one', 'two'],
|
||
... ['a', 'b']]))
|
||
>>> s
|
||
one a 1
|
||
b 2
|
||
two a 3
|
||
b 4
|
||
dtype: int64
|
||
|
||
>>> s.unstack(level=-1).sort_index()
|
||
a b
|
||
one 1 2
|
||
two 3 4
|
||
|
||
>>> s.unstack(level=0).sort_index()
|
||
one two
|
||
a 1 3
|
||
b 2 4
|
||
"""
|
||
if not isinstance(self.index, ps.MultiIndex):
|
||
raise ValueError("Series.unstack only support for a MultiIndex")
|
||
index_nlevels = self.index.nlevels
|
||
if level > 0 and (level > index_nlevels - 1):
|
||
raise IndexError(
|
||
"Too many levels: Index has only {} levels, not {}".format(index_nlevels, level + 1)
|
||
)
|
||
elif level < 0 and (level < -index_nlevels):
|
||
raise IndexError(
|
||
"Too many levels: Index has only {} levels, {} is not a valid level number".format(
|
||
index_nlevels, level
|
||
)
|
||
)
|
||
|
||
internal = self._internal.resolved_copy
|
||
|
||
index_map = list(zip(internal.index_spark_column_names, internal.index_names))
|
||
pivot_col, column_label_names = index_map.pop(level)
|
||
index_scol_names, index_names = zip(*index_map)
|
||
col = internal.data_spark_column_names[0]
|
||
|
||
sdf = internal.spark_frame
|
||
sdf = sdf.groupby(list(index_scol_names)).pivot(pivot_col).agg(F.first(scol_for(sdf, col)))
|
||
internal = InternalFrame( # TODO: dtypes?
|
||
spark_frame=sdf,
|
||
index_spark_columns=[scol_for(sdf, col) for col in index_scol_names],
|
||
index_names=list(index_names),
|
||
column_label_names=[column_label_names],
|
||
)
|
||
return DataFrame(internal)
|
||
|
||
def item(self) -> Scalar:
|
||
"""
|
||
Return the first element of the underlying data as a Python scalar.
|
||
|
||
Returns
|
||
-------
|
||
scalar
|
||
The first element of Series.
|
||
|
||
Raises
|
||
------
|
||
ValueError
|
||
If the data is not length-1.
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series([10])
|
||
>>> psser.item()
|
||
10
|
||
"""
|
||
return self.head(2)._to_internal_pandas().item()
|
||
|
||
def iteritems(self) -> Iterable[Tuple[Union[Any, Tuple], Any]]:
|
||
"""
|
||
Lazily iterate over (index, value) tuples.
|
||
|
||
This method returns an iterable tuple (index, value). This is
|
||
convenient if you want to create a lazy iterator.
|
||
|
||
        .. note:: Unlike pandas, iteritems in pandas-on-Spark returns a generator rather
            than a zip object
|
||
|
||
Returns
|
||
-------
|
||
iterable
|
||
Iterable of tuples containing the (index, value) pairs from a
|
||
Series.
|
||
|
||
See Also
|
||
--------
|
||
DataFrame.items : Iterate over (column name, Series) pairs.
|
||
DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.
|
||
|
||
Examples
|
||
--------
|
||
>>> s = ps.Series(['A', 'B', 'C'])
|
||
>>> for index, value in s.items():
|
||
... print("Index : {}, Value : {}".format(index, value))
|
||
Index : 0, Value : A
|
||
Index : 1, Value : B
|
||
Index : 2, Value : C
|
||
"""
|
||
internal_index_columns = self._internal.index_spark_column_names
|
||
internal_data_column = self._internal.data_spark_column_names[0]
|
||
|
||
def extract_kv_from_spark_row(row: Row) -> Tuple[Union[Any, Tuple], Any]:
|
||
k = (
|
||
row[internal_index_columns[0]]
|
||
if len(internal_index_columns) == 1
|
||
else tuple(row[c] for c in internal_index_columns)
|
||
)
|
||
v = row[internal_data_column]
|
||
return k, v
|
||
|
||
for k, v in map(
|
||
extract_kv_from_spark_row, self._internal.resolved_copy.spark_frame.toLocalIterator()
|
||
):
|
||
yield k, v
|
||
|
||
def items(self) -> Iterable[Tuple[Union[Any, Tuple], Any]]:
|
||
"""This is an alias of ``iteritems``."""
|
||
return self.iteritems()
|
||
|
||
def droplevel(self, level: Union[int, Any, Tuple, List[Union[int, Any, Tuple]]]) -> "Series":
|
||
"""
|
||
Return Series with requested index level(s) removed.
|
||
|
||
Parameters
|
||
----------
|
||
level : int, str, or list-like
|
||
If a string is given, must be the name of a level
|
||
If list-like, elements must be names or positional indexes
|
||
of levels.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Series with requested index level(s) removed.
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series(
|
||
... [1, 2, 3],
|
||
... index=pd.MultiIndex.from_tuples(
|
||
... [("x", "a"), ("x", "b"), ("y", "c")], names=["level_1", "level_2"]
|
||
... ),
|
||
... )
|
||
>>> psser
|
||
level_1 level_2
|
||
x a 1
|
||
b 2
|
||
y c 3
|
||
dtype: int64
|
||
|
||
Removing specific index level by level
|
||
|
||
>>> psser.droplevel(0)
|
||
level_2
|
||
a 1
|
||
b 2
|
||
c 3
|
||
dtype: int64
|
||
|
||
Removing specific index level by name
|
||
|
||
>>> psser.droplevel("level_2")
|
||
level_1
|
||
x 1
|
||
x 2
|
||
y 3
|
||
dtype: int64
|
||
"""
|
||
return first_series(self.to_frame().droplevel(level=level, axis=0)).rename(self.name)
|
||
|
||
def tail(self, n: int = 5) -> "Series":
|
||
"""
|
||
Return the last `n` rows.
|
||
|
||
This function returns last `n` rows from the object based on
|
||
position. It is useful for quickly verifying data, for example,
|
||
after sorting or appending rows.
|
||
|
||
For negative values of `n`, this function returns all rows except
|
||
the first `n` rows, equivalent to ``df[n:]``.
|
||
|
||
Parameters
|
||
----------
|
||
n : int, default 5
|
||
Number of rows to select.
|
||
|
||
Returns
|
||
-------
|
||
type of caller
|
||
The last `n` rows of the caller object.
|
||
|
||
See Also
|
||
--------
|
||
DataFrame.head : The first `n` rows of the caller object.
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series([1, 2, 3, 4, 5])
|
||
>>> psser
|
||
0 1
|
||
1 2
|
||
2 3
|
||
3 4
|
||
4 5
|
||
dtype: int64
|
||
|
||
>>> psser.tail(3) # doctest: +SKIP
|
||
2 3
|
||
3 4
|
||
4 5
|
||
dtype: int64
|
||
"""
|
||
return first_series(self.to_frame().tail(n=n)).rename(self.name)
|
||
|
||
def explode(self) -> "Series":
|
||
"""
|
||
Transform each element of a list-like to a row.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Exploded lists to rows; index will be duplicated for these rows.
|
||
|
||
See Also
|
||
--------
|
||
Series.str.split : Split string values on specified separator.
|
||
Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex
|
||
to produce DataFrame.
|
||
DataFrame.melt : Unpivot a DataFrame from wide format to long format.
|
||
DataFrame.explode : Explode a DataFrame from list-like
|
||
columns to long format.
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series([[1, 2, 3], [], [3, 4]])
|
||
>>> psser
|
||
0 [1, 2, 3]
|
||
1 []
|
||
2 [3, 4]
|
||
dtype: object
|
||
|
||
>>> psser.explode() # doctest: +SKIP
|
||
0 1.0
|
||
0 2.0
|
||
0 3.0
|
||
1 NaN
|
||
2 3.0
|
||
2 4.0
|
||
dtype: float64
|
||
"""
|
||
if not isinstance(self.spark.data_type, ArrayType):
|
||
return self.copy()
|
||
|
||
scol = F.explode_outer(self.spark.column).alias(name_like_string(self._column_label))
|
||
|
||
internal = self._internal.with_new_columns([scol], keep_order=False)
|
||
return first_series(DataFrame(internal))
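
    # F.explode_outer (unlike F.explode) emits a row with a null value for empty or null
    # arrays, which is why index 1 in the docstring example above appears as NaN.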
|
||
|
||
def argsort(self) -> "Series":
|
||
"""
|
||
Return the integer indices that would sort the Series values.
|
||
Unlike pandas, the index order is not preserved in the result.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Positions of values within the sort order with -1 indicating
|
||
nan values.
|
||
|
||
Examples
|
||
--------
|
||
>>> psser = ps.Series([3, 3, 4, 1, 6, 2, 3, 7, 8, 7, 10])
|
||
>>> psser
|
||
0 3
|
||
1 3
|
||
2 4
|
||
3 1
|
||
4 6
|
||
5 2
|
||
6 3
|
||
7 7
|
||
8 8
|
||
9 7
|
||
10 10
|
||
dtype: int64
|
||
|
||
>>> psser.argsort().sort_index()
|
||
0 3
|
||
1 5
|
||
2 0
|
||
3 1
|
||
4 6
|
||
5 2
|
||
6 4
|
||
7 7
|
||
8 9
|
||
9 8
|
||
10 10
|
||
dtype: int64
|
||
"""
|
||
notnull = self.loc[self.notnull()]
|
||
|
||
sdf_for_index = notnull._internal.spark_frame.select(notnull._internal.index_spark_columns)
|
||
|
||
tmp_join_key = verify_temp_column_name(sdf_for_index, "__tmp_join_key__")
|
||
sdf_for_index, _ = InternalFrame.attach_distributed_sequence_column(
|
||
sdf_for_index, tmp_join_key
|
||
)
|
||
# sdf_for_index:
|
||
# +----------------+-----------------+
|
||
# |__tmp_join_key__|__index_level_0__|
|
||
# +----------------+-----------------+
|
||
# | 0| 0|
|
||
# | 1| 1|
|
||
# | 2| 2|
|
||
# | 3| 3|
|
||
# | 4| 4|
|
||
# +----------------+-----------------+
|
||
|
||
sdf_for_data = notnull._internal.spark_frame.select(
|
||
notnull.spark.column.alias("values"), NATURAL_ORDER_COLUMN_NAME
|
||
)
|
||
sdf_for_data, _ = InternalFrame.attach_distributed_sequence_column(
|
||
sdf_for_data, SPARK_DEFAULT_SERIES_NAME
|
||
)
|
||
# sdf_for_data:
|
||
# +---+------+-----------------+
|
||
# | 0|values|__natural_order__|
|
||
# +---+------+-----------------+
|
||
# | 0| 3| 25769803776|
|
||
# | 1| 3| 51539607552|
|
||
# | 2| 4| 77309411328|
|
||
# | 3| 1| 103079215104|
|
||
# | 4| 2| 128849018880|
|
||
# +---+------+-----------------+
|
||
|
||
sdf_for_data = sdf_for_data.sort(
|
||
scol_for(sdf_for_data, "values"), NATURAL_ORDER_COLUMN_NAME
|
||
).drop("values", NATURAL_ORDER_COLUMN_NAME)
|
||
|
||
tmp_join_key = verify_temp_column_name(sdf_for_data, "__tmp_join_key__")
|
||
sdf_for_data, _ = InternalFrame.attach_distributed_sequence_column(
|
||
sdf_for_data, tmp_join_key
|
||
)
|
||
# sdf_for_index: sdf_for_data:
|
||
# +----------------+-----------------+ +----------------+---+
|
||
# |__tmp_join_key__|__index_level_0__| |__tmp_join_key__| 0|
|
||
# +----------------+-----------------+ +----------------+---+
|
||
# | 0| 0| | 0| 3|
|
||
# | 1| 1| | 1| 4|
|
||
# | 2| 2| | 2| 0|
|
||
# | 3| 3| | 3| 1|
|
||
# | 4| 4| | 4| 2|
|
||
# +----------------+-----------------+ +----------------+---+
|
||
|
||
sdf = sdf_for_index.join(sdf_for_data, on=tmp_join_key).drop(tmp_join_key)
|
||
|
||
internal = self._internal.with_new_sdf(
|
||
spark_frame=sdf,
|
||
data_columns=[SPARK_DEFAULT_SERIES_NAME],
|
||
index_fields=[
|
||
InternalField(dtype=field.dtype) for field in self._internal.index_fields
|
||
],
|
||
data_fields=[None],
|
||
)
|
||
psser = first_series(DataFrame(internal))
|
||
|
||
return cast(
|
||
Series, ps.concat([psser, self.loc[self.isnull()].spark.transform(lambda _: F.lit(-1))])
|
||
)
|
||
|
||
def argmax(self) -> int:
|
||
"""
|
||
Return int position of the largest value in the Series.
|
||
|
||
If the maximum is achieved in multiple locations,
|
||
the first row position is returned.
|
||
|
||
Returns
|
||
-------
|
||
int
|
||
Row position of the maximum value.
|
||
|
||
Examples
|
||
--------
|
||
Consider dataset containing cereal calories
|
||
|
||
>>> s = ps.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0,
|
||
... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0})
|
||
>>> s # doctest: +SKIP
|
||
Corn Flakes 100.0
|
||
Almond Delight 110.0
|
||
Cinnamon Toast Crunch 120.0
|
||
Cocoa Puff 110.0
|
||
dtype: float64
|
||
|
||
>>> s.argmax() # doctest: +SKIP
|
||
2
|
||
"""
|
||
sdf = self._internal.spark_frame.select(self.spark.column, NATURAL_ORDER_COLUMN_NAME)
|
||
max_value = sdf.select(
|
||
F.max(scol_for(sdf, self._internal.data_spark_column_names[0])),
|
||
F.first(NATURAL_ORDER_COLUMN_NAME),
|
||
).head()
|
||
if max_value[1] is None:
|
||
raise ValueError("attempt to get argmax of an empty sequence")
|
||
elif max_value[0] is None:
|
||
return -1
|
||
# We should remember the natural sequence started from 0
|
||
seq_col_name = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
||
sdf, _ = InternalFrame.attach_distributed_sequence_column(
|
||
sdf.drop(NATURAL_ORDER_COLUMN_NAME), seq_col_name
|
||
)
|
||
# If the maximum is achieved in multiple locations, the first row position is returned.
|
||
return sdf.filter(
|
||
scol_for(sdf, self._internal.data_spark_column_names[0]) == max_value[0]
|
||
).head()[0]
|
||
|
||
def argmin(self) -> int:
|
||
"""
|
||
Return int position of the smallest value in the Series.
|
||
|
||
If the minimum is achieved in multiple locations,
|
||
the first row position is returned.
|
||
|
||
Returns
|
||
-------
|
||
int
|
||
Row position of the minimum value.
|
||
|
||
Examples
|
||
--------
|
||
Consider dataset containing cereal calories
|
||
|
||
>>> s = ps.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0,
|
||
... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0})
|
||
>>> s # doctest: +SKIP
|
||
Corn Flakes 100.0
|
||
Almond Delight 110.0
|
||
Cinnamon Toast Crunch 120.0
|
||
Cocoa Puff 110.0
|
||
dtype: float64
|
||
|
||
>>> s.argmin() # doctest: +SKIP
|
||
0
|
||
"""
|
||
sdf = self._internal.spark_frame.select(self.spark.column, NATURAL_ORDER_COLUMN_NAME)
|
||
min_value = sdf.select(
|
||
F.min(scol_for(sdf, self._internal.data_spark_column_names[0])),
|
||
F.first(NATURAL_ORDER_COLUMN_NAME),
|
||
).head()
|
||
if min_value[1] is None:
|
||
raise ValueError("attempt to get argmin of an empty sequence")
|
||
elif min_value[0] is None:
|
||
return -1
|
||
# We should remember the natural sequence started from 0
|
||
seq_col_name = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
||
sdf, _ = InternalFrame.attach_distributed_sequence_column(
|
||
sdf.drop(NATURAL_ORDER_COLUMN_NAME), seq_col_name
|
||
)
|
||
# If the minimum is achieved in multiple locations, the first row position is returned.
|
||
return sdf.filter(
|
||
scol_for(sdf, self._internal.data_spark_column_names[0]) == min_value[0]
|
||
).head()[0]
|
||
|
||
def compare(
|
||
self, other: "Series", keep_shape: bool = False, keep_equal: bool = False
|
||
) -> DataFrame:
|
||
"""
|
||
Compare to another Series and show the differences.
|
||
|
||
Parameters
|
||
----------
|
||
other : Series
|
||
Object to compare with.
|
||
keep_shape : bool, default False
|
||
If true, all rows and columns are kept.
|
||
Otherwise, only the ones with different values are kept.
|
||
keep_equal : bool, default False
|
||
If true, the result keeps values that are equal.
|
||
Otherwise, equal values are shown as NaNs.
|
||
|
||
Returns
|
||
-------
|
||
DataFrame
|
||
|
||
Notes
|
||
-----
|
||
Matching NaNs will not appear as a difference.
|
||
|
||
Examples
|
||
--------
|
||
|
||
>>> from pyspark.pandas.config import set_option, reset_option
|
||
>>> set_option("compute.ops_on_diff_frames", True)
|
||
>>> s1 = ps.Series(["a", "b", "c", "d", "e"])
|
||
>>> s2 = ps.Series(["a", "a", "c", "b", "e"])
|
||
|
||
Align the differences on columns
|
||
|
||
>>> s1.compare(s2).sort_index()
|
||
self other
|
||
1 b a
|
||
3 d b
|
||
|
||
Keep all original rows
|
||
|
||
>>> s1.compare(s2, keep_shape=True).sort_index()
|
||
self other
|
||
0 None None
|
||
1 b a
|
||
2 None None
|
||
3 d b
|
||
4 None None
|
||
|
||
Keep all original rows and also all original values
|
||
|
||
>>> s1.compare(s2, keep_shape=True, keep_equal=True).sort_index()
|
||
self other
|
||
0 a a
|
||
1 b a
|
||
2 c c
|
||
3 d b
|
||
4 e e
|
||
|
||
>>> reset_option("compute.ops_on_diff_frames")
|
||
"""
|
||
if same_anchor(self, other):
|
||
self_column_label = verify_temp_column_name(other.to_frame(), "__self_column__")
|
||
other_column_label = verify_temp_column_name(self.to_frame(), "__other_column__")
|
||
combined = DataFrame(
|
||
self._internal.with_new_columns(
|
||
[self.rename(self_column_label), other.rename(other_column_label)]
|
||
)
|
||
) # type: DataFrame
|
||
else:
|
||
if not self.index.equals(other.index):
|
||
raise ValueError("Can only compare identically-labeled Series objects")
|
||
|
||
combined = combine_frames(self.to_frame(), other.to_frame())
|
||
|
||
this_column_label = "self"
|
||
that_column_label = "other"
|
||
if keep_equal and keep_shape:
|
||
combined.columns = pd.Index([this_column_label, that_column_label])
|
||
return combined
|
||
|
||
this_data_scol = combined._internal.data_spark_columns[0]
|
||
that_data_scol = combined._internal.data_spark_columns[1]
|
||
index_scols = combined._internal.index_spark_columns
|
||
sdf = combined._internal.spark_frame
|
||
if keep_shape:
|
||
this_scol = (
|
||
F.when(this_data_scol == that_data_scol, None)
|
||
.otherwise(this_data_scol)
|
||
.alias(this_column_label)
|
||
)
|
||
this_field = combined._internal.data_fields[0].copy(
|
||
name=this_column_label, nullable=True
|
||
)
|
||
|
||
that_scol = (
|
||
F.when(this_data_scol == that_data_scol, None)
|
||
.otherwise(that_data_scol)
|
||
.alias(that_column_label)
|
||
)
|
||
that_field = combined._internal.data_fields[1].copy(
|
||
name=that_column_label, nullable=True
|
||
)
|
||
else:
|
||
sdf = sdf.filter(~this_data_scol.eqNullSafe(that_data_scol))
|
||
|
||
this_scol = this_data_scol.alias(this_column_label)
|
||
this_field = combined._internal.data_fields[0].copy(name=this_column_label)
|
||
|
||
that_scol = that_data_scol.alias(that_column_label)
|
||
that_field = combined._internal.data_fields[1].copy(name=that_column_label)
|
||
|
||
sdf = sdf.select(*index_scols, this_scol, that_scol, NATURAL_ORDER_COLUMN_NAME)
|
||
internal = InternalFrame(
|
||
spark_frame=sdf,
|
||
index_spark_columns=[
|
||
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
||
],
|
||
index_names=self._internal.index_names,
|
||
index_fields=combined._internal.index_fields,
|
||
column_labels=[(this_column_label,), (that_column_label,)],
|
||
data_spark_columns=[scol_for(sdf, this_column_label), scol_for(sdf, that_column_label)],
|
||
data_fields=[this_field, that_field],
|
||
column_label_names=[None],
|
||
)
|
||
return DataFrame(internal)
|
||
|
||
def align(
|
||
self,
|
||
other: Union[DataFrame, "Series"],
|
||
join: str = "outer",
|
||
axis: Optional[Union[int, str]] = None,
|
||
copy: bool = True,
|
||
) -> Tuple["Series", Union[DataFrame, "Series"]]:
|
||
"""
|
||
Align two objects on their axes with the specified join method.
|
||
|
||
Join method is specified for each axis Index.
|
||
|
||
Parameters
|
||
----------
|
||
other : DataFrame or Series
|
||
join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
|
||
axis : allowed axis of the other object, default None
|
||
Align on index (0), columns (1), or both (None).
|
||
copy : bool, default True
|
||
Always returns new objects. If copy=False and no reindexing is
|
||
required then original objects are returned.
|
||
|
||
Returns
|
||
-------
|
||
(left, right) : (Series, type of other)
|
||
Aligned objects.
|
||
|
||
Examples
|
||
--------
|
||
>>> ps.set_option("compute.ops_on_diff_frames", True)
|
||
>>> s1 = ps.Series([7, 8, 9], index=[10, 11, 12])
|
||
>>> s2 = ps.Series(["g", "h", "i"], index=[10, 20, 30])
|
||
|
||
>>> aligned_l, aligned_r = s1.align(s2)
|
||
>>> aligned_l.sort_index()
|
||
10 7.0
|
||
11 8.0
|
||
12 9.0
|
||
20 NaN
|
||
30 NaN
|
||
dtype: float64
|
||
>>> aligned_r.sort_index()
|
||
10 g
|
||
11 None
|
||
12 None
|
||
20 h
|
||
30 i
|
||
dtype: object
|
||
|
||
Align with the join type "inner":
|
||
|
||
>>> aligned_l, aligned_r = s1.align(s2, join="inner")
|
||
>>> aligned_l.sort_index()
|
||
10 7
|
||
dtype: int64
|
||
>>> aligned_r.sort_index()
|
||
10 g
|
||
dtype: object
|
||
|
||
Align with a DataFrame:
|
||
|
||
>>> df = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=[10, 20, 30])
|
||
>>> aligned_l, aligned_r = s1.align(df)
|
||
>>> aligned_l.sort_index()
|
||
10 7.0
|
||
11 8.0
|
||
12 9.0
|
||
20 NaN
|
||
30 NaN
|
||
dtype: float64
|
||
>>> aligned_r.sort_index()
|
||
a b
|
||
10 1.0 a
|
||
11 NaN None
|
||
12 NaN None
|
||
20 2.0 b
|
||
30 3.0 c
|
||
|
||
>>> ps.reset_option("compute.ops_on_diff_frames")
|
||
"""
|
||
axis = validate_axis(axis)
|
||
if axis == 1:
|
||
raise ValueError("Series does not support columns axis.")
|
||
|
||
self_df = self.to_frame()
|
||
left, right = self_df.align(other, join=join, axis=axis, copy=False)
|
||
|
||
if left is self_df:
|
||
left_ser = self
|
||
else:
|
||
left_ser = first_series(left).rename(self.name)
|
||
|
||
return (left_ser.copy(), right.copy()) if copy else (left_ser, right)
|
||
|
||
def between_time(
|
||
self,
|
||
start_time: Union[datetime.time, str],
|
||
end_time: Union[datetime.time, str],
|
||
include_start: bool = True,
|
||
include_end: bool = True,
|
||
axis: Union[int, str] = 0,
|
||
) -> "Series":
|
||
"""
|
||
Select values between particular times of the day (example: 9:00-9:30 AM).
|
||
|
||
By setting ``start_time`` to be later than ``end_time``,
|
||
you can get the times that are *not* between the two times.
|
||
|
||
Parameters
|
||
----------
|
||
start_time : datetime.time or str
|
||
Initial time as a time filter limit.
|
||
end_time : datetime.time or str
|
||
End time as a time filter limit.
|
||
include_start : bool, default True
|
||
Whether the start time needs to be included in the result.
|
||
include_end : bool, default True
|
||
Whether the end time needs to be included in the result.
|
||
axis : {0 or 'index', 1 or 'columns'}, default 0
|
||
Determine range time on index or columns value.
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
Data from the original object filtered to the specified dates range.
|
||
|
||
Raises
|
||
------
|
||
TypeError
|
||
If the index is not a :class:`DatetimeIndex`
|
||
|
||
See Also
|
||
--------
|
||
at_time : Select values at a particular time of the day.
|
||
last : Select final periods of time series based on a date offset.
|
||
DatetimeIndex.indexer_between_time : Get just the index locations for
|
||
values between particular times of the day.
|
||
|
||
Examples
|
||
--------
|
||
>>> idx = pd.date_range('2018-04-09', periods=4, freq='1D20min')
|
||
>>> psser = ps.Series([1, 2, 3, 4], index=idx)
|
||
>>> psser
|
||
2018-04-09 00:00:00 1
|
||
2018-04-10 00:20:00 2
|
||
2018-04-11 00:40:00 3
|
||
2018-04-12 01:00:00 4
|
||
dtype: int64
|
||
|
||
>>> psser.between_time('0:15', '0:45')
|
||
2018-04-10 00:20:00 2
|
||
2018-04-11 00:40:00 3
|
||
dtype: int64
|
||
"""
|
||
return first_series(
|
||
self.to_frame().between_time(start_time, end_time, include_start, include_end, axis)
|
||
).rename(self.name)
|
||
|
||
def at_time(
|
||
self, time: Union[datetime.time, str], asof: bool = False, axis: Union[int, str] = 0
|
||
) -> "Series":
|
||
"""
|
||
Select values at particular time of day (example: 9:30AM).
|
||
|
||
Parameters
|
||
----------
|
||
time : datetime.time or str
|
||
axis : {0 or 'index', 1 or 'columns'}, default 0
|
||
|
||
Returns
|
||
-------
|
||
Series
|
||
|
||
Raises
|
||
------
|
||
TypeError
|
||
If the index is not a :class:`DatetimeIndex`
|
||
|
||
See Also
|
||
--------
|
||
between_time : Select values between particular times of the day.
|
||
DatetimeIndex.indexer_at_time : Get just the index locations for
|
||
values at particular time of the day.
|
||
|
||
Examples
|
||
--------
|
||
>>> idx = pd.date_range('2018-04-09', periods=4, freq='12H')
|
||
>>> psser = ps.Series([1, 2, 3, 4], index=idx)
|
||
>>> psser
|
||
2018-04-09 00:00:00 1
|
||
2018-04-09 12:00:00 2
|
||
2018-04-10 00:00:00 3
|
||
2018-04-10 12:00:00 4
|
||
dtype: int64
|
||
|
||
>>> psser.at_time('12:00')
|
||
2018-04-09 12:00:00 2
|
||
2018-04-10 12:00:00 4
|
||
dtype: int64
|
||
"""
|
||
return first_series(self.to_frame().at_time(time, asof, axis)).rename(self.name)
|
||
|
||
def _cum(
|
||
self,
|
||
func: Callable[[Column], Column],
|
||
skipna: bool,
|
||
part_cols: Sequence[Union[str, Column]] = (),
|
||
ascending: bool = True,
|
||
) -> "Series":
|
||
# This is used to cummin, cummax, cumsum, etc.
|
||
|
||
if ascending:
|
||
window = (
|
||
Window.orderBy(F.asc(NATURAL_ORDER_COLUMN_NAME))
|
||
.partitionBy(*part_cols)
|
||
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
|
||
)
|
||
else:
|
||
window = (
|
||
Window.orderBy(F.desc(NATURAL_ORDER_COLUMN_NAME))
|
||
.partitionBy(*part_cols)
|
||
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
|
||
)
|
||
|
||
if skipna:
|
||
# There is a behavior difference between pandas and PySpark. In case of cummax,
|
||
#
|
||
# Input:
|
||
# A B
|
||
# 0 2.0 1.0
|
||
# 1 5.0 NaN
|
||
# 2 1.0 0.0
|
||
# 3 2.0 4.0
|
||
# 4 4.0 9.0
|
||
#
|
||
# pandas:
|
||
# A B
|
||
# 0 2.0 1.0
|
||
# 1 5.0 NaN
|
||
# 2 5.0 1.0
|
||
# 3 5.0 4.0
|
||
# 4 5.0 9.0
|
||
#
|
||
# PySpark:
|
||
# A B
|
||
# 0 2.0 1.0
|
||
# 1 5.0 1.0
|
||
# 2 5.0 1.0
|
||
# 3 5.0 4.0
|
||
# 4 5.0 9.0
|
||
|
||
scol = F.when(
|
||
# Manually sets nulls given the column defined above.
|
||
self.spark.column.isNull(),
|
||
F.lit(None),
|
||
).otherwise(func(self.spark.column).over(window))
|
||
else:
|
||
# Here, we use two Windows.
|
||
# One for real data.
|
||
# The other one for setting nulls after the first null it meets.
|
||
#
|
||
# There is a behavior difference between pandas and PySpark. In case of cummax,
|
||
#
|
||
# Input:
|
||
# A B
|
||
# 0 2.0 1.0
|
||
# 1 5.0 NaN
|
||
# 2 1.0 0.0
|
||
# 3 2.0 4.0
|
||
# 4 4.0 9.0
|
||
#
|
||
# pandas:
|
||
# A B
|
||
# 0 2.0 1.0
|
||
# 1 5.0 NaN
|
||
# 2 5.0 NaN
|
||
# 3 5.0 NaN
|
||
# 4 5.0 NaN
|
||
#
|
||
# PySpark:
|
||
# A B
|
||
# 0 2.0 1.0
|
||
# 1 5.0 1.0
|
||
# 2 5.0 1.0
|
||
# 3 5.0 4.0
|
||
# 4 5.0 9.0
|
||
scol = F.when(
|
||
# By going through with max, it sets True after the first time it meets null.
|
||
F.max(self.spark.column.isNull()).over(window),
|
||
# Manually sets nulls given the column defined above.
|
||
F.lit(None),
|
||
).otherwise(func(self.spark.column).over(window))
|
||
|
||
return self._with_new_scol(scol)
|
||
|
||
def _cumsum(self, skipna: bool, part_cols: Sequence[Union[str, Column]] = ()) -> "Series":
|
||
psser = self
|
||
if isinstance(psser.spark.data_type, BooleanType):
|
||
psser = psser.spark.transform(lambda scol: scol.cast(LongType()))
|
||
elif not isinstance(psser.spark.data_type, NumericType):
|
||
raise TypeError(
|
||
"Could not convert {} ({}) to numeric".format(
|
||
spark_type_to_pandas_dtype(psser.spark.data_type),
|
||
psser.spark.data_type.simpleString(),
|
||
)
|
||
)
|
||
return psser._cum(F.sum, skipna, part_cols)
|
||
|
||
def _cumprod(self, skipna: bool, part_cols: Sequence[Union[str, Column]] = ()) -> "Series":
|
||
if isinstance(self.spark.data_type, BooleanType):
|
||
scol = self._cum(
|
||
lambda scol: F.min(F.coalesce(scol, F.lit(True))), skipna, part_cols
|
||
).spark.column.cast(LongType())
|
||
elif isinstance(self.spark.data_type, NumericType):
|
||
num_zeros = self._cum(
|
||
lambda scol: F.sum(F.when(scol == 0, 1).otherwise(0)), skipna, part_cols
|
||
).spark.column
|
||
num_negatives = self._cum(
|
||
lambda scol: F.sum(F.when(scol < 0, 1).otherwise(0)), skipna, part_cols
|
||
).spark.column
|
||
sign = F.when(num_negatives % 2 == 0, 1).otherwise(-1)
|
||
|
||
abs_prod = F.exp(
|
||
self._cum(lambda scol: F.sum(F.log(F.abs(scol))), skipna, part_cols).spark.column
|
||
)
|
||
|
||
scol = F.when(num_zeros > 0, 0).otherwise(sign * abs_prod)
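            # The cumulative product is reconstructed as sign * exp(cumsum(log(abs(x)))),
            # where the sign flips with the running count of negative values and the result
            # is forced to 0 once a zero has been encountered.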
|
||
|
||
if isinstance(self.spark.data_type, IntegralType):
|
||
scol = F.round(scol).cast(LongType())
|
||
else:
|
||
raise TypeError(
|
||
"Could not convert {} ({}) to numeric".format(
|
||
spark_type_to_pandas_dtype(self.spark.data_type),
|
||
self.spark.data_type.simpleString(),
|
||
)
|
||
)
|
||
|
||
return self._with_new_scol(scol)
|
||
|
||
    # ----------------------------------------------------------------------
    # Accessor Methods
    # ----------------------------------------------------------------------
    dt = CachedAccessor("dt", DatetimeMethods)
    str = CachedAccessor("str", StringMethods)
    cat = CachedAccessor("cat", CategoricalAccessor)
    plot = CachedAccessor("plot", PandasOnSparkPlotAccessor)

    # ----------------------------------------------------------------------
|
||
|
||
def _apply_series_op(
|
||
self, op: Callable[["Series"], Union["Series", Column]], should_resolve: bool = False
|
||
) -> "Series":
|
||
psser_or_scol = op(self)
|
||
if isinstance(psser_or_scol, Series):
|
||
psser = psser_or_scol
|
||
else:
|
||
psser = self._with_new_scol(cast(Column, psser_or_scol))
|
||
if should_resolve:
|
||
internal = psser._internal.resolved_copy
|
||
return first_series(DataFrame(internal))
|
||
else:
|
||
return psser
|
||
|
||
def _reduce_for_stat_function(
|
||
self,
|
||
sfun: Union[Callable[[Column], Column], Callable[[Column, DataType], Column]],
|
||
name: str_type,
|
||
axis: Optional[Union[int, str_type]] = None,
|
||
numeric_only: bool = True,
|
||
**kwargs: Any
|
||
) -> Scalar:
|
||
"""
|
||
Applies sfun to the column and returns a scalar
|
||
|
||
Parameters
|
||
----------
|
||
sfun : the stats function to be used for aggregation
|
||
name : original pandas API name.
|
||
axis : used only for sanity check because series only support index axis.
|
||
numeric_only : not used by this implementation, but passed down by stats functions
|
||
"""
|
||
        from inspect import signature

        axis = validate_axis(axis)
        if axis == 1:
            raise ValueError("Series does not support columns axis.")
        num_args = len(signature(sfun).parameters)
        spark_column = self.spark.column
        spark_type = self.spark.data_type

        if num_args == 1:
            # Only pass in the column if sfun accepts only one arg
            scol = cast(Callable[[Column], Column], sfun)(spark_column)
        else:  # must be 2
            assert num_args == 2
            # Pass in both the column and its data type if sfun accepts two args
            scol = cast(Callable[[Column, DataType], Column], sfun)(spark_column, spark_type)

        min_count = kwargs.get("min_count", 0)
        if min_count > 0:
            scol = F.when(Frame._count_expr(spark_column, spark_type) >= min_count, scol)

        result = unpack_scalar(self._internal.spark_frame.select(scol))
        return result if result is not None else np.nan
    def _build_groupby(
        self, by: List[Union["Series", Tuple]], as_index: bool, dropna: bool
    ) -> "SeriesGroupBy":
        from pyspark.pandas.groupby import SeriesGroupBy

        return SeriesGroupBy._build(self, by, as_index=as_index, dropna=dropna)
    def __getitem__(self, key: Any) -> Any:
        try:
            if (isinstance(key, slice) and any(type(n) == int for n in [key.start, key.stop])) or (
                type(key) == int
                and not isinstance(self.index.spark.data_type, (IntegerType, LongType))
            ):
                # Seems like pandas Series always uses int as positional search when slicing
                # with ints, searches based on index values when the value is int.
                return self.iloc[key]
            return self.loc[key]
        except SparkPandasIndexingError:
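            # Re-raise as a KeyError when the key has more levels than the index.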
            raise KeyError(
                "Key length ({}) exceeds index depth ({})".format(
                    len(key), self._internal.index_level
                )
            )

    def __getattr__(self, item: str_type) -> Any:
        if item.startswith("__"):
            raise AttributeError(item)
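        # pandas Series attributes that are not implemented here are listed on
        # MissingPandasLikeSeries, which raises an informative "not implemented"
        # error instead of a plain AttributeError.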
        if hasattr(MissingPandasLikeSeries, item):
            property_or_func = getattr(MissingPandasLikeSeries, item)
            if isinstance(property_or_func, property):
                return property_or_func.fget(self)  # type: ignore
            else:
                return partial(property_or_func, self)
        raise AttributeError("'Series' object has no attribute '{}'".format(item))

    def _to_internal_pandas(self) -> pd.Series:
        """
        Return a pandas Series directly from _internal to avoid overhead of copy.

        This method is for internal use only.
        """
        return self._psdf._internal.to_pandas_frame[self.name]
    def __repr__(self) -> str_type:
        max_display_count = get_option("display.max_rows")
        if max_display_count is None:
            return self._to_internal_pandas().to_string(name=self.name, dtype=self.dtype)

        pser = self._psdf._get_or_create_repr_pandas_cache(max_display_count)[self.name]
        pser_length = len(pser)
        pser = pser.iloc[:max_display_count]
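        # If the cached pandas Series had to be truncated, rewrite the footer that
        # pandas produces (matched by REPR_PATTERN) so it reports that only the
        # first rows are being shown.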
        if pser_length > max_display_count:
            repr_string = pser.to_string(length=True)
            rest, prev_footer = repr_string.rsplit("\n", 1)
            match = REPR_PATTERN.search(prev_footer)
            if match is not None:
                length = match.group("length")
                dtype_name = str(self.dtype.name)
                if self.name is None:
                    footer = "\ndtype: {dtype}\nShowing only the first {length}".format(
                        length=length, dtype=pprint_thing(dtype_name)
                    )
                else:
                    footer = (
                        "\nName: {name}, dtype: {dtype}"
                        "\nShowing only the first {length}".format(
                            length=length, name=self.name, dtype=pprint_thing(dtype_name)
                        )
                    )
                return rest + footer
        return pser.to_string(name=self.name, dtype=self.dtype)

    def __dir__(self) -> Iterable[str_type]:
        if not isinstance(self.spark.data_type, StructType):
            fields = []
        else:
            fields = [f for f in self.spark.data_type.fieldNames() if " " not in f]
        return list(super().__dir__()) + fields

    def __iter__(self) -> None:
        return MissingPandasLikeSeries.__iter__(self)

    if sys.version_info >= (3, 7):
        # In order to support the type hints such as Series[...]. See DataFrame.__class_getitem__.
        def __class_getitem__(cls, params: Any) -> Type[SeriesType]:
            return _create_type_for_series_type(params)

    elif (3, 5) <= sys.version_info < (3, 7):
        # The implementation is in its metaclass so this flag is needed to distinguish
        # pandas-on-Spark Series.
        is_series = None


def unpack_scalar(sdf: SparkDataFrame) -> Any:
    """
    Takes a dataframe that is supposed to contain a single row with a single scalar value,
    and returns this value.
    """
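    # limit(2) is enough to detect an unexpected extra row via the assertion
    # below without collecting the whole DataFrame.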
    l = cast(pd.DataFrame, sdf.limit(2).toPandas())
    assert len(l) == 1, (sdf, l)
    row = l.iloc[0]
    l2 = list(row)
    assert len(l2) == 1, (row, l2)
    return l2[0]


@overload
def first_series(df: DataFrame) -> "Series":
    ...


@overload
def first_series(df: pd.DataFrame) -> pd.Series:
    ...


def first_series(df: Union[DataFrame, pd.DataFrame]) -> Union["Series", pd.Series]:
    """
    Takes a DataFrame and returns the first column of the DataFrame as a Series
    """
    assert isinstance(df, (DataFrame, pd.DataFrame)), type(df)
    if isinstance(df, DataFrame):
        return df._psser_for(df._internal.column_labels[0])
    else:
        return df[df.columns[0]]


def _test() -> None:
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.series

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.series.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]").appName("pyspark.pandas.series tests").getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.series,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()