### What changes were proposed in this pull request?

Fixes `mypy` errors and enables the `mypy` check for pandas-on-Spark.

### Why are the changes needed?

The `mypy` check for pandas-on-Spark was disabled during the initial porting. It should be enabled again; otherwise we will miss type checking errors.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

The enabled `mypy` check and existing unit tests should pass.

Closes #32540 from ueshin/issues/SPARK-34941/pandas_mypy.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
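The fixes such a change typically requires are mostly making implicit types explicit so the checker can verify them. A minimal illustrative sketch of the pattern (not code from this PR; the function and names are made up):

```python
from typing import List, Optional


def first_non_empty(chunks: List[str]) -> Optional[str]:
    # Without the explicit annotation, mypy may infer an unusable element
    # type for the empty list and flag the append below.
    found: List[str] = []
    for chunk in chunks:
        if chunk:
            found.append(chunk)
    return found[0] if found else None
```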
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A loc indexer for pandas-on-Spark DataFrame/Series.
"""
from abc import ABCMeta, abstractmethod
from collections.abc import Iterable
from functools import reduce
from typing import Any, Optional, List, Tuple, TYPE_CHECKING, Union, cast, Sized

import pandas as pd
from pandas.api.types import is_list_like
from pyspark import sql as spark
from pyspark.sql import functions as F
from pyspark.sql.types import BooleanType, LongType
from pyspark.sql.utils import AnalysisException
import numpy as np

from pyspark import pandas as ps  # noqa: F401
from pyspark.pandas.internal import (
    InternalFrame,
    NATURAL_ORDER_COLUMN_NAME,
    SPARK_DEFAULT_SERIES_NAME,
)
from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
from pyspark.pandas.typedef.typehints import (
    Dtype,
    Scalar,
    extension_dtypes,
    spark_type_to_pandas_dtype,
)
from pyspark.pandas.utils import (
    is_name_like_tuple,
    is_name_like_value,
    lazy_property,
    name_like_string,
    same_anchor,
    scol_for,
    verify_temp_column_name,
)

if TYPE_CHECKING:
    from pyspark.pandas.frame import DataFrame  # noqa: F401 (SPARK-34943)
    from pyspark.pandas.series import Series  # noqa: F401 (SPARK-34943)


class IndexerLike(object):
    def __init__(self, kdf_or_kser):
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import Series

        assert isinstance(kdf_or_kser, (DataFrame, Series)), "unexpected argument type: {}".format(
            type(kdf_or_kser)
        )
        self._kdf_or_kser = kdf_or_kser

    @property
    def _is_df(self):
        from pyspark.pandas.frame import DataFrame

        return isinstance(self._kdf_or_kser, DataFrame)

    @property
    def _is_series(self):
        from pyspark.pandas.series import Series

        return isinstance(self._kdf_or_kser, Series)

    @property
    def _kdf(self):
        if self._is_df:
            return self._kdf_or_kser
        else:
            assert self._is_series
            return self._kdf_or_kser._kdf

    @property
    def _internal(self):
        return self._kdf._internal


class AtIndexer(IndexerLike):
    """
    Access a single value for a row/column label pair.
    If the index is not unique, all matching pairs are returned as an array.
    Similar to ``loc``, in that both provide label-based lookups. Use ``at`` if you only need to
    get a single value in a DataFrame or Series.

    .. note:: Unlike pandas, pandas-on-Spark only allows using ``at`` to get values but not to
        set them.

    .. note:: Warning: If ``row_index`` matches a lot of rows, large amounts of data will be
        fetched, potentially causing your machine to run out of memory.

    Raises
    ------
    KeyError
        When label does not exist in DataFrame

    Examples
    --------
    >>> kdf = ps.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
    ...                    index=[4, 5, 5], columns=['A', 'B', 'C'])
    >>> kdf
        A   B   C
    4   0   2   3
    5   0   4   1
    5  10  20  30

    Get value at specified row/column pair

    >>> kdf.at[4, 'B']
    2

    Get array if an index occurs multiple times

    >>> kdf.at[5, 'B']
    array([ 4, 20])
    """

    def __getitem__(self, key) -> Union["Series", "DataFrame", Scalar]:
        if self._is_df:
            if not isinstance(key, tuple) or len(key) != 2:
                raise TypeError("Use DataFrame.at like .at[row_index, column_name]")
            row_sel, col_sel = key
        else:
            assert self._is_series, type(self._kdf_or_kser)
            if isinstance(key, tuple) and len(key) != 1:
                raise TypeError("Use Series.at like .at[row_index]")
            row_sel = key
            col_sel = self._kdf_or_kser._column_label

        if self._internal.index_level == 1:
            if not is_name_like_value(row_sel, allow_none=False, allow_tuple=False):
                raise ValueError("At based indexing on a single index can only have a single value")
            row_sel = (row_sel,)
        else:
            if not is_name_like_tuple(row_sel, allow_none=False):
                raise ValueError("At based indexing on multi-index can only have tuple values")

        if col_sel is not None:
            if not is_name_like_value(col_sel, allow_none=False):
                raise ValueError("At based indexing on multi-index can only have tuple values")
            if not is_name_like_tuple(col_sel):
                col_sel = (col_sel,)

        cond = reduce(
            lambda x, y: x & y,
            [scol == row for scol, row in zip(self._internal.index_spark_columns, row_sel)],
        )
        pdf = (
            self._internal.spark_frame.drop(NATURAL_ORDER_COLUMN_NAME)
            .filter(cond)
            .select(self._internal.spark_column_for(col_sel))
            .toPandas()
        )

        if len(pdf) < 1:
            raise KeyError(name_like_string(row_sel))

        values = pdf.iloc[:, 0].values
        return (
            values if (len(row_sel) < self._internal.index_level or len(values) > 1) else values[0]
        )


class iAtIndexer(IndexerLike):
    """
    Access a single value for a row/column pair by integer position.

    Similar to ``iloc``, in that both provide integer-based lookups. Use
    ``iat`` if you only need to get or set a single value in a DataFrame
    or Series.

    Raises
    ------
    KeyError
        When label does not exist in DataFrame

    Examples
    --------
    >>> df = ps.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
    ...                   columns=['A', 'B', 'C'])
    >>> df
        A   B   C
    0   0   2   3
    1   0   4   1
    2  10  20  30

    Get value at specified row/column pair

    >>> df.iat[1, 2]
    1

    Get value within a series

    >>> kser = ps.Series([1, 2, 3], index=[10, 20, 30])
    >>> kser
    10    1
    20    2
    30    3
    dtype: int64

    >>> kser.iat[1]
    2
    """

    def __getitem__(self, key) -> Union["Series", "DataFrame", Scalar]:
        if self._is_df:
            if not isinstance(key, tuple) or len(key) != 2:
                raise TypeError(
                    "Use DataFrame.iat like .iat[row_integer_position, column_integer_position]"
                )
            row_sel, col_sel = key
            if not isinstance(row_sel, int) or not isinstance(col_sel, int):
                raise ValueError("iAt based indexing can only have integer indexers")
            return self._kdf_or_kser.iloc[row_sel, col_sel]
        else:
            assert self._is_series, type(self._kdf_or_kser)
            if not isinstance(key, int) and len(key) != 1:
                raise TypeError("Use Series.iat like .iat[row_integer_position]")
            if not isinstance(key, int):
                raise ValueError("iAt based indexing can only have integer indexers")
            return self._kdf_or_kser.iloc[key]


class LocIndexerLike(IndexerLike, metaclass=ABCMeta):
|
|
def _select_rows(
|
|
self, rows_sel: Any
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
"""
|
|
Dispatch the logic for select rows to more specific methods by `rows_sel` argument types.
|
|
|
|
Parameters
|
|
----------
|
|
rows_sel : the key specified to select rows.
|
|
|
|
Returns
|
|
-------
|
|
Tuple of Spark column, int, int:
|
|
|
|
* The Spark column for the condition to filter the rows.
|
|
* The number of rows when the selection can be simplified by limit.
|
|
* The remaining index rows if the result index size is shrunk.
|
|
"""
|
|
from pyspark.pandas.series import Series
|
|
|
|
if rows_sel is None:
|
|
return None, None, None
|
|
elif isinstance(rows_sel, Series):
|
|
return self._select_rows_by_series(rows_sel)
|
|
elif isinstance(rows_sel, spark.Column):
|
|
return self._select_rows_by_spark_column(rows_sel)
|
|
elif isinstance(rows_sel, slice):
|
|
if rows_sel == slice(None):
|
|
# If slice is None - select everything, so nothing to do
|
|
return None, None, None
|
|
return self._select_rows_by_slice(rows_sel)
|
|
elif isinstance(rows_sel, tuple):
|
|
return self._select_rows_else(rows_sel)
|
|
elif is_list_like(rows_sel):
|
|
return self._select_rows_by_iterable(rows_sel)
|
|
else:
|
|
return self._select_rows_else(rows_sel)
|
|
|
|
def _select_cols(
|
|
self, cols_sel: Any, missing_keys: Optional[List[Tuple]] = None
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
"""
|
|
Dispatch the logic for select columns to more specific methods by `cols_sel` argument types.
|
|
|
|
Parameters
|
|
----------
|
|
cols_sel : the key specified to select columns.
|
|
|
|
Returns
|
|
-------
|
|
Tuple of list of column label, list of Spark columns, list of dtypes, bool:
|
|
|
|
* The column labels selected.
|
|
* The Spark columns selected.
|
|
* The dtypes selected.
|
|
* The boolean value whether Series should be returned or not.
|
|
* The Series name if needed.
|
|
"""
|
|
from pyspark.pandas.series import Series
|
|
|
|
if cols_sel is None:
|
|
column_labels = self._internal.column_labels
|
|
data_spark_columns = self._internal.data_spark_columns
|
|
data_dtypes = self._internal.data_dtypes
|
|
return column_labels, data_spark_columns, data_dtypes, False, None
|
|
elif isinstance(cols_sel, Series):
|
|
return self._select_cols_by_series(cols_sel, missing_keys)
|
|
elif isinstance(cols_sel, spark.Column):
|
|
return self._select_cols_by_spark_column(cols_sel, missing_keys)
|
|
elif isinstance(cols_sel, slice):
|
|
if cols_sel == slice(None):
|
|
# If slice is None - select everything, so nothing to do
|
|
column_labels = self._internal.column_labels
|
|
data_spark_columns = self._internal.data_spark_columns
|
|
data_dtypes = self._internal.data_dtypes
|
|
return column_labels, data_spark_columns, data_dtypes, False, None
|
|
return self._select_cols_by_slice(cols_sel, missing_keys)
|
|
elif isinstance(cols_sel, tuple):
|
|
return self._select_cols_else(cols_sel, missing_keys)
|
|
elif is_list_like(cols_sel):
|
|
return self._select_cols_by_iterable(cols_sel, missing_keys)
|
|
else:
|
|
return self._select_cols_else(cols_sel, missing_keys)
|
|
|
|
# Methods for row selection
|
|
|
|
@abstractmethod
|
|
def _select_rows_by_series(
|
|
self, rows_sel: "Series"
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
""" Select rows by `Series` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_rows_by_spark_column(
|
|
self, rows_sel: spark.Column
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
""" Select rows by Spark `Column` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_rows_by_slice(
|
|
self, rows_sel: slice
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
""" Select rows by `slice` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_rows_by_iterable(
|
|
self, rows_sel: Iterable
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
""" Select rows by `Iterable` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_rows_else(
|
|
self, rows_sel: Any
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
""" Select rows by other type key. """
|
|
pass
|
|
|
|
# Methods for col selection
|
|
|
|
@abstractmethod
|
|
def _select_cols_by_series(
|
|
self, cols_sel: "Series", missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
""" Select columns by `Series` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_cols_by_spark_column(
|
|
self, cols_sel: spark.Column, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
""" Select columns by Spark `Column` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_cols_by_slice(
|
|
self, cols_sel: slice, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
""" Select columns by `slice` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_cols_by_iterable(
|
|
self, cols_sel: Iterable, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
""" Select columns by `Iterable` type key. """
|
|
pass
|
|
|
|
@abstractmethod
|
|
def _select_cols_else(
|
|
self, cols_sel: Any, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
""" Select columns by other type key. """
|
|
pass
|
|
|
|
def __getitem__(self, key) -> Union["Series", "DataFrame"]:
|
|
from pyspark.pandas.frame import DataFrame
|
|
from pyspark.pandas.series import Series, first_series
|
|
|
|
if self._is_series:
|
|
if isinstance(key, Series) and not same_anchor(key, self._kdf_or_kser):
|
|
kdf = self._kdf_or_kser.to_frame()
|
|
temp_col = verify_temp_column_name(kdf, "__temp_col__")
|
|
|
|
kdf[temp_col] = key
|
|
return type(self)(kdf[self._kdf_or_kser.name])[kdf[temp_col]]
|
|
|
|
cond, limit, remaining_index = self._select_rows(key)
|
|
if cond is None and limit is None:
|
|
return self._kdf_or_kser
|
|
|
|
column_label = self._kdf_or_kser._column_label
|
|
column_labels = [column_label]
|
|
data_spark_columns = [self._internal.spark_column_for(column_label)]
|
|
data_dtypes = [self._internal.dtype_for(column_label)]
|
|
returns_series = True
|
|
series_name = self._kdf_or_kser.name
|
|
else:
|
|
assert self._is_df
|
|
if isinstance(key, tuple):
|
|
if len(key) != 2:
|
|
raise SparkPandasIndexingError("Only accepts pairs of candidates")
|
|
rows_sel, cols_sel = key
|
|
else:
|
|
rows_sel = key
|
|
cols_sel = None
|
|
|
|
if isinstance(rows_sel, Series) and not same_anchor(rows_sel, self._kdf_or_kser):
|
|
kdf = self._kdf_or_kser.copy()
|
|
temp_col = verify_temp_column_name(kdf, "__temp_col__")
|
|
|
|
kdf[temp_col] = rows_sel
|
|
return type(self)(kdf)[kdf[temp_col], cols_sel][list(self._kdf_or_kser.columns)]
|
|
|
|
cond, limit, remaining_index = self._select_rows(rows_sel)
|
|
(
|
|
column_labels,
|
|
data_spark_columns,
|
|
data_dtypes,
|
|
returns_series,
|
|
series_name,
|
|
) = self._select_cols(cols_sel)
|
|
|
|
if cond is None and limit is None and returns_series:
|
|
kser = self._kdf_or_kser._kser_for(column_labels[0])
|
|
if series_name is not None and series_name != kser.name:
|
|
kser = kser.rename(series_name)
|
|
return kser
|
|
|
|
if remaining_index is not None:
|
|
index_spark_columns = self._internal.index_spark_columns[-remaining_index:]
|
|
index_names = self._internal.index_names[-remaining_index:]
|
|
index_dtypes = self._internal.index_dtypes[-remaining_index:]
|
|
else:
|
|
index_spark_columns = self._internal.index_spark_columns
|
|
index_names = self._internal.index_names
|
|
index_dtypes = self._internal.index_dtypes
|
|
|
|
if len(column_labels) > 0:
|
|
column_labels = column_labels.copy()
|
|
column_labels_level = max(
|
|
len(label) if label is not None else 1 for label in column_labels
|
|
)
|
|
none_column = 0
|
|
for i, label in enumerate(column_labels):
|
|
if label is None:
|
|
label = (none_column,)
|
|
none_column += 1
|
|
if len(label) < column_labels_level:
|
|
label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
|
|
column_labels[i] = label
|
|
|
|
if i == 0 and none_column == 1:
|
|
column_labels = [None]
|
|
|
|
column_label_names = self._internal.column_label_names[-column_labels_level:]
|
|
else:
|
|
column_label_names = self._internal.column_label_names
|
|
|
|
try:
|
|
sdf = self._internal.spark_frame
|
|
|
|
if cond is not None:
|
|
index_columns = sdf.select(index_spark_columns).columns
|
|
data_columns = sdf.select(data_spark_columns).columns
|
|
sdf = sdf.filter(cond).select(index_spark_columns + data_spark_columns)
|
|
index_spark_columns = [scol_for(sdf, col) for col in index_columns]
|
|
data_spark_columns = [scol_for(sdf, col) for col in data_columns]
|
|
|
|
if limit is not None:
|
|
if limit >= 0:
|
|
sdf = sdf.limit(limit)
|
|
else:
|
|
sdf = sdf.limit(sdf.count() + limit)
|
|
sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME)
|
|
except AnalysisException:
|
|
raise KeyError(
|
|
"[{}] don't exist in columns".format(
|
|
[col._jc.toString() for col in data_spark_columns]
|
|
)
|
|
)
|
|
|
|
internal = InternalFrame(
|
|
spark_frame=sdf,
|
|
index_spark_columns=index_spark_columns,
|
|
index_names=index_names,
|
|
index_dtypes=index_dtypes,
|
|
column_labels=column_labels,
|
|
data_spark_columns=data_spark_columns,
|
|
data_dtypes=data_dtypes,
|
|
column_label_names=column_label_names,
|
|
)
|
|
kdf = DataFrame(internal)
|
|
|
|
if returns_series:
|
|
kdf_or_kser = first_series(kdf)
|
|
if series_name is not None and series_name != kdf_or_kser.name:
|
|
kdf_or_kser = kdf_or_kser.rename(series_name)
|
|
else:
|
|
kdf_or_kser = kdf
|
|
|
|
if remaining_index is not None and remaining_index == 0:
|
|
pdf_or_pser = kdf_or_kser.head(2).to_pandas()
|
|
length = len(pdf_or_pser)
|
|
if length == 0:
|
|
raise KeyError(name_like_string(key))
|
|
elif length == 1:
|
|
return pdf_or_pser.iloc[0]
|
|
else:
|
|
return kdf_or_kser
|
|
else:
|
|
return kdf_or_kser
|
|
|
|
def __setitem__(self, key, value):
|
|
from pyspark.pandas.frame import DataFrame
|
|
from pyspark.pandas.series import Series, first_series
|
|
|
|
if self._is_series:
|
|
if (
|
|
isinstance(key, Series)
|
|
and (isinstance(self, iLocIndexer) or not same_anchor(key, self._kdf_or_kser))
|
|
) or (
|
|
isinstance(value, Series)
|
|
and (isinstance(self, iLocIndexer) or not same_anchor(value, self._kdf_or_kser))
|
|
):
|
|
if self._kdf_or_kser.name is None:
|
|
kdf = self._kdf_or_kser.to_frame()
|
|
column_label = kdf._internal.column_labels[0]
|
|
else:
|
|
kdf = self._kdf_or_kser._kdf.copy()
|
|
column_label = self._kdf_or_kser._column_label
|
|
temp_natural_order = verify_temp_column_name(kdf, "__temp_natural_order__")
|
|
temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
|
|
temp_value_col = verify_temp_column_name(kdf, "__temp_value_col__")
|
|
|
|
kdf[temp_natural_order] = F.monotonically_increasing_id()
|
|
if isinstance(key, Series):
|
|
kdf[temp_key_col] = key
|
|
if isinstance(value, Series):
|
|
kdf[temp_value_col] = value
|
|
kdf = kdf.sort_values(temp_natural_order).drop(temp_natural_order)
|
|
|
|
kser = kdf._kser_for(column_label)
|
|
if isinstance(key, Series):
|
|
key = F.col(
|
|
"`{}`".format(kdf[temp_key_col]._internal.data_spark_column_names[0])
|
|
)
|
|
if isinstance(value, Series):
|
|
value = F.col(
|
|
"`{}`".format(kdf[temp_value_col]._internal.data_spark_column_names[0])
|
|
)
|
|
|
|
type(self)(kser)[key] = value
|
|
|
|
if self._kdf_or_kser.name is None:
|
|
kser = kser.rename()
|
|
|
|
self._kdf_or_kser._kdf._update_internal_frame(
|
|
kser._kdf[
|
|
self._kdf_or_kser._kdf._internal.column_labels
|
|
]._internal.resolved_copy,
|
|
requires_same_anchor=False,
|
|
)
|
|
return
|
|
|
|
if isinstance(value, DataFrame):
|
|
raise ValueError("Incompatible indexer with DataFrame")
|
|
|
|
cond, limit, remaining_index = self._select_rows(key)
|
|
if cond is None:
|
|
cond = F.lit(True)
|
|
if limit is not None:
|
|
cond = cond & (self._internal.spark_frame[self._sequence_col] < F.lit(limit))
|
|
|
|
if isinstance(value, (Series, spark.Column)):
|
|
if remaining_index is not None and remaining_index == 0:
|
|
raise ValueError(
|
|
"No axis named {} for object type {}".format(key, type(value).__name__)
|
|
)
|
|
if isinstance(value, Series):
|
|
value = value.spark.column
|
|
else:
|
|
value = F.lit(value)
|
|
scol = (
|
|
F.when(cond, value)
|
|
.otherwise(self._internal.spark_column_for(self._kdf_or_kser._column_label))
|
|
.alias(name_like_string(self._kdf_or_kser.name or SPARK_DEFAULT_SERIES_NAME))
|
|
)
|
|
|
|
internal = self._internal.with_new_spark_column(
|
|
self._kdf_or_kser._column_label, scol # TODO: dtype?
|
|
)
|
|
self._kdf_or_kser._kdf._update_internal_frame(internal, requires_same_anchor=False)
|
|
else:
|
|
assert self._is_df
|
|
|
|
if isinstance(key, tuple):
|
|
if len(key) != 2:
|
|
raise SparkPandasIndexingError("Only accepts pairs of candidates")
|
|
rows_sel, cols_sel = key
|
|
else:
|
|
rows_sel = key
|
|
cols_sel = None
|
|
|
|
if isinstance(value, DataFrame):
|
|
if len(value.columns) == 1:
|
|
value = first_series(value)
|
|
else:
|
|
raise ValueError("Only a dataframe with one column can be assigned")
|
|
|
|
if (
|
|
isinstance(rows_sel, Series)
|
|
and (isinstance(self, iLocIndexer) or not same_anchor(rows_sel, self._kdf_or_kser))
|
|
) or (
|
|
isinstance(value, Series)
|
|
and (isinstance(self, iLocIndexer) or not same_anchor(value, self._kdf_or_kser))
|
|
):
|
|
kdf = self._kdf_or_kser.copy()
|
|
temp_natural_order = verify_temp_column_name(kdf, "__temp_natural_order__")
|
|
temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
|
|
temp_value_col = verify_temp_column_name(kdf, "__temp_value_col__")
|
|
|
|
kdf[temp_natural_order] = F.monotonically_increasing_id()
|
|
if isinstance(rows_sel, Series):
|
|
kdf[temp_key_col] = rows_sel
|
|
if isinstance(value, Series):
|
|
kdf[temp_value_col] = value
|
|
kdf = kdf.sort_values(temp_natural_order).drop(temp_natural_order)
|
|
|
|
if isinstance(rows_sel, Series):
|
|
rows_sel = F.col(
|
|
"`{}`".format(kdf[temp_key_col]._internal.data_spark_column_names[0])
|
|
)
|
|
if isinstance(value, Series):
|
|
value = F.col(
|
|
"`{}`".format(kdf[temp_value_col]._internal.data_spark_column_names[0])
|
|
)
|
|
|
|
type(self)(kdf)[rows_sel, cols_sel] = value
|
|
|
|
self._kdf_or_kser._update_internal_frame(
|
|
kdf[list(self._kdf_or_kser.columns)]._internal.resolved_copy,
|
|
requires_same_anchor=False,
|
|
)
|
|
return
|
|
|
|
cond, limit, remaining_index = self._select_rows(rows_sel)
|
|
missing_keys = []
|
|
_, data_spark_columns, _, _, _ = self._select_cols(cols_sel, missing_keys=missing_keys)
|
|
|
|
if cond is None:
|
|
cond = F.lit(True)
|
|
if limit is not None:
|
|
cond = cond & (self._internal.spark_frame[self._sequence_col] < F.lit(limit))
|
|
|
|
if isinstance(value, (Series, spark.Column)):
|
|
if remaining_index is not None and remaining_index == 0:
|
|
raise ValueError("Incompatible indexer with Series")
|
|
if len(data_spark_columns) > 1:
|
|
raise ValueError("shape mismatch")
|
|
if isinstance(value, Series):
|
|
value = value.spark.column
|
|
else:
|
|
value = F.lit(value)
|
|
|
|
new_data_spark_columns = []
|
|
new_dtypes = []
|
|
for new_scol, spark_column_name, new_dtype in zip(
|
|
self._internal.data_spark_columns,
|
|
self._internal.data_spark_column_names,
|
|
self._internal.data_dtypes,
|
|
):
|
|
for scol in data_spark_columns:
|
|
if new_scol._jc.equals(scol._jc):
|
|
new_scol = F.when(cond, value).otherwise(scol).alias(spark_column_name)
|
|
new_dtype = spark_type_to_pandas_dtype(
|
|
self._internal.spark_frame.select(new_scol).schema[0].dataType,
|
|
use_extension_dtypes=isinstance(new_dtype, extension_dtypes),
|
|
)
|
|
break
|
|
new_data_spark_columns.append(new_scol)
|
|
new_dtypes.append(new_dtype)
|
|
|
|
column_labels = self._internal.column_labels.copy()
|
|
for label in missing_keys:
|
|
if not is_name_like_tuple(label):
|
|
label = (label,)
|
|
if len(label) < self._internal.column_labels_level:
|
|
label = tuple(
|
|
list(label) + ([""] * (self._internal.column_labels_level - len(label)))
|
|
)
|
|
elif len(label) > self._internal.column_labels_level:
|
|
raise KeyError(
|
|
"Key length ({}) exceeds index depth ({})".format(
|
|
len(label), self._internal.column_labels_level
|
|
)
|
|
)
|
|
column_labels.append(label)
|
|
new_data_spark_columns.append(F.when(cond, value).alias(name_like_string(label)))
|
|
new_dtypes.append(None)
|
|
|
|
internal = self._internal.with_new_columns(
|
|
new_data_spark_columns, column_labels=column_labels, data_dtypes=new_dtypes
|
|
)
|
|
self._kdf_or_kser._update_internal_frame(internal, requires_same_anchor=False)
|
|
|
|
|
|
class LocIndexer(LocIndexerLike):
|
|
"""
|
|
Access a group of rows and columns by label(s) or a boolean Series.
|
|
|
|
``.loc[]`` is primarily label based, but may also be used with a
|
|
conditional boolean Series derived from the DataFrame or Series.
|
|
|
|
Allowed inputs are:
|
|
|
|
- A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
|
|
interpreted as a *label* of the index, and **never** as an
|
|
integer position along the index) for column selection.
|
|
|
|
- A list or array of labels, e.g. ``['a', 'b', 'c']``.
|
|
|
|
- A slice object with labels, e.g. ``'a':'f'``.
|
|
|
|
- A conditional boolean Series derived from the DataFrame or Series
|
|
|
|
- A boolean array of the same length as the column axis being sliced,
|
|
e.g. ``[True, False, True]``.
|
|
|
|
- An alignable boolean pandas Series to the column axis being sliced.
|
|
The index of the key will be aligned before masking.
|
|
|
|
Not allowed inputs which pandas allows are:
|
|
|
|
- A boolean array of the same length as the row axis being sliced,
|
|
e.g. ``[True, False, True]``.
|
|
- A ``callable`` function with one argument (the calling Series, DataFrame
|
|
or Panel) and that returns valid output for indexing (one of the above)
|
|
|
|
.. note:: MultiIndex is not supported yet.
|
|
|
|
.. note:: Note that contrary to usual python slices, **both** the
|
|
start and the stop are included, and the step of the slice is not allowed.
|
|
|
|
.. note:: With a list or array of labels for row selection,
|
|
pandas-on-Spark behaves as a filter without reordering by the labels.
|
|
|
|
See Also
|
|
--------
|
|
Series.loc : Access group of values using labels.
|
|
|
|
Examples
|
|
--------
|
|
**Getting values**
|
|
|
|
>>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
|
|
... index=['cobra', 'viper', 'sidewinder'],
|
|
... columns=['max_speed', 'shield'])
|
|
>>> df
|
|
max_speed shield
|
|
cobra 1 2
|
|
viper 4 5
|
|
sidewinder 7 8
|
|
|
|
Single label. Note this returns the row as a Series.
|
|
|
|
>>> df.loc['viper']
|
|
max_speed 4
|
|
shield 5
|
|
Name: viper, dtype: int64
|
|
|
|
List of labels. Note using ``[[]]`` returns a DataFrame.
|
|
Also note that pandas-on-Spark behaves just a filter without reordering by the labels.
|
|
|
|
>>> df.loc[['viper', 'sidewinder']]
|
|
max_speed shield
|
|
viper 4 5
|
|
sidewinder 7 8
|
|
|
|
>>> df.loc[['sidewinder', 'viper']]
|
|
max_speed shield
|
|
viper 4 5
|
|
sidewinder 7 8
|
|
|
|
Single label for column.
|
|
|
|
>>> df.loc['cobra', 'shield']
|
|
2
|
|
|
|
List of labels for row.
|
|
|
|
>>> df.loc[['cobra'], 'shield']
|
|
cobra 2
|
|
Name: shield, dtype: int64
|
|
|
|
List of labels for column.
|
|
|
|
>>> df.loc['cobra', ['shield']]
|
|
shield 2
|
|
Name: cobra, dtype: int64
|
|
|
|
List of labels for both row and column.
|
|
|
|
>>> df.loc[['cobra'], ['shield']]
|
|
shield
|
|
cobra 2
|
|
|
|
Slice with labels for row and single label for column. As mentioned
|
|
above, note that both the start and stop of the slice are included.
|
|
|
|
>>> df.loc['cobra':'viper', 'max_speed']
|
|
cobra 1
|
|
viper 4
|
|
Name: max_speed, dtype: int64
|
|
|
|
Conditional that returns a boolean Series
|
|
|
|
>>> df.loc[df['shield'] > 6]
|
|
max_speed shield
|
|
sidewinder 7 8
|
|
|
|
Conditional that returns a boolean Series with column labels specified
|
|
|
|
>>> df.loc[df['shield'] > 6, ['max_speed']]
|
|
max_speed
|
|
sidewinder 7
|
|
|
|
A boolean array of the same length as the column axis being sliced.
|
|
|
|
>>> df.loc[:, [False, True]]
|
|
shield
|
|
cobra 2
|
|
viper 5
|
|
sidewinder 8
|
|
|
|
An alignable boolean Series to the column axis being sliced.
|
|
|
|
>>> df.loc[:, pd.Series([False, True], index=['max_speed', 'shield'])]
|
|
shield
|
|
cobra 2
|
|
viper 5
|
|
sidewinder 8
|
|
|
|
**Setting values**
|
|
|
|
Setting value for all items matching the list of labels.
|
|
|
|
>>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
|
|
>>> df
|
|
max_speed shield
|
|
cobra 1 2
|
|
viper 4 50
|
|
sidewinder 7 50
|
|
|
|
Setting value for an entire row
|
|
|
|
>>> df.loc['cobra'] = 10
|
|
>>> df
|
|
max_speed shield
|
|
cobra 10 10
|
|
viper 4 50
|
|
sidewinder 7 50
|
|
|
|
Set value for an entire column
|
|
|
|
>>> df.loc[:, 'max_speed'] = 30
|
|
>>> df
|
|
max_speed shield
|
|
cobra 30 10
|
|
viper 30 50
|
|
sidewinder 30 50
|
|
|
|
Set value for an entire list of columns
|
|
|
|
>>> df.loc[:, ['max_speed', 'shield']] = 100
|
|
>>> df
|
|
max_speed shield
|
|
cobra 100 100
|
|
viper 100 100
|
|
sidewinder 100 100
|
|
|
|
Set value with Series
|
|
|
|
>>> df.loc[:, 'shield'] = df['shield'] * 2
|
|
>>> df
|
|
max_speed shield
|
|
cobra 100 200
|
|
viper 100 200
|
|
sidewinder 100 200
|
|
|
|
**Getting values on a DataFrame with an index that has integer labels**
|
|
|
|
Another example using integers for the index
|
|
|
|
>>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
|
|
... index=[7, 8, 9],
|
|
... columns=['max_speed', 'shield'])
|
|
>>> df
|
|
max_speed shield
|
|
7 1 2
|
|
8 4 5
|
|
9 7 8
|
|
|
|
Slice with integer labels for rows. As mentioned above, note that both
|
|
the start and stop of the slice are included.
|
|
|
|
>>> df.loc[7:9]
|
|
max_speed shield
|
|
7 1 2
|
|
8 4 5
|
|
9 7 8
|
|
"""
|
|
|
|
@staticmethod
|
|
def _NotImplemented(description):
|
|
return SparkPandasNotImplementedError(
|
|
description=description,
|
|
pandas_function=".loc[..., ...]",
|
|
spark_target_function="select, where",
|
|
)
|
|
|
|
def _select_rows_by_series(
|
|
self, rows_sel: "Series"
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
assert isinstance(rows_sel.spark.data_type, BooleanType), rows_sel.spark.data_type
|
|
return rows_sel.spark.column, None, None
|
|
|
|
def _select_rows_by_spark_column(
|
|
self, rows_sel: spark.Column
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
spark_type = self._internal.spark_frame.select(rows_sel).schema[0].dataType
|
|
assert isinstance(spark_type, BooleanType), spark_type
|
|
return rows_sel, None, None
|
|
|
|
def _select_rows_by_slice(
|
|
self, rows_sel: slice
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
from pyspark.pandas.indexes import MultiIndex
|
|
|
|
if rows_sel.step is not None:
|
|
raise LocIndexer._NotImplemented("Cannot use step with Spark.")
|
|
elif self._internal.index_level == 1:
|
|
sdf = self._internal.spark_frame
|
|
index = self._kdf_or_kser.index
|
|
index_column = index.to_series()
|
|
index_data_type = index_column.spark.data_type
|
|
start = rows_sel.start
|
|
stop = rows_sel.stop
|
|
|
|
# get natural order from '__natural_order__' from start to stop
|
|
# to keep natural order.
|
|
start_and_stop = (
|
|
sdf.select(index_column.spark.column, NATURAL_ORDER_COLUMN_NAME)
|
|
.where(
|
|
(index_column.spark.column == F.lit(start).cast(index_data_type))
|
|
| (index_column.spark.column == F.lit(stop).cast(index_data_type))
|
|
)
|
|
.collect()
|
|
)
|
|
|
|
start = [row[1] for row in start_and_stop if row[0] == start]
|
|
start = start[0] if len(start) > 0 else None
|
|
|
|
stop = [row[1] for row in start_and_stop if row[0] == stop]
|
|
stop = stop[-1] if len(stop) > 0 else None
|
|
|
|
conds = [] # type: List[spark.Column]
|
|
if start is not None:
|
|
conds.append(F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(LongType()))
|
|
if stop is not None:
|
|
conds.append(F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(LongType()))
|
|
|
|
# if index order is not monotonic increasing or decreasing
|
|
# and specified values don't exist in index, raise KeyError
|
|
if (start is None and rows_sel.start is not None) or (
|
|
stop is None and rows_sel.stop is not None
|
|
):
|
|
|
|
inc = index_column.is_monotonic_increasing
|
|
if inc is False:
|
|
dec = index_column.is_monotonic_decreasing
|
|
|
|
if start is None and rows_sel.start is not None:
|
|
start = rows_sel.start
|
|
if inc is not False:
|
|
conds.append(
|
|
index_column.spark.column >= F.lit(start).cast(index_data_type)
|
|
)
|
|
elif dec is not False:
|
|
conds.append(
|
|
index_column.spark.column <= F.lit(start).cast(index_data_type)
|
|
)
|
|
else:
|
|
raise KeyError(rows_sel.start)
|
|
if stop is None and rows_sel.stop is not None:
|
|
stop = rows_sel.stop
|
|
if inc is not False:
|
|
conds.append(index_column.spark.column <= F.lit(stop).cast(index_data_type))
|
|
elif dec is not False:
|
|
conds.append(index_column.spark.column >= F.lit(stop).cast(index_data_type))
|
|
else:
|
|
raise KeyError(rows_sel.stop)
|
|
|
|
return reduce(lambda x, y: x & y, conds), None, None
|
|
else:
|
|
index = self._kdf_or_kser.index
|
|
index_data_type = [f.dataType for f in index.to_series().spark.data_type]
|
|
|
|
start = rows_sel.start
|
|
if start is not None:
|
|
if not isinstance(start, tuple):
|
|
start = (start,)
|
|
if len(start) == 0:
|
|
start = None
|
|
stop = rows_sel.stop
|
|
if stop is not None:
|
|
if not isinstance(stop, tuple):
|
|
stop = (stop,)
|
|
if len(stop) == 0:
|
|
stop = None
|
|
|
|
depth = max(
|
|
len(start) if start is not None else 0, len(stop) if stop is not None else 0
|
|
)
|
|
if depth == 0:
|
|
return None, None, None
|
|
elif (
|
|
depth > self._internal.index_level
|
|
or not index.droplevel(list(range(self._internal.index_level)[depth:])).is_monotonic
|
|
):
|
|
raise KeyError(
|
|
"Key length ({}) was greater than MultiIndex sort depth".format(depth)
|
|
)
|
|
|
|
conds = []
|
|
if start is not None:
|
|
cond = F.lit(True)
|
|
for scol, value, dt in list(
|
|
zip(self._internal.index_spark_columns, start, index_data_type)
|
|
)[::-1]:
|
|
compare = MultiIndex._comparator_for_monotonic_increasing(dt)
|
|
cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise(
|
|
compare(scol, F.lit(value).cast(dt), spark.Column.__gt__)
|
|
)
|
|
conds.append(cond)
|
|
if stop is not None:
|
|
cond = F.lit(True)
|
|
for scol, value, dt in list(
|
|
zip(self._internal.index_spark_columns, stop, index_data_type)
|
|
)[::-1]:
|
|
compare = MultiIndex._comparator_for_monotonic_increasing(dt)
|
|
cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise(
|
|
compare(scol, F.lit(value).cast(dt), spark.Column.__lt__)
|
|
)
|
|
conds.append(cond)
|
|
|
|
return reduce(lambda x, y: x & y, conds), None, None
|
|
|
|
def _select_rows_by_iterable(
|
|
self, rows_sel: Iterable
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
rows_sel = list(rows_sel)
|
|
if len(rows_sel) == 0:
|
|
return F.lit(False), None, None
|
|
elif self._internal.index_level == 1:
|
|
index_column = self._kdf_or_kser.index.to_series()
|
|
index_data_type = index_column.spark.data_type
|
|
if len(rows_sel) == 1:
|
|
return (
|
|
index_column.spark.column == F.lit(rows_sel[0]).cast(index_data_type),
|
|
None,
|
|
None,
|
|
)
|
|
else:
|
|
return (
|
|
index_column.spark.column.isin(
|
|
[F.lit(r).cast(index_data_type) for r in rows_sel]
|
|
),
|
|
None,
|
|
None,
|
|
)
|
|
else:
|
|
raise LocIndexer._NotImplemented("Cannot select with MultiIndex with Spark.")
|
|
|
|
def _select_rows_else(
|
|
self, rows_sel: Any
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
if not isinstance(rows_sel, tuple):
|
|
rows_sel = (rows_sel,)
|
|
if len(rows_sel) > self._internal.index_level:
|
|
raise SparkPandasIndexingError("Too many indexers")
|
|
|
|
rows = [scol == value for scol, value in zip(self._internal.index_spark_columns, rows_sel)]
|
|
return (
|
|
reduce(lambda x, y: x & y, rows),
|
|
None,
|
|
self._internal.index_level - len(rows_sel),
|
|
)
|
|
|
|
def _get_from_multiindex_column(
|
|
self, key, missing_keys, labels=None, recursed=0
|
|
) -> Tuple[List[Tuple], Optional[List[spark.Column]], Any, bool, Optional[Tuple]]:
|
|
""" Select columns from multi-index columns. """
|
|
assert isinstance(key, tuple)
|
|
if labels is None:
|
|
labels = [(label, label) for label in self._internal.column_labels]
|
|
for k in key:
|
|
labels = [
|
|
(label, None if lbl is None else lbl[1:])
|
|
for label, lbl in labels
|
|
if (lbl is None and k is None) or (lbl is not None and lbl[0] == k)
|
|
]
|
|
if len(labels) == 0:
|
|
if missing_keys is None:
|
|
raise KeyError(k)
|
|
else:
|
|
missing_keys.append(key)
|
|
return [], [], [], False, None
|
|
|
|
if all(lbl is not None and len(lbl) > 0 and lbl[0] == "" for _, lbl in labels):
|
|
# If the head is '', drill down recursively.
|
|
labels = [(label, tuple([str(key), *lbl[1:]])) for i, (label, lbl) in enumerate(labels)]
|
|
return self._get_from_multiindex_column((str(key),), missing_keys, labels, recursed + 1)
|
|
else:
|
|
returns_series = all(lbl is None or len(lbl) == 0 for _, lbl in labels)
|
|
if returns_series:
|
|
labels = set(label for label, _ in labels)
|
|
assert len(labels) == 1
|
|
label = list(labels)[0]
|
|
column_labels = [label]
|
|
data_spark_columns = [self._internal.spark_column_for(label)]
|
|
data_dtypes = [self._internal.dtype_for(label)]
|
|
if label is None:
|
|
series_name = None
|
|
else:
|
|
if recursed > 0:
|
|
label = label[:-recursed]
|
|
series_name = label if len(label) > 1 else label[0]
|
|
else:
|
|
column_labels = [
|
|
None if lbl is None or lbl == (None,) else lbl for _, lbl in labels
|
|
]
|
|
data_spark_columns = [self._internal.spark_column_for(label) for label, _ in labels]
|
|
data_dtypes = [self._internal.dtype_for(label) for label, _ in labels]
|
|
series_name = None
|
|
|
|
return column_labels, data_spark_columns, data_dtypes, returns_series, series_name
|
|
|
|
def _select_cols_by_series(
|
|
self, cols_sel: "Series", missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
column_labels = [cols_sel._column_label]
|
|
data_spark_columns = [cols_sel.spark.column]
|
|
data_dtypes = [cols_sel.dtype]
|
|
return column_labels, data_spark_columns, data_dtypes, True, None
|
|
|
|
def _select_cols_by_spark_column(
|
|
self, cols_sel: spark.Column, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
column_labels = [
|
|
(self._internal.spark_frame.select(cols_sel).columns[0],)
|
|
] # type: List[Tuple]
|
|
data_spark_columns = [cols_sel]
|
|
return column_labels, data_spark_columns, None, True, None
|
|
|
|
def _select_cols_by_slice(
|
|
self, cols_sel: slice, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
start, stop = self._kdf_or_kser.columns.slice_locs(start=cols_sel.start, end=cols_sel.stop)
|
|
column_labels = self._internal.column_labels[start:stop]
|
|
data_spark_columns = self._internal.data_spark_columns[start:stop]
|
|
data_dtypes = self._internal.data_dtypes[start:stop]
|
|
return column_labels, data_spark_columns, data_dtypes, False, None
|
|
|
|
def _select_cols_by_iterable(
|
|
self, cols_sel: Iterable, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
from pyspark.pandas.series import Series
|
|
|
|
if all(isinstance(key, Series) for key in cols_sel):
|
|
column_labels = [key._column_label for key in cols_sel]
|
|
data_spark_columns = [key.spark.column for key in cols_sel]
|
|
data_dtypes = [key.dtype for key in cols_sel]
|
|
elif all(isinstance(key, spark.Column) for key in cols_sel):
|
|
column_labels = [
|
|
(self._internal.spark_frame.select(col).columns[0],) for col in cols_sel
|
|
]
|
|
data_spark_columns = list(cols_sel)
|
|
data_dtypes = None
|
|
elif all(isinstance(key, bool) for key in cols_sel) or all(
|
|
isinstance(key, np.bool_) for key in cols_sel
|
|
):
|
|
if len(cast(Sized, cols_sel)) != len(self._internal.column_labels):
|
|
raise IndexError(
|
|
"Boolean index has wrong length: %s instead of %s"
|
|
% (len(cast(Sized, cols_sel)), len(self._internal.column_labels))
|
|
)
|
|
if isinstance(cols_sel, pd.Series):
|
|
if not cols_sel.index.sort_values().equals(self._kdf.columns.sort_values()):
|
|
raise SparkPandasIndexingError(
|
|
"Unalignable boolean Series provided as indexer "
|
|
"(index of the boolean Series and of the indexed object do not match)"
|
|
)
|
|
else:
|
|
column_labels = [
|
|
column_label
|
|
for column_label in self._internal.column_labels
|
|
if cols_sel[column_label if len(column_label) > 1 else column_label[0]]
|
|
]
|
|
data_spark_columns = [
|
|
self._internal.spark_column_for(column_label)
|
|
for column_label in column_labels
|
|
]
|
|
data_dtypes = [
|
|
self._internal.dtype_for(column_label) for column_label in column_labels
|
|
]
|
|
else:
|
|
column_labels = [
|
|
self._internal.column_labels[i] for i, col in enumerate(cols_sel) if col
|
|
]
|
|
data_spark_columns = [
|
|
self._internal.data_spark_columns[i] for i, col in enumerate(cols_sel) if col
|
|
]
|
|
data_dtypes = [
|
|
self._internal.data_dtypes[i] for i, col in enumerate(cols_sel) if col
|
|
]
|
|
elif any(isinstance(key, tuple) for key in cols_sel) and any(
|
|
not is_name_like_tuple(key) for key in cols_sel
|
|
):
|
|
raise TypeError(
|
|
"Expected tuple, got {}".format(
|
|
type(set(key for key in cols_sel if not is_name_like_tuple(key)).pop())
|
|
)
|
|
)
|
|
else:
|
|
if missing_keys is None and all(isinstance(key, tuple) for key in cols_sel):
|
|
level = self._internal.column_labels_level
|
|
if any(len(key) != level for key in cols_sel):
|
|
raise ValueError("All the key level should be the same as column index level.")
|
|
|
|
column_labels = []
|
|
data_spark_columns = []
|
|
data_dtypes = []
|
|
for key in cols_sel:
|
|
found = False
|
|
for label in self._internal.column_labels:
|
|
if label == key or label[0] == key:
|
|
column_labels.append(label)
|
|
data_spark_columns.append(self._internal.spark_column_for(label))
|
|
data_dtypes.append(self._internal.dtype_for(label))
|
|
found = True
|
|
if not found:
|
|
if missing_keys is None:
|
|
raise KeyError("['{}'] not in index".format(name_like_string(key)))
|
|
else:
|
|
missing_keys.append(key)
|
|
|
|
return column_labels, data_spark_columns, data_dtypes, False, None
|
|
|
|
def _select_cols_else(
|
|
self, cols_sel: Any, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
if not is_name_like_tuple(cols_sel):
|
|
cols_sel = (cols_sel,)
|
|
return self._get_from_multiindex_column(cols_sel, missing_keys)
|
|
|
|
|
|
class iLocIndexer(LocIndexerLike):
|
|
"""
|
|
Purely integer-location based indexing for selection by position.
|
|
|
|
``.iloc[]`` is primarily integer position based (from ``0`` to
|
|
``length-1`` of the axis), but may also be used with a conditional boolean Series.
|
|
|
|
Allowed inputs are:
|
|
|
|
- An integer for column selection, e.g. ``5``.
|
|
- A list or array of integers for row selection with distinct index values,
|
|
e.g. ``[3, 4, 0]``
|
|
- A list or array of integers for column selection, e.g. ``[4, 3, 0]``.
|
|
- A boolean array for column selection.
|
|
- A slice object with ints for row and column selection, e.g. ``1:7``.
|
|
|
|
Not allowed inputs which pandas allows are:
|
|
|
|
- A list or array of integers for row selection with duplicated indexes,
|
|
e.g. ``[4, 4, 0]``.
|
|
- A boolean array for row selection.
|
|
- A ``callable`` function with one argument (the calling Series, DataFrame
|
|
or Panel) and that returns valid output for indexing (one of the above).
|
|
This is useful in method chains, when you don't have a reference to the
|
|
calling object, but would like to base your selection on some value.
|
|
|
|
``.iloc`` will raise ``IndexError`` if a requested indexer is
|
|
out-of-bounds, except *slice* indexers which allow out-of-bounds
|
|
indexing (this conforms with python/numpy *slice* semantics).
|
|
|
|
See Also
|
|
--------
|
|
DataFrame.loc : Purely label-location based indexer for selection by label.
|
|
Series.iloc : Purely integer-location based indexing for
|
|
selection by position.
|
|
|
|
Examples
|
|
--------
|
|
|
|
>>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
|
|
... {'a': 100, 'b': 200, 'c': 300, 'd': 400},
|
|
... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
|
|
>>> df = ps.DataFrame(mydict, columns=['a', 'b', 'c', 'd'])
|
|
>>> df
|
|
a b c d
|
|
0 1 2 3 4
|
|
1 100 200 300 400
|
|
2 1000 2000 3000 4000
|
|
|
|
**Indexing just the rows**
|
|
|
|
A scalar integer for row selection.
|
|
|
|
>>> df.iloc[1]
|
|
a 100
|
|
b 200
|
|
c 300
|
|
d 400
|
|
Name: 1, dtype: int64
|
|
|
|
>>> df.iloc[[0]]
|
|
a b c d
|
|
0 1 2 3 4
|
|
|
|
With a `slice` object.
|
|
|
|
>>> df.iloc[:3]
|
|
a b c d
|
|
0 1 2 3 4
|
|
1 100 200 300 400
|
|
2 1000 2000 3000 4000
|
|
|
|
**Indexing both axes**
|
|
|
|
You can mix the indexer types for the index and columns. Use ``:`` to
|
|
select the entire axis.
|
|
|
|
With scalar integers.
|
|
|
|
>>> df.iloc[:1, 1]
|
|
0 2
|
|
Name: b, dtype: int64
|
|
|
|
With lists of integers.
|
|
|
|
>>> df.iloc[:2, [1, 3]]
|
|
b d
|
|
0 2 4
|
|
1 200 400
|
|
|
|
With `slice` objects.
|
|
|
|
>>> df.iloc[:2, 0:3]
|
|
a b c
|
|
0 1 2 3
|
|
1 100 200 300
|
|
|
|
With a boolean array whose length matches the columns.
|
|
|
|
>>> df.iloc[:, [True, False, True, False]]
|
|
a c
|
|
0 1 3
|
|
1 100 300
|
|
2 1000 3000
|
|
|
|
**Setting values**
|
|
|
|
Setting value for all items matching the list of labels.
|
|
|
|
>>> df.iloc[[1, 2], [1]] = 50
|
|
>>> df
|
|
a b c d
|
|
0 1 2 3 4
|
|
1 100 50 300 400
|
|
2 1000 50 3000 4000
|
|
|
|
Setting value for an entire row
|
|
|
|
>>> df.iloc[0] = 10
|
|
>>> df
|
|
a b c d
|
|
0 10 10 10 10
|
|
1 100 50 300 400
|
|
2 1000 50 3000 4000
|
|
|
|
Set value for an entire column
|
|
|
|
>>> df.iloc[:, 2] = 30
|
|
>>> df
|
|
a b c d
|
|
0 10 10 30 10
|
|
1 100 50 30 400
|
|
2 1000 50 30 4000
|
|
|
|
Set value for an entire list of columns
|
|
|
|
>>> df.iloc[:, [2, 3]] = 100
|
|
>>> df
|
|
a b c d
|
|
0 10 10 100 100
|
|
1 100 50 100 100
|
|
2 1000 50 100 100
|
|
|
|
Set value with Series
|
|
|
|
>>> df.iloc[:, 3] = df.iloc[:, 3] * 2
|
|
>>> df
|
|
a b c d
|
|
0 10 10 100 200
|
|
1 100 50 100 200
|
|
2 1000 50 100 200
|
|
"""
|
|
|
|
@staticmethod
|
|
def _NotImplemented(description):
|
|
return SparkPandasNotImplementedError(
|
|
description=description,
|
|
pandas_function=".iloc[..., ...]",
|
|
spark_target_function="select, where",
|
|
)
|
|
|
|
@lazy_property
|
|
def _internal(self):
|
|
# Use resolved_copy to fix the natural order.
|
|
internal = super()._internal.resolved_copy
|
|
sdf = InternalFrame.attach_distributed_sequence_column(
|
|
internal.spark_frame, column_name=self._sequence_col
|
|
)
|
|
return internal.with_new_sdf(spark_frame=sdf.orderBy(NATURAL_ORDER_COLUMN_NAME))
|
|
|
|
@lazy_property
|
|
def _sequence_col(self):
|
|
# Use resolved_copy to fix the natural order.
|
|
internal = super()._internal.resolved_copy
|
|
return verify_temp_column_name(internal.spark_frame, "__distributed_sequence_column__")
|
|
|
|
def _select_rows_by_series(
|
|
self, rows_sel: "Series"
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
raise iLocIndexer._NotImplemented(
|
|
".iloc requires numeric slice, conditional "
|
|
"boolean Index or a sequence of positions as int, "
|
|
"got {}".format(type(rows_sel))
|
|
)
|
|
|
|
def _select_rows_by_spark_column(
|
|
self, rows_sel: spark.Column
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
raise iLocIndexer._NotImplemented(
|
|
".iloc requires numeric slice, conditional "
|
|
"boolean Index or a sequence of positions as int, "
|
|
"got {}".format(type(rows_sel))
|
|
)
|
|
|
|
def _select_rows_by_slice(
|
|
self, rows_sel: slice
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
def verify_type(i):
|
|
if not isinstance(i, int):
|
|
raise TypeError(
|
|
"cannot do slice indexing with these indexers [{}] of {}".format(i, type(i))
|
|
)
|
|
|
|
has_negative = False
|
|
start = rows_sel.start
|
|
if start is not None:
|
|
verify_type(start)
|
|
if start == 0:
|
|
start = None
|
|
elif start < 0:
|
|
has_negative = True
|
|
stop = rows_sel.stop
|
|
if stop is not None:
|
|
verify_type(stop)
|
|
if stop < 0:
|
|
has_negative = True
|
|
|
|
step = rows_sel.step
|
|
if step is not None:
|
|
verify_type(step)
|
|
if step == 0:
|
|
raise ValueError("slice step cannot be zero")
|
|
else:
|
|
step = 1
|
|
|
|
if start is None and step == 1:
|
|
return None, stop, None
|
|
|
|
sdf = self._internal.spark_frame
|
|
sequence_scol = sdf[self._sequence_col]
|
|
|
|
if has_negative or (step < 0 and start is None):
|
|
cnt = sdf.count()
|
|
|
|
cond = []
|
|
if start is not None:
|
|
if start < 0:
|
|
start = start + cnt
|
|
if step >= 0:
|
|
cond.append(sequence_scol >= F.lit(start).cast(LongType()))
|
|
else:
|
|
cond.append(sequence_scol <= F.lit(start).cast(LongType()))
|
|
if stop is not None:
|
|
if stop < 0:
|
|
stop = stop + cnt
|
|
if step >= 0:
|
|
cond.append(sequence_scol < F.lit(stop).cast(LongType()))
|
|
else:
|
|
cond.append(sequence_scol > F.lit(stop).cast(LongType()))
|
|
if step != 1:
|
|
if step > 0:
|
|
start = start or 0
|
|
else:
|
|
start = start or (cnt - 1)
|
|
cond.append(((sequence_scol - start) % F.lit(step).cast(LongType())) == F.lit(0))
|
|
|
|
return reduce(lambda x, y: x & y, cond), None, None
|
|
|
|
def _select_rows_by_iterable(
|
|
self, rows_sel: Iterable
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
sdf = self._internal.spark_frame
|
|
|
|
if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
|
|
offset = sdf.count()
|
|
else:
|
|
offset = 0
|
|
|
|
new_rows_sel = []
|
|
for key in list(rows_sel):
|
|
if not isinstance(key, (int, np.int, np.int64, np.int32)):
|
|
raise TypeError(
|
|
"cannot do positional indexing with these indexers [{}] of {}".format(
|
|
key, type(key)
|
|
)
|
|
)
|
|
if key < 0:
|
|
key = key + offset
|
|
new_rows_sel.append(key)
|
|
|
|
if len(new_rows_sel) != len(set(new_rows_sel)):
|
|
raise NotImplementedError(
|
|
"Duplicated row selection is not currently supported; "
|
|
"however, normalised index was [%s]" % new_rows_sel
|
|
)
|
|
|
|
sequence_scol = sdf[self._sequence_col]
|
|
cond = []
|
|
for key in new_rows_sel:
|
|
cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))
|
|
|
|
if len(cond) == 0:
|
|
cond = [F.lit(False)]
|
|
return reduce(lambda x, y: x | y, cond), None, None
|
|
|
|
def _select_rows_else(
|
|
self, rows_sel: Any
|
|
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
|
|
if isinstance(rows_sel, int):
|
|
sdf = self._internal.spark_frame
|
|
return (sdf[self._sequence_col] == rows_sel), None, 0
|
|
elif isinstance(rows_sel, tuple):
|
|
raise SparkPandasIndexingError("Too many indexers")
|
|
else:
|
|
raise iLocIndexer._NotImplemented(
|
|
".iloc requires numeric slice, conditional "
|
|
"boolean Index or a sequence of positions as int, "
|
|
"got {}".format(type(rows_sel))
|
|
)
|
|
|
|
def _select_cols_by_series(
|
|
self, cols_sel: "Series", missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
raise ValueError(
|
|
"Location based indexing can only have [integer, integer slice, "
|
|
"listlike of integers, boolean array] types, got {}".format(cols_sel)
|
|
)
|
|
|
|
def _select_cols_by_spark_column(
|
|
self, cols_sel: spark.Column, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
raise ValueError(
|
|
"Location based indexing can only have [integer, integer slice, "
|
|
"listlike of integers, boolean array] types, got {}".format(cols_sel)
|
|
)
|
|
|
|
def _select_cols_by_slice(
|
|
self, cols_sel: slice, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
if all(
|
|
s is None or isinstance(s, int) for s in (cols_sel.start, cols_sel.stop, cols_sel.step)
|
|
):
|
|
column_labels = self._internal.column_labels[cols_sel]
|
|
data_spark_columns = self._internal.data_spark_columns[cols_sel]
|
|
data_dtypes = self._internal.data_dtypes[cols_sel]
|
|
return column_labels, data_spark_columns, data_dtypes, False, None
|
|
else:
|
|
not_none = (
|
|
cols_sel.start
|
|
if cols_sel.start is not None
|
|
else cols_sel.stop
|
|
if cols_sel.stop is not None
|
|
else cols_sel.step
|
|
)
|
|
raise TypeError(
|
|
"cannot do slice indexing with these indexers {} of {}".format(
|
|
not_none, type(not_none)
|
|
)
|
|
)
|
|
|
|
def _select_cols_by_iterable(
|
|
self, cols_sel: Iterable, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
if all(isinstance(s, bool) for s in cols_sel):
|
|
cols_sel = [i for i, s in enumerate(cols_sel) if s]
|
|
if all(isinstance(s, int) for s in cols_sel):
|
|
column_labels = [self._internal.column_labels[s] for s in cols_sel]
|
|
data_spark_columns = [self._internal.data_spark_columns[s] for s in cols_sel]
|
|
data_dtypes = [self._internal.data_dtypes[s] for s in cols_sel]
|
|
return column_labels, data_spark_columns, data_dtypes, False, None
|
|
else:
|
|
raise TypeError("cannot perform reduce with flexible type")
|
|
|
|
def _select_cols_else(
|
|
self, cols_sel: Any, missing_keys: Optional[List[Tuple]]
|
|
) -> Tuple[
|
|
List[Tuple], Optional[List[spark.Column]], Optional[List[Dtype]], bool, Optional[Tuple]
|
|
]:
|
|
if isinstance(cols_sel, int):
|
|
if cols_sel > len(self._internal.column_labels):
|
|
raise KeyError(cols_sel)
|
|
column_labels = [self._internal.column_labels[cols_sel]]
|
|
data_spark_columns = [self._internal.data_spark_columns[cols_sel]]
|
|
data_dtypes = [self._internal.data_dtypes[cols_sel]]
|
|
return column_labels, data_spark_columns, data_dtypes, True, None
|
|
else:
|
|
raise ValueError(
|
|
"Location based indexing can only have [integer, integer slice, "
|
|
"listlike of integers, boolean array] types, got {}".format(cols_sel)
|
|
)
|
|
|
|
def __setitem__(self, key, value):
|
|
if is_list_like(value) and not isinstance(value, spark.Column):
|
|
iloc_item = self[key]
|
|
if not is_list_like(key) or not is_list_like(iloc_item):
|
|
raise ValueError("setting an array element with a sequence.")
|
|
else:
|
|
shape_iloc_item = iloc_item.shape
|
|
len_iloc_item = shape_iloc_item[0]
|
|
len_value = len(value)
|
|
if len_iloc_item != len_value:
|
|
if self._is_series:
|
|
raise ValueError(
|
|
"cannot set using a list-like indexer with a different length than "
|
|
"the value"
|
|
)
|
|
else:
|
|
raise ValueError(
|
|
"shape mismatch: value array of shape ({},) could not be broadcast "
|
|
"to indexing result of shape {}".format(len_value, shape_iloc_item)
|
|
)
|
|
super().__setitem__(key, value)
|
|
# Update again with resolved_copy to drop extra columns.
|
|
self._kdf._update_internal_frame(
|
|
self._kdf._internal.resolved_copy, requires_same_anchor=False
|
|
)
|
|
|
|
# Clean up implicitly cached properties to be able to reuse the indexer.
|
|
del self._internal
|
|
del self._sequence_col
|
|
|
|
|
|
def _test():
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.indexing

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.indexing.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.indexing tests")
        .getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.indexing,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()