[SPARK-36338][PYTHON][SQL] Move distributed-sequence implementation to Scala side
### What changes were proposed in this pull request?
This PR proposes to implement `distributed-sequence` index in Scala side.
### Why are the changes needed?
- Avoid unnecessary (de)serialization
- Keep the nullability in the input DataFrame when `distributed-sequence` is enabled. During the serialization, all fields are currently made nullable (see https://github.com/apache/spark/pull/32775#discussion_r645882104)
### Does this PR introduce _any_ user-facing change?
No to end users since pandas API on Spark is not released yet.
```python
import pyspark.pandas as ps
ps.set_option('compute.default_index_type', 'distributed-sequence')
ps.range(1).spark.print_schema()
```
Before:
```
root
|-- id: long (nullable = true)
```
After:
```
root
|-- id: long (nullable = false)
```
### How was this patch tested?
Manually tested, and existing tests should cover them.
Closes #33570 from HyukjinKwon/SPARK-36338.
Authored-by: Hyukjin Kwon <gurwls223@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit c6140d4d0a)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
parent
9cd370894b
commit
fee87f13d1
|
@ -169,7 +169,7 @@ class PandasOnSparkFrameMethods(object):
|
||||||
for scol, label in zip(internal.data_spark_columns, internal.column_labels)
|
for scol, label in zip(internal.data_spark_columns, internal.column_labels)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
sdf, force_nullable = attach_func(sdf, name_like_string(column))
|
sdf = attach_func(sdf, name_like_string(column))
|
||||||
|
|
||||||
return DataFrame(
|
return DataFrame(
|
||||||
InternalFrame(
|
InternalFrame(
|
||||||
|
@ -178,28 +178,18 @@ class PandasOnSparkFrameMethods(object):
|
||||||
scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
|
scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
|
||||||
],
|
],
|
||||||
index_names=internal.index_names,
|
index_names=internal.index_names,
|
||||||
index_fields=(
|
index_fields=internal.index_fields,
|
||||||
[field.copy(nullable=True) for field in internal.index_fields]
|
|
||||||
if force_nullable
|
|
||||||
else internal.index_fields
|
|
||||||
),
|
|
||||||
column_labels=internal.column_labels + [column],
|
column_labels=internal.column_labels + [column],
|
||||||
data_spark_columns=(
|
data_spark_columns=(
|
||||||
[scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
|
[scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
|
||||||
+ [scol_for(sdf, name_like_string(column))]
|
+ [scol_for(sdf, name_like_string(column))]
|
||||||
),
|
),
|
||||||
data_fields=(
|
data_fields=internal.data_fields
|
||||||
(
|
+ [
|
||||||
[field.copy(nullable=True) for field in internal.data_fields]
|
InternalField.from_struct_field(
|
||||||
if force_nullable
|
StructField(name_like_string(column), LongType(), nullable=False)
|
||||||
else internal.data_fields
|
|
||||||
)
|
)
|
||||||
+ [
|
],
|
||||||
InternalField.from_struct_field(
|
|
||||||
StructField(name_like_string(column), LongType(), nullable=False)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
),
|
|
||||||
column_label_names=internal.column_label_names,
|
column_label_names=internal.column_label_names,
|
||||||
).resolved_copy
|
).resolved_copy
|
||||||
)
|
)
|
||||||
|
|
|
@ -175,7 +175,7 @@ _options = [
|
||||||
Option(
|
Option(
|
||||||
key="compute.default_index_type",
|
key="compute.default_index_type",
|
||||||
doc=("This sets the default index type: sequence, distributed and distributed-sequence."),
|
doc=("This sets the default index type: sequence, distributed and distributed-sequence."),
|
||||||
default="sequence",
|
default="distributed-sequence",
|
||||||
types=str,
|
types=str,
|
||||||
check_func=(
|
check_func=(
|
||||||
lambda v: v in ("sequence", "distributed", "distributed-sequence"),
|
lambda v: v in ("sequence", "distributed", "distributed-sequence"),
|
||||||
|
|
|
@ -1692,9 +1692,7 @@ class Index(IndexOpsMixin):
|
||||||
]
|
]
|
||||||
sdf = sdf.select(index_value_columns)
|
sdf = sdf.select(index_value_columns)
|
||||||
|
|
||||||
sdf, force_nullable = InternalFrame.attach_default_index(
|
sdf = InternalFrame.attach_default_index(sdf, default_index_type="distributed-sequence")
|
||||||
sdf, default_index_type="distributed-sequence"
|
|
||||||
)
|
|
||||||
# sdf here looks as below
|
# sdf here looks as below
|
||||||
# +-----------------+-----------------+-----------------+-----------------+
|
# +-----------------+-----------------+-----------------+-----------------+
|
||||||
# |__index_level_0__|__index_value_0__|__index_value_1__|__index_value_2__|
|
# |__index_level_0__|__index_value_0__|__index_value_1__|__index_value_2__|
|
||||||
|
@ -1727,11 +1725,7 @@ class Index(IndexOpsMixin):
|
||||||
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
||||||
],
|
],
|
||||||
index_names=self._internal.index_names,
|
index_names=self._internal.index_names,
|
||||||
index_fields=(
|
index_fields=self._internal.index_fields,
|
||||||
[field.copy(nullable=True) for field in self._internal.index_fields]
|
|
||||||
if force_nullable
|
|
||||||
else self._internal.index_fields
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return DataFrame(internal).index
|
return DataFrame(internal).index
|
||||||
|
@ -1829,7 +1823,7 @@ class Index(IndexOpsMixin):
|
||||||
"""
|
"""
|
||||||
sdf = self._internal.spark_frame.select(self.spark.column)
|
sdf = self._internal.spark_frame.select(self.spark.column)
|
||||||
sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
||||||
sdf, _ = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
|
sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
|
||||||
# spark_frame here looks like below
|
# spark_frame here looks like below
|
||||||
# +-----------------+---------------+
|
# +-----------------+---------------+
|
||||||
# |__index_level_0__|__index_value__|
|
# |__index_level_0__|__index_value__|
|
||||||
|
@ -1877,7 +1871,7 @@ class Index(IndexOpsMixin):
|
||||||
"""
|
"""
|
||||||
sdf = self._internal.spark_frame.select(self.spark.column)
|
sdf = self._internal.spark_frame.select(self.spark.column)
|
||||||
sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
||||||
sdf, _ = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
|
sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
sdf.orderBy(
|
sdf.orderBy(
|
||||||
|
@ -2475,7 +2469,7 @@ class Index(IndexOpsMixin):
|
||||||
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
||||||
],
|
],
|
||||||
index_names=self._internal.index_names,
|
index_names=self._internal.index_names,
|
||||||
index_fields=[field.copy(nullable=True) for field in self._internal.index_fields],
|
index_fields=[InternalField(field.dtype) for field in self._internal.index_fields],
|
||||||
)
|
)
|
||||||
return DataFrame(internal).index
|
return DataFrame(internal).index
|
||||||
|
|
||||||
|
|
|
@ -1054,7 +1054,7 @@ class MultiIndex(Index):
|
||||||
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
scol_for(sdf, col) for col in self._internal.index_spark_column_names
|
||||||
],
|
],
|
||||||
index_names=self._internal.index_names,
|
index_names=self._internal.index_names,
|
||||||
index_fields=[field.copy(nullable=True) for field in self._internal.index_fields],
|
index_fields=[InternalField(field.dtype) for field in self._internal.index_fields],
|
||||||
)
|
)
|
||||||
return DataFrame(internal).index
|
return DataFrame(internal).index
|
||||||
|
|
||||||
|
|
|
@ -1536,22 +1536,10 @@ class iLocIndexer(LocIndexerLike):
|
||||||
def _internal(self) -> "InternalFrame":
|
def _internal(self) -> "InternalFrame":
|
||||||
# Use resolved_copy to fix the natural order.
|
# Use resolved_copy to fix the natural order.
|
||||||
internal = super()._internal.resolved_copy
|
internal = super()._internal.resolved_copy
|
||||||
sdf, force_nullable = InternalFrame.attach_distributed_sequence_column(
|
sdf = InternalFrame.attach_distributed_sequence_column(
|
||||||
internal.spark_frame, column_name=self._sequence_col
|
internal.spark_frame, column_name=self._sequence_col
|
||||||
)
|
)
|
||||||
return internal.with_new_sdf(
|
return internal.with_new_sdf(spark_frame=sdf.orderBy(NATURAL_ORDER_COLUMN_NAME))
|
||||||
spark_frame=sdf.orderBy(NATURAL_ORDER_COLUMN_NAME),
|
|
||||||
index_fields=(
|
|
||||||
[field.copy(nullable=True) for field in internal.index_fields]
|
|
||||||
if force_nullable
|
|
||||||
else internal.index_fields
|
|
||||||
),
|
|
||||||
data_fields=(
|
|
||||||
[field.copy(nullable=True) for field in internal.data_fields]
|
|
||||||
if force_nullable
|
|
||||||
else internal.data_fields
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
@lazy_property
|
@lazy_property
|
||||||
def _sequence_col(self) -> str:
|
def _sequence_col(self) -> str:
|
||||||
|
|
|
@ -20,15 +20,12 @@ An internal immutable DataFrame with some metadata to manage indexes.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, TYPE_CHECKING, cast
|
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, TYPE_CHECKING, cast
|
||||||
from itertools import accumulate
|
|
||||||
import py4j
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.api.types import CategoricalDtype # noqa: F401
|
from pandas.api.types import CategoricalDtype # noqa: F401
|
||||||
from pyspark._globals import _NoValue, _NoValueType
|
from pyspark._globals import _NoValue, _NoValueType
|
||||||
from pyspark.sql import functions as F, Column, DataFrame as SparkDataFrame, Window
|
from pyspark.sql import functions as F, Column, DataFrame as SparkDataFrame, Window
|
||||||
from pyspark.sql.functions import pandas_udf
|
|
||||||
from pyspark.sql.types import ( # noqa: F401
|
from pyspark.sql.types import ( # noqa: F401
|
||||||
BooleanType,
|
BooleanType,
|
||||||
DataType,
|
DataType,
|
||||||
|
@ -64,7 +61,6 @@ from pyspark.pandas.utils import (
|
||||||
name_like_string,
|
name_like_string,
|
||||||
scol_for,
|
scol_for,
|
||||||
spark_column_equals,
|
spark_column_equals,
|
||||||
verify_temp_column_name,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -636,7 +632,7 @@ class InternalFrame(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create default index.
|
# Create default index.
|
||||||
spark_frame, force_nullable = InternalFrame.attach_default_index(spark_frame)
|
spark_frame = InternalFrame.attach_default_index(spark_frame)
|
||||||
index_spark_columns = [scol_for(spark_frame, SPARK_DEFAULT_INDEX_NAME)]
|
index_spark_columns = [scol_for(spark_frame, SPARK_DEFAULT_INDEX_NAME)]
|
||||||
|
|
||||||
index_fields = [
|
index_fields = [
|
||||||
|
@ -658,7 +654,6 @@ class InternalFrame(object):
|
||||||
data_fields = [
|
data_fields = [
|
||||||
field.copy(
|
field.copy(
|
||||||
name=name_like_string(struct_field.name),
|
name=name_like_string(struct_field.name),
|
||||||
nullable=(force_nullable or field.nullable),
|
|
||||||
)
|
)
|
||||||
for field, struct_field in zip(data_fields, data_struct_fields)
|
for field, struct_field in zip(data_fields, data_struct_fields)
|
||||||
]
|
]
|
||||||
|
@ -836,7 +831,7 @@ class InternalFrame(object):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def attach_default_index(
|
def attach_default_index(
|
||||||
sdf: SparkDataFrame, default_index_type: Optional[str] = None
|
sdf: SparkDataFrame, default_index_type: Optional[str] = None
|
||||||
) -> Tuple[SparkDataFrame, bool]:
|
) -> SparkDataFrame:
|
||||||
"""
|
"""
|
||||||
This method attaches a default index to Spark DataFrame. Spark does not have the index
|
This method attaches a default index to Spark DataFrame. Spark does not have the index
|
||||||
notion so corresponding column should be generated.
|
notion so corresponding column should be generated.
|
||||||
|
@ -848,13 +843,13 @@ class InternalFrame(object):
|
||||||
|
|
||||||
It adds the default index column '__index_level_0__'.
|
It adds the default index column '__index_level_0__'.
|
||||||
|
|
||||||
>>> spark_frame = InternalFrame.attach_default_index(spark_frame)[0]
|
>>> spark_frame = InternalFrame.attach_default_index(spark_frame)
|
||||||
>>> spark_frame
|
>>> spark_frame
|
||||||
DataFrame[__index_level_0__: bigint, id: bigint]
|
DataFrame[__index_level_0__: bigint, id: bigint]
|
||||||
|
|
||||||
It throws an exception if the given column name already exists.
|
It throws an exception if the given column name already exists.
|
||||||
|
|
||||||
>>> InternalFrame.attach_default_index(spark_frame)[0]
|
>>> InternalFrame.attach_default_index(spark_frame)
|
||||||
... # doctest: +ELLIPSIS
|
... # doctest: +ELLIPSIS
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
...
|
...
|
||||||
|
@ -881,34 +876,26 @@ class InternalFrame(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def attach_sequence_column(
|
def attach_sequence_column(sdf: SparkDataFrame, column_name: str) -> SparkDataFrame:
|
||||||
sdf: SparkDataFrame, column_name: str
|
|
||||||
) -> Tuple[SparkDataFrame, bool]:
|
|
||||||
scols = [scol_for(sdf, column) for column in sdf.columns]
|
scols = [scol_for(sdf, column) for column in sdf.columns]
|
||||||
sequential_index = (
|
sequential_index = (
|
||||||
F.row_number().over(Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1
|
F.row_number().over(Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1
|
||||||
)
|
)
|
||||||
return sdf.select(sequential_index.alias(column_name), *scols), False
|
return sdf.select(sequential_index.alias(column_name), *scols)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def attach_distributed_column(
|
def attach_distributed_column(sdf: SparkDataFrame, column_name: str) -> SparkDataFrame:
|
||||||
sdf: SparkDataFrame, column_name: str
|
|
||||||
) -> Tuple[SparkDataFrame, bool]:
|
|
||||||
scols = [scol_for(sdf, column) for column in sdf.columns]
|
scols = [scol_for(sdf, column) for column in sdf.columns]
|
||||||
return sdf.select(F.monotonically_increasing_id().alias(column_name), *scols), False
|
return sdf.select(F.monotonically_increasing_id().alias(column_name), *scols)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def attach_distributed_sequence_column(
|
def attach_distributed_sequence_column(sdf: SparkDataFrame, column_name: str) -> SparkDataFrame:
|
||||||
sdf: SparkDataFrame, column_name: str
|
|
||||||
) -> Tuple[SparkDataFrame, bool]:
|
|
||||||
"""
|
"""
|
||||||
This method attaches a Spark column that has a sequence in a distributed manner.
|
This method attaches a Spark column that has a sequence in a distributed manner.
|
||||||
This is equivalent to the column assigned when default index type 'distributed-sequence'.
|
This is equivalent to the column assigned when default index type 'distributed-sequence'.
|
||||||
|
|
||||||
>>> sdf = ps.DataFrame(['a', 'b', 'c']).to_spark()
|
>>> sdf = ps.DataFrame(['a', 'b', 'c']).to_spark()
|
||||||
>>> sdf, force_nullable = (
|
>>> sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
|
||||||
... InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
|
|
||||||
... )
|
|
||||||
>>> sdf.show() # doctest: +NORMALIZE_WHITESPACE
|
>>> sdf.show() # doctest: +NORMALIZE_WHITESPACE
|
||||||
+--------+---+
|
+--------+---+
|
||||||
|sequence| 0|
|
|sequence| 0|
|
||||||
|
@ -917,124 +904,21 @@ class InternalFrame(object):
|
||||||
| 1| b|
|
| 1| b|
|
||||||
| 2| c|
|
| 2| c|
|
||||||
+--------+---+
|
+--------+---+
|
||||||
>>> force_nullable
|
|
||||||
True
|
|
||||||
"""
|
"""
|
||||||
if len(sdf.columns) > 0:
|
if len(sdf.columns) > 0:
|
||||||
try:
|
return SparkDataFrame(
|
||||||
jdf = sdf._jdf.toDF() # type: ignore
|
sdf._jdf.toDF().withSequenceColumn(column_name), # type: ignore
|
||||||
|
sdf.sql_ctx,
|
||||||
sql_ctx = sdf.sql_ctx
|
)
|
||||||
encoders = sql_ctx._jvm.org.apache.spark.sql.Encoders # type: ignore
|
|
||||||
encoder = encoders.tuple(jdf.exprEnc(), encoders.scalaLong())
|
|
||||||
|
|
||||||
jrdd = jdf.localCheckpoint(False).rdd().zipWithIndex()
|
|
||||||
|
|
||||||
df = SparkDataFrame(
|
|
||||||
sql_ctx.sparkSession._jsparkSession.createDataset( # type: ignore
|
|
||||||
jrdd, encoder
|
|
||||||
).toDF(),
|
|
||||||
sql_ctx,
|
|
||||||
)
|
|
||||||
columns = df.columns
|
|
||||||
return (
|
|
||||||
df.selectExpr(
|
|
||||||
"`{}` as `{}`".format(columns[1], column_name), "`{}`.*".format(columns[0])
|
|
||||||
),
|
|
||||||
True,
|
|
||||||
)
|
|
||||||
except py4j.protocol.Py4JError:
|
|
||||||
if is_testing():
|
|
||||||
raise
|
|
||||||
return InternalFrame._attach_distributed_sequence_column(sdf, column_name)
|
|
||||||
else:
|
else:
|
||||||
cnt = sdf.count()
|
cnt = sdf.count()
|
||||||
if cnt > 0:
|
if cnt > 0:
|
||||||
return default_session().range(cnt).toDF(column_name), False
|
return default_session().range(cnt).toDF(column_name)
|
||||||
else:
|
else:
|
||||||
return (
|
return default_session().createDataFrame(
|
||||||
default_session().createDataFrame(
|
[], schema=StructType().add(column_name, data_type=LongType(), nullable=False)
|
||||||
[],
|
|
||||||
schema=StructType().add(column_name, data_type=LongType(), nullable=False),
|
|
||||||
),
|
|
||||||
False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _attach_distributed_sequence_column(
|
|
||||||
sdf: SparkDataFrame, column_name: str
|
|
||||||
) -> Tuple[SparkDataFrame, bool]:
|
|
||||||
"""
|
|
||||||
>>> sdf = ps.DataFrame(['a', 'b', 'c']).to_spark()
|
|
||||||
>>> sdf, force_nullable = (
|
|
||||||
... InternalFrame._attach_distributed_sequence_column(sdf, column_name="sequence")
|
|
||||||
... )
|
|
||||||
>>> sdf.sort("sequence").show() # doctest: +NORMALIZE_WHITESPACE
|
|
||||||
+--------+---+
|
|
||||||
|sequence| 0|
|
|
||||||
+--------+---+
|
|
||||||
| 0| a|
|
|
||||||
| 1| b|
|
|
||||||
| 2| c|
|
|
||||||
+--------+---+
|
|
||||||
>>> force_nullable
|
|
||||||
False
|
|
||||||
"""
|
|
||||||
scols = [scol_for(sdf, column) for column in sdf.columns]
|
|
||||||
|
|
||||||
spark_partition_column = verify_temp_column_name(sdf, "__spark_partition_id__")
|
|
||||||
offset_column = verify_temp_column_name(sdf, "__offset__")
|
|
||||||
row_number_column = verify_temp_column_name(sdf, "__row_number__")
|
|
||||||
|
|
||||||
# 1. Calculates counts per each partition ID. `counts` here is, for instance,
|
|
||||||
# {
|
|
||||||
# 1: 83,
|
|
||||||
# 6: 83,
|
|
||||||
# 3: 83,
|
|
||||||
# ...
|
|
||||||
# }
|
|
||||||
sdf = sdf.withColumn(spark_partition_column, F.spark_partition_id())
|
|
||||||
|
|
||||||
# Checkpoint the DataFrame to fix the partition ID.
|
|
||||||
sdf = sdf.localCheckpoint(eager=False)
|
|
||||||
|
|
||||||
counts = map(
|
|
||||||
lambda x: (x["key"], x["count"]),
|
|
||||||
sdf.groupby(sdf[spark_partition_column].alias("key")).count().collect(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# 2. Calculates cumulative sum in an order of partition id.
|
|
||||||
# Note that it does not matter if partition id guarantees its order or not.
|
|
||||||
# We just need a one-by-one sequential id.
|
|
||||||
|
|
||||||
# sort by partition key.
|
|
||||||
sorted_counts = sorted(counts, key=lambda x: x[0])
|
|
||||||
# get cumulative sum in an order of partition key.
|
|
||||||
cumulative_counts = [0] + list(accumulate(map(lambda count: count[1], sorted_counts)))
|
|
||||||
# zip it with partition key.
|
|
||||||
sums = dict(zip(map(lambda count: count[0], sorted_counts), cumulative_counts))
|
|
||||||
|
|
||||||
# 3. Attach offset for each partition.
|
|
||||||
@pandas_udf(returnType=LongType()) # type: ignore
|
|
||||||
def offset(id: pd.Series) -> pd.Series:
|
|
||||||
current_partition_offset = sums[id.iloc[0]]
|
|
||||||
return pd.Series(current_partition_offset).repeat(len(id))
|
|
||||||
|
|
||||||
sdf = sdf.withColumn(offset_column, offset(spark_partition_column))
|
|
||||||
|
|
||||||
# 4. Calculate row_number in each partition.
|
|
||||||
w = Window.partitionBy(spark_partition_column).orderBy(F.monotonically_increasing_id())
|
|
||||||
row_number = F.row_number().over(w)
|
|
||||||
sdf = sdf.withColumn(row_number_column, row_number)
|
|
||||||
|
|
||||||
# 5. Calculate the index.
|
|
||||||
return (
|
|
||||||
sdf.select(
|
|
||||||
(sdf[offset_column] + sdf[row_number_column] - 1).alias(column_name), *scols
|
|
||||||
),
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
|
|
||||||
def spark_column_for(self, label: Label) -> Column:
|
def spark_column_for(self, label: Label) -> Column:
|
||||||
"""Return Spark Column for the given column label."""
|
"""Return Spark Column for the given column label."""
|
||||||
column_labels_to_scol = dict(zip(self.column_labels, self.data_spark_columns))
|
column_labels_to_scol = dict(zip(self.column_labels, self.data_spark_columns))
|
||||||
|
|
|
@ -5533,7 +5533,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
|
||||||
sdf_for_index = notnull._internal.spark_frame.select(notnull._internal.index_spark_columns)
|
sdf_for_index = notnull._internal.spark_frame.select(notnull._internal.index_spark_columns)
|
||||||
|
|
||||||
tmp_join_key = verify_temp_column_name(sdf_for_index, "__tmp_join_key__")
|
tmp_join_key = verify_temp_column_name(sdf_for_index, "__tmp_join_key__")
|
||||||
sdf_for_index, _ = InternalFrame.attach_distributed_sequence_column(
|
sdf_for_index = InternalFrame.attach_distributed_sequence_column(
|
||||||
sdf_for_index, tmp_join_key
|
sdf_for_index, tmp_join_key
|
||||||
)
|
)
|
||||||
# sdf_for_index:
|
# sdf_for_index:
|
||||||
|
@ -5550,7 +5550,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
|
||||||
sdf_for_data = notnull._internal.spark_frame.select(
|
sdf_for_data = notnull._internal.spark_frame.select(
|
||||||
notnull.spark.column.alias("values"), NATURAL_ORDER_COLUMN_NAME
|
notnull.spark.column.alias("values"), NATURAL_ORDER_COLUMN_NAME
|
||||||
)
|
)
|
||||||
sdf_for_data, _ = InternalFrame.attach_distributed_sequence_column(
|
sdf_for_data = InternalFrame.attach_distributed_sequence_column(
|
||||||
sdf_for_data, SPARK_DEFAULT_SERIES_NAME
|
sdf_for_data, SPARK_DEFAULT_SERIES_NAME
|
||||||
)
|
)
|
||||||
# sdf_for_data:
|
# sdf_for_data:
|
||||||
|
@ -5569,9 +5569,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
|
||||||
).drop("values", NATURAL_ORDER_COLUMN_NAME)
|
).drop("values", NATURAL_ORDER_COLUMN_NAME)
|
||||||
|
|
||||||
tmp_join_key = verify_temp_column_name(sdf_for_data, "__tmp_join_key__")
|
tmp_join_key = verify_temp_column_name(sdf_for_data, "__tmp_join_key__")
|
||||||
sdf_for_data, _ = InternalFrame.attach_distributed_sequence_column(
|
sdf_for_data = InternalFrame.attach_distributed_sequence_column(sdf_for_data, tmp_join_key)
|
||||||
sdf_for_data, tmp_join_key
|
|
||||||
)
|
|
||||||
# sdf_for_index: sdf_for_data:
|
# sdf_for_index: sdf_for_data:
|
||||||
# +----------------+-----------------+ +----------------+---+
|
# +----------------+-----------------+ +----------------+---+
|
||||||
# |__tmp_join_key__|__index_level_0__| |__tmp_join_key__| 0|
|
# |__tmp_join_key__|__index_level_0__| |__tmp_join_key__| 0|
|
||||||
|
@ -5639,7 +5637,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
|
||||||
return -1
|
return -1
|
||||||
# We should remember the natural sequence started from 0
|
# We should remember the natural sequence started from 0
|
||||||
seq_col_name = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
seq_col_name = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
||||||
sdf, _ = InternalFrame.attach_distributed_sequence_column(
|
sdf = InternalFrame.attach_distributed_sequence_column(
|
||||||
sdf.drop(NATURAL_ORDER_COLUMN_NAME), seq_col_name
|
sdf.drop(NATURAL_ORDER_COLUMN_NAME), seq_col_name
|
||||||
)
|
)
|
||||||
# If the maximum is achieved in multiple locations, the first row position is returned.
|
# If the maximum is achieved in multiple locations, the first row position is returned.
|
||||||
|
@ -5686,7 +5684,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
|
||||||
return -1
|
return -1
|
||||||
# We should remember the natural sequence started from 0
|
# We should remember the natural sequence started from 0
|
||||||
seq_col_name = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
seq_col_name = verify_temp_column_name(sdf, "__distributed_sequence_column__")
|
||||||
sdf, _ = InternalFrame.attach_distributed_sequence_column(
|
sdf = InternalFrame.attach_distributed_sequence_column(
|
||||||
sdf.drop(NATURAL_ORDER_COLUMN_NAME), seq_col_name
|
sdf.drop(NATURAL_ORDER_COLUMN_NAME), seq_col_name
|
||||||
)
|
)
|
||||||
# If the minimum is achieved in multiple locations, the first row position is returned.
|
# If the minimum is achieved in multiple locations, the first row position is returned.
|
||||||
|
|
|
@ -5146,23 +5146,26 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
|
||||||
sys.stdout = prev
|
sys.stdout = prev
|
||||||
|
|
||||||
def test_explain_hint(self):
|
def test_explain_hint(self):
|
||||||
psdf1 = ps.DataFrame(
|
with ps.option_context("compute.default_index_type", "sequence"):
|
||||||
{"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, columns=["lkey", "value"]
|
psdf1 = ps.DataFrame(
|
||||||
)
|
{"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]},
|
||||||
psdf2 = ps.DataFrame(
|
columns=["lkey", "value"],
|
||||||
{"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, columns=["rkey", "value"]
|
)
|
||||||
)
|
psdf2 = ps.DataFrame(
|
||||||
merged = psdf1.merge(psdf2.spark.hint("broadcast"), left_on="lkey", right_on="rkey")
|
{"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]},
|
||||||
prev = sys.stdout
|
columns=["rkey", "value"],
|
||||||
try:
|
)
|
||||||
out = StringIO()
|
merged = psdf1.merge(psdf2.spark.hint("broadcast"), left_on="lkey", right_on="rkey")
|
||||||
sys.stdout = out
|
prev = sys.stdout
|
||||||
merged.spark.explain()
|
try:
|
||||||
actual = out.getvalue().strip()
|
out = StringIO()
|
||||||
|
sys.stdout = out
|
||||||
|
merged.spark.explain()
|
||||||
|
actual = out.getvalue().strip()
|
||||||
|
|
||||||
self.assertTrue("Broadcast" in actual, actual)
|
self.assertTrue("Broadcast" in actual, actual)
|
||||||
finally:
|
finally:
|
||||||
sys.stdout = prev
|
sys.stdout = prev
|
||||||
|
|
||||||
def test_mad(self):
|
def test_mad(self):
|
||||||
pdf = pd.DataFrame(
|
pdf = pd.DataFrame(
|
||||||
|
|
|
@ -3509,6 +3509,31 @@ class Dataset[T] private[sql](
|
||||||
// For Python API
|
// For Python API
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
* It adds a new long column with the name `name` that increases one by one.
|
||||||
|
* This is for 'distributed-sequence' default index in pandas API on Spark.
|
||||||
|
*/
|
||||||
|
private[sql] def withSequenceColumn(name: String) = {
|
||||||
|
val rdd: RDD[InternalRow] =
|
||||||
|
// Checkpoint the DataFrame to fix the partition ID.
|
||||||
|
localCheckpoint(false)
|
||||||
|
.queryExecution.toRdd.zipWithIndex().mapPartitions { iter =>
|
||||||
|
val joinedRow = new JoinedRow
|
||||||
|
val unsafeRowWriter =
|
||||||
|
new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1)
|
||||||
|
|
||||||
|
iter.map { case (row, id) =>
|
||||||
|
// Writes to an UnsafeRow directly
|
||||||
|
unsafeRowWriter.reset()
|
||||||
|
unsafeRowWriter.write(0, id)
|
||||||
|
joinedRow(unsafeRowWriter.getRow, row)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sparkSession.internalCreateDataFrame(
|
||||||
|
rdd, StructType(StructField(name, LongType, nullable = false) +: schema), isStreaming)
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a JavaRDD to a PythonRDD.
|
* Converts a JavaRDD to a PythonRDD.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -2937,6 +2937,12 @@ class DataFrameSuite extends QueryTest
|
||||||
checkAnswer(sql("SELECT sum(c1 * c3) + sum(c2 * c3) FROM tbl"), Row(2.00000000000) :: Nil)
|
checkAnswer(sql("SELECT sum(c1 * c3) + sum(c2 * c3) FROM tbl"), Row(2.00000000000) :: Nil)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("SPARK-36338: DataFrame.withSequenceColumn should append unique sequence IDs") {
|
||||||
|
val ids = spark.range(10).repartition(5)
|
||||||
|
.withSequenceColumn("default_index").collect().map(_.getLong(0))
|
||||||
|
assert(ids.toSet === Range(0, 10).toSet)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
case class GroupByKey(a: Int, b: Int)
|
case class GroupByKey(a: Int, b: Int)
|
||||||
|
|
Loading…
Reference in a new issue