#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import overload
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Hashable,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
    TypeVar,
)
from typing_extensions import Literal

from numpy import int32, int64, float32, float64, ndarray  # type: ignore[import]

from pyspark._typing import SupportsOrdering
from pyspark.sql.pandas._typing import (
    PandasScalarUDFType,
    PandasScalarIterUDFType,
    PandasGroupedMapUDFType,
    PandasCogroupedMapUDFType,
    PandasGroupedAggUDFType,
    PandasMapIterUDFType,
)
import pyspark.context
from pyspark.resultiterable import ResultIterable
from pyspark.serializers import Serializer
from pyspark.storagelevel import StorageLevel
from pyspark.resource.requests import (  # noqa: F401
    ExecutorResourceRequests,
    TaskResourceRequests,
)
from pyspark.resource.profile import ResourceProfile
from pyspark.statcounter import StatCounter
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StructType
from pyspark.sql._typing import RowLike
from py4j.java_gateway import JavaObject  # type: ignore[import]

T = TypeVar("T")
U = TypeVar("U")
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")
V1 = TypeVar("V1")
V2 = TypeVar("V2")
V3 = TypeVar("V3")
O = TypeVar("O", bound=SupportsOrdering)
NumberOrArray = TypeVar(
    "NumberOrArray", float, int, complex, int32, int64, float32, float64, ndarray
)
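
# How the type variables above are used: T/U are unconstrained element types,
# K is a hashable key, V/V1/V2/V3 are value types for pair-RDD methods, O is a
# key type that supports comparison (SupportsOrdering), and NumberOrArray
# restricts the numeric methods (sum, stats, mean, ...) to Python/NumPy
# scalars and ndarrays.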

def portable_hash(x: Hashable) -> int: ...

class PythonEvalType:
    NON_UDF: Literal[0]
    SQL_BATCHED_UDF: Literal[100]
    SQL_SCALAR_PANDAS_UDF: PandasScalarUDFType
    SQL_GROUPED_MAP_PANDAS_UDF: PandasGroupedMapUDFType
    SQL_GROUPED_AGG_PANDAS_UDF: PandasGroupedAggUDFType
    SQL_WINDOW_AGG_PANDAS_UDF: Literal[203]
    SQL_SCALAR_PANDAS_ITER_UDF: PandasScalarIterUDFType
    SQL_MAP_PANDAS_ITER_UDF: PandasMapIterUDFType
    SQL_COGROUPED_MAP_PANDAS_UDF: PandasCogroupedMapUDFType
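    # Note: these values mirror the eval-type codes the JVM side uses when
    # dispatching Python functions (e.g. 0 = plain RDD code path, 100 =
    # batched SQL UDF, 203 = window aggregation pandas UDF); the
    # Pandas*UDFType aliases are Literal integers defined in
    # pyspark.sql.pandas._typing.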

class BoundedFloat(float):
    def __new__(
        cls, mean: float, confidence: float, low: float, high: float
    ) -> BoundedFloat: ...

class Partitioner:
    numPartitions: int
    partitionFunc: Callable[[Any], int]
    def __init__(self, numPartitions: int, partitionFunc: Callable[[Any], int]) -> None: ...
    def __eq__(self, other: Any) -> bool: ...
    def __call__(self, k: Any) -> int: ...
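    # Illustrative sketch of the runtime behavior: Partitioner(8, portable_hash)
    # maps each key k to partition portable_hash(k) % 8 via __call__, and two
    # Partitioners compare equal when both fields match, which lets Spark skip
    # a shuffle for data that is already partitioned the same way.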

class RDD(Generic[T]):
    is_cached: bool
    is_checkpointed: bool
    ctx: pyspark.context.SparkContext
    partitioner: Optional[Partitioner]
    def __init__(
        self,
        jrdd: JavaObject,
        ctx: pyspark.context.SparkContext,
        jrdd_deserializer: Serializer = ...,
    ) -> None: ...
    def id(self) -> int: ...
    def __getnewargs__(self) -> Any: ...
    @property
    def context(self) -> pyspark.context.SparkContext: ...
    def cache(self) -> RDD[T]: ...
    def persist(self, storageLevel: StorageLevel = ...) -> RDD[T]: ...
    def unpersist(self, blocking: bool = ...) -> RDD[T]: ...
    def checkpoint(self) -> None: ...
    def isCheckpointed(self) -> bool: ...
    def localCheckpoint(self) -> None: ...
    def isLocallyCheckpointed(self) -> bool: ...
    def getCheckpointFile(self) -> Optional[str]: ...
    def map(self, f: Callable[[T], U], preservesPartitioning: bool = ...) -> RDD[U]: ...
    def flatMap(
        self, f: Callable[[T], Iterable[U]], preservesPartitioning: bool = ...
    ) -> RDD[U]: ...
    def mapPartitions(
        self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ...
    ) -> RDD[U]: ...
    def mapPartitionsWithIndex(
        self,
        f: Callable[[int, Iterable[T]], Iterable[U]],
        preservesPartitioning: bool = ...,
    ) -> RDD[U]: ...
    def mapPartitionsWithSplit(
        self,
        f: Callable[[int, Iterable[T]], Iterable[U]],
        preservesPartitioning: bool = ...,
    ) -> RDD[U]: ...
    def getNumPartitions(self) -> int: ...
    def filter(self, f: Callable[[T], bool]) -> RDD[T]: ...
    def distinct(self, numPartitions: Optional[int] = ...) -> RDD[T]: ...
    def sample(
        self, withReplacement: bool, fraction: float, seed: Optional[int] = ...
    ) -> RDD[T]: ...
    def randomSplit(
        self, weights: List[Union[int, float]], seed: Optional[int] = ...
    ) -> List[RDD[T]]: ...
    def takeSample(
        self, withReplacement: bool, num: int, seed: Optional[int] = ...
    ) -> List[T]: ...
    def union(self, other: RDD[U]) -> RDD[Union[T, U]]: ...
    def intersection(self, other: RDD[T]) -> RDD[T]: ...
    def __add__(self, other: RDD[T]) -> RDD[T]: ...
    @overload
    def repartitionAndSortWithinPartitions(
        self: RDD[Tuple[O, V]],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[O], int] = ...,
        ascending: bool = ...,
    ) -> RDD[Tuple[O, V]]: ...
    @overload
    def repartitionAndSortWithinPartitions(
        self: RDD[Tuple[K, V]],
        numPartitions: Optional[int],
        partitionFunc: Callable[[K], int],
        ascending: bool,
        keyfunc: Callable[[K], O],
    ) -> RDD[Tuple[K, V]]: ...
    @overload
    def repartitionAndSortWithinPartitions(
        self: RDD[Tuple[K, V]],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
        ascending: bool = ...,
        *,
        keyfunc: Callable[[K], O]
    ) -> RDD[Tuple[K, V]]: ...
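    # The three overloads above follow one pattern, reused by sortByKey below:
    # with no keyfunc the key type itself must be orderable (O), while the
    # keyfunc variants (positional or keyword-only) accept any key K and sort
    # by the derived, orderable value keyfunc(k).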
    @overload
    def sortByKey(
        self: RDD[Tuple[O, V]],
        ascending: bool = ...,
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[O, V]]: ...
    @overload
    def sortByKey(
        self: RDD[Tuple[K, V]],
        ascending: bool,
        numPartitions: int,
        keyfunc: Callable[[K], O],
    ) -> RDD[Tuple[K, V]]: ...
    @overload
    def sortByKey(
        self: RDD[Tuple[K, V]],
        ascending: bool = ...,
        numPartitions: Optional[int] = ...,
        *,
        keyfunc: Callable[[K], O]
    ) -> RDD[Tuple[K, V]]: ...
    def sortBy(
        self: RDD[T],
        keyfunc: Callable[[T], O],
        ascending: bool = ...,
        numPartitions: Optional[int] = ...,
    ) -> RDD[T]: ...
    def glom(self) -> RDD[List[T]]: ...
    def cartesian(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ...
    def groupBy(
        self,
        f: Callable[[T], K],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, Iterable[T]]]: ...
    def pipe(
        self, command: str, env: Optional[Dict[str, str]] = ..., checkCode: bool = ...
    ) -> RDD[str]: ...
    def foreach(self, f: Callable[[T], None]) -> None: ...
    def foreachPartition(self, f: Callable[[Iterable[T]], None]) -> None: ...
    def collect(self) -> List[T]: ...
    def collectWithJobGroup(
        self, groupId: str, description: str, interruptOnCancel: bool = ...
    ) -> List[T]: ...
    def reduce(self, f: Callable[[T, T], T]) -> T: ...
    def treeReduce(self, f: Callable[[T, T], T], depth: int = ...) -> T: ...
    def fold(self, zeroValue: T, op: Callable[[T, T], T]) -> T: ...
    def aggregate(
        self, zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U]
    ) -> U: ...
    def treeAggregate(
        self,
        zeroValue: U,
        seqOp: Callable[[U, T], U],
        combOp: Callable[[U, U], U],
        depth: int = ...,
    ) -> U: ...
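    # A hedged usage sketch for aggregate/treeAggregate (assumes an RDD[int]
    # named nums): computing a (sum, count) pair in one pass, e.g.
    #   total, n = nums.aggregate(
    #       (0, 0),
    #       lambda acc, x: (acc[0] + x, acc[1] + 1),   # seqOp: fold in a value
    #       lambda a, b: (a[0] + b[0], a[1] + b[1]),   # combOp: merge partials
    #   )
    # treeAggregate computes the same result but merges partial results in
    # `depth` rounds rather than all at once on the driver.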
    @overload
    def max(self: RDD[O]) -> O: ...
    @overload
    def max(self, key: Callable[[T], O]) -> T: ...
    @overload
    def min(self: RDD[O]) -> O: ...
    @overload
    def min(self, key: Callable[[T], O]) -> T: ...
    def sum(self: RDD[NumberOrArray]) -> NumberOrArray: ...
    def count(self) -> int: ...
    def stats(self: RDD[NumberOrArray]) -> StatCounter: ...
    def histogram(self, buckets: List[T]) -> Tuple[List[T], List[int]]: ...
    def mean(self: RDD[NumberOrArray]) -> NumberOrArray: ...
    def variance(self: RDD[NumberOrArray]) -> NumberOrArray: ...
    def stdev(self: RDD[NumberOrArray]) -> NumberOrArray: ...
    def sampleStdev(self: RDD[NumberOrArray]) -> NumberOrArray: ...
    def sampleVariance(self: RDD[NumberOrArray]) -> NumberOrArray: ...
    def countByValue(self: RDD[K]) -> Dict[K, int]: ...
    @overload
    def top(self: RDD[O], num: int) -> List[O]: ...
    @overload
    def top(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ...
    @overload
    def takeOrdered(self: RDD[O], num: int) -> List[O]: ...
    @overload
    def takeOrdered(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ...
    def take(self, num: int) -> List[T]: ...
    def first(self) -> T: ...
    def isEmpty(self) -> bool: ...
    def saveAsNewAPIHadoopDataset(
        self: RDD[Tuple[K, V]],
        conf: Dict[str, str],
        keyConverter: Optional[str] = ...,
        valueConverter: Optional[str] = ...,
    ) -> None: ...
    def saveAsNewAPIHadoopFile(
        self: RDD[Tuple[K, V]],
        path: str,
        outputFormatClass: str,
        keyClass: Optional[str] = ...,
        valueClass: Optional[str] = ...,
        keyConverter: Optional[str] = ...,
        valueConverter: Optional[str] = ...,
        conf: Optional[Dict[str, str]] = ...,
    ) -> None: ...
    def saveAsHadoopDataset(
        self: RDD[Tuple[K, V]],
        conf: Dict[str, str],
        keyConverter: Optional[str] = ...,
        valueConverter: Optional[str] = ...,
    ) -> None: ...
    def saveAsHadoopFile(
        self: RDD[Tuple[K, V]],
        path: str,
        outputFormatClass: str,
        keyClass: Optional[str] = ...,
        valueClass: Optional[str] = ...,
        keyConverter: Optional[str] = ...,
        valueConverter: Optional[str] = ...,
        conf: Optional[Dict[str, str]] = ...,
        compressionCodecClass: Optional[str] = ...,
    ) -> None: ...
    def saveAsSequenceFile(
        self: RDD[Tuple[K, V]], path: str, compressionCodecClass: Optional[str] = ...
    ) -> None: ...
    def saveAsPickleFile(self, path: str, batchSize: int = ...) -> None: ...
    def saveAsTextFile(
        self, path: str, compressionCodecClass: Optional[str] = ...
    ) -> None: ...
    def collectAsMap(self: RDD[Tuple[K, V]]) -> Dict[K, V]: ...
    def keys(self: RDD[Tuple[K, V]]) -> RDD[K]: ...
    def values(self: RDD[Tuple[K, V]]) -> RDD[V]: ...
    def reduceByKey(
        self: RDD[Tuple[K, V]],
        func: Callable[[V, V], V],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, V]]: ...
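    # Illustrative pair-RDD sketch (assumes an active SparkContext named sc):
    #   pairs = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
    #   pairs.reduceByKey(lambda x, y: x + y).collect()
    #   # -> [("a", 4), ("b", 2)] (key order is not guaranteed)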
    def reduceByKeyLocally(
        self: RDD[Tuple[K, V]], func: Callable[[V, V], V]
    ) -> Dict[K, V]: ...
    def countByKey(self: RDD[Tuple[K, V]]) -> Dict[K, int]: ...
    def join(
        self: RDD[Tuple[K, V]],
        other: RDD[Tuple[K, U]],
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[K, Tuple[V, U]]]: ...
    def leftOuterJoin(
        self: RDD[Tuple[K, V]],
        other: RDD[Tuple[K, U]],
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[K, Tuple[V, Optional[U]]]]: ...
    def rightOuterJoin(
        self: RDD[Tuple[K, V]],
        other: RDD[Tuple[K, U]],
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[K, Tuple[Optional[V], U]]]: ...
    def fullOuterJoin(
        self: RDD[Tuple[K, V]],
        other: RDD[Tuple[K, U]],
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[K, Tuple[Optional[V], Optional[U]]]]: ...
    def partitionBy(
        self: RDD[Tuple[K, V]],
        numPartitions: int,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, V]]: ...
    def combineByKey(
        self: RDD[Tuple[K, V]],
        createCombiner: Callable[[V], U],
        mergeValue: Callable[[U, V], U],
        mergeCombiners: Callable[[U, U], U],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, U]]: ...
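    # A hedged sketch of combineByKey computing per-key means (assumes an
    # RDD[Tuple[str, float]] named samples):
    #   sums = samples.combineByKey(
    #       lambda v: (v, 1),                          # createCombiner
    #       lambda acc, v: (acc[0] + v, acc[1] + 1),   # mergeValue
    #       lambda a, b: (a[0] + b[0], a[1] + b[1]),   # mergeCombiners
    #   )
    #   means = sums.mapValues(lambda acc: acc[0] / acc[1])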
    def aggregateByKey(
        self: RDD[Tuple[K, V]],
        zeroValue: U,
        seqFunc: Callable[[U, V], U],
        combFunc: Callable[[U, U], U],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, U]]: ...
    def foldByKey(
        self: RDD[Tuple[K, V]],
        zeroValue: V,
        func: Callable[[V, V], V],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, V]]: ...
    def groupByKey(
        self: RDD[Tuple[K, V]],
        numPartitions: Optional[int] = ...,
        partitionFunc: Callable[[K], int] = ...,
    ) -> RDD[Tuple[K, Iterable[V]]]: ...
    def flatMapValues(
        self: RDD[Tuple[K, V]], f: Callable[[V], Iterable[U]]
    ) -> RDD[Tuple[K, U]]: ...
    def mapValues(self: RDD[Tuple[K, V]], f: Callable[[V], U]) -> RDD[Tuple[K, U]]: ...
    @overload
    def groupWith(
        self: RDD[Tuple[K, V]], __o: RDD[Tuple[K, V1]]
    ) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1]]]]: ...
    @overload
    def groupWith(
        self: RDD[Tuple[K, V]], __o1: RDD[Tuple[K, V1]], __o2: RDD[Tuple[K, V2]]
    ) -> RDD[
        Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1], ResultIterable[V2]]]
    ]: ...
    @overload
    def groupWith(
        self: RDD[Tuple[K, V]],
        other1: RDD[Tuple[K, V1]],
        other2: RDD[Tuple[K, V2]],
        other3: RDD[Tuple[K, V3]],
    ) -> RDD[
        Tuple[
            K,
            Tuple[
                ResultIterable[V],
                ResultIterable[V1],
                ResultIterable[V2],
                ResultIterable[V3],
            ],
        ]
    ]: ...
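    # The groupWith overloads cover cogrouping with one, two, or three other
    # pair RDDs; each input's value type keeps its own ResultIterable slot in
    # the result tuple, so the output arity tracks the number of inputs.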
    def cogroup(
        self: RDD[Tuple[K, V]],
        other: RDD[Tuple[K, U]],
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[U]]]]: ...
    def sampleByKey(
        self: RDD[Tuple[K, V]],
        withReplacement: bool,
        fractions: Dict[K, Union[float, int]],
        seed: Optional[int] = ...,
    ) -> RDD[Tuple[K, V]]: ...
    def subtractByKey(
        self: RDD[Tuple[K, V]],
        other: RDD[Tuple[K, U]],
        numPartitions: Optional[int] = ...,
    ) -> RDD[Tuple[K, V]]: ...
    def subtract(
        self: RDD[T], other: RDD[T], numPartitions: Optional[int] = ...
    ) -> RDD[T]: ...
    def keyBy(self: RDD[T], f: Callable[[T], K]) -> RDD[Tuple[K, T]]: ...
    def repartition(self, numPartitions: int) -> RDD[T]: ...
    def coalesce(self, numPartitions: int, shuffle: bool = ...) -> RDD[T]: ...
    def zip(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ...
    def zipWithIndex(self) -> RDD[Tuple[T, int]]: ...
    def zipWithUniqueId(self) -> RDD[Tuple[T, int]]: ...
    def name(self) -> str: ...
    def setName(self, name: str) -> RDD[T]: ...
    def toDebugString(self) -> bytes: ...
    def getStorageLevel(self) -> StorageLevel: ...
    def lookup(self: RDD[Tuple[K, V]], key: K) -> List[V]: ...
    def countApprox(self, timeout: int, confidence: float = ...) -> int: ...
    def sumApprox(
        self: RDD[Union[float, int]], timeout: int, confidence: float = ...
    ) -> BoundedFloat: ...
    def meanApprox(
        self: RDD[Union[float, int]], timeout: int, confidence: float = ...
    ) -> BoundedFloat: ...
    def countApproxDistinct(self, relativeSD: float = ...) -> int: ...
    def toLocalIterator(self, prefetchPartitions: bool = ...) -> Iterator[T]: ...
    def barrier(self: RDD[T]) -> RDDBarrier[T]: ...
    def withResources(self: RDD[T], profile: ResourceProfile) -> RDD[T]: ...
    def getResourceProfile(self) -> Optional[ResourceProfile]: ...
    @overload
    def toDF(
        self: RDD[RowLike],
        schema: Optional[List[str]] = ...,
        sampleRatio: Optional[float] = ...,
    ) -> DataFrame: ...
    @overload
    def toDF(self: RDD[RowLike], schema: Optional[StructType] = ...) -> DataFrame: ...
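    # toDF is not defined on RDD in rdd.py itself; pyspark.sql attaches it to
    # the class at runtime when a SparkSession is created, which is why the
    # method only appears here in the stub.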

class RDDBarrier(Generic[T]):
    rdd: RDD[T]
    def __init__(self, rdd: RDD[T]) -> None: ...
    def mapPartitions(
        self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ...
    ) -> RDD[U]: ...
    def mapPartitionsWithIndex(
        self,
        f: Callable[[int, Iterable[T]], Iterable[U]],
        preservesPartitioning: bool = ...,
    ) -> RDD[U]: ...

class PipelinedRDD(RDD[U], Generic[T, U]):
    func: Callable[[T], U]
    preservesPartitioning: bool
    is_cached: bool
    is_checkpointed: bool
    ctx: pyspark.context.SparkContext
    prev: RDD[T]
    partitioner: Optional[Partitioner]
    is_barrier: bool
    def __init__(
        self,
        prev: RDD[T],
        func: Callable[[Iterable[T]], Iterable[U]],
        preservesPartitioning: bool = ...,
        isFromBarrier: bool = ...,
    ) -> None: ...
    def getNumPartitions(self) -> int: ...
    def id(self) -> int: ...
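    # PipelinedRDD is how chains of map/flatMap/filter stay cheap: rather than
    # materialising an intermediate RDD per step, consecutive per-partition
    # functions are composed over `prev` and executed as a single Python task.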