spark-instrumented-optimizer/python/pyspark/mllib/feature.pyi

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import overload
from typing import Iterable, Hashable, List, Tuple, Union

from pyspark.mllib._typing import VectorLike
from pyspark.context import SparkContext
from pyspark.rdd import RDD
from pyspark.mllib.common import JavaModelWrapper
from pyspark.mllib.linalg import Vector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import JavaLoader, JavaSaveable

from py4j.java_collections import JavaMap  # type: ignore[import]

# Stub for the base transformer interface: declares only `transform`.
class VectorTransformer:
    # `transform` is overloaded so the return type follows the argument:
    # a single vector-like value produces a single Vector ...
    @overload
    def transform(self, vector: VectorLike) -> Vector: ...
    # ... and an RDD of vector-like values produces an RDD of Vectors.
    @overload
    def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...

# Stub for a VectorTransformer subclass configured by a single float `p`.
class Normalizer(VectorTransformer):
    # Instance attribute declared as a float.
    p: float
    # `p` is optional; the stub elides the actual default value (`...`).
    def __init__(self, p: float = ...) -> None: ...
    # Same two overloads as VectorTransformer.transform, re-declared here:
    # single vector-like value -> Vector.
    @overload
    def transform(self, vector: VectorLike) -> Vector: ...
    # RDD of vector-like values -> RDD of Vectors.
    @overload
    def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...

# Stub combining JavaModelWrapper with the VectorTransformer interface.
# NOTE(review): presumably delegates `transform` to a wrapped JVM model;
# nothing in this stub shows that, confirm against the implementation.
class JavaVectorTransformer(JavaModelWrapper, VectorTransformer):
    # Single vector-like value -> Vector.
    @overload
    def transform(self, vector: VectorLike) -> Vector: ...
    # RDD of vector-like values -> RDD of Vectors.
    @overload
    def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...

# Stub for the model type returned by StandardScaler.fit.
class StandardScalerModel(JavaVectorTransformer):
    # Single vector-like value -> Vector.
    @overload
    def transform(self, vector: VectorLike) -> Vector: ...
    # RDD of vector-like values -> RDD of Vectors.
    @overload
    def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...
    # Both setters take a bool and are typed to return a StandardScalerModel.
    # NOTE(review): presumably they return `self` for chaining; confirm.
    def setWithMean(self, withMean: bool) -> StandardScalerModel: ...
    def setWithStd(self, withStd: bool) -> StandardScalerModel: ...
    # The four members below are declared as properties with no setter in
    # this stub, so type checkers treat them as read-only.
    @property
    def withStd(self) -> bool: ...
    @property
    def withMean(self) -> bool: ...
    @property
    def std(self) -> Vector: ...
    @property
    def mean(self) -> Vector: ...

# Stub for the estimator whose `fit` produces a StandardScalerModel.
class StandardScaler:
    # Boolean configuration attributes, mirrored by the constructor arguments.
    withMean: bool
    withStd: bool
    # Both arguments are optional; their default values are elided (`...`).
    def __init__(self, withMean: bool = ..., withStd: bool = ...) -> None: ...
    # Takes an RDD of vector-like values and returns a StandardScalerModel.
    def fit(self, dataset: RDD[VectorLike]) -> StandardScalerModel: ...

# Stub for the model type returned by ChiSqSelector.fit.
class ChiSqSelectorModel(JavaVectorTransformer):
    # Single vector-like value -> Vector.
    @overload
    def transform(self, vector: VectorLike) -> Vector: ...
    # RDD of vector-like values -> RDD of Vectors.
    @overload
    def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...

# Stub for the estimator whose `fit` produces a ChiSqSelectorModel.
class ChiSqSelector:
    # Configuration attributes; each has a matching constructor argument
    # and a matching set* method below.
    numTopFeatures: int
    # NOTE(review): typed as a plain str; the accepted values are not
    # visible in this stub, check the implementation.
    selectorType: str
    percentile: float
    fpr: float
    fdr: float
    fwe: float
    # Every argument is optional; the default values are elided (`...`).
    def __init__(
        self,
        numTopFeatures: int = ...,
        selectorType: str = ...,
        percentile: float = ...,
        fpr: float = ...,
        fdr: float = ...,
        fwe: float = ...,
    ) -> None: ...
    # Setters are typed to return a ChiSqSelector.
    # NOTE(review): presumably they return `self` for chaining; confirm.
    def setNumTopFeatures(self, numTopFeatures: int) -> ChiSqSelector: ...
    def setPercentile(self, percentile: float) -> ChiSqSelector: ...
    def setFpr(self, fpr: float) -> ChiSqSelector: ...
    def setFdr(self, fdr: float) -> ChiSqSelector: ...
    def setFwe(self, fwe: float) -> ChiSqSelector: ...
    def setSelectorType(self, selectorType: str) -> ChiSqSelector: ...
    # Unlike the other estimators in this file, `fit` takes an RDD of
    # LabeledPoint rather than an RDD of vector-like values.
    def fit(self, data: RDD[LabeledPoint]) -> ChiSqSelectorModel: ...

# Stub for the model type returned by PCA.fit. It declares no members of its
# own, so its typed surface is exactly that of JavaVectorTransformer.
class PCAModel(JavaVectorTransformer): ...

# Stub for the estimator whose `fit` produces a PCAModel.
class PCA:
    # Integer configuration attribute set through the constructor.
    k: int
    # `k` is required: no default is declared, unlike most constructors here.
    def __init__(self, k: int) -> None: ...
    # Takes an RDD of vector-like values and returns a PCAModel.
    def fit(self, data: RDD[VectorLike]) -> PCAModel: ...

# Stub for a transformer over documents, where a document is typed as an
# iterable of hashable terms. It does not derive from VectorTransformer.
class HashingTF:
    numFeatures: int
    # `binary` has no constructor argument; the stub exposes setBinary for it.
    binary: bool
    # `numFeatures` is optional; its default value is elided (`...`).
    def __init__(self, numFeatures: int = ...) -> None: ...
    # Typed to return a HashingTF.
    # NOTE(review): presumably returns `self` for chaining; confirm.
    def setBinary(self, value: bool) -> HashingTF: ...
    # Maps a single hashable term to an int.
    def indexOf(self, term: Hashable) -> int: ...
    # `transform` is overloaded on the argument type:
    # one document (iterable of hashable terms) -> one Vector.
    @overload
    def transform(self, document: Iterable[Hashable]) -> Vector: ...
    # RDD of documents -> RDD of Vectors.
    @overload
    def transform(self, document: RDD[Iterable[Hashable]]) -> RDD[Vector]: ...

# Stub for the model type returned by IDF.fit.
class IDFModel(JavaVectorTransformer):
    # Same overload shapes as the parent, but the parameter is named `x`
    # here instead of `vector`, which matters for keyword-argument calls.
    # Single vector-like value -> Vector.
    @overload
    def transform(self, x: VectorLike) -> Vector: ...
    # RDD of vector-like values -> RDD of Vectors.
    @overload
    def transform(self, x: RDD[VectorLike]) -> RDD[Vector]: ...
    # The three accessors below are declared as plain methods, not
    # properties, so they must be called: `model.idf()`.
    def idf(self) -> Vector: ...
    def docFreq(self) -> List[int]: ...
    def numDocs(self) -> int: ...

# Stub for the estimator whose `fit` produces an IDFModel.
class IDF:
    # Integer configuration attribute set through the constructor.
    minDocFreq: int
    # `minDocFreq` is optional; its default value is elided (`...`).
    def __init__(self, minDocFreq: int = ...) -> None: ...
    # Takes an RDD of vector-like values and returns an IDFModel.
    def fit(self, dataset: RDD[VectorLike]) -> IDFModel: ...

# Stub for the model type returned by Word2Vec.fit. Besides the transformer
# base it mixes in JavaSaveable and JavaLoader parameterised with itself.
class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader[Word2VecModel]):
    # Replaces the inherited overloads with a single signature that takes a
    # str. NOTE(review): the trailing ignore presumably silences the
    # incompatible-override error this causes; confirm before removing it.
    def transform(self, word: str) -> Vector: ...  # type: ignore
    # `word` may be a str or a vector-like value; `num` is an int.
    # The result is an iterable of (str, float) pairs.
    def findSynonyms(self, word: Union[str, VectorLike], num: int) -> Iterable[Tuple[str, float]]: ...
    # Returns a py4j JavaMap rather than a native Python dict.
    def getVectors(self) -> JavaMap: ...
    # Class-level constructor: takes a SparkContext and a path string and
    # returns a Word2VecModel.
    @classmethod
    def load(cls, sc: SparkContext, path: str) -> Word2VecModel: ...

# Stub for the estimator whose `fit` produces a Word2VecModel.
class Word2Vec:
    # Configuration attributes; each has a matching set* method below.
    vectorSize: int
    learningRate: float
    numPartitions: int
    numIterations: int
    seed: int
    minCount: int
    windowSize: int
    # The constructor takes no arguments; configuration goes through setters.
    def __init__(self) -> None: ...
    # Setters are typed to return a Word2Vec.
    # NOTE(review): presumably they return `self` for chaining; confirm.
    def setVectorSize(self, vectorSize: int) -> Word2Vec: ...
    def setLearningRate(self, learningRate: float) -> Word2Vec: ...
    def setNumPartitions(self, numPartitions: int) -> Word2Vec: ...
    def setNumIterations(self, numIterations: int) -> Word2Vec: ...
    def setSeed(self, seed: int) -> Word2Vec: ...
    def setMinCount(self, minCount: int) -> Word2Vec: ...
    def setWindowSize(self, windowSize: int) -> Word2Vec: ...
    # Takes an RDD whose elements are lists of strings.
    def fit(self, data: RDD[List[str]]) -> Word2VecModel: ...

# Stub for a VectorTransformer subclass configured by a single Vector.
class ElementwiseProduct(VectorTransformer):
    # Instance attribute declared as a Vector.
    scalingVector: Vector
    # `scalingVector` is required and typed as Vector, not VectorLike.
    def __init__(self, scalingVector: Vector) -> None: ...
    # Same two overloads as VectorTransformer.transform, re-declared here:
    # single vector-like value -> Vector.
    @overload
    def transform(self, vector: VectorLike) -> Vector: ...
    # RDD of vector-like values -> RDD of Vectors.
    @overload
    def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...
[SPARK-32714][PYTHON] Initial pyspark-stubs port ### What changes were proposed in this pull request? This PR proposes migration of [`pyspark-stubs`](https://github.com/zero323/pyspark-stubs) into Spark codebase. ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? Yes. This PR adds type annotations directly to Spark source. This can impact interaction with development tools for users, which haven't used `pyspark-stubs`. ### How was this patch tested? - [x] MyPy tests of the PySpark source ``` mypy --no-incremental --config python/mypy.ini python/pyspark ``` - [x] MyPy tests of Spark examples ``` MYPYPATH=python/ mypy --no-incremental --config python/mypy.ini examples/src/main/python/ml examples/src/main/python/sql examples/src/main/python/sql/streaming ``` - [x] Existing Flake8 linter - [x] Existing unit tests Tested against: - `mypy==0.790+dev.e959952d9001e9713d329a2f9b196705b028f894` - `mypy==0.782` Closes #29591 from zero323/SPARK-32681. Authored-by: zero323 <mszymkiewicz@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-09-24 01:15:36 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one`
			`# or more contributor license agreements. See the NOTICE file`
			`# distributed with this work for additional information`
			`# regarding copyright ownership. The ASF licenses this file`
			`# to you under the Apache License, Version 2.0 (the`
			`# "License"); you may not use this file except in compliance`
			`# with the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing,`
			`# software distributed under the License is distributed on an`
			`# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY`
			`# KIND, either express or implied. See the License for the`
			`# specific language governing permissions and limitations`
			`# under the License.`

			`from typing import overload`
[SPARK-33252][PYTHON][DOCS] Migration to NumPy documentation style in MLlib (pyspark.mllib.*) ### What changes were proposed in this pull request? This PR proposes migration of `pyspark.mllib` to NumPy documentation style. ### Why are the changes needed? To improve documentation style. Before: ![old](https://user-images.githubusercontent.com/1554276/100097941-90234980-2e5d-11eb-8b4d-c25d98d85191.png) After: ![new](https://user-images.githubusercontent.com/1554276/100097966-987b8480-2e5d-11eb-9e02-07b18c327624.png) ### Does this PR introduce _any_ user-facing change? Yes, this changes both rendered HTML docs and console representation (SPARK-33243). ### How was this patch tested? `dev/lint-python` and manual inspection. Closes #30413 from zero323/SPARK-33252. Authored-by: zero323 <mszymkiewicz@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-11-24 20:24:41 -05:00			`from typing import Iterable, Hashable, List, Tuple, Union`
[SPARK-32714][PYTHON] Initial pyspark-stubs port ### What changes were proposed in this pull request? This PR proposes migration of [`pyspark-stubs`](https://github.com/zero323/pyspark-stubs) into Spark codebase. ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? Yes. This PR adds type annotations directly to Spark source. This can impact interaction with development tools for users, which haven't used `pyspark-stubs`. ### How was this patch tested? - [x] MyPy tests of the PySpark source ``` mypy --no-incremental --config python/mypy.ini python/pyspark ``` - [x] MyPy tests of Spark examples ``` MYPYPATH=python/ mypy --no-incremental --config python/mypy.ini examples/src/main/python/ml examples/src/main/python/sql examples/src/main/python/sql/streaming ``` - [x] Existing Flake8 linter - [x] Existing unit tests Tested against: - `mypy==0.790+dev.e959952d9001e9713d329a2f9b196705b028f894` - `mypy==0.782` Closes #29591 from zero323/SPARK-32681. Authored-by: zero323 <mszymkiewicz@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-09-24 01:15:36 -04:00
			`from pyspark.mllib._typing import VectorLike`
			`from pyspark.context import SparkContext`
			`from pyspark.rdd import RDD`
			`from pyspark.mllib.common import JavaModelWrapper`
			`from pyspark.mllib.linalg import Vector`
			`from pyspark.mllib.regression import LabeledPoint`
			`from pyspark.mllib.util import JavaLoader, JavaSaveable`

			`from py4j.java_collections import JavaMap # type: ignore[import]`

			`class VectorTransformer:`
			`@overload`
			`def transform(self, vector: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...`

			`class Normalizer(VectorTransformer):`
			`p: float`
			`def __init__(self, p: float = ...) -> None: ...`
			`@overload`
			`def transform(self, vector: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...`

			`class JavaVectorTransformer(JavaModelWrapper, VectorTransformer):`
			`@overload`
			`def transform(self, vector: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...`

			`class StandardScalerModel(JavaVectorTransformer):`
			`@overload`
			`def transform(self, vector: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...`
			`def setWithMean(self, withMean: bool) -> StandardScalerModel: ...`
			`def setWithStd(self, withStd: bool) -> StandardScalerModel: ...`
			`@property`
			`def withStd(self) -> bool: ...`
			`@property`
			`def withMean(self) -> bool: ...`
			`@property`
			`def std(self) -> Vector: ...`
			`@property`
			`def mean(self) -> Vector: ...`

			`class StandardScaler:`
			`withMean: bool`
			`withStd: bool`
			`def __init__(self, withMean: bool = ..., withStd: bool = ...) -> None: ...`
			`def fit(self, dataset: RDD[VectorLike]) -> StandardScalerModel: ...`

			`class ChiSqSelectorModel(JavaVectorTransformer):`
			`@overload`
			`def transform(self, vector: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...`

			`class ChiSqSelector:`
			`numTopFeatures: int`
			`selectorType: str`
			`percentile: float`
			`fpr: float`
			`fdr: float`
			`fwe: float`
			`def __init__(`
			`self,`
			`numTopFeatures: int = ...,`
			`selectorType: str = ...,`
			`percentile: float = ...,`
			`fpr: float = ...,`
			`fdr: float = ...,`
			`fwe: float = ...,`
			`) -> None: ...`
			`def setNumTopFeatures(self, numTopFeatures: int) -> ChiSqSelector: ...`
			`def setPercentile(self, percentile: float) -> ChiSqSelector: ...`
			`def setFpr(self, fpr: float) -> ChiSqSelector: ...`
			`def setFdr(self, fdr: float) -> ChiSqSelector: ...`
			`def setFwe(self, fwe: float) -> ChiSqSelector: ...`
			`def setSelectorType(self, selectorType: str) -> ChiSqSelector: ...`
			`def fit(self, data: RDD[LabeledPoint]) -> ChiSqSelectorModel: ...`

			`class PCAModel(JavaVectorTransformer): ...`

			`class PCA:`
			`k: int`
			`def __init__(self, k: int) -> None: ...`
			`def fit(self, data: RDD[VectorLike]) -> PCAModel: ...`

			`class HashingTF:`
			`numFeatures: int`
			`binary: bool`
			`def __init__(self, numFeatures: int = ...) -> None: ...`
			`def setBinary(self, value: bool) -> HashingTF: ...`
			`def indexOf(self, term: Hashable) -> int: ...`
			`@overload`
			`def transform(self, document: Iterable[Hashable]) -> Vector: ...`
			`@overload`
			`def transform(self, document: RDD[Iterable[Hashable]]) -> RDD[Vector]: ...`

			`class IDFModel(JavaVectorTransformer):`
			`@overload`
			`def transform(self, x: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, x: RDD[VectorLike]) -> RDD[Vector]: ...`
			`def idf(self) -> Vector: ...`
			`def docFreq(self) -> List[int]: ...`
			`def numDocs(self) -> int: ...`

			`class IDF:`
			`minDocFreq: int`
			`def __init__(self, minDocFreq: int = ...) -> None: ...`
			`def fit(self, dataset: RDD[VectorLike]) -> IDFModel: ...`

			`class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader[Word2VecModel]):`
			`def transform(self, word: str) -> Vector: ... # type: ignore`
[SPARK-33252][PYTHON][DOCS] Migration to NumPy documentation style in MLlib (pyspark.mllib.*) ### What changes were proposed in this pull request? This PR proposes migration of `pyspark.mllib` to NumPy documentation style. ### Why are the changes needed? To improve documentation style. Before: ![old](https://user-images.githubusercontent.com/1554276/100097941-90234980-2e5d-11eb-8b4d-c25d98d85191.png) After: ![new](https://user-images.githubusercontent.com/1554276/100097966-987b8480-2e5d-11eb-9e02-07b18c327624.png) ### Does this PR introduce _any_ user-facing change? Yes, this changes both rendered HTML docs and console representation (SPARK-33243). ### How was this patch tested? `dev/lint-python` and manual inspection. Closes #30413 from zero323/SPARK-33252. Authored-by: zero323 <mszymkiewicz@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-11-24 20:24:41 -05:00			`def findSynonyms(self, word: Union[str, VectorLike], num: int) -> Iterable[Tuple[str, float]]: ...`
[SPARK-32714][PYTHON] Initial pyspark-stubs port ### What changes were proposed in this pull request? This PR proposes migration of [`pyspark-stubs`](https://github.com/zero323/pyspark-stubs) into Spark codebase. ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? Yes. This PR adds type annotations directly to Spark source. This can impact interaction with development tools for users, which haven't used `pyspark-stubs`. ### How was this patch tested? - [x] MyPy tests of the PySpark source ``` mypy --no-incremental --config python/mypy.ini python/pyspark ``` - [x] MyPy tests of Spark examples ``` MYPYPATH=python/ mypy --no-incremental --config python/mypy.ini examples/src/main/python/ml examples/src/main/python/sql examples/src/main/python/sql/streaming ``` - [x] Existing Flake8 linter - [x] Existing unit tests Tested against: - `mypy==0.790+dev.e959952d9001e9713d329a2f9b196705b028f894` - `mypy==0.782` Closes #29591 from zero323/SPARK-32681. Authored-by: zero323 <mszymkiewicz@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-09-24 01:15:36 -04:00			`def getVectors(self) -> JavaMap: ...`
			`@classmethod`
			`def load(cls, sc: SparkContext, path: str) -> Word2VecModel: ...`

			`class Word2Vec:`
			`vectorSize: int`
			`learningRate: float`
			`numPartitions: int`
			`numIterations: int`
			`seed: int`
			`minCount: int`
			`windowSize: int`
			`def __init__(self) -> None: ...`
			`def setVectorSize(self, vectorSize: int) -> Word2Vec: ...`
			`def setLearningRate(self, learningRate: float) -> Word2Vec: ...`
			`def setNumPartitions(self, numPartitions: int) -> Word2Vec: ...`
			`def setNumIterations(self, numIterations: int) -> Word2Vec: ...`
			`def setSeed(self, seed: int) -> Word2Vec: ...`
			`def setMinCount(self, minCount: int) -> Word2Vec: ...`
			`def setWindowSize(self, windowSize: int) -> Word2Vec: ...`
			`def fit(self, data: RDD[List[str]]) -> Word2VecModel: ...`

			`class ElementwiseProduct(VectorTransformer):`
			`scalingVector: Vector`
			`def __init__(self, scalingVector: Vector) -> None: ...`
			`@overload`
			`def transform(self, vector: VectorLike) -> Vector: ...`
			`@overload`
			`def transform(self, vector: RDD[VectorLike]) -> RDD[Vector]: ...`