spark-instrumented-optimizer/python/pyspark/ml/clustering.pyi

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Any, List, Optional

from pyspark.ml.linalg import Matrix, Vector
from pyspark.ml.util import (
    GeneralJavaMLWritable,
    HasTrainingSummary,
    JavaMLReadable,
    JavaMLWritable,
)
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaWrapper
from pyspark.ml.param.shared import (
    HasAggregationDepth,
    HasCheckpointInterval,
    HasDistanceMeasure,
    HasFeaturesCol,
    HasMaxIter,
    HasPredictionCol,
    HasProbabilityCol,
    HasSeed,
    HasTol,
    HasWeightCol,
)

from pyspark.ml.param import Param
from pyspark.ml.stat import MultivariateGaussian
from pyspark.sql.dataframe import DataFrame

from numpy import ndarray  # type: ignore[import]

class ClusteringSummary(JavaWrapper):
    @property
    def predictionCol(self) -> str: ...
    @property
    def predictions(self) -> DataFrame: ...
    @property
    def featuresCol(self) -> str: ...
    @property
    def k(self) -> int: ...
    @property
    def cluster(self) -> DataFrame: ...
    @property
    def clusterSizes(self) -> List[int]: ...
    @property
    def numIter(self) -> int: ...

class _GaussianMixtureParams(
    HasMaxIter,
    HasFeaturesCol,
    HasSeed,
    HasPredictionCol,
    HasProbabilityCol,
    HasTol,
    HasAggregationDepth,
    HasWeightCol,
):
    k: Param[int]
    def __init__(self, *args: Any): ...
    def getK(self) -> int: ...

class GaussianMixtureModel(
    JavaModel,
    _GaussianMixtureParams,
    JavaMLWritable,
    JavaMLReadable[GaussianMixtureModel],
    HasTrainingSummary[GaussianMixtureSummary],
):
    def setFeaturesCol(self, value: str) -> GaussianMixtureModel: ...
    def setPredictionCol(self, value: str) -> GaussianMixtureModel: ...
    def setProbabilityCol(self, value: str) -> GaussianMixtureModel: ...
    @property
    def weights(self) -> List[float]: ...
    @property
    def gaussians(self) -> List[MultivariateGaussian]: ...
    @property
    def gaussiansDF(self) -> DataFrame: ...
    @property
    def summary(self) -> GaussianMixtureSummary: ...
    def predict(self, value: Vector) -> int: ...
    def predictProbability(self, value: Vector) -> Vector: ...

class GaussianMixture(
    JavaEstimator[GaussianMixtureModel],
    _GaussianMixtureParams,
    JavaMLWritable,
    JavaMLReadable[GaussianMixture],
):
    def __init__(
        self,
        *,
        featuresCol: str = ...,
        predictionCol: str = ...,
        k: int = ...,
        probabilityCol: str = ...,
        tol: float = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        aggregationDepth: int = ...,
        weightCol: Optional[str] = ...
    ) -> None: ...
    def setParams(
        self,
        *,
        featuresCol: str = ...,
        predictionCol: str = ...,
        k: int = ...,
        probabilityCol: str = ...,
        tol: float = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        aggregationDepth: int = ...,
        weightCol: Optional[str] = ...
    ) -> GaussianMixture: ...
    def setK(self, value: int) -> GaussianMixture: ...
    def setMaxIter(self, value: int) -> GaussianMixture: ...
    def setFeaturesCol(self, value: str) -> GaussianMixture: ...
    def setPredictionCol(self, value: str) -> GaussianMixture: ...
    def setProbabilityCol(self, value: str) -> GaussianMixture: ...
    def setWeightCol(self, value: str) -> GaussianMixture: ...
    def setSeed(self, value: int) -> GaussianMixture: ...
    def setTol(self, value: float) -> GaussianMixture: ...
    def setAggregationDepth(self, value: int) -> GaussianMixture: ...

class GaussianMixtureSummary(ClusteringSummary):
    @property
    def probabilityCol(self) -> str: ...
    @property
    def probability(self) -> DataFrame: ...
    @property
    def logLikelihood(self) -> float: ...

class KMeansSummary(ClusteringSummary):
    def trainingCost(self) -> float: ...

class _KMeansParams(
    HasMaxIter,
    HasFeaturesCol,
    HasSeed,
    HasPredictionCol,
    HasTol,
    HasDistanceMeasure,
    HasWeightCol,
):
    k: Param[int]
    initMode: Param[str]
    initSteps: Param[int]
    def __init__(self, *args: Any): ...
    def getK(self) -> int: ...
    def getInitMode(self) -> str: ...
    def getInitSteps(self) -> int: ...

class KMeansModel(
    JavaModel,
    _KMeansParams,
    GeneralJavaMLWritable,
    JavaMLReadable[KMeansModel],
    HasTrainingSummary[KMeansSummary],
):
    def setFeaturesCol(self, value: str) -> KMeansModel: ...
    def setPredictionCol(self, value: str) -> KMeansModel: ...
    def clusterCenters(self) -> List[ndarray]: ...
    @property
    def summary(self) -> KMeansSummary: ...
    def predict(self, value: Vector) -> int: ...

class KMeans(
    JavaEstimator[KMeansModel], _KMeansParams, JavaMLWritable, JavaMLReadable[KMeans]
):
    def __init__(
        self,
        *,
        featuresCol: str = ...,
        predictionCol: str = ...,
        k: int = ...,
        initMode: str = ...,
        initSteps: int = ...,
        tol: float = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        distanceMeasure: str = ...,
        weightCol: Optional[str] = ...
    ) -> None: ...
    def setParams(
        self,
        *,
        featuresCol: str = ...,
        predictionCol: str = ...,
        k: int = ...,
        initMode: str = ...,
        initSteps: int = ...,
        tol: float = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        distanceMeasure: str = ...,
        weightCol: Optional[str] = ...
    ) -> KMeans: ...
    def setK(self, value: int) -> KMeans: ...
    def setInitMode(self, value: str) -> KMeans: ...
    def setInitSteps(self, value: int) -> KMeans: ...
    def setDistanceMeasure(self, value: str) -> KMeans: ...
    def setMaxIter(self, value: int) -> KMeans: ...
    def setFeaturesCol(self, value: str) -> KMeans: ...
    def setPredictionCol(self, value: str) -> KMeans: ...
    def setSeed(self, value: int) -> KMeans: ...
    def setTol(self, value: float) -> KMeans: ...
    def setWeightCol(self, value: str) -> KMeans: ...

class _BisectingKMeansParams(
    HasMaxIter,
    HasFeaturesCol,
    HasSeed,
    HasPredictionCol,
    HasDistanceMeasure,
    HasWeightCol,
):
    k: Param[int]
    minDivisibleClusterSize: Param[float]
    def __init__(self, *args: Any): ...
    def getK(self) -> int: ...
    def getMinDivisibleClusterSize(self) -> float: ...

class BisectingKMeansModel(
    JavaModel,
    _BisectingKMeansParams,
    JavaMLWritable,
    JavaMLReadable[BisectingKMeansModel],
    HasTrainingSummary[BisectingKMeansSummary],
):
    def setFeaturesCol(self, value: str) -> BisectingKMeansModel: ...
    def setPredictionCol(self, value: str) -> BisectingKMeansModel: ...
    def clusterCenters(self) -> List[ndarray]: ...
    def computeCost(self, dataset: DataFrame) -> float: ...
    @property
    def summary(self) -> BisectingKMeansSummary: ...
    def predict(self, value: Vector) -> int: ...

class BisectingKMeans(
    JavaEstimator[BisectingKMeansModel],
    _BisectingKMeansParams,
    JavaMLWritable,
    JavaMLReadable[BisectingKMeans],
):
    def __init__(
        self,
        *,
        featuresCol: str = ...,
        predictionCol: str = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        k: int = ...,
        minDivisibleClusterSize: float = ...,
        distanceMeasure: str = ...,
        weightCol: Optional[str] = ...
    ) -> None: ...
    def setParams(
        self,
        *,
        featuresCol: str = ...,
        predictionCol: str = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        k: int = ...,
        minDivisibleClusterSize: float = ...,
        distanceMeasure: str = ...,
        weightCol: Optional[str] = ...
    ) -> BisectingKMeans: ...
    def setK(self, value: int) -> BisectingKMeans: ...
    def setMinDivisibleClusterSize(self, value: float) -> BisectingKMeans: ...
    def setDistanceMeasure(self, value: str) -> BisectingKMeans: ...
    def setMaxIter(self, value: int) -> BisectingKMeans: ...
    def setFeaturesCol(self, value: str) -> BisectingKMeans: ...
    def setPredictionCol(self, value: str) -> BisectingKMeans: ...
    def setSeed(self, value: int) -> BisectingKMeans: ...
    def setWeightCol(self, value: str) -> BisectingKMeans: ...

class BisectingKMeansSummary(ClusteringSummary):
    @property
    def trainingCost(self) -> float: ...

class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval):
    k: Param[int]
    optimizer: Param[str]
    learningOffset: Param[float]
    learningDecay: Param[float]
    subsamplingRate: Param[float]
    optimizeDocConcentration: Param[bool]
    docConcentration: Param[List[float]]
    topicConcentration: Param[float]
    topicDistributionCol: Param[str]
    keepLastCheckpoint: Param[bool]
    def __init__(self, *args: Any): ...
    def setK(self, value: int) -> LDA: ...
    def getOptimizer(self) -> str: ...
    def getLearningOffset(self) -> float: ...
    def getLearningDecay(self) -> float: ...
    def getSubsamplingRate(self) -> float: ...
    def getOptimizeDocConcentration(self) -> bool: ...
    def getDocConcentration(self) -> List[float]: ...
    def getTopicConcentration(self) -> float: ...
    def getTopicDistributionCol(self) -> str: ...
    def getKeepLastCheckpoint(self) -> bool: ...

class LDAModel(JavaModel, _LDAParams):
    def setFeaturesCol(self, value: str) -> LDAModel: ...
    def setSeed(self, value: int) -> LDAModel: ...
    def setTopicDistributionCol(self, value: str) -> LDAModel: ...
    def isDistributed(self) -> bool: ...
    def vocabSize(self) -> int: ...
    def topicsMatrix(self) -> Matrix: ...
    def logLikelihood(self, dataset: DataFrame) -> float: ...
    def logPerplexity(self, dataset: DataFrame) -> float: ...
    def describeTopics(self, maxTermsPerTopic: int = ...) -> DataFrame: ...
    def estimatedDocConcentration(self) -> Vector: ...

class DistributedLDAModel(
    LDAModel, JavaMLReadable[DistributedLDAModel], JavaMLWritable
):
    def toLocal(self) -> LDAModel: ...
    def trainingLogLikelihood(self) -> float: ...
    def logPrior(self) -> float: ...
    def getCheckpointFiles(self) -> List[str]: ...

class LocalLDAModel(LDAModel, JavaMLReadable[LocalLDAModel], JavaMLWritable): ...

class LDA(JavaEstimator[LDAModel], _LDAParams, JavaMLReadable[LDA], JavaMLWritable):
    def __init__(
        self,
        *,
        featuresCol: str = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        checkpointInterval: int = ...,
        k: int = ...,
        optimizer: str = ...,
        learningOffset: float = ...,
        learningDecay: float = ...,
        subsamplingRate: float = ...,
        optimizeDocConcentration: bool = ...,
        docConcentration: Optional[List[float]] = ...,
        topicConcentration: Optional[float] = ...,
        topicDistributionCol: str = ...,
        keepLastCheckpoint: bool = ...
    ) -> None: ...
    def setParams(
        self,
        *,
        featuresCol: str = ...,
        maxIter: int = ...,
        seed: Optional[int] = ...,
        checkpointInterval: int = ...,
        k: int = ...,
        optimizer: str = ...,
        learningOffset: float = ...,
        learningDecay: float = ...,
        subsamplingRate: float = ...,
        optimizeDocConcentration: bool = ...,
        docConcentration: Optional[List[float]] = ...,
        topicConcentration: Optional[float] = ...,
        topicDistributionCol: str = ...,
        keepLastCheckpoint: bool = ...
    ) -> LDA: ...
    def setCheckpointInterval(self, value: int) -> LDA: ...
    def setSeed(self, value: int) -> LDA: ...
    def setK(self, value: int) -> LDA: ...
    def setOptimizer(self, value: str) -> LDA: ...
    def setLearningOffset(self, value: float) -> LDA: ...
    def setLearningDecay(self, value: float) -> LDA: ...
    def setSubsamplingRate(self, value: float) -> LDA: ...
    def setOptimizeDocConcentration(self, value: bool) -> LDA: ...
    def setDocConcentration(self, value: List[float]) -> LDA: ...
    def setTopicConcentration(self, value: float) -> LDA: ...
    def setTopicDistributionCol(self, value: str) -> LDA: ...
    def setKeepLastCheckpoint(self, value: bool) -> LDA: ...
    def setMaxIter(self, value: int) -> LDA: ...
    def setFeaturesCol(self, value: str) -> LDA: ...

class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol):
    k: Param[int]
    initMode: Param[str]
    srcCol: Param[str]
    dstCol: Param[str]
    def __init__(self, *args: Any): ...
    def getK(self) -> int: ...
    def getInitMode(self) -> str: ...
    def getSrcCol(self) -> str: ...
    def getDstCol(self) -> str: ...

class PowerIterationClustering(
    _PowerIterationClusteringParams,
    JavaParams,
    JavaMLReadable[PowerIterationClustering],
    JavaMLWritable,
):
    def __init__(
        self,
        *,
        k: int = ...,
        maxIter: int = ...,
        initMode: str = ...,
        srcCol: str = ...,
        dstCol: str = ...,
        weightCol: Optional[str] = ...
    ) -> None: ...
    def setParams(
        self,
        *,
        k: int = ...,
        maxIter: int = ...,
        initMode: str = ...,
        srcCol: str = ...,
        dstCol: str = ...,
        weightCol: Optional[str] = ...
    ) -> PowerIterationClustering: ...
    def setK(self, value: int) -> PowerIterationClustering: ...
    def setInitMode(self, value: str) -> PowerIterationClustering: ...
    def setSrcCol(self, value: str) -> str: ...
    def setDstCol(self, value: str) -> PowerIterationClustering: ...
    def setMaxIter(self, value: int) -> PowerIterationClustering: ...
    def setWeightCol(self, value: str) -> PowerIterationClustering: ...
    def assignClusters(self, dataset: DataFrame) -> DataFrame: ...