spark-instrumented-optimizer/python/pyspark/ml/param/shared.pyi
zhengruifeng a2887164bc [SPARK-32907][ML][PYTHON] adaptively blockify instances - LinearSVC
### What changes were proposed in this pull request?
1. Use `maxBlockSizeInMB` instead of `blockSize` (number of rows) to control the stacking of vectors.
2. Infer an appropriate `maxBlockSizeInMB` when it is set to 0.

### Why are the changes needed?
The performance gain is mainly related to the number of non-zero entries (nnz) in each block.

f2jBLAS |   |   |   |   |   |   |   |   |   |   |   |   |  
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
Duration(millisecond) | branch 3.0 Impl | blockSizeInMB=0.0625 | blockSizeInMB=0.125 | blockSizeInMB=0.25 | blockSizeInMB=0.5 | blockSizeInMB=1 | blockSizeInMB=2 | blockSizeInMB=4 | blockSizeInMB=8 | blockSizeInMB=16 | blockSizeInMB=32 | blockSizeInMB=64 | blockSizeInMB=128
epsilon(100%) | 326481 | 26143 | 25710 | 24726 | 25395 | 25840 | 26846 | 25927 | 27431 | 26190 | 26056 | 26347 | 27204
epsilon3000(67%) | 455247 | 35893 | 34366 | 34985 | 38387 | 38901 | 40426 | 40044 | 39161 | 38767 | 39965 | 39523 | 39108
epsilon4000(50%) | 306390 | 42256 | 41164 | 43748 | 48638 | 50892 | 50986 | 51091 | 51072 | 51289 | 51652 | 53312 | 52146
epsilon5000(40%) | 307619 | 43639 | 42992 | 44743 | 50800 | 51939 | 51871 | 52190 | 53850 | 52607 | 51062 | 52509 | 51570
epsilon10000(20%) | 310070 | 58371 | 55921 | 56317 | 56618 | 53694 | 52131 | 51768 | 51728 | 52233 | 51881 | 51653 | 52440
epsilon20000(10%) | 316565 | 109193 | 95121 | 82764 | 69653 | 60764 | 56066 | 53371 | 52822 | 52872 | 52769 | 52527 | 53508
epsilon200000(1%) | 336181 | 1569721 | 1069355 | 673718 | 375043 | 218230 | 145393 | 110926 | 94327 | 87039 | 83926 | 81890 | 81787
  |   |   |   |   |   |   |   |   |   |   |   |   |  
  |   |   |   |   |   |   |   |   |   |   |   |   |  
  | Speedup |   |   |   |   |   |   |   |   |   |   |   |  
epsilon(100%) | 1 | 12.48827602 | 12.69859977 | **13.20395535** | 12.85611341 | 12.63471362 | 12.16125307 | 12.59231689 | 11.90189931 | 12.46586483 | 12.5299739 | 12.39158158 | 12.00121306
epsilon3000(67%) | 1 | 12.68344803 | **13.2470174** | 13.01263399 | 11.85940553 | 11.70270687 | 11.26124276 | 11.36866946 | 11.62500958 | 11.74315784 | 11.39114225 | 11.51853351 | 11.64076404
epsilon4000(50%) | 1 | 7.250804619 | **7.443154212** | 7.003520161 | 6.299395534 | 6.020396133 | 6.00929667 | 5.996946625 | 5.999177632 | 5.973795551 | 5.931812902 | 5.747111345 | 5.875618456
epsilon5000(40%) | 1 | 7.049176196 | **7.155261444** | 6.875243055 | 6.055492126 | 5.92269778 | 5.930462108 | 5.894213451 | 5.712516249 | 5.847491779 | 6.024421292 | 5.858405226 | 5.965076595
epsilon10000(20%) | 1 | 5.312055644 | 5.544786395 | 5.505797539 | 5.4765269 | 5.774760681 | 5.947900481 | 5.98960748 | 5.994239097 | 5.93628549 | 5.976561747 | **6.002942714** | 5.912852784
epsilon20000(10%) | 1 | 2.899132728 | 3.328024306 | 3.824911797 | 4.544886796 | 5.209745902 | 5.64629187 | 5.931404695 | 5.993052137 | 5.987384627 | 5.999071425 | **6.026710073** | 5.916218136
epsilon200000(1%) | 1 | 0.214166084 | 0.314377358 | 0.498993644 | 0.896379882 | 1.540489392 | 2.312222734 | 3.03067811 | 3.563995463 | 3.862417997 | 4.005683578 | 4.105275369 | **4.110445425**

OpenBLAS |   |   |   |   |   |   |   |   |   |   |   |   |  
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
Duration(millisecond) | branch 3.0 Impl | blockSizeInMB=0.0625 | blockSizeInMB=0.125 | blockSizeInMB=0.25 | blockSizeInMB=0.5 | blockSizeInMB=1 | blockSizeInMB=2 | blockSizeInMB=4 | blockSizeInMB=8 | blockSizeInMB=16 | blockSizeInMB=32 | blockSizeInMB=64 | blockSizeInMB=128
epsilon(100%) | 299119 | 26047 | 25049 | 25239 | 28001 | 35138 | 36438 | 36279 | 36114 | 35111 | 35428 | 36295 | 35197
epsilon3000(67%) | 439798 | 33321 | 34423 | 34336 | 38906 | 51756 | 54138 | 54085 | 53412 | 54766 | 54425 | 54221 | 54842
epsilon4000(50%) | 302963 | 42960 | 40678 | 43483 | 48254 | 50888 | 54990 | 52647 | 51947 | 51843 | 52891 | 53410 | 52020
epsilon5000(40%) | 303569 | 44225 | 44961 | 45065 | 51768 | 52776 | 51930 | 53587 | 53104 | 51833 | 52138 | 52574 | 53756
epsilon10000(20%) | 307403 | 58447 | 55993 | 56757 | 56694 | 54038 | 52734 | 52073 | 52051 | 52150 | 51986 | 52407 | 52390
epsilon20000(10%) | 313344 | 107580 | 94679 | 83329 | 70226 | 60996 | 57130 | 55461 | 54641 | 52712 | 52541 | 53101 | 53312
epsilon200000(1%) | 334679 | 1642726 | 1073148 | 654481 | 364974 | 213881 | 140248 | 107579 | 91757 | 85090 | 81940 | 80492 | 80250
  |   |   |   |   |   |   |   |   |   |   |   |   |  
  |   |   |   |   |   |   |   |   |   |   |   |   |  
  | Speedup |   |   |   |   |   |   |   |   |   |   |   |  
epsilon(100%) | 1 | 11.48381771 | **11.94135494** | 11.85146004 | 10.68243991 | 8.512692811 | 8.208985125 | 8.244962651 | 8.282632774 | 8.519238985 | 8.443011178 | 8.241328007 | 8.498423161
epsilon3000(67%) | 1 | 13.19882356 | 12.7762833 | **12.80865564** | 11.30411762 | 8.497526857 | 8.123646976 | 8.131607655 | 8.234067251 | 8.030493372 | 8.080808452 | 8.111211523 | 8.01936472
epsilon4000(50%) | 1 | 7.052211359 | **7.44783421** | 6.967389555 | 6.278505409 | 5.953525389 | 5.509419895 | 5.754610899 | 5.832155851 | 5.843855487 | 5.728063376 | 5.672402172 | 5.823971549
epsilon5000(40%) | 1 | **6.86419446** | 6.751829363 | 6.736247642 | 5.864027971 | 5.752027437 | 5.845734643 | 5.664974714 | 5.716499699 | 5.856674319 | 5.822413595 | 5.774127896 | 5.647164968
epsilon10000(20%) | 1 | 5.259517169 | 5.490025539 | 5.416124883 | 5.422143437 | 5.688645028 | 5.829313157 | 5.903308816 | 5.905803923 | 5.894592522 | **5.913188166** | 5.865685882 | 5.867589235
epsilon20000(10%) | 1 | 2.912660346 | 3.309540658 | 3.760323537 | 4.461937174 | 5.137123746 | 5.48475407 | 5.649807973 | 5.734594901 | 5.944452876 | **5.963799699** | 5.900905821 | 5.87755102
epsilon200000(1%) | 1 | 0.203733915 | 0.311866583 | 0.511365494 | 0.916994087 | 1.564790701 | 2.38633706 | 3.111006795 | 3.647449241 | 3.933235398 | 4.084439834 | 4.157916315 | **4.170454829**

### Does this PR introduce _any_ user-facing change?
Yes: in master, the param `blockSize` is replaced by `maxBlockSizeInMB`.

### How was this patch tested?
Added test suites and a performance test (results attached in the [ticket](https://issues.apache.org/jira/browse/SPARK-32907)).

Closes #30009 from zhengruifeng/adaptively_blockify_linear_svc_II.

Lead-authored-by: zhengruifeng <ruifengz@foxmail.com>
Co-authored-by: Weichen Xu <weichen.xu@databricks.com>
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
2020-11-12 19:14:07 +08:00

193 lines
5.5 KiB
Python

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import Any, Generic, List
from pyspark.ml._typing import T
from pyspark.ml.param import *
class HasMaxIter(Params):
    # Type stub: declares the ``maxIter`` attribute as Param[int] and the
    # signature of ``getMaxIter``, which returns an int.
    maxIter: Param[int]

    def __init__(self) -> None: ...
    def getMaxIter(self) -> int: ...
class HasRegParam(Params):
    # Type stub: declares the ``regParam`` attribute as Param[float] and the
    # signature of ``getRegParam``, which returns a float.
    regParam: Param[float]

    def __init__(self) -> None: ...
    def getRegParam(self) -> float: ...
class HasFeaturesCol(Params):
    # Type stub: declares the ``featuresCol`` attribute as Param[str] and the
    # signature of ``getFeaturesCol``, which returns a str.
    featuresCol: Param[str]

    def __init__(self) -> None: ...
    def getFeaturesCol(self) -> str: ...
class HasLabelCol(Params):
    # Type stub: declares the ``labelCol`` attribute as Param[str] and the
    # signature of ``getLabelCol``, which returns a str.
    labelCol: Param[str]

    def __init__(self) -> None: ...
    def getLabelCol(self) -> str: ...
class HasPredictionCol(Params):
    # Type stub: declares the ``predictionCol`` attribute as Param[str] and
    # the signature of ``getPredictionCol``, which returns a str.
    predictionCol: Param[str]

    def __init__(self) -> None: ...
    def getPredictionCol(self) -> str: ...
class HasProbabilityCol(Params):
    # Type stub: declares the ``probabilityCol`` attribute as Param[str] and
    # the signature of ``getProbabilityCol``, which returns a str.
    probabilityCol: Param[str]

    def __init__(self) -> None: ...
    def getProbabilityCol(self) -> str: ...
class HasRawPredictionCol(Params):
    # Type stub: declares the ``rawPredictionCol`` attribute as Param[str]
    # and the signature of ``getRawPredictionCol``, which returns a str.
    rawPredictionCol: Param[str]

    def __init__(self) -> None: ...
    def getRawPredictionCol(self) -> str: ...
class HasInputCol(Params):
    # Type stub: declares the ``inputCol`` attribute as Param[str] and the
    # signature of ``getInputCol``, which returns a str.
    inputCol: Param[str]

    def __init__(self) -> None: ...
    def getInputCol(self) -> str: ...
class HasInputCols(Params):
    # Type stub: declares the ``inputCols`` attribute as Param[List[str]] and
    # the signature of ``getInputCols``, which returns a List[str].
    inputCols: Param[List[str]]

    def __init__(self) -> None: ...
    def getInputCols(self) -> List[str]: ...
class HasOutputCol(Params):
    # Type stub: declares the ``outputCol`` attribute as Param[str] and the
    # signature of ``getOutputCol``, which returns a str.
    outputCol: Param[str]

    def __init__(self) -> None: ...
    def getOutputCol(self) -> str: ...
class HasOutputCols(Params):
    # Type stub: declares the ``outputCols`` attribute as Param[List[str]]
    # and the signature of ``getOutputCols``, which returns a List[str].
    outputCols: Param[List[str]]

    def __init__(self) -> None: ...
    def getOutputCols(self) -> List[str]: ...
class HasNumFeatures(Params):
    # Type stub: declares the ``numFeatures`` attribute as Param[int] and the
    # signature of ``getNumFeatures``, which returns an int.
    numFeatures: Param[int]

    def __init__(self) -> None: ...
    def getNumFeatures(self) -> int: ...
class HasCheckpointInterval(Params):
    # Type stub: declares the ``checkpointInterval`` attribute as Param[int]
    # and the signature of ``getCheckpointInterval``, which returns an int.
    checkpointInterval: Param[int]

    def __init__(self) -> None: ...
    def getCheckpointInterval(self) -> int: ...
class HasSeed(Params):
    # Type stub: declares the ``seed`` attribute as Param[int] and the
    # signature of ``getSeed``, which returns an int.
    seed: Param[int]

    def __init__(self) -> None: ...
    def getSeed(self) -> int: ...
class HasTol(Params):
    # Type stub: declares the ``tol`` attribute as Param[float] and the
    # signature of ``getTol``, which returns a float.
    tol: Param[float]

    def __init__(self) -> None: ...
    def getTol(self) -> float: ...
class HasRelativeError(Params):
    # Type stub: declares the ``relativeError`` attribute as Param[float] and
    # the signature of ``getRelativeError``, which returns a float.
    relativeError: Param[float]

    def __init__(self) -> None: ...
    def getRelativeError(self) -> float: ...
class HasStepSize(Params):
    # Type stub: declares the ``stepSize`` attribute as Param[float] and the
    # signature of ``getStepSize``, which returns a float.
    stepSize: Param[float]

    def __init__(self) -> None: ...
    def getStepSize(self) -> float: ...
class HasHandleInvalid(Params):
    # Type stub: declares the ``handleInvalid`` attribute as Param[str] and
    # the signature of ``getHandleInvalid``, which returns a str.
    handleInvalid: Param[str]

    def __init__(self) -> None: ...
    def getHandleInvalid(self) -> str: ...
class HasElasticNetParam(Params):
    # Type stub: declares the ``elasticNetParam`` attribute as Param[float]
    # and the signature of ``getElasticNetParam``, which returns a float.
    elasticNetParam: Param[float]

    def __init__(self) -> None: ...
    def getElasticNetParam(self) -> float: ...
class HasFitIntercept(Params):
    # Type stub: declares the ``fitIntercept`` attribute as Param[bool] and
    # the signature of ``getFitIntercept``, which returns a bool.
    fitIntercept: Param[bool]

    def __init__(self) -> None: ...
    def getFitIntercept(self) -> bool: ...
class HasStandardization(Params):
    # Type stub: declares the ``standardization`` attribute as Param[bool]
    # and the signature of ``getStandardization``, which returns a bool.
    standardization: Param[bool]

    def __init__(self) -> None: ...
    def getStandardization(self) -> bool: ...
class HasThresholds(Params):
    # Type stub: declares the ``thresholds`` attribute as Param[List[float]]
    # and the signature of ``getThresholds``, which returns a List[float].
    thresholds: Param[List[float]]

    def __init__(self) -> None: ...
    def getThresholds(self) -> List[float]: ...
class HasThreshold(Params):
    # Type stub: declares the ``threshold`` attribute as Param[float] and the
    # signature of ``getThreshold``, which returns a float.
    threshold: Param[float]

    def __init__(self) -> None: ...
    def getThreshold(self) -> float: ...
class HasWeightCol(Params):
    # Type stub: declares the ``weightCol`` attribute as Param[str] and the
    # signature of ``getWeightCol``, which returns a str.
    weightCol: Param[str]

    def __init__(self) -> None: ...
    def getWeightCol(self) -> str: ...
class HasSolver(Params):
    # Type stub: declares the ``solver`` attribute as Param[str] and the
    # signature of ``getSolver``, which returns a str.
    solver: Param[str]

    def __init__(self) -> None: ...
    def getSolver(self) -> str: ...
class HasVarianceCol(Params):
    # Type stub: declares the ``varianceCol`` attribute as Param[str] and the
    # signature of ``getVarianceCol``, which returns a str.
    varianceCol: Param[str]

    def __init__(self) -> None: ...
    def getVarianceCol(self) -> str: ...
class HasAggregationDepth(Params):
    # Type stub: declares the ``aggregationDepth`` attribute as Param[int]
    # and the signature of ``getAggregationDepth``, which returns an int.
    aggregationDepth: Param[int]

    def __init__(self) -> None: ...
    def getAggregationDepth(self) -> int: ...
class HasParallelism(Params):
    # Type stub: declares the ``parallelism`` attribute as Param[int] and the
    # signature of ``getParallelism``, which returns an int.
    parallelism: Param[int]

    def __init__(self) -> None: ...
    def getParallelism(self) -> int: ...
class HasCollectSubModels(Params):
    # Type stub: declares the ``collectSubModels`` attribute as Param[bool]
    # and the signature of ``getCollectSubModels``, which returns a bool.
    collectSubModels: Param[bool]

    def __init__(self) -> None: ...
    def getCollectSubModels(self) -> bool: ...
class HasLoss(Params):
    # Type stub: declares the ``loss`` attribute as Param[str] and the
    # signature of ``getLoss``, which returns a str.
    loss: Param[str]

    def __init__(self) -> None: ...
    def getLoss(self) -> str: ...
class HasValidationIndicatorCol(Params):
    # Type stub: declares the ``validationIndicatorCol`` attribute as
    # Param[str] and the signature of ``getValidationIndicatorCol``, which
    # returns a str.
    validationIndicatorCol: Param[str]

    def __init__(self) -> None: ...
    def getValidationIndicatorCol(self) -> str: ...
class HasDistanceMeasure(Params):
    # Type stub: declares the ``distanceMeasure`` attribute as Param[str] and
    # the signature of ``getDistanceMeasure``, which returns a str.
    distanceMeasure: Param[str]

    def __init__(self) -> None: ...
    def getDistanceMeasure(self) -> str: ...
class HasBlockSize(Params):
    # Type stub: declares the ``blockSize`` attribute as Param[int] and the
    # signature of ``getBlockSize``, which returns an int.
    blockSize: Param[int]

    def __init__(self) -> None: ...
    def getBlockSize(self) -> int: ...
class HasMaxBlockSizeInMB(Params):
    # Type stub: declares the ``maxBlockSizeInMB`` attribute as Param[float]
    # and the signature of ``getMaxBlockSizeInMB``, which returns a float.
    maxBlockSizeInMB: Param[float]

    def __init__(self) -> None: ...
    def getMaxBlockSizeInMB(self) -> float: ...