[SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py

## What changes were proposed in this pull request?
Leave `shared.py` untouched. Move the Python `DecisionTreeParams` to `regression.py`.

## How was this patch tested?
Covered by existing tests.

Closes #25406 from huaxingao/spark-28243.

Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
This commit is contained in:
Huaxin Gao 2019-08-15 10:21:26 -05:00 committed by Sean Owen
parent 3f35440304
commit ba5ee27706
4 changed files with 77 additions and 97 deletions

View file

@ -22,8 +22,9 @@ from multiprocessing.pool import ThreadPool
from pyspark import since, keyword_only
from pyspark.ml import Estimator, Model
from pyspark.ml.param.shared import *
from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \
GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel
from pyspark.ml.regression import DecisionTreeModel, DecisionTreeParams, \
DecisionTreeRegressionModel, GBTParams, HasVarianceImpurity, RandomForestParams, \
TreeEnsembleModel
from pyspark.ml.util import *
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams
from pyspark.ml.wrapper import JavaWrapper

View file

@ -174,45 +174,4 @@ if __name__ == "__main__":
param_code = _gen_param_header(name, doc, defaultValueStr, typeConverter)
code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
# Build the source text of the DecisionTreeParams mixin and append it to `code`.
# Each entry below is a (name, doc, typeConverter-expression) triple; the
# converter is kept as source text because it is pasted into the generated code.
decisionTreeParams = [
("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
"depth 1 means 1 internal node + 2 leaf nodes.", "TypeConverters.toInt"),
("maxBins", "Max number of bins for" +
" discretizing continuous features. Must be >=2 and >= number of categories for any" +
" categorical feature.", "TypeConverters.toInt"),
("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
"If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
"split will be discarded as invalid. Should be >= 1.", "TypeConverters.toInt"),
("minInfoGain", "Minimum information gain for a split to be considered at a tree node.",
"TypeConverters.toFloat"),
("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. If too small," +
" then 1 node will be split per iteration, and its aggregates may exceed this size.",
"TypeConverters.toInt"),
("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
"instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
"Caching can speed up training of deeper trees. Users can set how often should the " +
"cache be checkpointed or disable it by setting checkpointInterval.",
"TypeConverters.toBoolean")]
# Template for the generated class; the $dummyPlaceHolders marker is replaced
# further down with the accumulated class-level Param declarations.
decisionTreeCode = '''class DecisionTreeParams(Params):
"""
Mixin for Decision Tree parameters.
"""
$dummyPlaceHolders
def __init__(self):
super(DecisionTreeParams, self).__init__()'''
# Accumulators: per-param code returned by _gen_param_code, and the
# class-level Param declarations.
dtParamMethods = ""
dummyPlaceholders = ""
# Template for a single Param declaration; $owner is filled in per use.
paramTemplate = """$name = Param($owner, "$name", "$doc", typeConverter=$typeConverterStr)"""
for name, doc, typeConverterStr in decisionTreeParams:
# A missing converter is rendered as the literal text "None" in the output.
if typeConverterStr is None:
typeConverterStr = str(None)
variable = paramTemplate.replace("$name", name).replace("$doc", doc) \
.replace("$typeConverterStr", typeConverterStr)
# Class-level declarations use Params._dummy() as the owner.
dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n "
# No default value is passed for these params (third argument is None).
dtParamMethods += _gen_param_code(name, doc, None) + "\n"
# Emit the class template with the declarations substituted in, followed by
# the generated per-param code.
code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" +
dtParamMethods)
print("\n\n\n".join(code))

View file

@ -747,57 +747,3 @@ class HasValidationIndicatorCol(Params):
Gets the value of validationIndicatorCol or its default value.
"""
return self.getOrDefault(self.validationIndicatorCol)
class DecisionTreeParams(Params):
    """
    Mixin holding the Decision Tree parameters and their getters.
    """

    maxDepth = Param(
        Params._dummy(),
        "maxDepth",
        "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; "
        "depth 1 means 1 internal node + 2 leaf nodes.",
        typeConverter=TypeConverters.toInt)
    maxBins = Param(
        Params._dummy(),
        "maxBins",
        "Max number of bins for discretizing continuous features. "
        "Must be >=2 and >= number of categories for any categorical feature.",
        typeConverter=TypeConverters.toInt)
    minInstancesPerNode = Param(
        Params._dummy(),
        "minInstancesPerNode",
        "Minimum number of instances each child must have after split. "
        "If a split causes the left or right child to have fewer than "
        "minInstancesPerNode, the split will be discarded as invalid. "
        "Should be >= 1.",
        typeConverter=TypeConverters.toInt)
    minInfoGain = Param(
        Params._dummy(),
        "minInfoGain",
        "Minimum information gain for a split to be considered at a tree node.",
        typeConverter=TypeConverters.toFloat)
    maxMemoryInMB = Param(
        Params._dummy(),
        "maxMemoryInMB",
        "Maximum memory in MB allocated to histogram aggregation. "
        "If too small, then 1 node will be split per iteration, "
        "and its aggregates may exceed this size.",
        typeConverter=TypeConverters.toInt)
    cacheNodeIds = Param(
        Params._dummy(),
        "cacheNodeIds",
        "If false, the algorithm will pass trees to executors to match "
        "instances with nodes. If true, the algorithm will cache node IDs "
        "for each instance. Caching can speed up training of deeper trees. "
        "Users can set how often should the cache be checkpointed or "
        "disable it by setting checkpointInterval.",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(DecisionTreeParams, self).__init__()

    def getMaxDepth(self):
        """
        Returns the value of maxDepth, or its default value.
        """
        return self.getOrDefault(self.maxDepth)

    def getMaxBins(self):
        """
        Returns the value of maxBins, or its default value.
        """
        return self.getOrDefault(self.maxBins)

    def getMinInstancesPerNode(self):
        """
        Returns the value of minInstancesPerNode, or its default value.
        """
        return self.getOrDefault(self.minInstancesPerNode)

    def getMinInfoGain(self):
        """
        Returns the value of minInfoGain, or its default value.
        """
        return self.getOrDefault(self.minInfoGain)

    def getMaxMemoryInMB(self):
        """
        Returns the value of maxMemoryInMB, or its default value.
        """
        return self.getOrDefault(self.maxMemoryInMB)

    def getCacheNodeIds(self):
        """
        Returns the value of cacheNodeIds, or its default value.
        """
        return self.getOrDefault(self.cacheNodeIds)

View file

@ -584,6 +584,80 @@ class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
return self._call_java("predictions")
class DecisionTreeParams(Params):
    """
    Mixin that carries the Decision Tree parameters together with their getters.
    """

    maxDepth = Param(
        Params._dummy(), "maxDepth",
        ("Maximum depth of the tree. (>= 0) "
         "E.g., depth 0 means 1 leaf node; "
         "depth 1 means 1 internal node + 2 leaf nodes."),
        typeConverter=TypeConverters.toInt)

    maxBins = Param(
        Params._dummy(), "maxBins",
        ("Max number of bins for discretizing continuous features. "
         "Must be >=2 and >= number of categories for any categorical feature."),
        typeConverter=TypeConverters.toInt)

    minInstancesPerNode = Param(
        Params._dummy(), "minInstancesPerNode",
        ("Minimum number of instances each child must have after split. "
         "If a split causes the left or right child to have fewer than "
         "minInstancesPerNode, the split will be discarded as invalid. "
         "Should be >= 1."),
        typeConverter=TypeConverters.toInt)

    minInfoGain = Param(
        Params._dummy(), "minInfoGain",
        "Minimum information gain for a split to be considered at a tree node.",
        typeConverter=TypeConverters.toFloat)

    maxMemoryInMB = Param(
        Params._dummy(), "maxMemoryInMB",
        ("Maximum memory in MB allocated to histogram aggregation. "
         "If too small, then 1 node will be split per iteration, "
         "and its aggregates may exceed this size."),
        typeConverter=TypeConverters.toInt)

    cacheNodeIds = Param(
        Params._dummy(), "cacheNodeIds",
        ("If false, the algorithm will pass trees to executors to match "
         "instances with nodes. "
         "If true, the algorithm will cache node IDs for each instance. "
         "Caching can speed up training of deeper trees. "
         "Users can set how often should the cache be checkpointed or "
         "disable it by setting checkpointInterval."),
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(DecisionTreeParams, self).__init__()

    def getMaxDepth(self):
        """
        Fetches maxDepth: the value of the param or its default value.
        """
        return self.getOrDefault(self.maxDepth)

    def getMaxBins(self):
        """
        Fetches maxBins: the value of the param or its default value.
        """
        return self.getOrDefault(self.maxBins)

    def getMinInstancesPerNode(self):
        """
        Fetches minInstancesPerNode: the value of the param or its default value.
        """
        return self.getOrDefault(self.minInstancesPerNode)

    def getMinInfoGain(self):
        """
        Fetches minInfoGain: the value of the param or its default value.
        """
        return self.getOrDefault(self.minInfoGain)

    def getMaxMemoryInMB(self):
        """
        Fetches maxMemoryInMB: the value of the param or its default value.
        """
        return self.getOrDefault(self.maxMemoryInMB)

    def getCacheNodeIds(self):
        """
        Fetches cacheNodeIds: the value of the param or its default value.
        """
        return self.getOrDefault(self.cacheNodeIds)
class TreeEnsembleParams(DecisionTreeParams):
"""
Mixin for Decision Tree-based ensemble algorithms parameters.