diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index ce6543952b..5a1202b0ba 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -22,8 +22,9 @@ from multiprocessing.pool import ThreadPool
 from pyspark import since, keyword_only
 from pyspark.ml import Estimator, Model
 from pyspark.ml.param.shared import *
-from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \
-    GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel
+from pyspark.ml.regression import DecisionTreeModel, DecisionTreeParams, \
+    DecisionTreeRegressionModel, GBTParams, HasVarianceImpurity, RandomForestParams, \
+    TreeEnsembleModel
 from pyspark.ml.util import *
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams
 from pyspark.ml.wrapper import JavaWrapper
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 1b0c8c5d28..ca2e4a0f5b 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -174,45 +174,4 @@ if __name__ == "__main__":
         param_code = _gen_param_header(name, doc, defaultValueStr, typeConverter)
         code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
 
-    decisionTreeParams = [
-        ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
-         "depth 1 means 1 internal node + 2 leaf nodes.", "TypeConverters.toInt"),
-        ("maxBins", "Max number of bins for" +
-         " discretizing continuous features. Must be >=2 and >= number of categories for any" +
-         " categorical feature.", "TypeConverters.toInt"),
-        ("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
-         "If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
-         "split will be discarded as invalid. Should be >= 1.", "TypeConverters.toInt"),
-        ("minInfoGain", "Minimum information gain for a split to be considered at a tree node.",
-         "TypeConverters.toFloat"),
-        ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. If too small," +
-         " then 1 node will be split per iteration, and its aggregates may exceed this size.",
-         "TypeConverters.toInt"),
-        ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
-         "instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
-         "Caching can speed up training of deeper trees. Users can set how often should the " +
-         "cache be checkpointed or disable it by setting checkpointInterval.",
-         "TypeConverters.toBoolean")]
-
-    decisionTreeCode = '''class DecisionTreeParams(Params):
-    """
-    Mixin for Decision Tree parameters.
-    """
-
-    $dummyPlaceHolders
-
-    def __init__(self):
-        super(DecisionTreeParams, self).__init__()'''
-    dtParamMethods = ""
-    dummyPlaceholders = ""
-    paramTemplate = """$name = Param($owner, "$name", "$doc", typeConverter=$typeConverterStr)"""
-    for name, doc, typeConverterStr in decisionTreeParams:
-        if typeConverterStr is None:
-            typeConverterStr = str(None)
-        variable = paramTemplate.replace("$name", name).replace("$doc", doc) \
-            .replace("$typeConverterStr", typeConverterStr)
-        dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n    "
-        dtParamMethods += _gen_param_code(name, doc, None) + "\n"
-    code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" +
-                dtParamMethods)
     print("\n\n\n".join(code))
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 56d6190723..9527ef6f57 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -747,57 +747,3 @@ class HasValidationIndicatorCol(Params):
         Gets the value of validationIndicatorCol or its default value.
         """
         return self.getOrDefault(self.validationIndicatorCol)
-
-
-class DecisionTreeParams(Params):
-    """
-    Mixin for Decision Tree parameters.
-    """
-
-    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", typeConverter=TypeConverters.toInt)
-    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.", typeConverter=TypeConverters.toInt)
-    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.", typeConverter=TypeConverters.toInt)
-    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.", typeConverter=TypeConverters.toFloat)
-    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.", typeConverter=TypeConverters.toInt)
-    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.", typeConverter=TypeConverters.toBoolean)
-
-
-    def __init__(self):
-        super(DecisionTreeParams, self).__init__()
-
-    def getMaxDepth(self):
-        """
-        Gets the value of maxDepth or its default value.
-        """
-        return self.getOrDefault(self.maxDepth)
-
-    def getMaxBins(self):
-        """
-        Gets the value of maxBins or its default value.
-        """
-        return self.getOrDefault(self.maxBins)
-
-    def getMinInstancesPerNode(self):
-        """
-        Gets the value of minInstancesPerNode or its default value.
-        """
-        return self.getOrDefault(self.minInstancesPerNode)
-
-    def getMinInfoGain(self):
-        """
-        Gets the value of minInfoGain or its default value.
-        """
-        return self.getOrDefault(self.minInfoGain)
-
-    def getMaxMemoryInMB(self):
-        """
-        Gets the value of maxMemoryInMB or its default value.
-        """
-        return self.getOrDefault(self.maxMemoryInMB)
-
-    def getCacheNodeIds(self):
-        """
-        Gets the value of cacheNodeIds or its default value.
-        """
-        return self.getOrDefault(self.cacheNodeIds)
-
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 349130f22f..aca9e6f53c 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -584,6 +584,80 @@ class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
         return self._call_java("predictions")
 
 
+class DecisionTreeParams(Params):
+    """
+    Mixin for Decision Tree parameters.
+    """
+
+    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., " +
+                     "depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.",
+                     typeConverter=TypeConverters.toInt)
+
+    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous " +
+                    "features. Must be >=2 and >= number of categories for any categorical " +
+                    "feature.", typeConverter=TypeConverters.toInt)
+
+    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of " +
+                                "instances each child must have after split. If a split causes " +
+                                "the left or right child to have fewer than " +
+                                "minInstancesPerNode, the split will be discarded as invalid. " +
+                                "Should be >= 1.", typeConverter=TypeConverters.toInt)
+
+    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split " +
+                        "to be considered at a tree node.", typeConverter=TypeConverters.toFloat)
+
+    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to " +
+                          "histogram aggregation. If too small, then 1 node will be split per " +
+                          "iteration, and its aggregates may exceed this size.",
+                          typeConverter=TypeConverters.toInt)
+
+    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass " +
+                         "trees to executors to match instances with nodes. If true, the " +
+                         "algorithm will cache node IDs for each instance. Caching can speed " +
+                         "up training of deeper trees. Users can set how often should the cache " +
+                         "be checkpointed or disable it by setting checkpointInterval.",
+                         typeConverter=TypeConverters.toBoolean)
+
+    def __init__(self):
+        super(DecisionTreeParams, self).__init__()
+
+    def getMaxDepth(self):
+        """
+        Gets the value of maxDepth or its default value.
+        """
+        return self.getOrDefault(self.maxDepth)
+
+    def getMaxBins(self):
+        """
+        Gets the value of maxBins or its default value.
+        """
+        return self.getOrDefault(self.maxBins)
+
+    def getMinInstancesPerNode(self):
+        """
+        Gets the value of minInstancesPerNode or its default value.
+        """
+        return self.getOrDefault(self.minInstancesPerNode)
+
+    def getMinInfoGain(self):
+        """
+        Gets the value of minInfoGain or its default value.
+        """
+        return self.getOrDefault(self.minInfoGain)
+
+    def getMaxMemoryInMB(self):
+        """
+        Gets the value of maxMemoryInMB or its default value.
+        """
+        return self.getOrDefault(self.maxMemoryInMB)
+
+    def getCacheNodeIds(self):
+        """
+        Gets the value of cacheNodeIds or its default value.
+        """
+        return self.getOrDefault(self.cacheNodeIds)
+
+
 class TreeEnsembleParams(DecisionTreeParams):
     """
     Mixin for Decision Tree-based ensemble algorithms parameters.