[SPARK-7535] [.0] [MLLIB] Audit the pipeline APIs for 1.4
Some changes to the pipeline APIs:
1. Estimator/Transformer doesn't need to extend Params since PipelineStage already does.
1. Move Evaluator to ml.evaluation (see the usage sketch after this list).
1. Mention that larger metric values are better.
1. PipelineModel doc. "compiled" -> "fitted"
1. Hide object PolynomialExpansion.
1. Hide object VectorAssembler.
1. Word2Vec.minCount (and others) -> group param
1. ParamValidators -> DeveloperApi
1. Hide MetadataUtils/SchemaUtils.
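A minimal usage sketch of the relocated Evaluator API (not part of this patch): `predictions` is an assumed DataFrame with "label" and "rawPrediction" columns, and the metric name is illustrative. It shows the two points above: evaluators now live in ml.evaluation, and the returned metric follows the "larger is better" convention.

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.param.ParamMap

// Evaluator implementations are now imported from org.apache.spark.ml.evaluation.
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")

// `predictions` is an assumed DataFrame holding labels and raw predictions.
// A larger value of the returned metric indicates a better model.
val metric = evaluator.evaluate(predictions, ParamMap.empty)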
jkbradley
Author: Xiangrui Meng <meng@databricks.com>
Closes #6322 from mengxr/SPARK-7535.0 and squashes the following commits:
9e9c7da [Xiangrui Meng] move JavaEvaluator to ml.evaluation as well
e179480 [Xiangrui Meng] move Evaluation to ml.evaluation in PySpark
08ef61f [Xiangrui Meng] update pipeline APIs
(cherry picked from commit 8f11c6116b)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
@@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame
  * Abstract class for estimators that fit models to data.
  */
 @AlphaComponent
-abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
+abstract class Estimator[M <: Model[M]] extends PipelineStage {
 
   /**
    * Fits a single model to the input data with optional parameters.
@@ -170,7 +170,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
 
 /**
  * :: AlphaComponent ::
- * Represents a compiled pipeline.
+ * Represents a fitted pipeline.
  */
 @AlphaComponent
 class PipelineModel private[ml] (
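To make the "fitted pipeline" wording concrete, a hedged sketch along the lines of the standard spark.ml pipeline example; `training` and `test` are assumed DataFrames (with "text" and "label" columns) and the stages are illustrative, not part of this change:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10)

val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
// fit() returns the fitted pipeline: a PipelineModel whose stages are fitted models/transformers.
val model = pipeline.fit(training)      // `training` is an assumed DataFrame
val predictions = model.transform(test) // `test` is an assumed DataFrame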
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
  * Abstract class for transformers that transform one dataset into another.
  */
 @AlphaComponent
-abstract class Transformer extends PipelineStage with Params {
+abstract class Transformer extends PipelineStage {
 
   /**
    * Transforms the dataset with optional parameters
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.evaluation
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
+import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.ml
+package org.apache.spark.ml.evaluation
 
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.param.{ParamMap, Params}
@@ -29,7 +29,7 @@ import org.apache.spark.sql.DataFrame
 abstract class Evaluator extends Params {
 
   /**
-   * Evaluates the output.
+   * Evaluates model output and returns a scalar metric (larger is better).
    *
    * @param dataset a dataset that contains labels/observations and predictions.
    * @param paramMap parameter map that specifies the input columns and output metrics
@@ -75,7 +75,7 @@ class PolynomialExpansion(override val uid: String)
  * To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
  * current index and increment it properly for sparse input.
  */
-object PolynomialExpansion {
+private[feature] object PolynomialExpansion {
 
   private def choose(n: Int, k: Int): Int = {
     Range(n, n - k, -1).product / Range(k, 1, -1).product
@@ -78,8 +78,7 @@ class VectorAssembler(override val uid: String)
   }
 }
 
-@AlphaComponent
-object VectorAssembler {
+private object VectorAssembler {
 
   private[feature] def assemble(vv: Any*): Vector = {
     val indices = ArrayBuilder.make[Int]
@@ -37,6 +37,7 @@ private[feature] trait Word2VecBase extends Params
 
   /**
    * The dimension of the code that you want to transform from words.
+   * @group param
    */
   final val vectorSize = new IntParam(
     this, "vectorSize", "the dimension of codes after transforming from words")
@@ -47,6 +48,7 @@ private[feature] trait Word2VecBase extends Params
 
   /**
    * Number of partitions for sentences of words.
+   * @group param
    */
   final val numPartitions = new IntParam(
     this, "numPartitions", "number of partitions for sentences of words")
@@ -58,6 +60,7 @@ private[feature] trait Word2VecBase extends Params
   /**
    * The minimum number of times a token must appear to be included in the word2vec model's
    * vocabulary.
+   * @group param
    */
   final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
     "appear to be included in the word2vec model's vocabulary")
@@ -24,7 +24,7 @@ import scala.annotation.varargs
 import scala.collection.mutable
 import scala.collection.JavaConverters._
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
 import org.apache.spark.ml.util.Identifiable
 
 /**
@@ -92,9 +92,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
 }
 
 /**
+ * :: DeveloperApi ::
  * Factory methods for common validation functions for [[Param.isValid]].
  * The numerical methods only support Int, Long, Float, and Double.
  */
+@DeveloperApi
 object ParamValidators {
 
   /** (private[param]) Default validation always return true */
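Since ParamValidators is now exposed as a DeveloperApi, a small hedged sketch of how a developer-defined param might use it; the trait and param names below are hypothetical, not part of this patch:

import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}

trait MyFeatureParams extends Params {
  // ParamValidators.gtEq(0) rejects negative values when the param is set.
  final val myMinCount: IntParam = new IntParam(this, "myMinCount",
    "minimum number of times a token must appear", ParamValidators.gtEq(0))
}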
@@ -529,11 +531,13 @@ trait Params extends Identifiable with Serializable {
 }
 
 /**
+ * :: DeveloperApi ::
  * Java-friendly wrapper for [[Params]].
  * Java developers who need to extend [[Params]] should use this class instead.
  * If you need to extend a abstract class which already extends [[Params]], then that abstract
  * class should be Java-friendly as well.
  */
+@DeveloperApi
 abstract class JavaParams extends Params
 
 /**
@@ -22,6 +22,7 @@ import com.github.fommil.netlib.F2jBLAS
 import org.apache.spark.Logging
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml._
+import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.util.MLUtils
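This import change reflects that CrossValidator now consumes the Evaluator from ml.evaluation; a hedged usage sketch, where `lr`, `paramGrid`, and `training` are assumed values not defined in this patch:

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.CrossValidator

val cv = new CrossValidator()
  .setEstimator(lr)                                  // `lr` is an assumed estimator, e.g. a LogisticRegression
  .setEvaluator(new BinaryClassificationEvaluator()) // the Evaluator now comes from ml.evaluation
  .setEstimatorParamMaps(paramGrid)                  // `paramGrid` is an assumed Array[ParamMap]
  .setNumFolds(3)

val cvModel = cv.fit(training)                       // `training` is an assumed DataFrame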
@@ -19,18 +19,14 @@ package org.apache.spark.ml.util
 
 import scala.collection.immutable.HashMap
 
-import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.attribute._
 import org.apache.spark.sql.types.StructField
 
 
 /**
- * :: Experimental ::
- *
  * Helper utilities for tree-based algorithms
  */
-@Experimental
-object MetadataUtils {
+private[spark] object MetadataUtils {
 
   /**
    * Examine a schema to identify the number of classes in a label column.
@@ -17,15 +17,13 @@
 
 package org.apache.spark.ml.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
 
 /**
- * :: DeveloperApi ::
  * Utils for handling schemas.
  */
-@DeveloperApi
-object SchemaUtils {
+private[spark] object SchemaUtils {
 
   // TODO: Move the utility methods to SQL.
 
@@ -15,6 +15,6 @@
 # limitations under the License.
 #
 
-from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel, Evaluator
+from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel
 
-__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel", "Evaluator"]
+__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel"]
@@ -15,13 +15,72 @@
 # limitations under the License.
 #
 
-from pyspark.ml.wrapper import JavaEvaluator
+from abc import abstractmethod, ABCMeta
+
+from pyspark.ml.wrapper import JavaWrapper
 from pyspark.ml.param import Param, Params
 from pyspark.ml.param.shared import HasLabelCol, HasRawPredictionCol
 from pyspark.ml.util import keyword_only
 from pyspark.mllib.common import inherit_doc
 
-__all__ = ['BinaryClassificationEvaluator']
+__all__ = ['Evaluator', 'BinaryClassificationEvaluator']
+
+
+@inherit_doc
+class Evaluator(Params):
+    """
+    Base class for evaluators that compute metrics from predictions.
+    """
+
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+
+        :param dataset: a dataset that contains labels/observations and
+                        predictions
+        :return: metric
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, dataset, params={}):
+        """
+        Evaluates the output with optional parameters.
+
+        :param dataset: a dataset that contains labels/observations and
+                        predictions
+        :param params: an optional param map that overrides embedded
+                       params
+        :return: metric
+        """
+        if isinstance(params, dict):
+            if params:
+                return self.copy(params)._evaluate(dataset)
+            else:
+                return self._evaluate(dataset)
+        else:
+            raise ValueError("Params must be a param map but got %s." % type(params))
+
+
+@inherit_doc
+class JavaEvaluator(Evaluator, JavaWrapper):
+    """
+    Base class for :py:class:`Evaluator`s that wrap Java/Scala
+    implementations.
+    """
+
+    __metaclass__ = ABCMeta
+
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+        :param dataset: a dataset that contains labels/observations and predictions.
+        :return: evaluation metric
+        """
+        self._transfer_params_to_java()
+        return self._java_obj.evaluate(dataset._jdf)
+
+
 @inherit_doc
@@ -219,40 +219,3 @@ class PipelineModel(Model):
     def copy(self, extra={}):
         stages = [stage.copy(extra) for stage in self.stages]
         return PipelineModel(stages)
-
-
-class Evaluator(Params):
-    """
-    Base class for evaluators that compute metrics from predictions.
-    """
-
-    __metaclass__ = ABCMeta
-
-    @abstractmethod
-    def _evaluate(self, dataset):
-        """
-        Evaluates the output.
-
-        :param dataset: a dataset that contains labels/observations and
-                        predictions
-        :return: metric
-        """
-        raise NotImplementedError()
-
-    def evaluate(self, dataset, params={}):
-        """
-        Evaluates the output with optional parameters.
-
-        :param dataset: a dataset that contains labels/observations and
-                        predictions
-        :param params: an optional param map that overrides embedded
-                       params
-        :return: metric
-        """
-        if isinstance(params, dict):
-            if params:
-                return self.copy(params)._evaluate(dataset)
-            else:
-                return self._evaluate(dataset)
-        else:
-            raise ValueError("Params must be a param map but got %s." % type(params))
@@ -20,7 +20,7 @@ from abc import ABCMeta
 from pyspark import SparkContext
 from pyspark.sql import DataFrame
 from pyspark.ml.param import Params
-from pyspark.ml.pipeline import Estimator, Transformer, Evaluator, Model
+from pyspark.ml.pipeline import Estimator, Transformer, Model
 from pyspark.mllib.common import inherit_doc, _java2py, _py2java
 
 
@@ -185,22 +185,3 @@ class JavaModel(Model, JavaTransformer):
         sc = SparkContext._active_spark_context
         java_args = [_py2java(sc, arg) for arg in args]
         return _java2py(sc, m(*java_args))
-
-
-@inherit_doc
-class JavaEvaluator(Evaluator, JavaWrapper):
-    """
-    Base class for :py:class:`Evaluator`s that wrap Java/Scala
-    implementations.
-    """
-
-    __metaclass__ = ABCMeta
-
-    def _evaluate(self, dataset):
-        """
-        Evaluates the output.
-        :param dataset: a dataset that contains labels/observations and predictions.
-        :return: evaluation metric
-        """
-        self._transfer_params_to_java()
-        return self._java_obj.evaluate(dataset._jdf)