[SPARK-7535] [.0] [MLLIB] Audit the pipeline APIs for 1.4

Some changes to the pipeline APIs:

1. Estimator/Transformer doesn't need to extend Params since PipelineStage already does.
1. Move Evaluator to ml.evaluation.
1. Mention that larger metric values are better.
1. PipelineModel doc. “compiled” -> “fitted”
1. Hide object PolynomialExpansion.
1. Hide object VectorAssembler.
1. Word2Vec.minCount (and others) -> group param
1. ParamValidators -> DeveloperApi
1. Hide MetadataUtils/SchemaUtils.

jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #6322 from mengxr/SPARK-7535.0 and squashes the following commits:

9e9c7da [Xiangrui Meng] move JavaEvaluator to ml.evaluation as well
e179480 [Xiangrui Meng] move Evaluation to ml.evaluation in PySpark
08ef61f [Xiangrui Meng] update pipieline APIs

(cherry picked from commit 8f11c6116b)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
Xiangrui Meng 2015-05-21 22:57:33 -07:00
parent 2cc7907d73
commit 11a5f32116
16 changed files with 84 additions and 80 deletions


@@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame
* Abstract class for estimators that fit models to data.
*/
@AlphaComponent
-abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
+abstract class Estimator[M <: Model[M]] extends PipelineStage {
/**
* Fits a single model to the input data with optional parameters.
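The `with Params` mixin is dropped because `PipelineStage` itself already extends `Params`, so estimators and transformers keep the full param API. A minimal sketch of what that means for user code, assuming the 1.4 `ml.feature.Tokenizer` (a `Transformer`):

```scala
import org.apache.spark.ml.feature.Tokenizer

// Tokenizer extends Transformer -> PipelineStage -> Params, so the param
// machinery is still available without an explicit "with Params".
val tokenizer = new Tokenizer()
  .setInputCol("text")
  .setOutputCol("words")
tokenizer.params.foreach(p => println(s"${p.name}: ${p.doc}"))
```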


@@ -170,7 +170,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
/**
* :: AlphaComponent ::
- * Represents a compiled pipeline.
+ * Represents a fitted pipeline.
*/
@AlphaComponent
class PipelineModel private[ml] (


@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
* Abstract class for transformers that transform one dataset into another.
*/
@AlphaComponent
-abstract class Transformer extends PipelineStage with Params {
+abstract class Transformer extends PipelineStage {
/**
* Transforms the dataset with optional parameters


@@ -18,7 +18,7 @@
package org.apache.spark.ml.evaluation
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
+import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}


@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.spark.ml
+package org.apache.spark.ml.evaluation
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param.{ParamMap, Params}
@@ -29,7 +29,7 @@ import org.apache.spark.sql.DataFrame
abstract class Evaluator extends Params {
/**
- * Evaluates the output.
+ * Evaluates model output and returns a scalar metric (larger is better).
*
* @param dataset a dataset that contains labels/observations and predictions.
* @param paramMap parameter map that specifies the input columns and output metrics
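Because every `Evaluator` now documents that larger metric values are better, callers such as `CrossValidator` can simply maximize the returned value. A hedged sketch using the relocated package and the two-argument `evaluate` shown above (column names assume the defaults):

```scala
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.DataFrame

// Scores a DataFrame that has "label" and "rawPrediction" columns.
// The default metric is areaUnderROC; larger is better, so model selection
// just keeps the maximum of these values.
def scoreModel(predictions: DataFrame): Double = {
  val evaluator = new BinaryClassificationEvaluator()
  evaluator.evaluate(predictions, ParamMap.empty)
}
```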


@@ -75,7 +75,7 @@ class PolynomialExpansion(override val uid: String)
* To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
* current index and increment it properly for sparse input.
*/
-object PolynomialExpansion {
+private[feature] object PolynomialExpansion {
private def choose(n: Int, k: Int): Int = {
Range(n, n - k, -1).product / Range(k, 1, -1).product
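Only the companion object (the internal expansion helpers such as `choose`) is hidden; the `PolynomialExpansion` transformer class itself stays public. A short usage sketch of the unchanged public surface:

```scala
import org.apache.spark.ml.feature.PolynomialExpansion

// The user-facing transformer is untouched by hiding the companion object.
val poly = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(2)
```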


@@ -78,8 +78,7 @@ class VectorAssembler(override val uid: String)
}
}
-@AlphaComponent
-object VectorAssembler {
+private object VectorAssembler {
private[feature] def assemble(vv: Any*): Vector = {
val indices = ArrayBuilder.make[Int]


@@ -37,6 +37,7 @@ private[feature] trait Word2VecBase extends Params
/**
* The dimension of the code that you want to transform from words.
+ * @group param
*/
final val vectorSize = new IntParam(
this, "vectorSize", "the dimension of codes after transforming from words")
@@ -47,6 +48,7 @@ private[feature] trait Word2VecBase extends Params
/**
* Number of partitions for sentences of words.
+ * @group param
*/
final val numPartitions = new IntParam(
this, "numPartitions", "number of partitions for sentences of words")
@@ -58,6 +60,7 @@ private[feature] trait Word2VecBase extends Params
/**
* The minimum number of times a token must appear to be included in the word2vec model's
* vocabulary.
+ * @group param
*/
final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
"appear to be included in the word2vec model's vocabulary")


@@ -24,7 +24,7 @@ import scala.annotation.varargs
import scala.collection.mutable
import scala.collection.JavaConverters._
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
import org.apache.spark.ml.util.Identifiable
/**
@@ -92,9 +92,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
}
/**
+ * :: DeveloperApi ::
* Factory methods for common validation functions for [[Param.isValid]].
* The numerical methods only support Int, Long, Float, and Double.
*/
+@DeveloperApi
object ParamValidators {
/** (private[param]) Default validation always return true */
@@ -529,11 +531,13 @@ trait Params extends Identifiable with Serializable {
}
/**
+ * :: DeveloperApi ::
* Java-friendly wrapper for [[Params]].
* Java developers who need to extend [[Params]] should use this class instead.
* If you need to extend a abstract class which already extends [[Params]], then that abstract
* class should be Java-friendly as well.
*/
+@DeveloperApi
abstract class JavaParams extends Params
/**
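Marking `ParamValidators` as `DeveloperApi` signals that it is aimed at people implementing their own params rather than end users. A minimal sketch of that developer-facing use; the trait and param names here are illustrative only, not from this commit:

```scala
import org.apache.spark.ml.param.{IntParam, Params, ParamValidators}

// Illustrative only: a shared-param-style trait whose value is validated on set().
trait HasMinOccurrences extends Params {

  final val minOccurrences: IntParam = new IntParam(this, "minOccurrences",
    "minimum number of occurrences (must be > 0)", ParamValidators.gt(0))

  final def getMinOccurrences: Int = $(minOccurrences)
}
```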


@@ -22,6 +22,7 @@ import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
+import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.util.MLUtils
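With this import, `CrossValidator` now takes its metric from an `ml.evaluation.Evaluator`. A hedged wiring sketch (the estimator, evaluator and grid are passed in rather than assumed):

```scala
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator

// Builds a CrossValidator that maximizes the evaluator's metric over the grid.
def makeCrossValidator(
    estimator: Estimator[_],
    evaluator: Evaluator,
    grid: Array[ParamMap]): CrossValidator = {
  new CrossValidator()
    .setEstimator(estimator)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(grid)
    .setNumFolds(3)
}
```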


@@ -19,18 +19,14 @@ package org.apache.spark.ml.util
import scala.collection.immutable.HashMap
-import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.attribute._
import org.apache.spark.sql.types.StructField
/**
- * :: Experimental ::
- *
* Helper utilities for tree-based algorithms
*/
-@Experimental
-object MetadataUtils {
+private[spark] object MetadataUtils {
/**
* Examine a schema to identify the number of classes in a label column.


@@ -17,15 +17,13 @@
package org.apache.spark.ml.util
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.types.{DataType, StructField, StructType}
/**
- * :: DeveloperApi ::
* Utils for handling schemas.
*/
-@DeveloperApi
-object SchemaUtils {
+private[spark] object SchemaUtils {
// TODO: Move the utility methods to SQL.


@@ -15,6 +15,6 @@
# limitations under the License.
#
-from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel, Evaluator
+from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel
-__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel", "Evaluator"]
+__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel"]


@@ -15,13 +15,72 @@
# limitations under the License.
#
-from pyspark.ml.wrapper import JavaEvaluator
+from abc import abstractmethod, ABCMeta
+from pyspark.ml.wrapper import JavaWrapper
from pyspark.ml.param import Param, Params
from pyspark.ml.param.shared import HasLabelCol, HasRawPredictionCol
from pyspark.ml.util import keyword_only
from pyspark.mllib.common import inherit_doc
-__all__ = ['BinaryClassificationEvaluator']
+__all__ = ['Evaluator', 'BinaryClassificationEvaluator']
+@inherit_doc
+class Evaluator(Params):
+    """
+    Base class for evaluators that compute metrics from predictions.
+    """
+
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+
+        :param dataset: a dataset that contains labels/observations and
+               predictions
+        :return: metric
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, dataset, params={}):
+        """
+        Evaluates the output with optional parameters.
+
+        :param dataset: a dataset that contains labels/observations and
+               predictions
+        :param params: an optional param map that overrides embedded
+                       params
+        :return: metric
+        """
+        if isinstance(params, dict):
+            if params:
+                return self.copy(params)._evaluate(dataset)
+            else:
+                return self._evaluate(dataset)
+        else:
+            raise ValueError("Params must be a param map but got %s." % type(params))
+
+
+@inherit_doc
+class JavaEvaluator(Evaluator, JavaWrapper):
+    """
+    Base class for :py:class:`Evaluator`s that wrap Java/Scala
+    implementations.
+    """
+
+    __metaclass__ = ABCMeta
+
+    def _evaluate(self, dataset):
+        """
+        Evaluates the output.
+
+        :param dataset: a dataset that contains labels/observations and predictions.
+        :return: evaluation metric
+        """
+        self._transfer_params_to_java()
+        return self._java_obj.evaluate(dataset._jdf)
+
@inherit_doc


@@ -219,40 +219,3 @@ class PipelineModel(Model):
    def copy(self, extra={}):
        stages = [stage.copy(extra) for stage in self.stages]
        return PipelineModel(stages)
-
-
-class Evaluator(Params):
-    """
-    Base class for evaluators that compute metrics from predictions.
-    """
-
-    __metaclass__ = ABCMeta
-
-    @abstractmethod
-    def _evaluate(self, dataset):
-        """
-        Evaluates the output.
-
-        :param dataset: a dataset that contains labels/observations and
-               predictions
-        :return: metric
-        """
-        raise NotImplementedError()
-
-    def evaluate(self, dataset, params={}):
-        """
-        Evaluates the output with optional parameters.
-
-        :param dataset: a dataset that contains labels/observations and
-               predictions
-        :param params: an optional param map that overrides embedded
-                       params
-        :return: metric
-        """
-        if isinstance(params, dict):
-            if params:
-                return self.copy(params)._evaluate(dataset)
-            else:
-                return self._evaluate(dataset)
-        else:
-            raise ValueError("Params must be a param map but got %s." % type(params))


@@ -20,7 +20,7 @@ from abc import ABCMeta
from pyspark import SparkContext
from pyspark.sql import DataFrame
from pyspark.ml.param import Params
-from pyspark.ml.pipeline import Estimator, Transformer, Evaluator, Model
+from pyspark.ml.pipeline import Estimator, Transformer, Model
from pyspark.mllib.common import inherit_doc, _java2py, _py2java
@@ -185,22 +185,3 @@ class JavaModel(Model, JavaTransformer):
        sc = SparkContext._active_spark_context
        java_args = [_py2java(sc, arg) for arg in args]
        return _java2py(sc, m(*java_args))
-
-
-@inherit_doc
-class JavaEvaluator(Evaluator, JavaWrapper):
-    """
-    Base class for :py:class:`Evaluator`s that wrap Java/Scala
-    implementations.
-    """
-
-    __metaclass__ = ABCMeta
-
-    def _evaluate(self, dataset):
-        """
-        Evaluates the output.
-
-        :param dataset: a dataset that contains labels/observations and predictions.
-        :return: evaluation metric
-        """
-        self._transfer_params_to_java()
-        return self._java_obj.evaluate(dataset._jdf)