[SPARK-12349][ML] Make spark.ml PCAModel load backwards compatible

Only load explainedVariance in PCAModel if it was written with Spark > 1.6.x
jkbradley is this kind of what you had in mind?

Author: Sean Owen <sowen@cloudera.com>

Closes #10327 from srowen/SPARK-12349.
Committed by Sean Owen on 2015-12-21 10:21:22 +00:00
commit d0f695089e (parent ce1798b3af)

@@ -167,14 +167,37 @@ object PCAModel extends MLReadable[PCAModel] {
    private val className = classOf[PCAModel].getName

    /**
     * Loads a [[PCAModel]] from data located at the input path. Note that the model includes an
     * `explainedVariance` member that is not recorded by Spark 1.6 and earlier. A model
     * can be loaded from such older data but will have an empty vector for
     * `explainedVariance`.
     *
     * @param path path to serialized model data
     * @return a [[PCAModel]]
     */
    override def load(path: String): PCAModel = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      // explainedVariance field is not present in Spark <= 1.6
      val versionRegex = "([0-9]+)\\.([0-9]+).*".r
      val hasExplainedVariance = metadata.sparkVersion match {
        case versionRegex(major, minor) =>
          major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6)
        case _ => false
      }

      val dataPath = new Path(path, "data").toString
      val model = if (hasExplainedVariance) {
        val Row(pc: DenseMatrix, explainedVariance: DenseVector) =
          sqlContext.read.parquet(dataPath)
            .select("pc", "explainedVariance")
            .head()
        new PCAModel(metadata.uid, pc, explainedVariance)
      } else {
        // Older data has no explainedVariance column; substitute an empty vector.
        val Row(pc: DenseMatrix) = sqlContext.read.parquet(dataPath).select("pc").head()
        new PCAModel(metadata.uid, pc, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector])
      }
      DefaultParamsReader.getAndSetParams(model, metadata)
      model
    }
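
For illustration, a minimal, self-contained sketch of the version gate above, runnable outside Spark. The regex and comparison mirror the patch; the object and method names (`VersionGateDemo`, `hasExplainedVariance`) are hypothetical and not part of the Spark API:

object VersionGateDemo {
  // Same pattern as in the patch: capture major and minor versions,
  // tolerating any suffix such as ".0" or "-SNAPSHOT".
  private val versionRegex = "([0-9]+)\\.([0-9]+).*".r

  // True when the writing version records explainedVariance:
  // any 2.x (or later) release, or a 1.x release newer than 1.6.
  def hasExplainedVariance(sparkVersion: String): Boolean = sparkVersion match {
    case versionRegex(major, minor) =>
      major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6)
    case _ => false
  }

  def main(args: Array[String]): Unit = {
    assert(!hasExplainedVariance("1.6.0"))
    assert(!hasExplainedVariance("1.6.3-SNAPSHOT"))
    assert(hasExplainedVariance("1.7.0"))
    assert(hasExplainedVariance("2.0.0"))
    assert(!hasExplainedVariance("not-a-version"))
    println("All version checks passed.")
  }
}

Note that the capture groups must be `([0-9]+)`, not `([0-9])+`: the latter matches multi-digit components but captures only the last digit, so a hypothetical "1.16" would be parsed as minor version 6.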