From dab10516138867b7c4fc6d42168497e82853b539 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 30 Jun 2016 17:52:15 -0700 Subject: [PATCH] [SPARK-16328][ML][MLLIB][PYSPARK] Add 'asML' and 'fromML' conversion methods to PySpark linalg The move to `ml.linalg` created `asML`/`fromML` utility methods in Scala/Java for converting between representations. These are missing in Python; this PR adds them. ## How was this patch tested? New doctests. Author: Nick Pentreath Closes #13997 from MLnick/SPARK-16328-python-linalg-convert. --- python/pyspark/mllib/linalg/__init__.py | 99 +++++++++++++++++++++++++ python/pyspark/mllib/tests.py | 69 +++++++++++++++++ 2 files changed, 168 insertions(+) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 3a345b2b56..15dc53a959 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -39,6 +39,7 @@ else: import numpy as np from pyspark import since +from pyspark.ml import linalg as newlinalg from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ IntegerType, ByteType, BooleanType @@ -247,6 +248,15 @@ class Vector(object): """ raise NotImplementedError + def asML(self): + """ + Convert this vector to the new mllib-local representation. + This does NOT copy the data; it copies references. + + :return: :py:class:`pyspark.ml.linalg.Vector` + """ + raise NotImplementedError + class DenseVector(Vector): """ @@ -408,6 +418,17 @@ class DenseVector(Vector): """ return self.array + def asML(self): + """ + Convert this vector to the new mllib-local representation. + This does NOT copy the data; it copies references. + + :return: :py:class:`pyspark.ml.linalg.DenseVector` + + .. 
versionadded:: 2.0.0 + """ + return newlinalg.DenseVector(self.array) + @property def values(self): """ @@ -737,6 +758,17 @@ class SparseVector(Vector): arr[self.indices] = self.values return arr + def asML(self): + """ + Convert this vector to the new mllib-local representation. + This does NOT copy the data; it copies references. + + :return: :py:class:`pyspark.ml.linalg.SparseVector` + + .. versionadded:: 2.0.0 + """ + return newlinalg.SparseVector(self.size, self.indices, self.values) + def __len__(self): return self.size @@ -845,6 +877,24 @@ class Vectors(object): elements = elements[0] return DenseVector(elements) + @staticmethod + def fromML(vec): + """ + Convert a vector from the new mllib-local representation. + This does NOT copy the data; it copies references. + + :param vec: a :py:class:`pyspark.ml.linalg.Vector` + :return: a :py:class:`pyspark.mllib.linalg.Vector` + + .. versionadded:: 2.0.0 + """ + if isinstance(vec, newlinalg.DenseVector): + return DenseVector(vec.array) + elif isinstance(vec, newlinalg.SparseVector): + return SparseVector(vec.size, vec.indices, vec.values) + else: + raise TypeError("Unsupported vector type %s" % type(vec)) + @staticmethod def stringify(vector): """ @@ -945,6 +995,13 @@ class Matrix(object): """ raise NotImplementedError + def asML(self): + """ + Convert this matrix to the new mllib-local representation. + This does NOT copy the data; it copies references. + """ + raise NotImplementedError + @staticmethod def _convert_to_array(array_like, dtype): """ @@ -1044,6 +1101,17 @@ class DenseMatrix(Matrix): return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values) + def asML(self): + """ + Convert this matrix to the new mllib-local representation. + This does NOT copy the data; it copies references. + + :return: :py:class:`pyspark.ml.linalg.DenseMatrix` + + .. 
versionadded:: 2.0.0 + """ + return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed) + def __getitem__(self, indices): i, j = indices if i < 0 or i >= self.numRows: @@ -1216,6 +1284,18 @@ class SparseMatrix(Matrix): densevals = np.ravel(self.toArray(), order='F') return DenseMatrix(self.numRows, self.numCols, densevals) + def asML(self): + """ + Convert this matrix to the new mllib-local representation. + This does NOT copy the data; it copies references. + + :return: :py:class:`pyspark.ml.linalg.SparseMatrix` + + .. versionadded:: 2.0.0 + """ + return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices, + self.values, self.isTransposed) + # TODO: More efficient implementation: def __eq__(self, other): return np.all(self.toArray() == other.toArray()) @@ -1236,6 +1316,25 @@ class Matrices(object): """ return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values) + @staticmethod + def fromML(mat): + """ + Convert a matrix from the new mllib-local representation. + This does NOT copy the data; it copies references. + + :param mat: a :py:class:`pyspark.ml.linalg.Matrix` + :return: a :py:class:`pyspark.mllib.linalg.Matrix` + + .. 
versionadded:: 2.0.0 + """ + if isinstance(mat, newlinalg.DenseMatrix): + return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) + elif isinstance(mat, newlinalg.SparseMatrix): + return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices, + mat.values, mat.isTransposed) + else: + raise TypeError("Unsupported matrix type %s" % type(mat)) + class QRDecomposition(object): """ diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 74cf7bb8ea..72fa8b5f3d 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -49,6 +49,7 @@ else: import unittest from pyspark import SparkContext +import pyspark.ml.linalg as newlinalg from pyspark.mllib.common import _to_java_object_rdd from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\ @@ -423,6 +424,74 @@ class VectorTests(MLlibTestCase): tmp = SparseVector(4, [0, 2], [3, 0]) self.assertEqual(tmp.numNonzeros(), 1) + def test_ml_mllib_vector_conversion(self): + # to ml + # dense + mllibDV = Vectors.dense([1, 2, 3]) + mlDV1 = newlinalg.Vectors.dense([1, 2, 3]) + mlDV2 = mllibDV.asML() + self.assertEqual(mlDV2, mlDV1) + # sparse + mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mlSV2 = mllibSV.asML() + self.assertEqual(mlSV2, mlSV1) + # from ml + # dense + mllibDV1 = Vectors.dense([1, 2, 3]) + mlDV = newlinalg.Vectors.dense([1, 2, 3]) + mllibDV2 = Vectors.fromML(mlDV) + self.assertEqual(mllibDV1, mllibDV2) + # sparse + mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mllibSV2 = Vectors.fromML(mlSV) + self.assertEqual(mllibSV1, mllibSV2) + + def test_ml_mllib_matrix_conversion(self): + # to ml + # dense + mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3]) + mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3]) + mlDM2 = 
mllibDM.asML() + self.assertEqual(mlDM2, mlDM1) + # transposed + mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True) + mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True) + mlDMt2 = mllibDMt.asML() + self.assertEqual(mlDMt2, mlDMt1) + # sparse + mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mlSM2 = mllibSM.asML() + self.assertEqual(mlSM2, mlSM1) + # transposed + mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mlSMt2 = mllibSMt.asML() + self.assertEqual(mlSMt2, mlSMt1) + # from ml + # dense + mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4]) + mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4]) + mllibDM2 = Matrices.fromML(mlDM) + self.assertEqual(mllibDM1, mllibDM2) + # transposed + mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True) + mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True) + mllibDMt2 = Matrices.fromML(mlDMt) + self.assertEqual(mllibDMt1, mllibDMt2) + # sparse + mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mllibSM2 = Matrices.fromML(mlSM) + self.assertEqual(mllibSM1, mllibSM2) + # transposed + mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mllibSMt2 = Matrices.fromML(mlSMt) + self.assertEqual(mllibSMt1, mllibSMt2) + class ListTests(MLlibTestCase):