[SPARK-16328][ML][MLLIB][PYSPARK] Add 'asML' and 'fromML' conversion methods to PySpark linalg

The move to `ml.linalg` created `asML`/`fromML` utility methods in Scala/Java for converting between the `mllib` and `ml` representations. These methods are missing in Python; this PR adds them.

## How was this patch tested?

New doctests.

Author: Nick Pentreath <nickp@za.ibm.com>

Closes #13997 from MLnick/SPARK-16328-python-linalg-convert.
This commit is contained in:
Nick Pentreath 2016-06-30 17:52:15 -07:00 committed by Joseph K. Bradley
parent 85f2303eca
commit dab1051613
2 changed files with 168 additions and 0 deletions

View file

@ -39,6 +39,7 @@ else:
import numpy as np
from pyspark import since
from pyspark.ml import linalg as newlinalg
from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
IntegerType, ByteType, BooleanType
@ -247,6 +248,15 @@ class Vector(object):
"""
raise NotImplementedError
def asML(self):
    """
    Convert this vector to the new mllib-local representation.

    The conversion shares the underlying data with this vector: only
    references are copied, never the data itself.

    Abstract hook; concrete vector classes provide the implementation.

    :return: :py:class:`pyspark.ml.linalg.Vector`

    .. versionadded:: 2.0.0
    """
    raise NotImplementedError()
class DenseVector(Vector):
"""
@ -408,6 +418,17 @@ class DenseVector(Vector):
"""
return self.array
def asML(self):
    """
    Convert this dense vector to the new mllib-local representation.

    Only the reference to the backing array is handed over; the data
    itself is NOT copied.

    :return: :py:class:`pyspark.ml.linalg.DenseVector`

    .. versionadded:: 2.0.0
    """
    backing_array = self.array
    return newlinalg.DenseVector(backing_array)
@property
def values(self):
"""
@ -737,6 +758,17 @@ class SparseVector(Vector):
arr[self.indices] = self.values
return arr
def asML(self):
    """
    Convert this sparse vector to the new mllib-local representation.

    The size, indices and values are passed through by reference; the
    data itself is NOT copied.

    :return: :py:class:`pyspark.ml.linalg.SparseVector`

    .. versionadded:: 2.0.0
    """
    size, indices, values = self.size, self.indices, self.values
    return newlinalg.SparseVector(size, indices, values)
def __len__(self):
return self.size
@ -845,6 +877,24 @@ class Vectors(object):
elements = elements[0]
return DenseVector(elements)
@staticmethod
def fromML(vec):
    """
    Convert a vector from the new mllib-local representation.

    Only references are carried over; the data itself is NOT copied.

    :param vec: a :py:class:`pyspark.ml.linalg.Vector`
    :return: a :py:class:`pyspark.mllib.linalg.Vector`
    :raises TypeError: if ``vec`` is neither a dense nor a sparse
        ``pyspark.ml.linalg`` vector

    .. versionadded:: 2.0.0
    """
    # Guard clauses: handle each supported concrete type and return early.
    if isinstance(vec, newlinalg.DenseVector):
        return DenseVector(vec.array)
    if isinstance(vec, newlinalg.SparseVector):
        return SparseVector(vec.size, vec.indices, vec.values)
    raise TypeError("Unsupported vector type %s" % type(vec))
@staticmethod
def stringify(vector):
"""
@ -945,6 +995,13 @@ class Matrix(object):
"""
raise NotImplementedError
def asML(self):
    """
    Convert this matrix to the new mllib-local representation.

    The conversion shares the underlying data with this matrix: only
    references are copied, never the data itself.

    Abstract hook; concrete matrix classes provide the implementation.

    :return: :py:class:`pyspark.ml.linalg.Matrix`

    .. versionadded:: 2.0.0
    """
    raise NotImplementedError()
@staticmethod
def _convert_to_array(array_like, dtype):
"""
@ -1044,6 +1101,17 @@ class DenseMatrix(Matrix):
return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
def asML(self):
    """
    Convert this dense matrix to the new mllib-local representation.

    The shape, the values and the transposition flag are passed through
    as-is; the data itself is NOT copied, only references are.

    :return: :py:class:`pyspark.ml.linalg.DenseMatrix`

    .. versionadded:: 2.0.0
    """
    rows, cols = self.numRows, self.numCols
    values, transposed = self.values, self.isTransposed
    return newlinalg.DenseMatrix(rows, cols, values, transposed)
def __getitem__(self, indices):
i, j = indices
if i < 0 or i >= self.numRows:
@ -1216,6 +1284,18 @@ class SparseMatrix(Matrix):
densevals = np.ravel(self.toArray(), order='F')
return DenseMatrix(self.numRows, self.numCols, densevals)
def asML(self):
    """
    Convert this sparse matrix to the new mllib-local representation.

    The shape, column pointers, row indices, values and transposition
    flag are passed through as-is; the data itself is NOT copied, only
    references are.

    :return: :py:class:`pyspark.ml.linalg.SparseMatrix`

    .. versionadded:: 2.0.0
    """
    rows, cols = self.numRows, self.numCols
    col_ptrs, row_indices = self.colPtrs, self.rowIndices
    values, transposed = self.values, self.isTransposed
    return newlinalg.SparseMatrix(rows, cols, col_ptrs, row_indices, values, transposed)
# TODO: More efficient implementation:
def __eq__(self, other):
return np.all(self.toArray() == other.toArray())
@ -1236,6 +1316,25 @@ class Matrices(object):
"""
return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
@staticmethod
def fromML(mat):
    """
    Convert a matrix from the new mllib-local representation.

    Only references are carried over; the data itself is NOT copied.

    :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
    :return: a :py:class:`pyspark.mllib.linalg.Matrix`
    :raises TypeError: if ``mat`` is neither a dense nor a sparse
        ``pyspark.ml.linalg`` matrix

    .. versionadded:: 2.0.0
    """
    # Guard clauses: handle each supported concrete type and return early.
    if isinstance(mat, newlinalg.DenseMatrix):
        return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)
    if isinstance(mat, newlinalg.SparseMatrix):
        return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,
                            mat.values, mat.isTransposed)
    raise TypeError("Unsupported matrix type %s" % type(mat))
class QRDecomposition(object):
"""

View file

@ -49,6 +49,7 @@ else:
import unittest
from pyspark import SparkContext
import pyspark.ml.linalg as newlinalg
from pyspark.mllib.common import _to_java_object_rdd
from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
@ -423,6 +424,74 @@ class VectorTests(MLlibTestCase):
tmp = SparseVector(4, [0, 2], [3, 0])
self.assertEqual(tmp.numNonzeros(), 1)
def test_ml_mllib_vector_conversion(self):
    """Dense and sparse vectors convert mllib -> ml and ml -> mllib."""
    # mllib -> ml, dense
    mllib_dense = Vectors.dense([1, 2, 3])
    expected_ml_dense = newlinalg.Vectors.dense([1, 2, 3])
    self.assertEqual(mllib_dense.asML(), expected_ml_dense)

    # mllib -> ml, sparse
    mllib_sparse = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    expected_ml_sparse = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    self.assertEqual(mllib_sparse.asML(), expected_ml_sparse)

    # ml -> mllib, dense
    expected_mllib_dense = Vectors.dense([1, 2, 3])
    ml_dense = newlinalg.Vectors.dense([1, 2, 3])
    self.assertEqual(expected_mllib_dense, Vectors.fromML(ml_dense))

    # ml -> mllib, sparse
    expected_mllib_sparse = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    ml_sparse = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    self.assertEqual(expected_mllib_sparse, Vectors.fromML(ml_sparse))
def test_ml_mllib_matrix_conversion(self):
    """
    Check that dense and sparse matrices, plain and transposed, convert
    between the mllib and ml linalg representations in both directions.

    Each converted matrix is checked for its concrete class as well as
    for value equality. Value equality alone is not sufficient: the mllib
    ``SparseMatrix.__eq__`` compares only ``toArray()`` output, so a
    conversion returning a matrix from the wrong package could still
    compare equal.
    """
    # to ml
    # dense
    mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM2 = mllibDM.asML()
    self.assertIsInstance(mlDM2, newlinalg.DenseMatrix)
    self.assertEqual(mlDM2, mlDM1)
    # transposed
    mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt2 = mllibDMt.asML()
    self.assertIsInstance(mlDMt2, newlinalg.DenseMatrix)
    self.assertEqual(mlDMt2, mlDMt1)
    # sparse
    mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM2 = mllibSM.asML()
    self.assertIsInstance(mlSM2, newlinalg.SparseMatrix)
    self.assertEqual(mlSM2, mlSM1)
    # transposed
    mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt2 = mllibSMt.asML()
    self.assertIsInstance(mlSMt2, newlinalg.SparseMatrix)
    self.assertEqual(mlSMt2, mlSMt1)

    # from ml
    # dense
    mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
    mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
    mllibDM2 = Matrices.fromML(mlDM)
    self.assertIsInstance(mllibDM2, DenseMatrix)
    self.assertEqual(mllibDM1, mllibDM2)
    # transposed
    mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mllibDMt2 = Matrices.fromML(mlDMt)
    self.assertIsInstance(mllibDMt2, DenseMatrix)
    self.assertEqual(mllibDMt1, mllibDMt2)
    # sparse
    mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mllibSM2 = Matrices.fromML(mlSM)
    self.assertIsInstance(mllibSM2, SparseMatrix)
    self.assertEqual(mllibSM1, mllibSM2)
    # transposed
    mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mllibSMt2 = Matrices.fromML(mlSMt)
    self.assertIsInstance(mllibSMt2, SparseMatrix)
    self.assertEqual(mllibSMt1, mllibSMt2)
class ListTests(MLlibTestCase):