[SPARK-16328][ML][MLLIB][PYSPARK] Add 'asML' and 'fromML' conversion methods to PySpark linalg
The move to `ml.linalg` created `asML`/`fromML` utility methods in Scala/Java for converting between representations. These are missing in Python; this PR adds them. ## How was this patch tested? New doctests. Author: Nick Pentreath <nickp@za.ibm.com> Closes #13997 from MLnick/SPARK-16328-python-linalg-convert.
This commit is contained in:
parent
85f2303eca
commit
dab1051613
|
@ -39,6 +39,7 @@ else:
|
|||
import numpy as np
|
||||
|
||||
from pyspark import since
|
||||
from pyspark.ml import linalg as newlinalg
|
||||
from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
|
||||
IntegerType, ByteType, BooleanType
|
||||
|
||||
|
@ -247,6 +248,15 @@ class Vector(object):
|
|||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def asML(self):
    """
    Return this vector in the new mllib-local representation.

    This base implementation only raises; the concrete vector classes
    supply the actual conversion. The conversion copies references,
    NOT the underlying data.

    :return: :py:class:`pyspark.ml.linalg.Vector`
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
|
||||
|
||||
|
||||
class DenseVector(Vector):
|
||||
"""
|
||||
|
@ -408,6 +418,17 @@ class DenseVector(Vector):
|
|||
"""
|
||||
return self.array
|
||||
|
||||
def asML(self):
    """
    Return this dense vector in the new mllib-local representation.

    The result is built from this vector's ``array``; references are
    copied, the data is NOT.

    :return: :py:class:`pyspark.ml.linalg.DenseVector`

    .. versionadded:: 2.0.0
    """
    backing_array = self.array
    return newlinalg.DenseVector(backing_array)
|
||||
|
||||
@property
|
||||
def values(self):
|
||||
"""
|
||||
|
@ -737,6 +758,17 @@ class SparseVector(Vector):
|
|||
arr[self.indices] = self.values
|
||||
return arr
|
||||
|
||||
def asML(self):
    """
    Return this sparse vector in the new mllib-local representation.

    The result is built from this vector's ``size``, ``indices`` and
    ``values``; references are copied, the data is NOT.

    :return: :py:class:`pyspark.ml.linalg.SparseVector`

    .. versionadded:: 2.0.0
    """
    size, indices, values = self.size, self.indices, self.values
    return newlinalg.SparseVector(size, indices, values)
|
||||
|
||||
def __len__(self):
    """Return the vector's declared ``size``."""
    declared_size = self.size
    return declared_size
|
||||
|
||||
|
@ -845,6 +877,24 @@ class Vectors(object):
|
|||
elements = elements[0]
|
||||
return DenseVector(elements)
|
||||
|
||||
@staticmethod
def fromML(vec):
    """
    Build an mllib vector from a vector in the new mllib-local representation.

    Dense input yields a dense vector and sparse input a sparse one.
    References are copied, the data is NOT.

    :param vec: a :py:class:`pyspark.ml.linalg.Vector`
    :return: a :py:class:`pyspark.mllib.linalg.Vector`
    :raises TypeError: if ``vec`` is neither a dense nor a sparse ml vector

    .. versionadded:: 2.0.0
    """
    # Guard clauses: handle each supported concrete type and return early.
    if isinstance(vec, newlinalg.DenseVector):
        return DenseVector(vec.array)
    if isinstance(vec, newlinalg.SparseVector):
        return SparseVector(vec.size, vec.indices, vec.values)
    raise TypeError("Unsupported vector type %s" % type(vec))
|
||||
|
||||
@staticmethod
|
||||
def stringify(vector):
|
||||
"""
|
||||
|
@ -945,6 +995,13 @@ class Matrix(object):
|
|||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def asML(self):
    """
    Return this matrix in the new mllib-local representation.

    This base implementation only raises; the concrete matrix classes
    supply the actual conversion. The conversion copies references,
    NOT the underlying data.

    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError()
|
||||
|
||||
@staticmethod
|
||||
def _convert_to_array(array_like, dtype):
|
||||
"""
|
||||
|
@ -1044,6 +1101,17 @@ class DenseMatrix(Matrix):
|
|||
|
||||
return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
|
||||
|
||||
def asML(self):
    """
    Return this dense matrix in the new mllib-local representation.

    Dimensions, values and the transposed flag are passed through
    unchanged; references are copied, the data is NOT.

    :return: :py:class:`pyspark.ml.linalg.DenseMatrix`

    .. versionadded:: 2.0.0
    """
    rows, cols = self.numRows, self.numCols
    return newlinalg.DenseMatrix(rows, cols, self.values, self.isTransposed)
|
||||
|
||||
def __getitem__(self, indices):
|
||||
i, j = indices
|
||||
if i < 0 or i >= self.numRows:
|
||||
|
@ -1216,6 +1284,18 @@ class SparseMatrix(Matrix):
|
|||
densevals = np.ravel(self.toArray(), order='F')
|
||||
return DenseMatrix(self.numRows, self.numCols, densevals)
|
||||
|
||||
def asML(self):
    """
    Return this sparse matrix in the new mllib-local representation.

    Dimensions, column pointers, row indices, values and the transposed
    flag are passed through unchanged; references are copied, the data
    is NOT.

    :return: :py:class:`pyspark.ml.linalg.SparseMatrix`

    .. versionadded:: 2.0.0
    """
    rows, cols = self.numRows, self.numCols
    col_ptrs, row_indices = self.colPtrs, self.rowIndices
    return newlinalg.SparseMatrix(
        rows, cols, col_ptrs, row_indices, self.values, self.isTransposed)
|
||||
|
||||
# TODO: More efficient implementation:
def __eq__(self, other):
    """
    Test whether two matrices have the same shape and the same entries.

    Both operands are expanded with ``toArray()`` and compared as dense
    arrays.

    :param other: an object exposing ``toArray()``
    :return: ``True`` if the dense forms match in shape and content,
             ``False`` otherwise
    """
    # np.array_equal checks the shapes before comparing elements.
    # np.all(a == b) would broadcast instead, so e.g. a 2x1 and a 1x2
    # matrix holding the same value would wrongly compare equal.
    return np.array_equal(self.toArray(), other.toArray())
|
||||
|
@ -1236,6 +1316,25 @@ class Matrices(object):
|
|||
"""
|
||||
return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
|
||||
|
||||
@staticmethod
def fromML(mat):
    """
    Build an mllib matrix from a matrix in the new mllib-local representation.

    Dense input yields a dense matrix and sparse input a sparse one,
    with the transposed flag carried over. References are copied, the
    data is NOT.

    :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
    :return: a :py:class:`pyspark.mllib.linalg.Matrix`
    :raises TypeError: if ``mat`` is neither a dense nor a sparse ml matrix

    .. versionadded:: 2.0.0
    """
    # Guard clauses: handle each supported concrete type and return early.
    if isinstance(mat, newlinalg.DenseMatrix):
        return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)
    if isinstance(mat, newlinalg.SparseMatrix):
        return SparseMatrix(
            mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,
            mat.values, mat.isTransposed)
    raise TypeError("Unsupported matrix type %s" % type(mat))
|
||||
|
||||
|
||||
class QRDecomposition(object):
|
||||
"""
|
||||
|
|
|
@ -49,6 +49,7 @@ else:
|
|||
import unittest
|
||||
|
||||
from pyspark import SparkContext
|
||||
import pyspark.ml.linalg as newlinalg
|
||||
from pyspark.mllib.common import _to_java_object_rdd
|
||||
from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel
|
||||
from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
|
||||
|
@ -423,6 +424,74 @@ class VectorTests(MLlibTestCase):
|
|||
tmp = SparseVector(4, [0, 2], [3, 0])
|
||||
self.assertEqual(tmp.numNonzeros(), 1)
|
||||
|
||||
def test_ml_mllib_vector_conversion(self):
    """Check dense and sparse vector conversion in both directions."""
    # ---- mllib -> ml ----
    # dense
    old_dense = Vectors.dense([1, 2, 3])
    expected_new_dense = newlinalg.Vectors.dense([1, 2, 3])
    converted_new_dense = old_dense.asML()
    self.assertEqual(converted_new_dense, expected_new_dense)
    # sparse
    old_sparse = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    expected_new_sparse = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    converted_new_sparse = old_sparse.asML()
    self.assertEqual(converted_new_sparse, expected_new_sparse)

    # ---- ml -> mllib ----
    # dense
    expected_old_dense = Vectors.dense([1, 2, 3])
    new_dense = newlinalg.Vectors.dense([1, 2, 3])
    converted_old_dense = Vectors.fromML(new_dense)
    self.assertEqual(expected_old_dense, converted_old_dense)
    # sparse
    expected_old_sparse = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    new_sparse = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    converted_old_sparse = Vectors.fromML(new_sparse)
    self.assertEqual(expected_old_sparse, converted_old_sparse)
|
||||
|
||||
def test_ml_mllib_matrix_conversion(self):
    """Check dense and sparse matrix conversion, plain and transposed, in both directions."""
    # ---- mllib -> ml ----
    # dense
    old_dense = Matrices.dense(2, 2, [0, 1, 2, 3])
    expected_new_dense = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
    converted_new_dense = old_dense.asML()
    self.assertEqual(converted_new_dense, expected_new_dense)
    # dense, transposed
    old_dense_t = DenseMatrix(2, 2, [0, 1, 2, 3], True)
    expected_new_dense_t = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
    converted_new_dense_t = old_dense_t.asML()
    self.assertEqual(converted_new_dense_t, expected_new_dense_t)
    # sparse
    old_sparse = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    expected_new_sparse = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    converted_new_sparse = old_sparse.asML()
    self.assertEqual(converted_new_sparse, expected_new_sparse)
    # sparse, transposed
    old_sparse_t = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    expected_new_sparse_t = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    converted_new_sparse_t = old_sparse_t.asML()
    self.assertEqual(converted_new_sparse_t, expected_new_sparse_t)

    # ---- ml -> mllib ----
    # dense
    expected_old_dense = Matrices.dense(2, 2, [1, 2, 3, 4])
    new_dense = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
    converted_old_dense = Matrices.fromML(new_dense)
    self.assertEqual(expected_old_dense, converted_old_dense)
    # dense, transposed
    expected_old_dense_t = DenseMatrix(2, 2, [1, 2, 3, 4], True)
    new_dense_t = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
    converted_old_dense_t = Matrices.fromML(new_dense_t)
    self.assertEqual(expected_old_dense_t, converted_old_dense_t)
    # sparse
    expected_old_sparse = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    new_sparse = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    converted_old_sparse = Matrices.fromML(new_sparse)
    self.assertEqual(expected_old_sparse, converted_old_sparse)
    # sparse, transposed
    expected_old_sparse_t = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    new_sparse_t = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    converted_old_sparse_t = Matrices.fromML(new_sparse_t)
    self.assertEqual(expected_old_sparse_t, converted_old_sparse_t)
|
||||
|
||||
|
||||
class ListTests(MLlibTestCase):
|
||||
|
||||
|
|
Loading…
Reference in a new issue