44b7931936
### What changes were proposed in this pull request? This PR corrects some exception type when the function input params are failed to validate due to TypeError. In order to convenient to review, there are 3 commits in this PR: - Standardize input validation error type on sql - Standardize input validation error type on ml - Standardize input validation error type on pandas ### Why are the changes needed? As suggestion from Python exception doc [1]: "Raised when an operation or function is applied to an object of inappropriate type.", but there are many Value error are raised in some pyspark code, this patch fix them. [1] https://docs.python.org/3/library/exceptions.html#TypeError Note that: this patch only addresses the exsiting some wrong raise type for input validation, the input validation decorator/framework which mentioned in [SPARK-35176](https://issues.apache.org/jira/browse/SPARK-35176), would be submited in a speparated patch. ### Does this PR introduce _any_ user-facing change? Yes, code can raise the right TypeError instead of ValueError. ### How was this patch tested? Existing test case and UT Closes #32368 from Yikun/SPARK-35176. Authored-by: Yikun Jiang <yikunkero@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
662 lines
27 KiB
Python
662 lines
27 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import array as pyarray
|
|
import unittest
|
|
|
|
from numpy import array, array_equal, zeros, arange, tile, ones, inf
|
|
|
|
import pyspark.ml.linalg as newlinalg
|
|
from pyspark.serializers import PickleSerializer
|
|
from pyspark.mllib.linalg import ( # type: ignore[attr-defined]
|
|
Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,
|
|
DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
|
|
)
|
|
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix, IndexedRow
|
|
from pyspark.mllib.regression import LabeledPoint
|
|
from pyspark.sql import Row
|
|
from pyspark.testing.mllibutils import MLlibTestCase
|
|
from pyspark.testing.utils import have_scipy
|
|
|
|
|
|
class VectorTests(MLlibTestCase):
|
|
|
|
def _test_serialize(self, v):
|
|
ser = PickleSerializer()
|
|
self.assertEqual(v, ser.loads(ser.dumps(v)))
|
|
jvec = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(v)))
|
|
nv = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvec)))
|
|
self.assertEqual(v, nv)
|
|
vs = [v] * 100
|
|
jvecs = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(vs)))
|
|
nvs = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvecs)))
|
|
self.assertEqual(vs, nvs)
|
|
|
|
def test_serialize(self):
|
|
self._test_serialize(DenseVector(range(10)))
|
|
self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
|
|
self._test_serialize(DenseVector(pyarray.array('d', range(10))))
|
|
self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
|
|
self._test_serialize(SparseVector(3, {}))
|
|
self._test_serialize(DenseMatrix(2, 3, range(6)))
|
|
sm1 = SparseMatrix(
|
|
3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
|
|
self._test_serialize(sm1)
|
|
|
|
def test_dot(self):
|
|
sv = SparseVector(4, {1: 1, 3: 2})
|
|
dv = DenseVector(array([1., 2., 3., 4.]))
|
|
lst = DenseVector([1, 2, 3, 4])
|
|
mat = array([[1., 2., 3., 4.],
|
|
[1., 2., 3., 4.],
|
|
[1., 2., 3., 4.],
|
|
[1., 2., 3., 4.]])
|
|
arr = pyarray.array('d', [0, 1, 2, 3])
|
|
self.assertEqual(10.0, sv.dot(dv))
|
|
self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
|
|
self.assertEqual(30.0, dv.dot(dv))
|
|
self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
|
|
self.assertEqual(30.0, lst.dot(dv))
|
|
self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
|
|
self.assertEqual(7.0, sv.dot(arr))
|
|
|
|
def test_squared_distance(self):
|
|
def squared_distance(a, b):
|
|
if isinstance(a, Vector):
|
|
return a.squared_distance(b)
|
|
else:
|
|
return b.squared_distance(a)
|
|
|
|
sv = SparseVector(4, {1: 1, 3: 2})
|
|
dv = DenseVector(array([1., 2., 3., 4.]))
|
|
lst = DenseVector([4, 3, 2, 1])
|
|
lst1 = [4, 3, 2, 1]
|
|
arr = pyarray.array('d', [0, 2, 1, 3])
|
|
narr = array([0, 2, 1, 3])
|
|
self.assertEqual(15.0, squared_distance(sv, dv))
|
|
self.assertEqual(25.0, squared_distance(sv, lst))
|
|
self.assertEqual(20.0, squared_distance(dv, lst))
|
|
self.assertEqual(15.0, squared_distance(dv, sv))
|
|
self.assertEqual(25.0, squared_distance(lst, sv))
|
|
self.assertEqual(20.0, squared_distance(lst, dv))
|
|
self.assertEqual(0.0, squared_distance(sv, sv))
|
|
self.assertEqual(0.0, squared_distance(dv, dv))
|
|
self.assertEqual(0.0, squared_distance(lst, lst))
|
|
self.assertEqual(25.0, squared_distance(sv, lst1))
|
|
self.assertEqual(3.0, squared_distance(sv, arr))
|
|
self.assertEqual(3.0, squared_distance(sv, narr))
|
|
|
|
def test_hash(self):
|
|
v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
|
|
v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
|
|
v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
|
|
v4 = SparseVector(4, [(1, 1.0), (3, 2.5)])
|
|
self.assertEqual(hash(v1), hash(v2))
|
|
self.assertEqual(hash(v1), hash(v3))
|
|
self.assertEqual(hash(v2), hash(v3))
|
|
self.assertFalse(hash(v1) == hash(v4))
|
|
self.assertFalse(hash(v2) == hash(v4))
|
|
|
|
def test_eq(self):
|
|
v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
|
|
v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
|
|
v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
|
|
v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
|
|
v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
|
|
v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
|
|
dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
|
|
sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
|
|
self.assertEqual(v1, v2)
|
|
self.assertEqual(v1, v3)
|
|
self.assertFalse(v2 == v4)
|
|
self.assertFalse(v1 == v5)
|
|
self.assertFalse(v1 == v6)
|
|
# this is done as Dense and Sparse matrices can be semantically
|
|
# equal while still implementing a different __eq__ method
|
|
self.assertEqual(dm1, sm1)
|
|
self.assertEqual(sm1, dm1)
|
|
|
|
def test_equals(self):
|
|
indices = [1, 2, 4]
|
|
values = [1., 3., 2.]
|
|
self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
|
|
self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
|
|
self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
|
|
self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))
|
|
|
|
def test_conversion(self):
|
|
# numpy arrays should be automatically upcast to float64
|
|
# tests for fix of [SPARK-5089]
|
|
v = array([1, 2, 3, 4], dtype='float64')
|
|
dv = DenseVector(v)
|
|
self.assertTrue(dv.array.dtype == 'float64')
|
|
v = array([1, 2, 3, 4], dtype='float32')
|
|
dv = DenseVector(v)
|
|
self.assertTrue(dv.array.dtype == 'float64')
|
|
|
|
def test_sparse_vector_indexing(self):
|
|
sv = SparseVector(5, {1: 1, 3: 2})
|
|
self.assertEqual(sv[0], 0.)
|
|
self.assertEqual(sv[3], 2.)
|
|
self.assertEqual(sv[1], 1.)
|
|
self.assertEqual(sv[2], 0.)
|
|
self.assertEqual(sv[4], 0.)
|
|
self.assertEqual(sv[-1], 0.)
|
|
self.assertEqual(sv[-2], 2.)
|
|
self.assertEqual(sv[-3], 0.)
|
|
self.assertEqual(sv[-5], 0.)
|
|
for ind in [5, -6]:
|
|
self.assertRaises(IndexError, sv.__getitem__, ind)
|
|
for ind in [7.8, '1']:
|
|
self.assertRaises(TypeError, sv.__getitem__, ind)
|
|
|
|
zeros = SparseVector(4, {})
|
|
self.assertEqual(zeros[0], 0.0)
|
|
self.assertEqual(zeros[3], 0.0)
|
|
for ind in [4, -5]:
|
|
self.assertRaises(IndexError, zeros.__getitem__, ind)
|
|
|
|
empty = SparseVector(0, {})
|
|
for ind in [-1, 0, 1]:
|
|
self.assertRaises(IndexError, empty.__getitem__, ind)
|
|
|
|
def test_sparse_vector_iteration(self):
|
|
self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0])
|
|
self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0])
|
|
|
|
def test_matrix_indexing(self):
|
|
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
|
|
expected = [[0, 6], [1, 8], [4, 10]]
|
|
for i in range(3):
|
|
for j in range(2):
|
|
self.assertEqual(mat[i, j], expected[i][j])
|
|
|
|
for i, j in [(-1, 0), (4, 1), (3, 4)]:
|
|
self.assertRaises(IndexError, mat.__getitem__, (i, j))
|
|
|
|
def test_repr_dense_matrix(self):
|
|
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
|
|
self.assertTrue(
|
|
repr(mat),
|
|
'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')
|
|
|
|
mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
|
|
self.assertTrue(
|
|
repr(mat),
|
|
'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')
|
|
|
|
mat = DenseMatrix(6, 3, zeros(18))
|
|
self.assertTrue(
|
|
repr(mat),
|
|
'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., \
|
|
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)')
|
|
|
|
def test_repr_sparse_matrix(self):
|
|
sm1t = SparseMatrix(
|
|
3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
|
|
isTransposed=True)
|
|
self.assertTrue(
|
|
repr(sm1t),
|
|
'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)')
|
|
|
|
indices = tile(arange(6), 3)
|
|
values = ones(18)
|
|
sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values)
|
|
self.assertTrue(
|
|
repr(sm), "SparseMatrix(6, 3, [0, 6, 12, 18], \
|
|
[0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], \
|
|
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., \
|
|
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)")
|
|
|
|
self.assertTrue(
|
|
str(sm),
|
|
"6 X 3 CSCMatrix\n\
|
|
(0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n\
|
|
(0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n\
|
|
(0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..")
|
|
|
|
sm = SparseMatrix(1, 18, zeros(19), [], [])
|
|
self.assertTrue(
|
|
repr(sm),
|
|
'SparseMatrix(1, 18, \
|
|
[0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')
|
|
|
|
def test_sparse_matrix(self):
|
|
# Test sparse matrix creation.
|
|
sm1 = SparseMatrix(
|
|
3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
|
|
self.assertEqual(sm1.numRows, 3)
|
|
self.assertEqual(sm1.numCols, 4)
|
|
self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
|
|
self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2])
|
|
self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
|
|
self.assertTrue(
|
|
repr(sm1),
|
|
'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')
|
|
|
|
# Test indexing
|
|
expected = [
|
|
[0, 0, 0, 0],
|
|
[1, 0, 4, 0],
|
|
[2, 0, 5, 0]]
|
|
|
|
for i in range(3):
|
|
for j in range(4):
|
|
self.assertEqual(expected[i][j], sm1[i, j])
|
|
self.assertTrue(array_equal(sm1.toArray(), expected))
|
|
|
|
for i, j in [(-1, 1), (4, 3), (3, 5)]:
|
|
self.assertRaises(IndexError, sm1.__getitem__, (i, j))
|
|
|
|
# Test conversion to dense and sparse.
|
|
smnew = sm1.toDense().toSparse()
|
|
self.assertEqual(sm1.numRows, smnew.numRows)
|
|
self.assertEqual(sm1.numCols, smnew.numCols)
|
|
self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs))
|
|
self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices))
|
|
self.assertTrue(array_equal(sm1.values, smnew.values))
|
|
|
|
sm1t = SparseMatrix(
|
|
3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
|
|
isTransposed=True)
|
|
self.assertEqual(sm1t.numRows, 3)
|
|
self.assertEqual(sm1t.numCols, 4)
|
|
self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5])
|
|
self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2])
|
|
self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0])
|
|
|
|
expected = [
|
|
[3, 2, 0, 0],
|
|
[0, 0, 4, 0],
|
|
[9, 0, 8, 0]]
|
|
|
|
for i in range(3):
|
|
for j in range(4):
|
|
self.assertEqual(expected[i][j], sm1t[i, j])
|
|
self.assertTrue(array_equal(sm1t.toArray(), expected))
|
|
|
|
def test_dense_matrix_is_transposed(self):
|
|
mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
|
|
mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
|
|
self.assertEqual(mat1, mat)
|
|
|
|
expected = [[0, 4], [1, 6], [3, 9]]
|
|
for i in range(3):
|
|
for j in range(2):
|
|
self.assertEqual(mat1[i, j], expected[i][j])
|
|
self.assertTrue(array_equal(mat1.toArray(), expected))
|
|
|
|
sm = mat1.toSparse()
|
|
self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
|
|
self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
|
|
self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
|
|
|
|
def test_parse_vector(self):
|
|
a = DenseVector([])
|
|
self.assertEqual(str(a), '[]')
|
|
self.assertEqual(Vectors.parse(str(a)), a)
|
|
a = DenseVector([3, 4, 6, 7])
|
|
self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
|
|
self.assertEqual(Vectors.parse(str(a)), a)
|
|
a = SparseVector(4, [], [])
|
|
self.assertEqual(str(a), '(4,[],[])')
|
|
self.assertEqual(SparseVector.parse(str(a)), a)
|
|
a = SparseVector(4, [0, 2], [3, 4])
|
|
self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
|
|
self.assertEqual(Vectors.parse(str(a)), a)
|
|
a = SparseVector(10, [0, 1], [4, 5])
|
|
self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
|
|
|
|
def test_norms(self):
|
|
a = DenseVector([0, 2, 3, -1])
|
|
self.assertAlmostEqual(a.norm(2), 3.742, 3)
|
|
self.assertTrue(a.norm(1), 6)
|
|
self.assertTrue(a.norm(inf), 3)
|
|
a = SparseVector(4, [0, 2], [3, -4])
|
|
self.assertAlmostEqual(a.norm(2), 5)
|
|
self.assertTrue(a.norm(1), 7)
|
|
self.assertTrue(a.norm(inf), 4)
|
|
|
|
tmp = SparseVector(4, [0, 2], [3, 0])
|
|
self.assertEqual(tmp.numNonzeros(), 1)
|
|
|
|
def test_ml_mllib_vector_conversion(self):
|
|
# to ml
|
|
# dense
|
|
mllibDV = Vectors.dense([1, 2, 3])
|
|
mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
|
|
mlDV2 = mllibDV.asML()
|
|
self.assertEqual(mlDV2, mlDV1)
|
|
# sparse
|
|
mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
|
|
mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
|
|
mlSV2 = mllibSV.asML()
|
|
self.assertEqual(mlSV2, mlSV1)
|
|
# from ml
|
|
# dense
|
|
mllibDV1 = Vectors.dense([1, 2, 3])
|
|
mlDV = newlinalg.Vectors.dense([1, 2, 3])
|
|
mllibDV2 = Vectors.fromML(mlDV)
|
|
self.assertEqual(mllibDV1, mllibDV2)
|
|
# sparse
|
|
mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
|
|
mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
|
|
mllibSV2 = Vectors.fromML(mlSV)
|
|
self.assertEqual(mllibSV1, mllibSV2)
|
|
|
|
def test_ml_mllib_matrix_conversion(self):
|
|
# to ml
|
|
# dense
|
|
mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
|
|
mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
|
|
mlDM2 = mllibDM.asML()
|
|
self.assertEqual(mlDM2, mlDM1)
|
|
# transposed
|
|
mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
|
|
mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
|
|
mlDMt2 = mllibDMt.asML()
|
|
self.assertEqual(mlDMt2, mlDMt1)
|
|
# sparse
|
|
mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
|
|
mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
|
|
mlSM2 = mllibSM.asML()
|
|
self.assertEqual(mlSM2, mlSM1)
|
|
# transposed
|
|
mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
|
|
mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
|
|
mlSMt2 = mllibSMt.asML()
|
|
self.assertEqual(mlSMt2, mlSMt1)
|
|
# from ml
|
|
# dense
|
|
mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
|
|
mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
|
|
mllibDM2 = Matrices.fromML(mlDM)
|
|
self.assertEqual(mllibDM1, mllibDM2)
|
|
# transposed
|
|
mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
|
|
mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
|
|
mllibDMt2 = Matrices.fromML(mlDMt)
|
|
self.assertEqual(mllibDMt1, mllibDMt2)
|
|
# sparse
|
|
mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
|
|
mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
|
|
mllibSM2 = Matrices.fromML(mlSM)
|
|
self.assertEqual(mllibSM1, mllibSM2)
|
|
# transposed
|
|
mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
|
|
mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
|
|
mllibSMt2 = Matrices.fromML(mlSMt)
|
|
self.assertEqual(mllibSMt1, mllibSMt2)
|
|
|
|
|
|
class VectorUDTTests(MLlibTestCase):
|
|
|
|
dv0 = DenseVector([])
|
|
dv1 = DenseVector([1.0, 2.0])
|
|
sv0 = SparseVector(2, [], [])
|
|
sv1 = SparseVector(2, [1], [2.0])
|
|
udt = VectorUDT()
|
|
|
|
def test_json_schema(self):
|
|
self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)
|
|
|
|
def test_serialization(self):
|
|
for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
|
|
self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))
|
|
|
|
def test_infer_schema(self):
|
|
rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
|
|
df = rdd.toDF()
|
|
schema = df.schema
|
|
field = [f for f in schema.fields if f.name == "features"][0]
|
|
self.assertEqual(field.dataType, self.udt)
|
|
vectors = df.rdd.map(lambda p: p.features).collect()
|
|
self.assertEqual(len(vectors), 2)
|
|
for v in vectors:
|
|
if isinstance(v, SparseVector):
|
|
self.assertEqual(v, self.sv1)
|
|
elif isinstance(v, DenseVector):
|
|
self.assertEqual(v, self.dv1)
|
|
else:
|
|
raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
|
|
|
|
def test_row_matrix_from_dataframe(self):
|
|
from pyspark.sql.utils import IllegalArgumentException
|
|
df = self.spark.createDataFrame([Row(Vectors.dense(1))])
|
|
row_matrix = RowMatrix(df)
|
|
self.assertEqual(row_matrix.numRows(), 1)
|
|
self.assertEqual(row_matrix.numCols(), 1)
|
|
with self.assertRaises(IllegalArgumentException):
|
|
RowMatrix(df.selectExpr("'monkey'"))
|
|
|
|
def test_indexed_row_matrix_from_dataframe(self):
|
|
from pyspark.sql.utils import IllegalArgumentException
|
|
df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
|
|
matrix = IndexedRowMatrix(df)
|
|
self.assertEqual(matrix.numRows(), 1)
|
|
self.assertEqual(matrix.numCols(), 1)
|
|
with self.assertRaises(IllegalArgumentException):
|
|
IndexedRowMatrix(df.drop("_1"))
|
|
|
|
def test_row_matrix_invalid_type(self):
|
|
rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
|
|
invalid_type = ""
|
|
matrix = RowMatrix(rows)
|
|
self.assertRaises(TypeError, matrix.multiply, invalid_type)
|
|
|
|
irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]),
|
|
IndexedRow(1, [4, 5, 6])])
|
|
imatrix = IndexedRowMatrix(irows)
|
|
self.assertRaises(TypeError, imatrix.multiply, invalid_type)
|
|
|
|
|
|
class MatrixUDTTests(MLlibTestCase):
|
|
|
|
dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
|
|
dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
|
|
sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
|
|
sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
|
|
udt = MatrixUDT()
|
|
|
|
def test_json_schema(self):
|
|
self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)
|
|
|
|
def test_serialization(self):
|
|
for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
|
|
self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))
|
|
|
|
def test_infer_schema(self):
|
|
rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
|
|
df = rdd.toDF()
|
|
schema = df.schema
|
|
self.assertTrue(schema.fields[1].dataType, self.udt)
|
|
matrices = df.rdd.map(lambda x: x._2).collect()
|
|
self.assertEqual(len(matrices), 2)
|
|
for m in matrices:
|
|
if isinstance(m, DenseMatrix):
|
|
self.assertTrue(m, self.dm1)
|
|
elif isinstance(m, SparseMatrix):
|
|
self.assertTrue(m, self.sm1)
|
|
else:
|
|
raise ValueError("Expected a matrix but got type %r" % type(m))
|
|
|
|
|
|
@unittest.skipIf(not have_scipy, "SciPy not installed")
|
|
class SciPyTests(MLlibTestCase):
|
|
|
|
"""
|
|
Test both vector operations and MLlib algorithms with SciPy sparse matrices,
|
|
if SciPy is available.
|
|
"""
|
|
|
|
def test_serialize(self):
|
|
from scipy.sparse import lil_matrix
|
|
|
|
ser = PickleSerializer()
|
|
lil = lil_matrix((4, 1))
|
|
lil[1, 0] = 1
|
|
lil[3, 0] = 2
|
|
sv = SparseVector(4, {1: 1, 3: 2})
|
|
self.assertEqual(sv, _convert_to_vector(lil))
|
|
self.assertEqual(sv, _convert_to_vector(lil.tocsc()))
|
|
self.assertEqual(sv, _convert_to_vector(lil.tocoo()))
|
|
self.assertEqual(sv, _convert_to_vector(lil.tocsr()))
|
|
self.assertEqual(sv, _convert_to_vector(lil.todok()))
|
|
|
|
def serialize(l):
|
|
return ser.loads(ser.dumps(_convert_to_vector(l)))
|
|
self.assertEqual(sv, serialize(lil))
|
|
self.assertEqual(sv, serialize(lil.tocsc()))
|
|
self.assertEqual(sv, serialize(lil.tocsr()))
|
|
self.assertEqual(sv, serialize(lil.todok()))
|
|
|
|
def test_convert_to_vector(self):
|
|
from scipy.sparse import csc_matrix
|
|
# Create a CSC matrix with non-sorted indices
|
|
indptr = array([0, 2])
|
|
indices = array([3, 1])
|
|
data = array([2.0, 1.0])
|
|
csc = csc_matrix((data, indices, indptr))
|
|
self.assertFalse(csc.has_sorted_indices)
|
|
sv = SparseVector(4, {1: 1, 3: 2})
|
|
self.assertEqual(sv, _convert_to_vector(csc))
|
|
|
|
def test_dot(self):
|
|
from scipy.sparse import lil_matrix
|
|
lil = lil_matrix((4, 1))
|
|
lil[1, 0] = 1
|
|
lil[3, 0] = 2
|
|
dv = DenseVector(array([1., 2., 3., 4.]))
|
|
self.assertEqual(10.0, dv.dot(lil))
|
|
|
|
def test_squared_distance(self):
|
|
from scipy.sparse import lil_matrix
|
|
lil = lil_matrix((4, 1))
|
|
lil[1, 0] = 3
|
|
lil[3, 0] = 2
|
|
dv = DenseVector(array([1., 2., 3., 4.]))
|
|
sv = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
|
|
self.assertEqual(15.0, dv.squared_distance(lil))
|
|
self.assertEqual(15.0, sv.squared_distance(lil))
|
|
|
|
def scipy_matrix(self, size, values):
|
|
"""Create a column SciPy matrix from a dictionary of values"""
|
|
from scipy.sparse import lil_matrix
|
|
lil = lil_matrix((size, 1))
|
|
for key, value in values.items():
|
|
lil[key, 0] = value
|
|
return lil
|
|
|
|
def test_clustering(self):
|
|
from pyspark.mllib.clustering import KMeans
|
|
data = [
|
|
self.scipy_matrix(3, {1: 1.0}),
|
|
self.scipy_matrix(3, {1: 1.1}),
|
|
self.scipy_matrix(3, {2: 1.0}),
|
|
self.scipy_matrix(3, {2: 1.1})
|
|
]
|
|
clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
|
|
self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
|
|
self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
|
|
|
|
def test_classification(self):
|
|
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
|
|
from pyspark.mllib.tree import DecisionTree
|
|
data = [
|
|
LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
|
|
LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
|
|
LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
|
|
LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
|
|
]
|
|
rdd = self.sc.parallelize(data)
|
|
features = [p.features for p in data]
|
|
|
|
lr_model = LogisticRegressionWithSGD.train(rdd)
|
|
self.assertTrue(lr_model.predict(features[0]) <= 0)
|
|
self.assertTrue(lr_model.predict(features[1]) > 0)
|
|
self.assertTrue(lr_model.predict(features[2]) <= 0)
|
|
self.assertTrue(lr_model.predict(features[3]) > 0)
|
|
|
|
svm_model = SVMWithSGD.train(rdd)
|
|
self.assertTrue(svm_model.predict(features[0]) <= 0)
|
|
self.assertTrue(svm_model.predict(features[1]) > 0)
|
|
self.assertTrue(svm_model.predict(features[2]) <= 0)
|
|
self.assertTrue(svm_model.predict(features[3]) > 0)
|
|
|
|
nb_model = NaiveBayes.train(rdd)
|
|
self.assertTrue(nb_model.predict(features[0]) <= 0)
|
|
self.assertTrue(nb_model.predict(features[1]) > 0)
|
|
self.assertTrue(nb_model.predict(features[2]) <= 0)
|
|
self.assertTrue(nb_model.predict(features[3]) > 0)
|
|
|
|
categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories
|
|
dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
|
|
categoricalFeaturesInfo=categoricalFeaturesInfo)
|
|
self.assertTrue(dt_model.predict(features[0]) <= 0)
|
|
self.assertTrue(dt_model.predict(features[1]) > 0)
|
|
self.assertTrue(dt_model.predict(features[2]) <= 0)
|
|
self.assertTrue(dt_model.predict(features[3]) > 0)
|
|
|
|
def test_regression(self):
|
|
from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
|
|
RidgeRegressionWithSGD
|
|
from pyspark.mllib.tree import DecisionTree
|
|
data = [
|
|
LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
|
|
LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
|
|
LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
|
|
LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
|
|
]
|
|
rdd = self.sc.parallelize(data)
|
|
features = [p.features for p in data]
|
|
|
|
lr_model = LinearRegressionWithSGD.train(rdd)
|
|
self.assertTrue(lr_model.predict(features[0]) <= 0)
|
|
self.assertTrue(lr_model.predict(features[1]) > 0)
|
|
self.assertTrue(lr_model.predict(features[2]) <= 0)
|
|
self.assertTrue(lr_model.predict(features[3]) > 0)
|
|
|
|
lasso_model = LassoWithSGD.train(rdd)
|
|
self.assertTrue(lasso_model.predict(features[0]) <= 0)
|
|
self.assertTrue(lasso_model.predict(features[1]) > 0)
|
|
self.assertTrue(lasso_model.predict(features[2]) <= 0)
|
|
self.assertTrue(lasso_model.predict(features[3]) > 0)
|
|
|
|
rr_model = RidgeRegressionWithSGD.train(rdd)
|
|
self.assertTrue(rr_model.predict(features[0]) <= 0)
|
|
self.assertTrue(rr_model.predict(features[1]) > 0)
|
|
self.assertTrue(rr_model.predict(features[2]) <= 0)
|
|
self.assertTrue(rr_model.predict(features[3]) > 0)
|
|
|
|
categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories
|
|
dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
|
|
self.assertTrue(dt_model.predict(features[0]) <= 0)
|
|
self.assertTrue(dt_model.predict(features[1]) > 0)
|
|
self.assertTrue(dt_model.predict(features[2]) <= 0)
|
|
self.assertTrue(dt_model.predict(features[3]) > 0)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from pyspark.mllib.tests.test_linalg import * # noqa: F401
|
|
|
|
try:
|
|
import xmlrunner # type: ignore[import]
|
|
testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
|
|
except ImportError:
|
|
testRunner = None
|
|
unittest.main(testRunner=testRunner, verbosity=2)
|