[SPARK-5089][PYSPARK][MLLIB] Fix vector convert
This is a small change addressing a potentially significant bug in how PySpark + MLlib handles non-float64 numpy arrays. The automatic conversion to `DenseVector` that occurs when passing RDDs to MLlib algorithms in PySpark should upcast the data to float64, but this was not actually happening: the result of `ar.astype(np.float64)` was discarded instead of being assigned back to `ar`. As a result, non-float64 arrays were silently misinterpreted during SerDe, yielding erroneous results when running, for example, KMeans. The PR includes the fix, as well as a new test for the correct conversion behavior. davies Author: freeman <the.freeman.lab@gmail.com> Closes #3902 from freeman-lab/fix-vector-convert and squashes the following commits: 764db47 [freeman] Add a test for proper conversion behavior 704f97e [freeman] Return array after changing type
This commit is contained in:
parent
1c0e7ce056
commit
6c6f325740
|
@ -178,7 +178,7 @@ class DenseVector(Vector):
|
|||
elif not isinstance(ar, np.ndarray):
|
||||
ar = np.array(ar, dtype=np.float64)
|
||||
if ar.dtype != np.float64:
|
||||
ar.astype(np.float64)
|
||||
ar = ar.astype(np.float64)
|
||||
self.array = ar
|
||||
|
||||
def __reduce__(self):
|
||||
|
|
|
@ -110,6 +110,16 @@ class VectorTests(PySparkTestCase):
|
|||
self.assertEquals(0.0, _squared_distance(dv, dv))
|
||||
self.assertEquals(0.0, _squared_distance(lst, lst))
|
||||
|
||||
def test_conversion(self):
|
||||
# numpy arrays should be automatically upcast to float64
|
||||
# tests for fix of [SPARK-5089]
|
||||
v = array([1, 2, 3, 4], dtype='float64')
|
||||
dv = DenseVector(v)
|
||||
self.assertTrue(dv.array.dtype == 'float64')
|
||||
v = array([1, 2, 3, 4], dtype='float32')
|
||||
dv = DenseVector(v)
|
||||
self.assertTrue(dv.array.dtype == 'float64')
|
||||
|
||||
|
||||
class ListTests(PySparkTestCase):
|
||||
|
||||
|
|
Loading…
Reference in a new issue