spark-instrumented-optimizer/python/pyspark/ml/functions.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark import since, SparkContext
from pyspark.sql.column import Column, _to_java_column


@since("3.0.0")
def vector_to_array(col, dtype="float64"):
    """
    Turns a column holding MLlib vectors (sparse or dense) into a column of dense arrays.

    The conversion itself is delegated to the JVM-side ``org.apache.spark.ml.functions``
    helper reached through the active SparkContext.

    :param col: The input column, given either as a Column or as a column-name string.
    :param dtype: Element type of the output arrays: "float64" (the default) or "float32".
    :return: A Column containing the dense-array form of each input vector.

    .. versionadded:: 3.0.0

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.functions import vector_to_array
    >>> from pyspark.mllib.linalg import Vectors as OldVectors
    >>> df = spark.createDataFrame([
    ...     (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
    ...     (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),
    ...      OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],
    ...     ["vec", "oldVec"])
    >>> df1 = df.select(vector_to_array("vec").alias("vec"),
    ...                 vector_to_array("oldVec").alias("oldVec"))
    >>> df1.collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    >>> df2 = df.select(vector_to_array("vec", "float32").alias("vec"),
    ...                 vector_to_array("oldVec", "float32").alias("oldVec"))
    >>> df2.collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    >>> df1.schema.fields
    [StructField(vec,ArrayType(DoubleType,false),false),
    StructField(oldVec,ArrayType(DoubleType,false),false)]
    >>> df2.schema.fields
    [StructField(vec,ArrayType(FloatType,false),false),
    StructField(oldVec,ArrayType(FloatType,false),false)]
    """
    # Resolve the JVM-side helper object first, then hand it the Java column and dtype.
    active_context = SparkContext._active_spark_context
    jvm_functions = active_context._jvm.org.apache.spark.ml.functions
    java_result = jvm_functions.vector_to_array(_to_java_column(col), dtype)
    # Wrap the returned Java column so callers get a regular PySpark Column.
    return Column(java_result)


def _test():
    """
    Run the doctests of ``pyspark.ml.functions`` against a local Spark session.

    A session is obtained with master ``local[2]``; it and its SparkContext are exposed to
    the doctests under the names ``spark`` and ``sc``. The session is stopped once the
    doctests have run, and the process exits with status -1 if any doctest failed.
    """
    import doctest
    import sys

    import pyspark.ml.functions
    from pyspark.sql import SparkSession

    # Start from a copy of the module namespace so the module itself is not modified.
    namespace = dict(pyspark.ml.functions.__dict__)
    session = (
        SparkSession.builder
        .master("local[2]")
        .appName("ml.functions tests")
        .getOrCreate()
    )
    namespace['sc'] = session.sparkContext
    namespace['spark'] = session

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    # Only the failure count is needed; the number of attempted tests is ignored.
    failed, _ = doctest.testmod(pyspark.ml.functions, globs=namespace, optionflags=flags)
    session.stop()
    if failed:
        sys.exit(-1)


# When this file is executed as a script, run the module's doctest harness.
if __name__ == "__main__":
    _test()
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features"))) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in the JVM. However, it requires the PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure Python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from pyspark import since, SparkContext`
			`from pyspark.sql.column import Column, _to_java_column`


[SPARK-30762] Add dtype=float32 support to vector_to_array UDF ### What changes were proposed in this pull request? In this PR, we add a parameter to the Python function vector_to_array(col) that allows converting to a column of arrays of Float (32 bits) in Scala, which would be mapped to a numpy array of dtype=float32. ### Why are the changes needed? In the downstream ML training, using float32 instead of float64 (default) would allow a larger batch size, i.e., allow more data to fit in memory. ### Does this PR introduce any user-facing change? Yes. Old: `vector_to_array()` only takes one param ``` df.select(vector_to_array("colA"), ...) ``` New: `vector_to_array()` can take an additional optional param: `dtype` = "float32" (or "float64") ``` df.select(vector_to_array("colA", "float32"), ...) ``` ### How was this patch tested? Unit test in Scala. Doctest in Python. Closes #27522 from liangz1/udf-float32. Authored-by: Liang Zhang <liang.zhang@databricks.com> Signed-off-by: WeichenXu <weichen.xu@databricks.com> 2020-02-13 10:55:13 -05:00			`@since("3.0.0")`
			`def vector_to_array(col, dtype="float64"):`
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features")) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in JVM. However, it requires PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00			`"""`
			`Converts a column of MLlib sparse/dense vectors into a column of dense arrays.`
[SPARK-30859][PYSPARK][DOCS][MINOR] Fixed docstring syntax issues preventing proper compilation of documentation This commit is published into the public domain. ### What changes were proposed in this pull request? Some syntax issues in docstrings have been fixed. ### Why are the changes needed? In some places, the documentation did not render as intended, e.g. parameter documentations were not formatted as such. ### Does this PR introduce any user-facing change? Slight improvements in documentation. ### How was this patch tested? Manual testing. No new Sphinx warnings arise due to this change. Closes #27613 from DavidToneian/SPARK-30859. Authored-by: David Toneian <david@toneian.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-02-18 02:46:45 -05:00
[SPARK-30762] Add dtype=float32 support to vector_to_array UDF ### What changes were proposed in this pull request? In this PR, we add a parameter in the python function vector_to_array(col) that allows converting to a column of arrays of Float (32bits) in scala, which would be mapped to a numpy array of dtype=float32. ### Why are the changes needed? In the downstream ML training, using float32 instead of float64 (default) would allow a larger batch size, i.e., allow more data to fit in the memory. ### Does this PR introduce any user-facing change? Yes. Old: `vector_to_array()` only take one param ``` df.select(vector_to_array("colA"), ...) ``` New: `vector_to_array()` can take an additional optional param: `dtype` = "float32" (or "float64") ``` df.select(vector_to_array("colA", "float32"), ...) ``` ### How was this patch tested? Unit test in scala. doctest in python. Closes #27522 from liangz1/udf-float32. Authored-by: Liang Zhang <liang.zhang@databricks.com> Signed-off-by: WeichenXu <weichen.xu@databricks.com> 2020-02-13 10:55:13 -05:00			`:param col: A string of the column name or a Column`
			`:param dtype: The data type of the output array. Valid values: "float64" or "float32".`
			`:return: The converted column of dense arrays.`

			`.. versionadded:: 3.0.0`
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features")) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in JVM. However, it requires PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00
			`>>> from pyspark.ml.linalg import Vectors`
			`>>> from pyspark.ml.functions import vector_to_array`
			`>>> from pyspark.mllib.linalg import Vectors as OldVectors`
			`>>> df = spark.createDataFrame([`
			`... (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),`
			`... (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),`
			`... OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],`
			`... ["vec", "oldVec"])`
[SPARK-30762] Add dtype=float32 support to vector_to_array UDF ### What changes were proposed in this pull request? In this PR, we add a parameter in the python function vector_to_array(col) that allows converting to a column of arrays of Float (32bits) in scala, which would be mapped to a numpy array of dtype=float32. ### Why are the changes needed? In the downstream ML training, using float32 instead of float64 (default) would allow a larger batch size, i.e., allow more data to fit in the memory. ### Does this PR introduce any user-facing change? Yes. Old: `vector_to_array()` only take one param ``` df.select(vector_to_array("colA"), ...) ``` New: `vector_to_array()` can take an additional optional param: `dtype` = "float32" (or "float64") ``` df.select(vector_to_array("colA", "float32"), ...) ``` ### How was this patch tested? Unit test in scala. doctest in python. Closes #27522 from liangz1/udf-float32. Authored-by: Liang Zhang <liang.zhang@databricks.com> Signed-off-by: WeichenXu <weichen.xu@databricks.com> 2020-02-13 10:55:13 -05:00			`>>> df1 = df.select(vector_to_array("vec").alias("vec"),`
			`... vector_to_array("oldVec").alias("oldVec"))`
			`>>> df1.collect()`
			`[Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),`
			`Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]`
			`>>> df2 = df.select(vector_to_array("vec", "float32").alias("vec"),`
			`... vector_to_array("oldVec", "float32").alias("oldVec"))`
			`>>> df2.collect()`
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features")) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in JVM. However, it requires PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00			`[Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),`
			`Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]`
[SPARK-30762] Add dtype=float32 support to vector_to_array UDF ### What changes were proposed in this pull request? In this PR, we add a parameter in the python function vector_to_array(col) that allows converting to a column of arrays of Float (32bits) in scala, which would be mapped to a numpy array of dtype=float32. ### Why are the changes needed? In the downstream ML training, using float32 instead of float64 (default) would allow a larger batch size, i.e., allow more data to fit in the memory. ### Does this PR introduce any user-facing change? Yes. Old: `vector_to_array()` only take one param ``` df.select(vector_to_array("colA"), ...) ``` New: `vector_to_array()` can take an additional optional param: `dtype` = "float32" (or "float64") ``` df.select(vector_to_array("colA", "float32"), ...) ``` ### How was this patch tested? Unit test in scala. doctest in python. Closes #27522 from liangz1/udf-float32. Authored-by: Liang Zhang <liang.zhang@databricks.com> Signed-off-by: WeichenXu <weichen.xu@databricks.com> 2020-02-13 10:55:13 -05:00			`>>> df1.schema.fields`
			`[StructField(vec,ArrayType(DoubleType,false),false),`
			`StructField(oldVec,ArrayType(DoubleType,false),false)]`
			`>>> df2.schema.fields`
			`[StructField(vec,ArrayType(FloatType,false),false),`
			`StructField(oldVec,ArrayType(FloatType,false),false)]`
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features")) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in JVM. However, it requires PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00			`"""`
			`sc = SparkContext._active_spark_context`
			`return Column(`
[SPARK-30762] Add dtype=float32 support to vector_to_array UDF ### What changes were proposed in this pull request? In this PR, we add a parameter in the python function vector_to_array(col) that allows converting to a column of arrays of Float (32bits) in scala, which would be mapped to a numpy array of dtype=float32. ### Why are the changes needed? In the downstream ML training, using float32 instead of float64 (default) would allow a larger batch size, i.e., allow more data to fit in the memory. ### Does this PR introduce any user-facing change? Yes. Old: `vector_to_array()` only take one param ``` df.select(vector_to_array("colA"), ...) ``` New: `vector_to_array()` can take an additional optional param: `dtype` = "float32" (or "float64") ``` df.select(vector_to_array("colA", "float32"), ...) ``` ### How was this patch tested? Unit test in scala. doctest in python. Closes #27522 from liangz1/udf-float32. Authored-by: Liang Zhang <liang.zhang@databricks.com> Signed-off-by: WeichenXu <weichen.xu@databricks.com> 2020-02-13 10:55:13 -05:00			`sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col), dtype))`
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features")) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in JVM. However, it requires PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00

			`def _test():`
			`import doctest`
			`from pyspark.sql import SparkSession`
			`import pyspark.ml.functions`
			`import sys`
			`globs = pyspark.ml.functions.__dict__.copy()`
			`spark = SparkSession.builder \`
			`.master("local[2]") \`
			`.appName("ml.functions tests") \`
			`.getOrCreate()`
			`sc = spark.sparkContext`
			`globs['sc'] = sc`
			`globs['spark'] = spark`

			`(failure_count, test_count) = doctest.testmod(`
			`pyspark.ml.functions, globs=globs,`
			`optionflags=doctest.ELLIPSIS \| doctest.NORMALIZE_WHITESPACE)`
			`spark.stop()`
			`if failure_count:`
			`sys.exit(-1)`


			`if __name__ == "__main__":`
			`_test()`