spark-instrumented-optimizer/python/pyspark/ml/functions.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark import since, SparkContext
from pyspark.sql.column import Column, _to_java_column


@since(3.0)
def vector_to_array(col):
    """
    Converts a column of MLlib sparse/dense vectors into a column of dense arrays.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.functions import vector_to_array
    >>> from pyspark.mllib.linalg import Vectors as OldVectors
    >>> df = spark.createDataFrame([
    ...     (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
    ...     (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),
    ...      OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],
    ...     ["vec", "oldVec"])
    >>> df.select(vector_to_array("vec").alias("vec"),
    ...           vector_to_array("oldVec").alias("oldVec")).collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    """
    sc = SparkContext._active_spark_context
    return Column(
        sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col)))


def _test():
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.ml.functions
    import sys
    globs = pyspark.ml.functions.__dict__.copy()
    spark = SparkSession.builder \
        .master("local[2]") \
        .appName("ml.functions tests") \
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.ml.functions, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
[SPARK-30154][ML] PySpark UDF to convert MLlib vectors to dense arrays ### What changes were proposed in this pull request? PySpark UDF to convert MLlib vectors to dense arrays. Example: ``` from pyspark.ml.functions import vector_to_array df.select(vector_to_array(col("features")) ``` ### Why are the changes needed? If a PySpark user wants to convert MLlib sparse/dense vectors in a DataFrame into dense arrays, an efficient approach is to do that in JVM. However, it requires PySpark user to write Scala code and register it as a UDF. Often this is infeasible for a pure python project. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? UT. Closes #26910 from WeichenXu123/vector_to_array. Authored-by: WeichenXu <weichen.xu@databricks.com> Signed-off-by: Xiangrui Meng <meng@databricks.com> 2020-01-06 19:18:51 -05:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from pyspark import since, SparkContext`
			`from pyspark.sql.column import Column, _to_java_column`


			`@since(3.0)`
			`def vector_to_array(col):`
			`"""`
			`Converts a column of MLlib sparse/dense vectors into a column of dense arrays.`

			`>>> from pyspark.ml.linalg import Vectors`
			`>>> from pyspark.ml.functions import vector_to_array`
			`>>> from pyspark.mllib.linalg import Vectors as OldVectors`
			`>>> df = spark.createDataFrame([`
			`... (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),`
			`... (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),`
			`... OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],`
			`... ["vec", "oldVec"])`
			`>>> df.select(vector_to_array("vec").alias("vec"),`
			`... vector_to_array("oldVec").alias("oldVec")).collect()`
			`[Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),`
			`Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]`
			`"""`
			`sc = SparkContext._active_spark_context`
			`return Column(`
			`sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col)))`


			`def _test():`
			`import doctest`
			`from pyspark.sql import SparkSession`
			`import pyspark.ml.functions`
			`import sys`
			`globs = pyspark.ml.functions.__dict__.copy()`
			`spark = SparkSession.builder \`
			`.master("local[2]") \`
			`.appName("ml.functions tests") \`
			`.getOrCreate()`
			`sc = spark.sparkContext`
			`globs['sc'] = sc`
			`globs['spark'] = spark`

			`(failure_count, test_count) = doctest.testmod(`
			`pyspark.ml.functions, globs=globs,`
			`optionflags=doctest.ELLIPSIS \| doctest.NORMALIZE_WHITESPACE)`
			`spark.stop()`
			`if failure_count:`
			`sys.exit(-1)`


			`if __name__ == "__main__":`
			`_test()`