[SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark

## What changes were proposed in this pull request?

Adds the Spark ML Interaction transformer to PySpark.

## How was this patch tested?

- Added Python doctest
- Ran the newly added example code
- Manually confirmed that a PipelineModel that contains an Interaction transformer can now be loaded in PySpark

Closes #24426 from Andrew-Crosby/pyspark-interaction-transformer.

Lead-authored-by: Andrew-Crosby <37139900+Andrew-Crosby@users.noreply.github.com>
Co-authored-by: Andrew-Crosby <andrew.crosby@autotrader.co.uk>
Signed-off-by: Bryan Cutler <cutlerb@gmail.com>
2019-04-23 13:53:33 -07:00 · 2019-04-23 13:53:33 -07:00 · 5bf5d9d854
parent 810be5dd20
commit 5bf5d9d854
2 changed files with 110 additions and 0 deletions
--- a/examples/src/main/python/ml/interaction_example.py
+++ b/examples/src/main/python/ml/interaction_example.py
@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.feature import Interaction, VectorAssembler
+# $example off$
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    spark = (
+        SparkSession.builder
+        .appName("InteractionExample")
+        .getOrCreate()
+    )
+
+    # $example on$
+    # Seven integer columns: "id1" plus two groups of three feature columns.
+    columns = ["id1", "id2", "id3", "id4", "id5", "id6", "id7"]
+    rows = [
+        (1, 1, 2, 3, 8, 4, 5),
+        (2, 4, 3, 8, 7, 9, 8),
+        (3, 6, 1, 9, 2, 3, 6),
+        (4, 10, 8, 6, 9, 4, 5),
+        (5, 9, 2, 7, 10, 7, 3),
+        (6, 1, 1, 4, 2, 8, 4),
+    ]
+    df = spark.createDataFrame(rows, columns)
+
+    # Assemble id2..id4 into "vec1" and id5..id7 into "vec2".
+    first_assembler = VectorAssembler(inputCols=["id2", "id3", "id4"], outputCol="vec1")
+    second_assembler = VectorAssembler(inputCols=["id5", "id6", "id7"], outputCol="vec2")
+    with_vec1 = first_assembler.transform(df)
+    with_both = second_assembler.transform(with_vec1).select("id1", "vec1", "vec2")
+
+    # Interact "id1" with both vectors and print the result without truncation.
+    interaction = Interaction(inputCols=["id1", "vec1", "vec2"], outputCol="interactedCol")
+    interaction.transform(with_both).show(truncate=False)
+    # $example off$
+
+    spark.stop()
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@ -39,6 +39,7 @@ __all__ = ['Binarizer',
           'IDF', 'IDFModel',
           'Imputer', 'ImputerModel',
           'IndexToString',
+           'Interaction',
           'MaxAbsScaler', 'MaxAbsScalerModel',
           'MinHashLSH', 'MinHashLSHModel',
           'MinMaxScaler', 'MinMaxScalerModel',
@ -1227,6 +1228,59 @@ class ImputerModel(JavaModel, JavaMLReadable, JavaMLWritable):
        return self._call_java("surrogateDF")


+@inherit_doc
+class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable):
+    """
+    Feature interaction transformer. It accepts Double and Vector type columns and emits a
+    single flattened vector holding their feature interactions. Nominal features are one-hot
+    encoded first; the output is then the vector of cross-products of the (encoded) input
+    features.
+
+    For instance, with the inputs `Double(2)` and `Vector(3, 4)` the result is `Vector(6, 8)`
+    when every input feature is numeric. Were the first feature nominal with four categories
+    instead, the result would be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.
+
+    >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
+    >>> interaction = Interaction(inputCols=["a", "b"], outputCol="ab")
+    >>> interaction.transform(df).show()
+    +---+---+-----+
+    |  a|  b|   ab|
+    +---+---+-----+
+    |0.0|1.0|[0.0]|
+    |2.0|3.0|[6.0]|
+    +---+---+-----+
+    ...
+    >>> interactionPath = temp_path + "/interaction"
+    >>> interaction.save(interactionPath)
+    >>> loadedInteraction = Interaction.load(interactionPath)
+    >>> loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab
+    True
+
+    .. versionadded:: 3.0.0
+    """
+
+    @keyword_only
+    def __init__(self, inputCols=None, outputCol=None):
+        """
+        __init__(self, inputCols=None, outputCol=None)
+        """
+        super(Interaction, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Interaction", self.uid)
+        self._setDefault()
+        # Forward the keyword arguments found on _input_kwargs straight to setParams.
+        self.setParams(**self._input_kwargs)
+
+    @keyword_only
+    @since("3.0.0")
+    def setParams(self, inputCols=None, outputCol=None):
+        """
+        setParams(self, inputCols=None, outputCol=None)
+        Sets params for this Interaction.
+        """
+        # Returns whatever _set returns for the keyword arguments found on _input_kwargs.
+        return self._set(**self._input_kwargs)
+
+
@inherit_doc
 class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """