[SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark

## What changes were proposed in this pull request?

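Adds a Python wrapper for the Spark ML `Interaction` transformer to PySpark (`pyspark.ml.feature.Interaction`), together with a runnable example script.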

## How was this patch tested?

- Added Python doctest
- Ran the newly added example code
- Manually confirmed that a PipelineModel that contains an Interaction transformer can now be loaded in PySpark

Closes #24426 from Andrew-Crosby/pyspark-interaction-transformer.

Lead-authored-by: Andrew-Crosby <37139900+Andrew-Crosby@users.noreply.github.com>
Co-authored-by: Andrew-Crosby <andrew.crosby@autotrader.co.uk>
Signed-off-by: Bryan Cutler <cutlerb@gmail.com>
This commit is contained in:
Andrew-Crosby 2019-04-23 13:53:33 -07:00 committed by Bryan Cutler
parent 810be5dd20
commit 5bf5d9d854
2 changed files with 110 additions and 0 deletions

View file

@ -0,0 +1,56 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import Interaction, VectorAssembler
# $example off$
from pyspark.sql import SparkSession
if __name__ == "__main__":
    # Obtain the Spark session that this example runs in.
    session_builder = SparkSession.builder.appName("InteractionExample")
    spark = session_builder.getOrCreate()

    # $example on$
    # Input data: "id1" is handed to Interaction as-is, while "id2".."id4" and
    # "id5".."id7" are first assembled into the vector columns "vec1" and "vec2".
    column_names = ["id1", "id2", "id3", "id4", "id5", "id6", "id7"]
    rows = [
        (1, 1, 2, 3, 8, 4, 5),
        (2, 4, 3, 8, 7, 9, 8),
        (3, 6, 1, 9, 2, 3, 6),
        (4, 10, 8, 6, 9, 4, 5),
        (5, 9, 2, 7, 10, 7, 3),
        (6, 1, 1, 4, 2, 8, 4),
    ]
    df = spark.createDataFrame(rows, column_names)

    # Assemble the first vector column.
    first_assembler = VectorAssembler(inputCols=["id2", "id3", "id4"], outputCol="vec1")
    with_vec1 = first_assembler.transform(df)

    # Assemble the second vector column and keep only the columns to interact.
    second_assembler = VectorAssembler(inputCols=["id5", "id6", "id7"], outputCol="vec2")
    with_both_vectors = second_assembler.transform(with_vec1)
    interaction_input = with_both_vectors.select("id1", "vec1", "vec2")

    # Compute the interaction of the three selected columns and print every row in full.
    interaction = Interaction(inputCols=["id1", "vec1", "vec2"], outputCol="interactedCol")
    result = interaction.transform(interaction_input)
    result.show(truncate=False)
    # $example off$

    spark.stop()

View file

@ -39,6 +39,7 @@ __all__ = ['Binarizer',
'IDF', 'IDFModel',
'Imputer', 'ImputerModel',
'IndexToString',
'Interaction',
'MaxAbsScaler', 'MaxAbsScalerModel',
'MinHashLSH', 'MinHashLSHModel',
'MinMaxScaler', 'MinMaxScalerModel',
@ -1227,6 +1228,59 @@ class ImputerModel(JavaModel, JavaMLReadable, JavaMLWritable):
return self._call_java("surrogateDF")
@inherit_doc
class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """
    Feature interaction transformer. It accepts Double and Vector type columns and produces a
    single flattened vector holding their feature interactions. Any nominal features are one-hot
    encoded first; a vector of the feature cross-products is then produced.

    For example, for the input feature values `Double(2)` and `Vector(3, 4)` the output would be
    `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal
    with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.

    >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
    >>> interaction = Interaction(inputCols=["a", "b"], outputCol="ab")
    >>> interaction.transform(df).show()
    +---+---+-----+
    |  a|  b|   ab|
    +---+---+-----+
    |0.0|1.0|[0.0]|
    |2.0|3.0|[6.0]|
    +---+---+-----+
    ...
    >>> interactionPath = temp_path + "/interaction"
    >>> interaction.save(interactionPath)
    >>> loadedInteraction = Interaction.load(interactionPath)
    >>> loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab
    True

    .. versionadded:: 3.0.0
    """

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        """
        __init__(self, inputCols=None, outputCol=None):
        """
        super(Interaction, self).__init__()
        # Fully qualified name of the JVM class this Python object wraps.
        java_class_name = "org.apache.spark.ml.feature.Interaction"
        self._java_obj = self._new_java_obj(java_class_name, self.uid)
        self._setDefault()
        # _input_kwargs is expected to hold the keyword arguments captured by
        # @keyword_only for this call; forward them unchanged to setParams.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("3.0.0")
    def setParams(self, inputCols=None, outputCol=None):
        """
        setParams(self, inputCols=None, outputCol=None)
        Sets params for this Interaction.
        """
        # Apply the keyword arguments recorded for this call and return the result of _set.
        return self._set(**self._input_kwargs)
@inherit_doc
class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""