[SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark
## What changes were proposed in this pull request?

Adds the Spark ML Interaction transformer to PySpark.

## How was this patch tested?

- Added Python doctest
- Ran the newly added example code
- Manually confirmed that a PipelineModel that contains an Interaction transformer can now be loaded in PySpark

Closes #24426 from Andrew-Crosby/pyspark-interaction-transformer.

Lead-authored-by: Andrew-Crosby <37139900+Andrew-Crosby@users.noreply.github.com>
Co-authored-by: Andrew-Crosby <andrew.crosby@autotrader.co.uk>
Signed-off-by: Bryan Cutler <cutlerb@gmail.com>
This commit is contained in:
parent
810be5dd20
commit
5bf5d9d854
56
examples/src/main/python/ml/interaction_example.py
Normal file
56
examples/src/main/python/ml/interaction_example.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Interaction, VectorAssembler
|
||||
# $example off$
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
if __name__ == "__main__":
    # Parenthesized builder chain instead of backslash continuations.
    spark = (
        SparkSession
        .builder
        .appName("InteractionExample")
        .getOrCreate()
    )

    # $example on$
    # Seven numeric columns; id1 stays scalar, the rest are assembled
    # into two vectors below.
    rows = [
        (1, 1, 2, 3, 8, 4, 5),
        (2, 4, 3, 8, 7, 9, 8),
        (3, 6, 1, 9, 2, 3, 6),
        (4, 10, 8, 6, 9, 4, 5),
        (5, 9, 2, 7, 10, 7, 3),
        (6, 1, 1, 4, 2, 8, 4),
    ]
    columns = ["id1", "id2", "id3", "id4", "id5", "id6", "id7"]
    df = spark.createDataFrame(rows, columns)

    # Pack id2..id4 into vec1 and id5..id7 into vec2.
    firstAssembler = VectorAssembler(inputCols=["id2", "id3", "id4"], outputCol="vec1")

    withVec1 = firstAssembler.transform(df)

    secondAssembler = VectorAssembler(inputCols=["id5", "id6", "id7"], outputCol="vec2")

    withBothVecs = secondAssembler.transform(withVec1).select("id1", "vec1", "vec2")

    # Cross id1 with every pairing of vec1 and vec2 components.
    interaction = Interaction(inputCols=["id1", "vec1", "vec2"], outputCol="interactedCol")

    interacted = interaction.transform(withBothVecs)

    interacted.show(truncate=False)
    # $example off$

    spark.stop()
|
|
@ -39,6 +39,7 @@ __all__ = ['Binarizer',
|
|||
'IDF', 'IDFModel',
|
||||
'Imputer', 'ImputerModel',
|
||||
'IndexToString',
|
||||
'Interaction',
|
||||
'MaxAbsScaler', 'MaxAbsScalerModel',
|
||||
'MinHashLSH', 'MinHashLSHModel',
|
||||
'MinMaxScaler', 'MinMaxScalerModel',
|
||||
|
@ -1227,6 +1228,59 @@ class ImputerModel(JavaModel, JavaMLReadable, JavaMLWritable):
|
|||
return self._call_java("surrogateDF")
|
||||
|
||||
|
||||
@inherit_doc
class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """
    Implements the feature interaction transform. This transformer takes in Double and Vector type
    columns and outputs a flattened vector of their feature interactions. To handle interaction,
    we first one-hot encode any nominal features. Then, a vector of the feature cross-products is
    produced.

    For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be
    `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal
    with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.

    >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
    >>> interaction = Interaction(inputCols=["a", "b"], outputCol="ab")
    >>> interaction.transform(df).show()
    +---+---+-----+
    |  a|  b|   ab|
    +---+---+-----+
    |0.0|1.0|[0.0]|
    |2.0|3.0|[6.0]|
    +---+---+-----+
    ...
    >>> interactionPath = temp_path + "/interaction"
    >>> interaction.save(interactionPath)
    >>> loadedInteraction = Interaction.load(interactionPath)
    >>> loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab
    True

    .. versionadded:: 3.0.0
    """

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        """
        __init__(self, inputCols=None, outputCol=None)
        """
        super(Interaction, self).__init__()
        # Wrap the JVM-side transformer; all params are forwarded to it.
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Interaction", self.uid)
        # NOTE: the transformer has no default param values, so no _setDefault()
        # call is needed here (a bare _setDefault() would be a no-op anyway).
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    @since("3.0.0")
    def setParams(self, inputCols=None, outputCol=None):
        """
        setParams(self, inputCols=None, outputCol=None)
        Sets params for this Interaction.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
|
||||
|
||||
|
||||
@inherit_doc
|
||||
class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
|
||||
"""
|
||||
|
|
Loading…
Reference in a new issue