[SPARK-32138] Drop Python 2.7, 3.4 and 3.5

### What changes were proposed in this pull request?

This PR aims to drop Python 2.7, 3.4 and 3.5.

Roughly speaking, this PR removes the widely known Python 2 compatibility workarounds, such as `sys.version` comparisons and `__future__` imports. It also removes code dedicated to Python 2, such as `ArrayConstructor` in Spark.
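
For reference, the removed workarounds typically looked like the following (a representative sketch mirroring patterns that appear in the diff below, not an exhaustive list):

```python
from __future__ import print_function  # a no-op on Python 3; removed everywhere

import sys

# Version-gated imports: Python 3 split urllib2 into urllib.request/urllib.error.
if sys.version < '3':
    from urllib2 import urlopen, Request, HTTPError
else:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError

# Python 3 has no separate `long` type, so scripts aliased it to `int`.
if sys.version >= '3':
    long = int
```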

### Why are the changes needed?

 1. Drop support for EOL (end-of-life) Python versions.
 2. Reduce maintenance overhead by removing legacy code and hacks for Python 2.
 3. PyPy2 has a critical bug that causes a flaky test (SPARK-28358), given my testing and investigation.
 4. Users can use Python type hints with Pandas UDFs without thinking about the Python version (see the sketch after this list).
 5. Users can leverage the latest cloudpickle (https://github.com/apache/spark/pull/28950). With Python 3.8+, it can also leverage the C implementation of pickle.
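
To illustrate point 4, here is a minimal sketch of a type-hinted Pandas UDF in the new Python-3-only style, assuming PySpark 3.0+ with pandas and PyArrow installed (names are illustrative):

```python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.appName("type-hinted-pandas-udf").getOrCreate()

# The Spark return type comes from the decorator argument; the pandas
# input/output types come from the Python type hints, which require Python 3.
@pandas_udf("long")
def plus_one(s: pd.Series) -> pd.Series:
    return s + 1

spark.range(3).select(plus_one("id")).show()
```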

### Does this PR introduce _any_ user-facing change?

Yes. Users can no longer use Python 2.7, 3.4, or 3.5 with the upcoming Spark version.

### How was this patch tested?

Manually tested and also tested in Jenkins.

Closes #28957 from HyukjinKwon/SPARK-32138.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
commit 4ad9bfd53b (parent 90ac9f975b)
HyukjinKwon committed 2020-07-14 11:22:44 +09:00
225 changed files with 735 additions and 2033 deletions

@@ -133,7 +133,8 @@ jobs:
         architecture: x64
     - name: Install Python 3.6
       uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      # Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
         python-version: 3.6
         architecture: x64

@@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging {
       }
     }
   }
-
-  // Unpickle array.array generated by Python 2.6
-  class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
-    // /* Description of types */
-    // static struct arraydescr descriptors[] = {
-    //   {'c', sizeof(char), c_getitem, c_setitem},
-    //   {'b', sizeof(char), b_getitem, b_setitem},
-    //   {'B', sizeof(char), BB_getitem, BB_setitem},
-    //   #ifdef Py_USING_UNICODE
-    //   {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
-    //   #endif
-    //   {'h', sizeof(short), h_getitem, h_setitem},
-    //   {'H', sizeof(short), HH_getitem, HH_setitem},
-    //   {'i', sizeof(int), i_getitem, i_setitem},
-    //   {'I', sizeof(int), II_getitem, II_setitem},
-    //   {'l', sizeof(long), l_getitem, l_setitem},
-    //   {'L', sizeof(long), LL_getitem, LL_setitem},
-    //   {'f', sizeof(float), f_getitem, f_setitem},
-    //   {'d', sizeof(double), d_getitem, d_setitem},
-    //   {'\0', 0, 0, 0} /* Sentinel */
-    // };
-    val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
-      Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
-        'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
-      )
-    } else {
-      Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
-        'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
-      )
-    }
-
-    override def construct(args: Array[Object]): Object = {
-      if (args.length == 1) {
-        construct(args ++ Array(""))
-      } else if (args.length == 2 && args(1).isInstanceOf[String]) {
-        val typecode = args(0).asInstanceOf[String].charAt(0)
-        // This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
-        val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
-        if (typecode == 'c') {
-          // It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
-          // a string for array data instead of list as Python 2.7, and handles an array of
-          // typecode 'c' as 1-byte character.
-          val result = new Array[Char](data.length)
-          var i = 0
-          while (i < data.length) {
-            result(i) = data(i).toChar
-            i += 1
-          }
-          result
-        } else {
-          construct(typecode, machineCodes(typecode), data)
-        }
-      } else if (args.length == 2 && args(0) == "l") {
-        // On Python 2, an array of typecode 'l' should be handled as long rather than int.
-        val values = args(1).asInstanceOf[JArrayList[_]]
-        val result = new Array[Long](values.size)
-        var i = 0
-        while (i < values.size) {
-          result(i) = values.get(i).asInstanceOf[Number].longValue()
-          i += 1
-        }
-        result
-      } else {
-        super.construct(args)
-      }
-    }
-  }

   private var initialized = false
   // This should be called before trying to unpickle array.array from Python
@@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging {
   def initialize(): Unit = {
     synchronized{
       if (!initialized) {
-        Unpickler.registerConstructor("array", "array", new ArrayConstructor())
         Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
         Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
         Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())

@@ -49,8 +49,6 @@ except ImportError:
     print("Install using 'sudo pip install unidecode'")
     sys.exit(-1)

-if sys.version < '3':
-    input = raw_input  # noqa

 # Contributors list file name
 contributors_file_name = "contributors.txt"
@@ -152,10 +150,7 @@ def get_commits(tag):
         if not is_valid_author(author):
             author = github_username
         # Guard against special characters
-        try:  # Python 2
-            author = unicode(author, "UTF-8")
-        except NameError:  # Python 3
-            author = str(author)
+        author = str(author)
         author = unidecode.unidecode(author).strip()
         commit = Commit(_hash, author, title, pr_number)
         commits.append(commit)

@@ -22,14 +22,9 @@ import json
 import os
 import re
 import sys
-if sys.version < '3':
-    from urllib2 import urlopen
-    from urllib2 import Request
-    from urllib2 import HTTPError
-else:
-    from urllib.request import urlopen
-    from urllib.request import Request
-    from urllib.error import HTTPError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError

 try:
     import jira.client

@@ -168,7 +168,15 @@ function sphinx_test {
   # Check that the documentation builds acceptably, skip check if sphinx is not installed.
   if ! hash "$SPHINX_BUILD" 2> /dev/null; then
-    echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
+    echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
+    echo
+    return
+  fi
+
+  # TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
+  PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
+  if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
+    echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
     echo
     return
   fi

@@ -31,15 +31,9 @@ import re
 import subprocess
 import sys
 import traceback
-if sys.version < '3':
-    input = raw_input  # noqa
-    from urllib2 import urlopen
-    from urllib2 import Request
-    from urllib2 import HTTPError
-else:
-    from urllib.request import urlopen
-    from urllib.request import Request
-    from urllib.error import HTTPError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError

 try:
     import jira.client

@@ -22,15 +22,9 @@ import sys
 import json
 import functools
 import subprocess
-
-if sys.version < '3':
-    from urllib2 import urlopen
-    from urllib2 import Request
-    from urllib2 import HTTPError, URLError
-else:
-    from urllib.request import urlopen
-    from urllib.request import Request
-    from urllib.error import HTTPError, URLError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError, URLError

 from sparktestsupport import SPARK_HOME, ERROR_CODES
 from sparktestsupport.shellutils import run_cmd

@@ -24,8 +24,7 @@
 # Moved functools import to the top of the file.
 # Changed assert to a ValueError.
 # Changed iter[items|keys] to [items|keys], for python 3
-# compatibility. I don't think it matters for python 2 these are
-# now lists instead of iterables.
+# compatibility.
 # Copy the input so as to leave it unmodified.
 # Renamed function from toposort2 to toposort.
 # Handle empty input.

@@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
 </tr>
 <tr>
   <td><code>PYSPARK_PYTHON</code></td>
-  <td>Python binary executable to use for PySpark in both driver and workers (default is <code>python2.7</code> if available, otherwise <code>python</code>).
+  <td>Python binary executable to use for PySpark in both driver and workers (default is <code>python3</code> if available, otherwise <code>python</code>).
   Property <code>spark.pyspark.python</code> take precedence if it is set</td>
 </tr>
 <tr>

@@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).
 Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.

-Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
+Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
 Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
-Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
 For the Scala API, Spark {{site.SPARK_VERSION}}
 uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
 ({{site.SCALA_BINARY_VERSION}}.x).

@@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;

 <div data-lang="python" markdown="1">

-Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
+Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
 so C libraries like NumPy can be used. It also works with PyPy 2.3+.

-Note that Python 2 support is deprecated as of Spark 3.0.0.
+Python 2, 3.4 and 3.5 supports were removed in Spark 3.1.0.

 Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:

@@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
 you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:

 {% highlight bash %}
-$ PYSPARK_PYTHON=python3.4 bin/pyspark
-$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
+$ PYSPARK_PYTHON=python3.8 bin/pyspark
+$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
 {% endhighlight %}

 </div>

@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspark

 You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.

-After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
+After the Jupyter Notebook server is launched, you can create a new notebook from
 the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
 your notebook before you start to try Spark from the Jupyter notebook.

@@ -447,7 +447,7 @@ Writables are automatically converted:

 <table class="table">
 <tr><th>Writable Type</th><th>Python Type</th></tr>
-<tr><td>Text</td><td>unicode str</td></tr>
+<tr><td>Text</td><td>str</td></tr>
 <tr><td>IntWritable</td><td>int</td></tr>
 <tr><td>FloatWritable</td><td>float</td></tr>
 <tr><td>DoubleWritable</td><td>float</td></tr>

@@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use.
 This example requires numpy (http://www.numpy.org/)
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \
 {u'favorite_color': None, u'name': u'Alyssa'}
 {u'favorite_color': u'red', u'name': u'Ben'}
 """
-from __future__ import print_function
-
 import sys

 from functools import reduce

@@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py.
 This example requires NumPy (http://www.numpy.org/).
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations.
 In practice, one may prefer to use the LogisticRegression algorithm in
 ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -20,8 +20,6 @@ An example demonstrating aft survival regression.
 Run with:
   bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.regression import AFTSurvivalRegression
 from pyspark.ml.linalg import Vectors

@@ -15,12 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
-import sys
-if sys.version >= '3':
-    long = int
-
 from pyspark.sql import SparkSession
 # $example on$
@@ -39,7 +33,7 @@ if __name__ == "__main__":
     lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
     parts = lines.map(lambda row: row.value.split("::"))
     ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
-                                         rating=float(p[2]), timestamp=long(p[3])))
+                                         rating=float(p[2]), timestamp=int(p[3])))
     ratings = spark.createDataFrame(ratingsRDD)

     (training, test) = ratings.randomSplit([0.8, 0.2])

@@ -20,8 +20,6 @@ An example for ANOVASelector.
 Run with:
   bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import ANOVASelector

@@ -20,8 +20,6 @@ An example for ANOVA testing.
 Run with:
   bin/spark-submit examples/src/main/python/ml/anova_test_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import Binarizer

@@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering.
 Run with:
   bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import BisectingKMeans
 from pyspark.ml.evaluation import ClusteringEvaluator

@@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH.
 Run with:
   bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import BucketedRandomProjectionLSH
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import Bucketizer

@@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing.
 Run with:
   bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import ChiSqSelector

@@ -20,8 +20,6 @@ An example for computing correlation matrix.
 Run with:
   bin/spark-submit examples/src/main/python/ml/correlation_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.stat import Correlation

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import CountVectorizer

@@ -22,8 +22,6 @@ Run with:
   bin/spark-submit examples/src/main/python/ml/cross_validator.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import LogisticRegression

@@ -19,8 +19,6 @@
 An example of how to use DataFrame for ML. Run with::
     bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input_path>
 """
-from __future__ import print_function
-
 import os
 import sys
 import tempfile

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import DCT
 from pyspark.ml.linalg import Vectors

@@ -18,8 +18,6 @@
 """
 Decision Tree Classification Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import DecisionTreeClassifier

@@ -18,8 +18,6 @@
 """
 Decision Tree Regression Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import DecisionTreeRegressor

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import ElementwiseProduct
 from pyspark.ml.linalg import Vectors

@@ -18,8 +18,6 @@
 """
 Estimator Transformer Param Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.classification import LogisticRegression

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import FeatureHasher

@@ -18,8 +18,6 @@
 """
 FMClassifier Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import FMClassifier

@@ -18,8 +18,6 @@
 """
 FMRegressor Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import FMRegressor

@@ -20,8 +20,6 @@ An example for FValueSelector.
 Run with:
   bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import FValueSelector

@@ -20,8 +20,6 @@ An example for FValue testing.
 Run with:
   bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.linalg import Vectors

@@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM).
 Run with:
   bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import GaussianMixture
 # $example off$

@@ -20,8 +20,6 @@ An example demonstrating generalized linear regression.
 Run with:
   bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.regression import GeneralizedLinearRegression

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Tree Classifier Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import GBTClassifier

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Tree Regressor Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import GBTRegressor

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import IndexToString, StringIndexer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Interaction, VectorAssembler
 # $example off$

@@ -21,8 +21,6 @@ Isotonic Regression Example.
 Run with:
   bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.regression import IsotonicRegression
 # $example off$

@@ -22,8 +22,6 @@ Run with:

 This example requires NumPy (http://www.numpy.org/).
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import KMeans
 from pyspark.ml.evaluation import ClusteringEvaluator

@@ -20,8 +20,6 @@ An example demonstrating LDA.
 Run with:
   bin/spark-submit examples/src/main/python/ml/lda_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import LDA
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.regression import LinearRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LinearSVC
 # $example off$

@@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary.
 Run with:
   bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import MaxAbsScaler
 from pyspark.ml.linalg import Vectors

@@ -20,8 +20,6 @@ An example demonstrating MinHashLSH.
 Run with:
   bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import MinHashLSH
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import MinMaxScaler
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import MultilayerPerceptronClassifier
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import NGram
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import NaiveBayes
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Normalizer
 from pyspark.ml.linalg import Vectors

@@ -21,8 +21,6 @@ using Logistic Regression as the base classifier.
 Run with:
   bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression, OneVsRest
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import OneHotEncoder
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import PCA
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import PolynomialExpansion
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import QuantileDiscretizer
 # $example off$

@@ -18,8 +18,6 @@
 """
 Random Forest Classifier Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import RandomForestClassifier

@@ -18,8 +18,6 @@
 """
 Random Forest Regressor Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import RandomForestRegressor

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import RFormula
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import RobustScaler
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import SQLTransformer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import StandardScaler
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import StopWordsRemover
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import StringIndexer
 # $example off$

@@ -20,8 +20,6 @@ An example for summarizer.
 Run with:
   bin/spark-submit examples/src/main/python/ml/summarizer_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.stat import Summarizer

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Tokenizer, RegexTokenizer
 from pyspark.sql.functions import col, udf

@@ -20,8 +20,6 @@ An example for VarianceThresholdSelector.
 Run with:
   bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import VarianceThresholdSelector

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.feature import VectorAssembler

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import VectorIndexer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import VectorSlicer
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Word2Vec
 # $example off$

@@ -17,7 +17,6 @@
 """
 Binary Classification Metrics Example.
 """
-from __future__ import print_function
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import LogisticRegressionWithLBFGS

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from numpy import array
 # $example off$

@@ -18,8 +18,6 @@
 """
 Correlations using MLlib.
 """
-from __future__ import print_function
-
 import sys

 from pyspark import SparkContext

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 import numpy as np

 from pyspark import SparkContext

@@ -18,8 +18,6 @@
 """
 Decision Tree Classification Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

@@ -18,8 +18,6 @@
 """
 Decision Tree Regression Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.feature import ElementwiseProduct

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from numpy import array
 # $example off$

@@ -18,11 +18,6 @@
 """
 A Gaussian Mixture Model clustering program using MLlib.
 """
-from __future__ import print_function
-
-import sys
-if sys.version >= '3':
-    long = int
-
 import random
 import argparse
@@ -53,7 +48,7 @@ if __name__ == "__main__":
     parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
     parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
     parser.add_argument('--seed', default=random.getrandbits(19),
-                        type=long, help='Random seed')
+                        type=int, help='Random seed')
     args = parser.parse_args()

     conf = SparkConf().setAppName("GMM")

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Trees Classification Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Trees Regression Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.linalg import Matrices, Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.stat import Statistics

@@ -18,8 +18,6 @@
 """
 Isotonic Regression Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 import math

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from numpy import array
 from math import sqrt

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.stat import KernelDensity

@@ -20,8 +20,6 @@ A K-means clustering program using MLlib.

 This example requires NumPy (http://www.numpy.org/).
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.clustering import LDA, LDAModel

Some files were not shown because too many files have changed in this diff.