[SPARK-32138] Drop Python 2.7, 3.4 and 3.5

### What changes were proposed in this pull request?

This PR aims to drop Python 2.7, 3.4 and 3.5.

Roughly speaking, it removes the widely known Python 2 compatibility workarounds, such as `sys.version` comparisons and `__future__` imports. It also removes code dedicated to Python 2, such as `ArrayConstructor` in Spark.
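
For illustration, the kind of workaround being removed looks like the following sketch; the same urllib import branching appears in several dev scripts in the diff below:

```python
import sys

# Old pattern (removed by this PR): branch on the interpreter version.
if sys.version < '3':
    from urllib2 import urlopen, Request, HTTPError  # Python 2 only
else:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError

# New pattern: the Python 3 imports are used unconditionally.
from urllib.request import urlopen, Request
from urllib.error import HTTPError
```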

### Why are the changes needed?

 1. Drop support for EOL Python versions.
 2. Reduce maintenance overhead and remove legacy code and hacks for Python 2.
 3. PyPy2 has a critical bug that causes a flaky test (SPARK-28358), based on my testing and investigation.
 4. Users can use Python type hints with pandas UDFs without worrying about the Python version (see the sketch after this list).
 5. Users can leverage a single, up-to-date cloudpickle (https://github.com/apache/spark/pull/28950). With Python 3.8+, it can also leverage the C implementation of pickle.
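
For item 4, a minimal sketch of a pandas UDF defined with Python type hints, the Spark 3.0+ style that presumes Python 3.6+; it assumes pyarrow is installed, and the `plus_one` name and sample data are made up for illustration:

```python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.getOrCreate()

@pandas_udf("long")
def plus_one(s: pd.Series) -> pd.Series:
    # The pd.Series -> pd.Series type hints tell Spark this is a Series-to-Series UDF;
    # no Python-2-era fallback is needed because only Python 3.6+ is supported.
    return s + 1

df = spark.range(3)  # column "id": 0, 1, 2
df.select(plus_one(df["id"])).show()
```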

### Does this PR introduce _any_ user-facing change?

Yes. Users can no longer use Python 2.7, 3.4 or 3.5 in the upcoming Spark version.

### How was this patch tested?

Manually tested and also tested in Jenkins.

Closes #28957 from HyukjinKwon/SPARK-32138.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Commit: 4ad9bfd53b (parent: 90ac9f975b)
Committed: 2020-07-14 11:22:44 +09:00 by HyukjinKwon
225 changed files with 735 additions and 2033 deletions


@ -133,7 +133,8 @@ jobs:
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
# Yarn has a Python specific test too, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64


@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging {
}
}
}
// Unpickle array.array generated by Python 2.6
class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
// /* Description of types */
// static struct arraydescr descriptors[] = {
// {'c', sizeof(char), c_getitem, c_setitem},
// {'b', sizeof(char), b_getitem, b_setitem},
// {'B', sizeof(char), BB_getitem, BB_setitem},
// #ifdef Py_USING_UNICODE
// {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
// #endif
// {'h', sizeof(short), h_getitem, h_setitem},
// {'H', sizeof(short), HH_getitem, HH_setitem},
// {'i', sizeof(int), i_getitem, i_setitem},
// {'I', sizeof(int), II_getitem, II_setitem},
// {'l', sizeof(long), l_getitem, l_setitem},
// {'L', sizeof(long), LL_getitem, LL_setitem},
// {'f', sizeof(float), f_getitem, f_setitem},
// {'d', sizeof(double), d_getitem, d_setitem},
// {'\0', 0, 0, 0} /* Sentinel */
// };
val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
)
} else {
Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
)
}
override def construct(args: Array[Object]): Object = {
if (args.length == 1) {
construct(args ++ Array(""))
} else if (args.length == 2 && args(1).isInstanceOf[String]) {
val typecode = args(0).asInstanceOf[String].charAt(0)
// This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
if (typecode == 'c') {
// It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
// a string for array data instead of list as Python 2.7, and handles an array of
// typecode 'c' as 1-byte character.
val result = new Array[Char](data.length)
var i = 0
while (i < data.length) {
result(i) = data(i).toChar
i += 1
}
result
} else {
construct(typecode, machineCodes(typecode), data)
}
} else if (args.length == 2 && args(0) == "l") {
// On Python 2, an array of typecode 'l' should be handled as long rather than int.
val values = args(1).asInstanceOf[JArrayList[_]]
val result = new Array[Long](values.size)
var i = 0
while (i < values.size) {
result(i) = values.get(i).asInstanceOf[Number].longValue()
i += 1
}
result
} else {
super.construct(args)
}
}
}
private var initialized = false
// This should be called before trying to unpickle array.array from Python
@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging {
def initialize(): Unit = {
synchronized{
if (!initialized) {
Unpickler.registerConstructor("array", "array", new ArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())


@ -49,8 +49,6 @@ except ImportError:
print("Install using 'sudo pip install unidecode'")
sys.exit(-1)
if sys.version < '3':
input = raw_input # noqa
# Contributors list file name
contributors_file_name = "contributors.txt"
@ -152,10 +150,7 @@ def get_commits(tag):
if not is_valid_author(author):
author = github_username
# Guard against special characters
try: # Python 2
author = unicode(author, "UTF-8")
except NameError: # Python 3
author = str(author)
author = str(author)
author = unidecode.unidecode(author).strip()
commit = Commit(_hash, author, title, pr_number)
commits.append(commit)


@ -22,14 +22,9 @@ import json
import os
import re
import sys
if sys.version < '3':
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
try:
import jira.client


@ -168,7 +168,15 @@ function sphinx_test {
# Check that the documentation builds acceptably, skip check if sphinx is not installed.
if ! hash "$SPHINX_BUILD" 2> /dev/null; then
echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
echo
return
fi
# TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
echo
return
fi


@ -31,15 +31,9 @@ import re
import subprocess
import sys
import traceback
if sys.version < '3':
input = raw_input # noqa
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
try:
import jira.client


@ -22,15 +22,9 @@ import sys
import json
import functools
import subprocess
if sys.version < '3':
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError, URLError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError, URLError
from sparktestsupport import SPARK_HOME, ERROR_CODES
from sparktestsupport.shellutils import run_cmd


@ -24,8 +24,7 @@
# Moved functools import to the top of the file.
# Changed assert to a ValueError.
# Changed iter[items|keys] to [items|keys], for python 3
# compatibility. I don't think it matters for python 2 these are
# now lists instead of iterables.
# compatibility.
# Copy the input so as to leave it unmodified.
# Renamed function from toposort2 to toposort.
# Handle empty input.


@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
</tr>
<tr>
<td><code>PYSPARK_PYTHON</code></td>
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python2.7</code> if available, otherwise <code>python</code>).
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python3</code> if available, otherwise <code>python</code>).
Property <code>spark.pyspark.python</code> take precedence if it is set</td>
</tr>
<tr>


@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).
Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.
Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
For the Scala API, Spark {{site.SPARK_VERSION}}
uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
({{site.SCALA_BINARY_VERSION}}.x).


@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;
<div data-lang="python" markdown="1">
Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
so C libraries like NumPy can be used. It also works with PyPy 2.3+.
Note that Python 2 support is deprecated as of Spark 3.0.0.
Python 2, 3.4 and 3.5 supports were removed in Spark 3.1.0.
Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:
@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:
{% highlight bash %}
$ PYSPARK_PYTHON=python3.4 bin/pyspark
$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
$ PYSPARK_PYTHON=python3.8 bin/pyspark
$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
{% endhighlight %}
</div>
@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar
You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.
After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
After the Jupyter Notebook server is launched, you can create a new notebook from
the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
your notebook before you start to try Spark from the Jupyter notebook.
@ -447,7 +447,7 @@ Writables are automatically converted:
<table class="table">
<tr><th>Writable Type</th><th>Python Type</th></tr>
<tr><td>Text</td><td>unicode str</td></tr>
<tr><td>Text</td><td>str</td></tr>
<tr><td>IntWritable</td><td>int</td></tr>
<tr><td>FloatWritable</td><td>float</td></tr>
<tr><td>DoubleWritable</td><td>float</td></tr>


@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use.
This example requires numpy (http://www.numpy.org/)
"""
from __future__ import print_function
import sys
import numpy as np


@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \
{u'favorite_color': None, u'name': u'Alyssa'}
{u'favorite_color': u'red', u'name': u'Ben'}
"""
from __future__ import print_function
import sys
from functools import reduce


@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py.
This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function
import sys
import numpy as np


@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations.
In practice, one may prefer to use the LogisticRegression algorithm in
ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
"""
from __future__ import print_function
import sys
import numpy as np


@ -20,8 +20,6 @@ An example demonstrating aft survival regression.
Run with:
bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors


@ -15,12 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
import sys
if sys.version >= '3':
long = int
from pyspark.sql import SparkSession
# $example on$
@ -39,7 +33,7 @@ if __name__ == "__main__":
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
rating=float(p[2]), timestamp=long(p[3])))
rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])


@ -20,8 +20,6 @@ An example for ANOVASelector.
Run with:
bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ANOVASelector


@ -20,8 +20,6 @@ An example for ANOVA testing.
Run with:
bin/spark-submit examples/src/main/python/ml/anova_test_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer


@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering.
Run with:
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator


@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH.
Run with:
bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer


@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing.
Run with:
bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector


@ -20,8 +20,6 @@ An example for computing correlation matrix.
Run with:
bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer


@ -22,8 +22,6 @@ Run with:
bin/spark-submit examples/src/main/python/ml/cross_validator.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression


@ -19,8 +19,6 @@
An example of how to use DataFrame for ML. Run with::
bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input_path>
"""
from __future__ import print_function
import os
import sys
import tempfile


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors


@ -18,8 +18,6 @@
"""
Decision Tree Classification Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier


@ -18,8 +18,6 @@
"""
Decision Tree Regression Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors


@ -18,8 +18,6 @@
"""
Estimator Transformer Param Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FeatureHasher


@ -18,8 +18,6 @@
"""
FMClassifier Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import FMClassifier


@ -18,8 +18,6 @@
"""
FMRegressor Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import FMRegressor


@ -20,8 +20,6 @@ An example for FValueSelector.
Run with:
bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FValueSelector


@ -20,8 +20,6 @@ An example for FValue testing.
Run with:
bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors


@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$


@ -20,8 +20,6 @@ An example demonstrating generalized linear regression.
Run with:
bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.regression import GeneralizedLinearRegression


@ -18,8 +18,6 @@
"""
Gradient Boosted Tree Classifier Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier


@ -18,8 +18,6 @@
"""
Gradient Boosted Tree Regressor Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import Interaction, VectorAssembler
# $example off$


@ -21,8 +21,6 @@ Isotonic Regression Example.
Run with:
bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.regression import IsotonicRegression
# $example off$


@ -22,8 +22,6 @@ Run with:
This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function
# $example on$
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


@ -20,8 +20,6 @@ An example demonstrating LDA.
Run with:
bin/spark-submit examples/src/main/python/ml/lda_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.clustering import LDA
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.regression import LinearRegression
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.classification import LinearSVC
# $example off$


@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary.
Run with:
bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors


@ -20,8 +20,6 @@ An example demonstrating MinHashLSH.
Run with:
bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import NGram
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors


@ -21,8 +21,6 @@ using Logistic Regression as the base classifier.
Run with:
bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
"""
from __future__ import print_function
# $example on$
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import OneHotEncoder
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import QuantileDiscretizer
# $example off$


@ -18,8 +18,6 @@
"""
Random Forest Classifier Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier


@ -18,8 +18,6 @@
"""
Random Forest Regressor Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import RFormula
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import RobustScaler
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import StringIndexer
# $example off$


@ -20,8 +20,6 @@ An example for summarizer.
Run with:
bin/spark-submit examples/src/main/python/ml/summarizer_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf


@ -20,8 +20,6 @@ An example for VarianceThresholdSelector.
Run with:
bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import VarianceThresholdSelector


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from pyspark.ml.feature import Word2Vec
# $example off$


@ -17,7 +17,6 @@
"""
Binary Classification Metrics Example.
"""
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from numpy import array
# $example off$


@ -18,8 +18,6 @@
"""
Correlations using MLlib.
"""
from __future__ import print_function
import sys
from pyspark import SparkContext


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
import numpy as np
from pyspark import SparkContext


@ -18,8 +18,6 @@
"""
Decision Tree Classification Example.
"""
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel


@ -18,8 +18,6 @@
"""
Decision Tree Regression Example.
"""
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import ElementwiseProduct


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from numpy import array
# $example off$


@ -18,11 +18,6 @@
"""
A Gaussian Mixture Model clustering program using MLlib.
"""
from __future__ import print_function
import sys
if sys.version >= '3':
long = int
import random
import argparse
@ -53,7 +48,7 @@ if __name__ == "__main__":
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
parser.add_argument('--seed', default=random.getrandbits(19),
type=long, help='Random seed')
type=int, help='Random seed')
args = parser.parse_args()
conf = SparkConf().setAppName("GMM")


@ -18,8 +18,6 @@
"""
Gradient Boosted Trees Classification Example.
"""
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel


@ -18,8 +18,6 @@
"""
Gradient Boosted Trees Regression Example.
"""
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics


@ -18,8 +18,6 @@
"""
Isotonic Regression Example.
"""
from __future__ import print_function
from pyspark import SparkContext
# $example on$
import math


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
# $example on$
from numpy import array
from math import sqrt


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import KernelDensity


@ -20,8 +20,6 @@ A K-means clustering program using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function
import sys
import numpy as np


@ -15,8 +15,6 @@
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import LDA, LDAModel

Some files were not shown because too many files have changed in this diff.