[SPARK-32138] Drop Python 2.7, 3.4 and 3.5
### What changes were proposed in this pull request? This PR aims to drop support for Python 2.7, 3.4 and 3.5. Roughly speaking, it removes all the widely known Python 2 compatibility workarounds, such as `sys.version` comparisons and `__future__` imports. It also removes Python 2-specific code in Spark, such as `ArrayConstructor`. ### Why are the changes needed? 1. Drop support for EOL Python versions. 2. Reduce maintenance overhead and remove some legacy code and hacks for Python 2. 3. PyPy2 has a critical bug that causes a flaky test, SPARK-28358, according to my testing and investigation. 4. Users can use Python type hints with Pandas UDFs without having to think about the Python version. 5. Users can leverage a single, up-to-date cloudpickle, https://github.com/apache/spark/pull/28950. With Python 3.8+ it can also leverage C pickle. ### Does this PR introduce _any_ user-facing change? Yes, users cannot use Python 2.7, 3.4 and 3.5 in the upcoming Spark version. ### How was this patch tested? Manually tested and also tested in Jenkins. Closes #28957 from HyukjinKwon/SPARK-32138. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
parent
90ac9f975b
commit
4ad9bfd53b
3
.github/workflows/master.yml
vendored
3
.github/workflows/master.yml
vendored
|
@ -133,7 +133,8 @@ jobs:
|
|||
architecture: x64
|
||||
- name: Install Python 3.6
|
||||
uses: actions/setup-python@v2
|
||||
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
|
||||
# Yarn has a Python specific test too, for example, YarnClusterSuite.
|
||||
if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
|
||||
with:
|
||||
python-version: 3.6
|
||||
architecture: x64
|
||||
|
|
|
@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging {
|
|||
}
|
||||
}
|
||||
}
|
||||
// Unpickle array.array generated by Python 2.6
|
||||
class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
|
||||
// /* Description of types */
|
||||
// static struct arraydescr descriptors[] = {
|
||||
// {'c', sizeof(char), c_getitem, c_setitem},
|
||||
// {'b', sizeof(char), b_getitem, b_setitem},
|
||||
// {'B', sizeof(char), BB_getitem, BB_setitem},
|
||||
// #ifdef Py_USING_UNICODE
|
||||
// {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
|
||||
// #endif
|
||||
// {'h', sizeof(short), h_getitem, h_setitem},
|
||||
// {'H', sizeof(short), HH_getitem, HH_setitem},
|
||||
// {'i', sizeof(int), i_getitem, i_setitem},
|
||||
// {'I', sizeof(int), II_getitem, II_setitem},
|
||||
// {'l', sizeof(long), l_getitem, l_setitem},
|
||||
// {'L', sizeof(long), LL_getitem, LL_setitem},
|
||||
// {'f', sizeof(float), f_getitem, f_setitem},
|
||||
// {'d', sizeof(double), d_getitem, d_setitem},
|
||||
// {'\0', 0, 0, 0} /* Sentinel */
|
||||
// };
|
||||
val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
|
||||
Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
|
||||
'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
|
||||
)
|
||||
} else {
|
||||
Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
|
||||
'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
|
||||
)
|
||||
}
|
||||
override def construct(args: Array[Object]): Object = {
|
||||
if (args.length == 1) {
|
||||
construct(args ++ Array(""))
|
||||
} else if (args.length == 2 && args(1).isInstanceOf[String]) {
|
||||
val typecode = args(0).asInstanceOf[String].charAt(0)
|
||||
// This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
|
||||
val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
|
||||
if (typecode == 'c') {
|
||||
// It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
|
||||
// a string for array data instead of list as Python 2.7, and handles an array of
|
||||
// typecode 'c' as 1-byte character.
|
||||
val result = new Array[Char](data.length)
|
||||
var i = 0
|
||||
while (i < data.length) {
|
||||
result(i) = data(i).toChar
|
||||
i += 1
|
||||
}
|
||||
result
|
||||
} else {
|
||||
construct(typecode, machineCodes(typecode), data)
|
||||
}
|
||||
} else if (args.length == 2 && args(0) == "l") {
|
||||
// On Python 2, an array of typecode 'l' should be handled as long rather than int.
|
||||
val values = args(1).asInstanceOf[JArrayList[_]]
|
||||
val result = new Array[Long](values.size)
|
||||
var i = 0
|
||||
while (i < values.size) {
|
||||
result(i) = values.get(i).asInstanceOf[Number].longValue()
|
||||
i += 1
|
||||
}
|
||||
result
|
||||
} else {
|
||||
super.construct(args)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private var initialized = false
|
||||
// This should be called before trying to unpickle array.array from Python
|
||||
|
@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging {
|
|||
def initialize(): Unit = {
|
||||
synchronized{
|
||||
if (!initialized) {
|
||||
Unpickler.registerConstructor("array", "array", new ArrayConstructor())
|
||||
Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
|
||||
Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
|
||||
Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())
|
||||
|
|
|
@ -49,8 +49,6 @@ except ImportError:
|
|||
print("Install using 'sudo pip install unidecode'")
|
||||
sys.exit(-1)
|
||||
|
||||
if sys.version < '3':
|
||||
input = raw_input # noqa
|
||||
|
||||
# Contributors list file name
|
||||
contributors_file_name = "contributors.txt"
|
||||
|
@ -152,9 +150,6 @@ def get_commits(tag):
|
|||
if not is_valid_author(author):
|
||||
author = github_username
|
||||
# Guard against special characters
|
||||
try: # Python 2
|
||||
author = unicode(author, "UTF-8")
|
||||
except NameError: # Python 3
|
||||
author = str(author)
|
||||
author = unidecode.unidecode(author).strip()
|
||||
commit = Commit(_hash, author, title, pr_number)
|
||||
|
|
|
@ -22,11 +22,6 @@ import json
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
if sys.version < '3':
|
||||
from urllib2 import urlopen
|
||||
from urllib2 import Request
|
||||
from urllib2 import HTTPError
|
||||
else:
|
||||
from urllib.request import urlopen
|
||||
from urllib.request import Request
|
||||
from urllib.error import HTTPError
|
||||
|
|
|
@ -168,7 +168,15 @@ function sphinx_test {
|
|||
|
||||
# Check that the documentation builds acceptably, skip check if sphinx is not installed.
|
||||
if ! hash "$SPHINX_BUILD" 2> /dev/null; then
|
||||
echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
|
||||
echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
|
||||
echo
|
||||
return
|
||||
fi
|
||||
|
||||
# TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
|
||||
PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
|
||||
if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
|
||||
echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
|
||||
echo
|
||||
return
|
||||
fi
|
||||
|
|
|
@ -31,12 +31,6 @@ import re
|
|||
import subprocess
|
||||
import sys
|
||||
import traceback
|
||||
if sys.version < '3':
|
||||
input = raw_input # noqa
|
||||
from urllib2 import urlopen
|
||||
from urllib2 import Request
|
||||
from urllib2 import HTTPError
|
||||
else:
|
||||
from urllib.request import urlopen
|
||||
from urllib.request import Request
|
||||
from urllib.error import HTTPError
|
||||
|
|
|
@ -22,16 +22,10 @@ import sys
|
|||
import json
|
||||
import functools
|
||||
import subprocess
|
||||
if sys.version < '3':
|
||||
from urllib2 import urlopen
|
||||
from urllib2 import Request
|
||||
from urllib2 import HTTPError, URLError
|
||||
else:
|
||||
from urllib.request import urlopen
|
||||
from urllib.request import Request
|
||||
from urllib.error import HTTPError, URLError
|
||||
|
||||
|
||||
from sparktestsupport import SPARK_HOME, ERROR_CODES
|
||||
from sparktestsupport.shellutils import run_cmd
|
||||
|
||||
|
|
|
@ -24,8 +24,7 @@
|
|||
# Moved functools import to the top of the file.
|
||||
# Changed assert to a ValueError.
|
||||
# Changed iter[items|keys] to [items|keys], for python 3
|
||||
# compatibility. I don't think it matters for python 2 these are
|
||||
# now lists instead of iterables.
|
||||
# compatibility.
|
||||
# Copy the input so as to leave it unmodified.
|
||||
# Renamed function from toposort2 to toposort.
|
||||
# Handle empty input.
|
||||
|
|
|
@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
|
|||
</tr>
|
||||
<tr>
|
||||
<td><code>PYSPARK_PYTHON</code></td>
|
||||
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python2.7</code> if available, otherwise <code>python</code>).
|
||||
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python3</code> if available, otherwise <code>python</code>).
|
||||
Property <code>spark.pyspark.python</code> takes precedence if it is set.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
|
|
@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).
|
|||
|
||||
Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.
|
||||
|
||||
Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
|
||||
Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
|
||||
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
|
||||
Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
|
||||
For the Scala API, Spark {{site.SPARK_VERSION}}
|
||||
uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
|
||||
({{site.SCALA_BINARY_VERSION}}.x).
|
||||
|
|
|
@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;
|
|||
|
||||
<div data-lang="python" markdown="1">
|
||||
|
||||
Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
|
||||
Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
|
||||
so C libraries like NumPy can be used. It also works with PyPy 2.3+.
|
||||
|
||||
Note that Python 2 support is deprecated as of Spark 3.0.0.
|
||||
Support for Python 2, 3.4 and 3.5 was removed in Spark 3.1.0.
|
||||
|
||||
Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:
|
||||
|
||||
|
@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
|
|||
you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:
|
||||
|
||||
{% highlight bash %}
|
||||
$ PYSPARK_PYTHON=python3.4 bin/pyspark
|
||||
$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
|
||||
$ PYSPARK_PYTHON=python3.8 bin/pyspark
|
||||
$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
|
@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar
|
|||
|
||||
You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.
|
||||
|
||||
After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
|
||||
After the Jupyter Notebook server is launched, you can create a new notebook from
|
||||
the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
|
||||
your notebook before you start to try Spark from the Jupyter notebook.
|
||||
|
||||
|
@ -447,7 +447,7 @@ Writables are automatically converted:
|
|||
|
||||
<table class="table">
|
||||
<tr><th>Writable Type</th><th>Python Type</th></tr>
|
||||
<tr><td>Text</td><td>unicode str</td></tr>
|
||||
<tr><td>Text</td><td>str</td></tr>
|
||||
<tr><td>IntWritable</td><td>int</td></tr>
|
||||
<tr><td>FloatWritable</td><td>float</td></tr>
|
||||
<tr><td>DoubleWritable</td><td>float</td></tr>
|
||||
|
|
|
@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use.
|
|||
|
||||
This example requires numpy (http://www.numpy.org/)
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
|
|
@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \
|
|||
{u'favorite_color': None, u'name': u'Alyssa'}
|
||||
{u'favorite_color': u'red', u'name': u'Ben'}
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
from functools import reduce
|
||||
|
|
|
@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py.
|
|||
|
||||
This example requires NumPy (http://www.numpy.org/).
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
|
|
@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations.
|
|||
In practice, one may prefer to use the LogisticRegression algorithm in
|
||||
ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating aft survival regression.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.regression import AFTSurvivalRegression
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,12 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
if sys.version >= '3':
|
||||
long = int
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
# $example on$
|
||||
|
@ -39,7 +33,7 @@ if __name__ == "__main__":
|
|||
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
|
||||
parts = lines.map(lambda row: row.value.split("::"))
|
||||
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
|
||||
rating=float(p[2]), timestamp=long(p[3])))
|
||||
rating=float(p[2]), timestamp=int(p[3])))
|
||||
ratings = spark.createDataFrame(ratingsRDD)
|
||||
(training, test) = ratings.randomSplit([0.8, 0.2])
|
||||
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for ANOVASelector.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import ANOVASelector
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for ANOVA testing.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/anova_test_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Binarizer
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.clustering import BisectingKMeans
|
||||
from pyspark.ml.evaluation import ClusteringEvaluator
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import BucketedRandomProjectionLSH
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Bucketizer
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import ChiSqSelector
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for computing correlation matrix.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/correlation_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
from pyspark.ml.stat import Correlation
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import CountVectorizer
|
||||
|
|
|
@ -22,8 +22,6 @@ Run with:
|
|||
|
||||
bin/spark-submit examples/src/main/python/ml/cross_validator.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.classification import LogisticRegression
|
||||
|
|
|
@ -19,8 +19,6 @@
|
|||
An example of how to use DataFrame for ML. Run with::
|
||||
bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input_path>
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import DCT
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Decision Tree Classification Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.classification import DecisionTreeClassifier
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Decision Tree Regression Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.regression import DecisionTreeRegressor
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import ElementwiseProduct
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Estimator Transformer Param Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
from pyspark.ml.classification import LogisticRegression
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import FeatureHasher
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
FMClassifier Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.classification import FMClassifier
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
FMRegressor Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.regression import FMRegressor
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for FValueSelector.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import FValueSelector
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for FValue testing.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM).
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.clustering import GaussianMixture
|
||||
# $example off$
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating generalized linear regression.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.regression import GeneralizedLinearRegression
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Gradient Boosted Tree Classifier Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.classification import GBTClassifier
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Gradient Boosted Tree Regressor Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.regression import GBTRegressor
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import IndexToString, StringIndexer
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Interaction, VectorAssembler
|
||||
# $example off$
|
||||
|
|
|
@ -21,8 +21,6 @@ Isotonic Regression Example.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.regression import IsotonicRegression
|
||||
# $example off$
|
||||
|
|
|
@ -22,8 +22,6 @@ Run with:
|
|||
|
||||
This example requires NumPy (http://www.numpy.org/).
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.clustering import KMeans
|
||||
from pyspark.ml.evaluation import ClusteringEvaluator
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating LDA.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/lda_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.clustering import LDA
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.regression import LinearRegression
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import LinearSVC
|
||||
# $example off$
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import LogisticRegression
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import LogisticRegression
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import MaxAbsScaler
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -20,8 +20,6 @@ An example demonstrating MinHashLSH.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import MinHashLSH
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import MinMaxScaler
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import LogisticRegression
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import MultilayerPerceptronClassifier
|
||||
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import NGram
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import NaiveBayes
|
||||
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Normalizer
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -21,8 +21,6 @@ using Logistic Regression as the base classifier.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.classification import LogisticRegression, OneVsRest
|
||||
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import OneHotEncoder
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import PCA
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import PolynomialExpansion
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import QuantileDiscretizer
|
||||
# $example off$
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Random Forest Classifier Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.classification import RandomForestClassifier
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Random Forest Regressor Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml import Pipeline
|
||||
from pyspark.ml.regression import RandomForestRegressor
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import RFormula
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import RobustScaler
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import SQLTransformer
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import StandardScaler
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import StopWordsRemover
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import StringIndexer
|
||||
# $example off$
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for summarizer.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/summarizer_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.stat import Summarizer
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Tokenizer, RegexTokenizer
|
||||
from pyspark.sql.functions import col, udf
|
||||
|
|
|
@ -20,8 +20,6 @@ An example for VarianceThresholdSelector.
|
|||
Run with:
|
||||
bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
# $example on$
|
||||
from pyspark.ml.feature import VarianceThresholdSelector
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
from pyspark.ml.feature import VectorAssembler
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import VectorIndexer
|
||||
# $example off$
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.linalg import Vectors
|
||||
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import VectorSlicer
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Word2Vec
|
||||
# $example off$
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
"""
|
||||
Binary Classification Metrics Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from numpy import array
|
||||
# $example off$
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Correlations using MLlib.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
from pyspark import SparkContext
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pyspark import SparkContext
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Decision Tree Classification Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Decision Tree Regression Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.feature import ElementwiseProduct
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from numpy import array
|
||||
# $example off$
|
||||
|
|
|
@ -18,11 +18,6 @@
|
|||
"""
|
||||
A Gaussian Mixture Model clustering program using MLlib.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
if sys.version >= '3':
|
||||
long = int
|
||||
|
||||
import random
|
||||
import argparse
|
||||
|
@ -53,7 +48,7 @@ if __name__ == "__main__":
|
|||
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
|
||||
parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
|
||||
parser.add_argument('--seed', default=random.getrandbits(19),
|
||||
type=long, help='Random seed')
|
||||
type=int, help='Random seed')
|
||||
args = parser.parse_args()
|
||||
|
||||
conf = SparkConf().setAppName("GMM")
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Gradient Boosted Trees Classification Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Gradient Boosted Trees Regression Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.linalg import Matrices, Vectors
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.stat import Statistics
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
"""
|
||||
Isotonic Regression Example.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
import math
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# $example on$
|
||||
from numpy import array
|
||||
from math import sqrt
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.stat import KernelDensity
|
||||
|
|
|
@ -20,8 +20,6 @@ A K-means clustering program using MLlib.
|
|||
|
||||
This example requires NumPy (http://www.numpy.org/).
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
|
|
@ -15,8 +15,6 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.clustering import LDA, LDAModel
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue