[SPARK-32138] Drop Python 2.7, 3.4 and 3.5

### What changes were proposed in this pull request?

This PR aims to drop Python 2.7, 3.4 and 3.5.

Roughly speaking, this PR removes the widely known Python 2 compatibility workarounds, such as `sys.version` comparisons and `__future__` imports. It also removes code dedicated to Python 2, such as `ArrayConstructor` in Spark.
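
For reference, the removed workarounds typically looked like the following (a representative sketch mirroring patterns that appear in the diff below, not an exhaustive list):

```python
from __future__ import print_function  # a no-op on Python 3; removed everywhere

import sys

# Version-gated imports: Python 3 split urllib2 into urllib.request/urllib.error.
if sys.version < '3':
    from urllib2 import urlopen, Request, HTTPError
else:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError

# Python 3 has no separate `long` type, so scripts aliased it to `int`.
if sys.version >= '3':
    long = int
```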

### Why are the changes needed?

 1. Drop support for EOL (end-of-life) Python versions.
 2. Reduce maintenance overhead by removing legacy code and hacks for Python 2.
 3. PyPy2 has a critical bug that causes a flaky test (SPARK-28358), given my testing and investigation.
 4. Users can use Python type hints with Pandas UDFs without thinking about the Python version (see the sketch after this list).
 5. Users can leverage the latest cloudpickle (https://github.com/apache/spark/pull/28950). With Python 3.8+, it can also leverage the C implementation of pickle.
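
To illustrate point 4, here is a minimal sketch of a type-hinted Pandas UDF in the new Python-3-only style, assuming PySpark 3.0+ with pandas and PyArrow installed (names are illustrative):

```python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.appName("type-hinted-pandas-udf").getOrCreate()

# The Spark return type comes from the decorator argument; the pandas
# input/output types come from the Python type hints, which require Python 3.
@pandas_udf("long")
def plus_one(s: pd.Series) -> pd.Series:
    return s + 1

spark.range(3).select(plus_one("id")).show()
```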

### Does this PR introduce _any_ user-facing change?

Yes. Users can no longer use Python 2.7, 3.4, or 3.5 with the upcoming Spark version.

### How was this patch tested?

Manually tested and also tested in Jenkins.

Closes #28957 from HyukjinKwon/SPARK-32138.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
commit 4ad9bfd53b (parent 90ac9f975b)
HyukjinKwon committed 2020-07-14 11:22:44 +09:00
225 changed files with 735 additions and 2033 deletions

@@ -133,7 +133,8 @@ jobs:
         architecture: x64
     - name: Install Python 3.6
       uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      # Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
         python-version: 3.6
         architecture: x64

@@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging {
       }
     }
   }
-
-  // Unpickle array.array generated by Python 2.6
-  class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
-    // /* Description of types */
-    // static struct arraydescr descriptors[] = {
-    //   {'c', sizeof(char), c_getitem, c_setitem},
-    //   {'b', sizeof(char), b_getitem, b_setitem},
-    //   {'B', sizeof(char), BB_getitem, BB_setitem},
-    //   #ifdef Py_USING_UNICODE
-    //   {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
-    //   #endif
-    //   {'h', sizeof(short), h_getitem, h_setitem},
-    //   {'H', sizeof(short), HH_getitem, HH_setitem},
-    //   {'i', sizeof(int), i_getitem, i_setitem},
-    //   {'I', sizeof(int), II_getitem, II_setitem},
-    //   {'l', sizeof(long), l_getitem, l_setitem},
-    //   {'L', sizeof(long), LL_getitem, LL_setitem},
-    //   {'f', sizeof(float), f_getitem, f_setitem},
-    //   {'d', sizeof(double), d_getitem, d_setitem},
-    //   {'\0', 0, 0, 0} /* Sentinel */
-    // };
-    val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
-      Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
-        'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
-      )
-    } else {
-      Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
-        'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
-      )
-    }
-
-    override def construct(args: Array[Object]): Object = {
-      if (args.length == 1) {
-        construct(args ++ Array(""))
-      } else if (args.length == 2 && args(1).isInstanceOf[String]) {
-        val typecode = args(0).asInstanceOf[String].charAt(0)
-        // This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
-        val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
-        if (typecode == 'c') {
-          // It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
-          // a string for array data instead of list as Python 2.7, and handles an array of
-          // typecode 'c' as 1-byte character.
-          val result = new Array[Char](data.length)
-          var i = 0
-          while (i < data.length) {
-            result(i) = data(i).toChar
-            i += 1
-          }
-          result
-        } else {
-          construct(typecode, machineCodes(typecode), data)
-        }
-      } else if (args.length == 2 && args(0) == "l") {
-        // On Python 2, an array of typecode 'l' should be handled as long rather than int.
-        val values = args(1).asInstanceOf[JArrayList[_]]
-        val result = new Array[Long](values.size)
-        var i = 0
-        while (i < values.size) {
-          result(i) = values.get(i).asInstanceOf[Number].longValue()
-          i += 1
-        }
-        result
-      } else {
-        super.construct(args)
-      }
-    }
-  }

   private var initialized = false
   // This should be called before trying to unpickle array.array from Python
@@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging {
   def initialize(): Unit = {
     synchronized{
       if (!initialized) {
-        Unpickler.registerConstructor("array", "array", new ArrayConstructor())
         Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
         Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
         Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())

@@ -49,8 +49,6 @@ except ImportError:
     print("Install using 'sudo pip install unidecode'")
     sys.exit(-1)

-if sys.version < '3':
-    input = raw_input  # noqa

 # Contributors list file name
 contributors_file_name = "contributors.txt"
@@ -152,10 +150,7 @@ def get_commits(tag):
         if not is_valid_author(author):
             author = github_username
         # Guard against special characters
-        try:  # Python 2
-            author = unicode(author, "UTF-8")
-        except NameError:  # Python 3
-            author = str(author)
+        author = str(author)
         author = unidecode.unidecode(author).strip()
         commit = Commit(_hash, author, title, pr_number)
         commits.append(commit)

@@ -22,14 +22,9 @@ import json
 import os
 import re
 import sys
-if sys.version < '3':
-    from urllib2 import urlopen
-    from urllib2 import Request
-    from urllib2 import HTTPError
-else:
-    from urllib.request import urlopen
-    from urllib.request import Request
-    from urllib.error import HTTPError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError

 try:
     import jira.client

@@ -168,7 +168,15 @@ function sphinx_test {
   # Check that the documentation builds acceptably, skip check if sphinx is not installed.
   if ! hash "$SPHINX_BUILD" 2> /dev/null; then
-    echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
+    echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
+    echo
+    return
+  fi
+
+  # TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
+  PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
+  if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
+    echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
     echo
     return
   fi

@@ -31,15 +31,9 @@ import re
 import subprocess
 import sys
 import traceback
-if sys.version < '3':
-    input = raw_input  # noqa
-    from urllib2 import urlopen
-    from urllib2 import Request
-    from urllib2 import HTTPError
-else:
-    from urllib.request import urlopen
-    from urllib.request import Request
-    from urllib.error import HTTPError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError

 try:
     import jira.client

@@ -22,15 +22,9 @@ import sys
 import json
 import functools
 import subprocess
-
-if sys.version < '3':
-    from urllib2 import urlopen
-    from urllib2 import Request
-    from urllib2 import HTTPError, URLError
-else:
-    from urllib.request import urlopen
-    from urllib.request import Request
-    from urllib.error import HTTPError, URLError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError, URLError

 from sparktestsupport import SPARK_HOME, ERROR_CODES
 from sparktestsupport.shellutils import run_cmd

@@ -24,8 +24,7 @@
 # Moved functools import to the top of the file.
 # Changed assert to a ValueError.
 # Changed iter[items|keys] to [items|keys], for python 3
-# compatibility. I don't think it matters for python 2 these are
-# now lists instead of iterables.
+# compatibility.
 # Copy the input so as to leave it unmodified.
 # Renamed function from toposort2 to toposort.
 # Handle empty input.

@@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
 </tr>
 <tr>
   <td><code>PYSPARK_PYTHON</code></td>
-  <td>Python binary executable to use for PySpark in both driver and workers (default is <code>python2.7</code> if available, otherwise <code>python</code>).
+  <td>Python binary executable to use for PySpark in both driver and workers (default is <code>python3</code> if available, otherwise <code>python</code>).
   Property <code>spark.pyspark.python</code> take precedence if it is set</td>
 </tr>
 <tr>

@@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).
 Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.

-Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
+Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
 Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
-Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
 For the Scala API, Spark {{site.SPARK_VERSION}}
 uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
 ({{site.SCALA_BINARY_VERSION}}.x).

@@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;

 <div data-lang="python" markdown="1">

-Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
+Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
 so C libraries like NumPy can be used. It also works with PyPy 2.3+.

-Note that Python 2 support is deprecated as of Spark 3.0.0.
+Python 2, 3.4 and 3.5 supports were removed in Spark 3.1.0.

 Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:

@@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
 you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:

 {% highlight bash %}
-$ PYSPARK_PYTHON=python3.4 bin/pyspark
-$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
+$ PYSPARK_PYTHON=python3.8 bin/pyspark
+$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
 {% endhighlight %}

 </div>

@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspark

 You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.

-After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
+After the Jupyter Notebook server is launched, you can create a new notebook from
 the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
 your notebook before you start to try Spark from the Jupyter notebook.

@@ -447,7 +447,7 @@ Writables are automatically converted:

 <table class="table">
 <tr><th>Writable Type</th><th>Python Type</th></tr>
-<tr><td>Text</td><td>unicode str</td></tr>
+<tr><td>Text</td><td>str</td></tr>
 <tr><td>IntWritable</td><td>int</td></tr>
 <tr><td>FloatWritable</td><td>float</td></tr>
 <tr><td>DoubleWritable</td><td>float</td></tr>

@@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use.
 This example requires numpy (http://www.numpy.org/)
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \
 {u'favorite_color': None, u'name': u'Alyssa'}
 {u'favorite_color': u'red', u'name': u'Ben'}
 """
-from __future__ import print_function
-
 import sys

 from functools import reduce

@@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py.
 This example requires NumPy (http://www.numpy.org/).
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations.
 In practice, one may prefer to use the LogisticRegression algorithm in
 ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -20,8 +20,6 @@ An example demonstrating aft survival regression.
 Run with:
   bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.regression import AFTSurvivalRegression
 from pyspark.ml.linalg import Vectors

@@ -15,12 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
-import sys
-if sys.version >= '3':
-    long = int
-
 from pyspark.sql import SparkSession
 # $example on$
@@ -39,7 +33,7 @@ if __name__ == "__main__":
     lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
     parts = lines.map(lambda row: row.value.split("::"))
     ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
-                                         rating=float(p[2]), timestamp=long(p[3])))
+                                         rating=float(p[2]), timestamp=int(p[3])))
     ratings = spark.createDataFrame(ratingsRDD)

     (training, test) = ratings.randomSplit([0.8, 0.2])

@@ -20,8 +20,6 @@ An example for ANOVASelector.
 Run with:
   bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import ANOVASelector

@@ -20,8 +20,6 @@ An example for ANOVA testing.
 Run with:
   bin/spark-submit examples/src/main/python/ml/anova_test_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import Binarizer

@@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering.
 Run with:
   bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import BisectingKMeans
 from pyspark.ml.evaluation import ClusteringEvaluator

@@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH.
 Run with:
   bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import BucketedRandomProjectionLSH
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import Bucketizer

@@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing.
 Run with:
   bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import ChiSqSelector

@@ -20,8 +20,6 @@ An example for computing correlation matrix.
 Run with:
   bin/spark-submit examples/src/main/python/ml/correlation_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.stat import Correlation

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import CountVectorizer

@@ -22,8 +22,6 @@ Run with:
   bin/spark-submit examples/src/main/python/ml/cross_validator.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import LogisticRegression

@@ -19,8 +19,6 @@
 An example of how to use DataFrame for ML. Run with::
     bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input_path>
 """
-from __future__ import print_function
-
 import os
 import sys
 import tempfile

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import DCT
 from pyspark.ml.linalg import Vectors

@@ -18,8 +18,6 @@
 """
 Decision Tree Classification Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import DecisionTreeClassifier

@@ -18,8 +18,6 @@
 """
 Decision Tree Regression Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import DecisionTreeRegressor

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import ElementwiseProduct
 from pyspark.ml.linalg import Vectors

@@ -18,8 +18,6 @@
 """
 Estimator Transformer Param Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.classification import LogisticRegression

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import FeatureHasher

@@ -18,8 +18,6 @@
 """
 FMClassifier Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import FMClassifier

@@ -18,8 +18,6 @@
 """
 FMRegressor Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import FMRegressor

@@ -20,8 +20,6 @@ An example for FValueSelector.
 Run with:
   bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import FValueSelector

@@ -20,8 +20,6 @@ An example for FValue testing.
 Run with:
   bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.linalg import Vectors

@@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM).
 Run with:
   bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import GaussianMixture
 # $example off$

@@ -20,8 +20,6 @@ An example demonstrating generalized linear regression.
 Run with:
   bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.regression import GeneralizedLinearRegression

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Tree Classifier Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import GBTClassifier

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Tree Regressor Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import GBTRegressor

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import IndexToString, StringIndexer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Interaction, VectorAssembler
 # $example off$

@@ -21,8 +21,6 @@ Isotonic Regression Example.
 Run with:
   bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.regression import IsotonicRegression
 # $example off$

@@ -22,8 +22,6 @@ Run with:

 This example requires NumPy (http://www.numpy.org/).
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import KMeans
 from pyspark.ml.evaluation import ClusteringEvaluator

@@ -20,8 +20,6 @@ An example demonstrating LDA.
 Run with:
   bin/spark-submit examples/src/main/python/ml/lda_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.clustering import LDA
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.regression import LinearRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LinearSVC
 # $example off$

@@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary.
 Run with:
   bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import MaxAbsScaler
 from pyspark.ml.linalg import Vectors

@@ -20,8 +20,6 @@ An example demonstrating MinHashLSH.
 Run with:
   bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import MinHashLSH
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import MinMaxScaler
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import MultilayerPerceptronClassifier
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import NGram
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import NaiveBayes
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Normalizer
 from pyspark.ml.linalg import Vectors

@@ -21,8 +21,6 @@ using Logistic Regression as the base classifier.
 Run with:
   bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.classification import LogisticRegression, OneVsRest
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import OneHotEncoder
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import PCA
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import PolynomialExpansion
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import QuantileDiscretizer
 # $example off$

@@ -18,8 +18,6 @@
 """
 Random Forest Classifier Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import RandomForestClassifier

@@ -18,8 +18,6 @@
 """
 Random Forest Regressor Example.
 """
-from __future__ import print_function
-
 # $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.regression import RandomForestRegressor

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import RFormula
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import RobustScaler
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import SQLTransformer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import StandardScaler
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import StopWordsRemover
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import StringIndexer
 # $example off$

@@ -20,8 +20,6 @@ An example for summarizer.
 Run with:
   bin/spark-submit examples/src/main/python/ml/summarizer_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.stat import Summarizer

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Tokenizer, RegexTokenizer
 from pyspark.sql.functions import col, udf

@@ -20,8 +20,6 @@ An example for VarianceThresholdSelector.
 Run with:
   bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py
 """
-from __future__ import print_function
-
 from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import VarianceThresholdSelector

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.feature import VectorAssembler

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import VectorIndexer
 # $example off$

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import VectorSlicer
 from pyspark.ml.linalg import Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from pyspark.ml.feature import Word2Vec
 # $example off$

@@ -17,7 +17,6 @@
 """
 Binary Classification Metrics Example.
 """
-from __future__ import print_function
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import LogisticRegressionWithLBFGS

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from numpy import array
 # $example off$

@@ -18,8 +18,6 @@
 """
 Correlations using MLlib.
 """
-from __future__ import print_function
-
 import sys

 from pyspark import SparkContext

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 import numpy as np

 from pyspark import SparkContext

@@ -18,8 +18,6 @@
 """
 Decision Tree Classification Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

@@ -18,8 +18,6 @@
 """
 Decision Tree Regression Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.feature import ElementwiseProduct

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from numpy import array
 # $example off$

@@ -18,11 +18,6 @@
 """
 A Gaussian Mixture Model clustering program using MLlib.
 """
-from __future__ import print_function
-
-import sys
-if sys.version >= '3':
-    long = int
-
 import random
 import argparse
@@ -53,7 +48,7 @@ if __name__ == "__main__":
     parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
     parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
     parser.add_argument('--seed', default=random.getrandbits(19),
-                        type=long, help='Random seed')
+                        type=int, help='Random seed')
     args = parser.parse_args()

     conf = SparkConf().setAppName("GMM")

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Trees Classification Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

@@ -18,8 +18,6 @@
 """
 Gradient Boosted Trees Regression Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.linalg import Matrices, Vectors

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.stat import Statistics

@@ -18,8 +18,6 @@
 """
 Isotonic Regression Example.
 """
-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 import math

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 # $example on$
 from numpy import array
 from math import sqrt

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.stat import KernelDensity

@@ -20,8 +20,6 @@ A K-means clustering program using MLlib.

 This example requires NumPy (http://www.numpy.org/).
 """
-from __future__ import print_function
-
 import sys

 import numpy as np

@@ -15,8 +15,6 @@
 # limitations under the License.
 #

-from __future__ import print_function
-
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.clustering import LDA, LDAModel

Some files were not shown because too many files have changed in this diff.