From 4ad9bfd53b84a6d2497668c73af6899bae14c187 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Tue, 14 Jul 2020 11:22:44 +0900
Subject: [PATCH] [SPARK-32138] Drop Python 2.7, 3.4 and 3.5

### What changes were proposed in this pull request?

This PR aims to drop Python 2.7, 3.4 and 3.5. Roughly speaking, it removes all the widely known Python 2 compatibility workarounds, such as `sys.version` comparisons and `__future__` imports. It also removes Python 2-dedicated code in Spark, such as `ArrayConstructor`.

### Why are the changes needed?

1. Stop supporting EOL Python versions.
2. Reduce maintenance overhead by removing legacy code and hacks for Python 2.
3. PyPy2 has a critical bug that causes a flaky test (SPARK-28358), based on my testing and investigation.
4. Users can use Python type hints with Pandas UDFs without worrying about the Python version.
5. Users can leverage the latest cloudpickle (https://github.com/apache/spark/pull/28950). With Python 3.8+ it can also leverage the C pickle implementation.

### Does this PR introduce _any_ user-facing change?

Yes, users can no longer use Python 2.7, 3.4 and 3.5 in the upcoming Spark version.

### How was this patch tested?

Manually tested and also tested in Jenkins.

Closes #28957 from HyukjinKwon/SPARK-32138.

Authored-by: HyukjinKwon
Signed-off-by: HyukjinKwon
---
 .github/workflows/master.yml | 3 +- .../apache/spark/api/python/SerDeUtil.scala | 66 ----- dev/create-release/releaseutils.py | 7 +- dev/github_jira_sync.py | 11 +- dev/lint-python | 10 +- dev/merge_spark_pr.py | 12 +- dev/run-tests-jenkins.py | 12 +- dev/sparktestsupport/toposort.py | 3 +- docs/configuration.md | 2 +- docs/index.md | 3 +- docs/rdd-programming-guide.md | 12 +- examples/src/main/python/als.py | 2 - examples/src/main/python/avro_inputformat.py | 2 - examples/src/main/python/kmeans.py | 2 - .../src/main/python/logistic_regression.py | 2 - .../main/python/ml/aft_survival_regression.py | 2 - examples/src/main/python/ml/als_example.py | 8 +- .../main/python/ml/anova_selector_example.py | 2 - .../src/main/python/ml/anova_test_example.py | 2 - .../src/main/python/ml/binarizer_example.py | 2 - .../python/ml/bisecting_k_means_example.py | 2 - .../bucketed_random_projection_lsh_example.py | 2 - .../src/main/python/ml/bucketizer_example.py | 2 - .../main/python/ml/chi_square_test_example.py | 2 - .../main/python/ml/chisq_selector_example.py | 2 - .../src/main/python/ml/correlation_example.py | 2 - .../python/ml/count_vectorizer_example.py | 2 - .../src/main/python/ml/cross_validator.py | 2 - .../src/main/python/ml/dataframe_example.py | 2 - examples/src/main/python/ml/dct_example.py | 2 - .../decision_tree_classification_example.py | 2 - .../ml/decision_tree_regression_example.py | 2 - .../python/ml/elementwise_product_example.py | 2 - .../ml/estimator_transformer_param_example.py | 2 - .../main/python/ml/feature_hasher_example.py | 2 - .../main/python/ml/fm_classifier_example.py | 2 - .../main/python/ml/fm_regressor_example.py | 2 - .../main/python/ml/fvalue_selector_example.py | 2 - .../src/main/python/ml/fvalue_test_example.py | 2 - .../python/ml/gaussian_mixture_example.py | 2 - .../generalized_linear_regression_example.py | 2 - ...radient_boosted_tree_classifier_example.py | 2 - ...gradient_boosted_tree_regressor_example.py | 2 - .../main/python/ml/index_to_string_example.py | 2 - .../src/main/python/ml/interaction_example.py | 2 - .../python/ml/isotonic_regression_example.py | 2 - examples/src/main/python/ml/kmeans_example.py | 2 - examples/src/main/python/ml/lda_example.py | 2 -
.../ml/linear_regression_with_elastic_net.py | 2 - examples/src/main/python/ml/linearsvc.py | 2 - .../ml/logistic_regression_summary_example.py | 2 - .../logistic_regression_with_elastic_net.py | 2 - .../main/python/ml/max_abs_scaler_example.py | 2 - .../main/python/ml/min_hash_lsh_example.py | 2 - .../main/python/ml/min_max_scaler_example.py | 2 - ...ss_logistic_regression_with_elastic_net.py | 2 - .../multilayer_perceptron_classification.py | 2 - examples/src/main/python/ml/n_gram_example.py | 2 - .../src/main/python/ml/naive_bayes_example.py | 2 - .../src/main/python/ml/normalizer_example.py | 2 - .../src/main/python/ml/one_vs_rest_example.py | 2 - .../main/python/ml/onehot_encoder_example.py | 2 - examples/src/main/python/ml/pca_example.py | 2 - .../python/ml/polynomial_expansion_example.py | 2 - .../python/ml/quantile_discretizer_example.py | 2 - .../ml/random_forest_classifier_example.py | 2 - .../ml/random_forest_regressor_example.py | 2 - .../src/main/python/ml/rformula_example.py | 2 - .../main/python/ml/robust_scaler_example.py | 2 - .../src/main/python/ml/sql_transformer.py | 2 - .../main/python/ml/standard_scaler_example.py | 2 - .../python/ml/stopwords_remover_example.py | 2 - .../main/python/ml/string_indexer_example.py | 2 - .../src/main/python/ml/summarizer_example.py | 2 - examples/src/main/python/ml/tf_idf_example.py | 2 - .../src/main/python/ml/tokenizer_example.py | 2 - .../ml/variance_threshold_selector_example.py | 2 - .../python/ml/vector_assembler_example.py | 2 - .../main/python/ml/vector_indexer_example.py | 2 - .../python/ml/vector_size_hint_example.py | 2 - .../main/python/ml/vector_slicer_example.py | 2 - .../src/main/python/ml/word2vec_example.py | 2 - .../binary_classification_metrics_example.py | 1 - .../python/mllib/bisecting_k_means_example.py | 2 - .../src/main/python/mllib/correlations.py | 2 - .../main/python/mllib/correlations_example.py | 2 - .../decision_tree_classification_example.py | 2 - .../mllib/decision_tree_regression_example.py | 2 - .../mllib/elementwise_product_example.py | 2 - .../python/mllib/gaussian_mixture_example.py | 2 - .../python/mllib/gaussian_mixture_model.py | 7 +- ...radient_boosting_classification_example.py | 2 - .../gradient_boosting_regression_example.py | 2 - .../mllib/hypothesis_testing_example.py | 2 - ...testing_kolmogorov_smirnov_test_example.py | 2 - .../mllib/isotonic_regression_example.py | 2 - .../src/main/python/mllib/k_means_example.py | 2 - .../kernel_density_estimation_example.py | 2 - examples/src/main/python/mllib/kmeans.py | 2 - .../latent_dirichlet_allocation_example.py | 2 - .../linear_regression_with_sgd_example.py | 2 - .../main/python/mllib/logistic_regression.py | 2 - .../logistic_regression_with_lbfgs_example.py | 2 - .../main/python/mllib/naive_bayes_example.py | 2 - .../main/python/mllib/normalizer_example.py | 2 - .../power_iteration_clustering_example.py | 2 - .../random_forest_classification_example.py | 2 - .../mllib/random_forest_regression_example.py | 2 - .../python/mllib/random_rdd_generation.py | 2 - .../python/mllib/recommendation_example.py | 2 - .../src/main/python/mllib/sampled_rdds.py | 2 - .../python/mllib/standard_scaler_example.py | 2 - .../mllib/stratified_sampling_example.py | 2 - .../python/mllib/streaming_k_means_example.py | 2 - .../streaming_linear_regression_example.py | 2 - .../mllib/summary_statistics_example.py | 2 - .../src/main/python/mllib/tf_idf_example.py | 2 - examples/src/main/python/mllib/word2vec.py | 2 - .../src/main/python/mllib/word2vec_example.py | 2 - 
examples/src/main/python/pagerank.py | 2 - .../src/main/python/parquet_inputformat.py | 2 - examples/src/main/python/pi.py | 2 - examples/src/main/python/sort.py | 2 - examples/src/main/python/sql/arrow.py | 9 - examples/src/main/python/sql/basic.py | 2 - examples/src/main/python/sql/datasource.py | 2 - examples/src/main/python/sql/hive.py | 2 - .../streaming/structured_kafka_wordcount.py | 2 - .../streaming/structured_network_wordcount.py | 2 - .../structured_network_wordcount_windowed.py | 2 - examples/src/main/python/status_api_demo.py | 7 +- .../main/python/streaming/hdfs_wordcount.py | 2 - .../python/streaming/network_wordcount.py | 2 - .../streaming/network_wordjoinsentiments.py | 2 - .../recoverable_network_wordcount.py | 2 - .../python/streaming/sql_network_wordcount.py | 2 - .../streaming/stateful_network_wordcount.py | 2 - .../src/main/python/transitive_closure.py | 2 - examples/src/main/python/wordcount.py | 2 - .../streaming/kinesis_wordcount_asl.py | 2 - python/pyspark/accumulators.py | 5 +- python/pyspark/broadcast.py | 10 +- python/pyspark/conf.py | 25 +- python/pyspark/context.py | 44 +-- python/pyspark/find_spark_home.py | 31 +- python/pyspark/java_gateway.py | 7 +- python/pyspark/ml/classification.py | 8 +- python/pyspark/ml/common.py | 7 +- python/pyspark/ml/feature.py | 39 +-- python/pyspark/ml/fpm.py | 6 +- python/pyspark/ml/image.py | 3 +- python/pyspark/ml/linalg/__init__.py | 33 +-- python/pyspark/ml/param/__init__.py | 25 +- .../ml/param/_shared_params_code_gen.py | 2 - python/pyspark/ml/pipeline.py | 6 +- python/pyspark/ml/tests/test_feature.py | 5 +- python/pyspark/ml/tests/test_param.py | 15 +- .../pyspark/ml/tests/test_training_summary.py | 5 +- python/pyspark/ml/tree.py | 6 +- python/pyspark/ml/tuning.py | 3 +- python/pyspark/ml/util.py | 20 +- python/pyspark/ml/wrapper.py | 12 +- python/pyspark/mllib/__init__.py | 2 - python/pyspark/mllib/clustering.py | 22 +- python/pyspark/mllib/common.py | 7 +- python/pyspark/mllib/feature.py | 19 +- python/pyspark/mllib/fpm.py | 7 +- python/pyspark/mllib/linalg/__init__.py | 32 +-- python/pyspark/mllib/linalg/distributed.py | 35 ++- python/pyspark/mllib/stat/KernelDensity.py | 5 - python/pyspark/mllib/stat/_statistics.py | 14 +- python/pyspark/mllib/tests/test_linalg.py | 5 +- python/pyspark/mllib/tree.py | 2 - python/pyspark/mllib/util.py | 14 +- python/pyspark/rdd.py | 39 +-- python/pyspark/resultiterable.py | 5 +- python/pyspark/serializers.py | 50 +--- python/pyspark/shell.py | 5 +- python/pyspark/sql/__init__.py | 3 - python/pyspark/sql/avro/functions.py | 7 +- python/pyspark/sql/catalog.py | 10 +- python/pyspark/sql/column.py | 80 +++--- python/pyspark/sql/conf.py | 10 +- python/pyspark/sql/context.py | 46 ++- python/pyspark/sql/dataframe.py | 272 ++++++++---------- python/pyspark/sql/functions.py | 212 +++++--------- python/pyspark/sql/group.py | 8 +- python/pyspark/sql/pandas/conversion.py | 24 +- python/pyspark/sql/pandas/functions.py | 43 ++- python/pyspark/sql/pandas/serializers.py | 13 +- python/pyspark/sql/readwriter.py | 39 ++- python/pyspark/sql/session.py | 45 +-- python/pyspark/sql/streaming.py | 28 +- python/pyspark/sql/tests/test_arrow.py | 14 +- python/pyspark/sql/tests/test_column.py | 10 +- python/pyspark/sql/tests/test_context.py | 6 +- python/pyspark/sql/tests/test_functions.py | 4 - .../sql/tests/test_pandas_cogrouped_map.py | 15 +- .../sql/tests/test_pandas_grouped_map.py | 39 ++- python/pyspark/sql/tests/test_pandas_map.py | 3 - .../sql/tests/test_pandas_udf_scalar.py | 27 +- 
.../sql/tests/test_pandas_udf_typehints.py | 254 +++++++--------- python/pyspark/sql/tests/test_types.py | 27 +- python/pyspark/sql/types.py | 101 +------ python/pyspark/sql/udf.py | 14 +- python/pyspark/sql/utils.py | 21 +- python/pyspark/streaming/context.py | 2 - python/pyspark/streaming/dstream.py | 7 +- python/pyspark/taskcontext.py | 4 - python/pyspark/testing/sqlutils.py | 7 +- python/pyspark/tests/test_profiler.py | 6 +- python/pyspark/tests/test_rdd.py | 21 +- python/pyspark/tests/test_readwrite.py | 183 ------------ python/pyspark/tests/test_shuffle.py | 13 +- python/pyspark/tests/test_taskcontext.py | 7 +- python/pyspark/tests/test_util.py | 6 +- python/pyspark/tests/test_worker.py | 27 +- python/pyspark/util.py | 42 --- python/pyspark/worker.py | 32 +-- python/run-tests.py | 9 +- python/setup.py | 11 +- .../integration-tests/tests/pyfiles.py | 3 - .../tests/worker_memory_check.py | 2 - .../src/test/resources/data/scripts/cat.py | 1 - .../resources/data/scripts/dumpdata_script.py | 9 +- 225 files changed, 735 insertions(+), 2033 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index d6458bf44f..5cf00c6ed9 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -133,7 +133,8 @@ jobs: architecture: x64 - name: Install Python 3.6 uses: actions/setup-python@v2 - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + # Yarn has a Python specific test too, for example, YarnClusterSuite. + if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) with: python-version: 3.6 architecture: x64 diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 01e64b6972..5a6fa50796 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging { } } } - // Unpickle array.array generated by Python 2.6 - class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor { - // /* Description of types */ - // static struct arraydescr descriptors[] = { - // {'c', sizeof(char), c_getitem, c_setitem}, - // {'b', sizeof(char), b_getitem, b_setitem}, - // {'B', sizeof(char), BB_getitem, BB_setitem}, - // #ifdef Py_USING_UNICODE - // {'u', sizeof(Py_UNICODE), u_getitem, u_setitem}, - // #endif - // {'h', sizeof(short), h_getitem, h_setitem}, - // {'H', sizeof(short), HH_getitem, HH_setitem}, - // {'i', sizeof(int), i_getitem, i_setitem}, - // {'I', sizeof(int), II_getitem, II_setitem}, - // {'l', sizeof(long), l_getitem, l_setitem}, - // {'L', sizeof(long), LL_getitem, LL_setitem}, - // {'f', sizeof(float), f_getitem, f_setitem}, - // {'d', sizeof(double), d_getitem, d_setitem}, - // {'\0', 0, 0, 0} /* Sentinel */ - // }; - val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { - Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9, - 'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21 - ) - } else { - Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8, - 'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20 - ) - } - override def construct(args: Array[Object]): Object = { - if (args.length == 1) { - construct(args ++ Array("")) - } else if (args.length == 2 && 
args(1).isInstanceOf[String]) { - val typecode = args(0).asInstanceOf[String].charAt(0) - // This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly - val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1) - if (typecode == 'c') { - // It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses - // a string for array data instead of list as Python 2.7, and handles an array of - // typecode 'c' as 1-byte character. - val result = new Array[Char](data.length) - var i = 0 - while (i < data.length) { - result(i) = data(i).toChar - i += 1 - } - result - } else { - construct(typecode, machineCodes(typecode), data) - } - } else if (args.length == 2 && args(0) == "l") { - // On Python 2, an array of typecode 'l' should be handled as long rather than int. - val values = args(1).asInstanceOf[JArrayList[_]] - val result = new Array[Long](values.size) - var i = 0 - while (i < values.size) { - result(i) = values.get(i).asInstanceOf[Number].longValue() - i += 1 - } - result - } else { - super.construct(args) - } - } - } private var initialized = false // This should be called before trying to unpickle array.array from Python @@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging { def initialize(): Unit = { synchronized{ if (!initialized) { - Unpickler.registerConstructor("array", "array", new ArrayConstructor()) Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor()) Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor()) Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor()) diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index a5a26ae8f5..241b7ed539 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -49,8 +49,6 @@ except ImportError: print("Install using 'sudo pip install unidecode'") sys.exit(-1) -if sys.version < '3': - input = raw_input # noqa # Contributors list file name contributors_file_name = "contributors.txt" @@ -152,10 +150,7 @@ def get_commits(tag): if not is_valid_author(author): author = github_username # Guard against special characters - try: # Python 2 - author = unicode(author, "UTF-8") - except NameError: # Python 3 - author = str(author) + author = str(author) author = unidecode.unidecode(author).strip() commit = Commit(_hash, author, title, pr_number) commits.append(commit) diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index b444b74d40..b90afeebc5 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -22,14 +22,9 @@ import json import os import re import sys -if sys.version < '3': - from urllib2 import urlopen - from urllib2 import Request - from urllib2 import HTTPError -else: - from urllib.request import urlopen - from urllib.request import Request - from urllib.error import HTTPError +from urllib.request import urlopen +from urllib.request import Request +from urllib.error import HTTPError try: import jira.client diff --git a/dev/lint-python b/dev/lint-python index d5491f2447..1fddbfa64b 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -168,7 +168,15 @@ function sphinx_test { # Check that the documentation builds acceptably, skip check if sphinx is not installed. if ! hash "$SPHINX_BUILD" 2> /dev/null; then - echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now." + echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now." 
+ echo + return + fi + + # TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines + PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)') + if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then + echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now." echo return fi diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index 967cdace60..b42429d717 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -31,15 +31,9 @@ import re import subprocess import sys import traceback -if sys.version < '3': - input = raw_input # noqa - from urllib2 import urlopen - from urllib2 import Request - from urllib2 import HTTPError -else: - from urllib.request import urlopen - from urllib.request import Request - from urllib.error import HTTPError +from urllib.request import urlopen +from urllib.request import Request +from urllib.error import HTTPError try: import jira.client diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 13be9592d7..4ff5b327e3 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -22,15 +22,9 @@ import sys import json import functools import subprocess -if sys.version < '3': - from urllib2 import urlopen - from urllib2 import Request - from urllib2 import HTTPError, URLError -else: - from urllib.request import urlopen - from urllib.request import Request - from urllib.error import HTTPError, URLError - +from urllib.request import urlopen +from urllib.request import Request +from urllib.error import HTTPError, URLError from sparktestsupport import SPARK_HOME, ERROR_CODES from sparktestsupport.shellutils import run_cmd diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py index 8b2688d200..6785e481b5 100644 --- a/dev/sparktestsupport/toposort.py +++ b/dev/sparktestsupport/toposort.py @@ -24,8 +24,7 @@ # Moved functools import to the top of the file. # Changed assert to a ValueError. # Changed iter[items|keys] to [items|keys], for python 3 -# compatibility. I don't think it matters for python 2 these are -# now lists instead of iterables. +# compatibility. # Copy the input so as to leave it unmodified. # Renamed function from toposort2 to toposort. # Handle empty input. diff --git a/docs/configuration.md b/docs/configuration.md index 42f706b296..abf76105ae 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`: PYSPARK_PYTHON - Python binary executable to use for PySpark in both driver and workers (default is python2.7 if available, otherwise python). + Python binary executable to use for PySpark in both driver and workers (default is python3 if available, otherwise python). Property spark.pyspark.python take precedence if it is set diff --git a/docs/index.md b/docs/index.md index c0771ca170..8fd169e63f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html). Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation. -Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+. +Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+. 
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0. -Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0. For the Scala API, Spark {{site.SPARK_VERSION}} uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version ({{site.SCALA_BINARY_VERSION}}.x). diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md index 70bfefce47..07207f62bb 100644 --- a/docs/rdd-programming-guide.md +++ b/docs/rdd-programming-guide.md @@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;
-Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter, +Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter, so C libraries like NumPy can be used. It also works with PyPy 2.3+. -Note that Python 2 support is deprecated as of Spark 3.0.0. +Python 2, 3.4 and 3.5 supports were removed in Spark 3.1.0. Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as: @@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example: {% highlight bash %} -$ PYSPARK_PYTHON=python3.4 bin/pyspark -$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py +$ PYSPARK_PYTHON=python3.8 bin/pyspark +$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py {% endhighlight %}
@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`. -After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from +After the Jupyter Notebook server is launched, you can create a new notebook from the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of your notebook before you start to try Spark from the Jupyter notebook. @@ -447,7 +447,7 @@ Writables are automatically converted: - + diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index 6d3241876a..511634fd8f 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use. This example requires numpy (http://www.numpy.org/) """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/avro_inputformat.py b/examples/src/main/python/avro_inputformat.py index a18722c687..49ab37e7b3 100644 --- a/examples/src/main/python/avro_inputformat.py +++ b/examples/src/main/python/avro_inputformat.py @@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \ {u'favorite_color': None, u'name': u'Alyssa'} {u'favorite_color': u'red', u'name': u'Ben'} """ -from __future__ import print_function - import sys from functools import reduce diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index a42d711fc5..022378619c 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py. This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/logistic_regression.py b/examples/src/main/python/logistic_regression.py index bcc4e0f4e8..4b83740152 100755 --- a/examples/src/main/python/logistic_regression.py +++ b/examples/src/main/python/logistic_regression.py @@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations. In practice, one may prefer to use the LogisticRegression algorithm in ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py. """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py index 0a71f76418..2040a7876c 100644 --- a/examples/src/main/python/ml/aft_survival_regression.py +++ b/examples/src/main/python/ml/aft_survival_regression.py @@ -20,8 +20,6 @@ An example demonstrating aft survival regression. Run with: bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.regression import AFTSurvivalRegression from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py index 8b7ec9c439..b392639784 100644 --- a/examples/src/main/python/ml/als_example.py +++ b/examples/src/main/python/ml/als_example.py @@ -15,12 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - -import sys -if sys.version >= '3': - long = int - from pyspark.sql import SparkSession # $example on$ @@ -39,7 +33,7 @@ if __name__ == "__main__": lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd parts = lines.map(lambda row: row.value.split("::")) ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), - rating=float(p[2]), timestamp=long(p[3]))) + rating=float(p[2]), timestamp=int(p[3]))) ratings = spark.createDataFrame(ratingsRDD) (training, test) = ratings.randomSplit([0.8, 0.2]) diff --git a/examples/src/main/python/ml/anova_selector_example.py b/examples/src/main/python/ml/anova_selector_example.py index f8458f5d6e..da80fa6231 100644 --- a/examples/src/main/python/ml/anova_selector_example.py +++ b/examples/src/main/python/ml/anova_selector_example.py @@ -20,8 +20,6 @@ An example for ANOVASelector. Run with: bin/spark-submit examples/src/main/python/ml/anova_selector_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import ANOVASelector diff --git a/examples/src/main/python/ml/anova_test_example.py b/examples/src/main/python/ml/anova_test_example.py index 4119441cde..451e078f60 100644 --- a/examples/src/main/python/ml/anova_test_example.py +++ b/examples/src/main/python/ml/anova_test_example.py @@ -20,8 +20,6 @@ An example for ANOVA testing. Run with: bin/spark-submit examples/src/main/python/ml/anova_test_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py index 669bb2aeab..5d5ae4122e 100644 --- a/examples/src/main/python/ml/binarizer_example.py +++ b/examples/src/main/python/ml/binarizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import Binarizer diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 82adb338b5..513f80a09e 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering. Run with: bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import BisectingKMeans from pyspark.ml.evaluation import ClusteringEvaluator diff --git a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py index 610176ea59..f5836091f3 100644 --- a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py +++ b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py @@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH. 
Run with: bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.feature import BucketedRandomProjectionLSH from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py index 742f35093b..5de67f7126 100644 --- a/examples/src/main/python/ml/bucketizer_example.py +++ b/examples/src/main/python/ml/bucketizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import Bucketizer diff --git a/examples/src/main/python/ml/chi_square_test_example.py b/examples/src/main/python/ml/chi_square_test_example.py index 2af7e683cd..bf15a03d9c 100644 --- a/examples/src/main/python/ml/chi_square_test_example.py +++ b/examples/src/main/python/ml/chi_square_test_example.py @@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing. Run with: bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py index 028a9ea9d6..c83a8c1bc7 100644 --- a/examples/src/main/python/ml/chisq_selector_example.py +++ b/examples/src/main/python/ml/chisq_selector_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import ChiSqSelector diff --git a/examples/src/main/python/ml/correlation_example.py b/examples/src/main/python/ml/correlation_example.py index 1f4e402ac1..9006d54149 100644 --- a/examples/src/main/python/ml/correlation_example.py +++ b/examples/src/main/python/ml/correlation_example.py @@ -20,8 +20,6 @@ An example for computing correlation matrix. Run with: bin/spark-submit examples/src/main/python/ml/correlation_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.stat import Correlation diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py index f2e41db77d..b3ddfb128c 100644 --- a/examples/src/main/python/ml/count_vectorizer_example.py +++ b/examples/src/main/python/ml/count_vectorizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import CountVectorizer diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index 6256d11504..0ad0865486 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -22,8 +22,6 @@ Run with: bin/spark-submit examples/src/main/python/ml/cross_validator.py """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index cabc3de68f..d2bf937441 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -19,8 +19,6 @@ An example of how to use DataFrame for ML. Run with:: bin/spark-submit examples/src/main/python/ml/dataframe_example.py """ -from __future__ import print_function - import os import sys import tempfile diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py index c0457f8d0f..37da4f5e8f 100644 --- a/examples/src/main/python/ml/dct_example.py +++ b/examples/src/main/python/ml/dct_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import DCT from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py index d6e2977de0..eb7177b845 100644 --- a/examples/src/main/python/ml/decision_tree_classification_example.py +++ b/examples/src/main/python/ml/decision_tree_classification_example.py @@ -18,8 +18,6 @@ """ Decision Tree Classification Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py index 58d7ad921d..1ed1636a3d 100644 --- a/examples/src/main/python/ml/decision_tree_regression_example.py +++ b/examples/src/main/python/ml/decision_tree_regression_example.py @@ -18,8 +18,6 @@ """ Decision Tree Regression Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import DecisionTreeRegressor diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py index 590053998b..71eec8d432 100644 --- a/examples/src/main/python/ml/elementwise_product_example.py +++ b/examples/src/main/python/ml/elementwise_product_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import ElementwiseProduct from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py index eb21051435..1dcca6c201 100644 --- a/examples/src/main/python/ml/estimator_transformer_param_example.py +++ b/examples/src/main/python/ml/estimator_transformer_param_example.py @@ -18,8 +18,6 @@ """ Estimator Transformer Param Example. 
""" -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.classification import LogisticRegression diff --git a/examples/src/main/python/ml/feature_hasher_example.py b/examples/src/main/python/ml/feature_hasher_example.py index 6cf9ecc396..4fe573d19d 100644 --- a/examples/src/main/python/ml/feature_hasher_example.py +++ b/examples/src/main/python/ml/feature_hasher_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import FeatureHasher diff --git a/examples/src/main/python/ml/fm_classifier_example.py b/examples/src/main/python/ml/fm_classifier_example.py index 6e7c2ccf02..b47bdc5275 100644 --- a/examples/src/main/python/ml/fm_classifier_example.py +++ b/examples/src/main/python/ml/fm_classifier_example.py @@ -18,8 +18,6 @@ """ FMClassifier Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import FMClassifier diff --git a/examples/src/main/python/ml/fm_regressor_example.py b/examples/src/main/python/ml/fm_regressor_example.py index afd7639680..5c8133996a 100644 --- a/examples/src/main/python/ml/fm_regressor_example.py +++ b/examples/src/main/python/ml/fm_regressor_example.py @@ -18,8 +18,6 @@ """ FMRegressor Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import FMRegressor diff --git a/examples/src/main/python/ml/fvalue_selector_example.py b/examples/src/main/python/ml/fvalue_selector_example.py index 3158953a5d..f164af47eb 100644 --- a/examples/src/main/python/ml/fvalue_selector_example.py +++ b/examples/src/main/python/ml/fvalue_selector_example.py @@ -20,8 +20,6 @@ An example for FValueSelector. Run with: bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import FValueSelector diff --git a/examples/src/main/python/ml/fvalue_test_example.py b/examples/src/main/python/ml/fvalue_test_example.py index 410b39e449..dfa8073e5a 100644 --- a/examples/src/main/python/ml/fvalue_test_example.py +++ b/examples/src/main/python/ml/fvalue_test_example.py @@ -20,8 +20,6 @@ An example for FValue testing. Run with: bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py index 4938a90418..1441faa792 100644 --- a/examples/src/main/python/ml/gaussian_mixture_example.py +++ b/examples/src/main/python/ml/gaussian_mixture_example.py @@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM). 
Run with: bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import GaussianMixture # $example off$ diff --git a/examples/src/main/python/ml/generalized_linear_regression_example.py b/examples/src/main/python/ml/generalized_linear_regression_example.py index a52f4650c1..06a8a5a2e9 100644 --- a/examples/src/main/python/ml/generalized_linear_regression_example.py +++ b/examples/src/main/python/ml/generalized_linear_regression_example.py @@ -20,8 +20,6 @@ An example demonstrating generalized linear regression. Run with: bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.regression import GeneralizedLinearRegression diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py index c2042fd7b7..a7efa2170a 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Tree Classifier Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import GBTClassifier diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py index cc96c973e4..5e09b96c1e 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Tree Regressor Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import GBTRegressor diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py index 33d104e8e3..98bdb89ce3 100644 --- a/examples/src/main/python/ml/index_to_string_example.py +++ b/examples/src/main/python/ml/index_to_string_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import IndexToString, StringIndexer # $example off$ diff --git a/examples/src/main/python/ml/interaction_example.py b/examples/src/main/python/ml/interaction_example.py index 4b63227191..ac365179b0 100644 --- a/examples/src/main/python/ml/interaction_example.py +++ b/examples/src/main/python/ml/interaction_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Interaction, VectorAssembler # $example off$ diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py index 89cba9dfc7..d7b893894f 100644 --- a/examples/src/main/python/ml/isotonic_regression_example.py +++ b/examples/src/main/python/ml/isotonic_regression_example.py @@ -21,8 +21,6 @@ Isotonic Regression Example. 
Run with: bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.regression import IsotonicRegression # $example off$ diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 80a878af67..47223fd953 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -22,8 +22,6 @@ Run with: This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import KMeans from pyspark.ml.evaluation import ClusteringEvaluator diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py index 97d1a042d1..a47dfa383c 100644 --- a/examples/src/main/python/ml/lda_example.py +++ b/examples/src/main/python/ml/lda_example.py @@ -20,8 +20,6 @@ An example demonstrating LDA. Run with: bin/spark-submit examples/src/main/python/ml/lda_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import LDA # $example off$ diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py index 6639e9160a..864fc76cff 100644 --- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.regression import LinearRegression # $example off$ diff --git a/examples/src/main/python/ml/linearsvc.py b/examples/src/main/python/ml/linearsvc.py index 9b79abbf96..61d726cf3f 100644 --- a/examples/src/main/python/ml/linearsvc.py +++ b/examples/src/main/python/ml/linearsvc.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LinearSVC # $example off$ diff --git a/examples/src/main/python/ml/logistic_regression_summary_example.py b/examples/src/main/python/ml/logistic_regression_summary_example.py index 2274ff707b..6d045108da 100644 --- a/examples/src/main/python/ml/logistic_regression_summary_example.py +++ b/examples/src/main/python/ml/logistic_regression_summary_example.py @@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary. Run with: bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ diff --git a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py index d095fbd373..916fdade27 100644 --- a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py index 45eda3cdad..d7ff3561ce 100644 --- a/examples/src/main/python/ml/max_abs_scaler_example.py +++ b/examples/src/main/python/ml/max_abs_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - # $example on$ from pyspark.ml.feature import MaxAbsScaler from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/min_hash_lsh_example.py b/examples/src/main/python/ml/min_hash_lsh_example.py index 93136e6ae3..683f97a055 100644 --- a/examples/src/main/python/ml/min_hash_lsh_example.py +++ b/examples/src/main/python/ml/min_hash_lsh_example.py @@ -20,8 +20,6 @@ An example demonstrating MinHashLSH. Run with: bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.feature import MinHashLSH from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py index b5f272e59b..cd74243699 100644 --- a/examples/src/main/python/ml/min_max_scaler_example.py +++ b/examples/src/main/python/ml/min_max_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import MinMaxScaler from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py index bec9860c79..3bb4a72864 100644 --- a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py index 88fc69f753..74f5321935 100644 --- a/examples/src/main/python/ml/multilayer_perceptron_classification.py +++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import MultilayerPerceptronClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py index 31676e076a..8c8031b939 100644 --- a/examples/src/main/python/ml/n_gram_example.py +++ b/examples/src/main/python/ml/n_gram_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import NGram # $example off$ diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index 7290ab81cd..8d1777c6f9 100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import NaiveBayes from pyspark.ml.evaluation import MulticlassClassificationEvaluator diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py index 510bd825fd..2aa012961a 100644 --- a/examples/src/main/python/ml/normalizer_example.py +++ b/examples/src/main/python/ml/normalizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Normalizer from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 956e94ae4a..4cae1a9980 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -21,8 +21,6 @@ using Logistic Regression as the base classifier. Run with: bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression, OneVsRest from pyspark.ml.evaluation import MulticlassClassificationEvaluator diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index 73775b79e3..6deb84ed78 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import OneHotEncoder # $example off$ diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index 38746aced0..03fb709c8e 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import PCA from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py index 40bcb7b13a..75f436e768 100644 --- a/examples/src/main/python/ml/polynomial_expansion_example.py +++ b/examples/src/main/python/ml/polynomial_expansion_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import PolynomialExpansion from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py index 0fc1d1949a..82be3936d2 100644 --- a/examples/src/main/python/ml/quantile_discretizer_example.py +++ b/examples/src/main/python/ml/quantile_discretizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import QuantileDiscretizer # $example off$ diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py index 4eaa94dd7f..8983d1f2e9 100644 --- a/examples/src/main/python/ml/random_forest_classifier_example.py +++ b/examples/src/main/python/ml/random_forest_classifier_example.py @@ -18,8 +18,6 @@ """ Random Forest Classifier Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py index a34edff2ec..b9306ddf2f 100644 --- a/examples/src/main/python/ml/random_forest_regressor_example.py +++ b/examples/src/main/python/ml/random_forest_regressor_example.py @@ -18,8 +18,6 @@ """ Random Forest Regressor Example. 
""" -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import RandomForestRegressor diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py index 6629239db2..25bb6dac56 100644 --- a/examples/src/main/python/ml/rformula_example.py +++ b/examples/src/main/python/ml/rformula_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import RFormula # $example off$ diff --git a/examples/src/main/python/ml/robust_scaler_example.py b/examples/src/main/python/ml/robust_scaler_example.py index 435e9ccb80..9f7c6d6507 100644 --- a/examples/src/main/python/ml/robust_scaler_example.py +++ b/examples/src/main/python/ml/robust_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import RobustScaler # $example off$ diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py index 0bf8f35720..c8ac5c46aa 100644 --- a/examples/src/main/python/ml/sql_transformer.py +++ b/examples/src/main/python/ml/sql_transformer.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import SQLTransformer # $example off$ diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py index c0027480e6..9021c10075 100644 --- a/examples/src/main/python/ml/standard_scaler_example.py +++ b/examples/src/main/python/ml/standard_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import StandardScaler # $example off$ diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py index 3b8e7855e3..832a7c7d0a 100644 --- a/examples/src/main/python/ml/stopwords_remover_example.py +++ b/examples/src/main/python/ml/stopwords_remover_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import StopWordsRemover # $example off$ diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py index 2255bfb9c1..f2ac63eabd 100644 --- a/examples/src/main/python/ml/string_indexer_example.py +++ b/examples/src/main/python/ml/string_indexer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import StringIndexer # $example off$ diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py index 8835f189a1..4982746450 100644 --- a/examples/src/main/python/ml/summarizer_example.py +++ b/examples/src/main/python/ml/summarizer_example.py @@ -20,8 +20,6 @@ An example for summarizer. 
Run with: bin/spark-submit examples/src/main/python/ml/summarizer_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.stat import Summarizer diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py index d43244fa68..b4bb0dfa31 100644 --- a/examples/src/main/python/ml/tf_idf_example.py +++ b/examples/src/main/python/ml/tf_idf_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import HashingTF, IDF, Tokenizer # $example off$ diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py index 5c65c5c9f8..c6b5fac227 100644 --- a/examples/src/main/python/ml/tokenizer_example.py +++ b/examples/src/main/python/ml/tokenizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Tokenizer, RegexTokenizer from pyspark.sql.functions import col, udf diff --git a/examples/src/main/python/ml/variance_threshold_selector_example.py b/examples/src/main/python/ml/variance_threshold_selector_example.py index b7edb86653..0a996e0e28 100644 --- a/examples/src/main/python/ml/variance_threshold_selector_example.py +++ b/examples/src/main/python/ml/variance_threshold_selector_example.py @@ -20,8 +20,6 @@ An example for VarianceThresholdSelector. Run with: bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import VarianceThresholdSelector diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py index 98de1d5ea7..0ce31cf0ea 100644 --- a/examples/src/main/python/ml/vector_assembler_example.py +++ b/examples/src/main/python/ml/vector_assembler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.feature import VectorAssembler diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 5c2956077d..51a4191606 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import VectorIndexer # $example off$ diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py index fb77dacec6..355d85aee8 100644 --- a/examples/src/main/python/ml/vector_size_hint_example.py +++ b/examples/src/main/python/ml/vector_size_hint_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.feature import (VectorSizeHint, VectorAssembler) diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py index 68c8cfe27e..86e089d152 100644 --- a/examples/src/main/python/ml/vector_slicer_example.py +++ b/examples/src/main/python/ml/vector_slicer_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - # $example on$ from pyspark.ml.feature import VectorSlicer from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 77f8951df0..0eabeda3dc 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Word2Vec # $example off$ diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py index d14ce7982e..741746e6e3 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -17,7 +17,6 @@ """ Binary Classification Metrics Example. """ -from __future__ import print_function from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index 36e36fc689..d7b6ad9d42 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from numpy import array # $example off$ diff --git a/examples/src/main/python/mllib/correlations.py b/examples/src/main/python/mllib/correlations.py index 089504fa70..27d07b22a5 100755 --- a/examples/src/main/python/mllib/correlations.py +++ b/examples/src/main/python/mllib/correlations.py @@ -18,8 +18,6 @@ """ Correlations using MLlib. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index 66d18f6e5d..bb71b96868 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import numpy as np from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/decision_tree_classification_example.py b/examples/src/main/python/mllib/decision_tree_classification_example.py index 7eecf50058..009e393226 100644 --- a/examples/src/main/python/mllib/decision_tree_classification_example.py +++ b/examples/src/main/python/mllib/decision_tree_classification_example.py @@ -18,8 +18,6 @@ """ Decision Tree Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import DecisionTree, DecisionTreeModel diff --git a/examples/src/main/python/mllib/decision_tree_regression_example.py b/examples/src/main/python/mllib/decision_tree_regression_example.py index acf9e25fdf..71dfbf0790 100644 --- a/examples/src/main/python/mllib/decision_tree_regression_example.py +++ b/examples/src/main/python/mllib/decision_tree_regression_example.py @@ -18,8 +18,6 @@ """ Decision Tree Regression Example. 
""" -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import DecisionTree, DecisionTreeModel diff --git a/examples/src/main/python/mllib/elementwise_product_example.py b/examples/src/main/python/mllib/elementwise_product_example.py index 8ae9afb1dc..15e6a43f73 100644 --- a/examples/src/main/python/mllib/elementwise_product_example.py +++ b/examples/src/main/python/mllib/elementwise_product_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import ElementwiseProduct diff --git a/examples/src/main/python/mllib/gaussian_mixture_example.py b/examples/src/main/python/mllib/gaussian_mixture_example.py index a60e799d62..3b19478f45 100644 --- a/examples/src/main/python/mllib/gaussian_mixture_example.py +++ b/examples/src/main/python/mllib/gaussian_mixture_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from numpy import array # $example off$ diff --git a/examples/src/main/python/mllib/gaussian_mixture_model.py b/examples/src/main/python/mllib/gaussian_mixture_model.py index 6b46e27dda..96ce6b6f6a 100644 --- a/examples/src/main/python/mllib/gaussian_mixture_model.py +++ b/examples/src/main/python/mllib/gaussian_mixture_model.py @@ -18,11 +18,6 @@ """ A Gaussian Mixture Model clustering program using MLlib. """ -from __future__ import print_function - -import sys -if sys.version >= '3': - long = int import random import argparse @@ -53,7 +48,7 @@ if __name__ == "__main__": parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold') parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations') parser.add_argument('--seed', default=random.getrandbits(19), - type=long, help='Random seed') + type=int, help='Random seed') args = parser.parse_args() conf = SparkConf().setAppName("GMM") diff --git a/examples/src/main/python/mllib/gradient_boosting_classification_example.py b/examples/src/main/python/mllib/gradient_boosting_classification_example.py index 65a03572be..eb12f20619 100644 --- a/examples/src/main/python/mllib/gradient_boosting_classification_example.py +++ b/examples/src/main/python/mllib/gradient_boosting_classification_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Trees Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel diff --git a/examples/src/main/python/mllib/gradient_boosting_regression_example.py b/examples/src/main/python/mllib/gradient_boosting_regression_example.py index 877f8ab461..eb59a992df 100644 --- a/examples/src/main/python/mllib/gradient_boosting_regression_example.py +++ b/examples/src/main/python/mllib/gradient_boosting_regression_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Trees Regression Example. 
""" -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 21a5584fd6..321be8b76f 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.linalg import Matrices, Vectors diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index ef380dee79..12a186900e 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import Statistics diff --git a/examples/src/main/python/mllib/isotonic_regression_example.py b/examples/src/main/python/mllib/isotonic_regression_example.py index f5322d79c4..a5a0cfeae9 100644 --- a/examples/src/main/python/mllib/isotonic_regression_example.py +++ b/examples/src/main/python/mllib/isotonic_regression_example.py @@ -18,8 +18,6 @@ """ Isotonic Regression Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ import math diff --git a/examples/src/main/python/mllib/k_means_example.py b/examples/src/main/python/mllib/k_means_example.py index d6058f4502..ead1e56de5 100644 --- a/examples/src/main/python/mllib/k_means_example.py +++ b/examples/src/main/python/mllib/k_means_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from numpy import array from math import sqrt diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index 3e8f7241a4..22d1917160 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import KernelDensity diff --git a/examples/src/main/python/mllib/kmeans.py b/examples/src/main/python/mllib/kmeans.py index 1bdb3e9b4a..2560384b6a 100755 --- a/examples/src/main/python/mllib/kmeans.py +++ b/examples/src/main/python/mllib/kmeans.py @@ -20,8 +20,6 @@ A K-means clustering program using MLlib. This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py b/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py index 2a1bef5f20..f82a28aadc 100644 --- a/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py +++ b/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.clustering import LDA, LDAModel diff --git a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py index 6744463d40..cb67396332 100644 --- a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py +++ b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py @@ -18,8 +18,6 @@ """ Linear Regression With SGD Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py index 87efe17375..7b90615a53 100755 --- a/examples/src/main/python/mllib/logistic_regression.py +++ b/examples/src/main/python/mllib/logistic_regression.py @@ -20,8 +20,6 @@ Logistic regression using MLlib. This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py index c9b768b314..ac5ab1d1b5 100644 --- a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py +++ b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py @@ -18,8 +18,6 @@ """ Logistic Regression With LBFGS Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py index a29fcccac5..74d18233d5 100644 --- a/examples/src/main/python/mllib/naive_bayes_example.py +++ b/examples/src/main/python/mllib/naive_bayes_example.py @@ -22,8 +22,6 @@ Usage: `spark-submit --master local[4] examples/src/main/python/mllib/naive_bayes_example.py` """ -from __future__ import print_function - import shutil from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/normalizer_example.py b/examples/src/main/python/mllib/normalizer_example.py index a4e028ca9a..d46110d9a0 100644 --- a/examples/src/main/python/mllib/normalizer_example.py +++ b/examples/src/main/python/mllib/normalizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import Normalizer diff --git a/examples/src/main/python/mllib/power_iteration_clustering_example.py b/examples/src/main/python/mllib/power_iteration_clustering_example.py index ca19c0ccb6..60eedef5fa 100644 --- a/examples/src/main/python/mllib/power_iteration_clustering_example.py +++ b/examples/src/main/python/mllib/power_iteration_clustering_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel diff --git a/examples/src/main/python/mllib/random_forest_classification_example.py b/examples/src/main/python/mllib/random_forest_classification_example.py index 5ac67520da..a929c10d5a 100644 --- a/examples/src/main/python/mllib/random_forest_classification_example.py +++ b/examples/src/main/python/mllib/random_forest_classification_example.py @@ -18,8 +18,6 @@ """ Random Forest Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import RandomForest, RandomForestModel diff --git a/examples/src/main/python/mllib/random_forest_regression_example.py b/examples/src/main/python/mllib/random_forest_regression_example.py index 7e986a0d30..4e05937768 100644 --- a/examples/src/main/python/mllib/random_forest_regression_example.py +++ b/examples/src/main/python/mllib/random_forest_regression_example.py @@ -18,8 +18,6 @@ """ Random Forest Regression Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import RandomForest, RandomForestModel diff --git a/examples/src/main/python/mllib/random_rdd_generation.py b/examples/src/main/python/mllib/random_rdd_generation.py index 9a429b5f8a..49afcfe939 100755 --- a/examples/src/main/python/mllib/random_rdd_generation.py +++ b/examples/src/main/python/mllib/random_rdd_generation.py @@ -18,8 +18,6 @@ """ Randomly generated RDDs. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/recommendation_example.py b/examples/src/main/python/mllib/recommendation_example.py index 00e683c3ae..719f3f904b 100644 --- a/examples/src/main/python/mllib/recommendation_example.py +++ b/examples/src/main/python/mllib/recommendation_example.py @@ -18,8 +18,6 @@ """ Collaborative Filtering Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ diff --git a/examples/src/main/python/mllib/sampled_rdds.py b/examples/src/main/python/mllib/sampled_rdds.py index 00e7cf4bbc..9095c2b2d7 100755 --- a/examples/src/main/python/mllib/sampled_rdds.py +++ b/examples/src/main/python/mllib/sampled_rdds.py @@ -18,8 +18,6 @@ """ Randomly sampled RDDs. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py index 11ed34427d..c8fd64dfbb 100644 --- a/examples/src/main/python/mllib/standard_scaler_example.py +++ b/examples/src/main/python/mllib/standard_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import StandardScaler diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index a13f8f08dd..2d29f74a19 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext if __name__ == "__main__": diff --git a/examples/src/main/python/mllib/streaming_k_means_example.py b/examples/src/main/python/mllib/streaming_k_means_example.py index e82509ad3f..4904a9ebcf 100644 --- a/examples/src/main/python/mllib/streaming_k_means_example.py +++ b/examples/src/main/python/mllib/streaming_k_means_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext from pyspark.streaming import StreamingContext # $example on$ diff --git a/examples/src/main/python/mllib/streaming_linear_regression_example.py b/examples/src/main/python/mllib/streaming_linear_regression_example.py index 714c9a0de7..1d52e00fbf 100644 --- a/examples/src/main/python/mllib/streaming_linear_regression_example.py +++ b/examples/src/main/python/mllib/streaming_linear_regression_example.py @@ -18,8 +18,6 @@ """ Streaming Linear Regression Example. """ -from __future__ import print_function - # $example on$ import sys # $example off$ diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index d55d1a2c2d..d86e841145 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ import numpy as np diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py index b66412b233..4449066f5b 100644 --- a/examples/src/main/python/mllib/tf_idf_example.py +++ b/examples/src/main/python/mllib/tf_idf_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import HashingTF, IDF diff --git a/examples/src/main/python/mllib/word2vec.py b/examples/src/main/python/mllib/word2vec.py index 4e7d4f7610..3e5720b4df 100644 --- a/examples/src/main/python/mllib/word2vec.py +++ b/examples/src/main/python/mllib/word2vec.py @@ -23,8 +23,6 @@ # grep -o -E '\w+(\W+\w+){0,15}' text8 > text8_lines # This was done so that the example can be run in local mode -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/word2vec_example.py b/examples/src/main/python/mllib/word2vec_example.py index ad1090c77e..d37a6e7137 100644 --- a/examples/src/main/python/mllib/word2vec_example.py +++ b/examples/src/main/python/mllib/word2vec_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import Word2Vec diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py index 2c19e8700a..0ab7249a82 100755 --- a/examples/src/main/python/pagerank.py +++ b/examples/src/main/python/pagerank.py @@ -22,8 +22,6 @@ Please refer to PageRank implementation provided by graphx Example Usage: bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10 """ -from __future__ import print_function - import re import sys from operator import add diff --git a/examples/src/main/python/parquet_inputformat.py b/examples/src/main/python/parquet_inputformat.py index 83041f0040..ca8dd25e6d 100644 --- a/examples/src/main/python/parquet_inputformat.py +++ b/examples/src/main/python/parquet_inputformat.py @@ -29,8 +29,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \\ {u'favorite_color': u'red', u'name': u'Ben', u'favorite_numbers': []} <...more log output...> """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py index 5839cc2874..e646722533 100755 --- a/examples/src/main/python/pi.py +++ b/examples/src/main/python/pi.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import sys from random import random from operator import add diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py index d3cd985d19..9efb00a6f1 100755 --- a/examples/src/main/python/sort.py +++ b/examples/src/main/python/sort.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index b7d8467172..e46449dbef 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -21,21 +21,12 @@ Run with: ./bin/spark-submit examples/src/main/python/sql/arrow.py """ -from __future__ import print_function - -import sys - from pyspark.sql import SparkSession from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version require_minimum_pandas_version() require_minimum_pyarrow_version() -if sys.version_info < (3, 6): - raise Exception( - "Running this example file requires Python 3.6+; however, " - "your Python version was:\n %s" % sys.version) - def dataframe_with_arrow_example(spark): # $example on:dataframe_with_arrow$ diff --git a/examples/src/main/python/sql/basic.py b/examples/src/main/python/sql/basic.py index c8fb25d053..eba8e6ad99 100644 --- a/examples/src/main/python/sql/basic.py +++ b/examples/src/main/python/sql/basic.py @@ -20,8 +20,6 @@ A simple example demonstrating basic Spark SQL features. Run with: ./bin/spark-submit examples/src/main/python/sql/basic.py """ -from __future__ import print_function - # $example on:init_session$ from pyspark.sql import SparkSession # $example off:init_session$ diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py index 265f135e1e..94a41a7e5e 100644 --- a/examples/src/main/python/sql/datasource.py +++ b/examples/src/main/python/sql/datasource.py @@ -20,8 +20,6 @@ A simple example demonstrating Spark SQL data sources. 
Run with: ./bin/spark-submit examples/src/main/python/sql/datasource.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on:schema_merging$ from pyspark.sql import Row diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py index e96a8af71a..bc23dcd9bd 100644 --- a/examples/src/main/python/sql/hive.py +++ b/examples/src/main/python/sql/hive.py @@ -20,8 +20,6 @@ A simple example demonstrating Spark SQL Hive integration. Run with: ./bin/spark-submit examples/src/main/python/sql/hive.py """ -from __future__ import print_function - # $example on:spark_hive$ from os.path import join, abspath diff --git a/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py b/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py index 9210678913..40a955a46c 100644 --- a/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py +++ b/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py @@ -36,8 +36,6 @@ `$ bin/spark-submit examples/src/main/python/sql/streaming/structured_kafka_wordcount.py \ host1:port1,host2:port2 subscribe topic1,topic2` """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount.py b/examples/src/main/python/sql/streaming/structured_network_wordcount.py index 9ac3921647..c8f43c9dcf 100644 --- a/examples/src/main/python/sql/streaming/structured_network_wordcount.py +++ b/examples/src/main/python/sql/streaming/structured_network_wordcount.py @@ -27,8 +27,6 @@ r""" `$ bin/spark-submit examples/src/main/python/sql/streaming/structured_network_wordcount.py localhost 9999` """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py index c4e3bbf44c..cc39d8afa6 100644 --- a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py +++ b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py @@ -39,8 +39,6 @@ r""" One recommended <window duration>, <slide duration> pair is 10, 5 """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/status_api_demo.py b/examples/src/main/python/status_api_demo.py index 8cc8cc820c..7b408c8726 100644 --- a/examples/src/main/python/status_api_demo.py +++ b/examples/src/main/python/status_api_demo.py @@ -15,15 +15,10 @@ # limitations under the License. # -from __future__ import print_function - import time import threading import sys -if sys.version >= '3': - import queue as Queue -else: - import Queue +import queue as Queue from pyspark import SparkConf, SparkContext diff --git a/examples/src/main/python/streaming/hdfs_wordcount.py b/examples/src/main/python/streaming/hdfs_wordcount.py index f9a5c43a8e..fac07727b7 100644 --- a/examples/src/main/python/streaming/hdfs_wordcount.py +++ b/examples/src/main/python/streaming/hdfs_wordcount.py @@ -25,8 +25,6 @@ Then create a text file in `localdir` and the words in the file will get counted.
""" -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/network_wordcount.py b/examples/src/main/python/streaming/network_wordcount.py index f3099d2517..b57f4e9e38 100644 --- a/examples/src/main/python/streaming/network_wordcount.py +++ b/examples/src/main/python/streaming/network_wordcount.py @@ -25,8 +25,6 @@ r""" and then run the example `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/network_wordjoinsentiments.py b/examples/src/main/python/streaming/network_wordjoinsentiments.py index 2b5434c0c8..5b03546fb4 100644 --- a/examples/src/main/python/streaming/network_wordjoinsentiments.py +++ b/examples/src/main/python/streaming/network_wordjoinsentiments.py @@ -30,8 +30,6 @@ r""" localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/recoverable_network_wordcount.py b/examples/src/main/python/streaming/recoverable_network_wordcount.py index a39c4d0b5b..8424556e88 100644 --- a/examples/src/main/python/streaming/recoverable_network_wordcount.py +++ b/examples/src/main/python/streaming/recoverable_network_wordcount.py @@ -35,8 +35,6 @@ checkpoint data exists in ~/checkpoint/, then it will create StreamingContext from the checkpoint data. """ -from __future__ import print_function - import os import sys diff --git a/examples/src/main/python/streaming/sql_network_wordcount.py b/examples/src/main/python/streaming/sql_network_wordcount.py index ab3cfc0679..59a8a11a45 100644 --- a/examples/src/main/python/streaming/sql_network_wordcount.py +++ b/examples/src/main/python/streaming/sql_network_wordcount.py @@ -27,8 +27,6 @@ r""" and then run the example `$ bin/spark-submit examples/src/main/python/streaming/sql_network_wordcount.py localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/stateful_network_wordcount.py b/examples/src/main/python/streaming/stateful_network_wordcount.py index d5d1eba6c5..7a45be663a 100644 --- a/examples/src/main/python/streaming/stateful_network_wordcount.py +++ b/examples/src/main/python/streaming/stateful_network_wordcount.py @@ -29,8 +29,6 @@ r""" `$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \ localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/transitive_closure.py b/examples/src/main/python/transitive_closure.py index 49551d4085..9f543daecd 100755 --- a/examples/src/main/python/transitive_closure.py +++ b/examples/src/main/python/transitive_closure.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import sys from random import Random diff --git a/examples/src/main/python/wordcount.py b/examples/src/main/python/wordcount.py index a05e24ff3f..037c1e8aa3 100755 --- a/examples/src/main/python/wordcount.py +++ b/examples/src/main/python/wordcount.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - import sys from operator import add diff --git a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py index 5370b79389..df8c64e531 100644 --- a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py +++ b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py @@ -55,8 +55,6 @@ See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on the Kinesis Spark Streaming integration. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index a5d513262b..2a19d233bc 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -89,10 +89,7 @@ TypeError:... import sys import select import struct -if sys.version < '3': - import SocketServer -else: - import socketserver as SocketServer +import socketserver as SocketServer import threading from pyspark.serializers import read_int, PickleSerializer diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index 803d857055..c2daf7600f 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -20,16 +20,12 @@ import os import sys from tempfile import NamedTemporaryFile import threading +import pickle from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import ChunkedStream, pickle_protocol -from pyspark.util import _exception_message, print_exec +from pyspark.util import print_exec -if sys.version < '3': - import cPickle as pickle -else: - import pickle - unicode = str __all__ = ['Broadcast'] @@ -113,7 +109,7 @@ class Broadcast(object): raise except Exception as e: msg = "Could not serialize broadcast: %s: %s" \ - % (e.__class__.__name__, _exception_message(e)) + % (e.__class__.__name__, str(e)) print_exec(sys.stderr) raise pickle.PicklingError(msg) f.close() diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index 2024260868..efd8b6d633 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -22,14 +22,14 @@ >>> conf.setMaster("local").setAppName("My app") >>> conf.get("spark.master") -u'local' +'local' >>> conf.get("spark.app.name") -u'My app' +'My app' >>> sc = SparkContext(conf=conf) >>> sc.master -u'local' +'local' >>> sc.appName -u'My app' +'My app' >>> sc.sparkHome is None True @@ -37,21 +37,21 @@ True >>> conf.setSparkHome("/path") >>> conf.get("spark.home") -u'/path' +'/path' >>> conf.setExecutorEnv("VAR1", "value1") >>> conf.setExecutorEnv(pairs = [("VAR3", "value3"), ("VAR4", "value4")]) >>> conf.get("spark.executorEnv.VAR1") -u'value1' +'value1' >>> print(conf.toDebugString()) spark.executorEnv.VAR1=value1 spark.executorEnv.VAR3=value3 spark.executorEnv.VAR4=value4 spark.home=/path >>> sorted(conf.getAll(), key=lambda p: p[0]) -[(u'spark.executorEnv.VAR1', u'value1'), (u'spark.executorEnv.VAR3', u'value3'), \ -(u'spark.executorEnv.VAR4', u'value4'), (u'spark.home', u'/path')] +[('spark.executorEnv.VAR1', 'value1'), ('spark.executorEnv.VAR3', 'value3'), \ +('spark.executorEnv.VAR4', 'value4'), ('spark.home', '/path')] >>> conf._jconf.setExecutorEnv("VAR5", "value5") JavaObject id... 
>>> print(conf.toDebugString()) @@ -65,11 +65,6 @@ spark.home=/path __all__ = ['SparkConf'] import sys -import re - -if sys.version > '3': - unicode = str - __doc__ = re.sub(r"(\W|^)[uU](['])", r'\1\2', __doc__) class SparkConf(object): @@ -124,9 +119,9 @@ class SparkConf(object): """Set a configuration property.""" # Try to set self._jconf first if JVM is created, set self._conf if JVM is not created yet. if self._jconf is not None: - self._jconf.set(key, unicode(value)) + self._jconf.set(key, str(value)) else: - self._conf[key] = unicode(value) + self._conf[key] = str(value) return self def setIfMissing(self, key, value): diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 6d58e1d144..2e105cc382 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -21,6 +21,7 @@ import signal import sys import threading import warnings +import importlib from threading import RLock from tempfile import NamedTemporaryFile @@ -37,15 +38,12 @@ from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deseria PairDeserializer, AutoBatchedSerializer, NoOpSerializer, ChunkedStream from pyspark.storagelevel import StorageLevel from pyspark.resource.information import ResourceInformation -from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix +from pyspark.rdd import RDD, _load_from_socket from pyspark.taskcontext import TaskContext from pyspark.traceback_utils import CallSite, first_spark_call from pyspark.status import StatusTracker from pyspark.profiler import ProfilerCollector, BasicProfiler -if sys.version > '3': - xrange = range - __all__ = ['SparkContext'] @@ -213,15 +211,6 @@ class SparkContext(object): self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') self.pythonVer = "%d.%d" % sys.version_info[:2] - if sys.version_info < (3, 6): - with warnings.catch_warnings(): - warnings.simplefilter("once") - warnings.warn( - "Support for Python 2 and Python 3 prior to version 3.6 is deprecated as " - "of Spark 3.0. See also the plan for dropping Python 2 support at " - "https://spark.apache.org/news/plan-for-dropping-python-2-support.html.", - DeprecationWarning) - # Broadcast's __reduce__ method stores Broadcast instances here. # This allows other code to determine which Broadcast instances have # been pickled, so it can determine which Java broadcast objects to @@ -398,7 +387,6 @@ class SparkContext(object): return self._jsc.version() @property - @ignore_unicode_prefix def applicationId(self): """ A unique identifier for the Spark application. @@ -408,7 +396,7 @@ class SparkContext(object): * in case of YARN something like 'application_1433865536131_34483' >>> sc.applicationId # doctest: +ELLIPSIS - u'local-...' + 'local-...' """ return self._jsc.sc().applicationId() @@ -490,20 +478,20 @@ class SparkContext(object): end = start start = 0 - return self.parallelize(xrange(start, end, step), numSlices) + return self.parallelize(range(start, end, step), numSlices) def parallelize(self, c, numSlices=None): """ - Distribute a local Python collection to form an RDD. Using xrange + Distribute a local Python collection to form an RDD. Using range is recommended if the input represents a range for performance. 
>>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() [[0], [2], [3], [4], [6]] - >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect() + >>> sc.parallelize(range(0, 6, 2), 5).glom().collect() [[], [0], [], [2], [4]] """ numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism - if isinstance(c, xrange): + if isinstance(c, range): size = len(c) if size == 0: return self.parallelize([], numSlices) @@ -522,7 +510,7 @@ class SparkContext(object): # the empty iterator to a list, thus make sure worker reuse takes effect. # See more details in SPARK-26549. assert len(list(iterator)) == 0 - return xrange(getStart(split), getStart(split + 1), step) + return range(getStart(split), getStart(split + 1), step) return self.parallelize([], numSlices).mapPartitionsWithIndex(f) @@ -591,7 +579,6 @@ class SparkContext(object): minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.objectFile(name, minPartitions), self) - @ignore_unicode_prefix def textFile(self, name, minPartitions=None, use_unicode=True): """ Read a text file from HDFS, a local file system (available on all @@ -608,13 +595,12 @@ class SparkContext(object): ... _ = testFile.write("Hello world!") >>> textFile = sc.textFile(path) >>> textFile.collect() - [u'Hello world!'] + ['Hello world!'] """ minPartitions = minPartitions or min(self.defaultParallelism, 2) return RDD(self._jsc.textFile(name, minPartitions), self, UTF8Deserializer(use_unicode)) - @ignore_unicode_prefix def wholeTextFiles(self, path, minPartitions=None, use_unicode=True): """ Read a directory of text files from HDFS, a local file system @@ -658,7 +644,7 @@ class SparkContext(object): ... _ = file2.write("2") >>> textFiles = sc.wholeTextFiles(dirPath) >>> sorted(textFiles.collect()) - [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] + [('.../1.txt', '1'), ('.../2.txt', '2')] """ minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.wholeTextFiles(path, minPartitions), self, @@ -846,7 +832,6 @@ class SparkContext(object): jrdd = self._jsc.checkpointFile(name) return RDD(jrdd, self, input_deserializer) - @ignore_unicode_prefix def union(self, rdds): """ Build the union of a list of RDDs. @@ -860,10 +845,10 @@ class SparkContext(object): ... _ = testFile.write("Hello") >>> textFile = sc.textFile(path) >>> textFile.collect() - [u'Hello'] + ['Hello'] >>> parallelized = sc.parallelize(["World!"]) >>> sorted(sc.union([textFile, parallelized]).collect()) - [u'Hello', 'World!'] + ['Hello', 'World!'] """ first_jrdd_deserializer = rdds[0]._jrdd_deserializer if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds): @@ -959,9 +944,8 @@ class SparkContext(object): self._python_includes.append(filename) # for tests in local mode sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) - if sys.version > '3': - import importlib - importlib.invalidate_caches() + + importlib.invalidate_caches() def setCheckpointDir(self, dirName): """ diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index 52f6ea9a37..920c04009d 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -20,7 +20,6 @@ # This script attempt to determine the correct setting for SPARK_HOME given # that Spark may have been installed on the system with pip. 
-from __future__ import print_function import os import sys @@ -41,26 +40,15 @@ def _find_spark_home(): # Add the path of the PySpark module if it exists import_error_raised = False - if sys.version < "3": - import imp - try: - module_home = imp.find_module("pyspark")[1] - paths.append(module_home) - # If we are installed in edit mode also look two dirs up - paths.append(os.path.join(module_home, "../../")) - except ImportError: - # Not pip installed no worries - import_error_raised = True - else: - from importlib.util import find_spec - try: - module_home = os.path.dirname(find_spec("pyspark").origin) - paths.append(module_home) - # If we are installed in edit mode also look two dirs up - paths.append(os.path.join(module_home, "../../")) - except ImportError: - # Not pip installed no worries - import_error_raised = True + from importlib.util import find_spec + try: + module_home = os.path.dirname(find_spec("pyspark").origin) + paths.append(module_home) + # If we are installed in edit mode also look two dirs up + paths.append(os.path.join(module_home, "../../")) + except ImportError: + # Not pip installed no worries + import_error_raised = True # Normalize the paths paths = [os.path.abspath(p) for p in paths] @@ -84,5 +72,6 @@ def _find_spark_home(): "'PYSPARK_PYTHON=python3 pyspark'.\n", file=sys.stderr) sys.exit(-1) + if __name__ == "__main__": print(_find_spark_home()) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 0daf09b17a..fba92a96ae 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -17,7 +17,6 @@ import atexit import os -import sys import signal import shlex import shutil @@ -27,14 +26,10 @@ import tempfile import time from subprocess import Popen, PIPE -if sys.version >= '3': - xrange = range - from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters from py4j.clientserver import ClientServer, JavaParameters, PythonParameters from pyspark.find_spark_home import _find_spark_home from pyspark.serializers import read_int, write_with_length, UTF8Deserializer -from pyspark.util import _exception_message def launch_gateway(conf=None, popen_kwargs=None): @@ -197,7 +192,7 @@ def local_connect_and_auth(port, auth_secret): _do_server_auth(sockfile, auth_secret) return (sockfile, sock) except socket.error as e: - emsg = _exception_message(e) + emsg = str(e) errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) sock.close() sock = None diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index cc8ce0567b..7c8cbe3a9f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -16,20 +16,20 @@ # import operator -import sys +import warnings from abc import ABCMeta, abstractmethod, abstractproperty from multiprocessing.pool import ThreadPool -from pyspark import since, keyword_only +from pyspark import keyword_only from pyspark.ml import Estimator, Predictor, PredictionModel, Model from pyspark.ml.param.shared import * from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \ _TreeEnsembleModel, _RandomForestParams, _GBTParams, \ - _HasVarianceImpurity, _TreeClassifierParams, _TreeEnsembleParams + _HasVarianceImpurity, _TreeClassifierParams from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel from pyspark.ml.util import * from pyspark.ml.base import _PredictorParams -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ +from 
pyspark.ml.wrapper import JavaParams, \ JavaPredictor, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc, _java2py, _py2java from pyspark.ml.linalg import Vectors diff --git a/python/pyspark/ml/common.py b/python/pyspark/ml/common.py index 387c5d7309..4e1d7f93ae 100644 --- a/python/pyspark/ml/common.py +++ b/python/pyspark/ml/common.py @@ -15,11 +15,6 @@ # limitations under the License. # -import sys -if sys.version >= '3': - long = int - unicode = str - import py4j.protocol from py4j.protocol import Py4JJavaError from py4j.java_gateway import JavaObject @@ -79,7 +74,7 @@ def _py2java(sc, obj): obj = [_py2java(sc, x) for x in obj] elif isinstance(obj, JavaObject): pass - elif isinstance(obj, (int, long, float, bool, bytes, unicode)): + elif isinstance(obj, (int, float, bool, bytes, str)): pass else: data = bytearray(PickleSerializer().dumps(obj)) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 498629cea8..c52ea62686 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -15,12 +15,7 @@ # limitations under the License. # -import sys -if sys.version > '3': - basestring = str - from pyspark import since, keyword_only, SparkContext -from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.linalg import _convert_to_vector from pyspark.ml.param.shared import * from pyspark.ml.util import JavaMLReadable, JavaMLWritable @@ -2178,7 +2173,6 @@ class MinMaxScalerModel(JavaModel, _MinMaxScalerParams, JavaMLReadable, JavaMLWr @inherit_doc -@ignore_unicode_prefix class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A feature transformer that converts the input array of strings into an array of n-grams. Null @@ -2196,15 +2190,15 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr >>> ngram.setOutputCol("nGrams") NGram... >>> ngram.transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b', 'b c', 'c d', 'd e']) >>> # Change n-gram length >>> ngram.setParams(n=4).transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e']) >>> # Temporarily modify output column. >>> ngram.transform(df, {ngram.outputCol: "output"}).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], output=['a b c d', 'b c d e']) >>> ngram.transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e']) >>> # Must use keyword arguments to specify params. >>> ngram.setParams("text") Traceback (most recent call last): @@ -3082,7 +3076,6 @@ class RobustScalerModel(JavaModel, _RobustScalerParams, JavaMLReadable, JavaMLWr @inherit_doc -@ignore_unicode_prefix class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A regex based tokenizer that extracts tokens either by using the @@ -3099,15 +3092,15 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, >>> reTokenizer.setOutputCol("words") RegexTokenizer... >>> reTokenizer.transform(df).head() - Row(text=u'A B c', words=[u'a', u'b', u'c']) + Row(text='A B c', words=['a', 'b', 'c']) >>> # Change a parameter. 
>>> reTokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'A B c', tokens=[u'a', u'b', u'c']) + Row(text='A B c', tokens=['a', 'b', 'c']) >>> # Temporarily modify a parameter. >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head() - Row(text=u'A B c', words=[u'a', u'b', u'c']) + Row(text='A B c', words=['a', 'b', 'c']) >>> reTokenizer.transform(df).head() - Row(text=u'A B c', tokens=[u'a', u'b', u'c']) + Row(text='A B c', tokens=['a', 'b', 'c']) >>> # Must use keyword arguments to specify params. >>> reTokenizer.setParams("text") Traceback (most recent call last): @@ -3935,7 +3928,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, @inherit_doc -@ignore_unicode_prefix class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A tokenizer that converts the input string to lowercase and then @@ -3946,15 +3938,15 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java >>> tokenizer.setInputCol("text") Tokenizer... >>> tokenizer.transform(df).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) + Row(text='a b c', words=['a', 'b', 'c']) >>> # Change a parameter. >>> tokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + Row(text='a b c', tokens=['a', 'b', 'c']) >>> # Temporarily modify a parameter. >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) + Row(text='a b c', words=['a', 'b', 'c']) >>> tokenizer.transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + Row(text='a b c', tokens=['a', 'b', 'c']) >>> # Must use keyword arguments to specify params. >>> tokenizer.setParams("text") Traceback (most recent call last): @@ -4476,7 +4468,6 @@ class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCo @inherit_doc -@ignore_unicode_prefix class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): """ Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further @@ -4505,7 +4496,7 @@ class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): +----+--------------------+ ... >>> model.findSynonymsArray("a", 2) - [(u'b', 0.015859870240092278), (u'c', -0.5680795907974243)] + [('b', 0.015859870240092278), ('c', -0.5680795907974243)] >>> from pyspark.sql.functions import format_number as fmt >>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show() +----+----------+ @@ -4668,7 +4659,7 @@ class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable): Returns a dataframe with two fields word and similarity (which gives the cosine similarity). """ - if not isinstance(word, basestring): + if not isinstance(word, str): word = _convert_to_vector(word) return self._call_java("findSynonyms", word, num) @@ -4680,7 +4671,7 @@ class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable): Returns an array with two fields word and similarity (which gives the cosine similarity). 
""" - if not isinstance(word, basestring): + if not isinstance(word, str): word = _convert_to_vector(word) tuples = self._java_obj.findSynonymsArray(word, num) return list(map(lambda st: (st._1(), st._2()), list(tuples))) diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index 7a5591f3fb..b91788a82c 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -15,8 +15,7 @@ # limitations under the License. # -from pyspark import keyword_only, since -from pyspark.rdd import ignore_unicode_prefix +from pyspark import keyword_only from pyspark.sql import DataFrame from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams @@ -132,7 +131,6 @@ class FPGrowthModel(JavaModel, _FPGrowthParams, JavaMLWritable, JavaMLReadable): return self._call_java("associationRules") -@ignore_unicode_prefix class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable): r""" A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in @@ -193,7 +191,7 @@ class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable): ... >>> new_data = spark.createDataFrame([(["t", "s"], )], ["items"]) >>> sorted(fpm.transform(new_data).first().newPrediction) - [u'x', u'y', u'z'] + ['x', 'y', 'z'] .. versionadded:: 2.2.0 """ diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index 4fb1036fba..20b24559b1 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -25,14 +25,13 @@ """ import sys -import warnings import numpy as np from distutils.version import LooseVersion from pyspark import SparkContext from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string -from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import SparkSession __all__ = ["ImageSchema"] diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index a79d5e5dcb..8be440da4f 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -27,18 +27,8 @@ import sys import array import struct -if sys.version >= '3': - basestring = str - xrange = range - import copyreg as copy_reg - long = int -else: - from itertools import izip as zip - import copy_reg - import numpy as np -from pyspark import since from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ IntegerType, ByteType, BooleanType @@ -47,13 +37,6 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] -if sys.version_info[:2] == (2, 7): - # speed up pickling array in Python 2.7 - def fast_pickle_array(ar): - return array.array, (ar.typecode, ar.tostring()) - copy_reg.pickle(array.array, fast_pickle_array) - - # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. 
@@ -68,7 +51,7 @@ except: def _convert_to_vector(l): if isinstance(l, Vector): return l - elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): + elif type(l) in (array.array, np.array, np.ndarray, list, tuple, range): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" @@ -102,7 +85,7 @@ def _vector_size(v): """ if isinstance(v, Vector): return len(v) - elif type(v) in (array.array, list, tuple, xrange): + elif type(v) in (array.array, list, tuple, range): return len(v) elif type(v) == np.ndarray: if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): @@ -415,7 +398,7 @@ class DenseVector(Vector): elif isinstance(other, SparseVector): if len(self) != other.size: return False - return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return Vectors._equals(list(range(len(self))), self.array, other.indices, other.values) return False def __ne__(self, other): @@ -520,7 +503,7 @@ class SparseVector(Vector): self.indices = np.array(args[0], dtype=np.int32) self.values = np.array(args[1], dtype=np.float64) assert len(self.indices) == len(self.values), "index and value arrays not same length" - for i in xrange(len(self.indices) - 1): + for i in range(len(self.indices) - 1): if self.indices[i] >= self.indices[i + 1]: raise TypeError( "Indices %s and %s are not strictly increasing" @@ -699,7 +682,7 @@ class SparseVector(Vector): inds = self.indices vals = self.values entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) - for i in xrange(len(inds))]) + for i in range(len(inds))]) return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): @@ -709,7 +692,7 @@ class SparseVector(Vector): elif isinstance(other, DenseVector): if self.size != len(other): return False - return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array) return False def __getitem__(self, index): @@ -791,7 +774,7 @@ class Vectors(object): >>> Vectors.dense(1.0, 2.0) DenseVector([1.0, 2.0]) """ - if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): + if len(elements) == 1 and not isinstance(elements[0], (float, int)): # it's list, numpy.array or other iterable object. 
elements = elements[0] return DenseVector(elements) @@ -1124,7 +1107,7 @@ class SparseMatrix(Matrix): Return a numpy.ndarray """ A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') - for k in xrange(self.colPtrs.size - 1): + for k in range(self.colPtrs.size - 1): startptr = self.colPtrs[k] endptr = self.colPtrs[k + 1] if self.isTransposed: diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 1be8755c7b..96b07bfa5f 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -16,15 +16,10 @@ # import array import sys -if sys.version > '3': - basestring = str - xrange = range - unicode = str - from abc import ABCMeta import copy -import numpy as np +import numpy as np from py4j.java_gateway import JavaObject from pyspark.ml.linalg import DenseVector, Vector, Matrix @@ -93,12 +88,12 @@ class TypeConverters(object): @staticmethod def _can_convert_to_list(value): vtype = type(value) - return vtype in [list, np.ndarray, tuple, xrange, array.array] or isinstance(value, Vector) + return vtype in [list, np.ndarray, tuple, range, array.array] or isinstance(value, Vector) @staticmethod def _can_convert_to_string(value): vtype = type(value) - return isinstance(value, basestring) or vtype in [np.unicode_, np.string_, np.str_] + return isinstance(value, str) or vtype in [np.unicode_, np.string_, np.str_] @staticmethod def identity(value): @@ -114,7 +109,7 @@ class TypeConverters(object): """ if type(value) == list: return value - elif type(value) in [np.ndarray, tuple, xrange, array.array]: + elif type(value) in [np.ndarray, tuple, range, array.array]: return list(value) elif isinstance(value, Vector): return list(value.toArray()) @@ -211,12 +206,10 @@ class TypeConverters(object): """ Convert a value to a string, if possible. """ - if isinstance(value, basestring): + if isinstance(value, str): return value - elif type(value) in [np.string_, np.str_]: + elif type(value) in [np.string_, np.str_, np.unicode_]: return str(value) - elif type(value) == np.unicode_: - return unicode(value) else: raise TypeError("Could not convert %s to string type" % type(value)) @@ -338,7 +331,7 @@ class Params(Identifiable): Tests whether this instance contains a param with a given (string) name. """ - if isinstance(paramName, basestring): + if isinstance(paramName, str): p = getattr(self, paramName, None) return isinstance(p, Param) else: @@ -421,7 +414,7 @@ class Params(Identifiable): if isinstance(param, Param): self._shouldOwn(param) return param - elif isinstance(param, basestring): + elif isinstance(param, str): return self.getParam(param) else: raise ValueError("Cannot resolve %r as a param." % param) @@ -510,7 +503,7 @@ class Params(Identifiable): :return: same instance, but with the uid and Param.parent values updated, including within param maps """ - newUid = unicode(newUid) + newUid = str(newUid) self.uid = newUid newDefaultParamMap = dict() newParamMap = dict() diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 2086e831f4..bc1ea87ad6 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - header = """# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 53d07ec966..eacb8b82b5 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -16,12 +16,8 @@ # import sys -import os -if sys.version > '3': - basestring = str - -from pyspark import since, keyword_only, SparkContext +from pyspark import keyword_only from pyspark.ml.base import Estimator, Model, Transformer from pyspark.ml.param import Param, Params from pyspark.ml.util import * diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 4c6bfa696b..7856a317c2 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -19,9 +19,6 @@ import sys import unittest -if sys.version > '3': - basestring = str - from pyspark.ml.feature import Binarizer, CountVectorizer, CountVectorizerModel, HashingTF, IDF, \ NGram, RFormula, StopWordsRemover, StringIndexer, StringIndexerModel, VectorSizeHint from pyspark.ml.linalg import DenseVector, SparseVector, Vectors @@ -91,7 +88,7 @@ class FeatureTests(SparkSessionTestCase): transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) self.assertEqual(type(stopWordRemover.getStopWords()), list) - self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring)) + self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str)) # Custom stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 1b2b1914cc..e1abd59a2d 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -35,10 +35,6 @@ from pyspark.ml.wrapper import JavaParams from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase -if sys.version > '3': - xrange = range - - class ParamTypeConversionTests(PySparkTestCase): """ Test that param type conversion happens. @@ -67,14 +63,14 @@ class ParamTypeConversionTests(PySparkTestCase): def test_list(self): l = [0, 1] for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l), - pyarray.array('l', l), xrange(2), tuple(l)]: + pyarray.array('l', l), range(2), tuple(l)]: converted = TypeConverters.toList(lst_like) self.assertEqual(type(converted), list) self.assertListEqual(converted, l) def test_list_int(self): for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]), - SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0), + SparseVector(2, {0: 1.0, 1: 2.0}), range(1, 3), (1.0, 2.0), pyarray.array('d', [1.0, 2.0])]: vs = VectorSlicer(indices=indices) self.assertListEqual(vs.getIndices(), [1, 2]) @@ -200,12 +196,7 @@ class ParamTests(SparkSessionTestCase): self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter) self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter) - if sys.version_info[0] >= 3: - # In Python 3, it is allowed to get/set attributes with non-ascii characters. 
- e_cls = AttributeError - else: - e_cls = UnicodeEncodeError - self.assertRaises(e_cls, lambda: testParams._resolveParam(u"아")) + self.assertRaises(AttributeError, lambda: testParams._resolveParam(u"아")) def test_params(self): testParams = TestParams() diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index 7d90579318..15e9ebb0f5 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ b/python/pyspark/ml/tests/test_training_summary.py @@ -18,9 +18,6 @@ import sys import unittest -if sys.version > '3': - basestring = str - from pyspark.ml.classification import BinaryLogisticRegressionSummary, LinearSVC, \ LinearSVCSummary, BinaryRandomForestClassificationSummary, LogisticRegression, \ LogisticRegressionSummary, RandomForestClassificationSummary, \ @@ -101,7 +98,7 @@ class TrainingSummaryTest(SparkSessionTestCase): self.assertEqual(s.residualDegreeOfFreedom, 1) self.assertEqual(s.residualDegreeOfFreedomNull, 2) self.assertEqual(s.rank, 1) - self.assertTrue(isinstance(s.solver, basestring)) + self.assertTrue(isinstance(s.solver, str)) self.assertTrue(isinstance(s.aic, float)) self.assertTrue(isinstance(s.deviance, float)) self.assertTrue(isinstance(s.nullDeviance, float)) diff --git a/python/pyspark/ml/tree.py b/python/pyspark/ml/tree.py index a13b27ec8a..460c76fabc 100644 --- a/python/pyspark/ml/tree.py +++ b/python/pyspark/ml/tree.py @@ -15,12 +15,10 @@ # limitations under the License. # -from pyspark import since, keyword_only from pyspark.ml.param.shared import * from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ - JavaPredictor, JavaPredictionModel -from pyspark.ml.common import inherit_doc, _java2py, _py2java +from pyspark.ml.wrapper import JavaPredictionModel +from pyspark.ml.common import inherit_doc @inherit_doc diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index e00753b2ff..7f3d942e2e 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -15,12 +15,11 @@ # limitations under the License. # import itertools -import sys from multiprocessing.pool import ThreadPool import numpy as np -from pyspark import since, keyword_only +from pyspark import keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.common import _py2java, _java2py from pyspark.ml.param import Params, Param, TypeConverters diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index aac2b38d3f..9ab6bfa9ba 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -20,12 +20,6 @@ import sys import os import time import uuid -import warnings - -if sys.version > '3': - basestring = str - unicode = str - long = int from pyspark import SparkContext, since from pyspark.ml.common import inherit_doc @@ -60,10 +54,10 @@ class Identifiable(object): @classmethod def _randomUID(cls): """ - Generate a unique unicode id for the object. The default implementation + Generate a unique string id for the object. The default implementation concatenates the class name, "_", and 12 random hex chars. 
""" - return unicode(cls.__name__ + "_" + uuid.uuid4().hex[-12:]) + return str(cls.__name__ + "_" + uuid.uuid4().hex[-12:]) @inherit_doc @@ -170,8 +164,8 @@ class JavaMLWriter(MLWriter): def save(self, path): """Save the ML instance to the input path.""" - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) self._jwrite.save(path) def overwrite(self): @@ -275,8 +269,8 @@ class JavaMLReader(MLReader): def load(self, path): """Load the ML instance from the input path.""" - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) java_obj = self._jread.load(path) if not hasattr(self._clazz, "_from_java"): raise NotImplementedError("This Java ML type cannot be loaded into Python currently: %r" @@ -430,7 +424,7 @@ class DefaultParamsWriter(MLWriter): for p in instance._defaultParamMap: jsonDefaultParams[p.name] = instance._defaultParamMap[p] - basicMetadata = {"class": cls, "timestamp": long(round(time.time() * 1000)), + basicMetadata = {"class": cls, "timestamp": int(round(time.time() * 1000)), "sparkVersion": sc.version, "uid": uid, "paramMap": jsonParams, "defaultParamMap": jsonDefaultParams} if extraMetadata is not None: diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index e59c6c7b25..c1d060a51c 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -16,9 +16,6 @@ # from abc import ABCMeta, abstractmethod -import sys -if sys.version >= '3': - xrange = range from pyspark import since from pyspark import SparkContext @@ -26,7 +23,6 @@ from pyspark.sql import DataFrame from pyspark.ml import Estimator, Predictor, PredictionModel, Transformer, Model from pyspark.ml.base import _PredictorParams from pyspark.ml.param import Params -from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol from pyspark.ml.util import _jvm from pyspark.ml.common import inherit_doc, _java2py, _py2java @@ -99,15 +95,15 @@ class JavaWrapper(object): # If pylist is a 2D array, then a 2D java array will be created. # The 2D array is a square, non-jagged 2D array that is big enough for all elements. inner_array_length = 0 - for i in xrange(len(pylist)): + for i in range(len(pylist)): inner_array_length = max(inner_array_length, len(pylist[i])) java_array = sc._gateway.new_array(java_class, len(pylist), inner_array_length) - for i in xrange(len(pylist)): - for j in xrange(len(pylist[i])): + for i in range(len(pylist)): + for j in range(len(pylist[i])): java_array[i][j] = pylist[i][j] else: java_array = sc._gateway.new_array(java_class, len(pylist)) - for i in xrange(len(pylist)): + for i in range(len(pylist)): java_array[i] = pylist[i] return java_array diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index ae26521ea9..6067693111 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -21,8 +21,6 @@ RDD-based machine learning APIs for Python (in maintenance mode). The `pyspark.mllib` package is in maintenance mode as of the Spark 2.0.0 release to encourage migration to the DataFrame-based APIs under the `pyspark.ml` package. 
""" -from __future__ import absolute_import - # MLlib currently needs NumPy 1.4+, so complain if lower import numpy diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index e41e5c9cc8..85cfe583fd 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -17,20 +17,13 @@ import sys import array as pyarray -import warnings - -if sys.version > '3': - xrange = range - basestring = str - from math import exp, log +from collections import namedtuple from numpy import array, random, tile -from collections import namedtuple - from pyspark import SparkContext, since -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector from pyspark.mllib.stat.distribution import MultivariateGaussian @@ -257,7 +250,7 @@ class KMeansModel(Saveable, Loader): return x.map(self.predict) x = _convert_to_vector(x) - for i in xrange(len(self.centers)): + for i in range(len(self.centers)): distance = x.squared_distance(self.centers[i]) if distance < best_distance: best = i @@ -708,7 +701,7 @@ class StreamingKMeansModel(KMeansModel): >>> stkm = StreamingKMeansModel(initCenters, initWeights) >>> data = sc.parallelize([[-0.1, -0.1], [0.1, 0.1], ... [0.9, 0.9], [1.1, 1.1]]) - >>> stkm = stkm.update(data, 1.0, u"batches") + >>> stkm = stkm.update(data, 1.0, "batches") >>> stkm.centers array([[ 0., 0.], [ 1., 1.]]) @@ -720,7 +713,7 @@ class StreamingKMeansModel(KMeansModel): [3.0, 3.0] >>> decayFactor = 0.0 >>> data = sc.parallelize([DenseVector([1.5, 1.5]), DenseVector([0.2, 0.2])]) - >>> stkm = stkm.update(data, 0.0, u"batches") + >>> stkm = stkm.update(data, 0.0, "batches") >>> stkm.centers array([[ 0.2, 0.2], [ 1.5, 1.5]]) @@ -743,7 +736,6 @@ class StreamingKMeansModel(KMeansModel): """Return the cluster weights.""" return self._clusterWeights - @ignore_unicode_prefix @since('1.5.0') def update(self, data, decayFactor, timeUnit): """Update the centroids, according to data @@ -979,8 +971,8 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader): """ if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) model = callMLlibFunc("loadLDAModel", sc, path) return LDAModel(model) diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index bac8f35056..24e2f19825 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -15,11 +15,6 @@ # limitations under the License. 
# -import sys -if sys.version >= '3': - long = int - unicode = str - import py4j.protocol from py4j.protocol import Py4JJavaError from py4j.java_gateway import JavaObject @@ -81,7 +76,7 @@ def _py2java(sc, obj): obj = [_py2java(sc, x) for x in obj] elif isinstance(obj, JavaObject): pass - elif isinstance(obj, (int, long, float, bool, bytes, unicode)): + elif isinstance(obj, (int, float, bool, bytes, str)): pass else: data = bytearray(PickleSerializer().dumps(obj)) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 3efae6ff0e..80a197eaa7 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -18,21 +18,15 @@ """ Python package for feature in MLlib. """ -from __future__ import absolute_import - import sys import warnings -if sys.version >= '3': - basestring = str - unicode = str - from py4j.protocol import Py4JJavaError from pyspark import since -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import ( - Vector, Vectors, DenseVector, SparseVector, _convert_to_vector) + Vectors, DenseVector, SparseVector, _convert_to_vector) from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import JavaLoader, JavaSaveable @@ -616,7 +610,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): .. note:: Local use only """ - if not isinstance(word, basestring): + if not isinstance(word, str): word = _convert_to_vector(word) words, similarity = self.call("findSynonyms", word, num) return zip(words, similarity) @@ -640,7 +634,6 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): return Word2VecModel(model) -@ignore_unicode_prefix class Word2Vec(object): """Word2Vec creates vector representation of words in a text corpus. The algorithm first constructs a vocabulary from the corpus @@ -668,7 +661,7 @@ class Word2Vec(object): >>> syms = model.findSynonyms("a", 2) >>> [s[0] for s in syms] - [u'b', u'c'] + ['b', 'c'] But querying for synonyms of a vector may return the word whose representation is that vector: @@ -676,7 +669,7 @@ class Word2Vec(object): >>> vec = model.transform("a") >>> syms = model.findSynonyms(vec, 2) >>> [s[0] for s in syms] - [u'a', u'b'] + ['a', 'b'] >>> import os, tempfile >>> path = tempfile.mkdtemp() @@ -686,7 +679,7 @@ class Word2Vec(object): True >>> syms = sameModel.findSynonyms("a", 2) >>> [s[0] for s in syms] - [u'b', u'c'] + ['b', 'c'] >>> from shutil import rmtree >>> try: ... rmtree(path) diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index 373a141456..cbbd7b351b 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -20,7 +20,6 @@ import sys from collections import namedtuple from pyspark import since -from pyspark.rdd import ignore_unicode_prefix from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc from pyspark.mllib.util import JavaSaveable, JavaLoader, inherit_doc @@ -28,7 +27,6 @@ __all__ = ['FPGrowth', 'FPGrowthModel', 'PrefixSpan', 'PrefixSpanModel'] @inherit_doc -@ignore_unicode_prefix class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ A FP-Growth model for mining frequent itemsets @@ -38,7 +36,7 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> rdd = sc.parallelize(data, 2) >>> model = FPGrowth.train(rdd, 0.6, 2) >>> sorted(model.freqItemsets().collect()) - [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ... 
+ [FreqItemset(items=['a'], freq=4), FreqItemset(items=['c'], freq=3), ... >>> model_path = temp_path + "/fpm" >>> model.save(sc, model_path) >>> sameModel = FPGrowthModel.load(sc, model_path) @@ -101,7 +99,6 @@ class FPGrowth(object): @inherit_doc -@ignore_unicode_prefix class PrefixSpanModel(JavaModelWrapper): """ Model fitted by PrefixSpan @@ -114,7 +111,7 @@ class PrefixSpanModel(JavaModelWrapper): >>> rdd = sc.parallelize(data, 2) >>> model = PrefixSpan.train(rdd) >>> sorted(model.freqSequences().collect()) - [FreqSequence(sequence=[[u'a']], freq=3), FreqSequence(sequence=[[u'a'], [u'a']], freq=1), ... + [FreqSequence(sequence=[['a']], freq=3), FreqSequence(sequence=[['a'], ['a']], freq=1), ... .. versionadded:: 1.6.0 """ diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index cd09621b13..c1402fb98a 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -27,15 +27,6 @@ import sys import array import struct -if sys.version >= '3': - basestring = str - xrange = range - import copyreg as copy_reg - long = int -else: - from itertools import izip as zip - import copy_reg - import numpy as np from pyspark import since @@ -49,13 +40,6 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'QRDecomposition'] -if sys.version_info[:2] == (2, 7): - # speed up pickling array in Python 2.7 - def fast_pickle_array(ar): - return array.array, (ar.typecode, ar.tostring()) - copy_reg.pickle(array.array, fast_pickle_array) - - # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. @@ -70,7 +54,7 @@ except: def _convert_to_vector(l): if isinstance(l, Vector): return l - elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): + elif type(l) in (array.array, np.array, np.ndarray, list, tuple, range): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" @@ -104,7 +88,7 @@ def _vector_size(v): """ if isinstance(v, Vector): return len(v) - elif type(v) in (array.array, list, tuple, xrange): + elif type(v) in (array.array, list, tuple, range): return len(v) elif type(v) == np.ndarray: if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): @@ -459,7 +443,7 @@ class DenseVector(Vector): elif isinstance(other, SparseVector): if len(self) != other.size: return False - return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return Vectors._equals(list(range(len(self))), self.array, other.indices, other.values) return False def __ne__(self, other): @@ -556,7 +540,7 @@ class SparseVector(Vector): self.indices = np.array(args[0], dtype=np.int32) self.values = np.array(args[1], dtype=np.float64) assert len(self.indices) == len(self.values), "index and value arrays not same length" - for i in xrange(len(self.indices) - 1): + for i in range(len(self.indices) - 1): if self.indices[i] >= self.indices[i + 1]: raise TypeError( "Indices %s and %s are not strictly increasing" @@ -788,7 +772,7 @@ class SparseVector(Vector): inds = self.indices vals = self.values entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) - for i in xrange(len(inds))]) + for i in range(len(inds))]) return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): @@ -798,7 +782,7 @@ class SparseVector(Vector): elif isinstance(other, DenseVector): if self.size != len(other): 
return False - return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array) return False def __getitem__(self, index): @@ -880,7 +864,7 @@ class Vectors(object): >>> Vectors.dense(1.0, 2.0) DenseVector([1.0, 2.0]) """ - if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): + if len(elements) == 1 and not isinstance(elements[0], (float, int)): # it's list, numpy.array or other iterable object. elements = elements[0] return DenseVector(elements) @@ -1279,7 +1263,7 @@ class SparseMatrix(Matrix): Return an numpy.ndarray """ A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') - for k in xrange(self.colPtrs.size - 1): + for k in range(self.colPtrs.size - 1): startptr = self.colPtrs[k] endptr = self.colPtrs[k + 1] if self.isTransposed: diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 56701758c8..603d31d3d7 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -21,9 +21,6 @@ Package for distributed linear algebra. import sys -if sys.version >= '3': - long = int - from py4j.java_gateway import JavaObject from pyspark import RDD, since @@ -95,9 +92,9 @@ class RowMatrix(DistributedMatrix): """ if isinstance(rows, RDD): rows = rows.map(_convert_to_vector) - java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols)) + java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols)) elif isinstance(rows, DataFrame): - java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols)) + java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols)) elif (isinstance(rows, JavaObject) and rows.getClass().getSimpleName() == "RowMatrix"): java_matrix = rows @@ -439,13 +436,13 @@ class IndexedRow(object): """ Represents a row of an IndexedRowMatrix. - Just a wrapper over a (long, vector) tuple. + Just a wrapper over a (int, vector) tuple. :param index: The index for the given row. :param vector: The row in the matrix at the given index. """ def __init__(self, index, vector): - self.index = long(index) + self.index = int(index) self.vector = _convert_to_vector(vector) def __repr__(self): @@ -465,8 +462,8 @@ class IndexedRowMatrix(DistributedMatrix): """ Represents a row-oriented distributed Matrix with indexed rows. - :param rows: An RDD of IndexedRows or (long, vector) tuples or a DataFrame consisting of a - long typed column of indices and a vector typed column. + :param rows: An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a + int typed column of indices and a vector typed column. :param numRows: Number of rows in the matrix. A non-positive value means unknown, at which point the number of rows will be determined by the max row @@ -510,14 +507,14 @@ class IndexedRowMatrix(DistributedMatrix): # both be easily serialized. We will convert back to # IndexedRows on the Scala side. 
java_matrix = callMLlibFunc("createIndexedRowMatrix", rows.toDF(), - long(numRows), int(numCols)) + int(numRows), int(numCols)) elif isinstance(rows, DataFrame): - java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, long(numRows), int(numCols)) + java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, int(numRows), int(numCols)) elif (isinstance(rows, JavaObject) and rows.getClass().getSimpleName() == "IndexedRowMatrix"): java_matrix = rows else: - raise TypeError("rows should be an RDD of IndexedRows or (long, vector) tuples, " + raise TypeError("rows should be an RDD of IndexedRows or (int, vector) tuples, " "got %s" % type(rows)) self._java_matrix_wrapper = JavaModelWrapper(java_matrix) @@ -731,15 +728,15 @@ class MatrixEntry(object): """ Represents an entry of a CoordinateMatrix. - Just a wrapper over a (long, long, float) tuple. + Just a wrapper over a (int, int, float) tuple. :param i: The row index of the matrix. :param j: The column index of the matrix. :param value: The (i, j)th entry of the matrix, as a float. """ def __init__(self, i, j, value): - self.i = long(i) - self.j = long(j) + self.i = int(i) + self.j = int(j) self.value = float(value) def __repr__(self): @@ -760,7 +757,7 @@ class CoordinateMatrix(DistributedMatrix): Represents a matrix in coordinate format. :param entries: An RDD of MatrixEntry inputs or - (long, long, float) tuples. + (int, int, float) tuples. :param numRows: Number of rows in the matrix. A non-positive value means unknown, at which point the number of rows will be determined by the max row @@ -804,13 +801,13 @@ class CoordinateMatrix(DistributedMatrix): # each be easily serialized. We will convert back to # MatrixEntry inputs on the Scala side. java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(), - long(numRows), long(numCols)) + int(numRows), int(numCols)) elif (isinstance(entries, JavaObject) and entries.getClass().getSimpleName() == "CoordinateMatrix"): java_matrix = entries else: raise TypeError("entries should be an RDD of MatrixEntry entries or " - "(long, long, float) tuples, got %s" % type(entries)) + "(int, int, float) tuples, got %s" % type(entries)) self._java_matrix_wrapper = JavaModelWrapper(java_matrix) @@ -1044,7 +1041,7 @@ class BlockMatrix(DistributedMatrix): # the Scala side. java_matrix = callMLlibFunc("createBlockMatrix", blocks.toDF(), int(rowsPerBlock), int(colsPerBlock), - long(numRows), long(numCols)) + int(numRows), int(numCols)) elif (isinstance(blocks, JavaObject) and blocks.getClass().getSimpleName() == "BlockMatrix"): java_matrix = blocks diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py index 7250eab670..56444c152f 100644 --- a/python/pyspark/mllib/stat/KernelDensity.py +++ b/python/pyspark/mllib/stat/KernelDensity.py @@ -15,11 +15,6 @@ # limitations under the License. 
# -import sys - -if sys.version > '3': - xrange = range - import numpy as np from pyspark.mllib.common import callMLlibFunc diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index d49f741a2f..43454ba518 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -16,10 +16,8 @@ # import sys -if sys.version >= '3': - basestring = str -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import Matrix, _convert_to_vector from pyspark.mllib.regression import LabeledPoint @@ -157,7 +155,6 @@ class Statistics(object): return callMLlibFunc("corr", x.map(float), y.map(float), method) @staticmethod - @ignore_unicode_prefix def chiSqTest(observed, expected=None): """ If `observed` is Vector, conduct Pearson's chi-squared goodness @@ -199,9 +196,9 @@ class Statistics(object): >>> print(round(pearson.pValue, 4)) 0.8187 >>> pearson.method - u'pearson' + 'pearson' >>> pearson.nullHypothesis - u'observed follows the same distribution as expected.' + 'observed follows the same distribution as expected.' >>> observed = Vectors.dense([21, 38, 43, 80]) >>> expected = Vectors.dense([3, 5, 7, 20]) @@ -242,7 +239,6 @@ class Statistics(object): return ChiSqTestResult(jmodel) @staticmethod - @ignore_unicode_prefix def kolmogorovSmirnovTest(data, distName="norm", *params): """ Performs the Kolmogorov-Smirnov (KS) test for data sampled from @@ -282,7 +278,7 @@ class Statistics(object): >>> print(round(ksmodel.statistic, 3)) 0.175 >>> ksmodel.nullHypothesis - u'Sample follows theoretical distribution' + 'Sample follows theoretical distribution' >>> data = sc.parallelize([2.0, 3.0, 4.0]) >>> ksmodel = kstest(data, "norm", 3.0, 1.0) @@ -293,7 +289,7 @@ class Statistics(object): """ if not isinstance(data, RDD): raise TypeError("data should be an RDD, got %s." % type(data)) - if not isinstance(distName, basestring): + if not isinstance(distName, str): raise TypeError("distName should be a string, got %s." % type(distName)) params = [float(param) for param in params] diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py index 312730e8af..21c2bb422a 100644 --- a/python/pyspark/mllib/tests/test_linalg.py +++ b/python/pyspark/mllib/tests/test_linalg.py @@ -31,9 +31,6 @@ from pyspark.sql import Row from pyspark.testing.mllibutils import MLlibTestCase from pyspark.testing.utils import have_scipy -if sys.version >= '3': - long = int - class VectorTests(MLlibTestCase): @@ -447,7 +444,7 @@ class VectorUDTTests(MLlibTestCase): def test_indexed_row_matrix_from_dataframe(self): from pyspark.sql.utils import IllegalArgumentException - df = self.spark.createDataFrame([Row(long(0), Vectors.dense(1))]) + df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))]) matrix = IndexedRowMatrix(df) self.assertEqual(matrix.numRows(), 1) self.assertEqual(matrix.numCols(), 1) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 2d8df461ac..e05dfdb953 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import absolute_import - import sys import random diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index f0f9cda467..a0be29a82e 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -18,10 +18,6 @@ import sys import numpy as np -if sys.version > '3': - xrange = range - basestring = str - from pyspark import SparkContext, since from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector @@ -46,7 +42,7 @@ class MLUtils(object): nnz = len(items) - 1 indices = np.zeros(nnz, dtype=np.int32) values = np.zeros(nnz) - for i in xrange(nnz): + for i in range(nnz): index, value = items[1 + i].split(":") indices[i] = int(index) - 1 values[i] = float(value) @@ -61,10 +57,10 @@ class MLUtils(object): v = _convert_to_vector(p.features) if isinstance(v, SparseVector): nnz = len(v.indices) - for i in xrange(nnz): + for i in range(nnz): items.append(str(v.indices[i] + 1) + ":" + str(v.values[i])) else: - for i in xrange(len(v)): + for i in range(len(v)): items.append(str(i + 1) + ":" + str(v[i])) return " ".join(items) @@ -396,8 +392,8 @@ class JavaSaveable(Saveable): """Save this model to the given path.""" if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) self._java_model.save(sc._jsc.sc(), path) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index db0c1971cd..437b2c4465 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -33,15 +33,10 @@ from itertools import chain from functools import reduce from math import sqrt, log, isinf, isnan, pow, ceil -if sys.version > '3': - basestring = unicode = str -else: - from itertools import imap as map, ifilter as filter - from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import AutoBatchedSerializer, BatchedSerializer, NoOpSerializer, \ CartesianDeserializer, CloudPickleSerializer, PairDeserializer, PickleSerializer, \ - UTF8Deserializer, pack_long, read_int, write_int + pack_long, read_int, write_int from pyspark.join import python_join, python_left_outer_join, \ python_right_outer_join, python_full_outer_join, python_cogroup from pyspark.statcounter import StatCounter @@ -93,7 +88,7 @@ def portable_hash(x): 219750521 """ - if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ: + if 'PYTHONHASHSEED' not in os.environ: raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED") if x is None: @@ -204,19 +199,6 @@ def _local_iterator_from_socket(sock_info, serializer): return iter(PyLocalIterable(sock_info, serializer)) -def ignore_unicode_prefix(f): - """ - Ignore the 'u' prefix of string in doc tests, to make it works - in both python 2 and 3 - """ - if sys.version >= '3': - # the representation of unicode string in Python 3 does not have prefix 'u', - # so remove the prefix 'u' for doc tests - literal_re = re.compile(r"(\W|^)[uU](['])", re.UNICODE) - f.__doc__ = literal_re.sub(r'\1\2', f.__doc__) - return f - - class Partitioner(object): def __init__(self, numPartitions, partitionFunc): self.numPartitions = numPartitions @@ -797,13 +779,12 @@ class RDD(object): """ return self.map(lambda x: (f(x), x)).groupByKey(numPartitions, partitionFunc) - 
@ignore_unicode_prefix def pipe(self, command, env=None, checkCode=False): """ Return an RDD created by piping elements to a forked external process. >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect() - [u'1', u'2', u'', u'3'] + ['1', '2', '', '3'] :param checkCode: whether or not to check the return value of the shell command. """ @@ -816,7 +797,7 @@ class RDD(object): def pipe_objs(out): for obj in iterator: - s = unicode(obj).rstrip('\n') + '\n' + s = str(obj).rstrip('\n') + '\n' out.write(s.encode('utf-8')) out.close() Thread(target=pipe_objs, args=[pipe.stdin]).start() @@ -1591,7 +1572,6 @@ class RDD(object): ser = BatchedSerializer(PickleSerializer(), batchSize) self._reserialize(ser)._jrdd.saveAsObjectFile(path) - @ignore_unicode_prefix def saveAsTextFile(self, path, compressionCodecClass=None): """ Save this RDD as a text file, using string representations of elements. @@ -1625,13 +1605,13 @@ class RDD(object): >>> from fileinput import input, hook_compressed >>> result = sorted(input(glob(tempFile3.name + "/part*.gz"), openhook=hook_compressed)) >>> b''.join(result).decode('utf-8') - u'bar\\nfoo\\n' + 'bar\\nfoo\\n' """ def func(split, iterator): for x in iterator: - if not isinstance(x, (unicode, bytes)): - x = unicode(x) - if isinstance(x, unicode): + if not isinstance(x, (str, bytes)): + x = str(x) + if isinstance(x, str): x = x.encode("utf-8") yield x keyed = self.mapPartitionsWithIndex(func) @@ -2281,14 +2261,13 @@ class RDD(object): if n: return n - @ignore_unicode_prefix def setName(self, name): """ Assign a name to this RDD. >>> rdd1 = sc.parallelize([1, 2]) >>> rdd1.setName('RDD1').name() - u'RDD1' + 'RDD1' """ self._jrdd.setName(name) return self diff --git a/python/pyspark/resultiterable.py b/python/pyspark/resultiterable.py index c867b51877..cd2a59513b 100644 --- a/python/pyspark/resultiterable.py +++ b/python/pyspark/resultiterable.py @@ -15,10 +15,7 @@ # limitations under the License. # -try: - from collections.abc import Iterable -except ImportError: - from collections import Iterable +from collections.abc import Iterable __all__ = ["ResultIterable"] diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 49b7cb4546..80ce9b8408 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -58,18 +58,11 @@ import types import collections import zlib import itertools - -if sys.version < '3': - import cPickle as pickle - from itertools import izip as zip, imap as map -else: - import pickle - basestring = unicode = str - xrange = range +import pickle pickle_protocol = pickle.HIGHEST_PROTOCOL from pyspark import cloudpickle -from pyspark.util import _exception_message, print_exec +from pyspark.util import print_exec __all__ = ["PickleSerializer", "MarshalSerializer", "UTF8Deserializer"] @@ -132,11 +125,6 @@ class FramedSerializer(Serializer): where `length` is a 32-bit integer and data is `length` bytes. """ - def __init__(self): - # On Python 2.6, we can't write bytearrays to streams, so we need to convert them - # to strings first. Check if the version number is that old. 
- self._only_write_strings = sys.version_info[0:2] <= (2, 6) - def dump_stream(self, iterator, stream): for obj in iterator: self._write_with_length(obj, stream) @@ -155,10 +143,7 @@ class FramedSerializer(Serializer): if len(serialized) > (1 << 31): raise ValueError("can not serialize object larger than 2G") write_int(len(serialized), stream) - if self._only_write_strings: - stream.write(str(serialized)) - else: - stream.write(serialized) + stream.write(serialized) def _read_with_length(self, stream): length = read_int(stream) @@ -204,7 +189,7 @@ class BatchedSerializer(Serializer): yield list(iterator) elif hasattr(iterator, "__len__") and hasattr(iterator, "__getslice__"): n = len(iterator) - for i in xrange(0, n, self.batchSize): + for i in range(0, n, self.batchSize): yield iterator[i: i + self.batchSize] else: items = [] @@ -395,23 +380,8 @@ def _hijack_namedtuple(): return types.FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__) - def _kwdefaults(f): - # __kwdefaults__ contains the default values of keyword-only arguments which are - # introduced from Python 3. The possible cases for __kwdefaults__ in namedtuple - # are as below: - # - # - Does not exist in Python 2. - # - Returns None in <= Python 3.5.x. - # - Returns a dictionary containing the default values to the keys from Python 3.6.x - # (See https://bugs.python.org/issue25628). - kargs = getattr(f, "__kwdefaults__", None) - if kargs is None: - return {} - else: - return kargs - _old_namedtuple = _copy_func(collections.namedtuple) - _old_namedtuple_kwdefaults = _kwdefaults(collections.namedtuple) + _old_namedtuple_kwdefaults = collections.namedtuple.__kwdefaults__ def namedtuple(*args, **kwargs): for k, v in _old_namedtuple_kwdefaults.items(): @@ -453,12 +423,8 @@ class PickleSerializer(FramedSerializer): def dumps(self, obj): return pickle.dumps(obj, pickle_protocol) - if sys.version >= '3': - def loads(self, obj, encoding="bytes"): - return pickle.loads(obj, encoding=encoding) - else: - def loads(self, obj, encoding=None): - return pickle.loads(obj) + def loads(self, obj, encoding="bytes"): + return pickle.loads(obj, encoding=encoding) class CloudPickleSerializer(PickleSerializer): @@ -469,7 +435,7 @@ class CloudPickleSerializer(PickleSerializer): except pickle.PickleError: raise except Exception as e: - emsg = _exception_message(e) + emsg = str(e) if "'i' format requires" in emsg: msg = "Object too large to serialize: %s" % emsg else: diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 65e3bdbc05..cde163bd2d 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -26,11 +26,8 @@ import os import platform import warnings -import py4j - -from pyspark import SparkConf from pyspark.context import SparkContext -from pyspark.sql import SparkSession, SQLContext +from pyspark.sql import SparkSession if os.environ.get("SPARK_EXECUTOR_URI"): SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py index c28cb8c3b9..af32469e82 100644 --- a/python/pyspark/sql/__init__.py +++ b/python/pyspark/sql/__init__.py @@ -39,9 +39,6 @@ Important classes of Spark SQL and DataFrames: - :class:`pyspark.sql.Window` For working with window functions. 
""" -from __future__ import absolute_import - - from pyspark.sql.types import Row from pyspark.sql.context import SQLContext, HiveContext, UDFRegistration from pyspark.sql.session import SparkSession diff --git a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py index ed62a72d6c..974412ee4e 100644 --- a/python/pyspark/sql/avro/functions.py +++ b/python/pyspark/sql/avro/functions.py @@ -21,12 +21,10 @@ A collections of builtin avro functions from pyspark import since, SparkContext -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import Column, _to_java_column from pyspark.util import _print_missing_jar -@ignore_unicode_prefix @since(3.0) def from_avro(data, jsonFormatSchema, options={}): """ @@ -45,7 +43,7 @@ def from_avro(data, jsonFormatSchema, options={}): >>> from pyspark.sql import Row >>> from pyspark.sql.avro.functions import from_avro, to_avro - >>> data = [(1, Row(name='Alice', age=2))] + >>> data = [(1, Row(age=2, name='Alice'))] >>> df = spark.createDataFrame(data, ("key", "value")) >>> avroDf = df.select(to_avro(df.value).alias("avro")) >>> avroDf.collect() @@ -55,7 +53,7 @@ def from_avro(data, jsonFormatSchema, options={}): ... "fields":[{"name":"age","type":["long","null"]}, ... {"name":"name","type":["string","null"]}]},"null"]}]}''' >>> avroDf.select(from_avro(avroDf.avro, jsonFormatSchema).alias("value")).collect() - [Row(value=Row(avro=Row(age=2, name=u'Alice')))] + [Row(value=Row(avro=Row(age=2, name='Alice')))] """ sc = SparkContext._active_spark_context @@ -69,7 +67,6 @@ def from_avro(data, jsonFormatSchema, options={}): return Column(jc) -@ignore_unicode_prefix @since(3.0) def to_avro(data, jsonFormatSchema=""): """ diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 974251f63b..25fc696dac 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -20,10 +20,8 @@ import warnings from collections import namedtuple from pyspark import since -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType from pyspark.sql.dataframe import DataFrame -from pyspark.sql.udf import UserDefinedFunction -from pyspark.sql.types import IntegerType, StringType, StructType +from pyspark.sql.types import StructType Database = namedtuple("Database", "name description locationUri") @@ -44,19 +42,16 @@ class Catalog(object): self._jsparkSession = sparkSession._jsparkSession self._jcatalog = sparkSession._jsparkSession.catalog() - @ignore_unicode_prefix @since(2.0) def currentDatabase(self): """Returns the current default database in this session.""" return self._jcatalog.currentDatabase() - @ignore_unicode_prefix @since(2.0) def setCurrentDatabase(self, dbName): """Sets the current default database in this session.""" return self._jcatalog.setCurrentDatabase(dbName) - @ignore_unicode_prefix @since(2.0) def listDatabases(self): """Returns a list of databases available across all sessions.""" @@ -70,7 +65,6 @@ class Catalog(object): locationUri=jdb.locationUri())) return databases - @ignore_unicode_prefix @since(2.0) def listTables(self, dbName=None): """Returns a list of tables/views in the specified database. @@ -92,7 +86,6 @@ class Catalog(object): isTemporary=jtable.isTemporary())) return tables - @ignore_unicode_prefix @since(2.0) def listFunctions(self, dbName=None): """Returns a list of functions registered in the specified database. 
@@ -113,7 +106,6 @@ class Catalog(object): isTemporary=jfunction.isTemporary())) return functions - @ignore_unicode_prefix @since(2.0) def listColumns(self, tableName, dbName=None): """Returns a list of columns for the given table/view in the specified database. diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index ef4944c912..bd4c355762 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -19,15 +19,8 @@ import sys import json import warnings -if sys.version >= '3': - basestring = str - long = int - -from py4j.java_gateway import is_instance_of - from pyspark import copy_func, since from pyspark.context import SparkContext -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.types import * __all__ = ["Column"] @@ -46,7 +39,7 @@ def _create_column_from_name(name): def _to_java_column(col): if isinstance(col, Column): jcol = col._jc - elif isinstance(col, basestring): + elif isinstance(col, str): jcol = _create_column_from_name(col) else: raise TypeError( @@ -359,7 +352,7 @@ class Column(object): :param other: string in line >>> df.filter(df.name.contains('o')).collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] """ _rlike_doc = """ SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex @@ -368,7 +361,7 @@ class Column(object): :param other: an extended regex expression >>> df.filter(df.name.rlike('ice$')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ _like_doc = """ SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. @@ -378,7 +371,7 @@ class Column(object): See :func:`rlike` for a regex version >>> df.filter(df.name.like('Al%')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ _startswith_doc = """ String starts with. Returns a boolean :class:`Column` based on a string match. 
@@ -386,7 +379,7 @@ class Column(object): :param other: string at start of line (do not use a regex `^`) >>> df.filter(df.name.startswith('Al')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.filter(df.name.startswith('^Al')).collect() [] """ @@ -396,18 +389,17 @@ class Column(object): :param other: string at end of line (do not use a regex `$`) >>> df.filter(df.name.endswith('ice')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.filter(df.name.endswith('ice$')).collect() [] """ - contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc)) - rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc)) - like = ignore_unicode_prefix(_bin_op("like", _like_doc)) - startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc)) - endswith = ignore_unicode_prefix(_bin_op("endsWith", _endswith_doc)) + contains = _bin_op("contains", _contains_doc) + rlike = _bin_op("rlike", _rlike_doc) + like = _bin_op("like", _like_doc) + startswith = _bin_op("startsWith", _startswith_doc) + endswith = _bin_op("endsWith", _endswith_doc) - @ignore_unicode_prefix @since(1.3) def substr(self, startPos, length): """ @@ -417,7 +409,7 @@ class Column(object): :param length: length of the substring (int or Column) >>> df.select(df.name.substr(1, 3).alias("col")).collect() - [Row(col=u'Ali'), Row(col=u'Bob')] + [Row(col='Ali'), Row(col='Bob')] """ if type(startPos) != type(length): raise TypeError( @@ -435,7 +427,6 @@ class Column(object): raise TypeError("Unexpected type: %s" % type(startPos)) return Column(jc) - @ignore_unicode_prefix @since(1.5) def isin(self, *cols): """ @@ -443,9 +434,9 @@ class Column(object): expression is contained by the evaluated values of the arguments. >>> df[df.name.isin("Bob", "Mike")].collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df[df.age.isin([1, 2, 3])].collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ if len(cols) == 1 and isinstance(cols[0], (list, set)): cols = cols[0] @@ -461,7 +452,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.asc()).collect() - [Row(name=u'Alice'), Row(name=u'Tom')] + [Row(name='Alice'), Row(name='Tom')] """ _asc_nulls_first_doc = """ Returns a sort expression based on ascending order of the column, and null values @@ -470,7 +461,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect() - [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')] + [Row(name=None), Row(name='Alice'), Row(name='Tom')] .. versionadded:: 2.4 """ @@ -481,7 +472,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect() - [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)] + [Row(name='Alice'), Row(name='Tom'), Row(name=None)] .. 
versionadded:: 2.4 """ @@ -491,7 +482,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.desc()).collect() - [Row(name=u'Tom'), Row(name=u'Alice')] + [Row(name='Tom'), Row(name='Alice')] """ _desc_nulls_first_doc = """ Returns a sort expression based on the descending order of the column, and null values @@ -500,7 +491,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect() - [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')] + [Row(name=None), Row(name='Tom'), Row(name='Alice')] .. versionadded:: 2.4 """ @@ -511,37 +502,37 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect() - [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)] + [Row(name='Tom'), Row(name='Alice'), Row(name=None)] .. versionadded:: 2.4 """ - asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc)) - asc_nulls_first = ignore_unicode_prefix(_unary_op("asc_nulls_first", _asc_nulls_first_doc)) - asc_nulls_last = ignore_unicode_prefix(_unary_op("asc_nulls_last", _asc_nulls_last_doc)) - desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc)) - desc_nulls_first = ignore_unicode_prefix(_unary_op("desc_nulls_first", _desc_nulls_first_doc)) - desc_nulls_last = ignore_unicode_prefix(_unary_op("desc_nulls_last", _desc_nulls_last_doc)) + asc = _unary_op("asc", _asc_doc) + asc_nulls_first = _unary_op("asc_nulls_first", _asc_nulls_first_doc) + asc_nulls_last = _unary_op("asc_nulls_last", _asc_nulls_last_doc) + desc = _unary_op("desc", _desc_doc) + desc_nulls_first = _unary_op("desc_nulls_first", _desc_nulls_first_doc) + desc_nulls_last = _unary_op("desc_nulls_last", _desc_nulls_last_doc) _isNull_doc = """ True if the current expression is null. >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) >>> df.filter(df.height.isNull()).collect() - [Row(height=None, name=u'Alice')] + [Row(name='Alice', height=None)] """ _isNotNull_doc = """ True if the current expression is NOT null. >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) >>> df.filter(df.height.isNotNull()).collect() - [Row(height=80, name=u'Tom')] + [Row(name='Tom', height=80)] """ - isNull = ignore_unicode_prefix(_unary_op("isNull", _isNull_doc)) - isNotNull = ignore_unicode_prefix(_unary_op("isNotNull", _isNotNull_doc)) + isNull = _unary_op("isNull", _isNull_doc) + isNotNull = _unary_op("isNotNull", _isNotNull_doc) @since(1.3) def alias(self, *alias, **kwargs): @@ -581,17 +572,16 @@ class Column(object): name = copy_func(alias, sinceversion=2.0, doc=":func:`name` is an alias for :func:`alias`.") - @ignore_unicode_prefix @since(1.3) def cast(self, dataType): """ Convert the column into type ``dataType``. 
>>> df.select(df.age.cast("string").alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] + [Row(ages='2'), Row(ages='5')] >>> df.select(df.age.cast(StringType()).alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] + [Row(ages='2'), Row(ages='5')] """ - if isinstance(dataType, basestring): + if isinstance(dataType, str): jc = self._jc.cast(dataType) elif isinstance(dataType, DataType): from pyspark.sql import SparkSession diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py index 71ea163171..eab084a1fa 100644 --- a/python/pyspark/sql/conf.py +++ b/python/pyspark/sql/conf.py @@ -18,10 +18,6 @@ import sys from pyspark import since, _NoValue -from pyspark.rdd import ignore_unicode_prefix - -if sys.version_info[0] >= 3: - basestring = str class RuntimeConfig(object): @@ -34,13 +30,11 @@ class RuntimeConfig(object): """Create a new RuntimeConfig that wraps the underlying JVM object.""" self._jconf = jconf - @ignore_unicode_prefix @since(2.0) def set(self, key, value): """Sets the given Spark runtime configuration property.""" self._jconf.set(key, value) - @ignore_unicode_prefix @since(2.0) def get(self, key, default=_NoValue): """Returns the value of Spark runtime configuration property for the given key, @@ -54,7 +48,6 @@ class RuntimeConfig(object): self._checkType(default, "default") return self._jconf.get(key, default) - @ignore_unicode_prefix @since(2.0) def unset(self, key): """Resets the configuration property for the given key.""" @@ -62,11 +55,10 @@ class RuntimeConfig(object): def _checkType(self, obj, identifier): """Assert that an object is of type str.""" - if not isinstance(obj, basestring): + if not isinstance(obj, str): raise TypeError("expected %s '%s' to be a string (was '%s')" % (identifier, obj, type(obj).__name__)) - @ignore_unicode_prefix @since(2.4) def isModifiable(self, key): """Indicates whether the configuration property with the given key diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 956343a231..7fbcf85cb1 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -15,15 +15,10 @@ # limitations under the License. # -from __future__ import print_function import sys import warnings -if sys.version >= '3': - basestring = unicode = str - from pyspark import since, _NoValue -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.session import _monkey_patch_RDD, SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader @@ -52,7 +47,6 @@ class SQLContext(object): _instantiatedContext = None - @ignore_unicode_prefix def __init__(self, sparkContext, sparkSession=None, jsqlContext=None): """Creates a new SQLContext. @@ -70,7 +64,7 @@ class SQLContext(object): [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \ dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)] >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] + [(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] """ warnings.warn( "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.", @@ -142,7 +136,6 @@ class SQLContext(object): """ self.sparkSession.conf.set(key, value) - @ignore_unicode_prefix @since(1.3) def getConf(self, key, defaultValue=_NoValue): """Returns the value of Spark SQL configuration property for the given key. 
@@ -152,12 +145,12 @@ class SQLContext(object): the system default value. >>> sqlContext.getConf("spark.sql.shuffle.partitions") - u'200' - >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10") - u'10' - >>> sqlContext.setConf("spark.sql.shuffle.partitions", u"50") - >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10") - u'50' + '200' + >>> sqlContext.getConf("spark.sql.shuffle.partitions", "10") + '10' + >>> sqlContext.setConf("spark.sql.shuffle.partitions", "50") + >>> sqlContext.getConf("spark.sql.shuffle.partitions", "10") + '50' """ return self.sparkSession.conf.get(key, defaultValue) @@ -229,7 +222,6 @@ class SQLContext(object): return self.sparkSession._inferSchema(rdd, samplingRatio) @since(1.3) - @ignore_unicode_prefix def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. @@ -274,27 +266,27 @@ class SQLContext(object): >>> l = [('Alice', 1)] >>> sqlContext.createDataFrame(l).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> sqlContext.createDataFrame(l, ['name', 'age']).collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> sqlContext.createDataFrame(d).collect() - [Row(age=1, name=u'Alice')] + [Row(age=1, name='Alice')] >>> rdd = sc.parallelize(l) >>> sqlContext.createDataFrame(rdd).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> df = sqlContext.createDataFrame(rdd, ['name', 'age']) >>> df.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = sqlContext.createDataFrame(person) >>> df2.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ @@ -302,15 +294,15 @@ class SQLContext(object): ... StructField("age", IntegerType(), True)]) >>> df3 = sqlContext.createDataFrame(rdd, schema) >>> df3.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> sqlContext.createDataFrame(df.toPandas()).collect() # doctest: +SKIP - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> sqlContext.createDataFrame(rdd, "a: string, b: int").collect() - [Row(a=u'Alice', b=1)] + [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> sqlContext.createDataFrame(rdd, "int").collect() [Row(value=1)] @@ -358,7 +350,6 @@ class SQLContext(object): return self.sparkSession.catalog.createExternalTable( tableName, path, source, schema, **options) - @ignore_unicode_prefix @since(1.0) def sql(self, sqlQuery): """Returns a :class:`DataFrame` representing the result of the given query. @@ -368,7 +359,7 @@ class SQLContext(object): >>> sqlContext.registerDataFrameAsTable(df, "table1") >>> df2 = sqlContext.sql("SELECT field1 AS f1, field2 as f2 from table1") >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] + [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')] """ return self.sparkSession.sql(sqlQuery) @@ -385,7 +376,6 @@ class SQLContext(object): """ return self.sparkSession.table(tableName) - @ignore_unicode_prefix @since(1.3) def tables(self, dbName=None): """Returns a :class:`DataFrame` containing names of tables in the given database. 
@@ -401,7 +391,7 @@ class SQLContext(object): >>> sqlContext.registerDataFrameAsTable(df, "table1") >>> df2 = sqlContext.tables() >>> df2.filter("tableName = 'table1'").first() - Row(database=u'', tableName=u'table1', isTemporary=True) + Row(database='', tableName='table1', isTemporary=True) """ if dbName is None: return DataFrame(self._ssql_ctx.tables(), self) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 3ad899bcc3..023fbeabcb 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -17,21 +17,12 @@ import sys import random - -if sys.version >= '3': - basestring = unicode = str - long = int - from functools import reduce - from html import escape as html_escape -else: - from itertools import imap as map - from cgi import escape as html_escape - import warnings +from functools import reduce +from html import escape as html_escape from pyspark import copy_func, since, _NoValue -from pyspark.rdd import RDD, _load_from_socket, _local_iterator_from_socket, \ - ignore_unicode_prefix +from pyspark.rdd import RDD, _load_from_socket, _local_iterator_from_socket from pyspark.serializers import BatchedSerializer, PickleSerializer, \ UTF8Deserializer from pyspark.storagelevel import StorageLevel @@ -109,7 +100,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """ return DataFrameStatFunctions(self) - @ignore_unicode_prefix @since(1.3) def toJSON(self, use_unicode=True): """Converts a :class:`DataFrame` into a :class:`RDD` of string. @@ -117,7 +107,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Each row is turned into a JSON document as one element in the returned RDD. >>> df.toJSON().first() - u'{"age":2,"name":"Alice"}' + '{"age":2,"name":"Alice"}' """ rdd = self._jdf.toJSON() return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) @@ -330,11 +320,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): # For the case when extended is mode: # df.explain("formatted") - is_extended_as_mode = isinstance(extended, basestring) and mode is None + is_extended_as_mode = isinstance(extended, str) and mode is None # For the mode specified: # df.explain(mode="formatted") - is_mode_case = extended is None and isinstance(mode, basestring) + is_mode_case = extended is None and isinstance(mode, str) if not (is_no_argument or is_extended_case or is_extended_as_mode or is_mode_case): argtypes = [ @@ -568,7 +558,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if not isinstance(name, str): raise TypeError("name should be provided as str, got {0}".format(type(name))) - allowed_types = (basestring, list, float, int) + allowed_types = (str, list, float, int) for p in parameters: if not isinstance(p, allowed_types): raise TypeError( @@ -587,19 +577,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """ return int(self._jdf.count()) - @ignore_unicode_prefix @since(1.3) def collect(self): """Returns all the records as a list of :class:`Row`. >>> df.collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """ with SCCallSiteSync(self._sc) as css: sock_info = self._jdf.collectToPython() return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) - @ignore_unicode_prefix @since(2.0) def toLocalIterator(self, prefetchPartitions=False): """ @@ -612,36 +600,33 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): before it is needed. 
>>> list(df.toLocalIterator()) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """ with SCCallSiteSync(self._sc) as css: sock_info = self._jdf.toPythonIterator(prefetchPartitions) return _local_iterator_from_socket(sock_info, BatchedSerializer(PickleSerializer())) - @ignore_unicode_prefix @since(1.3) def limit(self, num): """Limits the result count to the number specified. >>> df.limit(1).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.limit(0).collect() [] """ jdf = self._jdf.limit(num) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def take(self, num): """Returns the first ``num`` rows as a :class:`list` of :class:`Row`. >>> df.take(2) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """ return self.limit(num).collect() - @ignore_unicode_prefix @since(3.0) def tail(self, num): """ @@ -651,7 +636,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): a very large ``num`` can crash the driver process with OutOfMemoryError. >>> df.tail(1) - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] """ with SCCallSiteSync(self._sc): sock_info = self._jdf.tailToPython(num) @@ -818,7 +803,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): else: return DataFrame( self._jdf.repartition(numPartitions, self._jcols(*cols)), self.sql_ctx) - elif isinstance(numPartitions, (basestring, Column)): + elif isinstance(numPartitions, (str, Column)): cols = (numPartitions, ) + cols return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sql_ctx) else: @@ -869,7 +854,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): else: return DataFrame( self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), self.sql_ctx) - elif isinstance(numPartitions, (basestring, Column)): + elif isinstance(numPartitions, (str, Column)): cols = (numPartitions,) + cols return DataFrame(self._jdf.repartitionByRange(self._jcols(*cols)), self.sql_ctx) else: @@ -944,7 +929,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): fraction = withReplacement withReplacement = None - seed = long(seed) if seed is not None else None + seed = int(seed) if seed is not None else None args = [arg for arg in [withReplacement, fraction, seed] if arg is not None] jdf = self._jdf.sample(*args) return DataFrame(jdf, self.sql_ctx) @@ -978,15 +963,15 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): .. versionchanged:: 3.0 Added sampling by a column of :class:`Column` """ - if isinstance(col, basestring): + if isinstance(col, str): col = Column(col) elif not isinstance(col, Column): raise ValueError("col must be a string or a column, but got %r" % type(col)) if not isinstance(fractions, dict): raise ValueError("fractions must be a dict but got %r" % type(fractions)) for k, v in fractions.items(): - if not isinstance(k, (float, int, long, basestring)): - raise ValueError("key must be float, int, long, or string, but got %r" % type(k)) + if not isinstance(k, (float, int, str)): + raise ValueError("key must be float, int, or string, but got %r" % type(k)) fractions[k] = float(v) col = col._jc seed = seed if seed is not None else random.randint(0, sys.maxsize) @@ -1011,7 +996,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if w < 0.0: raise ValueError("Weights must be positive. 
Found weight value: %s" % w) seed = seed if seed is not None else random.randint(0, sys.maxsize) - rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), long(seed)) + rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), int(seed)) return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array] @property @@ -1052,12 +1037,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 3| +----+ """ - if not isinstance(colName, basestring): + if not isinstance(colName, str): raise ValueError("colName should be provided as string") jc = self._jdf.colRegex(colName) return Column(jc) - @ignore_unicode_prefix @since(1.3) def alias(self, alias): """Returns a new :class:`DataFrame` with an alias set. @@ -1070,12 +1054,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner') >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age") \ .sort(desc("df_as1.name")).collect() - [Row(name=u'Bob', name=u'Bob', age=5), Row(name=u'Alice', name=u'Alice', age=2)] + [Row(name='Bob', name='Bob', age=5), Row(name='Alice', name='Alice', age=2)] """ - assert isinstance(alias, basestring), "alias should be a string" + assert isinstance(alias, str), "alias should be a string" return DataFrame(getattr(self._jdf, "as")(alias), self.sql_ctx) - @ignore_unicode_prefix @since(2.1) def crossJoin(self, other): """Returns the cartesian product with another :class:`DataFrame`. @@ -1083,18 +1066,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param other: Right side of the cartesian product. >>> df.select("age", "name").collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df2.select("name", "height").collect() - [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85)] + [Row(name='Tom', height=80), Row(name='Bob', height=85)] >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect() - [Row(age=2, name=u'Alice', height=80), Row(age=2, name=u'Alice', height=85), - Row(age=5, name=u'Bob', height=80), Row(age=5, name=u'Bob', height=85)] + [Row(age=2, name='Alice', height=80), Row(age=2, name='Alice', height=85), + Row(age=5, name='Bob', height=80), Row(age=5, name='Bob', height=85)] """ jdf = self._jdf.crossJoin(other._jdf) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def join(self, other, on=None, how=None): """Joins with another :class:`DataFrame`, using the given join expression. 
@@ -1113,27 +1095,27 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> from pyspark.sql.functions import desc >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height) \ .sort(desc("name")).collect() - [Row(name=u'Bob', height=85), Row(name=u'Alice', height=None), Row(name=None, height=80)] + [Row(name='Bob', height=85), Row(name='Alice', height=None), Row(name=None, height=80)] >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).collect() - [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] + [Row(name='Tom', height=80), Row(name='Bob', height=85), Row(name='Alice', height=None)] >>> cond = [df.name == df3.name, df.age == df3.age] >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df.join(df2, 'name').select(df.name, df2.height).collect() - [Row(name=u'Bob', height=85)] + [Row(name='Bob', height=85)] >>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect() - [Row(name=u'Bob', age=5)] + [Row(name='Bob', age=5)] """ if on is not None and not isinstance(on, list): on = [on] if on is not None: - if isinstance(on[0], basestring): + if isinstance(on[0], str): on = self._jseq(on) else: assert isinstance(on[0], Column), "on should be Column or list of Column" @@ -1147,7 +1129,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): how = "inner" if on is None: on = self._jseq([]) - assert isinstance(how, basestring), "how should be basestring" + assert isinstance(how, str), "how should be a string" jdf = self._jdf.join(other._jdf, on, how) return DataFrame(jdf, self.sql_ctx) @@ -1171,7 +1153,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def sort(self, *cols, **kwargs): """Returns a new :class:`DataFrame` sorted by the specified column(s). @@ -1182,18 +1163,18 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): If a list is specified, length of the list must equal length of the `cols`. >>> df.sort(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.sort("age", ascending=False).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.orderBy(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> from pyspark.sql.functions import * >>> df.sort(asc("age")).collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df.orderBy(desc("age"), "name").collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.orderBy(["age", "name"], ascending=[0, 1]).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] """ jdf = self._jdf.sort(self._sort_cols(cols, kwargs)) return DataFrame(jdf, self.sql_ctx) @@ -1333,7 +1314,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.summary(self._jseq(statistics)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def head(self, n=None): """Returns the first ``n`` rows. 
@@ -1346,26 +1326,24 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): If n is 1, return a single Row. >>> df.head() - Row(age=2, name=u'Alice') + Row(age=2, name='Alice') >>> df.head(1) - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ if n is None: rs = self.head(1) return rs[0] if rs else None return self.take(n) - @ignore_unicode_prefix @since(1.3) def first(self): """Returns the first row as a :class:`Row`. >>> df.first() - Row(age=2, name=u'Alice') + Row(age=2, name='Alice') """ return self.head() - @ignore_unicode_prefix @since(1.3) def __getitem__(self, item): """Returns the column as a :class:`Column`. @@ -1373,13 +1351,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> df.select(df['age']).collect() [Row(age=2), Row(age=5)] >>> df[ ["name", "age"]].collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df[ df.age > 3 ].collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df[df[0] > 3].collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] """ - if isinstance(item, basestring): + if isinstance(item, str): jc = self._jdf.apply(item) return Column(jc) elif isinstance(item, Column): @@ -1405,7 +1383,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jc = self._jdf.apply(name) return Column(jc) - @ignore_unicode_prefix @since(1.3) def select(self, *cols): """Projects a set of expressions and returns a new :class:`DataFrame`. @@ -1415,11 +1392,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): in the current :class:`DataFrame`. >>> df.select('*').collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df.select('name', 'age').collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df.select(df.name, (df.age + 10).alias('age')).collect() - [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)] + [Row(name='Alice', age=12), Row(name='Bob', age=15)] """ jdf = self._jdf.select(self._jcols(*cols)) return DataFrame(jdf, self.sql_ctx) @@ -1438,7 +1415,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.selectExpr(self._jseq(expr)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def filter(self, condition): """Filters rows using the given condition. @@ -1449,16 +1425,16 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): or a string of SQL expression. 
>>> df.filter(df.age > 3).collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df.where(df.age == 2).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.filter("age > 3").collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df.where("age = 2").collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ - if isinstance(condition, basestring): + if isinstance(condition, str): jdf = self._jdf.filter(condition) elif isinstance(condition, Column): jdf = self._jdf.filter(condition._jc) @@ -1466,7 +1442,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise TypeError("condition should be string or Column") return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def groupBy(self, *cols): """Groups the :class:`DataFrame` using the specified columns, @@ -1481,11 +1456,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> df.groupBy().avg().collect() [Row(avg(age)=3.5)] >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect()) - [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] + [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)] >>> sorted(df.groupBy(df.name).avg().collect()) - [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] + [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)] >>> sorted(df.groupBy(['name', df.age]).count().collect()) - [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)] + [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)] """ jgd = self._jdf.groupBy(self._jcols(*cols)) from pyspark.sql.group import GroupedData @@ -1655,19 +1630,19 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): ... Row(name='Alice', age=5, height=80), \\ ... Row(name='Alice', age=10, height=80)]).toDF() >>> df.dropDuplicates().show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 5| 80|Alice| - | 10| 80|Alice| - +---+------+-----+ + +-----+---+------+ + | name|age|height| + +-----+---+------+ + |Alice| 5| 80| + |Alice| 10| 80| + +-----+---+------+ >>> df.dropDuplicates(['name', 'height']).show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 5| 80|Alice| - +---+------+-----+ + +-----+---+------+ + | name|age|height| + +-----+---+------+ + |Alice| 5| 80| + +-----+---+------+ """ if subset is None: jdf = self._jdf.dropDuplicates() @@ -1700,7 +1675,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if subset is None: subset = self.columns - elif isinstance(subset, basestring): + elif isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): raise ValueError("subset should be a list or tuple of column names") @@ -1715,11 +1690,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """Replace null values, alias for ``na.fill()``. :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other. - :param value: int, long, float, string, bool or dict. + :param value: int, float, string, bool or dict. Value to replace null values with. If the value is a dict, then `subset` is ignored and `value` must be a mapping from column name (string) to replacement value. The replacement value must be - an int, long, float, boolean, or string. + an int, float, boolean, or string. :param subset: optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. 
For example, if `value` is a string, and subset contains a non-string column, @@ -1754,13 +1729,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 50| null|unknown| +---+------+-------+ """ - if not isinstance(value, (float, int, long, basestring, bool, dict)): - raise ValueError("value should be a float, int, long, string, bool or dict") + if not isinstance(value, (float, int, str, bool, dict)): + raise ValueError("value should be a float, int, string, bool or dict") # Note that bool validates isinstance(int), but we don't want to # convert bools to floats - if not isinstance(value, bool) and isinstance(value, (int, long)): + if not isinstance(value, bool) and isinstance(value, int): value = float(value) if isinstance(value, dict): @@ -1768,7 +1743,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): elif subset is None: return DataFrame(self._jdf.na().fill(value), self.sql_ctx) else: - if isinstance(subset, basestring): + if isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): raise ValueError("subset should be a list or tuple of column names") @@ -1787,12 +1762,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`) and arbitrary replacement will be used. - :param to_replace: bool, int, long, float, string, list or dict. + :param to_replace: bool, int, float, string, list or dict. Value to be replaced. If the value is a dict, then `value` is ignored or can be omitted, and `to_replace` must be a mapping between a value and a replacement. - :param value: bool, int, long, float, string, list or None. - The replacement value must be a bool, int, long, float, string or None. If `value` is a + :param value: bool, int, float, string, list or None. + The replacement value must be a bool, int, float, string or None. If `value` is a list, `value` should be of the same length and type as `to_replace`. If `value` is a scalar and `to_replace` is a sequence, then `value` is used as a replacement for each item in `to_replace`. @@ -1854,7 +1829,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> all_of(bool)([True, False]) True - >>> all_of(basestring)(["a", 1]) + >>> all_of(str)(["a", 1]) False """ def all_of_(xs): @@ -1862,20 +1837,20 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): return all_of_ all_of_bool = all_of(bool) - all_of_str = all_of(basestring) - all_of_numeric = all_of((float, int, long)) + all_of_str = all_of(str) + all_of_numeric = all_of((float, int)) # Validate input types - valid_types = (bool, float, int, long, basestring, list, tuple) + valid_types = (bool, float, int, str, list, tuple) if not isinstance(to_replace, valid_types + (dict, )): raise ValueError( - "to_replace should be a bool, float, int, long, string, list, tuple, or dict. " + "to_replace should be a bool, float, int, string, list, tuple, or dict. " "Got {0}".format(type(to_replace))) if not isinstance(value, valid_types) and value is not None \ and not isinstance(to_replace, dict): raise ValueError("If to_replace is not a dict, value should be " - "a bool, float, int, long, string, list, tuple or None. " + "a bool, float, int, string, list, tuple or None. " "Got {0}".format(type(value))) if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): @@ -1883,12 +1858,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise ValueError("to_replace and value lists should be of the same length. 
" "Got {0} and {1}".format(len(to_replace), len(value))) - if not (subset is None or isinstance(subset, (list, tuple, basestring))): + if not (subset is None or isinstance(subset, (list, tuple, str))): raise ValueError("subset should be a list or tuple of column names, " "column name or None. Got {0}".format(type(subset))) # Reshape input arguments if necessary - if isinstance(to_replace, (float, int, long, basestring)): + if isinstance(to_replace, (float, int, str)): to_replace = [to_replace] if isinstance(to_replace, dict): @@ -1896,11 +1871,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if value is not None: warnings.warn("to_replace is a dict and value is not None. value will be ignored.") else: - if isinstance(value, (float, int, long, basestring)) or value is None: + if isinstance(value, (float, int, str)) or value is None: value = [value for _ in range(len(to_replace))] rep_dict = dict(zip(to_replace, value)) - if isinstance(subset, basestring): + if isinstance(subset, str): subset = [subset] # Verify we were not passed in mixed type generics. @@ -1957,10 +1932,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Added support for multiple columns. """ - if not isinstance(col, (basestring, list, tuple)): + if not isinstance(col, (str, list, tuple)): raise ValueError("col should be a string, list or tuple, but got %r" % type(col)) - isStr = isinstance(col, basestring) + isStr = isinstance(col, str) if isinstance(col, tuple): col = list(col) @@ -1968,7 +1943,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): col = [col] for c in col: - if not isinstance(c, basestring): + if not isinstance(c, str): raise ValueError("columns should be strings, but got %r" % type(c)) col = _to_list(self._sc, col) @@ -1977,12 +1952,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(probabilities, tuple): probabilities = list(probabilities) for p in probabilities: - if not isinstance(p, (float, int, long)) or p < 0 or p > 1: - raise ValueError("probabilities should be numerical (float, int, long) in [0,1].") + if not isinstance(p, (float, int)) or p < 0 or p > 1: + raise ValueError("probabilities should be numerical (float, int) in [0,1].") probabilities = _to_list(self._sc, probabilities) - if not isinstance(relativeError, (float, int, long)) or relativeError < 0: - raise ValueError("relativeError should be numerical (float, int, long) >= 0.") + if not isinstance(relativeError, (float, int)) or relativeError < 0: + raise ValueError("relativeError should be numerical (float, int) >= 0.") relativeError = float(relativeError) jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) @@ -2000,9 +1975,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param col2: The name of the second column :param method: The correlation method. 
Currently only supports "pearson" """ - if not isinstance(col1, basestring): + if not isinstance(col1, str): raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): + if not isinstance(col2, str): raise ValueError("col2 should be a string.") if not method: method = "pearson" @@ -2020,9 +1995,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param col1: The name of the first column :param col2: The name of the second column """ - if not isinstance(col1, basestring): + if not isinstance(col1, str): raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): + if not isinstance(col2, str): raise ValueError("col2 should be a string.") return self._jdf.stat().cov(col1, col2) @@ -2042,9 +2017,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param col2: The name of the second column. Distinct items will make the column names of the :class:`DataFrame`. """ - if not isinstance(col1, basestring): + if not isinstance(col1, str): raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): + if not isinstance(col2, str): raise ValueError("col2 should be a string.") return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx) @@ -2073,7 +2048,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): support = 0.01 return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def withColumn(self, colName, col): """ @@ -2092,13 +2066,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): To avoid this, use :func:`select` with the multiple columns at once. >>> df.withColumn('age2', df.age + 2).collect() - [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] + [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)] """ assert isinstance(col, Column), "col should be Column" return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def withColumnRenamed(self, existing, new): """Returns a new :class:`DataFrame` by renaming an existing column. @@ -2108,12 +2081,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param new: string, new name of the column. >>> df.withColumnRenamed('age', 'age2').collect() - [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] + [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')] """ return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx) @since(1.4) - @ignore_unicode_prefix def drop(self, *cols): """Returns a new :class:`DataFrame` that drops the specified column. This is a no-op if schema doesn't contain the given column name(s). @@ -2122,23 +2094,23 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :class:`Column` to drop, or a list of string name of the columns to drop. 
>>> df.drop('age').collect() - [Row(name=u'Alice'), Row(name=u'Bob')] + [Row(name='Alice'), Row(name='Bob')] >>> df.drop(df.age).collect() - [Row(name=u'Alice'), Row(name=u'Bob')] + [Row(name='Alice'), Row(name='Bob')] >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect() - [Row(age=5, height=85, name=u'Bob')] + [Row(age=5, height=85, name='Bob')] >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect() - [Row(age=5, name=u'Bob', height=85)] + [Row(age=5, name='Bob', height=85)] >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect() - [Row(name=u'Bob')] + [Row(name='Bob')] """ if len(cols) == 1: col = cols[0] - if isinstance(col, basestring): + if isinstance(col, str): jdf = self._jdf.drop(col) elif isinstance(col, Column): jdf = self._jdf.drop(col._jc) @@ -2146,20 +2118,19 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise TypeError("col should be a string or a Column") else: for col in cols: - if not isinstance(col, basestring): + if not isinstance(col, str): raise TypeError("each col in the param list should be a string") jdf = self._jdf.drop(self._jseq(cols)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix def toDF(self, *cols): """Returns a new :class:`DataFrame` that with new specified column names :param cols: list of new column names (string) >>> df.toDF('f1', 'f2').collect() - [Row(f1=2, f2=u'Alice'), Row(f1=5, f2=u'Bob')] + [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')] """ jdf = self._jdf.toDF(self._jseq(cols)) return DataFrame(jdf, self.sql_ctx) @@ -2347,7 +2318,6 @@ def _test(): from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext, SparkSession import pyspark.sql.dataframe - from pyspark.sql.functions import from_unixtime globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc @@ -2356,16 +2326,16 @@ def _test(): globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) - globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() - globs['df3'] = sc.parallelize([Row(name='Alice', age=2), - Row(name='Bob', age=5)]).toDF() - globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80), - Row(name='Bob', age=5, height=None), - Row(name='Tom', age=None, height=None), - Row(name=None, age=None, height=None)]).toDF() - globs['df5'] = sc.parallelize([Row(name='Alice', spy=False, age=10), - Row(name='Bob', spy=None, age=5), - Row(name='Mallory', spy=True, age=None)]).toDF() + globs['df2'] = sc.parallelize([Row(height=80, name='Tom'), Row(height=85, name='Bob')]).toDF() + globs['df3'] = sc.parallelize([Row(age=2, name='Alice'), + Row(age=5, name='Bob')]).toDF() + globs['df4'] = sc.parallelize([Row(age=10, height=80, name='Alice'), + Row(age=5, height=None, name='Bob'), + Row(age=None, height=None, name='Tom'), + Row(age=None, height=None, name=None)]).toDF() + globs['df5'] = sc.parallelize([Row(age=10, name='Alice', spy=False), + Row(age=5, name='Bob', spy=None), + Row(age=None, name='Mallory', spy=True)]).toDF() globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846), Row(name='Bob', time=1479442946)]).toDF() diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index b5a7c18904..63b049999f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -22,14 +22,8 @@ import sys import functools import warnings -if sys.version < "3": - from itertools 
import imap as map - -if sys.version >= '3': - basestring = str - from pyspark import since, SparkContext -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType +from pyspark.rdd import PythonEvalType from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_from_literal, \ _create_column_from_name from pyspark.sql.dataframe import DataFrame @@ -88,14 +82,14 @@ def _create_binary_mathfunction(name, doc=""): # if they are not columns or strings. if isinstance(col1, Column): arg1 = col1._jc - elif isinstance(col1, basestring): + elif isinstance(col1, str): arg1 = _create_column_from_name(col1) else: arg1 = float(col1) if isinstance(col2, Column): arg2 = col2._jc - elif isinstance(col2, basestring): + elif isinstance(col2, str): arg2 = _create_column_from_name(col2) else: arg2 = float(col2) @@ -648,7 +642,6 @@ def percentile_approx(col, percentage, accuracy=10000): return Column(sc._jvm.functions.percentile_approx(_to_java_column(col), percentage, accuracy)) -@ignore_unicode_prefix @since(1.4) def rand(seed=None): """Generates a random column with independent and identically distributed (i.i.d.) samples @@ -657,8 +650,8 @@ def rand(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('rand', rand(seed=42) * 3).collect() - [Row(age=2, name=u'Alice', rand=2.4052597283576684), - Row(age=5, name=u'Bob', rand=2.3913904055683974)] + [Row(age=2, name='Alice', rand=2.4052597283576684), + Row(age=5, name='Bob', rand=2.3913904055683974)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -668,7 +661,6 @@ def rand(seed=None): return Column(jc) -@ignore_unicode_prefix @since(1.4) def randn(seed=None): """Generates a column with independent and identically distributed (i.i.d.) samples from @@ -677,8 +669,8 @@ def randn(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('randn', randn(seed=42)).collect() - [Row(age=2, name=u'Alice', randn=1.1027054481455365), - Row(age=5, name=u'Bob', randn=0.7400395449950132)] + [Row(age=2, name='Alice', randn=1.1027054481455365), + Row(age=5, name='Bob', randn=0.7400395449950132)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -774,7 +766,6 @@ def expr(str): return Column(sc._jvm.functions.expr(str)) -@ignore_unicode_prefix @since(1.4) def struct(*cols): """Creates a new struct column. @@ -782,9 +773,9 @@ def struct(*cols): :param cols: list of column names (string) or list of :class:`Column` expressions >>> df.select(struct('age', 'name').alias("struct")).collect() - [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))] + [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))] >>> df.select(struct([df.age, df.name]).alias("struct")).collect() - [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))] + [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))] """ sc = SparkContext._active_spark_context if len(cols) == 1 and isinstance(cols[0], (list, set)): @@ -879,14 +870,13 @@ def log2(col): @since(1.5) -@ignore_unicode_prefix def conv(col, fromBase, toBase): """ Convert a number in a string column from one base to another. 
>>> df = spark.createDataFrame([("010101",)], ['n']) >>> df.select(conv(df.n, 2, 16).alias('hex')).collect() - [Row(hex=u'15')] + [Row(hex='15')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase)) @@ -976,7 +966,6 @@ def current_timestamp(): return Column(sc._jvm.functions.current_timestamp()) -@ignore_unicode_prefix @since(1.5) def date_format(date, format): """ @@ -992,7 +981,7 @@ def date_format(date, format): >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect() - [Row(date=u'04/08/2015')] + [Row(date='04/08/2015')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.date_format(_to_java_column(date), format)) @@ -1310,7 +1299,6 @@ def last_day(date): return Column(sc._jvm.functions.last_day(_to_java_column(date))) -@ignore_unicode_prefix @since(1.5) def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): """ @@ -1321,7 +1309,7 @@ def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time']) >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect() - [Row(ts=u'2015-04-08 00:00:00')] + [Row(ts='2015-04-08 00:00:00')] >>> spark.conf.unset("spark.sql.session.timeZone") """ sc = SparkContext._active_spark_context @@ -1447,7 +1435,6 @@ def timestamp_seconds(col): @since(2.0) -@ignore_unicode_prefix def window(timeColumn, windowDuration, slideDuration=None, startTime=None): """Bucketize rows into one or more time windows given a timestamp specifying column. Window starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window @@ -1471,7 +1458,7 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None): >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) >>> w.select(w.window.start.cast("string").alias("start"), ... w.window.end.cast("string").alias("end"), "sum").collect() - [Row(start=u'2016-03-11 09:00:05', end=u'2016-03-11 09:00:10', sum=1)] + [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)] """ def check_string_field(field, fieldName): if not field or type(field) is not str: @@ -1498,7 +1485,6 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None): # ---------------------------- misc functions ---------------------------------- @since(1.5) -@ignore_unicode_prefix def crc32(col): """ Calculates the cyclic redundancy check value (CRC32) of a binary column and @@ -1511,33 +1497,30 @@ def crc32(col): return Column(sc._jvm.functions.crc32(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def md5(col): """Calculates the MD5 digest and returns the value as a 32 character hex string. >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() - [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')] + [Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.md5(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def sha1(col): """Returns the hex string result of SHA-1. 
>>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() - [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] + [Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.sha1(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def sha2(col, numBits): """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, @@ -1546,9 +1529,9 @@ def sha2(col, numBits): >>> digests = df.select(sha2(df.name, 256).alias('s')).collect() >>> digests[0] - Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') + Row(s='3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') >>> digests[1] - Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') + Row(s='cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) @@ -1600,7 +1583,6 @@ del _name, _doc @since(1.5) -@ignore_unicode_prefix def concat_ws(sep, *cols): """ Concatenates multiple input string columns together into a single string column, @@ -1608,7 +1590,7 @@ def concat_ws(sep, *cols): >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect() - [Row(s=u'abcd-123')] + [Row(s='abcd-123')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column))) @@ -1634,7 +1616,6 @@ def encode(col, charset): return Column(sc._jvm.functions.encode(_to_java_column(col), charset)) -@ignore_unicode_prefix @since(1.5) def format_number(col, d): """ @@ -1645,13 +1626,12 @@ def format_number(col, d): :param d: the N decimal places >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() - [Row(v=u'5.0000')] + [Row(v='5.0000')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.format_number(_to_java_column(col), d)) -@ignore_unicode_prefix @since(1.5) def format_string(format, *cols): """ @@ -1663,7 +1643,7 @@ def format_string(format, *cols): >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b']) >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() - [Row(v=u'5 hello')] + [Row(v='5 hello')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column))) @@ -1721,7 +1701,6 @@ def overlay(src, replace, pos, len=-1): @since(1.5) -@ignore_unicode_prefix def substring(str, pos, len): """ Substring starts at `pos` and is of length `len` when str is String type or @@ -1732,14 +1711,13 @@ def substring(str, pos, len): >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() - [Row(s=u'ab')] + [Row(s='ab')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len)) @since(1.5) -@ignore_unicode_prefix def substring_index(str, delim, count): """ Returns the substring from string str before count occurrences of the delimiter delim. 
@@ -1749,15 +1727,14 @@ def substring_index(str, delim, count): >>> df = spark.createDataFrame([('a.b.c.d',)], ['s']) >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect() - [Row(s=u'a.b')] + [Row(s='a.b')] >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() - [Row(s=u'b.c.d')] + [Row(s='b.c.d')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count)) -@ignore_unicode_prefix @since(1.5) def levenshtein(left, right): """Computes the Levenshtein distance of the two given strings. @@ -1792,49 +1769,45 @@ def locate(substr, str, pos=1): @since(1.5) -@ignore_unicode_prefix def lpad(col, len, pad): """ Left-pad the string column to width `len` with `pad`. >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(lpad(df.s, 6, '#').alias('s')).collect() - [Row(s=u'##abcd')] + [Row(s='##abcd')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad)) @since(1.5) -@ignore_unicode_prefix def rpad(col, len, pad): """ Right-pad the string column to width `len` with `pad`. >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(rpad(df.s, 6, '#').alias('s')).collect() - [Row(s=u'abcd##')] + [Row(s='abcd##')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.rpad(_to_java_column(col), len, pad)) @since(1.5) -@ignore_unicode_prefix def repeat(col, n): """ Repeats a string column n times, and returns it as a new string column. >>> df = spark.createDataFrame([('ab',)], ['s',]) >>> df.select(repeat(df.s, 3).alias('s')).collect() - [Row(s=u'ababab')] + [Row(s='ababab')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.repeat(_to_java_column(col), n)) @since(1.5) -@ignore_unicode_prefix def split(str, pattern, limit=-1): """ Splits str around matches of the given pattern. @@ -1855,15 +1828,14 @@ def split(str, pattern, limit=-1): >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect() - [Row(s=[u'one', u'twoBthreeC'])] + [Row(s=['one', 'twoBthreeC'])] >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect() - [Row(s=[u'one', u'two', u'three', u''])] + [Row(s=['one', 'two', 'three', ''])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.split(_to_java_column(str), pattern, limit)) -@ignore_unicode_prefix @since(1.5) def regexp_extract(str, pattern, idx): r"""Extract a specific group matched by a Java regex, from the specified string column. @@ -1871,73 +1843,68 @@ def regexp_extract(str, pattern, idx): >>> df = spark.createDataFrame([('100-200',)], ['str']) >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect() - [Row(d=u'100')] + [Row(d='100')] >>> df = spark.createDataFrame([('foo',)], ['str']) >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect() - [Row(d=u'')] + [Row(d='')] >>> df = spark.createDataFrame([('aaaac',)], ['str']) >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() - [Row(d=u'')] + [Row(d='')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx) return Column(jc) -@ignore_unicode_prefix @since(1.5) def regexp_replace(str, pattern, replacement): r"""Replace all substrings of the specified string value that match regexp with rep. 
>>> df = spark.createDataFrame([('100-200',)], ['str']) >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect() - [Row(d=u'-----')] + [Row(d='-----')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement) return Column(jc) -@ignore_unicode_prefix @since(1.5) def initcap(col): """Translate the first letter of each word to upper case in the sentence. >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect() - [Row(v=u'Ab Cd')] + [Row(v='Ab Cd')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.initcap(_to_java_column(col))) @since(1.5) -@ignore_unicode_prefix def soundex(col): """ Returns the SoundEx encoding for a string >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name']) >>> df.select(soundex(df.name).alias("soundex")).collect() - [Row(soundex=u'P362'), Row(soundex=u'U612')] + [Row(soundex='P362'), Row(soundex='U612')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.soundex(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def bin(col): """Returns the string representation of the binary value of the given column. >>> df.select(bin(df.age).alias('c')).collect() - [Row(c=u'10'), Row(c=u'101')] + [Row(c='10'), Row(c='101')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.bin(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def hex(col): """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`, @@ -1945,14 +1912,13 @@ def hex(col): :class:`pyspark.sql.types.LongType`. >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() - [Row(hex(a)=u'414243', hex(b)=u'3')] + [Row(hex(a)='414243', hex(b)='3')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.hex(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def unhex(col): """Inverse of hex. Interprets each pair of characters as a hexadecimal number @@ -1965,7 +1931,6 @@ def unhex(col): return Column(sc._jvm.functions.unhex(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def length(col): """Computes the character length of string data or number of bytes of binary data. @@ -1979,7 +1944,6 @@ def length(col): return Column(sc._jvm.functions.length(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def translate(srcCol, matching, replace): """A function translate any character in the `srcCol` by a character in `matching`. @@ -1989,7 +1953,7 @@ def translate(srcCol, matching, replace): >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\ ... .alias('r')).collect() - [Row(r=u'1a2s3ae')] + [Row(r='1a2s3ae')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace)) @@ -1997,7 +1961,6 @@ def translate(srcCol, matching, replace): # ---------------------- Collection functions ------------------------------ -@ignore_unicode_prefix @since(2.0) def create_map(*cols): """Creates a new map column. @@ -2006,9 +1969,9 @@ def create_map(*cols): grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...). 
>>> df.select(create_map('name', 'age').alias("map")).collect() - [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] + [Row(map={'Alice': 2}), Row(map={'Bob': 5})] >>> df.select(create_map([df.name, df.age]).alias("map")).collect() - [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] + [Row(map={'Alice': 2}), Row(map={'Bob': 5})] """ sc = SparkContext._active_spark_context if len(cols) == 1 and isinstance(cols[0], (list, set)): @@ -2108,7 +2071,6 @@ def slice(x, start, length): return Column(sc._jvm.functions.slice(_to_java_column(x), start, length)) -@ignore_unicode_prefix @since(2.4) def array_join(col, delimiter, null_replacement=None): """ @@ -2117,9 +2079,9 @@ def array_join(col, delimiter, null_replacement=None): >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data']) >>> df.select(array_join(df.data, ",").alias("joined")).collect() - [Row(joined=u'a,b,c'), Row(joined=u'a')] + [Row(joined='a,b,c'), Row(joined='a')] >>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect() - [Row(joined=u'a,b,c'), Row(joined=u'a,NULL')] + [Row(joined='a,b,c'), Row(joined='a,NULL')] """ sc = SparkContext._active_spark_context if null_replacement is None: @@ -2130,7 +2092,6 @@ def array_join(col, delimiter, null_replacement=None): @since(1.5) -@ignore_unicode_prefix def concat(*cols): """ Concatenates multiple input columns together into a single column. @@ -2138,7 +2099,7 @@ def concat(*cols): >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) >>> df.select(concat(df.s, df.d).alias('s')).collect() - [Row(s=u'abcd123')] + [Row(s='abcd123')] >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect() @@ -2165,7 +2126,6 @@ def array_position(col, value): return Column(sc._jvm.functions.array_position(_to_java_column(col), value)) -@ignore_unicode_prefix @since(2.4) def element_at(col, extraction): """ @@ -2179,7 +2139,7 @@ def element_at(col, extraction): >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) >>> df.select(element_at(df.data, 1)).collect() - [Row(element_at(data, 1)=u'a'), Row(element_at(data, 1)=None)] + [Row(element_at(data, 1)='a'), Row(element_at(data, 1)=None)] >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({},)], ['data']) >>> df.select(element_at(df.data, lit("a"))).collect() @@ -2221,7 +2181,6 @@ def array_distinct(col): return Column(sc._jvm.functions.array_distinct(_to_java_column(col))) -@ignore_unicode_prefix @since(2.4) def array_intersect(col1, col2): """ @@ -2234,13 +2193,12 @@ def array_intersect(col1, col2): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) >>> df.select(array_intersect(df.c1, df.c2)).collect() - [Row(array_intersect(c1, c2)=[u'a', u'c'])] + [Row(array_intersect(c1, c2)=['a', 'c'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_intersect(_to_java_column(col1), _to_java_column(col2))) -@ignore_unicode_prefix @since(2.4) def array_union(col1, col2): """ @@ -2253,13 +2211,12 @@ def array_union(col1, col2): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) >>> df.select(array_union(df.c1, df.c2)).collect() - [Row(array_union(c1, c2)=[u'b', u'a', u'c', u'd', u'f'])] + [Row(array_union(c1, c2)=['b', 'a', 'c', 'd', 'f'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_union(_to_java_column(col1), 
_to_java_column(col2))) -@ignore_unicode_prefix @since(2.4) def array_except(col1, col2): """ @@ -2272,7 +2229,7 @@ def array_except(col1, col2): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) >>> df.select(array_except(df.c1, df.c2)).collect() - [Row(array_except(c1, c2)=[u'b'])] + [Row(array_except(c1, c2)=['b'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_except(_to_java_column(col1), _to_java_column(col2))) @@ -2397,7 +2354,6 @@ def posexplode_outer(col): return Column(jc) -@ignore_unicode_prefix @since(1.6) def get_json_object(col, path): """ @@ -2411,14 +2367,13 @@ def get_json_object(col, path): >>> df = spark.createDataFrame(data, ("key", "jstring")) >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\ ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect() - [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)] + [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.get_json_object(_to_java_column(col), path) return Column(jc) -@ignore_unicode_prefix @since(1.6) def json_tuple(col, *fields): """Creates a new row for a json column according to the given field names. @@ -2429,14 +2384,13 @@ def json_tuple(col, *fields): >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect() - [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)] + [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields)) return Column(jc) -@ignore_unicode_prefix @since(2.1) def from_json(col, schema, options={}): """ @@ -2460,7 +2414,7 @@ def from_json(col, schema, options={}): >>> df.select(from_json(df.value, "a INT").alias("json")).collect() [Row(json=Row(a=1))] >>> df.select(from_json(df.value, "MAP").alias("json")).collect() - [Row(json={u'a': 1})] + [Row(json={'a': 1})] >>> data = [(1, '''[{"a": 1}]''')] >>> schema = ArrayType(StructType([StructField("a", IntegerType())])) >>> df = spark.createDataFrame(data, ("key", "value")) @@ -2485,7 +2439,6 @@ def from_json(col, schema, options={}): return Column(jc) -@ignore_unicode_prefix @since(2.1) def to_json(col, options={}): """ @@ -2499,26 +2452,26 @@ def to_json(col, options={}): >>> from pyspark.sql import Row >>> from pyspark.sql.types import * - >>> data = [(1, Row(name='Alice', age=2))] + >>> data = [(1, Row(age=2, name='Alice'))] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'{"age":2,"name":"Alice"}')] - >>> data = [(1, [Row(name='Alice', age=2), Row(name='Bob', age=3)])] + [Row(json='{"age":2,"name":"Alice"}')] + >>> data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')] + [Row(json='[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')] >>> data = [(1, {"name": "Alice"})] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'{"name":"Alice"}')] + 
[Row(json='{"name":"Alice"}')] >>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')] + [Row(json='[{"name":"Alice"},{"name":"Bob"}]')] >>> data = [(1, ["Alice", "Bob"])] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'["Alice","Bob"]')] + [Row(json='["Alice","Bob"]')] """ sc = SparkContext._active_spark_context @@ -2526,7 +2479,6 @@ def to_json(col, options={}): return Column(jc) -@ignore_unicode_prefix @since(2.4) def schema_of_json(json, options={}): """ @@ -2540,12 +2492,12 @@ def schema_of_json(json, options={}): >>> df = spark.range(1) >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect() - [Row(json=u'struct')] + [Row(json='struct')] >>> schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'}) >>> df.select(schema.alias("json")).collect() - [Row(json=u'struct')] + [Row(json='struct')] """ - if isinstance(json, basestring): + if isinstance(json, str): col = _create_column_from_literal(json) elif isinstance(json, Column): col = _to_java_column(json) @@ -2557,7 +2509,6 @@ def schema_of_json(json, options={}): return Column(jc) -@ignore_unicode_prefix @since(3.0) def schema_of_csv(csv, options={}): """ @@ -2568,11 +2519,11 @@ def schema_of_csv(csv, options={}): >>> df = spark.range(1) >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect() - [Row(csv=u'struct<_c0:int,_c1:string>')] + [Row(csv='struct<_c0:int,_c1:string>')] >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect() - [Row(csv=u'struct<_c0:int,_c1:string>')] + [Row(csv='struct<_c0:int,_c1:string>')] """ - if isinstance(csv, basestring): + if isinstance(csv, str): col = _create_column_from_literal(csv) elif isinstance(csv, Column): col = _to_java_column(csv) @@ -2584,7 +2535,6 @@ def schema_of_csv(csv, options={}): return Column(jc) -@ignore_unicode_prefix @since(3.0) def to_csv(col, options={}): """ @@ -2595,10 +2545,10 @@ def to_csv(col, options={}): :param options: options to control converting. accepts the same options as the CSV datasource. >>> from pyspark.sql import Row - >>> data = [(1, Row(name='Alice', age=2))] + >>> data = [(1, Row(age=2, name='Alice'))] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_csv(df.value).alias("csv")).collect() - [Row(csv=u'2,Alice')] + [Row(csv='2,Alice')] """ sc = SparkContext._active_spark_context @@ -2705,7 +2655,6 @@ def shuffle(col): @since(1.5) -@ignore_unicode_prefix def reverse(col): """ Collection function: returns a reversed string or an array with reverse order of elements. 
@@ -2714,7 +2663,7 @@ def reverse(col): >>> df = spark.createDataFrame([('Spark SQL',)], ['data']) >>> df.select(reverse(df.data).alias('s')).collect() - [Row(s=u'LQS krapS')] + [Row(s='LQS krapS')] >>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data']) >>> df.select(reverse(df.data).alias('r')).collect() [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])] @@ -2820,7 +2769,6 @@ def map_from_entries(col): return Column(sc._jvm.functions.map_from_entries(_to_java_column(col))) -@ignore_unicode_prefix @since(2.4) def array_repeat(col, count): """ @@ -2828,7 +2776,7 @@ def array_repeat(col, count): >>> df = spark.createDataFrame([('ab',)], ['data']) >>> df.select(array_repeat(df.data, 3).alias('r')).collect() - [Row(r=[u'ab', u'ab', u'ab'])] + [Row(r=['ab', 'ab', 'ab'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_repeat( @@ -2898,7 +2846,6 @@ def sequence(start, stop, step=None): _to_java_column(start), _to_java_column(stop), _to_java_column(step))) -@ignore_unicode_prefix @since(3.0) def from_csv(col, schema, options={}): """ @@ -2920,11 +2867,11 @@ def from_csv(col, schema, options={}): >>> df = spark.createDataFrame(data, ("value",)) >>> options = {'ignoreLeadingWhiteSpace': True} >>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect() - [Row(csv=Row(s=u'abc'))] + [Row(csv=Row(s='abc'))] """ sc = SparkContext._active_spark_context - if isinstance(schema, basestring): + if isinstance(schema, str): schema = _create_column_from_literal(schema) elif isinstance(schema, Column): schema = _to_java_column(schema) @@ -2984,20 +2931,6 @@ def _get_lambda_parameters(f): return parameters -def _get_lambda_parameters_legacy(f): - # TODO (SPARK-29909) Remove once 2.7 support is dropped - import inspect - - spec = inspect.getargspec(f) - if not 1 <= len(spec.args) <= 3 or spec.varargs or spec.keywords: - raise ValueError( - "f should take between 1 and 3 arguments, but provided function takes {}".format( - spec - ) - ) - return spec.args - - def _create_lambda(f): """ Create `o.a.s.sql.expressions.LambdaFunction` corresponding @@ -3008,10 +2941,7 @@ def _create_lambda(f): - (Column, Column) -> Column: ... - (Column, Column, Column) -> Column: ... 
""" - if sys.version_info >= (3, 3): - parameters = _get_lambda_parameters(f) - else: - parameters = _get_lambda_parameters_legacy(f) + parameters = _get_lambda_parameters(f) sc = SparkContext._active_spark_context expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions @@ -3481,7 +3411,7 @@ def udf(f=None, returnType=StringType()): evalType=PythonEvalType.SQL_BATCHED_UDF) -blacklist = ['map', 'since', 'ignore_unicode_prefix'] +blacklist = ['map', 'since'] __all__ = [k for k, v in globals().items() if not k.startswith('_') and k[0].islower() and callable(v) and k not in blacklist] __all__ += ["PandasUDFType"] @@ -3500,7 +3430,7 @@ def _test(): sc = spark.sparkContext globs['sc'] = sc globs['spark'] = spark - globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)]) + globs['df'] = spark.createDataFrame([Row(age=2, name='Alice'), Row(age=5, name='Bob')]) (failure_count, test_count) = doctest.testmod( pyspark.sql.functions, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index ac826bc64a..83e2baa8f0 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -18,7 +18,6 @@ import sys from pyspark import since -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import Column, _to_seq from pyspark.sql.dataframe import DataFrame from pyspark.sql.pandas.group_ops import PandasGroupedOpsMixin @@ -60,7 +59,6 @@ class GroupedData(PandasGroupedOpsMixin): self._df = df self.sql_ctx = df.sql_ctx - @ignore_unicode_prefix @since(1.3) def agg(self, *exprs): """Compute aggregates and returns the result as a :class:`DataFrame`. @@ -91,18 +89,18 @@ class GroupedData(PandasGroupedOpsMixin): >>> gdf = df.groupBy(df.name) >>> sorted(gdf.agg({"*": "count"}).collect()) - [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)] + [Row(name='Alice', count(1)=1), Row(name='Bob', count(1)=1)] >>> from pyspark.sql import functions as F >>> sorted(gdf.agg(F.min(df.age)).collect()) - [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)] + [Row(name='Alice', min(age)=2), Row(name='Bob', min(age)=5)] >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG) # doctest: +SKIP ... def min_udf(v): ... 
return v.min() >>> sorted(gdf.agg(min_udf(df.age)).collect()) # doctest: +SKIP - [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)] + [Row(name='Alice', min_udf(age)=2), Row(name='Bob', min_udf(age)=5)] """ assert exprs, "exprs should not be empty" if len(exprs) == 1 and isinstance(exprs[0], dict): diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index e6d8e9f24a..3842bc2357 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -16,11 +16,6 @@ # import sys import warnings -if sys.version >= '3': - basestring = unicode = str - xrange = range -else: - from itertools import izip as zip from collections import Counter from pyspark import since @@ -29,7 +24,6 @@ from pyspark.sql.pandas.serializers import ArrowCollectSerializer from pyspark.sql.types import IntegralType from pyspark.sql.types import * from pyspark.traceback_utils import SCCallSiteSync -from pyspark.util import _exception_message class PandasConversionMixin(object): @@ -84,7 +78,7 @@ class PandasConversionMixin(object): "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " - "true." % _exception_message(e)) + "true." % str(e)) warnings.warn(msg) use_arrow = False else: @@ -93,7 +87,7 @@ class PandasConversionMixin(object): "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " - "false.\n %s" % _exception_message(e)) + "false.\n %s" % str(e)) warnings.warn(msg) raise @@ -130,7 +124,7 @@ class PandasConversionMixin(object): "reached the error below and can not continue. Note that " "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an " "effect on failures in the middle of " - "computation.\n %s" % _exception_message(e)) + "computation.\n %s" % str(e)) warnings.warn(msg) raise @@ -268,7 +262,7 @@ class SparkConversionMixin(object): # If no schema supplied by user then get the names of columns only if schema is None: - schema = [str(x) if not isinstance(x, basestring) else + schema = [str(x) if not isinstance(x, str) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns] @@ -276,8 +270,6 @@ class SparkConversionMixin(object): try: return self._create_from_pandas_with_arrow(data, schema, timezone) except Exception as e: - from pyspark.util import _exception_message - if self._wrapped._conf.arrowPySparkFallbackEnabled(): msg = ( "createDataFrame attempted Arrow optimization because " @@ -285,7 +277,7 @@ class SparkConversionMixin(object): "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " - "true." % _exception_message(e)) + "true." 
% str(e)) warnings.warn(msg) else: msg = ( @@ -293,7 +285,7 @@ class SparkConversionMixin(object): "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic " "fallback with 'spark.sql.execution.arrow.pyspark.fallback.enabled' " - "has been set to false.\n %s" % _exception_message(e)) + "has been set to false.\n %s" % str(e)) warnings.warn(msg) raise data = self._convert_from_pandas(data, schema, timezone) @@ -358,7 +350,7 @@ class SparkConversionMixin(object): col_names = cur_dtypes.names record_type_list = [] has_rec_fix = False - for i in xrange(len(cur_dtypes)): + for i in range(len(cur_dtypes)): curr_type = cur_dtypes[i] # If type is a datetime64 timestamp, convert to microseconds # NOTE: if dtype is datetime[ns] then np.record.tolist() will output values as longs, @@ -413,7 +405,7 @@ class SparkConversionMixin(object): # Slice the DataFrame to be batched step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up - pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step)) + pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step)) # Create list of Arrow (columns, type) for serializer dump_stream arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)] diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 094dc357b6..ba4dec82d4 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -18,6 +18,7 @@ import functools import sys import warnings +from inspect import getfullargspec from pyspark import since from pyspark.rdd import PythonEvalType @@ -25,7 +26,6 @@ from pyspark.sql.pandas.typehints import infer_eval_type from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version from pyspark.sql.types import DataType from pyspark.sql.udf import _create_udf -from pyspark.util import _get_argspec class PandasUDFType(object): @@ -371,30 +371,29 @@ def pandas_udf(f=None, returnType=None, functionType=None): def _create_pandas_udf(f, returnType, evalType): - argspec = _get_argspec(f) + argspec = getfullargspec(f) # pandas UDF by type hints. - if sys.version_info >= (3, 6): - from inspect import signature + from inspect import signature - if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF, - PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, - PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: - warnings.warn( - "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for " - "pandas UDF instead of specifying pandas UDF type which will be deprecated " - "in the future releases. See SPARK-28264 for more details.", UserWarning) - elif evalType in [PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, - PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]: - # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered - # at `apply` instead. - # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the - # evaluation type will always be set. 
- pass - elif len(argspec.annotations) > 0: - evalType = infer_eval_type(signature(f)) - assert evalType is not None + if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF, + PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, + PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: + warnings.warn( + "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for " + "pandas UDF instead of specifying pandas UDF type which will be deprecated " + "in the future releases. See SPARK-28264 for more details.", UserWarning) + elif evalType in [PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, + PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, + PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]: + # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered + # at `apply` instead. + # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the + # evaluation type will always be set. + pass + elif len(argspec.annotations) > 0: + evalType = infer_eval_type(signature(f)) + assert evalType is not None if evalType is None: # Set default is scalar UDF. diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index 42562e1fb9..4b91c6a0f8 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -19,13 +19,6 @@ Serializers for PyArrow and pandas conversions. See `pyspark.serializers` for more details. """ -import sys -if sys.version < '3': - from itertools import izip as zip -else: - basestring = unicode = str - xrange = range - from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer @@ -67,7 +60,7 @@ class ArrowCollectSerializer(Serializer): raise RuntimeError("An error occurred while calling " "ArrowCollectSerializer.load_stream: {}".format(error_msg)) batch_order = [] - for i in xrange(num): + for i in range(num): index = read_int(stream) batch_order.append(index) yield batch_order @@ -180,7 +173,7 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer): if len(s) == 0 and len(s.columns) == 0: arrs_names = [(pa.array([], type=field.type), field.name) for field in t] # Assign result columns by schema name if user labeled with strings - elif self._assign_cols_by_name and any(isinstance(name, basestring) + elif self._assign_cols_by_name and any(isinstance(name, str) for name in s.columns): arrs_names = [(create_array(s[field.name], field.type), field.name) for field in t] @@ -194,7 +187,7 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer): else: arrs.append(create_array(s, t)) - return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) + return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in range(len(arrs))]) def dump_stream(self, iterator, stream): """ diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 336345e383..a83aece2e4 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -15,15 +15,9 @@ # limitations under the License. 
# -import sys - -if sys.version >= '3': - basestring = unicode = str - from py4j.java_gateway import JavaClass from pyspark import RDD, since -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import _to_seq from pyspark.sql.types import * from pyspark.sql import utils @@ -94,7 +88,7 @@ class DataFrameReader(OptionUtils): if isinstance(schema, StructType): jschema = spark._jsparkSession.parseDataType(schema.json()) self._jreader = self._jreader.schema(jschema) - elif isinstance(schema, basestring): + elif isinstance(schema, str): self._jreader = self._jreader.schema(schema) else: raise TypeError("schema should be StructType or string") @@ -174,7 +168,7 @@ class DataFrameReader(OptionUtils): if schema is not None: self.schema(schema) self.options(**options) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.load(path)) elif path is not None: if type(path) != list: @@ -294,16 +288,16 @@ class DataFrameReader(OptionUtils): allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding, locale=locale, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): path = [path] if type(path) == list: return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path))) elif isinstance(path, RDD): def func(iterator): for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): + if not isinstance(x, str): + x = str(x) + if isinstance(x, str): x = x.encode("utf-8") yield x keyed = path.mapPartitions(func) @@ -352,7 +346,6 @@ class DataFrameReader(OptionUtils): recursiveFileLookup=recursiveFileLookup) return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths))) - @ignore_unicode_prefix @since(1.6) def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None): @@ -376,15 +369,15 @@ class DataFrameReader(OptionUtils): >>> df = spark.read.text('python/test_support/sql/text-test.txt') >>> df.collect() - [Row(value=u'hello'), Row(value=u'this')] + [Row(value='hello'), Row(value='this')] >>> df = spark.read.text('python/test_support/sql/text-test.txt', wholetext=True) >>> df.collect() - [Row(value=u'hello\\nthis')] + [Row(value='hello\\nthis')] """ self._set_opts( wholetext=wholetext, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths))) @@ -529,16 +522,16 @@ class DataFrameReader(OptionUtils): charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): path = [path] if type(path) == list: return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) elif isinstance(path, RDD): def func(iterator): for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): + if not isinstance(x, str): + x = str(x) + if isinstance(x, str): x = x.encode("utf-8") yield x keyed = path.mapPartitions(func) @@ -574,7 +567,7 @@ class DataFrameReader(OptionUtils): """ self._set_opts(mergeSchema=mergeSchema, 
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): path = [path] return self._df(self._jreader.orc(_to_seq(self._spark._sc, path))) @@ -763,7 +756,7 @@ class DataFrameWriter(OptionUtils): col, cols = col[0], col[1:] - if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)): + if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)): raise TypeError("all names should be `str`") self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols)) @@ -788,7 +781,7 @@ class DataFrameWriter(OptionUtils): col, cols = col[0], col[1:] - if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)): + if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)): raise TypeError("all names should be `str`") self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols)) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 61891c478d..a5d102712d 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -15,22 +15,13 @@ # limitations under the License. # -# To disallow implicit relative import. Remove this once we drop Python 2. -from __future__ import absolute_import -from __future__ import print_function import sys import warnings from functools import reduce from threading import RLock -if sys.version >= '3': - basestring = unicode = str - xrange = range -else: - from itertools import imap as map - from pyspark import since -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.sql.conf import RuntimeConfig from pyspark.sql.dataframe import DataFrame from pyspark.sql.pandas.conversion import SparkConversionMixin @@ -56,7 +47,7 @@ def _monkey_patch_RDD(sparkSession): :return: a DataFrame >>> rdd.toDF().collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] """ return sparkSession.createDataFrame(self, schema, sampleRatio) @@ -197,7 +188,6 @@ class SparkSession(SparkConversionMixin): _instantiatedSession = None _activeSession = None - @ignore_unicode_prefix def __init__(self, sparkContext, jsparkSession=None): """Creates a new SparkSession. @@ -213,7 +203,7 @@ class SparkSession(SparkConversionMixin): [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \ dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)] >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] + [(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] """ from pyspark.sql.context import SQLContext self._sc = sparkContext @@ -492,7 +482,6 @@ class SparkSession(SparkConversionMixin): return SparkSession.builder.getOrCreate() @since(2.0) - @ignore_unicode_prefix def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. @@ -530,34 +519,29 @@ class SparkSession(SparkConversionMixin): .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. - .. note:: When Arrow optimization is enabled, strings inside Pandas DataFrame in Python - 2 are converted into bytes as they are bytes in Python 2 whereas regular strings are - left as strings. 
When using strings in Python 2, use unicode `u""` as Python standard - practice. - >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() - [Row(age=1, name=u'Alice')] + [Row(age=1, name='Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ @@ -565,15 +549,15 @@ class SparkSession(SparkConversionMixin): ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() - [Row(a=u'Alice', b=1)] + [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] @@ -587,7 +571,7 @@ class SparkSession(SparkConversionMixin): if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") - if isinstance(schema, basestring): + if isinstance(schema, str): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names @@ -634,7 +618,6 @@ class SparkSession(SparkConversionMixin): df._schema = schema return df - @ignore_unicode_prefix @since(2.0) def sql(self, sqlQuery): """Returns a :class:`DataFrame` representing the result of the given query. 
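The readwriter, session, streaming and conversion hunks all repeat one mechanical migration: the per-module guard of the form "if sys.version >= '3': basestring = unicode = str; xrange = range" is deleted, and the surviving isinstance(..., basestring), unicode(...) and xrange(...) call sites fall back to the builtins str and range. A hedged sketch of the resulting shape, using a hypothetical helper name rather than Spark's actual methods:

    def _to_str_list(paths):
        # Illustrative only: accept a single path or an iterable of paths and
        # normalize to a list of str, mirroring the post-Python-2 form of the
        # isinstance(path, basestring) checks seen throughout this diff.
        if isinstance(paths, str):          # was: isinstance(paths, basestring)
            return [paths]
        return [str(p) for p in paths]      # was: unicode(p)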
@@ -644,7 +627,7 @@ class SparkSession(SparkConversionMixin): >>> df.createOrReplaceTempView("table1") >>> df2 = spark.sql("SELECT field1 AS f1, field2 as f2 from table1") >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] + [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')] """ return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped) diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 2450a4c93c..5c528c1d54 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -18,13 +18,9 @@ import sys import json -if sys.version >= '3': - basestring = str - from py4j.java_gateway import java_import from pyspark import since, keyword_only -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import _to_seq from pyspark.sql.readwriter import OptionUtils, to_str from pyspark.sql.types import * @@ -204,7 +200,6 @@ class StreamingQueryManager(object): self._jsqm = jsqm @property - @ignore_unicode_prefix @since(2.0) def active(self): """Returns a list of active queries associated with this SQLContext @@ -213,12 +208,11 @@ class StreamingQueryManager(object): >>> sqm = spark.streams >>> # get the list of active streaming queries >>> [q.name for q in sqm.active] - [u'this_query'] + ['this_query'] >>> sq.stop() """ return [StreamingQuery(jsq) for jsq in self._jsqm.active()] - @ignore_unicode_prefix @since(2.0) def get(self, id): """Returns an active query from this SQLContext or throws exception if an active query @@ -226,7 +220,7 @@ class StreamingQueryManager(object): >>> sq = sdf.writeStream.format('memory').queryName('this_query').start() >>> sq.name - u'this_query' + 'this_query' >>> sq = spark.streams.get(sq.id) >>> sq.isActive True @@ -328,7 +322,7 @@ class DataStreamReader(OptionUtils): if isinstance(schema, StructType): jschema = spark._jsparkSession.parseDataType(schema.json()) self._jreader = self._jreader.schema(jschema) - elif isinstance(schema, basestring): + elif isinstance(schema, str): self._jreader = self._jreader.schema(schema) else: raise TypeError("schema should be StructType or string") @@ -527,7 +521,7 @@ class DataStreamReader(OptionUtils): allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, locale=locale, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.json(path)) else: raise TypeError("path can be only a single string") @@ -555,7 +549,7 @@ class DataStreamReader(OptionUtils): """ self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.orc(path)) else: raise TypeError("path can be only a single string") @@ -585,12 +579,11 @@ class DataStreamReader(OptionUtils): """ self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.parquet(path)) else: raise TypeError("path can be only a single string") - @ignore_unicode_prefix @since(2.0) def text(self, path, wholetext=False, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None): @@ -623,7 +616,7 @@ class DataStreamReader(OptionUtils): self._set_opts( wholetext=wholetext, lineSep=lineSep, 
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.text(path)) else: raise TypeError("path can be only a single string") @@ -762,7 +755,7 @@ class DataStreamReader(OptionUtils): charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.csv(path)) else: raise TypeError("path can be only a single string") @@ -1153,7 +1146,6 @@ class DataStreamWriter(object): ensure_callback_server_started(gw) return self - @ignore_unicode_prefix @since(2.0) def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None, **options): @@ -1186,14 +1178,14 @@ class DataStreamWriter(object): >>> sq.isActive True >>> sq.name - u'this_query' + 'this_query' >>> sq.stop() >>> sq.isActive False >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start( ... queryName='that_query', outputMode="append", format='memory') >>> sq.name - u'that_query' + 'that_query' >>> sq.isActive True >>> sq.stop() diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index a96354e3ec..90fc983aec 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -21,9 +21,6 @@ import threading import time import unittest import warnings -import sys -if sys.version >= '3': - basestring = unicode = str from pyspark import SparkContext, SparkConf from pyspark.sql import Row, SparkSession @@ -32,7 +29,6 @@ from pyspark.sql.types import * from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message from pyspark.testing.utils import QuietTest -from pyspark.util import _exception_message if have_pandas: import pandas as pd @@ -130,7 +126,7 @@ class ArrowTests(ReusedSQLTestCase): warn.message for warn in warns if isinstance(warn.message, UserWarning)] self.assertTrue(len(user_warns) > 0) self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) + "Attempting non-optimization" in str(user_warns[-1])) assert_frame_equal(pdf, pd.DataFrame({u'map': [{u'a': 1}]})) def test_toPandas_fallback_disabled(self): @@ -358,7 +354,7 @@ class ArrowTests(ReusedSQLTestCase): warn.message for warn in warns if isinstance(warn.message, UserWarning)] self.assertTrue(len(user_warns) > 0) self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) + "Attempting non-optimization" in str(user_warns[-1])) self.assertEqual(df.collect(), [Row(a={u'a': 1})]) def test_createDataFrame_fallback_disabled(self): @@ -438,12 +434,12 @@ class ArrowTests(ReusedSQLTestCase): assert_frame_equal(result_spark, result_arrow) # ensure original category elements are string - self.assertIsInstance(category_first_element, basestring) + self.assertIsInstance(category_first_element, str) # spark data frame and arrow execution mode enabled data frame type must match pandas self.assertEqual(spark_type, 'string') self.assertEqual(arrow_type, 'string') - self.assertIsInstance(arrow_first_category_element, basestring) - self.assertIsInstance(spark_first_category_element, basestring) + self.assertIsInstance(arrow_first_category_element, str) + self.assertIsInstance(spark_first_category_element, str) def 
test_createDataFrame_with_float_index(self): # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index 58bf896a10..e0b8bf45a2 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -16,8 +16,6 @@ # limitations under the License. # -import sys - from pyspark.sql import Column, Row from pyspark.sql.types import * from pyspark.sql.utils import AnalysisException @@ -109,12 +107,8 @@ class ColumnTests(ReusedSQLTestCase): self.assertRaises(TypeError, lambda: df[{}]) def test_column_name_with_non_ascii(self): - if sys.version >= '3': - columnName = "数量" - self.assertTrue(isinstance(columnName, str)) - else: - columnName = unicode("数量", "utf-8") - self.assertTrue(isinstance(columnName, unicode)) + columnName = "数量" + self.assertTrue(isinstance(columnName, str)) schema = StructType([StructField(columnName, LongType(), True)]) df = self.spark.createDataFrame([(1,)], schema) self.assertEqual(schema, df.schema) diff --git a/python/pyspark/sql/tests/test_context.py b/python/pyspark/sql/tests/test_context.py index 3b1b638ed4..ff953ba4b4 100644 --- a/python/pyspark/sql/tests/test_context.py +++ b/python/pyspark/sql/tests/test_context.py @@ -19,11 +19,7 @@ import shutil import sys import tempfile import unittest -try: - from importlib import reload # Python 3.4+ only. -except ImportError: - # Otherwise, we will stick to Python 2's built-in reload. - pass +from importlib import reload import py4j diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 52ae74df5d..7dcc19f3ba 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -167,10 +167,6 @@ class FunctionsTests(ReusedSQLTestCase): TypeError, "must be the same type", lambda: df.select(col('name').substr(0, lit(1)))) - if sys.version_info.major == 2: - self.assertRaises( - TypeError, - lambda: df.select(col('name').substr(long(0), long(1)))) for name in _string_functions.keys(): self.assertEqual( diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py index c1cb30c3ca..24a73918d8 100644 --- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py @@ -32,11 +32,6 @@ if have_pyarrow: import pyarrow as pa -# Tests below use pd.DataFrame.assign that will infer mixed types (unicode/str) for column names -# From kwargs w/ Python 2, so need to set check_column_type=False and avoid this check -_check_column_type = sys.version >= '3' - - @unittest.skipIf( not have_pandas or not have_pyarrow, pandas_requirement_message or pyarrow_requirement_message) @@ -109,7 +104,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): 'v2': [90, 100, 110] }) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_empty_group_by(self): left = self.data1 @@ -130,7 +125,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): .merge(left, right, on=['id', 'k']) \ .sort_values(by=['id', 'k']) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_mixed_scalar_udfs_followed_by_cogrouby_apply(self): df = self.spark.range(0, 10).toDF('v1') @@ -173,7 +168,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): expected = self.data1.toPandas() 
expected = expected.assign(key=expected.id % 2 == 0) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_wrong_return_type(self): # Test that we get a sensible exception invalid values passed to apply @@ -224,7 +219,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): expected = left.toPandas() if isLeft else right.toPandas() expected = expected.assign(key=expected.id) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) @staticmethod def _test_merge(left, right, output_schema='id long, k int, v int, v2 int'): @@ -246,7 +241,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): .merge(left, right, on=['id', 'k']) \ .sort_values(by=['id', 'k']) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index cc6167e619..00cc9b3a64 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -38,11 +38,6 @@ if have_pyarrow: import pyarrow as pa -# Tests below use pd.DataFrame.assign that will infer mixed types (unicode/str) for column names -# from kwargs w/ Python 2, so need to set check_column_type=False and avoid this check -_check_column_type = sys.version >= '3' - - @unittest.skipIf( not have_pandas or not have_pyarrow, pandas_requirement_message or pyarrow_requirement_message) @@ -139,9 +134,9 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result3 = df.groupby('id').apply(udf3).sort('id').toPandas() expected3 = expected1 - assert_frame_equal(expected1, result1, check_column_type=_check_column_type) - assert_frame_equal(expected2, result2, check_column_type=_check_column_type) - assert_frame_equal(expected3, result3, check_column_type=_check_column_type) + assert_frame_equal(expected1, result1) + assert_frame_equal(expected2, result2) + assert_frame_equal(expected3, result3) def test_array_type_correct(self): df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id") @@ -159,7 +154,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_register_grouped_map_udf(self): foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) @@ -181,7 +176,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(foo).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_coerce(self): df = self.data @@ -195,7 +190,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(foo).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) expected = expected.assign(v=expected.v.astype('float64')) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_complex_groupby(self): df = self.data @@ -213,7 +208,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): expected = 
pdf.groupby(pdf['id'] % 2 == 0, as_index=False).apply(normalize.func) expected = expected.sort_values(['id', 'v']).reset_index(drop=True) expected = expected.assign(norm=expected.norm.astype('float64')) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_empty_groupby(self): df = self.data @@ -231,7 +226,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): expected = normalize.func(pdf) expected = expected.sort_values(['id', 'v']).reset_index(drop=True) expected = expected.assign(norm=expected.norm.astype('float64')) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_datatype_string(self): df = self.data @@ -244,7 +239,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(foo_udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_wrong_return_type(self): with QuietTest(self.sc): @@ -301,7 +296,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): df = self.spark.createDataFrame(dt, 'timestamp').toDF('time') foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP) result = df.groupby('time').apply(foo_udf).sort('time') - assert_frame_equal(df.toPandas(), result.toPandas(), check_column_type=_check_column_type) + assert_frame_equal(df.toPandas(), result.toPandas()) def test_udf_with_key(self): import numpy as np @@ -355,26 +350,26 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): expected1 = pdf.groupby('id', as_index=False)\ .apply(lambda x: udf1.func((x.id.iloc[0],), x))\ .sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected1, result1, check_column_type=_check_column_type) + assert_frame_equal(expected1, result1) # Test groupby expression result2 = df.groupby(df.id % 2).apply(udf1).sort('id', 'v').toPandas() expected2 = pdf.groupby(pdf.id % 2, as_index=False)\ .apply(lambda x: udf1.func((x.id.iloc[0] % 2,), x))\ .sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected2, result2, check_column_type=_check_column_type) + assert_frame_equal(expected2, result2) # Test complex groupby result3 = df.groupby(df.id, df.v % 2).apply(udf2).sort('id', 'v').toPandas() expected3 = pdf.groupby([pdf.id, pdf.v % 2], as_index=False)\ .apply(lambda x: udf2.func((x.id.iloc[0], (x.v % 2).iloc[0],), x))\ .sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected3, result3, check_column_type=_check_column_type) + assert_frame_equal(expected3, result3) # Test empty groupby result4 = df.groupby().apply(udf3).sort('id', 'v').toPandas() expected4 = udf3.func((), pdf) - assert_frame_equal(expected4, result4, check_column_type=_check_column_type) + assert_frame_equal(expected4, result4) def test_column_order(self): @@ -407,7 +402,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): .select('id', 'u', 'v').toPandas() pd_result = grouped_pdf.apply(change_col_order) expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) # Function returns a pdf with positional columns, indexed by range def range_col_order(pdf): @@ -426,7 +421,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): pd_result = grouped_pdf.apply(range_col_order) 
rename_pdf(pd_result, ['id', 'u', 'v']) expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) # Function returns a pdf with columns indexed with integers def int_index(pdf): @@ -444,7 +439,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): pd_result = grouped_pdf.apply(int_index) rename_pdf(pd_result, ['id', 'u', 'v']) expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) @pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP) def column_name_typo(pdf): diff --git a/python/pyspark/sql/tests/test_pandas_map.py b/python/pyspark/sql/tests/test_pandas_map.py index f1956a2523..02ae6a86f9 100644 --- a/python/pyspark/sql/tests/test_pandas_map.py +++ b/python/pyspark/sql/tests/test_pandas_map.py @@ -19,9 +19,6 @@ import sys import time import unittest -if sys.version >= '3': - unicode = str - from pyspark.sql.functions import pandas_udf, PandasUDFType from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py index 2d38efd39f..75e2a0929e 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py @@ -22,10 +22,6 @@ import sys import tempfile import time import unittest - -if sys.version >= '3': - unicode = str - from datetime import date, datetime from decimal import Decimal @@ -319,7 +315,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): StructField('str', StringType())]) def scalar_func(id): - return pd.DataFrame({'id': id, 'str': id.apply(unicode)}) + return pd.DataFrame({'id': id, 'str': id.apply(str)}) def iter_func(it): for id in it: @@ -486,14 +482,14 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): @pandas_udf(return_type) def scalar_f(id): - return pd.DataFrame({'id': id, 'str': id.apply(unicode)}) + return pd.DataFrame({'id': id, 'str': id.apply(str)}) scalar_g = pandas_udf(lambda x: x, return_type) @pandas_udf(return_type, PandasUDFType.SCALAR_ITER) def iter_f(it): for id in it: - yield pd.DataFrame({'id': id, 'str': id.apply(unicode)}) + yield pd.DataFrame({'id': id, 'str': id.apply(str)}) iter_g = pandas_udf(lambda x: x, return_type, PandasUDFType.SCALAR_ITER) @@ -915,21 +911,12 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): # Check result of column 'B' must be equal to column 'A' in type and values pd.testing.assert_series_equal(result_spark["A"], result_spark["B"], check_names=False) - @unittest.skipIf(sys.version_info[:2] < (3, 5), "Type hints are supported from Python 3.5.") def test_type_annotation(self): # Regression test to check if type hints can be used. See SPARK-23569. - # Note that it throws an error during compilation in lower Python versions if 'exec' - # is not used. Also, note that we explicitly use another dictionary to avoid modifications - # in the current 'locals()'. - # - # Hyukjin: I think it's an ugly way to test issues about syntax specific in - # higher versions of Python, which we shouldn't encourage. This was the last resort - # I could come up with at that time. 
- _locals = {} - exec( - "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col", - _locals) - df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], returnType='bigint')('id')) + def noop(col: pd.Series) -> pd.Series: + return col + + df = self.spark.range(1).select(pandas_udf(f=noop, returnType='bigint')('id')) self.assertEqual(df.first()[0], 0) def test_mixed_udf(self): diff --git a/python/pyspark/sql/tests/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/test_pandas_udf_typehints.py index 2582080056..618164fa84 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_typehints.py +++ b/python/pyspark/sql/tests/test_pandas_udf_typehints.py @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import sys import unittest import inspect +from typing import Union, Iterator, Tuple from pyspark.sql.functions import mean, lit from pyspark.testing.sqlutils import ReusedSQLTestCase, \ @@ -24,209 +24,162 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase, \ pyarrow_requirement_message from pyspark.sql.pandas.typehints import infer_eval_type from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType +from pyspark.sql import Row if have_pandas: import pandas as pd + import numpy as np from pandas.util.testing import assert_frame_equal -python_requirement_message = "pandas UDF with type hints are supported with Python 3.6+." - @unittest.skipIf( - not have_pandas or not have_pyarrow or sys.version_info[:2] < (3, 6), - pandas_requirement_message or pyarrow_requirement_message or python_requirement_message) + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message) class PandasUDFTypeHintsTests(ReusedSQLTestCase): - # Note that, we should remove `exec` once we drop Python 2 in this class. 
- - def setUp(self): - self.local = {'pd': pd} - def test_type_annotation_scalar(self): - exec( - "def func(col: pd.Series) -> pd.Series: pass", - self.local) + def func(col: pd.Series) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: pass", - self.local) + def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: pass", - self.local) + def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:\n" - " pass", - self.local) + def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:\n" - " pass", - self.local) + def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "from typing import Union\n" - "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:\n" - " pass", - self.local) + def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) def test_type_annotation_scalar_iter(self): - exec( - "from typing import Iterator\n" - "def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: pass", - self.local) + def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) - exec( - "from typing import Iterator, Tuple\n" - "def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:\n" - " pass", - self.local) + def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) - exec( - "from typing import Iterator, Tuple\n" - "def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: pass", - self.local) + def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) - exec( - "from typing import Iterator, Tuple, Union\n" - "def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]])" - " -> 
Iterator[pd.Series]: pass", - self.local) + def func( + iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]] + ) -> Iterator[pd.Series]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) def test_type_annotation_group_agg(self): - exec( - "def func(col: pd.Series) -> str: pass", - self.local) - self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) - exec( - "def func(col: pd.DataFrame, col1: pd.Series) -> int: pass", - self.local) + def func(col: pd.Series) -> str: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "from pyspark.sql import Row\n" - "def func(col: pd.DataFrame, *args: pd.Series) -> Row: pass", - self.local) + def func(col: pd.DataFrame, col1: pd.Series) -> int: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:\n" - " pass", - self.local) + def func(col: pd.DataFrame, *args: pd.Series) -> Row: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "def func(col: pd.Series, *, col2: pd.DataFrame) -> float:\n" - " pass", - self.local) + def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "from typing import Union\n" - "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:\n" - " pass", - self.local) + def func(col: pd.Series, *, col2: pd.DataFrame) -> float: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) + + def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float: + pass + self.assertEqual( + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) def test_type_annotation_negative(self): - exec( - "def func(col: str) -> pd.Series: pass", - self.local) + + def func(col: str) -> pd.Series: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*str", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col: pd.DataFrame, col1: int) -> pd.DataFrame: pass", - self.local) + def func(col: pd.DataFrame, col1: int) -> pd.DataFrame: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*int", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "from typing import Union\n" - "def func(col: Union[pd.DataFrame, str], col1: int) -> pd.DataFrame: pass", - self.local) + def func(col: Union[pd.DataFrame, str], col1: int) -> pd.DataFrame: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*str", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "from typing import Tuple\n" - 
"def func(col: pd.Series) -> Tuple[pd.DataFrame]: pass", - self.local) + def func(col: pd.Series) -> Tuple[pd.DataFrame]: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*Tuple", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col, *args: pd.Series) -> pd.Series: pass", - self.local) + def func(col, *args: pd.Series) -> pd.Series: + pass self.assertRaisesRegex( ValueError, "should be specified.*Series", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame):\n" - " pass", - self.local) + def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame): + pass self.assertRaisesRegex( ValueError, "should be specified.*Series", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col: pd.Series, *, col2) -> pd.DataFrame:\n" - " pass", - self.local) + def func(col: pd.Series, *, col2) -> pd.DataFrame: + pass self.assertRaisesRegex( ValueError, "should be specified.*Series", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) def test_scalar_udf_type_hint(self): df = self.spark.range(10).selectExpr("id", "id as v") - exec( - "import typing\n" - "def plus_one(v: typing.Union[pd.Series, pd.DataFrame]) -> pd.Series:\n" - " return v + 1", - self.local) - - plus_one = pandas_udf("long")(self.local["plus_one"]) + def plus_one(v: Union[pd.Series, pd.DataFrame]) -> pd.Series: + return v + 1 + plus_one = pandas_udf("long")(plus_one) actual = df.select(plus_one(df.v).alias("plus_one")) expected = df.selectExpr("(v + 1) as plus_one") assert_frame_equal(expected.toPandas(), actual.toPandas()) @@ -234,14 +187,11 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_scalar_iter_udf_type_hint(self): df = self.spark.range(10).selectExpr("id", "id as v") - exec( - "import typing\n" - "def plus_one(itr: typing.Iterator[pd.Series]) -> typing.Iterator[pd.Series]:\n" - " for s in itr:\n" - " yield s + 1", - self.local) + def plus_one(itr: Iterator[pd.Series]) -> Iterator[pd.Series]: + for s in itr: + yield s + 1 - plus_one = pandas_udf("long")(self.local["plus_one"]) + plus_one = pandas_udf("long")(plus_one) actual = df.select(plus_one(df.v).alias("plus_one")) expected = df.selectExpr("(v + 1) as plus_one") @@ -249,13 +199,11 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_group_agg_udf_type_hint(self): df = self.spark.range(10).selectExpr("id", "id as v") - exec( - "import numpy as np\n" - "def weighted_mean(v: pd.Series, w: pd.Series) -> float:\n" - " return np.average(v, weights=w)", - self.local) - weighted_mean = pandas_udf("double")(self.local["weighted_mean"]) + def weighted_mean(v: pd.Series, w: pd.Series) -> float: + return np.average(v, weights=w) + + weighted_mean = pandas_udf("double")(weighted_mean) actual = df.groupby('id').agg(weighted_mean(df.v, lit(1.0))).sort('id') expected = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id') @@ -263,12 +211,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_ignore_type_hint_in_group_apply_in_pandas(self): df = self.spark.range(10) - exec( - "def pandas_plus_one(v: pd.DataFrame) -> pd.DataFrame:\n" - " return v + 1", - self.local) - pandas_plus_one = self.local["pandas_plus_one"] + def pandas_plus_one(v: pd.DataFrame) -> pd.DataFrame: + return v + 1 actual = 
df.groupby('id').applyInPandas(pandas_plus_one, schema=df.schema).sort('id') expected = df.selectExpr("id + 1 as id") @@ -276,12 +221,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_ignore_type_hint_in_cogroup_apply_in_pandas(self): df = self.spark.range(10) - exec( - "def pandas_plus_one(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:\n" - " return left + 1", - self.local) - pandas_plus_one = self.local["pandas_plus_one"] + def pandas_plus_one(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame: + return left + 1 actual = df.groupby('id').cogroup( self.spark.range(10).groupby("id") @@ -291,13 +233,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_ignore_type_hint_in_map_in_pandas(self): df = self.spark.range(10) - exec( - "from typing import Iterator\n" - "def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:\n" - " return map(lambda v: v + 1, iter)", - self.local) - pandas_plus_one = self.local["pandas_plus_one"] + def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]: + return map(lambda v: v + 1, iter) actual = df.mapInPandas(pandas_plus_one, schema=df.schema) expected = df.selectExpr("id + 1 as id") diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 016cafd669..051c8bde50 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -56,7 +56,7 @@ class TypesTests(ReusedSQLTestCase): self.assertEqual(10, df3.count()) def test_apply_schema_to_dict_and_rows(self): - schema = StructType().add("b", StringType()).add("a", IntegerType()) + schema = StructType().add("a", IntegerType()).add("b", StringType()) input = [{"a": 1}, {"b": "coffee"}] rdd = self.sc.parallelize(input) for verify in [False, True]: @@ -72,7 +72,6 @@ class TypesTests(ReusedSQLTestCase): self.assertEqual(10, df4.count()) def test_create_dataframe_schema_mismatch(self): - input = [Row(a=1)] rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i)) schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())]) df = self.spark.createDataFrame(rdd, schema) @@ -540,7 +539,6 @@ class TypesTests(ReusedSQLTestCase): self.assertEqual(_infer_type(2**61), LongType()) self.assertEqual(_infer_type(2**71), LongType()) - @unittest.skipIf(sys.version < "3", "only Python 3 infers bytes as binary type") def test_infer_binary_type(self): binaryrow = [Row(f1='a', f2=b"abcd")] df = self.sc.parallelize(binaryrow).toDF() @@ -665,10 +663,6 @@ class TypesTests(ReusedSQLTestCase): supported_string_types += ['u'] # test unicode assertCollectSuccess('u', u'a') - if sys.version_info[0] < 3: - supported_string_types += ['c'] - # test string - assertCollectSuccess('c', 'a') # supported float and double # @@ -721,12 +715,8 @@ class TypesTests(ReusedSQLTestCase): # # Keys in _array_type_mappings is a complete list of all supported types, # and types not in _array_type_mappings are considered unsupported. - # `array.typecodes` are not supported in python 2. - if sys.version_info[0] < 3: - all_types = set(['c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd']) - else: - # PyPy seems not having array.typecodes. - all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd']) + # PyPy seems not having array.typecodes. 
+ all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd']) unsupported_types = all_types - set(supported_types) # test unsupported types for t in unsupported_types: @@ -767,10 +757,7 @@ class DataTypeTests(unittest.TestCase): self.assertEqual(repr(row), "") # test __repr__ with unicode values - if sys.version_info.major >= 3: - self.assertEqual(repr(Row("数", "量")), "") - else: - self.assertEqual(repr(Row(u"数", u"量")), r"") + self.assertEqual(repr(Row("数", "量")), "") def test_empty_row(self): row = Row() @@ -888,7 +875,6 @@ class DataTypeVerificationTests(unittest.TestCase): ({"s": "a", "f": 1.0}, schema), (Row(s="a", i=1), schema), (Row(s="a", i=None), schema), - (Row(s="a", i=1, f=1.0), schema), (["a", 1], schema), (["a", None], schema), (("a", 1), schema), @@ -973,18 +959,13 @@ class DataTypeVerificationTests(unittest.TestCase): with self.assertRaises(exp, msg=msg): _make_type_verifier(data_type, nullable=False)(obj) - @unittest.skipIf(sys.version_info[:2] < (3, 6), "Create Row without sorting fields") def test_row_without_field_sorting(self): - sorting_enabled_tmp = Row._row_field_sorting_enabled - Row._row_field_sorting_enabled = False - r = Row(b=1, a=2) TestRow = Row("b", "a") expected = TestRow(1, 2) self.assertEqual(r, expected) self.assertEqual(repr(r), "Row(b=1, a=2)") - Row._row_field_sorting_enabled = sorting_enabled_tmp if __name__ == "__main__": diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 320a68dffe..cc08482c73 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -15,7 +15,6 @@ # limitations under the License. # -import os import sys import decimal import time @@ -26,11 +25,6 @@ import re import base64 from array import array import ctypes -import warnings - -if sys.version >= "3": - long = int - basestring = unicode = str from py4j.protocol import register_input_converter from py4j.java_gateway import JavaClass @@ -409,9 +403,7 @@ class StructField(DataType): """ assert isinstance(dataType, DataType),\ "dataType %s should be an instance of %s" % (dataType, DataType) - assert isinstance(name, basestring), "field name %s should be string" % (name) - if not isinstance(name, str): - name = name.encode('utf-8') + assert isinstance(name, str), "field name %s should be a string" % (name) self.name = name self.dataType = dataType self.nullable = nullable @@ -613,8 +605,6 @@ class StructType(DataType): else: if isinstance(obj, dict): return tuple(obj.get(n) for n in self.names) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - return tuple(obj[n] for n in self.names) elif isinstance(obj, (list, tuple)): return tuple(obj) elif hasattr(obj, "__dict__"): @@ -904,19 +894,9 @@ _type_mappings = { datetime.date: DateType, datetime.datetime: TimestampType, datetime.time: TimestampType, + bytes: BinaryType, } -if sys.version < "3": - _type_mappings.update({ - unicode: StringType, - long: LongType, - }) - -if sys.version >= "3": - _type_mappings.update({ - bytes: BinaryType, - }) - # Mapping Python array types to Spark SQL DataType # We should be careful here. The size of these types in python depends on C # implementation. 
We need to make sure that this conversion does not lose any @@ -990,20 +970,6 @@ for _typecode in _array_unsigned_int_typecode_ctype_mappings.keys(): if sys.version_info[0] < 4: _array_type_mappings['u'] = StringType -# Type code 'c' are only available at python 2 -if sys.version_info[0] < 3: - _array_type_mappings['c'] = StringType - -# SPARK-21465: -# In python2, array of 'L' happened to be mistakenly, just partially supported. To -# avoid breaking user's code, we should keep this partial support. Below is a -# dirty hacking to keep this partial support and pass the unit test. -import platform -if sys.version_info[0] < 3 and platform.python_implementation() != 'PyPy': - if 'L' not in _array_type_mappings.keys(): - _array_type_mappings['L'] = LongType - _array_unsigned_int_typecode_ctype_mappings['L'] = ctypes.c_uint - def _infer_type(obj): """Infer the DataType from obj @@ -1187,14 +1153,14 @@ def _create_converter(dataType): _acceptable_types = { BooleanType: (bool,), - ByteType: (int, long), - ShortType: (int, long), - IntegerType: (int, long), - LongType: (int, long), + ByteType: (int,), + ShortType: (int,), + IntegerType: (int,), + LongType: (int,), FloatType: (float,), DoubleType: (float,), DecimalType: (decimal.Decimal,), - StringType: (str, unicode), + StringType: (str,), BinaryType: (bytearray, bytes), DateType: (datetime.date, datetime.datetime), TimestampType: (datetime.datetime,), @@ -1376,10 +1342,6 @@ def _make_type_verifier(dataType, nullable=True, name=None): if isinstance(obj, dict): for f, verifier in verifiers: verifier(obj.get(f)) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - # the order in obj could be different than dataType.fields - for f, verifier in verifiers: - verifier(obj[f]) elif isinstance(obj, (tuple, list)): if len(obj) != len(verifiers): raise ValueError( @@ -1438,21 +1400,11 @@ class Row(tuple): NOTE: As of Spark 3.0.0, Rows created from named arguments no longer have field names sorted alphabetically and will be ordered in the position as - entered. To enable sorting for Rows compatible with Spark 2.x, set the - environment variable "PYSPARK_ROW_FIELD_SORTING_ENABLED" to "true". This - option is deprecated and will be removed in future versions of Spark. For - Python versions < 3.6, the order of named arguments is not guaranteed to - be the same as entered, see https://www.python.org/dev/peps/pep-0468. In - this case, a warning will be issued and the Row will fallback to sort the - field names automatically. - - NOTE: Examples with Row in pydocs are run with the environment variable - "PYSPARK_ROW_FIELD_SORTING_ENABLED" set to "true" which results in output - where fields are sorted. + entered. >>> row = Row(name="Alice", age=11) >>> row - Row(age=11, name='Alice') + Row(name='Alice', age=11) >>> row['name'], row['age'] ('Alice', 11) >>> row.name, row.age @@ -1476,47 +1428,22 @@ class Row(tuple): Row(name='Alice', age=11) This form can also be used to create rows as tuple values, i.e. with unnamed - fields. Beware that such Row objects have different equality semantics: + fields. 
>>> row1 = Row("Alice", 11) >>> row2 = Row(name="Alice", age=11) >>> row1 == row2 - False - >>> row3 = Row(a="Alice", b=11) - >>> row1 == row3 True """ - # Remove after Python < 3.6 dropped, see SPARK-29748 - _row_field_sorting_enabled = \ - os.environ.get('PYSPARK_ROW_FIELD_SORTING_ENABLED', 'false').lower() == 'true' - - if _row_field_sorting_enabled: - warnings.warn("The environment variable 'PYSPARK_ROW_FIELD_SORTING_ENABLED' " - "is deprecated and will be removed in future versions of Spark") - def __new__(cls, *args, **kwargs): if args and kwargs: raise ValueError("Can not use both args " "and kwargs to create Row") if kwargs: - if not Row._row_field_sorting_enabled and sys.version_info[:2] < (3, 6): - warnings.warn("To use named arguments for Python version < 3.6, Row fields will be " - "automatically sorted. This warning can be skipped by setting the " - "environment variable 'PYSPARK_ROW_FIELD_SORTING_ENABLED' to 'true'.") - Row._row_field_sorting_enabled = True - # create row objects - if Row._row_field_sorting_enabled: - # Remove after Python < 3.6 dropped, see SPARK-29748 - names = sorted(kwargs.keys()) - row = tuple.__new__(cls, [kwargs[n] for n in names]) - row.__fields__ = names - row.__from_dict__ = True - else: - row = tuple.__new__(cls, list(kwargs.values())) - row.__fields__ = list(kwargs.keys()) - + row = tuple.__new__(cls, list(kwargs.values())) + row.__fields__ = list(kwargs.keys()) return row else: # create row class or objects @@ -1537,7 +1464,7 @@ class Row(tuple): >>> Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11} True >>> row = Row(key=1, value=Row(name='a', age=2)) - >>> row.asDict() == {'key': 1, 'value': Row(age=2, name='a')} + >>> row.asDict() == {'key': 1, 'value': Row(name='a', age=2)} True >>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}} True @@ -1600,7 +1527,7 @@ class Row(tuple): raise AttributeError(item) def __setattr__(self, key, value): - if key != '__fields__' and key != "__from_dict__": + if key != '__fields__': raise Exception("Row is read-only") self.__dict__[key] = value diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index da68583b04..100481cf12 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -21,7 +21,7 @@ import functools import sys from pyspark import SparkContext, since -from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix +from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string from pyspark.sql.pandas.types import to_arrow_type @@ -232,7 +232,6 @@ class UDFRegistration(object): def __init__(self, sparkSession): self.sparkSession = sparkSession - @ignore_unicode_prefix @since("1.3.1") def register(self, name, f, returnType=None): """Register a Python function (including lambda function) or a user-defined function @@ -261,10 +260,10 @@ class UDFRegistration(object): >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x)) >>> spark.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] + [Row(stringLengthString(test)='4')] >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] + [Row(stringLengthString(text)='3')] >>> from pyspark.sql.types import IntegerType >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) @@ -349,7 +348,6 @@ class 
UDFRegistration(object): self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf) return return_udf - @ignore_unicode_prefix @since(2.3) def registerJavaFunction(self, name, javaClassName, returnType=None): """Register a Java user-defined function as a SQL function. @@ -389,7 +387,6 @@ class UDFRegistration(object): jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) - @ignore_unicode_prefix @since(2.3) def registerJavaUDAF(self, name, javaClassName): """Register a Java user-defined aggregate function as a SQL function. @@ -403,7 +400,7 @@ class UDFRegistration(object): >>> df.createOrReplaceTempView("df") >>> q = "SELECT name, javaUDAF(id) as avg from df group by name order by name desc" >>> spark.sql(q).collect() # doctest: +SKIP - [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] + [Row(name='b', avg=102.0), Row(name='a', avg=102.0)] """ self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) @@ -419,9 +416,6 @@ def _test(): .appName("sql.udf tests")\ .getOrCreate() globs['spark'] = spark - # Hack to skip the unit tests in register. These are currently being tested in proper tests. - # We should reenable this test once we completely drop Python 2. - del pyspark.sql.udf.UDFRegistration.register (failure_count, test_count) = doctest.testmod( pyspark.sql.udf, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 1d5bc49d25..bd76d88005 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -16,22 +16,9 @@ # import py4j -import sys from pyspark import SparkContext -if sys.version_info.major >= 3: - unicode = str - # Disable exception chaining (PEP 3134) in captured exceptions - # in order to hide JVM stacktace. - exec(""" -def raise_from(e): - raise e from None -""") -else: - def raise_from(e): - raise e - class CapturedException(Exception): def __init__(self, desc, stackTrace, cause=None): @@ -45,11 +32,7 @@ class CapturedException(Exception): desc = self.desc if debug_enabled: desc = desc + "\n\nJVM stacktrace:\n%s" % self.stackTrace - # encode unicode instance for python2 for human readable description - if sys.version_info.major < 3 and isinstance(desc, unicode): - return str(desc.encode('utf-8')) - else: - return str(desc) + return str(desc) class AnalysisException(CapturedException): @@ -131,7 +114,7 @@ def capture_sql_exception(f): if not isinstance(converted, UnknownException): # Hide where the exception came from that shows a non-Pythonic # JVM exception message. - raise_from(converted) + raise converted from None else: raise return deco diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 6199611940..170f0c0ef7 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from py4j.java_gateway import java_import, is_instance_of from pyspark import RDD, SparkConf diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 60562a6c92..000318588e 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -21,11 +21,6 @@ import time from itertools import chain from datetime import datetime -if sys.version < "3": - from itertools import imap as map, ifilter as filter -else: - long = int - from py4j.protocol import Py4JJavaError from pyspark import RDD @@ -404,7 +399,7 @@ class DStream(object): """ if isinstance(timestamp, datetime): timestamp = time.mktime(timestamp.timetuple()) - return self._sc._jvm.Time(long(timestamp * 1000)) + return self._sc._jvm.Time(int(timestamp * 1000)) def slice(self, begin, end): """ diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index 8f419a5e84..d8aa5f9318 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -14,10 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -from __future__ import print_function -import json - from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer diff --git a/python/pyspark/testing/sqlutils.py b/python/pyspark/testing/sqlutils.py index 085fce6daa..e85cae7dda 100644 --- a/python/pyspark/testing/sqlutils.py +++ b/python/pyspark/testing/sqlutils.py @@ -24,7 +24,6 @@ from contextlib import contextmanager from pyspark.sql import SparkSession from pyspark.sql.types import ArrayType, DoubleType, UserDefinedType, Row from pyspark.testing.utils import ReusedPySparkTestCase -from pyspark.util import _exception_message pandas_requirement_message = None @@ -33,7 +32,7 @@ try: require_minimum_pandas_version() except ImportError as e: # If Pandas version requirement is not satisfied, skip related tests. - pandas_requirement_message = _exception_message(e) + pandas_requirement_message = str(e) pyarrow_requirement_message = None try: @@ -41,14 +40,14 @@ try: require_minimum_pyarrow_version() except ImportError as e: # If Arrow version requirement is not satisfied, skip related tests. 
- pyarrow_requirement_message = _exception_message(e) + pyarrow_requirement_message = str(e) test_not_compiled_message = None try: from pyspark.sql.utils import require_test_compiled require_test_compiled() except Exception as e: - test_not_compiled_message = _exception_message(e) + test_not_compiled_message = str(e) have_pandas = pandas_requirement_message is None have_pyarrow = pyarrow_requirement_message is None diff --git a/python/pyspark/tests/test_profiler.py b/python/pyspark/tests/test_profiler.py index 04ca5a3896..dbce72a0d3 100644 --- a/python/pyspark/tests/test_profiler.py +++ b/python/pyspark/tests/test_profiler.py @@ -19,15 +19,11 @@ import os import sys import tempfile import unittest +from io import StringIO from pyspark import SparkConf, SparkContext, BasicProfiler from pyspark.testing.utils import PySparkTestCase -if sys.version >= "3": - from io import StringIO -else: - from StringIO import StringIO - class ProfilerTests(PySparkTestCase): diff --git a/python/pyspark/tests/test_rdd.py b/python/pyspark/tests/test_rdd.py index 6c5b818056..1a580e27ea 100644 --- a/python/pyspark/tests/test_rdd.py +++ b/python/pyspark/tests/test_rdd.py @@ -32,9 +32,6 @@ from pyspark.serializers import CloudPickleSerializer, BatchedSerializer, Pickle MarshalSerializer, UTF8Deserializer, NoOpSerializer from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME, QuietTest -if sys.version_info[0] >= 3: - xrange = range - global_func = lambda: "Hi" @@ -193,15 +190,13 @@ class RDDTests(ReusedPySparkTestCase): def test_sampling_default_seed(self): # Test for SPARK-3995 (default seed setting) - data = self.sc.parallelize(xrange(1000), 1) + data = self.sc.parallelize(range(1000), 1) subset = data.takeSample(False, 10) self.assertEqual(len(subset), 10) def test_aggregate_mutable_zero_value(self): # Test for SPARK-9021; uses aggregate and treeAggregate to build dict # representing a counter of ints - # NOTE: dict is used instead of collections.Counter for Python 2.6 - # compatibility from collections import defaultdict # Show that single or multiple partitions work @@ -262,8 +257,6 @@ class RDDTests(ReusedPySparkTestCase): def test_fold_mutable_zero_value(self): # Test for SPARK-9021; uses fold to merge an RDD of dict counters into # a single dict - # NOTE: dict is used instead of collections.Counter for Python 2.6 - # compatibility from collections import defaultdict counts1 = defaultdict(int, dict((i, 1) for i in range(10))) @@ -439,7 +432,7 @@ class RDDTests(ReusedPySparkTestCase): def test_large_closure(self): N = 200000 - data = [float(i) for i in xrange(N)] + data = [float(i) for i in range(N)] rdd = self.sc.parallelize(range(1), 1).map(lambda x: len(data)) self.assertEqual(N, rdd.first()) # regression test for SPARK-6886 @@ -464,8 +457,8 @@ class RDDTests(ReusedPySparkTestCase): def test_zip_with_different_object_sizes(self): # regress test for SPARK-5973 - a = self.sc.parallelize(xrange(10000)).map(lambda i: '*' * i) - b = self.sc.parallelize(xrange(10000, 20000)).map(lambda i: '*' * i) + a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i) + b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i) self.assertEqual(10000, a.zip(b).count()) def test_zip_with_different_number_of_items(self): @@ -487,7 +480,7 @@ class RDDTests(ReusedPySparkTestCase): self.assertRaises(Exception, lambda: a.zip(b).count()) def test_count_approx_distinct(self): - rdd = self.sc.parallelize(xrange(1000)) + rdd = self.sc.parallelize(range(1000)) self.assertTrue(950 < rdd.countApproxDistinct(0.03) 
< 1050) self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.03) < 1050) self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.03) < 1050) @@ -641,7 +634,7 @@ class RDDTests(ReusedPySparkTestCase): def test_external_group_by_key(self): self.sc._conf.set("spark.python.worker.memory", "1m") N = 2000001 - kv = self.sc.parallelize(xrange(N)).map(lambda x: (x % 3, x)) + kv = self.sc.parallelize(range(N)).map(lambda x: (x % 3, x)) gkv = kv.groupByKey().cache() self.assertEqual(3, gkv.count()) filtered = gkv.filter(lambda kv: kv[0] == 1) @@ -698,7 +691,7 @@ class RDDTests(ReusedPySparkTestCase): # Regression test for SPARK-6294 def test_take_on_jrdd(self): - rdd = self.sc.parallelize(xrange(1 << 20)).map(lambda x: str(x)) + rdd = self.sc.parallelize(range(1 << 20)).map(lambda x: str(x)) rdd._jrdd.first() def test_sortByKey_uses_all_partitions_not_only_first_and_last(self): diff --git a/python/pyspark/tests/test_readwrite.py b/python/pyspark/tests/test_readwrite.py index 734b7e4789..faa006c7d8 100644 --- a/python/pyspark/tests/test_readwrite.py +++ b/python/pyspark/tests/test_readwrite.py @@ -38,104 +38,6 @@ class InputFormatTests(ReusedPySparkTestCase): ReusedPySparkTestCase.tearDownClass() shutil.rmtree(cls.tempdir.name) - @unittest.skipIf(sys.version >= "3", "serialize array of byte") - def test_sequencefiles(self): - basepath = self.tempdir.name - ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.assertEqual(ints, ei) - - doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/", - "org.apache.hadoop.io.DoubleWritable", - "org.apache.hadoop.io.Text").collect()) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] - self.assertEqual(doubles, ed) - - bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BytesWritable").collect()) - ebs = [(1, bytearray('aa', 'utf-8')), - (1, bytearray('aa', 'utf-8')), - (2, bytearray('aa', 'utf-8')), - (2, bytearray('bb', 'utf-8')), - (2, bytearray('bb', 'utf-8')), - (3, bytearray('cc', 'utf-8'))] - self.assertEqual(bytes, ebs) - - text = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sftext/", - "org.apache.hadoop.io.Text", - "org.apache.hadoop.io.Text").collect()) - et = [(u'1', u'aa'), - (u'1', u'aa'), - (u'2', u'aa'), - (u'2', u'bb'), - (u'2', u'bb'), - (u'3', u'cc')] - self.assertEqual(text, et) - - bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BooleanWritable").collect()) - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] - self.assertEqual(bools, eb) - - nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BooleanWritable").collect()) - en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)] - self.assertEqual(nulls, en) - - maps = self.sc.sequenceFile(basepath + "/sftestdata/sfmap/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable").collect() - em = [(1, {}), - (1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (2, {1.0: u'cc'}), - (3, {2.0: u'dd'})] - for v in maps: - self.assertTrue(v in em) - - # arrays get pickled to tuples by default - tuples = sorted(self.sc.sequenceFile( - basepath + 
"/sftestdata/sfarray/", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable").collect()) - et = [(1, ()), - (2, (3.0, 4.0, 5.0)), - (3, (4.0, 5.0, 6.0))] - self.assertEqual(tuples, et) - - # with custom converters, primitive arrays can stay as arrays - arrays = sorted(self.sc.sequenceFile( - basepath + "/sftestdata/sfarray/", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) - ea = [(1, array('d')), - (2, array('d', [3.0, 4.0, 5.0])), - (3, array('d', [4.0, 5.0, 6.0]))] - self.assertEqual(arrays, ea) - - clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable").collect()) - cname = u'org.apache.spark.api.python.TestWritable' - ec = [(u'1', {u'__class__': cname, u'double': 1.0, u'int': 1, u'str': u'test1'}), - (u'2', {u'__class__': cname, u'double': 2.3, u'int': 2, u'str': u'test2'}), - (u'3', {u'__class__': cname, u'double': 3.1, u'int': 3, u'str': u'test3'}), - (u'4', {u'__class__': cname, u'double': 4.2, u'int': 4, u'str': u'test4'}), - (u'5', {u'__class__': cname, u'double': 5.5, u'int': 5, u'str': u'test56'})] - self.assertEqual(clazz, ec) - - unbatched_clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable", - ).collect()) - self.assertEqual(unbatched_clazz, ec) - def test_oldhadoop(self): basepath = self.tempdir.name ints = sorted(self.sc.hadoopFile(basepath + "/sftestdata/sfint/", @@ -249,51 +151,6 @@ class OutputFormatTests(ReusedPySparkTestCase): def tearDown(self): shutil.rmtree(self.tempdir.name, ignore_errors=True) - @unittest.skipIf(sys.version >= "3", "serialize array of byte") - def test_sequencefiles(self): - basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.sc.parallelize(ei).saveAsSequenceFile(basepath + "/sfint/") - ints = sorted(self.sc.sequenceFile(basepath + "/sfint/").collect()) - self.assertEqual(ints, ei) - - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] - self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/") - doubles = sorted(self.sc.sequenceFile(basepath + "/sfdouble/").collect()) - self.assertEqual(doubles, ed) - - ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))] - self.sc.parallelize(ebs).saveAsSequenceFile(basepath + "/sfbytes/") - bytes = sorted(self.sc.sequenceFile(basepath + "/sfbytes/").collect()) - self.assertEqual(bytes, ebs) - - et = [(u'1', u'aa'), - (u'2', u'bb'), - (u'3', u'cc')] - self.sc.parallelize(et).saveAsSequenceFile(basepath + "/sftext/") - text = sorted(self.sc.sequenceFile(basepath + "/sftext/").collect()) - self.assertEqual(text, et) - - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] - self.sc.parallelize(eb).saveAsSequenceFile(basepath + "/sfbool/") - bools = sorted(self.sc.sequenceFile(basepath + "/sfbool/").collect()) - self.assertEqual(bools, eb) - - en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)] - self.sc.parallelize(en).saveAsSequenceFile(basepath + "/sfnull/") - nulls = sorted(self.sc.sequenceFile(basepath + "/sfnull/").collect()) - self.assertEqual(nulls, en) - - em = [(1, {}), - (1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (2, {1.0: u'cc'}), - (3, {2.0: u'dd'})] - 
self.sc.parallelize(em).saveAsSequenceFile(basepath + "/sfmap/") - maps = self.sc.sequenceFile(basepath + "/sfmap/").collect() - for v in maps: - self.assertTrue(v, em) - def test_oldhadoop(self): basepath = self.tempdir.name dict_data = [(1, {}), @@ -361,46 +218,6 @@ class OutputFormatTests(ReusedPySparkTestCase): conf=input_conf).collect()) self.assertEqual(new_dataset, data) - @unittest.skipIf(sys.version >= "3", "serialize of array") - def test_newhadoop_with_array(self): - basepath = self.tempdir.name - # use custom ArrayWritable types and converters to handle arrays - array_data = [(1, array('d')), - (1, array('d', [1.0, 2.0, 3.0])), - (2, array('d', [3.0, 4.0, 5.0]))] - self.sc.parallelize(array_data).saveAsNewAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - result = sorted(self.sc.newAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) - self.assertEqual(result, array_data) - - conf = { - "mapreduce.job.outputformat.class": - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/newdataset/" - } - self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset( - conf, - valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - input_conf = {"mapreduce.input.fileinputformat.inputdir": basepath + "/newdataset/"} - new_dataset = sorted(self.sc.newAPIHadoopRDD( - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter", - conf=input_conf).collect()) - self.assertEqual(new_dataset, array_data) - def test_newolderror(self): basepath = self.tempdir.name rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)) diff --git a/python/pyspark/tests/test_shuffle.py b/python/pyspark/tests/test_shuffle.py index d50ba632d6..434414618e 100644 --- a/python/pyspark/tests/test_shuffle.py +++ b/python/pyspark/tests/test_shuffle.py @@ -23,15 +23,12 @@ from py4j.protocol import Py4JJavaError from pyspark import shuffle, PickleSerializer, SparkConf, SparkContext from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter -if sys.version_info[0] >= 3: - xrange = range - class MergerTests(unittest.TestCase): def setUp(self): self.N = 1 << 12 - self.l = [i for i in xrange(self.N)] + self.l = [i for i in range(self.N)] self.data = list(zip(self.l, self.l)) self.agg = Aggregator(lambda x: [x], lambda x, y: x.append(y) or x, @@ -42,26 +39,26 @@ class MergerTests(unittest.TestCase): m.mergeValues(self.data) self.assertEqual(m.spills, 0) self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) + sum(range(self.N))) m = ExternalMerger(self.agg, 1000) m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), self.data)) self.assertEqual(m.spills, 0) self.assertEqual(sum(sum(v) for k, v in 
m.items()), - sum(xrange(self.N))) + sum(range(self.N))) def test_medium_dataset(self): m = ExternalMerger(self.agg, 20) m.mergeValues(self.data) self.assertTrue(m.spills >= 1) self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) + sum(range(self.N))) m = ExternalMerger(self.agg, 10) m.mergeCombiners(map(lambda x_y2: (x_y2[0], [x_y2[1]]), self.data * 3)) self.assertTrue(m.spills >= 1) self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N)) * 3) + sum(range(self.N)) * 3) def test_huge_dataset(self): m = ExternalMerger(self.agg, 5, partitions=3) diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py index 90e4bcdfad..8c2bedbe4e 100644 --- a/python/pyspark/tests/test_taskcontext.py +++ b/python/pyspark/tests/test_taskcontext.py @@ -26,9 +26,6 @@ import unittest from pyspark import SparkConf, SparkContext, TaskContext, BarrierTaskContext from pyspark.testing.utils import PySparkTestCase, SPARK_HOME -if sys.version_info[0] >= 3: - xrange = range - class TaskContextTests(PySparkTestCase): @@ -251,9 +248,9 @@ class TaskContextTestsWithWorkerReuse(unittest.TestCase): def test_task_context_correct_with_python_worker_reuse(self): """Verify the task context correct when reused python worker""" # start a normal job first to start all workers and get all worker pids - worker_pids = self.sc.parallelize(xrange(2), 2).map(lambda x: os.getpid()).collect() + worker_pids = self.sc.parallelize(range(2), 2).map(lambda x: os.getpid()).collect() # the worker will reuse in this barrier job - rdd = self.sc.parallelize(xrange(10), 2) + rdd = self.sc.parallelize(range(10), 2) def context(iterator): tp = TaskContext.get().partitionId() diff --git a/python/pyspark/tests/test_util.py b/python/pyspark/tests/test_util.py index 81bfb66e70..511d62a51f 100644 --- a/python/pyspark/tests/test_util.py +++ b/python/pyspark/tests/test_util.py @@ -61,14 +61,12 @@ class KeywordOnlyTests(unittest.TestCase): class UtilTests(PySparkTestCase): - def test_py4j_exception_message(self): - from pyspark.util import _exception_message - + def test_py4j_str(self): with self.assertRaises(Py4JJavaError) as context: # This attempts java.lang.String(null) which throws an NPE. 
self.sc._jvm.java.lang.String(None) - self.assertTrue('NullPointerException' in _exception_message(context.exception)) + self.assertTrue('NullPointerException' in str(context.exception)) def test_parsing_version_string(self): from pyspark.util import VersionUtils diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index dba9298ee1..3b1848dcfd 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -32,9 +32,6 @@ from py4j.protocol import Py4JJavaError from pyspark import SparkConf, SparkContext from pyspark.testing.utils import ReusedPySparkTestCase, PySparkTestCase, QuietTest -if sys.version_info[0] >= 3: - xrange = range - class WorkerTests(ReusedPySparkTestCase): def test_cancel_task(self): @@ -88,13 +85,13 @@ class WorkerTests(ReusedPySparkTestCase): self.fail("daemon had been killed") # run a normal job - rdd = self.sc.parallelize(xrange(100), 1) + rdd = self.sc.parallelize(range(100), 1) self.assertEqual(100, rdd.map(str).count()) def test_after_exception(self): def raise_exception(_): raise Exception() - rdd = self.sc.parallelize(xrange(100), 1) + rdd = self.sc.parallelize(range(100), 1) with QuietTest(self.sc): self.assertRaises(Exception, lambda: rdd.foreach(raise_exception)) self.assertEqual(100, rdd.map(str).count()) @@ -110,22 +107,22 @@ class WorkerTests(ReusedPySparkTestCase): with QuietTest(self.sc): self.assertRaises(Exception, lambda: filtered_data.count()) - rdd = self.sc.parallelize(xrange(100), 1) + rdd = self.sc.parallelize(range(100), 1) self.assertEqual(100, rdd.map(str).count()) def test_accumulator_when_reuse_worker(self): from pyspark.accumulators import INT_ACCUMULATOR_PARAM acc1 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM) - self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc1.add(x)) + self.sc.parallelize(range(100), 20).foreach(lambda x: acc1.add(x)) self.assertEqual(sum(range(100)), acc1.value) acc2 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM) - self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc2.add(x)) + self.sc.parallelize(range(100), 20).foreach(lambda x: acc2.add(x)) self.assertEqual(sum(range(100)), acc2.value) self.assertEqual(sum(range(100)), acc1.value) def test_reuse_worker_after_take(self): - rdd = self.sc.parallelize(xrange(100000), 1) + rdd = self.sc.parallelize(range(100000), 1) self.assertEqual(0, rdd.first()) def count(): @@ -160,17 +157,13 @@ class WorkerTests(ReusedPySparkTestCase): self.sc.parallelize([1]).map(lambda x: f()).count() except Py4JJavaError as e: - if sys.version_info.major < 3: - # we have to use unicode here to avoid UnicodeDecodeError - self.assertRegexpMatches(unicode(e).encode("utf-8"), "exception with 中") - else: - self.assertRegexpMatches(str(e), "exception with 中") + self.assertRegexpMatches(str(e), "exception with 中") class WorkerReuseTest(PySparkTestCase): - def test_reuse_worker_of_parallelize_xrange(self): - rdd = self.sc.parallelize(xrange(20), 8) + def test_reuse_worker_of_parallelize_range(self): + rdd = self.sc.parallelize(range(20), 8) previous_pids = rdd.map(lambda x: os.getpid()).collect() current_pids = rdd.map(lambda x: os.getpid()).collect() for pid in current_pids: @@ -189,7 +182,7 @@ class WorkerMemoryTest(unittest.TestCase): self.sc = SparkContext('local[4]', class_name, conf=conf) def test_memory_limit(self): - rdd = self.sc.parallelize(xrange(1), 1) + rdd = self.sc.parallelize(range(1), 1) def getrlimit(): import resource diff --git a/python/pyspark/util.py b/python/pyspark/util.py index 
d9429372a6..c003586e9c 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -19,52 +19,10 @@ import re import sys import traceback -import os -import warnings -import inspect -from py4j.protocol import Py4JJavaError __all__ = [] -def _exception_message(excp): - """Return the message from an exception as either a str or unicode object. Supports both - Python 2 and Python 3. - - >>> msg = "Exception message" - >>> excp = Exception(msg) - >>> msg == _exception_message(excp) - True - - >>> msg = u"unicöde" - >>> excp = Exception(msg) - >>> msg == _exception_message(excp) - True - """ - if isinstance(excp, Py4JJavaError): - # 'Py4JJavaError' doesn't contain the stack trace available on the Java side in 'message' - # attribute in Python 2. We should call 'str' function on this exception in general but - # 'Py4JJavaError' has an issue about addressing non-ascii strings. So, here we work - # around by the direct call, '__str__()'. Please see SPARK-23517. - return excp.__str__() - if hasattr(excp, "message"): - return excp.message - return str(excp) - - -def _get_argspec(f): - """ - Get argspec of a function. Supports both Python 2 and Python 3. - """ - if sys.version_info[0] < 3: - argspec = inspect.getargspec(f) - else: - # `getargspec` is deprecated since python3.0 (incompatible with function annotations). - # See SPARK-23569. - argspec = inspect.getfullargspec(f) - return argspec - - def print_exec(stream): ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, stream) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 5f4a8a2d2d..9b54affb13 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -18,11 +18,11 @@ """ Worker that receives input from Piped RDD. """ -from __future__ import print_function -from __future__ import absolute_import import os import sys import time +from inspect import getfullargspec +import importlib # 'resource' is a Unix specific module. 
has_resource_module = True try: @@ -44,14 +44,9 @@ from pyspark.serializers import write_with_length, write_int, read_long, read_bo from pyspark.sql.pandas.serializers import ArrowStreamPandasUDFSerializer, CogroupUDFSerializer from pyspark.sql.pandas.types import to_arrow_type from pyspark.sql.types import StructType -from pyspark.util import _get_argspec, fail_on_stopiteration +from pyspark.util import fail_on_stopiteration from pyspark import shuffle -if sys.version >= '3': - basestring = str -else: - from itertools import imap as map # use iterator map by default - pickleSer = PickleSerializer() utf8_deserializer = UTF8Deserializer() @@ -272,10 +267,10 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index): elif eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF: return arg_offsets, wrap_pandas_iter_udf(func, return_type) elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - argspec = _get_argspec(chained_func) # signature was lost when wrapping it + argspec = getfullargspec(chained_func) # signature was lost when wrapping it return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec) elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: - argspec = _get_argspec(chained_func) # signature was lost when wrapping it + argspec = getfullargspec(chained_func) # signature was lost when wrapping it return arg_offsets, wrap_cogrouped_map_pandas_udf(func, return_type, argspec) elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type) @@ -342,11 +337,13 @@ def read_udfs(pickleSer, infile, eval_type): pickleSer, infile, eval_type, runner_conf, udf_index=0) def func(_, iterator): - num_input_rows = [0] # TODO(SPARK-29909): Use nonlocal after we drop Python 2. + num_input_rows = 0 def map_batch(batch): + nonlocal num_input_rows + udf_args = [batch[offset] for offset in arg_offsets] - num_input_rows[0] += len(udf_args[0]) + num_input_rows += len(udf_args[0]) if len(udf_args) == 1: return udf_args[0] else: @@ -363,7 +360,7 @@ def read_udfs(pickleSer, infile, eval_type): # by consuming the input iterator in user side. Therefore, # it's very unlikely the output length is higher than # input length. - assert is_map_iter or num_output_rows <= num_input_rows[0], \ + assert is_map_iter or num_output_rows <= num_input_rows, \ "Pandas SCALAR_ITER UDF outputted more rows than input rows." yield (result_batch, result_type) @@ -376,11 +373,11 @@ def read_udfs(pickleSer, infile, eval_type): raise RuntimeError("pandas iterator UDF should exhaust the input " "iterator.") - if num_output_rows != num_input_rows[0]: + if num_output_rows != num_input_rows: raise RuntimeError( "The length of output in Scalar iterator pandas UDF should be " "the same with the input's; however, the length of output was %d and the " - "length of input was %d." % (num_output_rows, num_input_rows[0])) + "length of input was %d." 
% (num_output_rows, num_input_rows)) # profiling is not supported for UDF return func, None, ser, ser @@ -548,9 +545,8 @@ def main(infile, outfile): for _ in range(num_python_includes): filename = utf8_deserializer.loads(infile) add_path(os.path.join(spark_files_dir, filename)) - if sys.version > '3': - import importlib - importlib.invalidate_caches() + + importlib.invalidate_caches() # fetch names and values of broadcast variables needs_broadcast_decryption_server = read_bool(infile) diff --git a/python/run-tests.py b/python/run-tests.py index 42510c7642..23076eab1c 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -28,10 +28,7 @@ import tempfile from threading import Thread, Lock import time import uuid -if sys.version < '3': - import Queue -else: - import queue as Queue +import queue as Queue from multiprocessing import Manager @@ -75,7 +72,6 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): 'SPARK_PREPEND_CLASSES': '1', 'PYSPARK_PYTHON': which(pyspark_python), 'PYSPARK_DRIVER_PYTHON': which(pyspark_python), - 'PYSPARK_ROW_FIELD_SORTING_ENABLED': 'true' }) # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is @@ -161,7 +157,8 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "python2.7", "pypy3", "pypy"] if which(x)] + # TODO(SPARK-32278): install PyPy3 in Jenkins to test + python_execs = [x for x in ["python3.6", "python3.8", "pypy3"] if which(x)] if "python3.6" not in python_execs: p = which("python3") diff --git a/python/setup.py b/python/setup.py index afbd601b04..c456a32fea 100755 --- a/python/setup.py +++ b/python/setup.py @@ -16,18 +16,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import glob import os import sys from setuptools import setup from shutil import copyfile, copytree, rmtree -if sys.version_info < (2, 7): - print("Python versions prior to 2.7 are not supported for pip installed PySpark.", - file=sys.stderr) - sys.exit(-1) - try: exec(open('pyspark/version.py').read()) except IOError: @@ -217,13 +211,10 @@ try: 'pyarrow>=%s' % _minimum_pyarrow_version, ] }, + python_requires='>=3.6', classifiers=[ 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', diff --git a/resource-managers/kubernetes/integration-tests/tests/pyfiles.py b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py index ba55b75803..51c0160554 100644 --- a/resource-managers/kubernetes/integration-tests/tests/pyfiles.py +++ b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py @@ -14,9 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py index d312a29f38..74559a0b54 100644 --- a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py +++ b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import resource import sys diff --git a/sql/hive/src/test/resources/data/scripts/cat.py b/sql/hive/src/test/resources/data/scripts/cat.py index aea0362f89..420d9f832a 100644 --- a/sql/hive/src/test/resources/data/scripts/cat.py +++ b/sql/hive/src/test/resources/data/scripts/cat.py @@ -16,7 +16,6 @@ # specific language governing permissions and limitations # under the License. # -from __future__ import print_function import sys import os diff --git a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py index 5b360208d3..f724fdc85b 100644 --- a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py +++ b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py @@ -18,12 +18,9 @@ # import sys -if sys.version_info[0] >= 3: - xrange = range - -for i in xrange(50): - for j in xrange(5): - for k in xrange(20022): +for i in range(50): + for j in range(5): + for k in range(20022): print(20000 * i + k) for line in sys.stdin:
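For readers skimming the diff, the recurring pattern is that Python 2 shims (functions compiled via exec() strings so type annotations would not be a syntax error, xrange aliases, one-element-list closure counters, the raise_from hack) are replaced by their plain Python 3 equivalents. As a minimal sketch of the type-hinted pandas UDF style the updated tests now exercise directly -- assuming an active SparkSession named `spark` plus pandas and PyArrow, and not taken verbatim from this patch:

    import pandas as pd
    from pyspark.sql.functions import pandas_udf

    # The evaluation type is inferred from the type hints (pd.Series -> pd.Series),
    # so the function can be declared at module level instead of inside an exec() string.
    @pandas_udf("long")
    def plus_one(v: pd.Series) -> pd.Series:
        return v + 1

    df = spark.range(10).selectExpr("id", "id as v")   # `spark` is assumed to exist
    df.select(plus_one(df.v).alias("plus_one")).show()

Under Python 2 such annotations are a syntax error, which is why the tests previously built these functions from strings; that indirection is exactly what the earlier test hunks delete.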
 <tr><th>Writable Type</th><th>Python Type</th></tr>
-<tr><td>Text</td><td>unicode str</td></tr>
+<tr><td>Text</td><td>str</td></tr>
 <tr><td>IntWritable</td><td>int</td></tr>
 <tr><td>FloatWritable</td><td>float</td></tr>
 <tr><td>DoubleWritable</td><td>float</td></tr>
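The table rows just above come from the Writable-to-Python type mapping in the RDD programming guide; the only edit is that Text now maps to plain str, since the unicode type is gone. A small illustrative read under assumed conditions (hypothetical HDFS path, an existing SparkContext named `sc`, and data stored as (IntWritable, Text) pairs):

    # Hypothetical path and data; key/value classes are the standard Hadoop writables.
    pairs = sc.sequenceFile(
        "hdfs:///tmp/sftext/",
        "org.apache.hadoop.io.IntWritable",
        "org.apache.hadoop.io.Text")
    key, value = pairs.first()
    print(type(value))   # <class 'str'> on Python 3; previously documented as "unicode str"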