diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index d6458bf44f..5cf00c6ed9 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -133,7 +133,8 @@ jobs:
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
- if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+ # Yarn has a Python-specific test too, for example, YarnClusterSuite.
+ if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64
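For readers skimming the workflow change, a hedged Python sketch (mine, not part of the patch) of the boolean condition being extended:

```python
# Hypothetical helper mirroring the `if:` expression above; `modules`
# stands in for the matrix.modules string.
def needs_python(modules: str) -> bool:
    return ('yarn' in modules
            or 'pyspark' in modules
            or ('sql' in modules and 'sql-' not in modules))

assert needs_python('yarn')           # YarnClusterSuite needs Python now
assert not needs_python('sql-kafka')  # SQL submodules stay excluded
```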
diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
index 01e64b6972..5a6fa50796 100644
--- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
@@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging {
}
}
}
- // Unpickle array.array generated by Python 2.6
- class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
- // /* Description of types */
- // static struct arraydescr descriptors[] = {
- // {'c', sizeof(char), c_getitem, c_setitem},
- // {'b', sizeof(char), b_getitem, b_setitem},
- // {'B', sizeof(char), BB_getitem, BB_setitem},
- // #ifdef Py_USING_UNICODE
- // {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
- // #endif
- // {'h', sizeof(short), h_getitem, h_setitem},
- // {'H', sizeof(short), HH_getitem, HH_setitem},
- // {'i', sizeof(int), i_getitem, i_setitem},
- // {'I', sizeof(int), II_getitem, II_setitem},
- // {'l', sizeof(long), l_getitem, l_setitem},
- // {'L', sizeof(long), LL_getitem, LL_setitem},
- // {'f', sizeof(float), f_getitem, f_setitem},
- // {'d', sizeof(double), d_getitem, d_setitem},
- // {'\0', 0, 0, 0} /* Sentinel */
- // };
- val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
- Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
- 'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
- )
- } else {
- Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
- 'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
- )
- }
- override def construct(args: Array[Object]): Object = {
- if (args.length == 1) {
- construct(args ++ Array(""))
- } else if (args.length == 2 && args(1).isInstanceOf[String]) {
- val typecode = args(0).asInstanceOf[String].charAt(0)
- // This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
- val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
- if (typecode == 'c') {
- // It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
- // a string for array data instead of list as Python 2.7, and handles an array of
- // typecode 'c' as 1-byte character.
- val result = new Array[Char](data.length)
- var i = 0
- while (i < data.length) {
- result(i) = data(i).toChar
- i += 1
- }
- result
- } else {
- construct(typecode, machineCodes(typecode), data)
- }
- } else if (args.length == 2 && args(0) == "l") {
- // On Python 2, an array of typecode 'l' should be handled as long rather than int.
- val values = args(1).asInstanceOf[JArrayList[_]]
- val result = new Array[Long](values.size)
- var i = 0
- while (i < values.size) {
- result(i) = values.get(i).asInstanceOf[Number].longValue()
- i += 1
- }
- result
- } else {
- super.construct(args)
- }
- }
- }
private var initialized = false
// This should be called before trying to unpickle array.array from Python
@@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging {
def initialize(): Unit = {
synchronized{
if (!initialized) {
- Unpickler.registerConstructor("array", "array", new ArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index a5a26ae8f5..241b7ed539 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -49,8 +49,6 @@ except ImportError:
print("Install using 'sudo pip install unidecode'")
sys.exit(-1)
-if sys.version < '3':
- input = raw_input # noqa
# Contributors list file name
contributors_file_name = "contributors.txt"
@@ -152,10 +150,7 @@ def get_commits(tag):
if not is_valid_author(author):
author = github_username
# Guard against special characters
- try: # Python 2
- author = unicode(author, "UTF-8")
- except NameError: # Python 3
- author = str(author)
+ author = str(author)
author = unidecode.unidecode(author).strip()
commit = Commit(_hash, author, title, pr_number)
commits.append(commit)
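A minimal sketch of the simplified path (assumes unidecode is installed, which the script already requires):

```python
import unidecode

# On Python 3 every name is already str, so the former
# unicode()/str() branching collapses to a plain str() call.
author = unidecode.unidecode(str("Bj\u00f6rn")).strip()
assert author == "Bjorn"
```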
diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py
index b444b74d40..b90afeebc5 100755
--- a/dev/github_jira_sync.py
+++ b/dev/github_jira_sync.py
@@ -22,14 +22,9 @@ import json
import os
import re
import sys
-if sys.version < '3':
- from urllib2 import urlopen
- from urllib2 import Request
- from urllib2 import HTTPError
-else:
- from urllib.request import urlopen
- from urllib.request import Request
- from urllib.error import HTTPError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError
try:
import jira.client
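A hedged usage sketch of the Python-3-only imports kept here (the URL is illustrative, not from the patch); the identical import consolidation repeats in merge_spark_pr.py and run-tests-jenkins.py below:

```python
from urllib.error import HTTPError
from urllib.request import Request, urlopen

try:
    # urlopen returns a context manager on Python 3.
    with urlopen(Request("https://issues.apache.org/jira/rest/api/2/serverInfo")) as resp:
        body = resp.read()
except HTTPError as err:
    print("request failed with HTTP", err.code)
```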
diff --git a/dev/lint-python b/dev/lint-python
index d5491f2447..1fddbfa64b 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -168,7 +168,15 @@ function sphinx_test {
# Check that the documentation builds acceptably, skip check if sphinx is not installed.
if ! hash "$SPHINX_BUILD" 2> /dev/null; then
- echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
+ echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
+ echo
+ return
+ fi
+
+ # TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
+ PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
+ if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
+ echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
echo
return
fi
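For reference, the probe embedded in the shell snippet above expands to this one-liner:

```python
import importlib.util

# find_spec returns None when a module cannot be imported.
print(importlib.util.find_spec("sphinx") is not None)
```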
diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index 967cdace60..b42429d717 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -31,15 +31,9 @@ import re
import subprocess
import sys
import traceback
-if sys.version < '3':
- input = raw_input # noqa
- from urllib2 import urlopen
- from urllib2 import Request
- from urllib2 import HTTPError
-else:
- from urllib.request import urlopen
- from urllib.request import Request
- from urllib.error import HTTPError
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError
try:
import jira.client
diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py
index 13be9592d7..4ff5b327e3 100755
--- a/dev/run-tests-jenkins.py
+++ b/dev/run-tests-jenkins.py
@@ -22,15 +22,9 @@ import sys
import json
import functools
import subprocess
-if sys.version < '3':
- from urllib2 import urlopen
- from urllib2 import Request
- from urllib2 import HTTPError, URLError
-else:
- from urllib.request import urlopen
- from urllib.request import Request
- from urllib.error import HTTPError, URLError
-
+from urllib.request import urlopen
+from urllib.request import Request
+from urllib.error import HTTPError, URLError
from sparktestsupport import SPARK_HOME, ERROR_CODES
from sparktestsupport.shellutils import run_cmd
diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py
index 8b2688d200..6785e481b5 100644
--- a/dev/sparktestsupport/toposort.py
+++ b/dev/sparktestsupport/toposort.py
@@ -24,8 +24,7 @@
# Moved functools import to the top of the file.
# Changed assert to a ValueError.
# Changed iter[items|keys] to [items|keys], for python 3
-# compatibility. I don't think it matters for python 2 these are
-# now lists instead of iterables.
+# compatibility.
# Copy the input so as to leave it unmodified.
# Renamed function from toposort2 to toposort.
# Handle empty input.
diff --git a/docs/configuration.md b/docs/configuration.md
index 42f706b296..abf76105ae 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
PYSPARK_PYTHON |
- Python binary executable to use for PySpark in both driver and workers (default is python2.7 if available, otherwise python).
+ Python binary executable to use for PySpark in both driver and workers (default is python3 if available, otherwise python).
Property spark.pyspark.python take precedence if it is set |
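A hedged sketch of how the variable is typically used from Python (set before the SparkContext starts; spark.pyspark.python, if configured, wins):

```python
import os

# Must be exported before launching Spark; shown here for local runs.
os.environ["PYSPARK_PYTHON"] = "python3"
```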
diff --git a/docs/index.md b/docs/index.md
index c0771ca170..8fd169e63f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).
Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.
-Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
+Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
-Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
For the Scala API, Spark {{site.SPARK_VERSION}}
uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
({{site.SCALA_BINARY_VERSION}}.x).
diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md
index 70bfefce47..07207f62bb 100644
--- a/docs/rdd-programming-guide.md
+++ b/docs/rdd-programming-guide.md
@@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;
-Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
+Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
so C libraries like NumPy can be used. It also works with PyPy 2.3+.
-Note that Python 2 support is deprecated as of Spark 3.0.0.
+Support for Python 2, 3.4, and 3.5 was removed in Spark 3.1.0.
Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:
@@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:
{% highlight bash %}
-$ PYSPARK_PYTHON=python3.4 bin/pyspark
-$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
+$ PYSPARK_PYTHON=python3.8 bin/pyspark
+$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
{% endhighlight %}
@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar
You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.
-After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
+After the Jupyter Notebook server is launched, you can create a new notebook from
the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
your notebook before you start to try Spark from the Jupyter notebook.
@@ -447,7 +447,7 @@ Writables are automatically converted:
Writable Type | Python Type |
-Text | unicode str |
+Text | str |
IntWritable | int |
FloatWritable | float |
DoubleWritable | float |
diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py
index 6d3241876a..511634fd8f 100755
--- a/examples/src/main/python/als.py
+++ b/examples/src/main/python/als.py
@@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use.
This example requires numpy (http://www.numpy.org/)
"""
-from __future__ import print_function
-
import sys
import numpy as np
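The same one-line removal repeats across the example files below; it is safe because on Python 3 print is already a builtin function:

```python
# No __future__ import needed: print is a function with keyword args.
print("als iterations:", 5, sep=" ")
```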
diff --git a/examples/src/main/python/avro_inputformat.py b/examples/src/main/python/avro_inputformat.py
index a18722c687..49ab37e7b3 100644
--- a/examples/src/main/python/avro_inputformat.py
+++ b/examples/src/main/python/avro_inputformat.py
@@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \
{u'favorite_color': None, u'name': u'Alyssa'}
{u'favorite_color': u'red', u'name': u'Ben'}
"""
-from __future__ import print_function
-
import sys
from functools import reduce
diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py
index a42d711fc5..022378619c 100755
--- a/examples/src/main/python/kmeans.py
+++ b/examples/src/main/python/kmeans.py
@@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py.
This example requires NumPy (http://www.numpy.org/).
"""
-from __future__ import print_function
-
import sys
import numpy as np
diff --git a/examples/src/main/python/logistic_regression.py b/examples/src/main/python/logistic_regression.py
index bcc4e0f4e8..4b83740152 100755
--- a/examples/src/main/python/logistic_regression.py
+++ b/examples/src/main/python/logistic_regression.py
@@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations.
In practice, one may prefer to use the LogisticRegression algorithm in
ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
"""
-from __future__ import print_function
-
import sys
import numpy as np
diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py
index 0a71f76418..2040a7876c 100644
--- a/examples/src/main/python/ml/aft_survival_regression.py
+++ b/examples/src/main/python/ml/aft_survival_regression.py
@@ -20,8 +20,6 @@ An example demonstrating aft survival regression.
Run with:
bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py
index 8b7ec9c439..b392639784 100644
--- a/examples/src/main/python/ml/als_example.py
+++ b/examples/src/main/python/ml/als_example.py
@@ -15,12 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
-import sys
-if sys.version >= '3':
- long = int
-
from pyspark.sql import SparkSession
# $example on$
@@ -39,7 +33,7 @@ if __name__ == "__main__":
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
- rating=float(p[2]), timestamp=long(p[3])))
+ rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])
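The long-to-int change works because Python 3 has a single arbitrary-precision integer type; a quick check:

```python
# int() covers everything Python 2's long() did.
timestamp = int("1424380312")
assert timestamp == 1424380312 and isinstance(timestamp, int)
```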
diff --git a/examples/src/main/python/ml/anova_selector_example.py b/examples/src/main/python/ml/anova_selector_example.py
index f8458f5d6e..da80fa6231 100644
--- a/examples/src/main/python/ml/anova_selector_example.py
+++ b/examples/src/main/python/ml/anova_selector_example.py
@@ -20,8 +20,6 @@ An example for ANOVASelector.
Run with:
bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ANOVASelector
diff --git a/examples/src/main/python/ml/anova_test_example.py b/examples/src/main/python/ml/anova_test_example.py
index 4119441cde..451e078f60 100644
--- a/examples/src/main/python/ml/anova_test_example.py
+++ b/examples/src/main/python/ml/anova_test_example.py
@@ -20,8 +20,6 @@ An example for ANOVA testing.
Run with:
bin/spark-submit examples/src/main/python/ml/anova_test_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 669bb2aeab..5d5ae4122e 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 82adb338b5..513f80a09e 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering.
Run with:
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
diff --git a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
index 610176ea59..f5836091f3 100644
--- a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
+++ b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
@@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH.
Run with:
bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 742f35093b..5de67f7126 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
diff --git a/examples/src/main/python/ml/chi_square_test_example.py b/examples/src/main/python/ml/chi_square_test_example.py
index 2af7e683cd..bf15a03d9c 100644
--- a/examples/src/main/python/ml/chi_square_test_example.py
+++ b/examples/src/main/python/ml/chi_square_test_example.py
@@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing.
Run with:
bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 028a9ea9d6..c83a8c1bc7 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
diff --git a/examples/src/main/python/ml/correlation_example.py b/examples/src/main/python/ml/correlation_example.py
index 1f4e402ac1..9006d54149 100644
--- a/examples/src/main/python/ml/correlation_example.py
+++ b/examples/src/main/python/ml/correlation_example.py
@@ -20,8 +20,6 @@ An example for computing correlation matrix.
Run with:
bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index f2e41db77d..b3ddfb128c 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py
index 6256d11504..0ad0865486 100644
--- a/examples/src/main/python/ml/cross_validator.py
+++ b/examples/src/main/python/ml/cross_validator.py
@@ -22,8 +22,6 @@ Run with:
bin/spark-submit examples/src/main/python/ml/cross_validator.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py
index cabc3de68f..d2bf937441 100644
--- a/examples/src/main/python/ml/dataframe_example.py
+++ b/examples/src/main/python/ml/dataframe_example.py
@@ -19,8 +19,6 @@
An example of how to use DataFrame for ML. Run with::
bin/spark-submit examples/src/main/python/ml/dataframe_example.py
"""
-from __future__ import print_function
-
import os
import sys
import tempfile
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index c0457f8d0f..37da4f5e8f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py
index d6e2977de0..eb7177b845 100644
--- a/examples/src/main/python/ml/decision_tree_classification_example.py
+++ b/examples/src/main/python/ml/decision_tree_classification_example.py
@@ -18,8 +18,6 @@
"""
Decision Tree Classification Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py
index 58d7ad921d..1ed1636a3d 100644
--- a/examples/src/main/python/ml/decision_tree_regression_example.py
+++ b/examples/src/main/python/ml/decision_tree_regression_example.py
@@ -18,8 +18,6 @@
"""
Decision Tree Regression Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py
index 590053998b..71eec8d432 100644
--- a/examples/src/main/python/ml/elementwise_product_example.py
+++ b/examples/src/main/python/ml/elementwise_product_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py
index eb21051435..1dcca6c201 100644
--- a/examples/src/main/python/ml/estimator_transformer_param_example.py
+++ b/examples/src/main/python/ml/estimator_transformer_param_example.py
@@ -18,8 +18,6 @@
"""
Estimator Transformer Param Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
diff --git a/examples/src/main/python/ml/feature_hasher_example.py b/examples/src/main/python/ml/feature_hasher_example.py
index 6cf9ecc396..4fe573d19d 100644
--- a/examples/src/main/python/ml/feature_hasher_example.py
+++ b/examples/src/main/python/ml/feature_hasher_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FeatureHasher
diff --git a/examples/src/main/python/ml/fm_classifier_example.py b/examples/src/main/python/ml/fm_classifier_example.py
index 6e7c2ccf02..b47bdc5275 100644
--- a/examples/src/main/python/ml/fm_classifier_example.py
+++ b/examples/src/main/python/ml/fm_classifier_example.py
@@ -18,8 +18,6 @@
"""
FMClassifier Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import FMClassifier
diff --git a/examples/src/main/python/ml/fm_regressor_example.py b/examples/src/main/python/ml/fm_regressor_example.py
index afd7639680..5c8133996a 100644
--- a/examples/src/main/python/ml/fm_regressor_example.py
+++ b/examples/src/main/python/ml/fm_regressor_example.py
@@ -18,8 +18,6 @@
"""
FMRegressor Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import FMRegressor
diff --git a/examples/src/main/python/ml/fvalue_selector_example.py b/examples/src/main/python/ml/fvalue_selector_example.py
index 3158953a5d..f164af47eb 100644
--- a/examples/src/main/python/ml/fvalue_selector_example.py
+++ b/examples/src/main/python/ml/fvalue_selector_example.py
@@ -20,8 +20,6 @@ An example for FValueSelector.
Run with:
bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FValueSelector
diff --git a/examples/src/main/python/ml/fvalue_test_example.py b/examples/src/main/python/ml/fvalue_test_example.py
index 410b39e449..dfa8073e5a 100644
--- a/examples/src/main/python/ml/fvalue_test_example.py
+++ b/examples/src/main/python/ml/fvalue_test_example.py
@@ -20,8 +20,6 @@ An example for FValue testing.
Run with:
bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index 4938a90418..1441faa792 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
diff --git a/examples/src/main/python/ml/generalized_linear_regression_example.py b/examples/src/main/python/ml/generalized_linear_regression_example.py
index a52f4650c1..06a8a5a2e9 100644
--- a/examples/src/main/python/ml/generalized_linear_regression_example.py
+++ b/examples/src/main/python/ml/generalized_linear_regression_example.py
@@ -20,8 +20,6 @@ An example demonstrating generalized linear regression.
Run with:
bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.regression import GeneralizedLinearRegression
diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
index c2042fd7b7..a7efa2170a 100644
--- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
+++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
@@ -18,8 +18,6 @@
"""
Gradient Boosted Tree Classifier Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
index cc96c973e4..5e09b96c1e 100644
--- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
+++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
@@ -18,8 +18,6 @@
"""
Gradient Boosted Tree Regressor Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 33d104e8e3..98bdb89ce3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
diff --git a/examples/src/main/python/ml/interaction_example.py b/examples/src/main/python/ml/interaction_example.py
index 4b63227191..ac365179b0 100644
--- a/examples/src/main/python/ml/interaction_example.py
+++ b/examples/src/main/python/ml/interaction_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import Interaction, VectorAssembler
# $example off$
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index 89cba9dfc7..d7b893894f 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -21,8 +21,6 @@ Isotonic Regression Example.
Run with:
bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.regression import IsotonicRegression
# $example off$
diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py
index 80a878af67..47223fd953 100644
--- a/examples/src/main/python/ml/kmeans_example.py
+++ b/examples/src/main/python/ml/kmeans_example.py
@@ -22,8 +22,6 @@ Run with:
This example requires NumPy (http://www.numpy.org/).
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py
index 97d1a042d1..a47dfa383c 100644
--- a/examples/src/main/python/ml/lda_example.py
+++ b/examples/src/main/python/ml/lda_example.py
@@ -20,8 +20,6 @@ An example demonstrating LDA.
Run with:
bin/spark-submit examples/src/main/python/ml/lda_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.clustering import LDA
# $example off$
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 6639e9160a..864fc76cff 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.regression import LinearRegression
# $example off$
diff --git a/examples/src/main/python/ml/linearsvc.py b/examples/src/main/python/ml/linearsvc.py
index 9b79abbf96..61d726cf3f 100644
--- a/examples/src/main/python/ml/linearsvc.py
+++ b/examples/src/main/python/ml/linearsvc.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import LinearSVC
# $example off$
diff --git a/examples/src/main/python/ml/logistic_regression_summary_example.py b/examples/src/main/python/ml/logistic_regression_summary_example.py
index 2274ff707b..6d045108da 100644
--- a/examples/src/main/python/ml/logistic_regression_summary_example.py
+++ b/examples/src/main/python/ml/logistic_regression_summary_example.py
@@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary.
Run with:
bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
diff --git a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py
index d095fbd373..916fdade27 100644
--- a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index 45eda3cdad..d7ff3561ce 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/min_hash_lsh_example.py b/examples/src/main/python/ml/min_hash_lsh_example.py
index 93136e6ae3..683f97a055 100644
--- a/examples/src/main/python/ml/min_hash_lsh_example.py
+++ b/examples/src/main/python/ml/min_hash_lsh_example.py
@@ -20,8 +20,6 @@ An example demonstrating MinHashLSH.
Run with:
bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index b5f272e59b..cd74243699 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py
index bec9860c79..3bb4a72864 100644
--- a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 88fc69f753..74f5321935 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 31676e076a..8c8031b939 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import NGram
# $example off$
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index 7290ab81cd..8d1777c6f9 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 510bd825fd..2aa012961a 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py
index 956e94ae4a..4cae1a9980 100644
--- a/examples/src/main/python/ml/one_vs_rest_example.py
+++ b/examples/src/main/python/ml/one_vs_rest_example.py
@@ -21,8 +21,6 @@ using Logistic Regression as the base classifier.
Run with:
bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 73775b79e3..6deb84ed78 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import OneHotEncoder
# $example off$
diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py
index 38746aced0..03fb709c8e 100644
--- a/examples/src/main/python/ml/pca_example.py
+++ b/examples/src/main/python/ml/pca_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index 40bcb7b13a..75f436e768 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py
index 0fc1d1949a..82be3936d2 100644
--- a/examples/src/main/python/ml/quantile_discretizer_example.py
+++ b/examples/src/main/python/ml/quantile_discretizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py
index 4eaa94dd7f..8983d1f2e9 100644
--- a/examples/src/main/python/ml/random_forest_classifier_example.py
+++ b/examples/src/main/python/ml/random_forest_classifier_example.py
@@ -18,8 +18,6 @@
"""
Random Forest Classifier Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py
index a34edff2ec..b9306ddf2f 100644
--- a/examples/src/main/python/ml/random_forest_regressor_example.py
+++ b/examples/src/main/python/ml/random_forest_regressor_example.py
@@ -18,8 +18,6 @@
"""
Random Forest Regressor Example.
"""
-from __future__ import print_function
-
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py
index 6629239db2..25bb6dac56 100644
--- a/examples/src/main/python/ml/rformula_example.py
+++ b/examples/src/main/python/ml/rformula_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import RFormula
# $example off$
diff --git a/examples/src/main/python/ml/robust_scaler_example.py b/examples/src/main/python/ml/robust_scaler_example.py
index 435e9ccb80..9f7c6d6507 100644
--- a/examples/src/main/python/ml/robust_scaler_example.py
+++ b/examples/src/main/python/ml/robust_scaler_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import RobustScaler
# $example off$
diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py
index 0bf8f35720..c8ac5c46aa 100644
--- a/examples/src/main/python/ml/sql_transformer.py
+++ b/examples/src/main/python/ml/sql_transformer.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py
index c0027480e6..9021c10075 100644
--- a/examples/src/main/python/ml/standard_scaler_example.py
+++ b/examples/src/main/python/ml/standard_scaler_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 3b8e7855e3..832a7c7d0a 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$
diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py
index 2255bfb9c1..f2ac63eabd 100644
--- a/examples/src/main/python/ml/string_indexer_example.py
+++ b/examples/src/main/python/ml/string_indexer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import StringIndexer
# $example off$
diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py
index 8835f189a1..4982746450 100644
--- a/examples/src/main/python/ml/summarizer_example.py
+++ b/examples/src/main/python/ml/summarizer_example.py
@@ -20,8 +20,6 @@ An example for summarizer.
Run with:
bin/spark-submit examples/src/main/python/ml/summarizer_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.stat import Summarizer
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index d43244fa68..b4bb0dfa31 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# $example off$
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 5c65c5c9f8..c6b5fac227 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
diff --git a/examples/src/main/python/ml/variance_threshold_selector_example.py b/examples/src/main/python/ml/variance_threshold_selector_example.py
index b7edb86653..0a996e0e28 100644
--- a/examples/src/main/python/ml/variance_threshold_selector_example.py
+++ b/examples/src/main/python/ml/variance_threshold_selector_example.py
@@ -20,8 +20,6 @@ An example for VarianceThresholdSelector.
Run with:
bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import VarianceThresholdSelector
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index 98de1d5ea7..0ce31cf0ea 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 5c2956077d..51a4191606 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$
diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py
index fb77dacec6..355d85aee8 100644
--- a/examples/src/main/python/ml/vector_size_hint_example.py
+++ b/examples/src/main/python/ml/vector_size_hint_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py
index 68c8cfe27e..86e089d152 100644
--- a/examples/src/main/python/ml/vector_slicer_example.py
+++ b/examples/src/main/python/ml/vector_slicer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 77f8951df0..0eabeda3dc 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from pyspark.ml.feature import Word2Vec
# $example off$
diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py
index d14ce7982e..741746e6e3 100644
--- a/examples/src/main/python/mllib/binary_classification_metrics_example.py
+++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py
@@ -17,7 +17,6 @@
"""
Binary Classification Metrics Example.
"""
-from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py
index 36e36fc689..d7b6ad9d42 100644
--- a/examples/src/main/python/mllib/bisecting_k_means_example.py
+++ b/examples/src/main/python/mllib/bisecting_k_means_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from numpy import array
# $example off$
diff --git a/examples/src/main/python/mllib/correlations.py b/examples/src/main/python/mllib/correlations.py
index 089504fa70..27d07b22a5 100755
--- a/examples/src/main/python/mllib/correlations.py
+++ b/examples/src/main/python/mllib/correlations.py
@@ -18,8 +18,6 @@
"""
Correlations using MLlib.
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py
index 66d18f6e5d..bb71b96868 100644
--- a/examples/src/main/python/mllib/correlations_example.py
+++ b/examples/src/main/python/mllib/correlations_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
import numpy as np
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/decision_tree_classification_example.py b/examples/src/main/python/mllib/decision_tree_classification_example.py
index 7eecf50058..009e393226 100644
--- a/examples/src/main/python/mllib/decision_tree_classification_example.py
+++ b/examples/src/main/python/mllib/decision_tree_classification_example.py
@@ -18,8 +18,6 @@
"""
Decision Tree Classification Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
diff --git a/examples/src/main/python/mllib/decision_tree_regression_example.py b/examples/src/main/python/mllib/decision_tree_regression_example.py
index acf9e25fdf..71dfbf0790 100644
--- a/examples/src/main/python/mllib/decision_tree_regression_example.py
+++ b/examples/src/main/python/mllib/decision_tree_regression_example.py
@@ -18,8 +18,6 @@
"""
Decision Tree Regression Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
diff --git a/examples/src/main/python/mllib/elementwise_product_example.py b/examples/src/main/python/mllib/elementwise_product_example.py
index 8ae9afb1dc..15e6a43f73 100644
--- a/examples/src/main/python/mllib/elementwise_product_example.py
+++ b/examples/src/main/python/mllib/elementwise_product_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import ElementwiseProduct
diff --git a/examples/src/main/python/mllib/gaussian_mixture_example.py b/examples/src/main/python/mllib/gaussian_mixture_example.py
index a60e799d62..3b19478f45 100644
--- a/examples/src/main/python/mllib/gaussian_mixture_example.py
+++ b/examples/src/main/python/mllib/gaussian_mixture_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from numpy import array
# $example off$
diff --git a/examples/src/main/python/mllib/gaussian_mixture_model.py b/examples/src/main/python/mllib/gaussian_mixture_model.py
index 6b46e27dda..96ce6b6f6a 100644
--- a/examples/src/main/python/mllib/gaussian_mixture_model.py
+++ b/examples/src/main/python/mllib/gaussian_mixture_model.py
@@ -18,11 +18,6 @@
"""
A Gaussian Mixture Model clustering program using MLlib.
"""
-from __future__ import print_function
-
-import sys
-if sys.version >= '3':
- long = int
import random
import argparse
@@ -53,7 +48,7 @@ if __name__ == "__main__":
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
parser.add_argument('--seed', default=random.getrandbits(19),
- type=long, help='Random seed')
+ type=int, help='Random seed')
args = parser.parse_args()
conf = SparkConf().setAppName("GMM")
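Same rationale for the argparse change above; a self-contained sketch:

```python
import argparse
import random

# type=int parses seeds of any magnitude on Python 3, matching what
# type=long accepted on Python 2.
parser = argparse.ArgumentParser()
parser.add_argument('--seed', default=random.getrandbits(19), type=int)
print(parser.parse_args(['--seed', '123456789012345']).seed)
```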
diff --git a/examples/src/main/python/mllib/gradient_boosting_classification_example.py b/examples/src/main/python/mllib/gradient_boosting_classification_example.py
index 65a03572be..eb12f20619 100644
--- a/examples/src/main/python/mllib/gradient_boosting_classification_example.py
+++ b/examples/src/main/python/mllib/gradient_boosting_classification_example.py
@@ -18,8 +18,6 @@
"""
Gradient Boosted Trees Classification Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
diff --git a/examples/src/main/python/mllib/gradient_boosting_regression_example.py b/examples/src/main/python/mllib/gradient_boosting_regression_example.py
index 877f8ab461..eb59a992df 100644
--- a/examples/src/main/python/mllib/gradient_boosting_regression_example.py
+++ b/examples/src/main/python/mllib/gradient_boosting_regression_example.py
@@ -18,8 +18,6 @@
"""
Gradient Boosted Trees Regression Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
index 21a5584fd6..321be8b76f 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
index ef380dee79..12a186900e 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
diff --git a/examples/src/main/python/mllib/isotonic_regression_example.py b/examples/src/main/python/mllib/isotonic_regression_example.py
index f5322d79c4..a5a0cfeae9 100644
--- a/examples/src/main/python/mllib/isotonic_regression_example.py
+++ b/examples/src/main/python/mllib/isotonic_regression_example.py
@@ -18,8 +18,6 @@
"""
Isotonic Regression Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
import math
diff --git a/examples/src/main/python/mllib/k_means_example.py b/examples/src/main/python/mllib/k_means_example.py
index d6058f4502..ead1e56de5 100644
--- a/examples/src/main/python/mllib/k_means_example.py
+++ b/examples/src/main/python/mllib/k_means_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
# $example on$
from numpy import array
from math import sqrt
diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py
index 3e8f7241a4..22d1917160 100644
--- a/examples/src/main/python/mllib/kernel_density_estimation_example.py
+++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import KernelDensity
diff --git a/examples/src/main/python/mllib/kmeans.py b/examples/src/main/python/mllib/kmeans.py
index 1bdb3e9b4a..2560384b6a 100755
--- a/examples/src/main/python/mllib/kmeans.py
+++ b/examples/src/main/python/mllib/kmeans.py
@@ -20,8 +20,6 @@ A K-means clustering program using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""
-from __future__ import print_function
-
import sys
import numpy as np
diff --git a/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py b/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py
index 2a1bef5f20..f82a28aadc 100644
--- a/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py
+++ b/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import LDA, LDAModel
diff --git a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py
index 6744463d40..cb67396332 100644
--- a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py
+++ b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py
@@ -18,8 +18,6 @@
"""
Linear Regression With SGD Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py
index 87efe17375..7b90615a53 100755
--- a/examples/src/main/python/mllib/logistic_regression.py
+++ b/examples/src/main/python/mllib/logistic_regression.py
@@ -20,8 +20,6 @@ Logistic regression using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py
index c9b768b314..ac5ab1d1b5 100644
--- a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py
+++ b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py
@@ -18,8 +18,6 @@
"""
Logistic Regression With LBFGS Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py
index a29fcccac5..74d18233d5 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -22,8 +22,6 @@ Usage:
`spark-submit --master local[4] examples/src/main/python/mllib/naive_bayes_example.py`
"""
-from __future__ import print_function
-
import shutil
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/normalizer_example.py b/examples/src/main/python/mllib/normalizer_example.py
index a4e028ca9a..d46110d9a0 100644
--- a/examples/src/main/python/mllib/normalizer_example.py
+++ b/examples/src/main/python/mllib/normalizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import Normalizer
diff --git a/examples/src/main/python/mllib/power_iteration_clustering_example.py b/examples/src/main/python/mllib/power_iteration_clustering_example.py
index ca19c0ccb6..60eedef5fa 100644
--- a/examples/src/main/python/mllib/power_iteration_clustering_example.py
+++ b/examples/src/main/python/mllib/power_iteration_clustering_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
diff --git a/examples/src/main/python/mllib/random_forest_classification_example.py b/examples/src/main/python/mllib/random_forest_classification_example.py
index 5ac67520da..a929c10d5a 100644
--- a/examples/src/main/python/mllib/random_forest_classification_example.py
+++ b/examples/src/main/python/mllib/random_forest_classification_example.py
@@ -18,8 +18,6 @@
"""
Random Forest Classification Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import RandomForest, RandomForestModel
diff --git a/examples/src/main/python/mllib/random_forest_regression_example.py b/examples/src/main/python/mllib/random_forest_regression_example.py
index 7e986a0d30..4e05937768 100644
--- a/examples/src/main/python/mllib/random_forest_regression_example.py
+++ b/examples/src/main/python/mllib/random_forest_regression_example.py
@@ -18,8 +18,6 @@
"""
Random Forest Regression Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import RandomForest, RandomForestModel
diff --git a/examples/src/main/python/mllib/random_rdd_generation.py b/examples/src/main/python/mllib/random_rdd_generation.py
index 9a429b5f8a..49afcfe939 100755
--- a/examples/src/main/python/mllib/random_rdd_generation.py
+++ b/examples/src/main/python/mllib/random_rdd_generation.py
@@ -18,8 +18,6 @@
"""
Randomly generated RDDs.
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/recommendation_example.py b/examples/src/main/python/mllib/recommendation_example.py
index 00e683c3ae..719f3f904b 100644
--- a/examples/src/main/python/mllib/recommendation_example.py
+++ b/examples/src/main/python/mllib/recommendation_example.py
@@ -18,8 +18,6 @@
"""
Collaborative Filtering Classification Example.
"""
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
diff --git a/examples/src/main/python/mllib/sampled_rdds.py b/examples/src/main/python/mllib/sampled_rdds.py
index 00e7cf4bbc..9095c2b2d7 100755
--- a/examples/src/main/python/mllib/sampled_rdds.py
+++ b/examples/src/main/python/mllib/sampled_rdds.py
@@ -18,8 +18,6 @@
"""
Randomly sampled RDDs.
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py
index 11ed34427d..c8fd64dfbb 100644
--- a/examples/src/main/python/mllib/standard_scaler_example.py
+++ b/examples/src/main/python/mllib/standard_scaler_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler
diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py
index a13f8f08dd..2d29f74a19 100644
--- a/examples/src/main/python/mllib/stratified_sampling_example.py
+++ b/examples/src/main/python/mllib/stratified_sampling_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
if __name__ == "__main__":
diff --git a/examples/src/main/python/mllib/streaming_k_means_example.py b/examples/src/main/python/mllib/streaming_k_means_example.py
index e82509ad3f..4904a9ebcf 100644
--- a/examples/src/main/python/mllib/streaming_k_means_example.py
+++ b/examples/src/main/python/mllib/streaming_k_means_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
# $example on$
diff --git a/examples/src/main/python/mllib/streaming_linear_regression_example.py b/examples/src/main/python/mllib/streaming_linear_regression_example.py
index 714c9a0de7..1d52e00fbf 100644
--- a/examples/src/main/python/mllib/streaming_linear_regression_example.py
+++ b/examples/src/main/python/mllib/streaming_linear_regression_example.py
@@ -18,8 +18,6 @@
"""
Streaming Linear Regression Example.
"""
-from __future__ import print_function
-
# $example on$
import sys
# $example off$
diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py
index d55d1a2c2d..d86e841145 100644
--- a/examples/src/main/python/mllib/summary_statistics_example.py
+++ b/examples/src/main/python/mllib/summary_statistics_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
import numpy as np
diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py
index b66412b233..4449066f5b 100644
--- a/examples/src/main/python/mllib/tf_idf_example.py
+++ b/examples/src/main/python/mllib/tf_idf_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import HashingTF, IDF
diff --git a/examples/src/main/python/mllib/word2vec.py b/examples/src/main/python/mllib/word2vec.py
index 4e7d4f7610..3e5720b4df 100644
--- a/examples/src/main/python/mllib/word2vec.py
+++ b/examples/src/main/python/mllib/word2vec.py
@@ -23,8 +23,6 @@
# grep -o -E '\w+(\W+\w+){0,15}' text8 > text8_lines
# This was done so that the example can be run in local mode
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/mllib/word2vec_example.py b/examples/src/main/python/mllib/word2vec_example.py
index ad1090c77e..d37a6e7137 100644
--- a/examples/src/main/python/mllib/word2vec_example.py
+++ b/examples/src/main/python/mllib/word2vec_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import Word2Vec
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index 2c19e8700a..0ab7249a82 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -22,8 +22,6 @@ Please refer to PageRank implementation provided by graphx
Example Usage:
bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
"""
-from __future__ import print_function
-
import re
import sys
from operator import add
diff --git a/examples/src/main/python/parquet_inputformat.py b/examples/src/main/python/parquet_inputformat.py
index 83041f0040..ca8dd25e6d 100644
--- a/examples/src/main/python/parquet_inputformat.py
+++ b/examples/src/main/python/parquet_inputformat.py
@@ -29,8 +29,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \\
{u'favorite_color': u'red', u'name': u'Ben', u'favorite_numbers': []}
<...more log output...>
"""
-from __future__ import print_function
-
import sys
from pyspark.sql import SparkSession
diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py
index 5839cc2874..e646722533 100755
--- a/examples/src/main/python/pi.py
+++ b/examples/src/main/python/pi.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
import sys
from random import random
from operator import add
diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py
index d3cd985d19..9efb00a6f1 100755
--- a/examples/src/main/python/sort.py
+++ b/examples/src/main/python/sort.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
import sys
from pyspark.sql import SparkSession
diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py
index b7d8467172..e46449dbef 100644
--- a/examples/src/main/python/sql/arrow.py
+++ b/examples/src/main/python/sql/arrow.py
@@ -21,21 +21,12 @@ Run with:
./bin/spark-submit examples/src/main/python/sql/arrow.py
"""
-from __future__ import print_function
-
-import sys
-
from pyspark.sql import SparkSession
from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
require_minimum_pandas_version()
require_minimum_pyarrow_version()
-if sys.version_info < (3, 6):
- raise Exception(
- "Running this example file requires Python 3.6+; however, "
- "your Python version was:\n %s" % sys.version)
-
def dataframe_with_arrow_example(spark):
# $example on:dataframe_with_arrow$
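
With the Python 2 paths gone, arrow.py also drops its inline interpreter-version guard and the sys import that existed only to support it. The removed guard used the standard tuple-comparison idiom on sys.version_info; a standalone sketch of that idiom, kept here for reference:

    import sys

    # sys.version_info compares elementwise as a tuple, so a minimum-version
    # check is a single comparison.
    if sys.version_info < (3, 6):
        raise Exception(
            "Running this example file requires Python 3.6+; however, "
            "your Python version was:\n %s" % sys.version)
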
diff --git a/examples/src/main/python/sql/basic.py b/examples/src/main/python/sql/basic.py
index c8fb25d053..eba8e6ad99 100644
--- a/examples/src/main/python/sql/basic.py
+++ b/examples/src/main/python/sql/basic.py
@@ -20,8 +20,6 @@ A simple example demonstrating basic Spark SQL features.
Run with:
./bin/spark-submit examples/src/main/python/sql/basic.py
"""
-from __future__ import print_function
-
# $example on:init_session$
from pyspark.sql import SparkSession
# $example off:init_session$
diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py
index 265f135e1e..94a41a7e5e 100644
--- a/examples/src/main/python/sql/datasource.py
+++ b/examples/src/main/python/sql/datasource.py
@@ -20,8 +20,6 @@ A simple example demonstrating Spark SQL data sources.
Run with:
./bin/spark-submit examples/src/main/python/sql/datasource.py
"""
-from __future__ import print_function
-
from pyspark.sql import SparkSession
# $example on:schema_merging$
from pyspark.sql import Row
diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py
index e96a8af71a..bc23dcd9bd 100644
--- a/examples/src/main/python/sql/hive.py
+++ b/examples/src/main/python/sql/hive.py
@@ -20,8 +20,6 @@ A simple example demonstrating Spark SQL Hive integration.
Run with:
./bin/spark-submit examples/src/main/python/sql/hive.py
"""
-from __future__ import print_function
-
# $example on:spark_hive$
from os.path import join, abspath
diff --git a/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py b/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
index 9210678913..40a955a46c 100644
--- a/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
+++ b/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
@@ -36,8 +36,6 @@
`$ bin/spark-submit examples/src/main/python/sql/streaming/structured_kafka_wordcount.py \
host1:port1,host2:port2 subscribe topic1,topic2`
"""
-from __future__ import print_function
-
import sys
from pyspark.sql import SparkSession
diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount.py b/examples/src/main/python/sql/streaming/structured_network_wordcount.py
index 9ac3921647..c8f43c9dcf 100644
--- a/examples/src/main/python/sql/streaming/structured_network_wordcount.py
+++ b/examples/src/main/python/sql/streaming/structured_network_wordcount.py
@@ -27,8 +27,6 @@ r"""
`$ bin/spark-submit examples/src/main/python/sql/streaming/structured_network_wordcount.py
localhost 9999`
"""
-from __future__ import print_function
-
import sys
from pyspark.sql import SparkSession
diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
index c4e3bbf44c..cc39d8afa6 100644
--- a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
+++ b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
@@ -39,8 +39,6 @@ r"""
One recommended <window duration>, <slide duration> pair is 10, 5
"""
-from __future__ import print_function
-
import sys
from pyspark.sql import SparkSession
diff --git a/examples/src/main/python/status_api_demo.py b/examples/src/main/python/status_api_demo.py
index 8cc8cc820c..7b408c8726 100644
--- a/examples/src/main/python/status_api_demo.py
+++ b/examples/src/main/python/status_api_demo.py
@@ -15,15 +15,10 @@
# limitations under the License.
#
-from __future__ import print_function
-
import time
import threading
import sys
-if sys.version >= '3':
- import queue as Queue
-else:
- import Queue
+import queue as Queue
from pyspark import SparkConf, SparkContext
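
status_api_demo.py shows the other recurring pattern in this patch: Python 2's Queue module was renamed to queue in Python 3, so the version-conditional import collapses to a single aliased import and the rest of the file keeps using the Queue name. A small standalone sketch of the module under that alias (the queue contents here are illustrative):

    import queue as Queue

    q = Queue.Queue()      # thread-safe FIFO queue
    q.put("job-1")
    q.put("job-2")
    print(q.get())         # -> job-1
    print(q.qsize())       # -> 1
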
diff --git a/examples/src/main/python/streaming/hdfs_wordcount.py b/examples/src/main/python/streaming/hdfs_wordcount.py
index f9a5c43a8e..fac07727b7 100644
--- a/examples/src/main/python/streaming/hdfs_wordcount.py
+++ b/examples/src/main/python/streaming/hdfs_wordcount.py
@@ -25,8 +25,6 @@
Then create a text file in `localdir` and the words in the file will get counted.
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/streaming/network_wordcount.py b/examples/src/main/python/streaming/network_wordcount.py
index f3099d2517..b57f4e9e38 100644
--- a/examples/src/main/python/streaming/network_wordcount.py
+++ b/examples/src/main/python/streaming/network_wordcount.py
@@ -25,8 +25,6 @@ r"""
and then run the example
`$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999`
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/streaming/network_wordjoinsentiments.py b/examples/src/main/python/streaming/network_wordjoinsentiments.py
index 2b5434c0c8..5b03546fb4 100644
--- a/examples/src/main/python/streaming/network_wordjoinsentiments.py
+++ b/examples/src/main/python/streaming/network_wordjoinsentiments.py
@@ -30,8 +30,6 @@ r"""
localhost 9999`
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/streaming/recoverable_network_wordcount.py b/examples/src/main/python/streaming/recoverable_network_wordcount.py
index a39c4d0b5b..8424556e88 100644
--- a/examples/src/main/python/streaming/recoverable_network_wordcount.py
+++ b/examples/src/main/python/streaming/recoverable_network_wordcount.py
@@ -35,8 +35,6 @@
checkpoint data exists in ~/checkpoint/, then it will create StreamingContext from
the checkpoint data.
"""
-from __future__ import print_function
-
import os
import sys
diff --git a/examples/src/main/python/streaming/sql_network_wordcount.py b/examples/src/main/python/streaming/sql_network_wordcount.py
index ab3cfc0679..59a8a11a45 100644
--- a/examples/src/main/python/streaming/sql_network_wordcount.py
+++ b/examples/src/main/python/streaming/sql_network_wordcount.py
@@ -27,8 +27,6 @@ r"""
and then run the example
`$ bin/spark-submit examples/src/main/python/streaming/sql_network_wordcount.py localhost 9999`
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/streaming/stateful_network_wordcount.py b/examples/src/main/python/streaming/stateful_network_wordcount.py
index d5d1eba6c5..7a45be663a 100644
--- a/examples/src/main/python/streaming/stateful_network_wordcount.py
+++ b/examples/src/main/python/streaming/stateful_network_wordcount.py
@@ -29,8 +29,6 @@ r"""
`$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \
localhost 9999`
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/examples/src/main/python/transitive_closure.py b/examples/src/main/python/transitive_closure.py
index 49551d4085..9f543daecd 100755
--- a/examples/src/main/python/transitive_closure.py
+++ b/examples/src/main/python/transitive_closure.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
import sys
from random import Random
diff --git a/examples/src/main/python/wordcount.py b/examples/src/main/python/wordcount.py
index a05e24ff3f..037c1e8aa3 100755
--- a/examples/src/main/python/wordcount.py
+++ b/examples/src/main/python/wordcount.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
import sys
from operator import add
diff --git a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
index 5370b79389..df8c64e531 100644
--- a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
+++ b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
@@ -55,8 +55,6 @@
See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
the Kinesis Spark Streaming integration.
"""
-from __future__ import print_function
-
import sys
from pyspark import SparkContext
diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py
index a5d513262b..2a19d233bc 100644
--- a/python/pyspark/accumulators.py
+++ b/python/pyspark/accumulators.py
@@ -89,10 +89,7 @@ TypeError:...
import sys
import select
import struct
-if sys.version < '3':
- import SocketServer
-else:
- import socketserver as SocketServer
+import socketserver as SocketServer
import threading
from pyspark.serializers import read_int, PickleSerializer
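
accumulators.py gets the same treatment for SocketServer, which Python 3 renamed to socketserver; aliasing the new name preserves every existing SocketServer.* reference in the file. A self-contained sketch of the aliased module (the handler and server below are illustrative, not Spark's accumulator server):

    import socketserver as SocketServer

    class EchoHandler(SocketServer.StreamRequestHandler):
        def handle(self):
            # rfile/wfile are file-like wrappers around the client socket
            self.wfile.write(self.rfile.readline())

    # Bind to an ephemeral port just to show setup; serve_forever() would block.
    server = SocketServer.TCPServer(("127.0.0.1", 0), EchoHandler)
    print("listening on", server.server_address)
    server.server_close()
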
diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py
index 803d857055..c2daf7600f 100644
--- a/python/pyspark/broadcast.py
+++ b/python/pyspark/broadcast.py
@@ -20,16 +20,12 @@ import os
import sys
from tempfile import NamedTemporaryFile
import threading
+import pickle
from pyspark.java_gateway import local_connect_and_auth
from pyspark.serializers import ChunkedStream, pickle_protocol
-from pyspark.util import _exception_message, print_exec
+from pyspark.util import print_exec
-if sys.version < '3':
- import cPickle as pickle
-else:
- import pickle
- unicode = str
__all__ = ['Broadcast']
@@ -113,7 +109,7 @@ class Broadcast(object):
raise
except Exception as e:
msg = "Could not serialize broadcast: %s: %s" \
- % (e.__class__.__name__, _exception_message(e))
+ % (e.__class__.__name__, str(e))
print_exec(sys.stderr)
raise pickle.PicklingError(msg)
f.close()
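
Two simplifications land together in broadcast.py: pickle is imported unconditionally (Python 3 folded the C-accelerated cPickle into pickle itself), and the compatibility helper _exception_message(e) becomes plain str(e), which is all it did on Python 3 anyway. A standalone sketch of the new error path:

    import pickle

    try:
        pickle.dumps(lambda x: x)  # lambdas are not picklable
    except Exception as e:
        msg = "Could not serialize broadcast: %s: %s" % (e.__class__.__name__, str(e))
        print(msg)
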
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index 2024260868..efd8b6d633 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -22,14 +22,14 @@
>>> conf.setMaster("local").setAppName("My app")
>>> conf.get("spark.master")
-u'local'
+'local'
>>> conf.get("spark.app.name")
-u'My app'
+'My app'
>>> sc = SparkContext(conf=conf)
>>> sc.master
-u'local'
+'local'
>>> sc.appName
-u'My app'
+'My app'
>>> sc.sparkHome is None
True
@@ -37,21 +37,21 @@ True
>>> conf.setSparkHome("/path")
>>> conf.get("spark.home")
-u'/path'
+'/path'
>>> conf.setExecutorEnv("VAR1", "value1")
>>> conf.setExecutorEnv(pairs = [("VAR3", "value3"), ("VAR4", "value4")])
>>> conf.get("spark.executorEnv.VAR1")
-u'value1'
+'value1'
>>> print(conf.toDebugString())
spark.executorEnv.VAR1=value1
spark.executorEnv.VAR3=value3
spark.executorEnv.VAR4=value4
spark.home=/path
>>> sorted(conf.getAll(), key=lambda p: p[0])
-[(u'spark.executorEnv.VAR1', u'value1'), (u'spark.executorEnv.VAR3', u'value3'), \
-(u'spark.executorEnv.VAR4', u'value4'), (u'spark.home', u'/path')]
+[('spark.executorEnv.VAR1', 'value1'), ('spark.executorEnv.VAR3', 'value3'), \
+('spark.executorEnv.VAR4', 'value4'), ('spark.home', '/path')]
>>> conf._jconf.setExecutorEnv("VAR5", "value5")
JavaObject id...
>>> print(conf.toDebugString())
@@ -65,11 +65,6 @@ spark.home=/path
__all__ = ['SparkConf']
import sys
-import re
-
-if sys.version > '3':
- unicode = str
- __doc__ = re.sub(r"(\W|^)[uU](['])", r'\1\2', __doc__)
class SparkConf(object):
@@ -124,9 +119,9 @@ class SparkConf(object):
"""Set a configuration property."""
# Try to set self._jconf first if JVM is created, set self._conf if JVM is not created yet.
if self._jconf is not None:
- self._jconf.set(key, unicode(value))
+ self._jconf.set(key, str(value))
else:
- self._conf[key] = unicode(value)
+ self._conf[key] = str(value)
return self
def setIfMissing(self, key, value):
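
The conf.py doctest edits follow from repr() semantics: Python 3 has a single text type, so string reprs never carry a u'' prefix and the expected outputs can be written plainly. The convention is easy to verify directly:

    # On Python 3 there is only one text type, so these reprs match
    # the updated doctest outputs exactly.
    assert repr("local") == "'local'"
    assert repr(("spark.home", "/path")) == "('spark.home', '/path')"
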
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 6d58e1d144..2e105cc382 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -21,6 +21,7 @@ import signal
import sys
import threading
import warnings
+import importlib
from threading import RLock
from tempfile import NamedTemporaryFile
@@ -37,15 +38,12 @@ from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deseria
PairDeserializer, AutoBatchedSerializer, NoOpSerializer, ChunkedStream
from pyspark.storagelevel import StorageLevel
from pyspark.resource.information import ResourceInformation
-from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix
+from pyspark.rdd import RDD, _load_from_socket
from pyspark.taskcontext import TaskContext
from pyspark.traceback_utils import CallSite, first_spark_call
from pyspark.status import StatusTracker
from pyspark.profiler import ProfilerCollector, BasicProfiler
-if sys.version > '3':
- xrange = range
-
__all__ = ['SparkContext']
@@ -213,15 +211,6 @@ class SparkContext(object):
self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
self.pythonVer = "%d.%d" % sys.version_info[:2]
- if sys.version_info < (3, 6):
- with warnings.catch_warnings():
- warnings.simplefilter("once")
- warnings.warn(
- "Support for Python 2 and Python 3 prior to version 3.6 is deprecated as "
- "of Spark 3.0. See also the plan for dropping Python 2 support at "
- "https://spark.apache.org/news/plan-for-dropping-python-2-support.html.",
- DeprecationWarning)
-
# Broadcast's __reduce__ method stores Broadcast instances here.
# This allows other code to determine which Broadcast instances have
# been pickled, so it can determine which Java broadcast objects to
@@ -398,7 +387,6 @@ class SparkContext(object):
return self._jsc.version()
@property
- @ignore_unicode_prefix
def applicationId(self):
"""
A unique identifier for the Spark application.
@@ -408,7 +396,7 @@ class SparkContext(object):
* in case of YARN something like 'application_1433865536131_34483'
>>> sc.applicationId # doctest: +ELLIPSIS
- u'local-...'
+ 'local-...'
"""
return self._jsc.sc().applicationId()
@@ -490,20 +478,20 @@ class SparkContext(object):
end = start
start = 0
- return self.parallelize(xrange(start, end, step), numSlices)
+ return self.parallelize(range(start, end, step), numSlices)
def parallelize(self, c, numSlices=None):
"""
- Distribute a local Python collection to form an RDD. Using xrange
+ Distribute a local Python collection to form an RDD. Using range
is recommended if the input represents a range for performance.
>>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
[[0], [2], [3], [4], [6]]
- >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
+ >>> sc.parallelize(range(0, 6, 2), 5).glom().collect()
[[], [0], [], [2], [4]]
"""
numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
- if isinstance(c, xrange):
+ if isinstance(c, range):
size = len(c)
if size == 0:
return self.parallelize([], numSlices)
@@ -522,7 +510,7 @@ class SparkContext(object):
# the empty iterator to a list, thus make sure worker reuse takes effect.
# See more details in SPARK-26549.
assert len(list(iterator)) == 0
- return xrange(getStart(split), getStart(split + 1), step)
+ return range(getStart(split), getStart(split + 1), step)
return self.parallelize([], numSlices).mapPartitionsWithIndex(f)
@@ -591,7 +579,6 @@ class SparkContext(object):
minPartitions = minPartitions or self.defaultMinPartitions
return RDD(self._jsc.objectFile(name, minPartitions), self)
- @ignore_unicode_prefix
def textFile(self, name, minPartitions=None, use_unicode=True):
"""
Read a text file from HDFS, a local file system (available on all
@@ -608,13 +595,12 @@ class SparkContext(object):
... _ = testFile.write("Hello world!")
>>> textFile = sc.textFile(path)
>>> textFile.collect()
- [u'Hello world!']
+ ['Hello world!']
"""
minPartitions = minPartitions or min(self.defaultParallelism, 2)
return RDD(self._jsc.textFile(name, minPartitions), self,
UTF8Deserializer(use_unicode))
- @ignore_unicode_prefix
def wholeTextFiles(self, path, minPartitions=None, use_unicode=True):
"""
Read a directory of text files from HDFS, a local file system
@@ -658,7 +644,7 @@ class SparkContext(object):
... _ = file2.write("2")
>>> textFiles = sc.wholeTextFiles(dirPath)
>>> sorted(textFiles.collect())
- [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')]
+ [('.../1.txt', '1'), ('.../2.txt', '2')]
"""
minPartitions = minPartitions or self.defaultMinPartitions
return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,
@@ -846,7 +832,6 @@ class SparkContext(object):
jrdd = self._jsc.checkpointFile(name)
return RDD(jrdd, self, input_deserializer)
- @ignore_unicode_prefix
def union(self, rdds):
"""
Build the union of a list of RDDs.
@@ -860,10 +845,10 @@ class SparkContext(object):
... _ = testFile.write("Hello")
>>> textFile = sc.textFile(path)
>>> textFile.collect()
- [u'Hello']
+ ['Hello']
>>> parallelized = sc.parallelize(["World!"])
>>> sorted(sc.union([textFile, parallelized]).collect())
- [u'Hello', 'World!']
+ ['Hello', 'World!']
"""
first_jrdd_deserializer = rdds[0]._jrdd_deserializer
if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds):
@@ -959,9 +944,8 @@ class SparkContext(object):
self._python_includes.append(filename)
# for tests in local mode
sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))
- if sys.version > '3':
- import importlib
- importlib.invalidate_caches()
+
+ importlib.invalidate_caches()
def setCheckpointDir(self, dirName):
"""
diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py
index 52f6ea9a37..920c04009d 100755
--- a/python/pyspark/find_spark_home.py
+++ b/python/pyspark/find_spark_home.py
@@ -20,7 +20,6 @@
# This script attempts to determine the correct setting for SPARK_HOME given
# that Spark may have been installed on the system with pip.
-from __future__ import print_function
import os
import sys
@@ -41,26 +40,15 @@ def _find_spark_home():
# Add the path of the PySpark module if it exists
import_error_raised = False
- if sys.version < "3":
- import imp
- try:
- module_home = imp.find_module("pyspark")[1]
- paths.append(module_home)
- # If we are installed in edit mode also look two dirs up
- paths.append(os.path.join(module_home, "../../"))
- except ImportError:
- # Not pip installed no worries
- import_error_raised = True
- else:
- from importlib.util import find_spec
- try:
- module_home = os.path.dirname(find_spec("pyspark").origin)
- paths.append(module_home)
- # If we are installed in edit mode also look two dirs up
- paths.append(os.path.join(module_home, "../../"))
- except ImportError:
- # Not pip installed no worries
- import_error_raised = True
+ from importlib.util import find_spec
+ try:
+ module_home = os.path.dirname(find_spec("pyspark").origin)
+ paths.append(module_home)
+ # If we are installed in edit mode also look two dirs up
+ paths.append(os.path.join(module_home, "../../"))
+ except ImportError:
+ # Not pip installed no worries
+ import_error_raised = True
# Normalize the paths
paths = [os.path.abspath(p) for p in paths]
@@ -84,5 +72,6 @@ def _find_spark_home():
"'PYSPARK_PYTHON=python3 pyspark'.\n", file=sys.stderr)
sys.exit(-1)
+
if __name__ == "__main__":
print(_find_spark_home())
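
find_spark_home.py keeps only the Python 3 branch: importlib.util.find_spec returns a module spec whose origin attribute is, for a package, the path of its __init__.py. A standalone sketch of the lookup, using the stdlib json package as a stand-in since pyspark may not be importable wherever this runs:

    import os
    from importlib.util import find_spec

    spec = find_spec("json")                 # stand-in for "pyspark"
    module_home = os.path.dirname(spec.origin)
    print(module_home)                       # directory containing the package
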
diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py
index 0daf09b17a..fba92a96ae 100644
--- a/python/pyspark/java_gateway.py
+++ b/python/pyspark/java_gateway.py
@@ -17,7 +17,6 @@
import atexit
import os
-import sys
import signal
import shlex
import shutil
@@ -27,14 +26,10 @@ import tempfile
import time
from subprocess import Popen, PIPE
-if sys.version >= '3':
- xrange = range
-
from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters
from py4j.clientserver import ClientServer, JavaParameters, PythonParameters
from pyspark.find_spark_home import _find_spark_home
from pyspark.serializers import read_int, write_with_length, UTF8Deserializer
-from pyspark.util import _exception_message
def launch_gateway(conf=None, popen_kwargs=None):
@@ -197,7 +192,7 @@ def local_connect_and_auth(port, auth_secret):
_do_server_auth(sockfile, auth_secret)
return (sockfile, sock)
except socket.error as e:
- emsg = _exception_message(e)
+ emsg = str(e)
errors.append("tried to connect to %s, but an error occurred: %s" % (sa, emsg))
sock.close()
sock = None
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index cc8ce0567b..7c8cbe3a9f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -16,20 +16,20 @@
#
import operator
-import sys
+import warnings
from abc import ABCMeta, abstractmethod, abstractproperty
from multiprocessing.pool import ThreadPool
-from pyspark import since, keyword_only
+from pyspark import keyword_only
from pyspark.ml import Estimator, Predictor, PredictionModel, Model
from pyspark.ml.param.shared import *
from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
_TreeEnsembleModel, _RandomForestParams, _GBTParams, \
- _HasVarianceImpurity, _TreeClassifierParams, _TreeEnsembleParams
+ _HasVarianceImpurity, _TreeClassifierParams
from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel
from pyspark.ml.util import *
from pyspark.ml.base import _PredictorParams
-from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \
+from pyspark.ml.wrapper import JavaParams, \
JavaPredictor, JavaPredictionModel, JavaWrapper
from pyspark.ml.common import inherit_doc, _java2py, _py2java
from pyspark.ml.linalg import Vectors
diff --git a/python/pyspark/ml/common.py b/python/pyspark/ml/common.py
index 387c5d7309..4e1d7f93ae 100644
--- a/python/pyspark/ml/common.py
+++ b/python/pyspark/ml/common.py
@@ -15,11 +15,6 @@
# limitations under the License.
#
-import sys
-if sys.version >= '3':
- long = int
- unicode = str
-
import py4j.protocol
from py4j.protocol import Py4JJavaError
from py4j.java_gateway import JavaObject
@@ -79,7 +74,7 @@ def _py2java(sc, obj):
obj = [_py2java(sc, x) for x in obj]
elif isinstance(obj, JavaObject):
pass
- elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
+ elif isinstance(obj, (int, float, bool, bytes, str)):
pass
else:
data = bytearray(PickleSerializer().dumps(obj))
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 498629cea8..c52ea62686 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -15,12 +15,7 @@
# limitations under the License.
#
-import sys
-if sys.version > '3':
- basestring = str
-
from pyspark import since, keyword_only, SparkContext
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.ml.linalg import _convert_to_vector
from pyspark.ml.param.shared import *
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
@@ -2178,7 +2173,6 @@ class MinMaxScalerModel(JavaModel, _MinMaxScalerParams, JavaMLReadable, JavaMLWr
@inherit_doc
-@ignore_unicode_prefix
class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
A feature transformer that converts the input array of strings into an array of n-grams. Null
@@ -2196,15 +2190,15 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr
>>> ngram.setOutputCol("nGrams")
NGram...
>>> ngram.transform(df).head()
- Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e'])
+ Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b', 'b c', 'c d', 'd e'])
>>> # Change n-gram length
>>> ngram.setParams(n=4).transform(df).head()
- Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])
+ Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])
>>> # Temporarily modify output column.
>>> ngram.transform(df, {ngram.outputCol: "output"}).head()
- Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e'])
+ Row(inputTokens=['a', 'b', 'c', 'd', 'e'], output=['a b c d', 'b c d e'])
>>> ngram.transform(df).head()
- Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])
+ Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])
>>> # Must use keyword arguments to specify params.
>>> ngram.setParams("text")
Traceback (most recent call last):
@@ -3082,7 +3076,6 @@ class RobustScalerModel(JavaModel, _RobustScalerParams, JavaMLReadable, JavaMLWr
@inherit_doc
-@ignore_unicode_prefix
class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
A regex based tokenizer that extracts tokens either by using the
@@ -3099,15 +3092,15 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
>>> reTokenizer.setOutputCol("words")
RegexTokenizer...
>>> reTokenizer.transform(df).head()
- Row(text=u'A B c', words=[u'a', u'b', u'c'])
+ Row(text='A B c', words=['a', 'b', 'c'])
>>> # Change a parameter.
>>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
- Row(text=u'A B c', tokens=[u'a', u'b', u'c'])
+ Row(text='A B c', tokens=['a', 'b', 'c'])
>>> # Temporarily modify a parameter.
>>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
- Row(text=u'A B c', words=[u'a', u'b', u'c'])
+ Row(text='A B c', words=['a', 'b', 'c'])
>>> reTokenizer.transform(df).head()
- Row(text=u'A B c', tokens=[u'a', u'b', u'c'])
+ Row(text='A B c', tokens=['a', 'b', 'c'])
>>> # Must use keyword arguments to specify params.
>>> reTokenizer.setParams("text")
Traceback (most recent call last):
@@ -3935,7 +3928,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols,
@inherit_doc
-@ignore_unicode_prefix
class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
A tokenizer that converts the input string to lowercase and then
@@ -3946,15 +3938,15 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java
>>> tokenizer.setInputCol("text")
Tokenizer...
>>> tokenizer.transform(df).head()
- Row(text=u'a b c', words=[u'a', u'b', u'c'])
+ Row(text='a b c', words=['a', 'b', 'c'])
>>> # Change a parameter.
>>> tokenizer.setParams(outputCol="tokens").transform(df).head()
- Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+ Row(text='a b c', tokens=['a', 'b', 'c'])
>>> # Temporarily modify a parameter.
>>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()
- Row(text=u'a b c', words=[u'a', u'b', u'c'])
+ Row(text='a b c', words=['a', 'b', 'c'])
>>> tokenizer.transform(df).head()
- Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+ Row(text='a b c', tokens=['a', 'b', 'c'])
>>> # Must use keyword arguments to specify params.
>>> tokenizer.setParams("text")
Traceback (most recent call last):
@@ -4476,7 +4468,6 @@ class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCo
@inherit_doc
-@ignore_unicode_prefix
class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable):
"""
Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further
@@ -4505,7 +4496,7 @@ class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable):
+----+--------------------+
...
>>> model.findSynonymsArray("a", 2)
- [(u'b', 0.015859870240092278), (u'c', -0.5680795907974243)]
+ [('b', 0.015859870240092278), ('c', -0.5680795907974243)]
>>> from pyspark.sql.functions import format_number as fmt
>>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show()
+----+----------+
@@ -4668,7 +4659,7 @@ class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable):
Returns a dataframe with two fields word and similarity (which
gives the cosine similarity).
"""
- if not isinstance(word, basestring):
+ if not isinstance(word, str):
word = _convert_to_vector(word)
return self._call_java("findSynonyms", word, num)
@@ -4680,7 +4671,7 @@ class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable):
Returns an array with two fields word and similarity (which
gives the cosine similarity).
"""
- if not isinstance(word, basestring):
+ if not isinstance(word, str):
word = _convert_to_vector(word)
tuples = self._java_obj.findSynonymsArray(word, num)
return list(map(lambda st: (st._1(), st._2()), list(tuples)))
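
In Word2VecModel, the isinstance(word, basestring) gates become isinstance(word, str): findSynonyms and findSynonymsArray accept either a word or a vector, and the string test decides whether conversion is needed. The dispatch shape, sketched with a hypothetical converter standing in for _convert_to_vector:

    def find_synonyms(word, num, convert_to_vector=list):
        # Strings pass straight through; anything else is vectorized first.
        if not isinstance(word, str):
            word = convert_to_vector(word)
        return word, num

    print(find_synonyms("a", 2))           # -> ('a', 2)
    print(find_synonyms((0.1, 0.2), 2))    # -> ([0.1, 0.2], 2)
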
diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py
index 7a5591f3fb..b91788a82c 100644
--- a/python/pyspark/ml/fpm.py
+++ b/python/pyspark/ml/fpm.py
@@ -15,8 +15,7 @@
# limitations under the License.
#
-from pyspark import keyword_only, since
-from pyspark.rdd import ignore_unicode_prefix
+from pyspark import keyword_only
from pyspark.sql import DataFrame
from pyspark.ml.util import *
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams
@@ -132,7 +131,6 @@ class FPGrowthModel(JavaModel, _FPGrowthParams, JavaMLWritable, JavaMLReadable):
return self._call_java("associationRules")
-@ignore_unicode_prefix
class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable):
r"""
A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
@@ -193,7 +191,7 @@ class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable):
...
>>> new_data = spark.createDataFrame([(["t", "s"], )], ["items"])
>>> sorted(fpm.transform(new_data).first().newPrediction)
- [u'x', u'y', u'z']
+ ['x', 'y', 'z']
.. versionadded:: 2.2.0
"""
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index 4fb1036fba..20b24559b1 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -25,14 +25,13 @@
"""
import sys
-import warnings
import numpy as np
from distutils.version import LooseVersion
from pyspark import SparkContext
from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import SparkSession
__all__ = ["ImageSchema"]
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index a79d5e5dcb..8be440da4f 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -27,18 +27,8 @@ import sys
import array
import struct
-if sys.version >= '3':
- basestring = str
- xrange = range
- import copyreg as copy_reg
- long = int
-else:
- from itertools import izip as zip
- import copy_reg
-
import numpy as np
-from pyspark import since
from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
IntegerType, ByteType, BooleanType
@@ -47,13 +37,6 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices']
-if sys.version_info[:2] == (2, 7):
- # speed up pickling array in Python 2.7
- def fast_pickle_array(ar):
- return array.array, (ar.typecode, ar.tostring())
- copy_reg.pickle(array.array, fast_pickle_array)
-
-
# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
@@ -68,7 +51,7 @@ except:
def _convert_to_vector(l):
if isinstance(l, Vector):
return l
- elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
+ elif type(l) in (array.array, np.array, np.ndarray, list, tuple, range):
return DenseVector(l)
elif _have_scipy and scipy.sparse.issparse(l):
assert l.shape[1] == 1, "Expected column vector"
@@ -102,7 +85,7 @@ def _vector_size(v):
"""
if isinstance(v, Vector):
return len(v)
- elif type(v) in (array.array, list, tuple, xrange):
+ elif type(v) in (array.array, list, tuple, range):
return len(v)
elif type(v) == np.ndarray:
if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
@@ -415,7 +398,7 @@ class DenseVector(Vector):
elif isinstance(other, SparseVector):
if len(self) != other.size:
return False
- return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
+ return Vectors._equals(list(range(len(self))), self.array, other.indices, other.values)
return False
def __ne__(self, other):
@@ -520,7 +503,7 @@ class SparseVector(Vector):
self.indices = np.array(args[0], dtype=np.int32)
self.values = np.array(args[1], dtype=np.float64)
assert len(self.indices) == len(self.values), "index and value arrays not same length"
- for i in xrange(len(self.indices) - 1):
+ for i in range(len(self.indices) - 1):
if self.indices[i] >= self.indices[i + 1]:
raise TypeError(
"Indices %s and %s are not strictly increasing"
@@ -699,7 +682,7 @@ class SparseVector(Vector):
inds = self.indices
vals = self.values
entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
- for i in xrange(len(inds))])
+ for i in range(len(inds))])
return "SparseVector({0}, {{{1}}})".format(self.size, entries)
def __eq__(self, other):
@@ -709,7 +692,7 @@ class SparseVector(Vector):
elif isinstance(other, DenseVector):
if self.size != len(other):
return False
- return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
+ return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array)
return False
def __getitem__(self, index):
@@ -791,7 +774,7 @@ class Vectors(object):
>>> Vectors.dense(1.0, 2.0)
DenseVector([1.0, 2.0])
"""
- if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
+ if len(elements) == 1 and not isinstance(elements[0], (float, int)):
# it's list, numpy.array or other iterable object.
elements = elements[0]
return DenseVector(elements)
@@ -1124,7 +1107,7 @@ class SparseMatrix(Matrix):
Return a numpy.ndarray
"""
A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
- for k in xrange(self.colPtrs.size - 1):
+ for k in range(self.colPtrs.size - 1):
startptr = self.colPtrs[k]
endptr = self.colPtrs[k + 1]
if self.isTransposed:
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 1be8755c7b..96b07bfa5f 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -16,15 +16,10 @@
#
import array
import sys
-if sys.version > '3':
- basestring = str
- xrange = range
- unicode = str
-
from abc import ABCMeta
import copy
import numpy as np
from py4j.java_gateway import JavaObject
from pyspark.ml.linalg import DenseVector, Vector, Matrix
@@ -93,12 +88,12 @@ class TypeConverters(object):
@staticmethod
def _can_convert_to_list(value):
vtype = type(value)
- return vtype in [list, np.ndarray, tuple, xrange, array.array] or isinstance(value, Vector)
+ return vtype in [list, np.ndarray, tuple, range, array.array] or isinstance(value, Vector)
@staticmethod
def _can_convert_to_string(value):
vtype = type(value)
- return isinstance(value, basestring) or vtype in [np.unicode_, np.string_, np.str_]
+ return isinstance(value, str) or vtype in [np.unicode_, np.string_, np.str_]
@staticmethod
def identity(value):
@@ -114,7 +109,7 @@ class TypeConverters(object):
"""
if type(value) == list:
return value
- elif type(value) in [np.ndarray, tuple, xrange, array.array]:
+ elif type(value) in [np.ndarray, tuple, range, array.array]:
return list(value)
elif isinstance(value, Vector):
return list(value.toArray())
@@ -211,12 +206,10 @@ class TypeConverters(object):
"""
Convert a value to a string, if possible.
"""
- if isinstance(value, basestring):
+ if isinstance(value, str):
return value
- elif type(value) in [np.string_, np.str_]:
+ elif type(value) in [np.string_, np.str_, np.unicode_]:
return str(value)
- elif type(value) == np.unicode_:
- return unicode(value)
else:
raise TypeError("Could not convert %s to string type" % type(value))
@@ -338,7 +331,7 @@ class Params(Identifiable):
Tests whether this instance contains a param with a given
(string) name.
"""
- if isinstance(paramName, basestring):
+ if isinstance(paramName, str):
p = getattr(self, paramName, None)
return isinstance(p, Param)
else:
@@ -421,7 +414,7 @@ class Params(Identifiable):
if isinstance(param, Param):
self._shouldOwn(param)
return param
- elif isinstance(param, basestring):
+ elif isinstance(param, str):
return self.getParam(param)
else:
raise ValueError("Cannot resolve %r as a param." % param)
@@ -510,7 +503,7 @@ class Params(Identifiable):
:return: same instance, but with the uid and Param.parent values
updated, including within param maps
"""
- newUid = unicode(newUid)
+ newUid = str(newUid)
self.uid = newUid
newDefaultParamMap = dict()
newParamMap = dict()
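
The param type converters simplify the same way: str now covers everything basestring and unicode did, and the separate np.unicode_ branch merges into the str(value) path. A compact sketch mirroring the merged converter, assuming the NumPy generation pyspark targeted at the time (np.string_ and np.unicode_ were later removed in NumPy 2.0):

    import numpy as np

    def to_string(value):
        if isinstance(value, str):           # also catches np.str_, a str subclass
            return value
        elif type(value) in [np.string_, np.str_, np.unicode_]:
            return str(value)
        raise TypeError("Could not convert %s to string type" % type(value))

    assert to_string("abc") == "abc"
    assert to_string(np.str_("abc")) == "abc"
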
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 2086e831f4..bc1ea87ad6 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
header = """#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py
index 53d07ec966..eacb8b82b5 100644
--- a/python/pyspark/ml/pipeline.py
+++ b/python/pyspark/ml/pipeline.py
@@ -16,12 +16,8 @@
#
import sys
-import os
-if sys.version > '3':
- basestring = str
-
-from pyspark import since, keyword_only, SparkContext
+from pyspark import keyword_only
from pyspark.ml.base import Estimator, Model, Transformer
from pyspark.ml.param import Param, Params
from pyspark.ml.util import *
diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py
index 4c6bfa696b..7856a317c2 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -19,9 +19,6 @@
import sys
import unittest
-if sys.version > '3':
- basestring = str
-
from pyspark.ml.feature import Binarizer, CountVectorizer, CountVectorizerModel, HashingTF, IDF, \
NGram, RFormula, StopWordsRemover, StringIndexer, StringIndexerModel, VectorSizeHint
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
@@ -91,7 +88,7 @@ class FeatureTests(SparkSessionTestCase):
transformedDF = stopWordRemover.transform(dataset)
self.assertEqual(transformedDF.head().output, ["panda"])
self.assertEqual(type(stopWordRemover.getStopWords()), list)
- self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
+ self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str))
# Custom
stopwords = ["panda"]
stopWordRemover.setStopWords(stopwords)
diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py
index 1b2b1914cc..e1abd59a2d 100644
--- a/python/pyspark/ml/tests/test_param.py
+++ b/python/pyspark/ml/tests/test_param.py
@@ -35,10 +35,6 @@ from pyspark.ml.wrapper import JavaParams
from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase
-if sys.version > '3':
- xrange = range
-
-
class ParamTypeConversionTests(PySparkTestCase):
"""
Test that param type conversion happens.
@@ -67,14 +63,14 @@ class ParamTypeConversionTests(PySparkTestCase):
def test_list(self):
l = [0, 1]
for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
- pyarray.array('l', l), xrange(2), tuple(l)]:
+ pyarray.array('l', l), range(2), tuple(l)]:
converted = TypeConverters.toList(lst_like)
self.assertEqual(type(converted), list)
self.assertListEqual(converted, l)
def test_list_int(self):
for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
- SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
+ SparseVector(2, {0: 1.0, 1: 2.0}), range(1, 3), (1.0, 2.0),
pyarray.array('d', [1.0, 2.0])]:
vs = VectorSlicer(indices=indices)
self.assertListEqual(vs.getIndices(), [1, 2])
@@ -200,12 +196,7 @@ class ParamTests(SparkSessionTestCase):
self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter)
self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter)
- if sys.version_info[0] >= 3:
- # In Python 3, it is allowed to get/set attributes with non-ascii characters.
- e_cls = AttributeError
- else:
- e_cls = UnicodeEncodeError
- self.assertRaises(e_cls, lambda: testParams._resolveParam(u"아"))
+ self.assertRaises(AttributeError, lambda: testParams._resolveParam(u"아"))
def test_params(self):
testParams = TestParams()
diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py
index 7d90579318..15e9ebb0f5 100644
--- a/python/pyspark/ml/tests/test_training_summary.py
+++ b/python/pyspark/ml/tests/test_training_summary.py
@@ -18,9 +18,6 @@
import sys
import unittest
-if sys.version > '3':
- basestring = str
-
from pyspark.ml.classification import BinaryLogisticRegressionSummary, LinearSVC, \
LinearSVCSummary, BinaryRandomForestClassificationSummary, LogisticRegression, \
LogisticRegressionSummary, RandomForestClassificationSummary, \
@@ -101,7 +98,7 @@ class TrainingSummaryTest(SparkSessionTestCase):
self.assertEqual(s.residualDegreeOfFreedom, 1)
self.assertEqual(s.residualDegreeOfFreedomNull, 2)
self.assertEqual(s.rank, 1)
- self.assertTrue(isinstance(s.solver, basestring))
+ self.assertTrue(isinstance(s.solver, str))
self.assertTrue(isinstance(s.aic, float))
self.assertTrue(isinstance(s.deviance, float))
self.assertTrue(isinstance(s.nullDeviance, float))
diff --git a/python/pyspark/ml/tree.py b/python/pyspark/ml/tree.py
index a13b27ec8a..460c76fabc 100644
--- a/python/pyspark/ml/tree.py
+++ b/python/pyspark/ml/tree.py
@@ -15,12 +15,10 @@
# limitations under the License.
#
-from pyspark import since, keyword_only
from pyspark.ml.param.shared import *
from pyspark.ml.util import *
-from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \
- JavaPredictor, JavaPredictionModel
-from pyspark.ml.common import inherit_doc, _java2py, _py2java
+from pyspark.ml.wrapper import JavaPredictionModel
+from pyspark.ml.common import inherit_doc
@inherit_doc
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index e00753b2ff..7f3d942e2e 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -15,12 +15,11 @@
# limitations under the License.
#
import itertools
-import sys
from multiprocessing.pool import ThreadPool
import numpy as np
-from pyspark import since, keyword_only
+from pyspark import keyword_only
from pyspark.ml import Estimator, Model
from pyspark.ml.common import _py2java, _java2py
from pyspark.ml.param import Params, Param, TypeConverters
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index aac2b38d3f..9ab6bfa9ba 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -20,12 +20,6 @@ import sys
import os
import time
import uuid
-import warnings
-
-if sys.version > '3':
- basestring = str
- unicode = str
- long = int
from pyspark import SparkContext, since
from pyspark.ml.common import inherit_doc
@@ -60,10 +54,10 @@ class Identifiable(object):
@classmethod
def _randomUID(cls):
"""
- Generate a unique unicode id for the object. The default implementation
+ Generate a unique string id for the object. The default implementation
concatenates the class name, "_", and 12 random hex chars.
"""
- return unicode(cls.__name__ + "_" + uuid.uuid4().hex[-12:])
+ return str(cls.__name__ + "_" + uuid.uuid4().hex[-12:])
@inherit_doc
@@ -170,8 +164,8 @@ class JavaMLWriter(MLWriter):
def save(self, path):
"""Save the ML instance to the input path."""
- if not isinstance(path, basestring):
- raise TypeError("path should be a basestring, got type %s" % type(path))
+ if not isinstance(path, str):
+ raise TypeError("path should be a string, got type %s" % type(path))
self._jwrite.save(path)
def overwrite(self):
@@ -275,8 +269,8 @@ class JavaMLReader(MLReader):
def load(self, path):
"""Load the ML instance from the input path."""
- if not isinstance(path, basestring):
- raise TypeError("path should be a basestring, got type %s" % type(path))
+ if not isinstance(path, str):
+ raise TypeError("path should be a string, got type %s" % type(path))
java_obj = self._jread.load(path)
if not hasattr(self._clazz, "_from_java"):
raise NotImplementedError("This Java ML type cannot be loaded into Python currently: %r"
@@ -430,7 +424,7 @@ class DefaultParamsWriter(MLWriter):
for p in instance._defaultParamMap:
jsonDefaultParams[p.name] = instance._defaultParamMap[p]
- basicMetadata = {"class": cls, "timestamp": long(round(time.time() * 1000)),
+ basicMetadata = {"class": cls, "timestamp": int(round(time.time() * 1000)),
"sparkVersion": sc.version, "uid": uid, "paramMap": jsonParams,
"defaultParamMap": jsonDefaultParams}
if extraMetadata is not None:
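
DefaultParamsWriter's metadata timestamp drops its long() wrapper: Python 3 ints are arbitrary-precision, so epoch milliseconds fit in a plain int with no overflow concern. The computation on its own:

    import time

    timestamp = int(round(time.time() * 1000))  # epoch milliseconds
    assert isinstance(timestamp, int)           # no separate long type on Python 3
    print(timestamp)
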
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index e59c6c7b25..c1d060a51c 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -16,9 +16,6 @@
#
from abc import ABCMeta, abstractmethod
-import sys
-if sys.version >= '3':
- xrange = range
from pyspark import since
from pyspark import SparkContext
@@ -26,7 +23,6 @@ from pyspark.sql import DataFrame
from pyspark.ml import Estimator, Predictor, PredictionModel, Transformer, Model
from pyspark.ml.base import _PredictorParams
from pyspark.ml.param import Params
-from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol
from pyspark.ml.util import _jvm
from pyspark.ml.common import inherit_doc, _java2py, _py2java
@@ -99,15 +95,15 @@ class JavaWrapper(object):
# If pylist is a 2D array, then a 2D java array will be created.
# The 2D array is a square, non-jagged 2D array that is big enough for all elements.
inner_array_length = 0
- for i in xrange(len(pylist)):
+ for i in range(len(pylist)):
inner_array_length = max(inner_array_length, len(pylist[i]))
java_array = sc._gateway.new_array(java_class, len(pylist), inner_array_length)
- for i in xrange(len(pylist)):
- for j in xrange(len(pylist[i])):
+ for i in range(len(pylist)):
+ for j in range(len(pylist[i])):
java_array[i][j] = pylist[i][j]
else:
java_array = sc._gateway.new_array(java_class, len(pylist))
- for i in xrange(len(pylist)):
+ for i in range(len(pylist)):
java_array[i] = pylist[i]
return java_array
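
Aside: Python 3's range is already the lazy sequence that xrange provided, so
the alias can simply go. A hypothetical pure-Python analogue of the non-jagged
2D array construction above (the real code allocates a Py4J Java array):

def to_non_jagged_2d(pylist, fill=0.0):
    # Pad a jagged list of lists into a rectangular 2D list big enough for
    # all elements, mirroring the array sizing logic above.
    inner_array_length = 0
    for i in range(len(pylist)):
        inner_array_length = max(inner_array_length, len(pylist[i]))
    result = [[fill] * inner_array_length for _ in range(len(pylist))]
    for i in range(len(pylist)):
        for j in range(len(pylist[i])):
            result[i][j] = pylist[i][j]
    return result

assert to_non_jagged_2d([[1.0], [2.0, 3.0]]) == [[1.0, 0.0], [2.0, 3.0]]
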
diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index ae26521ea9..6067693111 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -21,8 +21,6 @@ RDD-based machine learning APIs for Python (in maintenance mode).
The `pyspark.mllib` package is in maintenance mode as of the Spark 2.0.0 release to encourage
migration to the DataFrame-based APIs under the `pyspark.ml` package.
"""
-from __future__ import absolute_import
-
# MLlib currently needs NumPy 1.4+, so complain if lower
import numpy
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index e41e5c9cc8..85cfe583fd 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -17,20 +17,13 @@
import sys
import array as pyarray
-import warnings
-
-if sys.version > '3':
- xrange = range
- basestring = str
-
from math import exp, log
+from collections import namedtuple
from numpy import array, random, tile
-from collections import namedtuple
-
from pyspark import SparkContext, since
-from pyspark.rdd import RDD, ignore_unicode_prefix
+from pyspark.rdd import RDD
from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector
from pyspark.mllib.stat.distribution import MultivariateGaussian
@@ -257,7 +250,7 @@ class KMeansModel(Saveable, Loader):
return x.map(self.predict)
x = _convert_to_vector(x)
- for i in xrange(len(self.centers)):
+ for i in range(len(self.centers)):
distance = x.squared_distance(self.centers[i])
if distance < best_distance:
best = i
@@ -708,7 +701,7 @@ class StreamingKMeansModel(KMeansModel):
>>> stkm = StreamingKMeansModel(initCenters, initWeights)
>>> data = sc.parallelize([[-0.1, -0.1], [0.1, 0.1],
... [0.9, 0.9], [1.1, 1.1]])
- >>> stkm = stkm.update(data, 1.0, u"batches")
+ >>> stkm = stkm.update(data, 1.0, "batches")
>>> stkm.centers
array([[ 0., 0.],
[ 1., 1.]])
@@ -720,7 +713,7 @@ class StreamingKMeansModel(KMeansModel):
[3.0, 3.0]
>>> decayFactor = 0.0
>>> data = sc.parallelize([DenseVector([1.5, 1.5]), DenseVector([0.2, 0.2])])
- >>> stkm = stkm.update(data, 0.0, u"batches")
+ >>> stkm = stkm.update(data, 0.0, "batches")
>>> stkm.centers
array([[ 0.2, 0.2],
[ 1.5, 1.5]])
@@ -743,7 +736,6 @@ class StreamingKMeansModel(KMeansModel):
"""Return the cluster weights."""
return self._clusterWeights
- @ignore_unicode_prefix
@since('1.5.0')
def update(self, data, decayFactor, timeUnit):
"""Update the centroids, according to data
@@ -979,8 +971,8 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader):
"""
if not isinstance(sc, SparkContext):
raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
- if not isinstance(path, basestring):
- raise TypeError("path should be a basestring, got type %s" % type(path))
+ if not isinstance(path, str):
+ raise TypeError("path should be a string, got type %s" % type(path))
model = callMLlibFunc("loadLDAModel", sc, path)
return LDAModel(model)
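
Aside: dropping the u prefix from these doctests is purely cosmetic. PEP 414
keeps u'...' legal on Python 3, where it denotes the same str as an unprefixed
literal:

# u-prefixed literals still parse on Python 3 but are redundant:
assert u"batches" == "batches"
assert type(u"batches") is str
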
diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py
index bac8f35056..24e2f19825 100644
--- a/python/pyspark/mllib/common.py
+++ b/python/pyspark/mllib/common.py
@@ -15,11 +15,6 @@
# limitations under the License.
#
-import sys
-if sys.version >= '3':
- long = int
- unicode = str
-
import py4j.protocol
from py4j.protocol import Py4JJavaError
from py4j.java_gateway import JavaObject
@@ -81,7 +76,7 @@ def _py2java(sc, obj):
obj = [_py2java(sc, x) for x in obj]
elif isinstance(obj, JavaObject):
pass
- elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
+ elif isinstance(obj, (int, float, bool, bytes, str)):
pass
else:
data = bytearray(PickleSerializer().dumps(obj))
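
Aside: with long and unicode gone, the primitives that _py2java passes
straight through to Py4J collapse into one tuple. A minimal sketch of the
dispatch, under that assumption:

PASSTHROUGH = (int, float, bool, bytes, str)

def needs_pickling(obj):
    # Mirrors the _py2java fallback: anything outside the passthrough set
    # gets pickled into a bytearray before crossing into the JVM.
    return not isinstance(obj, PASSTHROUGH)

assert not needs_pickling(42)
assert not needs_pickling("a string")
assert needs_pickling({"k": 1})
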
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 3efae6ff0e..80a197eaa7 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -18,21 +18,15 @@
"""
Python package for feature in MLlib.
"""
-from __future__ import absolute_import
-
import sys
import warnings
-if sys.version >= '3':
- basestring = str
- unicode = str
-
from py4j.protocol import Py4JJavaError
from pyspark import since
-from pyspark.rdd import RDD, ignore_unicode_prefix
+from pyspark.rdd import RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import (
- Vector, Vectors, DenseVector, SparseVector, _convert_to_vector)
+ Vectors, DenseVector, SparseVector, _convert_to_vector)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import JavaLoader, JavaSaveable
@@ -616,7 +610,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
.. note:: Local use only
"""
- if not isinstance(word, basestring):
+ if not isinstance(word, str):
word = _convert_to_vector(word)
words, similarity = self.call("findSynonyms", word, num)
return zip(words, similarity)
@@ -640,7 +634,6 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
return Word2VecModel(model)
-@ignore_unicode_prefix
class Word2Vec(object):
"""Word2Vec creates vector representation of words in a text corpus.
The algorithm first constructs a vocabulary from the corpus
@@ -668,7 +661,7 @@ class Word2Vec(object):
>>> syms = model.findSynonyms("a", 2)
>>> [s[0] for s in syms]
- [u'b', u'c']
+ ['b', 'c']
But querying for synonyms of a vector may return the word whose
representation is that vector:
@@ -676,7 +669,7 @@ class Word2Vec(object):
>>> vec = model.transform("a")
>>> syms = model.findSynonyms(vec, 2)
>>> [s[0] for s in syms]
- [u'a', u'b']
+ ['a', 'b']
>>> import os, tempfile
>>> path = tempfile.mkdtemp()
@@ -686,7 +679,7 @@ class Word2Vec(object):
True
>>> syms = sameModel.findSynonyms("a", 2)
>>> [s[0] for s in syms]
- [u'b', u'c']
+ ['b', 'c']
>>> from shutil import rmtree
>>> try:
... rmtree(path)
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py
index 373a141456..cbbd7b351b 100644
--- a/python/pyspark/mllib/fpm.py
+++ b/python/pyspark/mllib/fpm.py
@@ -20,7 +20,6 @@ import sys
from collections import namedtuple
from pyspark import since
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc
from pyspark.mllib.util import JavaSaveable, JavaLoader, inherit_doc
@@ -28,7 +27,6 @@ __all__ = ['FPGrowth', 'FPGrowthModel', 'PrefixSpan', 'PrefixSpanModel']
@inherit_doc
-@ignore_unicode_prefix
class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader):
"""
A FP-Growth model for mining frequent itemsets
@@ -38,7 +36,7 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader):
>>> rdd = sc.parallelize(data, 2)
>>> model = FPGrowth.train(rdd, 0.6, 2)
>>> sorted(model.freqItemsets().collect())
- [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ...
+ [FreqItemset(items=['a'], freq=4), FreqItemset(items=['c'], freq=3), ...
>>> model_path = temp_path + "/fpm"
>>> model.save(sc, model_path)
>>> sameModel = FPGrowthModel.load(sc, model_path)
@@ -101,7 +99,6 @@ class FPGrowth(object):
@inherit_doc
-@ignore_unicode_prefix
class PrefixSpanModel(JavaModelWrapper):
"""
Model fitted by PrefixSpan
@@ -114,7 +111,7 @@ class PrefixSpanModel(JavaModelWrapper):
>>> rdd = sc.parallelize(data, 2)
>>> model = PrefixSpan.train(rdd)
>>> sorted(model.freqSequences().collect())
- [FreqSequence(sequence=[[u'a']], freq=3), FreqSequence(sequence=[[u'a'], [u'a']], freq=1), ...
+ [FreqSequence(sequence=[['a']], freq=3), FreqSequence(sequence=[['a'], ['a']], freq=1), ...
.. versionadded:: 1.6.0
"""
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index cd09621b13..c1402fb98a 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -27,15 +27,6 @@ import sys
import array
import struct
-if sys.version >= '3':
- basestring = str
- xrange = range
- import copyreg as copy_reg
- long = int
-else:
- from itertools import izip as zip
- import copy_reg
-
import numpy as np
from pyspark import since
@@ -49,13 +40,6 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
'QRDecomposition']
-if sys.version_info[:2] == (2, 7):
- # speed up pickling array in Python 2.7
- def fast_pickle_array(ar):
- return array.array, (ar.typecode, ar.tostring())
- copy_reg.pickle(array.array, fast_pickle_array)
-
-
# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
@@ -70,7 +54,7 @@ except:
def _convert_to_vector(l):
if isinstance(l, Vector):
return l
- elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
+ elif type(l) in (array.array, np.array, np.ndarray, list, tuple, range):
return DenseVector(l)
elif _have_scipy and scipy.sparse.issparse(l):
assert l.shape[1] == 1, "Expected column vector"
@@ -104,7 +88,7 @@ def _vector_size(v):
"""
if isinstance(v, Vector):
return len(v)
- elif type(v) in (array.array, list, tuple, xrange):
+ elif type(v) in (array.array, list, tuple, range):
return len(v)
elif type(v) == np.ndarray:
if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
@@ -459,7 +443,7 @@ class DenseVector(Vector):
elif isinstance(other, SparseVector):
if len(self) != other.size:
return False
- return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
+ return Vectors._equals(list(range(len(self))), self.array, other.indices, other.values)
return False
def __ne__(self, other):
@@ -556,7 +540,7 @@ class SparseVector(Vector):
self.indices = np.array(args[0], dtype=np.int32)
self.values = np.array(args[1], dtype=np.float64)
assert len(self.indices) == len(self.values), "index and value arrays not same length"
- for i in xrange(len(self.indices) - 1):
+ for i in range(len(self.indices) - 1):
if self.indices[i] >= self.indices[i + 1]:
raise TypeError(
"Indices %s and %s are not strictly increasing"
@@ -788,7 +772,7 @@ class SparseVector(Vector):
inds = self.indices
vals = self.values
entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
- for i in xrange(len(inds))])
+ for i in range(len(inds))])
return "SparseVector({0}, {{{1}}})".format(self.size, entries)
def __eq__(self, other):
@@ -798,7 +782,7 @@ class SparseVector(Vector):
elif isinstance(other, DenseVector):
if self.size != len(other):
return False
- return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
+ return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array)
return False
def __getitem__(self, index):
@@ -880,7 +864,7 @@ class Vectors(object):
>>> Vectors.dense(1.0, 2.0)
DenseVector([1.0, 2.0])
"""
- if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
+ if len(elements) == 1 and not isinstance(elements[0], (float, int)):
# it's list, numpy.array or other iterable object.
elements = elements[0]
return DenseVector(elements)
@@ -1279,7 +1263,7 @@ class SparseMatrix(Matrix):
Return an numpy.ndarray
"""
A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
- for k in xrange(self.colPtrs.size - 1):
+ for k in range(self.colPtrs.size - 1):
startptr = self.colPtrs[k]
endptr = self.colPtrs[k + 1]
if self.isTransposed:
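
Aside: the deleted copy_reg block was only a Python 2.7 pickling speed-up.
Python 3 round-trips array.array natively, so no custom reducer is needed:

import array
import pickle

a = array.array("d", [1.0, 2.0, 3.0])
assert pickle.loads(pickle.dumps(a)) == a  # works out of the box on Python 3
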
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 56701758c8..603d31d3d7 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -21,9 +21,6 @@ Package for distributed linear algebra.
import sys
-if sys.version >= '3':
- long = int
-
from py4j.java_gateway import JavaObject
from pyspark import RDD, since
@@ -95,9 +92,9 @@ class RowMatrix(DistributedMatrix):
"""
if isinstance(rows, RDD):
rows = rows.map(_convert_to_vector)
- java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols))
+ java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols))
elif isinstance(rows, DataFrame):
- java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols))
+ java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols))
elif (isinstance(rows, JavaObject)
and rows.getClass().getSimpleName() == "RowMatrix"):
java_matrix = rows
@@ -439,13 +436,13 @@ class IndexedRow(object):
"""
Represents a row of an IndexedRowMatrix.
- Just a wrapper over a (long, vector) tuple.
+ Just a wrapper over an (int, vector) tuple.
:param index: The index for the given row.
:param vector: The row in the matrix at the given index.
"""
def __init__(self, index, vector):
- self.index = long(index)
+ self.index = int(index)
self.vector = _convert_to_vector(vector)
def __repr__(self):
@@ -465,8 +462,8 @@ class IndexedRowMatrix(DistributedMatrix):
"""
Represents a row-oriented distributed Matrix with indexed rows.
- :param rows: An RDD of IndexedRows or (long, vector) tuples or a DataFrame consisting of a
- long typed column of indices and a vector typed column.
+ :param rows: An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of an
+ int typed column of indices and a vector typed column.
:param numRows: Number of rows in the matrix. A non-positive
value means unknown, at which point the number
of rows will be determined by the max row
@@ -510,14 +507,14 @@ class IndexedRowMatrix(DistributedMatrix):
# both be easily serialized. We will convert back to
# IndexedRows on the Scala side.
java_matrix = callMLlibFunc("createIndexedRowMatrix", rows.toDF(),
- long(numRows), int(numCols))
+ int(numRows), int(numCols))
elif isinstance(rows, DataFrame):
- java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, long(numRows), int(numCols))
+ java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, int(numRows), int(numCols))
elif (isinstance(rows, JavaObject)
and rows.getClass().getSimpleName() == "IndexedRowMatrix"):
java_matrix = rows
else:
- raise TypeError("rows should be an RDD of IndexedRows or (long, vector) tuples, "
+ raise TypeError("rows should be an RDD of IndexedRows or (int, vector) tuples, "
"got %s" % type(rows))
self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
@@ -731,15 +728,15 @@ class MatrixEntry(object):
"""
Represents an entry of a CoordinateMatrix.
- Just a wrapper over a (long, long, float) tuple.
+ Just a wrapper over an (int, int, float) tuple.
:param i: The row index of the matrix.
:param j: The column index of the matrix.
:param value: The (i, j)th entry of the matrix, as a float.
"""
def __init__(self, i, j, value):
- self.i = long(i)
- self.j = long(j)
+ self.i = int(i)
+ self.j = int(j)
self.value = float(value)
def __repr__(self):
@@ -760,7 +757,7 @@ class CoordinateMatrix(DistributedMatrix):
Represents a matrix in coordinate format.
:param entries: An RDD of MatrixEntry inputs or
- (long, long, float) tuples.
+ (int, int, float) tuples.
:param numRows: Number of rows in the matrix. A non-positive
value means unknown, at which point the number
of rows will be determined by the max row
@@ -804,13 +801,13 @@ class CoordinateMatrix(DistributedMatrix):
# each be easily serialized. We will convert back to
# MatrixEntry inputs on the Scala side.
java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(),
- long(numRows), long(numCols))
+ int(numRows), int(numCols))
elif (isinstance(entries, JavaObject)
and entries.getClass().getSimpleName() == "CoordinateMatrix"):
java_matrix = entries
else:
raise TypeError("entries should be an RDD of MatrixEntry entries or "
- "(long, long, float) tuples, got %s" % type(entries))
+ "(int, int, float) tuples, got %s" % type(entries))
self._java_matrix_wrapper = JavaModelWrapper(java_matrix)
@@ -1044,7 +1041,7 @@ class BlockMatrix(DistributedMatrix):
# the Scala side.
java_matrix = callMLlibFunc("createBlockMatrix", blocks.toDF(),
int(rowsPerBlock), int(colsPerBlock),
- long(numRows), long(numCols))
+ int(numRows), int(numCols))
elif (isinstance(blocks, JavaObject)
and blocks.getClass().getSimpleName() == "BlockMatrix"):
java_matrix = blocks
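
Aside: matrix dimensions previously wrapped in long() are safe as plain int.
Python 3 ints are unbounded, and Py4J widens large values to a Java long on
the way across; a tiny standalone check:

num_rows = 3_000_000_000          # larger than a 32-bit int
assert isinstance(num_rows, int)  # still an ordinary Python 3 int
assert num_rows < 2 ** 63         # fits the Java long expected on the JVM side
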
diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py
index 7250eab670..56444c152f 100644
--- a/python/pyspark/mllib/stat/KernelDensity.py
+++ b/python/pyspark/mllib/stat/KernelDensity.py
@@ -15,11 +15,6 @@
# limitations under the License.
#
-import sys
-
-if sys.version > '3':
- xrange = range
-
import numpy as np
from pyspark.mllib.common import callMLlibFunc
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index d49f741a2f..43454ba518 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -16,10 +16,8 @@
#
import sys
-if sys.version >= '3':
- basestring = str
-from pyspark.rdd import RDD, ignore_unicode_prefix
+from pyspark.rdd import RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import Matrix, _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
@@ -157,7 +155,6 @@ class Statistics(object):
return callMLlibFunc("corr", x.map(float), y.map(float), method)
@staticmethod
- @ignore_unicode_prefix
def chiSqTest(observed, expected=None):
"""
If `observed` is Vector, conduct Pearson's chi-squared goodness
@@ -199,9 +196,9 @@ class Statistics(object):
>>> print(round(pearson.pValue, 4))
0.8187
>>> pearson.method
- u'pearson'
+ 'pearson'
>>> pearson.nullHypothesis
- u'observed follows the same distribution as expected.'
+ 'observed follows the same distribution as expected.'
>>> observed = Vectors.dense([21, 38, 43, 80])
>>> expected = Vectors.dense([3, 5, 7, 20])
@@ -242,7 +239,6 @@ class Statistics(object):
return ChiSqTestResult(jmodel)
@staticmethod
- @ignore_unicode_prefix
def kolmogorovSmirnovTest(data, distName="norm", *params):
"""
Performs the Kolmogorov-Smirnov (KS) test for data sampled from
@@ -282,7 +278,7 @@ class Statistics(object):
>>> print(round(ksmodel.statistic, 3))
0.175
>>> ksmodel.nullHypothesis
- u'Sample follows theoretical distribution'
+ 'Sample follows theoretical distribution'
>>> data = sc.parallelize([2.0, 3.0, 4.0])
>>> ksmodel = kstest(data, "norm", 3.0, 1.0)
@@ -293,7 +289,7 @@ class Statistics(object):
"""
if not isinstance(data, RDD):
raise TypeError("data should be an RDD, got %s." % type(data))
- if not isinstance(distName, basestring):
+ if not isinstance(distName, str):
raise TypeError("distName should be a string, got %s." % type(distName))
params = [float(param) for param in params]
diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py
index 312730e8af..21c2bb422a 100644
--- a/python/pyspark/mllib/tests/test_linalg.py
+++ b/python/pyspark/mllib/tests/test_linalg.py
@@ -31,9 +31,6 @@ from pyspark.sql import Row
from pyspark.testing.mllibutils import MLlibTestCase
from pyspark.testing.utils import have_scipy
-if sys.version >= '3':
- long = int
-
class VectorTests(MLlibTestCase):
@@ -447,7 +444,7 @@ class VectorUDTTests(MLlibTestCase):
def test_indexed_row_matrix_from_dataframe(self):
from pyspark.sql.utils import IllegalArgumentException
- df = self.spark.createDataFrame([Row(long(0), Vectors.dense(1))])
+ df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
matrix = IndexedRowMatrix(df)
self.assertEqual(matrix.numRows(), 1)
self.assertEqual(matrix.numCols(), 1)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 2d8df461ac..e05dfdb953 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import absolute_import
-
import sys
import random
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index f0f9cda467..a0be29a82e 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -18,10 +18,6 @@
import sys
import numpy as np
-if sys.version > '3':
- xrange = range
- basestring = str
-
from pyspark import SparkContext, since
from pyspark.mllib.common import callMLlibFunc, inherit_doc
from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
@@ -46,7 +42,7 @@ class MLUtils(object):
nnz = len(items) - 1
indices = np.zeros(nnz, dtype=np.int32)
values = np.zeros(nnz)
- for i in xrange(nnz):
+ for i in range(nnz):
index, value = items[1 + i].split(":")
indices[i] = int(index) - 1
values[i] = float(value)
@@ -61,10 +57,10 @@ class MLUtils(object):
v = _convert_to_vector(p.features)
if isinstance(v, SparseVector):
nnz = len(v.indices)
- for i in xrange(nnz):
+ for i in range(nnz):
items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
else:
- for i in xrange(len(v)):
+ for i in range(len(v)):
items.append(str(i + 1) + ":" + str(v[i]))
return " ".join(items)
@@ -396,8 +392,8 @@ class JavaSaveable(Saveable):
"""Save this model to the given path."""
if not isinstance(sc, SparkContext):
raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
- if not isinstance(path, basestring):
- raise TypeError("path should be a basestring, got type %s" % type(path))
+ if not isinstance(path, str):
+ raise TypeError("path should be a string, got type %s" % type(path))
self._java_model.save(sc._jsc.sc(), path)
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index db0c1971cd..437b2c4465 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -33,15 +33,10 @@ from itertools import chain
from functools import reduce
from math import sqrt, log, isinf, isnan, pow, ceil
-if sys.version > '3':
- basestring = unicode = str
-else:
- from itertools import imap as map, ifilter as filter
-
from pyspark.java_gateway import local_connect_and_auth
from pyspark.serializers import AutoBatchedSerializer, BatchedSerializer, NoOpSerializer, \
CartesianDeserializer, CloudPickleSerializer, PairDeserializer, PickleSerializer, \
- UTF8Deserializer, pack_long, read_int, write_int
+ pack_long, read_int, write_int
from pyspark.join import python_join, python_left_outer_join, \
python_right_outer_join, python_full_outer_join, python_cogroup
from pyspark.statcounter import StatCounter
@@ -93,7 +88,7 @@ def portable_hash(x):
219750521
"""
- if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ:
+ if 'PYTHONHASHSEED' not in os.environ:
raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
if x is None:
@@ -204,19 +199,6 @@ def _local_iterator_from_socket(sock_info, serializer):
return iter(PyLocalIterable(sock_info, serializer))
-def ignore_unicode_prefix(f):
- """
- Ignore the 'u' prefix of string in doc tests, to make it works
- in both python 2 and 3
- """
- if sys.version >= '3':
- # the representation of unicode string in Python 3 does not have prefix 'u',
- # so remove the prefix 'u' for doc tests
- literal_re = re.compile(r"(\W|^)[uU](['])", re.UNICODE)
- f.__doc__ = literal_re.sub(r'\1\2', f.__doc__)
- return f
-
-
class Partitioner(object):
def __init__(self, numPartitions, partitionFunc):
self.numPartitions = numPartitions
@@ -797,13 +779,12 @@ class RDD(object):
"""
return self.map(lambda x: (f(x), x)).groupByKey(numPartitions, partitionFunc)
- @ignore_unicode_prefix
def pipe(self, command, env=None, checkCode=False):
"""
Return an RDD created by piping elements to a forked external process.
>>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()
- [u'1', u'2', u'', u'3']
+ ['1', '2', '', '3']
:param checkCode: whether or not to check the return value of the shell command.
"""
@@ -816,7 +797,7 @@ class RDD(object):
def pipe_objs(out):
for obj in iterator:
- s = unicode(obj).rstrip('\n') + '\n'
+ s = str(obj).rstrip('\n') + '\n'
out.write(s.encode('utf-8'))
out.close()
Thread(target=pipe_objs, args=[pipe.stdin]).start()
@@ -1591,7 +1572,6 @@ class RDD(object):
ser = BatchedSerializer(PickleSerializer(), batchSize)
self._reserialize(ser)._jrdd.saveAsObjectFile(path)
- @ignore_unicode_prefix
def saveAsTextFile(self, path, compressionCodecClass=None):
"""
Save this RDD as a text file, using string representations of elements.
@@ -1625,13 +1605,13 @@ class RDD(object):
>>> from fileinput import input, hook_compressed
>>> result = sorted(input(glob(tempFile3.name + "/part*.gz"), openhook=hook_compressed))
>>> b''.join(result).decode('utf-8')
- u'bar\\nfoo\\n'
+ 'bar\\nfoo\\n'
"""
def func(split, iterator):
for x in iterator:
- if not isinstance(x, (unicode, bytes)):
- x = unicode(x)
- if isinstance(x, unicode):
+ if not isinstance(x, (str, bytes)):
+ x = str(x)
+ if isinstance(x, str):
x = x.encode("utf-8")
yield x
keyed = self.mapPartitionsWithIndex(func)
@@ -2281,14 +2261,13 @@ class RDD(object):
if n:
return n
- @ignore_unicode_prefix
def setName(self, name):
"""
Assign a name to this RDD.
>>> rdd1 = sc.parallelize([1, 2])
>>> rdd1.setName('RDD1').name()
- u'RDD1'
+ 'RDD1'
"""
self._jrdd.setName(name)
return self
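
Aside: two Python-3-only facts drive the rdd.py cleanup. str reprs carry no u
prefix, so the doctest-rewriting ignore_unicode_prefix decorator is dead code;
and every supported interpreter is newer than 3.2.3, so string-hash
randomization always applies and the version guard in portable_hash is
redundant:

import sys

assert repr("RDD1") == "'RDD1'"       # nothing for ignore_unicode_prefix to strip
assert sys.version_info >= (3, 2, 3)  # the old portable_hash guard is always true
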
diff --git a/python/pyspark/resultiterable.py b/python/pyspark/resultiterable.py
index c867b51877..cd2a59513b 100644
--- a/python/pyspark/resultiterable.py
+++ b/python/pyspark/resultiterable.py
@@ -15,10 +15,7 @@
# limitations under the License.
#
-try:
- from collections.abc import Iterable
-except ImportError:
- from collections import Iterable
+from collections.abc import Iterable
__all__ = ["ResultIterable"]
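
Aside: the try/except import can go because collections.abc has existed since
Python 3.3, and the deprecated aliases in collections itself were removed in
Python 3.10:

from collections.abc import Iterable

assert isinstance([1, 2, 3], Iterable)
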
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 49b7cb4546..80ce9b8408 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -58,18 +58,11 @@ import types
import collections
import zlib
import itertools
-
-if sys.version < '3':
- import cPickle as pickle
- from itertools import izip as zip, imap as map
-else:
- import pickle
- basestring = unicode = str
- xrange = range
+import pickle
pickle_protocol = pickle.HIGHEST_PROTOCOL
from pyspark import cloudpickle
-from pyspark.util import _exception_message, print_exec
+from pyspark.util import print_exec
__all__ = ["PickleSerializer", "MarshalSerializer", "UTF8Deserializer"]
@@ -132,11 +125,6 @@ class FramedSerializer(Serializer):
where `length` is a 32-bit integer and data is `length` bytes.
"""
- def __init__(self):
- # On Python 2.6, we can't write bytearrays to streams, so we need to convert them
- # to strings first. Check if the version number is that old.
- self._only_write_strings = sys.version_info[0:2] <= (2, 6)
-
def dump_stream(self, iterator, stream):
for obj in iterator:
self._write_with_length(obj, stream)
@@ -155,10 +143,7 @@ class FramedSerializer(Serializer):
if len(serialized) > (1 << 31):
raise ValueError("can not serialize object larger than 2G")
write_int(len(serialized), stream)
- if self._only_write_strings:
- stream.write(str(serialized))
- else:
- stream.write(serialized)
+ stream.write(serialized)
def _read_with_length(self, stream):
length = read_int(stream)
@@ -204,7 +189,7 @@ class BatchedSerializer(Serializer):
yield list(iterator)
elif hasattr(iterator, "__len__") and hasattr(iterator, "__getslice__"):
n = len(iterator)
- for i in xrange(0, n, self.batchSize):
+ for i in range(0, n, self.batchSize):
yield iterator[i: i + self.batchSize]
else:
items = []
@@ -395,23 +380,8 @@ def _hijack_namedtuple():
return types.FunctionType(f.__code__, f.__globals__, f.__name__,
f.__defaults__, f.__closure__)
- def _kwdefaults(f):
- # __kwdefaults__ contains the default values of keyword-only arguments which are
- # introduced from Python 3. The possible cases for __kwdefaults__ in namedtuple
- # are as below:
- #
- # - Does not exist in Python 2.
- # - Returns None in <= Python 3.5.x.
- # - Returns a dictionary containing the default values to the keys from Python 3.6.x
- # (See https://bugs.python.org/issue25628).
- kargs = getattr(f, "__kwdefaults__", None)
- if kargs is None:
- return {}
- else:
- return kargs
-
_old_namedtuple = _copy_func(collections.namedtuple)
- _old_namedtuple_kwdefaults = _kwdefaults(collections.namedtuple)
+ _old_namedtuple_kwdefaults = collections.namedtuple.__kwdefaults__
def namedtuple(*args, **kwargs):
for k, v in _old_namedtuple_kwdefaults.items():
@@ -453,12 +423,8 @@ class PickleSerializer(FramedSerializer):
def dumps(self, obj):
return pickle.dumps(obj, pickle_protocol)
- if sys.version >= '3':
- def loads(self, obj, encoding="bytes"):
- return pickle.loads(obj, encoding=encoding)
- else:
- def loads(self, obj, encoding=None):
- return pickle.loads(obj)
+ def loads(self, obj, encoding="bytes"):
+ return pickle.loads(obj, encoding=encoding)
class CloudPickleSerializer(PickleSerializer):
@@ -469,7 +435,7 @@ class CloudPickleSerializer(PickleSerializer):
except pickle.PickleError:
raise
except Exception as e:
- emsg = _exception_message(e)
+ emsg = str(e)
if "'i' format requires" in emsg:
msg = "Object too large to serialize: %s" % emsg
else:
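
Aside: two serializer simplifications above, checked standalone. pickle.loads
accepts encoding="bytes" on every Python 3 release, controlling how
Python-2-era str pickles decode; and collections.namedtuple has exposed
__kwdefaults__ as a dict since Python 3.6 (https://bugs.python.org/issue25628),
so the _kwdefaults() shim is unnecessary:

import collections
import pickle

assert pickle.loads(pickle.dumps(b"data"), encoding="bytes") == b"data"
assert isinstance(collections.namedtuple.__kwdefaults__, dict)
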
diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py
index 65e3bdbc05..cde163bd2d 100644
--- a/python/pyspark/shell.py
+++ b/python/pyspark/shell.py
@@ -26,11 +26,8 @@ import os
import platform
import warnings
-import py4j
-
-from pyspark import SparkConf
from pyspark.context import SparkContext
-from pyspark.sql import SparkSession, SQLContext
+from pyspark.sql import SparkSession
if os.environ.get("SPARK_EXECUTOR_URI"):
SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index c28cb8c3b9..af32469e82 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -39,9 +39,6 @@ Important classes of Spark SQL and DataFrames:
- :class:`pyspark.sql.Window`
For working with window functions.
"""
-from __future__ import absolute_import
-
-
from pyspark.sql.types import Row
from pyspark.sql.context import SQLContext, HiveContext, UDFRegistration
from pyspark.sql.session import SparkSession
diff --git a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py
index ed62a72d6c..974412ee4e 100644
--- a/python/pyspark/sql/avro/functions.py
+++ b/python/pyspark/sql/avro/functions.py
@@ -21,12 +21,10 @@ A collections of builtin avro functions
from pyspark import since, SparkContext
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.sql.column import Column, _to_java_column
from pyspark.util import _print_missing_jar
-@ignore_unicode_prefix
@since(3.0)
def from_avro(data, jsonFormatSchema, options={}):
"""
@@ -45,7 +43,7 @@ def from_avro(data, jsonFormatSchema, options={}):
>>> from pyspark.sql import Row
>>> from pyspark.sql.avro.functions import from_avro, to_avro
- >>> data = [(1, Row(name='Alice', age=2))]
+ >>> data = [(1, Row(age=2, name='Alice'))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> avroDf = df.select(to_avro(df.value).alias("avro"))
>>> avroDf.collect()
@@ -55,7 +53,7 @@ def from_avro(data, jsonFormatSchema, options={}):
... "fields":[{"name":"age","type":["long","null"]},
... {"name":"name","type":["string","null"]}]},"null"]}]}'''
>>> avroDf.select(from_avro(avroDf.avro, jsonFormatSchema).alias("value")).collect()
- [Row(value=Row(avro=Row(age=2, name=u'Alice')))]
+ [Row(value=Row(avro=Row(age=2, name='Alice')))]
"""
sc = SparkContext._active_spark_context
@@ -69,7 +67,6 @@ def from_avro(data, jsonFormatSchema, options={}):
return Column(jc)
-@ignore_unicode_prefix
@since(3.0)
def to_avro(data, jsonFormatSchema=""):
"""
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index 974251f63b..25fc696dac 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -20,10 +20,8 @@ import warnings
from collections import namedtuple
from pyspark import since
-from pyspark.rdd import ignore_unicode_prefix, PythonEvalType
from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.udf import UserDefinedFunction
-from pyspark.sql.types import IntegerType, StringType, StructType
+from pyspark.sql.types import StructType
Database = namedtuple("Database", "name description locationUri")
@@ -44,19 +42,16 @@ class Catalog(object):
self._jsparkSession = sparkSession._jsparkSession
self._jcatalog = sparkSession._jsparkSession.catalog()
- @ignore_unicode_prefix
@since(2.0)
def currentDatabase(self):
"""Returns the current default database in this session."""
return self._jcatalog.currentDatabase()
- @ignore_unicode_prefix
@since(2.0)
def setCurrentDatabase(self, dbName):
"""Sets the current default database in this session."""
return self._jcatalog.setCurrentDatabase(dbName)
- @ignore_unicode_prefix
@since(2.0)
def listDatabases(self):
"""Returns a list of databases available across all sessions."""
@@ -70,7 +65,6 @@ class Catalog(object):
locationUri=jdb.locationUri()))
return databases
- @ignore_unicode_prefix
@since(2.0)
def listTables(self, dbName=None):
"""Returns a list of tables/views in the specified database.
@@ -92,7 +86,6 @@ class Catalog(object):
isTemporary=jtable.isTemporary()))
return tables
- @ignore_unicode_prefix
@since(2.0)
def listFunctions(self, dbName=None):
"""Returns a list of functions registered in the specified database.
@@ -113,7 +106,6 @@ class Catalog(object):
isTemporary=jfunction.isTemporary()))
return functions
- @ignore_unicode_prefix
@since(2.0)
def listColumns(self, tableName, dbName=None):
"""Returns a list of columns for the given table/view in the specified database.
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index ef4944c912..bd4c355762 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -19,15 +19,8 @@ import sys
import json
import warnings
-if sys.version >= '3':
- basestring = str
- long = int
-
-from py4j.java_gateway import is_instance_of
-
from pyspark import copy_func, since
from pyspark.context import SparkContext
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.sql.types import *
__all__ = ["Column"]
@@ -46,7 +39,7 @@ def _create_column_from_name(name):
def _to_java_column(col):
if isinstance(col, Column):
jcol = col._jc
- elif isinstance(col, basestring):
+ elif isinstance(col, str):
jcol = _create_column_from_name(col)
else:
raise TypeError(
@@ -359,7 +352,7 @@ class Column(object):
:param other: string in line
>>> df.filter(df.name.contains('o')).collect()
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
"""
_rlike_doc = """
SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex
@@ -368,7 +361,7 @@ class Column(object):
:param other: an extended regex expression
>>> df.filter(df.name.rlike('ice$')).collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
"""
_like_doc = """
SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.
@@ -378,7 +371,7 @@ class Column(object):
See :func:`rlike` for a regex version
>>> df.filter(df.name.like('Al%')).collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
"""
_startswith_doc = """
String starts with. Returns a boolean :class:`Column` based on a string match.
@@ -386,7 +379,7 @@ class Column(object):
:param other: string at start of line (do not use a regex `^`)
>>> df.filter(df.name.startswith('Al')).collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
>>> df.filter(df.name.startswith('^Al')).collect()
[]
"""
@@ -396,18 +389,17 @@ class Column(object):
:param other: string at end of line (do not use a regex `$`)
>>> df.filter(df.name.endswith('ice')).collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
>>> df.filter(df.name.endswith('ice$')).collect()
[]
"""
- contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc))
- rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc))
- like = ignore_unicode_prefix(_bin_op("like", _like_doc))
- startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc))
- endswith = ignore_unicode_prefix(_bin_op("endsWith", _endswith_doc))
+ contains = _bin_op("contains", _contains_doc)
+ rlike = _bin_op("rlike", _rlike_doc)
+ like = _bin_op("like", _like_doc)
+ startswith = _bin_op("startsWith", _startswith_doc)
+ endswith = _bin_op("endsWith", _endswith_doc)
- @ignore_unicode_prefix
@since(1.3)
def substr(self, startPos, length):
"""
@@ -417,7 +409,7 @@ class Column(object):
:param length: length of the substring (int or Column)
>>> df.select(df.name.substr(1, 3).alias("col")).collect()
- [Row(col=u'Ali'), Row(col=u'Bob')]
+ [Row(col='Ali'), Row(col='Bob')]
"""
if type(startPos) != type(length):
raise TypeError(
@@ -435,7 +427,6 @@ class Column(object):
raise TypeError("Unexpected type: %s" % type(startPos))
return Column(jc)
- @ignore_unicode_prefix
@since(1.5)
def isin(self, *cols):
"""
@@ -443,9 +434,9 @@ class Column(object):
expression is contained by the evaluated values of the arguments.
>>> df[df.name.isin("Bob", "Mike")].collect()
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
>>> df[df.age.isin([1, 2, 3])].collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
"""
if len(cols) == 1 and isinstance(cols[0], (list, set)):
cols = cols[0]
@@ -461,7 +452,7 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc()).collect()
- [Row(name=u'Alice'), Row(name=u'Tom')]
+ [Row(name='Alice'), Row(name='Tom')]
"""
_asc_nulls_first_doc = """
Returns a sort expression based on ascending order of the column, and null values
@@ -470,7 +461,7 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
- [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')]
+ [Row(name=None), Row(name='Alice'), Row(name='Tom')]
.. versionadded:: 2.4
"""
@@ -481,7 +472,7 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
- [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)]
+ [Row(name='Alice'), Row(name='Tom'), Row(name=None)]
.. versionadded:: 2.4
"""
@@ -491,7 +482,7 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc()).collect()
- [Row(name=u'Tom'), Row(name=u'Alice')]
+ [Row(name='Tom'), Row(name='Alice')]
"""
_desc_nulls_first_doc = """
Returns a sort expression based on the descending order of the column, and null values
@@ -500,7 +491,7 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
- [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')]
+ [Row(name=None), Row(name='Tom'), Row(name='Alice')]
.. versionadded:: 2.4
"""
@@ -511,37 +502,37 @@ class Column(object):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
- [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)]
+ [Row(name='Tom'), Row(name='Alice'), Row(name=None)]
.. versionadded:: 2.4
"""
- asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc))
- asc_nulls_first = ignore_unicode_prefix(_unary_op("asc_nulls_first", _asc_nulls_first_doc))
- asc_nulls_last = ignore_unicode_prefix(_unary_op("asc_nulls_last", _asc_nulls_last_doc))
- desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc))
- desc_nulls_first = ignore_unicode_prefix(_unary_op("desc_nulls_first", _desc_nulls_first_doc))
- desc_nulls_last = ignore_unicode_prefix(_unary_op("desc_nulls_last", _desc_nulls_last_doc))
+ asc = _unary_op("asc", _asc_doc)
+ asc_nulls_first = _unary_op("asc_nulls_first", _asc_nulls_first_doc)
+ asc_nulls_last = _unary_op("asc_nulls_last", _asc_nulls_last_doc)
+ desc = _unary_op("desc", _desc_doc)
+ desc_nulls_first = _unary_op("desc_nulls_first", _desc_nulls_first_doc)
+ desc_nulls_last = _unary_op("desc_nulls_last", _desc_nulls_last_doc)
_isNull_doc = """
True if the current expression is null.
>>> from pyspark.sql import Row
- >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
+ >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNull()).collect()
- [Row(height=None, name=u'Alice')]
+ [Row(name='Alice', height=None)]
"""
_isNotNull_doc = """
True if the current expression is NOT null.
>>> from pyspark.sql import Row
- >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
+ >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNotNull()).collect()
- [Row(height=80, name=u'Tom')]
+ [Row(name='Tom', height=80)]
"""
- isNull = ignore_unicode_prefix(_unary_op("isNull", _isNull_doc))
- isNotNull = ignore_unicode_prefix(_unary_op("isNotNull", _isNotNull_doc))
+ isNull = _unary_op("isNull", _isNull_doc)
+ isNotNull = _unary_op("isNotNull", _isNotNull_doc)
@since(1.3)
def alias(self, *alias, **kwargs):
@@ -581,17 +572,16 @@ class Column(object):
name = copy_func(alias, sinceversion=2.0, doc=":func:`name` is an alias for :func:`alias`.")
- @ignore_unicode_prefix
@since(1.3)
def cast(self, dataType):
""" Convert the column into type ``dataType``.
>>> df.select(df.age.cast("string").alias('ages')).collect()
- [Row(ages=u'2'), Row(ages=u'5')]
+ [Row(ages='2'), Row(ages='5')]
>>> df.select(df.age.cast(StringType()).alias('ages')).collect()
- [Row(ages=u'2'), Row(ages=u'5')]
+ [Row(ages='2'), Row(ages='5')]
"""
- if isinstance(dataType, basestring):
+ if isinstance(dataType, str):
jc = self._jc.cast(dataType)
elif isinstance(dataType, DataType):
from pyspark.sql import SparkSession
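
Aside: with basestring gone, every "string or Column" dispatch in column.py
reduces to a single isinstance check against str. A hypothetical standalone
sketch of the pattern (classify_column_arg is illustrative, not a pyspark
function):

def classify_column_arg(col):
    # A name becomes a column reference; anything else must already be
    # column-like, as in _to_java_column above.
    if isinstance(col, str):
        return "name:%s" % col
    raise TypeError(
        "Invalid argument, not a string or column: %r of type %s" % (col, type(col)))

assert classify_column_arg("age") == "name:age"
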
diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py
index 71ea163171..eab084a1fa 100644
--- a/python/pyspark/sql/conf.py
+++ b/python/pyspark/sql/conf.py
@@ -18,10 +18,6 @@
import sys
from pyspark import since, _NoValue
-from pyspark.rdd import ignore_unicode_prefix
-
-if sys.version_info[0] >= 3:
- basestring = str
class RuntimeConfig(object):
@@ -34,13 +30,11 @@ class RuntimeConfig(object):
"""Create a new RuntimeConfig that wraps the underlying JVM object."""
self._jconf = jconf
- @ignore_unicode_prefix
@since(2.0)
def set(self, key, value):
"""Sets the given Spark runtime configuration property."""
self._jconf.set(key, value)
- @ignore_unicode_prefix
@since(2.0)
def get(self, key, default=_NoValue):
"""Returns the value of Spark runtime configuration property for the given key,
@@ -54,7 +48,6 @@ class RuntimeConfig(object):
self._checkType(default, "default")
return self._jconf.get(key, default)
- @ignore_unicode_prefix
@since(2.0)
def unset(self, key):
"""Resets the configuration property for the given key."""
@@ -62,11 +55,10 @@ class RuntimeConfig(object):
def _checkType(self, obj, identifier):
"""Assert that an object is of type str."""
- if not isinstance(obj, basestring):
+ if not isinstance(obj, str):
raise TypeError("expected %s '%s' to be a string (was '%s')" %
(identifier, obj, type(obj).__name__))
- @ignore_unicode_prefix
@since(2.4)
def isModifiable(self, key):
"""Indicates whether the configuration property with the given key
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 956343a231..7fbcf85cb1 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -15,15 +15,10 @@
# limitations under the License.
#
-from __future__ import print_function
import sys
import warnings
-if sys.version >= '3':
- basestring = unicode = str
-
from pyspark import since, _NoValue
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.sql.session import _monkey_patch_RDD, SparkSession
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.readwriter import DataFrameReader
@@ -52,7 +47,6 @@ class SQLContext(object):
_instantiatedContext = None
- @ignore_unicode_prefix
def __init__(self, sparkContext, sparkSession=None, jsqlContext=None):
"""Creates a new SQLContext.
@@ -70,7 +64,7 @@ class SQLContext(object):
[Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
>>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
- [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
+ [(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
"""
warnings.warn(
"Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
@@ -142,7 +136,6 @@ class SQLContext(object):
"""
self.sparkSession.conf.set(key, value)
- @ignore_unicode_prefix
@since(1.3)
def getConf(self, key, defaultValue=_NoValue):
"""Returns the value of Spark SQL configuration property for the given key.
@@ -152,12 +145,12 @@ class SQLContext(object):
the system default value.
>>> sqlContext.getConf("spark.sql.shuffle.partitions")
- u'200'
- >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
- u'10'
- >>> sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
- >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
- u'50'
+ '200'
+ >>> sqlContext.getConf("spark.sql.shuffle.partitions", "10")
+ '10'
+ >>> sqlContext.setConf("spark.sql.shuffle.partitions", "50")
+ >>> sqlContext.getConf("spark.sql.shuffle.partitions", "10")
+ '50'
"""
return self.sparkSession.conf.get(key, defaultValue)
@@ -229,7 +222,6 @@ class SQLContext(object):
return self.sparkSession._inferSchema(rdd, samplingRatio)
@since(1.3)
- @ignore_unicode_prefix
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
"""
Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
@@ -274,27 +266,27 @@ class SQLContext(object):
>>> l = [('Alice', 1)]
>>> sqlContext.createDataFrame(l).collect()
- [Row(_1=u'Alice', _2=1)]
+ [Row(_1='Alice', _2=1)]
>>> sqlContext.createDataFrame(l, ['name', 'age']).collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> d = [{'name': 'Alice', 'age': 1}]
>>> sqlContext.createDataFrame(d).collect()
- [Row(age=1, name=u'Alice')]
+ [Row(age=1, name='Alice')]
>>> rdd = sc.parallelize(l)
>>> sqlContext.createDataFrame(rdd).collect()
- [Row(_1=u'Alice', _2=1)]
+ [Row(_1='Alice', _2=1)]
>>> df = sqlContext.createDataFrame(rdd, ['name', 'age'])
>>> df.collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> from pyspark.sql import Row
>>> Person = Row('name', 'age')
>>> person = rdd.map(lambda r: Person(*r))
>>> df2 = sqlContext.createDataFrame(person)
>>> df2.collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> from pyspark.sql.types import *
>>> schema = StructType([
@@ -302,15 +294,15 @@ class SQLContext(object):
... StructField("age", IntegerType(), True)])
>>> df3 = sqlContext.createDataFrame(rdd, schema)
>>> df3.collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> sqlContext.createDataFrame(df.toPandas()).collect() # doctest: +SKIP
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP
[Row(0=1, 1=2)]
>>> sqlContext.createDataFrame(rdd, "a: string, b: int").collect()
- [Row(a=u'Alice', b=1)]
+ [Row(a='Alice', b=1)]
>>> rdd = rdd.map(lambda row: row[1])
>>> sqlContext.createDataFrame(rdd, "int").collect()
[Row(value=1)]
@@ -358,7 +350,6 @@ class SQLContext(object):
return self.sparkSession.catalog.createExternalTable(
tableName, path, source, schema, **options)
- @ignore_unicode_prefix
@since(1.0)
def sql(self, sqlQuery):
"""Returns a :class:`DataFrame` representing the result of the given query.
@@ -368,7 +359,7 @@ class SQLContext(object):
>>> sqlContext.registerDataFrameAsTable(df, "table1")
>>> df2 = sqlContext.sql("SELECT field1 AS f1, field2 as f2 from table1")
>>> df2.collect()
- [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
+ [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')]
"""
return self.sparkSession.sql(sqlQuery)
@@ -385,7 +376,6 @@ class SQLContext(object):
"""
return self.sparkSession.table(tableName)
- @ignore_unicode_prefix
@since(1.3)
def tables(self, dbName=None):
"""Returns a :class:`DataFrame` containing names of tables in the given database.
@@ -401,7 +391,7 @@ class SQLContext(object):
>>> sqlContext.registerDataFrameAsTable(df, "table1")
>>> df2 = sqlContext.tables()
>>> df2.filter("tableName = 'table1'").first()
- Row(database=u'', tableName=u'table1', isTemporary=True)
+ Row(database='', tableName='table1', isTemporary=True)
"""
if dbName is None:
return DataFrame(self._ssql_ctx.tables(), self)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 3ad899bcc3..023fbeabcb 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -17,21 +17,12 @@
import sys
import random
-
-if sys.version >= '3':
- basestring = unicode = str
- long = int
- from functools import reduce
- from html import escape as html_escape
-else:
- from itertools import imap as map
- from cgi import escape as html_escape
-
import warnings
+from functools import reduce
+from html import escape as html_escape
from pyspark import copy_func, since, _NoValue
-from pyspark.rdd import RDD, _load_from_socket, _local_iterator_from_socket, \
- ignore_unicode_prefix
+from pyspark.rdd import RDD, _load_from_socket, _local_iterator_from_socket
from pyspark.serializers import BatchedSerializer, PickleSerializer, \
UTF8Deserializer
from pyspark.storagelevel import StorageLevel
@@ -109,7 +100,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
"""
return DataFrameStatFunctions(self)
- @ignore_unicode_prefix
@since(1.3)
def toJSON(self, use_unicode=True):
"""Converts a :class:`DataFrame` into a :class:`RDD` of string.
@@ -117,7 +107,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
Each row is turned into a JSON document as one element in the returned RDD.
>>> df.toJSON().first()
- u'{"age":2,"name":"Alice"}'
+ '{"age":2,"name":"Alice"}'
"""
rdd = self._jdf.toJSON()
return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))
@@ -330,11 +320,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
# For the case when extended is mode:
# df.explain("formatted")
- is_extended_as_mode = isinstance(extended, basestring) and mode is None
+ is_extended_as_mode = isinstance(extended, str) and mode is None
# For the mode specified:
# df.explain(mode="formatted")
- is_mode_case = extended is None and isinstance(mode, basestring)
+ is_mode_case = extended is None and isinstance(mode, str)
if not (is_no_argument or is_extended_case or is_extended_as_mode or is_mode_case):
argtypes = [
@@ -568,7 +558,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if not isinstance(name, str):
raise TypeError("name should be provided as str, got {0}".format(type(name)))
- allowed_types = (basestring, list, float, int)
+ allowed_types = (str, list, float, int)
for p in parameters:
if not isinstance(p, allowed_types):
raise TypeError(
@@ -587,19 +577,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
"""
return int(self._jdf.count())
- @ignore_unicode_prefix
@since(1.3)
def collect(self):
"""Returns all the records as a list of :class:`Row`.
>>> df.collect()
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
"""
with SCCallSiteSync(self._sc) as css:
sock_info = self._jdf.collectToPython()
return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
- @ignore_unicode_prefix
@since(2.0)
def toLocalIterator(self, prefetchPartitions=False):
"""
@@ -612,36 +600,33 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
before it is needed.
>>> list(df.toLocalIterator())
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
"""
with SCCallSiteSync(self._sc) as css:
sock_info = self._jdf.toPythonIterator(prefetchPartitions)
return _local_iterator_from_socket(sock_info, BatchedSerializer(PickleSerializer()))
- @ignore_unicode_prefix
@since(1.3)
def limit(self, num):
"""Limits the result count to the number specified.
>>> df.limit(1).collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
>>> df.limit(0).collect()
[]
"""
jdf = self._jdf.limit(num)
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def take(self, num):
"""Returns the first ``num`` rows as a :class:`list` of :class:`Row`.
>>> df.take(2)
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
"""
return self.limit(num).collect()
- @ignore_unicode_prefix
@since(3.0)
def tail(self, num):
"""
@@ -651,7 +636,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
a very large ``num`` can crash the driver process with OutOfMemoryError.
>>> df.tail(1)
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
"""
with SCCallSiteSync(self._sc):
sock_info = self._jdf.tailToPython(num)
@@ -818,7 +803,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
else:
return DataFrame(
self._jdf.repartition(numPartitions, self._jcols(*cols)), self.sql_ctx)
- elif isinstance(numPartitions, (basestring, Column)):
+ elif isinstance(numPartitions, (str, Column)):
cols = (numPartitions, ) + cols
return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sql_ctx)
else:
@@ -869,7 +854,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
else:
return DataFrame(
self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), self.sql_ctx)
- elif isinstance(numPartitions, (basestring, Column)):
+ elif isinstance(numPartitions, (str, Column)):
cols = (numPartitions,) + cols
return DataFrame(self._jdf.repartitionByRange(self._jcols(*cols)), self.sql_ctx)
else:
@@ -944,7 +929,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
fraction = withReplacement
withReplacement = None
- seed = long(seed) if seed is not None else None
+ seed = int(seed) if seed is not None else None
args = [arg for arg in [withReplacement, fraction, seed] if arg is not None]
jdf = self._jdf.sample(*args)
return DataFrame(jdf, self.sql_ctx)
@@ -978,15 +963,15 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
.. versionchanged:: 3.0
Added sampling by a column of :class:`Column`
"""
- if isinstance(col, basestring):
+ if isinstance(col, str):
col = Column(col)
elif not isinstance(col, Column):
raise ValueError("col must be a string or a column, but got %r" % type(col))
if not isinstance(fractions, dict):
raise ValueError("fractions must be a dict but got %r" % type(fractions))
for k, v in fractions.items():
- if not isinstance(k, (float, int, long, basestring)):
- raise ValueError("key must be float, int, long, or string, but got %r" % type(k))
+ if not isinstance(k, (float, int, str)):
+ raise ValueError("key must be float, int, or string, but got %r" % type(k))
fractions[k] = float(v)
col = col._jc
seed = seed if seed is not None else random.randint(0, sys.maxsize)
@@ -1011,7 +996,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if w < 0.0:
raise ValueError("Weights must be positive. Found weight value: %s" % w)
seed = seed if seed is not None else random.randint(0, sys.maxsize)
- rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), long(seed))
+ rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), int(seed))
return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array]
@property
@@ -1052,12 +1037,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
| 3|
+----+
"""
- if not isinstance(colName, basestring):
+ if not isinstance(colName, str):
raise ValueError("colName should be provided as string")
jc = self._jdf.colRegex(colName)
return Column(jc)
- @ignore_unicode_prefix
@since(1.3)
def alias(self, alias):
"""Returns a new :class:`DataFrame` with an alias set.
@@ -1070,12 +1054,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
>>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
>>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age") \
.sort(desc("df_as1.name")).collect()
- [Row(name=u'Bob', name=u'Bob', age=5), Row(name=u'Alice', name=u'Alice', age=2)]
+ [Row(name='Bob', name='Bob', age=5), Row(name='Alice', name='Alice', age=2)]
"""
- assert isinstance(alias, basestring), "alias should be a string"
+ assert isinstance(alias, str), "alias should be a string"
return DataFrame(getattr(self._jdf, "as")(alias), self.sql_ctx)
- @ignore_unicode_prefix
@since(2.1)
def crossJoin(self, other):
"""Returns the cartesian product with another :class:`DataFrame`.
@@ -1083,18 +1066,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
:param other: Right side of the cartesian product.
>>> df.select("age", "name").collect()
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
>>> df2.select("name", "height").collect()
- [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85)]
+ [Row(name='Tom', height=80), Row(name='Bob', height=85)]
>>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect()
- [Row(age=2, name=u'Alice', height=80), Row(age=2, name=u'Alice', height=85),
- Row(age=5, name=u'Bob', height=80), Row(age=5, name=u'Bob', height=85)]
+ [Row(age=2, name='Alice', height=80), Row(age=2, name='Alice', height=85),
+ Row(age=5, name='Bob', height=80), Row(age=5, name='Bob', height=85)]
"""
jdf = self._jdf.crossJoin(other._jdf)
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def join(self, other, on=None, how=None):
"""Joins with another :class:`DataFrame`, using the given join expression.
@@ -1113,27 +1095,27 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
>>> from pyspark.sql.functions import desc
>>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height) \
.sort(desc("name")).collect()
- [Row(name=u'Bob', height=85), Row(name=u'Alice', height=None), Row(name=None, height=80)]
+ [Row(name='Bob', height=85), Row(name='Alice', height=None), Row(name=None, height=80)]
>>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).collect()
- [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)]
+ [Row(name='Tom', height=80), Row(name='Bob', height=85), Row(name='Alice', height=None)]
>>> cond = [df.name == df3.name, df.age == df3.age]
>>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect()
- [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
+ [Row(name='Alice', age=2), Row(name='Bob', age=5)]
>>> df.join(df2, 'name').select(df.name, df2.height).collect()
- [Row(name=u'Bob', height=85)]
+ [Row(name='Bob', height=85)]
>>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect()
- [Row(name=u'Bob', age=5)]
+ [Row(name='Bob', age=5)]
"""
if on is not None and not isinstance(on, list):
on = [on]
if on is not None:
- if isinstance(on[0], basestring):
+ if isinstance(on[0], str):
on = self._jseq(on)
else:
assert isinstance(on[0], Column), "on should be Column or list of Column"
@@ -1147,7 +1129,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
how = "inner"
if on is None:
on = self._jseq([])
- assert isinstance(how, basestring), "how should be basestring"
+ assert isinstance(how, str), "how should be a string"
jdf = self._jdf.join(other._jdf, on, how)
return DataFrame(jdf, self.sql_ctx)
@@ -1171,7 +1153,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs))
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def sort(self, *cols, **kwargs):
"""Returns a new :class:`DataFrame` sorted by the specified column(s).
@@ -1182,18 +1163,18 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
If a list is specified, the length of the list must equal the length of `cols`.
>>> df.sort(df.age.desc()).collect()
- [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+ [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
>>> df.sort("age", ascending=False).collect()
- [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+ [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
>>> df.orderBy(df.age.desc()).collect()
- [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+ [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
>>> from pyspark.sql.functions import *
>>> df.sort(asc("age")).collect()
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
>>> df.orderBy(desc("age"), "name").collect()
- [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+ [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
>>> df.orderBy(["age", "name"], ascending=[0, 1]).collect()
- [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+ [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
"""
jdf = self._jdf.sort(self._sort_cols(cols, kwargs))
return DataFrame(jdf, self.sql_ctx)
@@ -1333,7 +1314,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
jdf = self._jdf.summary(self._jseq(statistics))
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def head(self, n=None):
"""Returns the first ``n`` rows.
@@ -1346,26 +1326,24 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
If n is 1, return a single Row.
>>> df.head()
- Row(age=2, name=u'Alice')
+ Row(age=2, name='Alice')
>>> df.head(1)
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
"""
if n is None:
rs = self.head(1)
return rs[0] if rs else None
return self.take(n)
- @ignore_unicode_prefix
@since(1.3)
def first(self):
"""Returns the first row as a :class:`Row`.
>>> df.first()
- Row(age=2, name=u'Alice')
+ Row(age=2, name='Alice')
"""
return self.head()
- @ignore_unicode_prefix
@since(1.3)
def __getitem__(self, item):
"""Returns the column as a :class:`Column`.
@@ -1373,13 +1351,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
>>> df.select(df['age']).collect()
[Row(age=2), Row(age=5)]
>>> df[ ["name", "age"]].collect()
- [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
+ [Row(name='Alice', age=2), Row(name='Bob', age=5)]
>>> df[ df.age > 3 ].collect()
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
>>> df[df[0] > 3].collect()
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
"""
- if isinstance(item, basestring):
+ if isinstance(item, str):
jc = self._jdf.apply(item)
return Column(jc)
elif isinstance(item, Column):
@@ -1405,7 +1383,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
jc = self._jdf.apply(name)
return Column(jc)
- @ignore_unicode_prefix
@since(1.3)
def select(self, *cols):
"""Projects a set of expressions and returns a new :class:`DataFrame`.
@@ -1415,11 +1392,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
in the current :class:`DataFrame`.
>>> df.select('*').collect()
- [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
+ [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
>>> df.select('name', 'age').collect()
- [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
+ [Row(name='Alice', age=2), Row(name='Bob', age=5)]
>>> df.select(df.name, (df.age + 10).alias('age')).collect()
- [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
+ [Row(name='Alice', age=12), Row(name='Bob', age=15)]
"""
jdf = self._jdf.select(self._jcols(*cols))
return DataFrame(jdf, self.sql_ctx)
@@ -1438,7 +1415,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
jdf = self._jdf.selectExpr(self._jseq(expr))
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def filter(self, condition):
"""Filters rows using the given condition.
@@ -1449,16 +1425,16 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
or a string of SQL expression.
>>> df.filter(df.age > 3).collect()
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
>>> df.where(df.age == 2).collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
>>> df.filter("age > 3").collect()
- [Row(age=5, name=u'Bob')]
+ [Row(age=5, name='Bob')]
>>> df.where("age = 2").collect()
- [Row(age=2, name=u'Alice')]
+ [Row(age=2, name='Alice')]
"""
- if isinstance(condition, basestring):
+ if isinstance(condition, str):
jdf = self._jdf.filter(condition)
elif isinstance(condition, Column):
jdf = self._jdf.filter(condition._jc)
@@ -1466,7 +1442,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
raise TypeError("condition should be string or Column")
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def groupBy(self, *cols):
"""Groups the :class:`DataFrame` using the specified columns,
@@ -1481,11 +1456,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
>>> df.groupBy().avg().collect()
[Row(avg(age)=3.5)]
>>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
- [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
+ [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
>>> sorted(df.groupBy(df.name).avg().collect())
- [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
+ [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
>>> sorted(df.groupBy(['name', df.age]).count().collect())
- [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)]
+ [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)]
"""
jgd = self._jdf.groupBy(self._jcols(*cols))
from pyspark.sql.group import GroupedData
@@ -1655,19 +1630,19 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
... Row(name='Alice', age=5, height=80), \\
... Row(name='Alice', age=10, height=80)]).toDF()
>>> df.dropDuplicates().show()
- +---+------+-----+
- |age|height| name|
- +---+------+-----+
- | 5| 80|Alice|
- | 10| 80|Alice|
- +---+------+-----+
+ +-----+---+------+
+ | name|age|height|
+ +-----+---+------+
+ |Alice| 5| 80|
+ |Alice| 10| 80|
+ +-----+---+------+
>>> df.dropDuplicates(['name', 'height']).show()
- +---+------+-----+
- |age|height| name|
- +---+------+-----+
- | 5| 80|Alice|
- +---+------+-----+
+ +-----+---+------+
+ | name|age|height|
+ +-----+---+------+
+ |Alice| 5| 80|
+ +-----+---+------+
"""
if subset is None:
jdf = self._jdf.dropDuplicates()
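The expected column order in these doctests flips from `age|height|name` to `name|age|height` because, as of Spark 3.0 on Python 3.6+ (with the legacy sorting flag off, its default), `Row` no longer sorts keyword fields alphabetically; the keyword argument order is preserved. For illustration (requires a pyspark installation):

```python
from pyspark.sql import Row

r = Row(name='Alice', age=5, height=80)
# Spark < 3.0 (or Python 2): fields sorted -> Row(age=5, height=80, name='Alice')
# Spark 3.0 on Python 3.6+: declared order -> Row(name='Alice', age=5, height=80)
print(r)
```

The fixture rewrites near the end of this file (`df2` through `df5`) make the same adjustment so the doctests stay deterministic.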
@@ -1700,7 +1675,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if subset is None:
subset = self.columns
- elif isinstance(subset, basestring):
+ elif isinstance(subset, str):
subset = [subset]
elif not isinstance(subset, (list, tuple)):
raise ValueError("subset should be a list or tuple of column names")
@@ -1715,11 +1690,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
"""Replace null values, alias for ``na.fill()``.
:func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.
- :param value: int, long, float, string, bool or dict.
+ :param value: int, float, string, bool or dict.
Value to replace null values with.
If the value is a dict, then `subset` is ignored and `value` must be a mapping
from column name (string) to replacement value. The replacement value must be
- an int, long, float, boolean, or string.
+ an int, float, boolean, or string.
:param subset: optional list of column names to consider.
Columns specified in subset that do not have matching data type are ignored.
For example, if `value` is a string, and subset contains a non-string column,
@@ -1754,13 +1729,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
| 50| null|unknown|
+---+------+-------+
"""
- if not isinstance(value, (float, int, long, basestring, bool, dict)):
- raise ValueError("value should be a float, int, long, string, bool or dict")
+ if not isinstance(value, (float, int, str, bool, dict)):
+ raise ValueError("value should be a float, int, string, bool or dict")
# Note that bool validates isinstance(int), but we don't want to
# convert bools to floats
- if not isinstance(value, bool) and isinstance(value, (int, long)):
+ if not isinstance(value, bool) and isinstance(value, int):
value = float(value)
if isinstance(value, dict):
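The surviving comment above is worth spelling out: `bool` is a subclass of `int` in Python, so a bare `isinstance(value, int)` matches `True`/`False` and would silently coerce them to floats. A minimal demonstration of the guard `fillna` keeps:

```python
value = True
print(isinstance(value, int))   # True: bool subclasses int

if not isinstance(value, bool) and isinstance(value, int):
    value = float(value)
print(value)                    # still True, not 1.0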
@@ -1768,7 +1743,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
elif subset is None:
return DataFrame(self._jdf.na().fill(value), self.sql_ctx)
else:
- if isinstance(subset, basestring):
+ if isinstance(subset, str):
subset = [subset]
elif not isinstance(subset, (list, tuple)):
raise ValueError("subset should be a list or tuple of column names")
@@ -1787,12 +1762,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)
an arbitrary replacement will be used.
- :param to_replace: bool, int, long, float, string, list or dict.
+ :param to_replace: bool, int, float, string, list or dict.
Value to be replaced.
If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`
must be a mapping between a value and a replacement.
- :param value: bool, int, long, float, string, list or None.
- The replacement value must be a bool, int, long, float, string or None. If `value` is a
+ :param value: bool, int, float, string, list or None.
+ The replacement value must be a bool, int, float, string or None. If `value` is a
list, `value` should be of the same length and type as `to_replace`.
If `value` is a scalar and `to_replace` is a sequence, then `value` is
used as a replacement for each item in `to_replace`.
@@ -1854,7 +1829,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
>>> all_of(bool)([True, False])
True
- >>> all_of(basestring)(["a", 1])
+ >>> all_of(str)(["a", 1])
False
"""
def all_of_(xs):
@@ -1862,20 +1837,20 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
return all_of_
all_of_bool = all_of(bool)
- all_of_str = all_of(basestring)
- all_of_numeric = all_of((float, int, long))
+ all_of_str = all_of(str)
+ all_of_numeric = all_of((float, int))
# Validate input types
- valid_types = (bool, float, int, long, basestring, list, tuple)
+ valid_types = (bool, float, int, str, list, tuple)
if not isinstance(to_replace, valid_types + (dict, )):
raise ValueError(
- "to_replace should be a bool, float, int, long, string, list, tuple, or dict. "
+ "to_replace should be a bool, float, int, string, list, tuple, or dict. "
"Got {0}".format(type(to_replace)))
if not isinstance(value, valid_types) and value is not None \
and not isinstance(to_replace, dict):
raise ValueError("If to_replace is not a dict, value should be "
- "a bool, float, int, long, string, list, tuple or None. "
+ "a bool, float, int, string, list, tuple or None. "
"Got {0}".format(type(value)))
if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):
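The validators in the hunk above come from a small higher-order helper defined inside `replace` (its doctest is what changes here). A self-contained equivalent of the pattern:

```python
def all_of(types):
    """Build a predicate that checks every element against `types`."""
    def all_of_(xs):
        return all(isinstance(x, types) for x in xs)
    return all_of_

all_of_str = all_of(str)
all_of_numeric = all_of((float, int))
print(all_of_str(["a", "b"]))      # True
print(all_of_str(["a", 1]))        # False
print(all_of_numeric([1, 2.5]))    # True
```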
@@ -1883,12 +1858,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
raise ValueError("to_replace and value lists should be of the same length. "
"Got {0} and {1}".format(len(to_replace), len(value)))
- if not (subset is None or isinstance(subset, (list, tuple, basestring))):
+ if not (subset is None or isinstance(subset, (list, tuple, str))):
raise ValueError("subset should be a list or tuple of column names, "
"column name or None. Got {0}".format(type(subset)))
# Reshape input arguments if necessary
- if isinstance(to_replace, (float, int, long, basestring)):
+ if isinstance(to_replace, (float, int, str)):
to_replace = [to_replace]
if isinstance(to_replace, dict):
@@ -1896,11 +1871,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if value is not None:
warnings.warn("to_replace is a dict and value is not None. value will be ignored.")
else:
- if isinstance(value, (float, int, long, basestring)) or value is None:
+ if isinstance(value, (float, int, str)) or value is None:
value = [value for _ in range(len(to_replace))]
rep_dict = dict(zip(to_replace, value))
- if isinstance(subset, basestring):
+ if isinstance(subset, str):
subset = [subset]
# Verify we were not passed in mixed type generics.
@@ -1957,10 +1932,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
Added support for multiple columns.
"""
- if not isinstance(col, (basestring, list, tuple)):
+ if not isinstance(col, (str, list, tuple)):
raise ValueError("col should be a string, list or tuple, but got %r" % type(col))
- isStr = isinstance(col, basestring)
+ isStr = isinstance(col, str)
if isinstance(col, tuple):
col = list(col)
@@ -1968,7 +1943,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
col = [col]
for c in col:
- if not isinstance(c, basestring):
+ if not isinstance(c, str):
raise ValueError("columns should be strings, but got %r" % type(c))
col = _to_list(self._sc, col)
@@ -1977,12 +1952,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if isinstance(probabilities, tuple):
probabilities = list(probabilities)
for p in probabilities:
- if not isinstance(p, (float, int, long)) or p < 0 or p > 1:
- raise ValueError("probabilities should be numerical (float, int, long) in [0,1].")
+ if not isinstance(p, (float, int)) or p < 0 or p > 1:
+ raise ValueError("probabilities should be numerical (float, int) in [0,1].")
probabilities = _to_list(self._sc, probabilities)
- if not isinstance(relativeError, (float, int, long)) or relativeError < 0:
- raise ValueError("relativeError should be numerical (float, int, long) >= 0.")
+ if not isinstance(relativeError, (float, int)) or relativeError < 0:
+ raise ValueError("relativeError should be numerical (float, int) >= 0.")
relativeError = float(relativeError)
jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
@@ -2000,9 +1975,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
:param col2: The name of the second column
:param method: The correlation method. Currently only supports "pearson"
"""
- if not isinstance(col1, basestring):
+ if not isinstance(col1, str):
raise ValueError("col1 should be a string.")
- if not isinstance(col2, basestring):
+ if not isinstance(col2, str):
raise ValueError("col2 should be a string.")
if not method:
method = "pearson"
@@ -2020,9 +1995,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
:param col1: The name of the first column
:param col2: The name of the second column
"""
- if not isinstance(col1, basestring):
+ if not isinstance(col1, str):
raise ValueError("col1 should be a string.")
- if not isinstance(col2, basestring):
+ if not isinstance(col2, str):
raise ValueError("col2 should be a string.")
return self._jdf.stat().cov(col1, col2)
@@ -2042,9 +2017,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
:param col2: The name of the second column. Distinct items will make the column names
of the :class:`DataFrame`.
"""
- if not isinstance(col1, basestring):
+ if not isinstance(col1, str):
raise ValueError("col1 should be a string.")
- if not isinstance(col2, basestring):
+ if not isinstance(col2, str):
raise ValueError("col2 should be a string.")
return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx)
@@ -2073,7 +2048,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
support = 0.01
return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def withColumn(self, colName, col):
"""
@@ -2092,13 +2066,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
To avoid this, use :func:`select` with multiple columns at once.
>>> df.withColumn('age2', df.age + 2).collect()
- [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)]
+ [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)]
"""
assert isinstance(col, Column), "col should be Column"
return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)
- @ignore_unicode_prefix
@since(1.3)
def withColumnRenamed(self, existing, new):
"""Returns a new :class:`DataFrame` by renaming an existing column.
@@ -2108,12 +2081,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
:param new: string, new name of the column.
>>> df.withColumnRenamed('age', 'age2').collect()
- [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')]
+ [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')]
"""
return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx)
@since(1.4)
- @ignore_unicode_prefix
def drop(self, *cols):
"""Returns a new :class:`DataFrame` that drops the specified column.
This is a no-op if schema doesn't contain the given column name(s).
@@ -2122,23 +2094,23 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
:class:`Column` to drop, or a list of string names of the columns to drop.
>>> df.drop('age').collect()
- [Row(name=u'Alice'), Row(name=u'Bob')]
+ [Row(name='Alice'), Row(name='Bob')]
>>> df.drop(df.age).collect()
- [Row(name=u'Alice'), Row(name=u'Bob')]
+ [Row(name='Alice'), Row(name='Bob')]
>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
- [Row(age=5, height=85, name=u'Bob')]
+ [Row(age=5, height=85, name='Bob')]
>>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
- [Row(age=5, name=u'Bob', height=85)]
+ [Row(age=5, name='Bob', height=85)]
>>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
- [Row(name=u'Bob')]
+ [Row(name='Bob')]
"""
if len(cols) == 1:
col = cols[0]
- if isinstance(col, basestring):
+ if isinstance(col, str):
jdf = self._jdf.drop(col)
elif isinstance(col, Column):
jdf = self._jdf.drop(col._jc)
@@ -2146,20 +2118,19 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
raise TypeError("col should be a string or a Column")
else:
for col in cols:
- if not isinstance(col, basestring):
+ if not isinstance(col, str):
raise TypeError("each col in the param list should be a string")
jdf = self._jdf.drop(self._jseq(cols))
return DataFrame(jdf, self.sql_ctx)
- @ignore_unicode_prefix
def toDF(self, *cols):
"""Returns a new :class:`DataFrame` that with new specified column names
:param cols: list of new column names (string)
>>> df.toDF('f1', 'f2').collect()
- [Row(f1=2, f2=u'Alice'), Row(f1=5, f2=u'Bob')]
+ [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')]
"""
jdf = self._jdf.toDF(self._jseq(cols))
return DataFrame(jdf, self.sql_ctx)
@@ -2347,7 +2318,6 @@ def _test():
from pyspark.context import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
import pyspark.sql.dataframe
- from pyspark.sql.functions import from_unixtime
globs = pyspark.sql.dataframe.__dict__.copy()
sc = SparkContext('local[4]', 'PythonTest')
globs['sc'] = sc
@@ -2356,16 +2326,16 @@ def _test():
globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\
.toDF(StructType([StructField('age', IntegerType()),
StructField('name', StringType())]))
- globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
- globs['df3'] = sc.parallelize([Row(name='Alice', age=2),
- Row(name='Bob', age=5)]).toDF()
- globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
- Row(name='Bob', age=5, height=None),
- Row(name='Tom', age=None, height=None),
- Row(name=None, age=None, height=None)]).toDF()
- globs['df5'] = sc.parallelize([Row(name='Alice', spy=False, age=10),
- Row(name='Bob', spy=None, age=5),
- Row(name='Mallory', spy=True, age=None)]).toDF()
+ globs['df2'] = sc.parallelize([Row(height=80, name='Tom'), Row(height=85, name='Bob')]).toDF()
+ globs['df3'] = sc.parallelize([Row(age=2, name='Alice'),
+ Row(age=5, name='Bob')]).toDF()
+ globs['df4'] = sc.parallelize([Row(age=10, height=80, name='Alice'),
+ Row(age=5, height=None, name='Bob'),
+ Row(age=None, height=None, name='Tom'),
+ Row(age=None, height=None, name=None)]).toDF()
+ globs['df5'] = sc.parallelize([Row(age=10, name='Alice', spy=False),
+ Row(age=5, name='Bob', spy=None),
+ Row(age=None, name='Mallory', spy=True)]).toDF()
globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846),
Row(name='Bob', time=1479442946)]).toDF()
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index b5a7c18904..63b049999f 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -22,14 +22,8 @@ import sys
import functools
import warnings
-if sys.version < "3":
- from itertools import imap as map
-
-if sys.version >= '3':
- basestring = str
-
from pyspark import since, SparkContext
-from pyspark.rdd import ignore_unicode_prefix, PythonEvalType
+from pyspark.rdd import PythonEvalType
from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_from_literal, \
_create_column_from_name
from pyspark.sql.dataframe import DataFrame
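The deleted header block above follows the standard Python 2 compatibility idiom: rebind names at import time so the rest of the module can be written against Python 3 semantics. With Python 2 dropped, those guards are dead code; Python 3's builtin `map` is already the lazy iterator that `itertools.imap` provided:

```python
squares = map(lambda x: x * x, [1, 2, 3])
print(squares)        # a lazy map object on Python 3, not a list
print(list(squares))  # [1, 4, 9]
```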
@@ -88,14 +82,14 @@ def _create_binary_mathfunction(name, doc=""):
# if they are not columns or strings.
if isinstance(col1, Column):
arg1 = col1._jc
- elif isinstance(col1, basestring):
+ elif isinstance(col1, str):
arg1 = _create_column_from_name(col1)
else:
arg1 = float(col1)
if isinstance(col2, Column):
arg2 = col2._jc
- elif isinstance(col2, basestring):
+ elif isinstance(col2, str):
arg2 = _create_column_from_name(col2)
else:
arg2 = float(col2)
@@ -648,7 +642,6 @@ def percentile_approx(col, percentage, accuracy=10000):
return Column(sc._jvm.functions.percentile_approx(_to_java_column(col), percentage, accuracy))
-@ignore_unicode_prefix
@since(1.4)
def rand(seed=None):
"""Generates a random column with independent and identically distributed (i.i.d.) samples
@@ -657,8 +650,8 @@ def rand(seed=None):
.. note:: The function is non-deterministic in the general case.
>>> df.withColumn('rand', rand(seed=42) * 3).collect()
- [Row(age=2, name=u'Alice', rand=2.4052597283576684),
- Row(age=5, name=u'Bob', rand=2.3913904055683974)]
+ [Row(age=2, name='Alice', rand=2.4052597283576684),
+ Row(age=5, name='Bob', rand=2.3913904055683974)]
"""
sc = SparkContext._active_spark_context
if seed is not None:
@@ -668,7 +661,6 @@ def rand(seed=None):
return Column(jc)
-@ignore_unicode_prefix
@since(1.4)
def randn(seed=None):
"""Generates a column with independent and identically distributed (i.i.d.) samples from
@@ -677,8 +669,8 @@ def randn(seed=None):
.. note:: The function is non-deterministic in the general case.
>>> df.withColumn('randn', randn(seed=42)).collect()
- [Row(age=2, name=u'Alice', randn=1.1027054481455365),
- Row(age=5, name=u'Bob', randn=0.7400395449950132)]
+ [Row(age=2, name='Alice', randn=1.1027054481455365),
+ Row(age=5, name='Bob', randn=0.7400395449950132)]
"""
sc = SparkContext._active_spark_context
if seed is not None:
@@ -774,7 +766,6 @@ def expr(str):
return Column(sc._jvm.functions.expr(str))
-@ignore_unicode_prefix
@since(1.4)
def struct(*cols):
"""Creates a new struct column.
@@ -782,9 +773,9 @@ def struct(*cols):
:param cols: list of column names (string) or list of :class:`Column` expressions
>>> df.select(struct('age', 'name').alias("struct")).collect()
- [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
+ [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]
>>> df.select(struct([df.age, df.name]).alias("struct")).collect()
- [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
+ [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]
"""
sc = SparkContext._active_spark_context
if len(cols) == 1 and isinstance(cols[0], (list, set)):
@@ -879,14 +870,13 @@ def log2(col):
@since(1.5)
-@ignore_unicode_prefix
def conv(col, fromBase, toBase):
"""
Convert a number in a string column from one base to another.
>>> df = spark.createDataFrame([("010101",)], ['n'])
>>> df.select(conv(df.n, 2, 16).alias('hex')).collect()
- [Row(hex=u'15')]
+ [Row(hex='15')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase))
@@ -976,7 +966,6 @@ def current_timestamp():
return Column(sc._jvm.functions.current_timestamp())
-@ignore_unicode_prefix
@since(1.5)
def date_format(date, format):
"""
@@ -992,7 +981,7 @@ def date_format(date, format):
>>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
>>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
- [Row(date=u'04/08/2015')]
+ [Row(date='04/08/2015')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.date_format(_to_java_column(date), format))
@@ -1310,7 +1299,6 @@ def last_day(date):
return Column(sc._jvm.functions.last_day(_to_java_column(date)))
-@ignore_unicode_prefix
@since(1.5)
def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"):
"""
@@ -1321,7 +1309,7 @@ def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"):
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
>>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time'])
>>> time_df.select(from_unixtime('unix_time').alias('ts')).collect()
- [Row(ts=u'2015-04-08 00:00:00')]
+ [Row(ts='2015-04-08 00:00:00')]
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
sc = SparkContext._active_spark_context
@@ -1447,7 +1435,6 @@ def timestamp_seconds(col):
@since(2.0)
-@ignore_unicode_prefix
def window(timeColumn, windowDuration, slideDuration=None, startTime=None):
"""Bucketize rows into one or more time windows given a timestamp specifying column. Window
starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window
@@ -1471,7 +1458,7 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None):
>>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
>>> w.select(w.window.start.cast("string").alias("start"),
... w.window.end.cast("string").alias("end"), "sum").collect()
- [Row(start=u'2016-03-11 09:00:05', end=u'2016-03-11 09:00:10', sum=1)]
+ [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)]
"""
def check_string_field(field, fieldName):
if not field or type(field) is not str:
@@ -1498,7 +1485,6 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None):
# ---------------------------- misc functions ----------------------------------
@since(1.5)
-@ignore_unicode_prefix
def crc32(col):
"""
Calculates the cyclic redundancy check value (CRC32) of a binary column and
@@ -1511,33 +1497,30 @@ def crc32(col):
return Column(sc._jvm.functions.crc32(_to_java_column(col)))
-@ignore_unicode_prefix
@since(1.5)
def md5(col):
"""Calculates the MD5 digest and returns the value as a 32 character hex string.
>>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
- [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]
+ [Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.md5(_to_java_column(col))
return Column(jc)
-@ignore_unicode_prefix
@since(1.5)
def sha1(col):
"""Returns the hex string result of SHA-1.
>>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
- [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
+ [Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.sha1(_to_java_column(col))
return Column(jc)
-@ignore_unicode_prefix
@since(1.5)
def sha2(col, numBits):
"""Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,
@@ -1546,9 +1529,9 @@ def sha2(col, numBits):
>>> digests = df.select(sha2(df.name, 256).alias('s')).collect()
>>> digests[0]
- Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043')
+ Row(s='3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043')
>>> digests[1]
- Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961')
+ Row(s='cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961')
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.sha2(_to_java_column(col), numBits)
@@ -1600,7 +1583,6 @@ del _name, _doc
@since(1.5)
-@ignore_unicode_prefix
def concat_ws(sep, *cols):
"""
Concatenates multiple input string columns together into a single string column,
@@ -1608,7 +1590,7 @@ def concat_ws(sep, *cols):
>>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
>>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
- [Row(s=u'abcd-123')]
+ [Row(s='abcd-123')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column)))
@@ -1634,7 +1616,6 @@ def encode(col, charset):
return Column(sc._jvm.functions.encode(_to_java_column(col), charset))
-@ignore_unicode_prefix
@since(1.5)
def format_number(col, d):
"""
@@ -1645,13 +1626,12 @@ def format_number(col, d):
:param d: the N decimal places
>>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()
- [Row(v=u'5.0000')]
+ [Row(v='5.0000')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.format_number(_to_java_column(col), d))
-@ignore_unicode_prefix
@since(1.5)
def format_string(format, *cols):
"""
@@ -1663,7 +1643,7 @@ def format_string(format, *cols):
>>> df = spark.createDataFrame([(5, "hello")], ['a', 'b'])
>>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()
- [Row(v=u'5 hello')]
+ [Row(v='5 hello')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column)))
@@ -1721,7 +1701,6 @@ def overlay(src, replace, pos, len=-1):
@since(1.5)
-@ignore_unicode_prefix
def substring(str, pos, len):
"""
Substring starts at `pos` and is of length `len` when str is String type or
@@ -1732,14 +1711,13 @@ def substring(str, pos, len):
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(substring(df.s, 1, 2).alias('s')).collect()
- [Row(s=u'ab')]
+ [Row(s='ab')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len))
@since(1.5)
-@ignore_unicode_prefix
def substring_index(str, delim, count):
"""
Returns the substring from string str before count occurrences of the delimiter delim.
@@ -1749,15 +1727,14 @@ def substring_index(str, delim, count):
>>> df = spark.createDataFrame([('a.b.c.d',)], ['s'])
>>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
- [Row(s=u'a.b')]
+ [Row(s='a.b')]
>>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
- [Row(s=u'b.c.d')]
+ [Row(s='b.c.d')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count))
-@ignore_unicode_prefix
@since(1.5)
def levenshtein(left, right):
"""Computes the Levenshtein distance of the two given strings.
@@ -1792,49 +1769,45 @@ def locate(substr, str, pos=1):
@since(1.5)
-@ignore_unicode_prefix
def lpad(col, len, pad):
"""
Left-pad the string column to width `len` with `pad`.
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(lpad(df.s, 6, '#').alias('s')).collect()
- [Row(s=u'##abcd')]
+ [Row(s='##abcd')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad))
@since(1.5)
-@ignore_unicode_prefix
def rpad(col, len, pad):
"""
Right-pad the string column to width `len` with `pad`.
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(rpad(df.s, 6, '#').alias('s')).collect()
- [Row(s=u'abcd##')]
+ [Row(s='abcd##')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.rpad(_to_java_column(col), len, pad))
@since(1.5)
-@ignore_unicode_prefix
def repeat(col, n):
"""
Repeats a string column n times, and returns it as a new string column.
>>> df = spark.createDataFrame([('ab',)], ['s',])
>>> df.select(repeat(df.s, 3).alias('s')).collect()
- [Row(s=u'ababab')]
+ [Row(s='ababab')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.repeat(_to_java_column(col), n))
@since(1.5)
-@ignore_unicode_prefix
def split(str, pattern, limit=-1):
"""
Splits str around matches of the given pattern.
@@ -1855,15 +1828,14 @@ def split(str, pattern, limit=-1):
>>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
>>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect()
- [Row(s=[u'one', u'twoBthreeC'])]
+ [Row(s=['one', 'twoBthreeC'])]
>>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect()
- [Row(s=[u'one', u'two', u'three', u''])]
+ [Row(s=['one', 'two', 'three', ''])]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.split(_to_java_column(str), pattern, limit))
-@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
r"""Extract a specific group matched by a Java regex, from the specified string column.
@@ -1871,73 +1843,68 @@ def regexp_extract(str, pattern, idx):
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
- [Row(d=u'100')]
+ [Row(d='100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
- [Row(d=u'')]
+ [Row(d='')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
- [Row(d=u'')]
+ [Row(d='')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx)
return Column(jc)
-@ignore_unicode_prefix
@since(1.5)
def regexp_replace(str, pattern, replacement):
r"""Replace all substrings of the specified string value that match regexp with rep.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
- [Row(d=u'-----')]
+ [Row(d='-----')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement)
return Column(jc)
-@ignore_unicode_prefix
@since(1.5)
def initcap(col):
"""Translate the first letter of each word to upper case in the sentence.
>>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()
- [Row(v=u'Ab Cd')]
+ [Row(v='Ab Cd')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.initcap(_to_java_column(col)))
@since(1.5)
-@ignore_unicode_prefix
def soundex(col):
"""
Returns the SoundEx encoding for a string
>>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
>>> df.select(soundex(df.name).alias("soundex")).collect()
- [Row(soundex=u'P362'), Row(soundex=u'U612')]
+ [Row(soundex='P362'), Row(soundex='U612')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.soundex(_to_java_column(col)))
-@ignore_unicode_prefix
@since(1.5)
def bin(col):
"""Returns the string representation of the binary value of the given column.
>>> df.select(bin(df.age).alias('c')).collect()
- [Row(c=u'10'), Row(c=u'101')]
+ [Row(c='10'), Row(c='101')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.bin(_to_java_column(col))
return Column(jc)
-@ignore_unicode_prefix
@since(1.5)
def hex(col):
"""Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`,
@@ -1945,14 +1912,13 @@ def hex(col):
:class:`pyspark.sql.types.LongType`.
>>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()
- [Row(hex(a)=u'414243', hex(b)=u'3')]
+ [Row(hex(a)='414243', hex(b)='3')]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.hex(_to_java_column(col))
return Column(jc)
-@ignore_unicode_prefix
@since(1.5)
def unhex(col):
"""Inverse of hex. Interprets each pair of characters as a hexadecimal number
@@ -1965,7 +1931,6 @@ def unhex(col):
return Column(sc._jvm.functions.unhex(_to_java_column(col)))
-@ignore_unicode_prefix
@since(1.5)
def length(col):
"""Computes the character length of string data or number of bytes of binary data.
@@ -1979,7 +1944,6 @@ def length(col):
return Column(sc._jvm.functions.length(_to_java_column(col)))
-@ignore_unicode_prefix
@since(1.5)
def translate(srcCol, matching, replace):
"""A function translate any character in the `srcCol` by a character in `matching`.
@@ -1989,7 +1953,7 @@ def translate(srcCol, matching, replace):
>>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\
... .alias('r')).collect()
- [Row(r=u'1a2s3ae')]
+ [Row(r='1a2s3ae')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace))
@@ -1997,7 +1961,6 @@ def translate(srcCol, matching, replace):
# ---------------------- Collection functions ------------------------------
-@ignore_unicode_prefix
@since(2.0)
def create_map(*cols):
"""Creates a new map column.
@@ -2006,9 +1969,9 @@ def create_map(*cols):
grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).
>>> df.select(create_map('name', 'age').alias("map")).collect()
- [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})]
+ [Row(map={'Alice': 2}), Row(map={'Bob': 5})]
>>> df.select(create_map([df.name, df.age]).alias("map")).collect()
- [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})]
+ [Row(map={'Alice': 2}), Row(map={'Bob': 5})]
"""
sc = SparkContext._active_spark_context
if len(cols) == 1 and isinstance(cols[0], (list, set)):
@@ -2108,7 +2071,6 @@ def slice(x, start, length):
return Column(sc._jvm.functions.slice(_to_java_column(x), start, length))
-@ignore_unicode_prefix
@since(2.4)
def array_join(col, delimiter, null_replacement=None):
"""
@@ -2117,9 +2079,9 @@ def array_join(col, delimiter, null_replacement=None):
>>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data'])
>>> df.select(array_join(df.data, ",").alias("joined")).collect()
- [Row(joined=u'a,b,c'), Row(joined=u'a')]
+ [Row(joined='a,b,c'), Row(joined='a')]
>>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect()
- [Row(joined=u'a,b,c'), Row(joined=u'a,NULL')]
+ [Row(joined='a,b,c'), Row(joined='a,NULL')]
"""
sc = SparkContext._active_spark_context
if null_replacement is None:
@@ -2130,7 +2092,6 @@ def array_join(col, delimiter, null_replacement=None):
@since(1.5)
-@ignore_unicode_prefix
def concat(*cols):
"""
Concatenates multiple input columns together into a single column.
@@ -2138,7 +2099,7 @@ def concat(*cols):
>>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
>>> df.select(concat(df.s, df.d).alias('s')).collect()
- [Row(s=u'abcd123')]
+ [Row(s='abcd123')]
>>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])
>>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect()
@@ -2165,7 +2126,6 @@ def array_position(col, value):
return Column(sc._jvm.functions.array_position(_to_java_column(col), value))
-@ignore_unicode_prefix
@since(2.4)
def element_at(col, extraction):
"""
@@ -2179,7 +2139,7 @@ def element_at(col, extraction):
>>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
>>> df.select(element_at(df.data, 1)).collect()
- [Row(element_at(data, 1)=u'a'), Row(element_at(data, 1)=None)]
+ [Row(element_at(data, 1)='a'), Row(element_at(data, 1)=None)]
>>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({},)], ['data'])
>>> df.select(element_at(df.data, lit("a"))).collect()
@@ -2221,7 +2181,6 @@ def array_distinct(col):
return Column(sc._jvm.functions.array_distinct(_to_java_column(col)))
-@ignore_unicode_prefix
@since(2.4)
def array_intersect(col1, col2):
"""
@@ -2234,13 +2193,12 @@ def array_intersect(col1, col2):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
>>> df.select(array_intersect(df.c1, df.c2)).collect()
- [Row(array_intersect(c1, c2)=[u'a', u'c'])]
+ [Row(array_intersect(c1, c2)=['a', 'c'])]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.array_intersect(_to_java_column(col1), _to_java_column(col2)))
-@ignore_unicode_prefix
@since(2.4)
def array_union(col1, col2):
"""
@@ -2253,13 +2211,12 @@ def array_union(col1, col2):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
>>> df.select(array_union(df.c1, df.c2)).collect()
- [Row(array_union(c1, c2)=[u'b', u'a', u'c', u'd', u'f'])]
+ [Row(array_union(c1, c2)=['b', 'a', 'c', 'd', 'f'])]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.array_union(_to_java_column(col1), _to_java_column(col2)))
-@ignore_unicode_prefix
@since(2.4)
def array_except(col1, col2):
"""
@@ -2272,7 +2229,7 @@ def array_except(col1, col2):
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
>>> df.select(array_except(df.c1, df.c2)).collect()
- [Row(array_except(c1, c2)=[u'b'])]
+ [Row(array_except(c1, c2)=['b'])]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.array_except(_to_java_column(col1), _to_java_column(col2)))
@@ -2397,7 +2354,6 @@ def posexplode_outer(col):
return Column(jc)
-@ignore_unicode_prefix
@since(1.6)
def get_json_object(col, path):
"""
@@ -2411,14 +2367,13 @@ def get_json_object(col, path):
>>> df = spark.createDataFrame(data, ("key", "jstring"))
>>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\
... get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
- [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
+ [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.get_json_object(_to_java_column(col), path)
return Column(jc)
-@ignore_unicode_prefix
@since(1.6)
def json_tuple(col, *fields):
"""Creates a new row for a json column according to the given field names.
@@ -2429,14 +2384,13 @@ def json_tuple(col, *fields):
>>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
>>> df = spark.createDataFrame(data, ("key", "jstring"))
>>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()
- [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
+ [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields))
return Column(jc)
-@ignore_unicode_prefix
@since(2.1)
def from_json(col, schema, options={}):
"""
@@ -2460,7 +2414,7 @@ def from_json(col, schema, options={}):
>>> df.select(from_json(df.value, "a INT").alias("json")).collect()
[Row(json=Row(a=1))]
>>> df.select(from_json(df.value, "MAP").alias("json")).collect()
- [Row(json={u'a': 1})]
+ [Row(json={'a': 1})]
>>> data = [(1, '''[{"a": 1}]''')]
>>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
>>> df = spark.createDataFrame(data, ("key", "value"))
@@ -2485,7 +2439,6 @@ def from_json(col, schema, options={}):
return Column(jc)
-@ignore_unicode_prefix
@since(2.1)
def to_json(col, options={}):
"""
@@ -2499,26 +2452,26 @@ def to_json(col, options={}):
>>> from pyspark.sql import Row
>>> from pyspark.sql.types import *
- >>> data = [(1, Row(name='Alice', age=2))]
+ >>> data = [(1, Row(age=2, name='Alice'))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
- [Row(json=u'{"age":2,"name":"Alice"}')]
- >>> data = [(1, [Row(name='Alice', age=2), Row(name='Bob', age=3)])]
+ [Row(json='{"age":2,"name":"Alice"}')]
+ >>> data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
- [Row(json=u'[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]
+ [Row(json='[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]
>>> data = [(1, {"name": "Alice"})]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
- [Row(json=u'{"name":"Alice"}')]
+ [Row(json='{"name":"Alice"}')]
>>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
- [Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')]
+ [Row(json='[{"name":"Alice"},{"name":"Bob"}]')]
>>> data = [(1, ["Alice", "Bob"])]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
- [Row(json=u'["Alice","Bob"]')]
+ [Row(json='["Alice","Bob"]')]
"""
sc = SparkContext._active_spark_context
@@ -2526,7 +2479,6 @@ def to_json(col, options={}):
return Column(jc)
-@ignore_unicode_prefix
@since(2.4)
def schema_of_json(json, options={}):
"""
@@ -2540,12 +2492,12 @@ def schema_of_json(json, options={}):
>>> df = spark.range(1)
>>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect()
- [Row(json=u'struct<a:bigint>')]
+ [Row(json='struct<a:bigint>')]
>>> schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'})
>>> df.select(schema.alias("json")).collect()
- [Row(json=u'struct<a:bigint>')]
+ [Row(json='struct<a:bigint>')]
"""
- if isinstance(json, basestring):
+ if isinstance(json, str):
col = _create_column_from_literal(json)
elif isinstance(json, Column):
col = _to_java_column(json)
@@ -2557,7 +2509,6 @@ def schema_of_json(json, options={}):
return Column(jc)
-@ignore_unicode_prefix
@since(3.0)
def schema_of_csv(csv, options={}):
"""
@@ -2568,11 +2519,11 @@ def schema_of_csv(csv, options={}):
>>> df = spark.range(1)
>>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()
- [Row(csv=u'struct<_c0:int,_c1:string>')]
+ [Row(csv='struct<_c0:int,_c1:string>')]
>>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()
- [Row(csv=u'struct<_c0:int,_c1:string>')]
+ [Row(csv='struct<_c0:int,_c1:string>')]
"""
- if isinstance(csv, basestring):
+ if isinstance(csv, str):
col = _create_column_from_literal(csv)
elif isinstance(csv, Column):
col = _to_java_column(csv)
@@ -2584,7 +2535,6 @@ def schema_of_csv(csv, options={}):
return Column(jc)
-@ignore_unicode_prefix
@since(3.0)
def to_csv(col, options={}):
"""
@@ -2595,10 +2545,10 @@ def to_csv(col, options={}):
:param options: options to control converting. accepts the same options as the CSV datasource.
>>> from pyspark.sql import Row
- >>> data = [(1, Row(name='Alice', age=2))]
+ >>> data = [(1, Row(age=2, name='Alice'))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_csv(df.value).alias("csv")).collect()
- [Row(csv=u'2,Alice')]
+ [Row(csv='2,Alice')]
"""
sc = SparkContext._active_spark_context
@@ -2705,7 +2655,6 @@ def shuffle(col):
@since(1.5)
-@ignore_unicode_prefix
def reverse(col):
"""
Collection function: returns a reversed string or an array with reverse order of elements.
@@ -2714,7 +2663,7 @@ def reverse(col):
>>> df = spark.createDataFrame([('Spark SQL',)], ['data'])
>>> df.select(reverse(df.data).alias('s')).collect()
- [Row(s=u'LQS krapS')]
+ [Row(s='LQS krapS')]
>>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data'])
>>> df.select(reverse(df.data).alias('r')).collect()
[Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]
@@ -2820,7 +2769,6 @@ def map_from_entries(col):
return Column(sc._jvm.functions.map_from_entries(_to_java_column(col)))
-@ignore_unicode_prefix
@since(2.4)
def array_repeat(col, count):
"""
@@ -2828,7 +2776,7 @@ def array_repeat(col, count):
>>> df = spark.createDataFrame([('ab',)], ['data'])
>>> df.select(array_repeat(df.data, 3).alias('r')).collect()
- [Row(r=[u'ab', u'ab', u'ab'])]
+ [Row(r=['ab', 'ab', 'ab'])]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.array_repeat(
@@ -2898,7 +2846,6 @@ def sequence(start, stop, step=None):
_to_java_column(start), _to_java_column(stop), _to_java_column(step)))
-@ignore_unicode_prefix
@since(3.0)
def from_csv(col, schema, options={}):
"""
@@ -2920,11 +2867,11 @@ def from_csv(col, schema, options={}):
>>> df = spark.createDataFrame(data, ("value",))
>>> options = {'ignoreLeadingWhiteSpace': True}
>>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect()
- [Row(csv=Row(s=u'abc'))]
+ [Row(csv=Row(s='abc'))]
"""
sc = SparkContext._active_spark_context
- if isinstance(schema, basestring):
+ if isinstance(schema, str):
schema = _create_column_from_literal(schema)
elif isinstance(schema, Column):
schema = _to_java_column(schema)
@@ -2984,20 +2931,6 @@ def _get_lambda_parameters(f):
return parameters
-def _get_lambda_parameters_legacy(f):
- # TODO (SPARK-29909) Remove once 2.7 support is dropped
- import inspect
-
- spec = inspect.getargspec(f)
- if not 1 <= len(spec.args) <= 3 or spec.varargs or spec.keywords:
- raise ValueError(
- "f should take between 1 and 3 arguments, but provided function takes {}".format(
- spec
- )
- )
- return spec.args
-
-
def _create_lambda(f):
"""
Create `o.a.s.sql.expressions.LambdaFunction` corresponding
@@ -3008,10 +2941,7 @@ def _create_lambda(f):
- (Column, Column) -> Column: ...
- (Column, Column, Column) -> Column: ...
"""
- if sys.version_info >= (3, 3):
- parameters = _get_lambda_parameters(f)
- else:
- parameters = _get_lambda_parameters_legacy(f)
+ parameters = _get_lambda_parameters(f)
sc = SparkContext._active_spark_context
expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions
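With the legacy `inspect.getargspec` path gone, lambda arity is always read through Python 3's inspection APIs. A minimal sketch of the check `_get_lambda_parameters` performs (simplified; the real function also rejects varargs and keyword-only arguments):

```python
from inspect import signature

def lambda_arity(f):
    return len(signature(f).parameters)

print(lambda_arity(lambda x: x + 1))       # 1
print(lambda_arity(lambda x, i: x * i))    # 2
```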
@@ -3481,7 +3411,7 @@ def udf(f=None, returnType=StringType()):
evalType=PythonEvalType.SQL_BATCHED_UDF)
-blacklist = ['map', 'since', 'ignore_unicode_prefix']
+blacklist = ['map', 'since']
__all__ = [k for k, v in globals().items()
if not k.startswith('_') and k[0].islower() and callable(v) and k not in blacklist]
__all__ += ["PandasUDFType"]
@@ -3500,7 +3430,7 @@ def _test():
sc = spark.sparkContext
globs['sc'] = sc
globs['spark'] = spark
- globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
+ globs['df'] = spark.createDataFrame([Row(age=2, name='Alice'), Row(age=5, name='Bob')])
(failure_count, test_count) = doctest.testmod(
pyspark.sql.functions, globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index ac826bc64a..83e2baa8f0 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -18,7 +18,6 @@
import sys
from pyspark import since
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.sql.column import Column, _to_seq
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.pandas.group_ops import PandasGroupedOpsMixin
@@ -60,7 +59,6 @@ class GroupedData(PandasGroupedOpsMixin):
self._df = df
self.sql_ctx = df.sql_ctx
- @ignore_unicode_prefix
@since(1.3)
def agg(self, *exprs):
"""Compute aggregates and returns the result as a :class:`DataFrame`.
@@ -91,18 +89,18 @@ class GroupedData(PandasGroupedOpsMixin):
>>> gdf = df.groupBy(df.name)
>>> sorted(gdf.agg({"*": "count"}).collect())
- [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]
+ [Row(name='Alice', count(1)=1), Row(name='Bob', count(1)=1)]
>>> from pyspark.sql import functions as F
>>> sorted(gdf.agg(F.min(df.age)).collect())
- [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
+ [Row(name='Alice', min(age)=2), Row(name='Bob', min(age)=5)]
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
>>> @pandas_udf('int', PandasUDFType.GROUPED_AGG) # doctest: +SKIP
... def min_udf(v):
... return v.min()
>>> sorted(gdf.agg(min_udf(df.age)).collect()) # doctest: +SKIP
- [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)]
+ [Row(name='Alice', min_udf(age)=2), Row(name='Bob', min_udf(age)=5)]
"""
assert exprs, "exprs should not be empty"
if len(exprs) == 1 and isinstance(exprs[0], dict):
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index e6d8e9f24a..3842bc2357 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -16,11 +16,6 @@
#
import sys
import warnings
-if sys.version >= '3':
- basestring = unicode = str
- xrange = range
-else:
- from itertools import izip as zip
from collections import Counter
from pyspark import since
@@ -29,7 +24,6 @@ from pyspark.sql.pandas.serializers import ArrowCollectSerializer
from pyspark.sql.types import IntegralType
from pyspark.sql.types import *
from pyspark.traceback_utils import SCCallSiteSync
-from pyspark.util import _exception_message
class PandasConversionMixin(object):
@@ -84,7 +78,7 @@ class PandasConversionMixin(object):
"failed by the reason below:\n %s\n"
"Attempting non-optimization as "
"'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
- "true." % _exception_message(e))
+ "true." % str(e))
warnings.warn(msg)
use_arrow = False
else:
@@ -93,7 +87,7 @@ class PandasConversionMixin(object):
"'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
"reached the error below and will not continue because automatic fallback "
"with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
- "false.\n %s" % _exception_message(e))
+ "false.\n %s" % str(e))
warnings.warn(msg)
raise
@@ -130,7 +124,7 @@ class PandasConversionMixin(object):
"reached the error below and can not continue. Note that "
"'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
"effect on failures in the middle of "
- "computation.\n %s" % _exception_message(e))
+ "computation.\n %s" % str(e))
warnings.warn(msg)
raise
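The `_exception_message` helper being dropped throughout this file existed because Python 2 exceptions exposed their messages inconsistently (`e.message`, unicode vs. bytes). On Python 3, `str(e)` is always sufficient:

```python
try:
    raise ValueError("Arrow optimization failed")
except Exception as e:
    print(str(e))  # 'Arrow optimization failed'
```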
@@ -268,7 +262,7 @@ class SparkConversionMixin(object):
# If no schema supplied by user then get the names of columns only
if schema is None:
- schema = [str(x) if not isinstance(x, basestring) else
+ schema = [str(x) if not isinstance(x, str) else
(x.encode('utf-8') if not isinstance(x, str) else x)
for x in data.columns]
@@ -276,8 +270,6 @@ class SparkConversionMixin(object):
try:
return self._create_from_pandas_with_arrow(data, schema, timezone)
except Exception as e:
- from pyspark.util import _exception_message
-
if self._wrapped._conf.arrowPySparkFallbackEnabled():
msg = (
"createDataFrame attempted Arrow optimization because "
@@ -285,7 +277,7 @@ class SparkConversionMixin(object):
"failed by the reason below:\n %s\n"
"Attempting non-optimization as "
"'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
- "true." % _exception_message(e))
+ "true." % str(e))
warnings.warn(msg)
else:
msg = (
@@ -293,7 +285,7 @@ class SparkConversionMixin(object):
"'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
"reached the error below and will not continue because automatic "
"fallback with 'spark.sql.execution.arrow.pyspark.fallback.enabled' "
- "has been set to false.\n %s" % _exception_message(e))
+ "has been set to false.\n %s" % str(e))
warnings.warn(msg)
raise
data = self._convert_from_pandas(data, schema, timezone)
@@ -358,7 +350,7 @@ class SparkConversionMixin(object):
col_names = cur_dtypes.names
record_type_list = []
has_rec_fix = False
- for i in xrange(len(cur_dtypes)):
+ for i in range(len(cur_dtypes)):
curr_type = cur_dtypes[i]
# If type is a datetime64 timestamp, convert to microseconds
# NOTE: if dtype is datetime[ns] then np.record.tolist() will output values as longs,
@@ -413,7 +405,7 @@ class SparkConversionMixin(object):
# Slice the DataFrame to be batched
step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up
- pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))
+ pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step))
# Create list of Arrow (columns, type) for serializer dump_stream
arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
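
The _exception_message() helper existed because str() on an exception carrying a non-ASCII message could fail on Python 2; on Python 3, str(e) reliably yields the message, so the plain call suffices. A short sketch, standard library only:

    try:
        raise ValueError(u"unsupported type: 数量")
    except ValueError as e:
        # str() round-trips the message, non-ASCII included, on Python 3.
        assert str(e) == "unsupported type: 数量"
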
diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py
index 094dc357b6..ba4dec82d4 100644
--- a/python/pyspark/sql/pandas/functions.py
+++ b/python/pyspark/sql/pandas/functions.py
@@ -18,6 +18,7 @@
import functools
import sys
import warnings
+from inspect import getfullargspec
from pyspark import since
from pyspark.rdd import PythonEvalType
@@ -25,7 +26,6 @@ from pyspark.sql.pandas.typehints import infer_eval_type
from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
from pyspark.sql.types import DataType
from pyspark.sql.udf import _create_udf
-from pyspark.util import _get_argspec
class PandasUDFType(object):
@@ -371,30 +371,29 @@ def pandas_udf(f=None, returnType=None, functionType=None):
def _create_pandas_udf(f, returnType, evalType):
- argspec = _get_argspec(f)
+ argspec = getfullargspec(f)
# pandas UDF by type hints.
- if sys.version_info >= (3, 6):
- from inspect import signature
+ from inspect import signature
- if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
- PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
- PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:
- warnings.warn(
- "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for "
- "pandas UDF instead of specifying pandas UDF type which will be deprecated "
- "in the future releases. See SPARK-28264 for more details.", UserWarning)
- elif evalType in [PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
- PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
- PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]:
- # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered
- # at `apply` instead.
- # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the
- # evaluation type will always be set.
- pass
- elif len(argspec.annotations) > 0:
- evalType = infer_eval_type(signature(f))
- assert evalType is not None
+ if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
+ PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
+ PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:
+ warnings.warn(
+ "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for "
+ "pandas UDF instead of specifying pandas UDF type which will be deprecated "
+ "in the future releases. See SPARK-28264 for more details.", UserWarning)
+ elif evalType in [PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
+ PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
+ PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]:
+ # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', the deprecation warning is triggered
+ # at `apply` instead.
+ # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the
+ # evaluation type will always be set.
+ pass
+ elif len(argspec.annotations) > 0:
+ evalType = infer_eval_type(signature(f))
+ assert evalType is not None
if evalType is None:
# Set default is scalar UDF.
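
For reference, a standard-library-only sketch of the inspect calls the rewritten _create_pandas_udf leans on: the annotations reported by getfullargspec decide whether type-hint inference applies, and signature() is what infer_eval_type consumes. String annotations are used here so pandas need not be importable.

    from inspect import getfullargspec, signature

    def func(col: 'pd.Series') -> 'pd.Series':
        return col

    argspec = getfullargspec(func)
    assert set(argspec.annotations) == {'col', 'return'}
    assert len(argspec.annotations) > 0    # the branch that triggers inference
    sig = signature(func)                  # would be handed to infer_eval_type
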
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 42562e1fb9..4b91c6a0f8 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -19,13 +19,6 @@
Serializers for PyArrow and pandas conversions. See `pyspark.serializers` for more details.
"""
-import sys
-if sys.version < '3':
- from itertools import izip as zip
-else:
- basestring = unicode = str
- xrange = range
-
from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer
@@ -67,7 +60,7 @@ class ArrowCollectSerializer(Serializer):
raise RuntimeError("An error occurred while calling "
"ArrowCollectSerializer.load_stream: {}".format(error_msg))
batch_order = []
- for i in xrange(num):
+ for i in range(num):
index = read_int(stream)
batch_order.append(index)
yield batch_order
@@ -180,7 +173,7 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
if len(s) == 0 and len(s.columns) == 0:
arrs_names = [(pa.array([], type=field.type), field.name) for field in t]
# Assign result columns by schema name if user labeled with strings
- elif self._assign_cols_by_name and any(isinstance(name, basestring)
+ elif self._assign_cols_by_name and any(isinstance(name, str)
for name in s.columns):
arrs_names = [(create_array(s[field.name], field.type), field.name)
for field in t]
@@ -194,7 +187,7 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
else:
arrs.append(create_array(s, t))
- return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))])
+ return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in range(len(arrs))])
def dump_stream(self, iterator, stream):
"""
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 336345e383..a83aece2e4 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -15,15 +15,9 @@
# limitations under the License.
#
-import sys
-
-if sys.version >= '3':
- basestring = unicode = str
-
from py4j.java_gateway import JavaClass
from pyspark import RDD, since
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.sql.column import _to_seq
from pyspark.sql.types import *
from pyspark.sql import utils
@@ -94,7 +88,7 @@ class DataFrameReader(OptionUtils):
if isinstance(schema, StructType):
jschema = spark._jsparkSession.parseDataType(schema.json())
self._jreader = self._jreader.schema(jschema)
- elif isinstance(schema, basestring):
+ elif isinstance(schema, str):
self._jreader = self._jreader.schema(schema)
else:
raise TypeError("schema should be StructType or string")
@@ -174,7 +168,7 @@ class DataFrameReader(OptionUtils):
if schema is not None:
self.schema(schema)
self.options(**options)
- if isinstance(path, basestring):
+ if isinstance(path, str):
return self._df(self._jreader.load(path))
elif path is not None:
if type(path) != list:
@@ -294,16 +288,16 @@ class DataFrameReader(OptionUtils):
allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep,
samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding,
locale=locale, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
path = [path]
if type(path) == list:
return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
elif isinstance(path, RDD):
def func(iterator):
for x in iterator:
- if not isinstance(x, basestring):
- x = unicode(x)
- if isinstance(x, unicode):
+ if not isinstance(x, str):
+ x = str(x)
+ if isinstance(x, str):
x = x.encode("utf-8")
yield x
keyed = path.mapPartitions(func)
@@ -352,7 +346,6 @@ class DataFrameReader(OptionUtils):
recursiveFileLookup=recursiveFileLookup)
return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
- @ignore_unicode_prefix
@since(1.6)
def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None,
recursiveFileLookup=None):
@@ -376,15 +369,15 @@ class DataFrameReader(OptionUtils):
>>> df = spark.read.text('python/test_support/sql/text-test.txt')
>>> df.collect()
- [Row(value=u'hello'), Row(value=u'this')]
+ [Row(value='hello'), Row(value='this')]
>>> df = spark.read.text('python/test_support/sql/text-test.txt', wholetext=True)
>>> df.collect()
- [Row(value=u'hello\\nthis')]
+ [Row(value='hello\\nthis')]
"""
self._set_opts(
wholetext=wholetext, lineSep=lineSep, pathGlobFilter=pathGlobFilter,
recursiveFileLookup=recursiveFileLookup)
- if isinstance(paths, basestring):
+ if isinstance(paths, str):
paths = [paths]
return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths)))
@@ -529,16 +522,16 @@ class DataFrameReader(OptionUtils):
charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio,
enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep,
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
path = [path]
if type(path) == list:
return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
elif isinstance(path, RDD):
def func(iterator):
for x in iterator:
- if not isinstance(x, basestring):
- x = unicode(x)
- if isinstance(x, unicode):
+ if not isinstance(x, str):
+ x = str(x)
+ if isinstance(x, str):
x = x.encode("utf-8")
yield x
keyed = path.mapPartitions(func)
@@ -574,7 +567,7 @@ class DataFrameReader(OptionUtils):
"""
self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter,
recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
path = [path]
return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
@@ -763,7 +756,7 @@ class DataFrameWriter(OptionUtils):
col, cols = col[0], col[1:]
- if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
+ if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)):
raise TypeError("all names should be `str`")
self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
@@ -788,7 +781,7 @@ class DataFrameWriter(OptionUtils):
col, cols = col[0], col[1:]
- if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
+ if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)):
raise TypeError("all names should be `str`")
self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))
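
An illustrative standalone version (the helper name here is made up) of the per-element coercion the RDD branches above apply before handing data to the JVM reader: non-str values are stringified, then encoded to UTF-8 bytes.

    def to_utf8(x):
        if not isinstance(x, str):
            x = str(x)               # coerce e.g. numbers to text
        return x.encode("utf-8")     # the JVM side expects UTF-8 bytes

    assert to_utf8(42) == b"42"
    assert to_utf8('{"a": 1}') == b'{"a": 1}'
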
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 61891c478d..a5d102712d 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -15,22 +15,13 @@
# limitations under the License.
#
-# To disallow implicit relative import. Remove this once we drop Python 2.
-from __future__ import absolute_import
-from __future__ import print_function
import sys
import warnings
from functools import reduce
from threading import RLock
-if sys.version >= '3':
- basestring = unicode = str
- xrange = range
-else:
- from itertools import imap as map
-
from pyspark import since
-from pyspark.rdd import RDD, ignore_unicode_prefix
+from pyspark.rdd import RDD
from pyspark.sql.conf import RuntimeConfig
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.pandas.conversion import SparkConversionMixin
@@ -56,7 +47,7 @@ def _monkey_patch_RDD(sparkSession):
:return: a DataFrame
>>> rdd.toDF().collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
"""
return sparkSession.createDataFrame(self, schema, sampleRatio)
@@ -197,7 +188,6 @@ class SparkSession(SparkConversionMixin):
_instantiatedSession = None
_activeSession = None
- @ignore_unicode_prefix
def __init__(self, sparkContext, jsparkSession=None):
"""Creates a new SparkSession.
@@ -213,7 +203,7 @@ class SparkSession(SparkConversionMixin):
[Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
>>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
- [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
+ [(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
"""
from pyspark.sql.context import SQLContext
self._sc = sparkContext
@@ -492,7 +482,6 @@ class SparkSession(SparkConversionMixin):
return SparkSession.builder.getOrCreate()
@since(2.0)
- @ignore_unicode_prefix
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
"""
Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
@@ -530,34 +519,29 @@ class SparkSession(SparkConversionMixin):
.. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.
- .. note:: When Arrow optimization is enabled, strings inside Pandas DataFrame in Python
- 2 are converted into bytes as they are bytes in Python 2 whereas regular strings are
- left as strings. When using strings in Python 2, use unicode `u""` as Python standard
- practice.
-
>>> l = [('Alice', 1)]
>>> spark.createDataFrame(l).collect()
- [Row(_1=u'Alice', _2=1)]
+ [Row(_1='Alice', _2=1)]
>>> spark.createDataFrame(l, ['name', 'age']).collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> d = [{'name': 'Alice', 'age': 1}]
>>> spark.createDataFrame(d).collect()
- [Row(age=1, name=u'Alice')]
+ [Row(age=1, name='Alice')]
>>> rdd = sc.parallelize(l)
>>> spark.createDataFrame(rdd).collect()
- [Row(_1=u'Alice', _2=1)]
+ [Row(_1='Alice', _2=1)]
>>> df = spark.createDataFrame(rdd, ['name', 'age'])
>>> df.collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> from pyspark.sql import Row
>>> Person = Row('name', 'age')
>>> person = rdd.map(lambda r: Person(*r))
>>> df2 = spark.createDataFrame(person)
>>> df2.collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> from pyspark.sql.types import *
>>> schema = StructType([
@@ -565,15 +549,15 @@ class SparkSession(SparkConversionMixin):
... StructField("age", IntegerType(), True)])
>>> df3 = spark.createDataFrame(rdd, schema)
>>> df3.collect()
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP
- [Row(name=u'Alice', age=1)]
+ [Row(name='Alice', age=1)]
>>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP
[Row(0=1, 1=2)]
>>> spark.createDataFrame(rdd, "a: string, b: int").collect()
- [Row(a=u'Alice', b=1)]
+ [Row(a='Alice', b=1)]
>>> rdd = rdd.map(lambda row: row[1])
>>> spark.createDataFrame(rdd, "int").collect()
[Row(value=1)]
@@ -587,7 +571,7 @@ class SparkSession(SparkConversionMixin):
if isinstance(data, DataFrame):
raise TypeError("data is already a DataFrame")
- if isinstance(schema, basestring):
+ if isinstance(schema, str):
schema = _parse_datatype_string(schema)
elif isinstance(schema, (list, tuple)):
# Must re-encode any unicode strings to be consistent with StructField names
@@ -634,7 +618,6 @@ class SparkSession(SparkConversionMixin):
df._schema = schema
return df
- @ignore_unicode_prefix
@since(2.0)
def sql(self, sqlQuery):
"""Returns a :class:`DataFrame` representing the result of the given query.
@@ -644,7 +627,7 @@ class SparkSession(SparkConversionMixin):
>>> df.createOrReplaceTempView("table1")
>>> df2 = spark.sql("SELECT field1 AS f1, field2 as f2 from table1")
>>> df2.collect()
- [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
+ [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')]
"""
return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
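
A usage sketch of the isinstance(schema, str) branch (assumes a live SparkSession bound to `spark`, as in the doctests above): a DDL string can stand in for a full StructType.

    from pyspark.sql import Row

    # The string is parsed by _parse_datatype_string into a StructType.
    df = spark.createDataFrame([('Alice', 1)], "name: string, age: int")
    assert df.collect() == [Row(name='Alice', age=1)]
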
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 2450a4c93c..5c528c1d54 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -18,13 +18,9 @@
import sys
import json
-if sys.version >= '3':
- basestring = str
-
from py4j.java_gateway import java_import
from pyspark import since, keyword_only
-from pyspark.rdd import ignore_unicode_prefix
from pyspark.sql.column import _to_seq
from pyspark.sql.readwriter import OptionUtils, to_str
from pyspark.sql.types import *
@@ -204,7 +200,6 @@ class StreamingQueryManager(object):
self._jsqm = jsqm
@property
- @ignore_unicode_prefix
@since(2.0)
def active(self):
"""Returns a list of active queries associated with this SQLContext
@@ -213,12 +208,11 @@ class StreamingQueryManager(object):
>>> sqm = spark.streams
>>> # get the list of active streaming queries
>>> [q.name for q in sqm.active]
- [u'this_query']
+ ['this_query']
>>> sq.stop()
"""
return [StreamingQuery(jsq) for jsq in self._jsqm.active()]
- @ignore_unicode_prefix
@since(2.0)
def get(self, id):
"""Returns an active query from this SQLContext or throws exception if an active query
@@ -226,7 +220,7 @@ class StreamingQueryManager(object):
>>> sq = sdf.writeStream.format('memory').queryName('this_query').start()
>>> sq.name
- u'this_query'
+ 'this_query'
>>> sq = spark.streams.get(sq.id)
>>> sq.isActive
True
@@ -328,7 +322,7 @@ class DataStreamReader(OptionUtils):
if isinstance(schema, StructType):
jschema = spark._jsparkSession.parseDataType(schema.json())
self._jreader = self._jreader.schema(jschema)
- elif isinstance(schema, basestring):
+ elif isinstance(schema, str):
self._jreader = self._jreader.schema(schema)
else:
raise TypeError("schema should be StructType or string")
@@ -527,7 +521,7 @@ class DataStreamReader(OptionUtils):
allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, locale=locale,
dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding,
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
return self._df(self._jreader.json(path))
else:
raise TypeError("path can be only a single string")
@@ -555,7 +549,7 @@ class DataStreamReader(OptionUtils):
"""
self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter,
recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
return self._df(self._jreader.orc(path))
else:
raise TypeError("path can be only a single string")
@@ -585,12 +579,11 @@ class DataStreamReader(OptionUtils):
"""
self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter,
recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
return self._df(self._jreader.parquet(path))
else:
raise TypeError("path can be only a single string")
- @ignore_unicode_prefix
@since(2.0)
def text(self, path, wholetext=False, lineSep=None, pathGlobFilter=None,
recursiveFileLookup=None):
@@ -623,7 +616,7 @@ class DataStreamReader(OptionUtils):
self._set_opts(
wholetext=wholetext, lineSep=lineSep, pathGlobFilter=pathGlobFilter,
recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
return self._df(self._jreader.text(path))
else:
raise TypeError("path can be only a single string")
@@ -762,7 +755,7 @@ class DataStreamReader(OptionUtils):
charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema,
emptyValue=emptyValue, locale=locale, lineSep=lineSep,
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup)
- if isinstance(path, basestring):
+ if isinstance(path, str):
return self._df(self._jreader.csv(path))
else:
raise TypeError("path can be only a single string")
@@ -1153,7 +1146,6 @@ class DataStreamWriter(object):
ensure_callback_server_started(gw)
return self
- @ignore_unicode_prefix
@since(2.0)
def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None,
**options):
@@ -1186,14 +1178,14 @@ class DataStreamWriter(object):
>>> sq.isActive
True
>>> sq.name
- u'this_query'
+ 'this_query'
>>> sq.stop()
>>> sq.isActive
False
>>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start(
... queryName='that_query', outputMode="append", format='memory')
>>> sq.name
- u'that_query'
+ 'that_query'
>>> sq.isActive
True
>>> sq.stop()
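
A minimal sketch of the guard repeated across the streaming readers above (the helper name is illustrative): unlike the batch reader, a streaming source accepts exactly one path string.

    def require_single_path(path):
        if isinstance(path, str):
            return path
        raise TypeError("path can be only a single string")

    assert require_single_path("logs/") == "logs/"
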
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index a96354e3ec..90fc983aec 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -21,9 +21,6 @@ import threading
import time
import unittest
import warnings
-import sys
-if sys.version >= '3':
- basestring = unicode = str
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, SparkSession
@@ -32,7 +29,6 @@ from pyspark.sql.types import *
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
pandas_requirement_message, pyarrow_requirement_message
from pyspark.testing.utils import QuietTest
-from pyspark.util import _exception_message
if have_pandas:
import pandas as pd
@@ -130,7 +126,7 @@ class ArrowTests(ReusedSQLTestCase):
warn.message for warn in warns if isinstance(warn.message, UserWarning)]
self.assertTrue(len(user_warns) > 0)
self.assertTrue(
- "Attempting non-optimization" in _exception_message(user_warns[-1]))
+ "Attempting non-optimization" in str(user_warns[-1]))
assert_frame_equal(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))
def test_toPandas_fallback_disabled(self):
@@ -358,7 +354,7 @@ class ArrowTests(ReusedSQLTestCase):
warn.message for warn in warns if isinstance(warn.message, UserWarning)]
self.assertTrue(len(user_warns) > 0)
self.assertTrue(
- "Attempting non-optimization" in _exception_message(user_warns[-1]))
+ "Attempting non-optimization" in str(user_warns[-1]))
self.assertEqual(df.collect(), [Row(a={u'a': 1})])
def test_createDataFrame_fallback_disabled(self):
@@ -438,12 +434,12 @@ class ArrowTests(ReusedSQLTestCase):
assert_frame_equal(result_spark, result_arrow)
# ensure original category elements are string
- self.assertIsInstance(category_first_element, basestring)
+ self.assertIsInstance(category_first_element, str)
# spark data frame and arrow execution mode enabled data frame type must match pandas
self.assertEqual(spark_type, 'string')
self.assertEqual(arrow_type, 'string')
- self.assertIsInstance(arrow_first_category_element, basestring)
- self.assertIsInstance(spark_first_category_element, basestring)
+ self.assertIsInstance(arrow_first_category_element, str)
+ self.assertIsInstance(spark_first_category_element, str)
def test_createDataFrame_with_float_index(self):
# SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame
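
A plain-Python sketch of the assertion pattern these tests use: str() on a captured warning instance yields its message, so the substring check needs no helper.

    import warnings

    with warnings.catch_warnings(record=True) as warns:
        warnings.simplefilter("always")
        warnings.warn("Attempting non-optimization fallback", UserWarning)
    # warns[-1].message is the UserWarning instance; str() gives its text.
    assert "Attempting non-optimization" in str(warns[-1].message)
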
diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py
index 58bf896a10..e0b8bf45a2 100644
--- a/python/pyspark/sql/tests/test_column.py
+++ b/python/pyspark/sql/tests/test_column.py
@@ -16,8 +16,6 @@
# limitations under the License.
#
-import sys
-
from pyspark.sql import Column, Row
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
@@ -109,12 +107,8 @@ class ColumnTests(ReusedSQLTestCase):
self.assertRaises(TypeError, lambda: df[{}])
def test_column_name_with_non_ascii(self):
- if sys.version >= '3':
- columnName = "数量"
- self.assertTrue(isinstance(columnName, str))
- else:
- columnName = unicode("数量", "utf-8")
- self.assertTrue(isinstance(columnName, unicode))
+ columnName = "数量"
+ self.assertTrue(isinstance(columnName, str))
schema = StructType([StructField(columnName, LongType(), True)])
df = self.spark.createDataFrame([(1,)], schema)
self.assertEqual(schema, df.schema)
diff --git a/python/pyspark/sql/tests/test_context.py b/python/pyspark/sql/tests/test_context.py
index 3b1b638ed4..ff953ba4b4 100644
--- a/python/pyspark/sql/tests/test_context.py
+++ b/python/pyspark/sql/tests/test_context.py
@@ -19,11 +19,7 @@ import shutil
import sys
import tempfile
import unittest
-try:
- from importlib import reload # Python 3.4+ only.
-except ImportError:
- # Otherwise, we will stick to Python 2's built-in reload.
- pass
+from importlib import reload
import py4j
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 52ae74df5d..7dcc19f3ba 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -167,10 +167,6 @@ class FunctionsTests(ReusedSQLTestCase):
TypeError,
"must be the same type",
lambda: df.select(col('name').substr(0, lit(1))))
- if sys.version_info.major == 2:
- self.assertRaises(
- TypeError,
- lambda: df.select(col('name').substr(long(0), long(1))))
for name in _string_functions.keys():
self.assertEqual(
diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py
index c1cb30c3ca..24a73918d8 100644
--- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py
@@ -32,11 +32,6 @@ if have_pyarrow:
import pyarrow as pa
-# Tests below use pd.DataFrame.assign that will infer mixed types (unicode/str) for column names
-# From kwargs w/ Python 2, so need to set check_column_type=False and avoid this check
-_check_column_type = sys.version >= '3'
-
-
@unittest.skipIf(
not have_pandas or not have_pyarrow,
pandas_requirement_message or pyarrow_requirement_message)
@@ -109,7 +104,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
'v2': [90, 100, 110]
})
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_empty_group_by(self):
left = self.data1
@@ -130,7 +125,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
.merge(left, right, on=['id', 'k']) \
.sort_values(by=['id', 'k'])
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_mixed_scalar_udfs_followed_by_cogrouby_apply(self):
df = self.spark.range(0, 10).toDF('v1')
@@ -173,7 +168,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
expected = self.data1.toPandas()
expected = expected.assign(key=expected.id % 2 == 0)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_wrong_return_type(self):
# Test that we get a sensible exception invalid values passed to apply
@@ -224,7 +219,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
expected = left.toPandas() if isLeft else right.toPandas()
expected = expected.assign(key=expected.id)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
@staticmethod
def _test_merge(left, right, output_schema='id long, k int, v int, v2 int'):
@@ -246,7 +241,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
.merge(left, right, on=['id', 'k']) \
.sort_values(by=['id', 'k'])
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
if __name__ == "__main__":
diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py
index cc6167e619..00cc9b3a64 100644
--- a/python/pyspark/sql/tests/test_pandas_grouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -38,11 +38,6 @@ if have_pyarrow:
import pyarrow as pa
-# Tests below use pd.DataFrame.assign that will infer mixed types (unicode/str) for column names
-# from kwargs w/ Python 2, so need to set check_column_type=False and avoid this check
-_check_column_type = sys.version >= '3'
-
-
@unittest.skipIf(
not have_pandas or not have_pyarrow,
pandas_requirement_message or pyarrow_requirement_message)
@@ -139,9 +134,9 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
expected3 = expected1
- assert_frame_equal(expected1, result1, check_column_type=_check_column_type)
- assert_frame_equal(expected2, result2, check_column_type=_check_column_type)
- assert_frame_equal(expected3, result3, check_column_type=_check_column_type)
+ assert_frame_equal(expected1, result1)
+ assert_frame_equal(expected2, result2)
+ assert_frame_equal(expected3, result3)
def test_array_type_correct(self):
df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id")
@@ -159,7 +154,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
result = df.groupby('id').apply(udf).sort('id').toPandas()
expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_register_grouped_map_udf(self):
foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP)
@@ -181,7 +176,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
result = df.groupby('id').apply(foo).sort('id').toPandas()
expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_coerce(self):
df = self.data
@@ -195,7 +190,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
result = df.groupby('id').apply(foo).sort('id').toPandas()
expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True)
expected = expected.assign(v=expected.v.astype('float64'))
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_complex_groupby(self):
df = self.data
@@ -213,7 +208,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
expected = pdf.groupby(pdf['id'] % 2 == 0, as_index=False).apply(normalize.func)
expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
expected = expected.assign(norm=expected.norm.astype('float64'))
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_empty_groupby(self):
df = self.data
@@ -231,7 +226,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
expected = normalize.func(pdf)
expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
expected = expected.assign(norm=expected.norm.astype('float64'))
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_datatype_string(self):
df = self.data
@@ -244,7 +239,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
result = df.groupby('id').apply(foo_udf).sort('id').toPandas()
expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
def test_wrong_return_type(self):
with QuietTest(self.sc):
@@ -301,7 +296,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
df = self.spark.createDataFrame(dt, 'timestamp').toDF('time')
foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP)
result = df.groupby('time').apply(foo_udf).sort('time')
- assert_frame_equal(df.toPandas(), result.toPandas(), check_column_type=_check_column_type)
+ assert_frame_equal(df.toPandas(), result.toPandas())
def test_udf_with_key(self):
import numpy as np
@@ -355,26 +350,26 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
expected1 = pdf.groupby('id', as_index=False)\
.apply(lambda x: udf1.func((x.id.iloc[0],), x))\
.sort_values(['id', 'v']).reset_index(drop=True)
- assert_frame_equal(expected1, result1, check_column_type=_check_column_type)
+ assert_frame_equal(expected1, result1)
# Test groupby expression
result2 = df.groupby(df.id % 2).apply(udf1).sort('id', 'v').toPandas()
expected2 = pdf.groupby(pdf.id % 2, as_index=False)\
.apply(lambda x: udf1.func((x.id.iloc[0] % 2,), x))\
.sort_values(['id', 'v']).reset_index(drop=True)
- assert_frame_equal(expected2, result2, check_column_type=_check_column_type)
+ assert_frame_equal(expected2, result2)
# Test complex groupby
result3 = df.groupby(df.id, df.v % 2).apply(udf2).sort('id', 'v').toPandas()
expected3 = pdf.groupby([pdf.id, pdf.v % 2], as_index=False)\
.apply(lambda x: udf2.func((x.id.iloc[0], (x.v % 2).iloc[0],), x))\
.sort_values(['id', 'v']).reset_index(drop=True)
- assert_frame_equal(expected3, result3, check_column_type=_check_column_type)
+ assert_frame_equal(expected3, result3)
# Test empty groupby
result4 = df.groupby().apply(udf3).sort('id', 'v').toPandas()
expected4 = udf3.func((), pdf)
- assert_frame_equal(expected4, result4, check_column_type=_check_column_type)
+ assert_frame_equal(expected4, result4)
def test_column_order(self):
@@ -407,7 +402,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
.select('id', 'u', 'v').toPandas()
pd_result = grouped_pdf.apply(change_col_order)
expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
# Function returns a pdf with positional columns, indexed by range
def range_col_order(pdf):
@@ -426,7 +421,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
pd_result = grouped_pdf.apply(range_col_order)
rename_pdf(pd_result, ['id', 'u', 'v'])
expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
# Function returns a pdf with columns indexed with integers
def int_index(pdf):
@@ -444,7 +439,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
pd_result = grouped_pdf.apply(int_index)
rename_pdf(pd_result, ['id', 'u', 'v'])
expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True)
- assert_frame_equal(expected, result, check_column_type=_check_column_type)
+ assert_frame_equal(expected, result)
@pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP)
def column_name_typo(pdf):
diff --git a/python/pyspark/sql/tests/test_pandas_map.py b/python/pyspark/sql/tests/test_pandas_map.py
index f1956a2523..02ae6a86f9 100644
--- a/python/pyspark/sql/tests/test_pandas_map.py
+++ b/python/pyspark/sql/tests/test_pandas_map.py
@@ -19,9 +19,6 @@ import sys
import time
import unittest
-if sys.version >= '3':
- unicode = str
-
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
pandas_requirement_message, pyarrow_requirement_message
diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py
index 2d38efd39f..75e2a0929e 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_scalar.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py
@@ -22,10 +22,6 @@ import sys
import tempfile
import time
import unittest
-
-if sys.version >= '3':
- unicode = str
-
from datetime import date, datetime
from decimal import Decimal
@@ -319,7 +315,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
StructField('str', StringType())])
def scalar_func(id):
- return pd.DataFrame({'id': id, 'str': id.apply(unicode)})
+ return pd.DataFrame({'id': id, 'str': id.apply(str)})
def iter_func(it):
for id in it:
@@ -486,14 +482,14 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
@pandas_udf(return_type)
def scalar_f(id):
- return pd.DataFrame({'id': id, 'str': id.apply(unicode)})
+ return pd.DataFrame({'id': id, 'str': id.apply(str)})
scalar_g = pandas_udf(lambda x: x, return_type)
@pandas_udf(return_type, PandasUDFType.SCALAR_ITER)
def iter_f(it):
for id in it:
- yield pd.DataFrame({'id': id, 'str': id.apply(unicode)})
+ yield pd.DataFrame({'id': id, 'str': id.apply(str)})
iter_g = pandas_udf(lambda x: x, return_type, PandasUDFType.SCALAR_ITER)
@@ -915,21 +911,12 @@ class ScalarPandasUDFTests(ReusedSQLTestCase):
# Check result of column 'B' must be equal to column 'A' in type and values
pd.testing.assert_series_equal(result_spark["A"], result_spark["B"], check_names=False)
- @unittest.skipIf(sys.version_info[:2] < (3, 5), "Type hints are supported from Python 3.5.")
def test_type_annotation(self):
# Regression test to check if type hints can be used. See SPARK-23569.
- # Note that it throws an error during compilation in lower Python versions if 'exec'
- # is not used. Also, note that we explicitly use another dictionary to avoid modifications
- # in the current 'locals()'.
- #
- # Hyukjin: I think it's an ugly way to test issues about syntax specific in
- # higher versions of Python, which we shouldn't encourage. This was the last resort
- # I could come up with at that time.
- _locals = {}
- exec(
- "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col",
- _locals)
- df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], returnType='bigint')('id'))
+ def noop(col: pd.Series) -> pd.Series:
+ return col
+
+ df = self.spark.range(1).select(pandas_udf(f=noop, returnType='bigint')('id'))
self.assertEqual(df.first()[0], 0)
def test_mixed_udf(self):
diff --git a/python/pyspark/sql/tests/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/test_pandas_udf_typehints.py
index 2582080056..618164fa84 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_typehints.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_typehints.py
@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-import sys
import unittest
import inspect
+from typing import Union, Iterator, Tuple
from pyspark.sql.functions import mean, lit
from pyspark.testing.sqlutils import ReusedSQLTestCase, \
@@ -24,209 +24,162 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase, \
pyarrow_requirement_message
from pyspark.sql.pandas.typehints import infer_eval_type
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
+from pyspark.sql import Row
if have_pandas:
import pandas as pd
+ import numpy as np
from pandas.util.testing import assert_frame_equal
-python_requirement_message = "pandas UDF with type hints are supported with Python 3.6+."
-
@unittest.skipIf(
- not have_pandas or not have_pyarrow or sys.version_info[:2] < (3, 6),
- pandas_requirement_message or pyarrow_requirement_message or python_requirement_message)
+ not have_pandas or not have_pyarrow,
+ pandas_requirement_message or pyarrow_requirement_message)
class PandasUDFTypeHintsTests(ReusedSQLTestCase):
- # Note that, we should remove `exec` once we drop Python 2 in this class.
-
- def setUp(self):
- self.local = {'pd': pd}
-
def test_type_annotation_scalar(self):
- exec(
- "def func(col: pd.Series) -> pd.Series: pass",
- self.local)
+ def func(col: pd.Series) -> pd.Series:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR)
- exec(
- "def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: pass",
- self.local)
+ def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR)
- exec(
- "def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: pass",
- self.local)
+ def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR)
- exec(
- "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:\n"
- " pass",
- self.local)
+ def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR)
- exec(
- "def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:\n"
- " pass",
- self.local)
+ def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR)
- exec(
- "from typing import Union\n"
- "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:\n"
- " pass",
- self.local)
+ def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR)
def test_type_annotation_scalar_iter(self):
- exec(
- "from typing import Iterator\n"
- "def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: pass",
- self.local)
+ def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER)
- exec(
- "from typing import Iterator, Tuple\n"
- "def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:\n"
- " pass",
- self.local)
+ def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER)
- exec(
- "from typing import Iterator, Tuple\n"
- "def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: pass",
- self.local)
+ def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER)
- exec(
- "from typing import Iterator, Tuple, Union\n"
- "def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]])"
- " -> Iterator[pd.Series]: pass",
- self.local)
+ def func(
+ iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]]
+ ) -> Iterator[pd.Series]:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER)
def test_type_annotation_group_agg(self):
- exec(
- "def func(col: pd.Series) -> str: pass",
- self.local)
- self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG)
- exec(
- "def func(col: pd.DataFrame, col1: pd.Series) -> int: pass",
- self.local)
+ def func(col: pd.Series) -> str:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG)
- exec(
- "from pyspark.sql import Row\n"
- "def func(col: pd.DataFrame, *args: pd.Series) -> Row: pass",
- self.local)
+ def func(col: pd.DataFrame, col1: pd.Series) -> int:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG)
- exec(
- "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:\n"
- " pass",
- self.local)
+ def func(col: pd.DataFrame, *args: pd.Series) -> Row:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG)
- exec(
- "def func(col: pd.Series, *, col2: pd.DataFrame) -> float:\n"
- " pass",
- self.local)
+ def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG)
- exec(
- "from typing import Union\n"
- "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:\n"
- " pass",
- self.local)
+ def func(col: pd.Series, *, col2: pd.DataFrame) -> float:
+ pass
self.assertEqual(
- infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG)
+ infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG)
+
+ def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:
+ pass
+ self.assertEqual(
+ infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG)
def test_type_annotation_negative(self):
- exec(
- "def func(col: str) -> pd.Series: pass",
- self.local)
+
+ def func(col: str) -> pd.Series:
+ pass
self.assertRaisesRegex(
NotImplementedError,
"Unsupported signature.*str",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
- exec(
- "def func(col: pd.DataFrame, col1: int) -> pd.DataFrame: pass",
- self.local)
+ def func(col: pd.DataFrame, col1: int) -> pd.DataFrame:
+ pass
self.assertRaisesRegex(
NotImplementedError,
"Unsupported signature.*int",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
- exec(
- "from typing import Union\n"
- "def func(col: Union[pd.DataFrame, str], col1: int) -> pd.DataFrame: pass",
- self.local)
+ def func(col: Union[pd.DataFrame, str], col1: int) -> pd.DataFrame:
+ pass
self.assertRaisesRegex(
NotImplementedError,
"Unsupported signature.*str",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
- exec(
- "from typing import Tuple\n"
- "def func(col: pd.Series) -> Tuple[pd.DataFrame]: pass",
- self.local)
+ def func(col: pd.Series) -> Tuple[pd.DataFrame]:
+ pass
self.assertRaisesRegex(
NotImplementedError,
"Unsupported signature.*Tuple",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
- exec(
- "def func(col, *args: pd.Series) -> pd.Series: pass",
- self.local)
+ def func(col, *args: pd.Series) -> pd.Series:
+ pass
self.assertRaisesRegex(
ValueError,
"should be specified.*Series",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
- exec(
- "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame):\n"
- " pass",
- self.local)
+ def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame):
+ pass
self.assertRaisesRegex(
ValueError,
"should be specified.*Series",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
- exec(
- "def func(col: pd.Series, *, col2) -> pd.DataFrame:\n"
- " pass",
- self.local)
+ def func(col: pd.Series, *, col2) -> pd.DataFrame:
+ pass
self.assertRaisesRegex(
ValueError,
"should be specified.*Series",
- infer_eval_type, inspect.signature(self.local['func']))
+ infer_eval_type, inspect.signature(func))
def test_scalar_udf_type_hint(self):
df = self.spark.range(10).selectExpr("id", "id as v")
- exec(
- "import typing\n"
- "def plus_one(v: typing.Union[pd.Series, pd.DataFrame]) -> pd.Series:\n"
- " return v + 1",
- self.local)
-
- plus_one = pandas_udf("long")(self.local["plus_one"])
+ def plus_one(v: Union[pd.Series, pd.DataFrame]) -> pd.Series:
+ return v + 1
+ plus_one = pandas_udf("long")(plus_one)
actual = df.select(plus_one(df.v).alias("plus_one"))
expected = df.selectExpr("(v + 1) as plus_one")
assert_frame_equal(expected.toPandas(), actual.toPandas())
@@ -234,14 +187,11 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
def test_scalar_iter_udf_type_hint(self):
df = self.spark.range(10).selectExpr("id", "id as v")
- exec(
- "import typing\n"
- "def plus_one(itr: typing.Iterator[pd.Series]) -> typing.Iterator[pd.Series]:\n"
- " for s in itr:\n"
- " yield s + 1",
- self.local)
+ def plus_one(itr: Iterator[pd.Series]) -> Iterator[pd.Series]:
+ for s in itr:
+ yield s + 1
- plus_one = pandas_udf("long")(self.local["plus_one"])
+ plus_one = pandas_udf("long")(plus_one)
actual = df.select(plus_one(df.v).alias("plus_one"))
expected = df.selectExpr("(v + 1) as plus_one")
@@ -249,13 +199,11 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
def test_group_agg_udf_type_hint(self):
df = self.spark.range(10).selectExpr("id", "id as v")
- exec(
- "import numpy as np\n"
- "def weighted_mean(v: pd.Series, w: pd.Series) -> float:\n"
- " return np.average(v, weights=w)",
- self.local)
- weighted_mean = pandas_udf("double")(self.local["weighted_mean"])
+ def weighted_mean(v: pd.Series, w: pd.Series) -> float:
+ return np.average(v, weights=w)
+
+ weighted_mean = pandas_udf("double")(weighted_mean)
actual = df.groupby('id').agg(weighted_mean(df.v, lit(1.0))).sort('id')
expected = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
@@ -263,12 +211,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
def test_ignore_type_hint_in_group_apply_in_pandas(self):
df = self.spark.range(10)
- exec(
- "def pandas_plus_one(v: pd.DataFrame) -> pd.DataFrame:\n"
- " return v + 1",
- self.local)
- pandas_plus_one = self.local["pandas_plus_one"]
+ def pandas_plus_one(v: pd.DataFrame) -> pd.DataFrame:
+ return v + 1
actual = df.groupby('id').applyInPandas(pandas_plus_one, schema=df.schema).sort('id')
expected = df.selectExpr("id + 1 as id")
@@ -276,12 +221,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
def test_ignore_type_hint_in_cogroup_apply_in_pandas(self):
df = self.spark.range(10)
- exec(
- "def pandas_plus_one(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:\n"
- " return left + 1",
- self.local)
- pandas_plus_one = self.local["pandas_plus_one"]
+ def pandas_plus_one(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+ return left + 1
actual = df.groupby('id').cogroup(
self.spark.range(10).groupby("id")
@@ -291,13 +233,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
def test_ignore_type_hint_in_map_in_pandas(self):
df = self.spark.range(10)
- exec(
- "from typing import Iterator\n"
- "def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:\n"
- " return map(lambda v: v + 1, iter)",
- self.local)
- pandas_plus_one = self.local["pandas_plus_one"]
+ def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
+ return map(lambda v: v + 1, iter)
actual = df.mapInPandas(pandas_plus_one, schema=df.schema)
expected = df.selectExpr("id + 1 as id")
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 016cafd669..051c8bde50 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -56,7 +56,7 @@ class TypesTests(ReusedSQLTestCase):
self.assertEqual(10, df3.count())
def test_apply_schema_to_dict_and_rows(self):
- schema = StructType().add("b", StringType()).add("a", IntegerType())
+ schema = StructType().add("a", IntegerType()).add("b", StringType())
input = [{"a": 1}, {"b": "coffee"}]
rdd = self.sc.parallelize(input)
for verify in [False, True]:
@@ -72,7 +72,6 @@ class TypesTests(ReusedSQLTestCase):
self.assertEqual(10, df4.count())
def test_create_dataframe_schema_mismatch(self):
- input = [Row(a=1)]
rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i))
schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())])
df = self.spark.createDataFrame(rdd, schema)
@@ -540,7 +539,6 @@ class TypesTests(ReusedSQLTestCase):
self.assertEqual(_infer_type(2**61), LongType())
self.assertEqual(_infer_type(2**71), LongType())
- @unittest.skipIf(sys.version < "3", "only Python 3 infers bytes as binary type")
def test_infer_binary_type(self):
binaryrow = [Row(f1='a', f2=b"abcd")]
df = self.sc.parallelize(binaryrow).toDF()
@@ -665,10 +663,6 @@ class TypesTests(ReusedSQLTestCase):
supported_string_types += ['u']
# test unicode
assertCollectSuccess('u', u'a')
- if sys.version_info[0] < 3:
- supported_string_types += ['c']
- # test string
- assertCollectSuccess('c', 'a')
# supported float and double
#
@@ -721,12 +715,8 @@ class TypesTests(ReusedSQLTestCase):
#
# Keys in _array_type_mappings is a complete list of all supported types,
# and types not in _array_type_mappings are considered unsupported.
- # `array.typecodes` are not supported in python 2.
- if sys.version_info[0] < 3:
- all_types = set(['c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd'])
- else:
- # PyPy seems not having array.typecodes.
- all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd'])
+ # PyPy does not seem to have array.typecodes.
+ all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd'])
unsupported_types = all_types - set(supported_types)
# test unsupported types
for t in unsupported_types:
@@ -767,10 +757,7 @@ class DataTypeTests(unittest.TestCase):
self.assertEqual(repr(row), "")
# test __repr__ with unicode values
- if sys.version_info.major >= 3:
- self.assertEqual(repr(Row("数", "量")), "")
- else:
- self.assertEqual(repr(Row(u"数", u"量")), r"")
+ self.assertEqual(repr(Row("数", "量")), "")
def test_empty_row(self):
row = Row()
@@ -888,7 +875,6 @@ class DataTypeVerificationTests(unittest.TestCase):
({"s": "a", "f": 1.0}, schema),
(Row(s="a", i=1), schema),
(Row(s="a", i=None), schema),
- (Row(s="a", i=1, f=1.0), schema),
(["a", 1], schema),
(["a", None], schema),
(("a", 1), schema),
@@ -973,18 +959,13 @@ class DataTypeVerificationTests(unittest.TestCase):
with self.assertRaises(exp, msg=msg):
_make_type_verifier(data_type, nullable=False)(obj)
- @unittest.skipIf(sys.version_info[:2] < (3, 6), "Create Row without sorting fields")
def test_row_without_field_sorting(self):
- sorting_enabled_tmp = Row._row_field_sorting_enabled
- Row._row_field_sorting_enabled = False
-
r = Row(b=1, a=2)
TestRow = Row("b", "a")
expected = TestRow(1, 2)
self.assertEqual(r, expected)
self.assertEqual(repr(r), "Row(b=1, a=2)")
- Row._row_field_sorting_enabled = sorting_enabled_tmp
if __name__ == "__main__":
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 320a68dffe..cc08482c73 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -15,7 +15,6 @@
# limitations under the License.
#
-import os
import sys
import decimal
import time
@@ -26,11 +25,6 @@ import re
import base64
from array import array
import ctypes
-import warnings
-
-if sys.version >= "3":
- long = int
- basestring = unicode = str
from py4j.protocol import register_input_converter
from py4j.java_gateway import JavaClass
@@ -409,9 +403,7 @@ class StructField(DataType):
"""
assert isinstance(dataType, DataType),\
"dataType %s should be an instance of %s" % (dataType, DataType)
- assert isinstance(name, basestring), "field name %s should be string" % (name)
- if not isinstance(name, str):
- name = name.encode('utf-8')
+ assert isinstance(name, str), "field name %s should be a string" % (name)
self.name = name
self.dataType = dataType
self.nullable = nullable
@@ -613,8 +605,6 @@ class StructType(DataType):
else:
if isinstance(obj, dict):
return tuple(obj.get(n) for n in self.names)
- elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False):
- return tuple(obj[n] for n in self.names)
elif isinstance(obj, (list, tuple)):
return tuple(obj)
elif hasattr(obj, "__dict__"):
@@ -904,19 +894,9 @@ _type_mappings = {
datetime.date: DateType,
datetime.datetime: TimestampType,
datetime.time: TimestampType,
+ bytes: BinaryType,
}
-if sys.version < "3":
- _type_mappings.update({
- unicode: StringType,
- long: LongType,
- })
-
-if sys.version >= "3":
- _type_mappings.update({
- bytes: BinaryType,
- })
-
# Mapping Python array types to Spark SQL DataType
# We should be careful here. The size of these types in python depends on C
# implementation. We need to make sure that this conversion does not lose any
@@ -990,20 +970,6 @@ for _typecode in _array_unsigned_int_typecode_ctype_mappings.keys():
if sys.version_info[0] < 4:
_array_type_mappings['u'] = StringType
-# Type code 'c' are only available at python 2
-if sys.version_info[0] < 3:
- _array_type_mappings['c'] = StringType
-
-# SPARK-21465:
-# In python2, array of 'L' happened to be mistakenly, just partially supported. To
-# avoid breaking user's code, we should keep this partial support. Below is a
-# dirty hacking to keep this partial support and pass the unit test.
-import platform
-if sys.version_info[0] < 3 and platform.python_implementation() != 'PyPy':
- if 'L' not in _array_type_mappings.keys():
- _array_type_mappings['L'] = LongType
- _array_unsigned_int_typecode_ctype_mappings['L'] = ctypes.c_uint
-
def _infer_type(obj):
"""Infer the DataType from obj
@@ -1187,14 +1153,14 @@ def _create_converter(dataType):
_acceptable_types = {
BooleanType: (bool,),
- ByteType: (int, long),
- ShortType: (int, long),
- IntegerType: (int, long),
- LongType: (int, long),
+ ByteType: (int,),
+ ShortType: (int,),
+ IntegerType: (int,),
+ LongType: (int,),
FloatType: (float,),
DoubleType: (float,),
DecimalType: (decimal.Decimal,),
- StringType: (str, unicode),
+ StringType: (str,),
BinaryType: (bytearray, bytes),
DateType: (datetime.date, datetime.datetime),
TimestampType: (datetime.datetime,),
@@ -1376,10 +1342,6 @@ def _make_type_verifier(dataType, nullable=True, name=None):
if isinstance(obj, dict):
for f, verifier in verifiers:
verifier(obj.get(f))
- elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False):
- # the order in obj could be different than dataType.fields
- for f, verifier in verifiers:
- verifier(obj[f])
elif isinstance(obj, (tuple, list)):
if len(obj) != len(verifiers):
raise ValueError(
@@ -1438,21 +1400,11 @@ class Row(tuple):
NOTE: As of Spark 3.0.0, Rows created from named arguments no longer have
field names sorted alphabetically and will be ordered in the position
- entered. To enable sorting for Rows compatible with Spark 2.x, set the
- environment variable "PYSPARK_ROW_FIELD_SORTING_ENABLED" to "true". This
- option is deprecated and will be removed in future versions of Spark. For
- Python versions < 3.6, the order of named arguments is not guaranteed to
- be the same as entered, see https://www.python.org/dev/peps/pep-0468. In
- this case, a warning will be issued and the Row will fallback to sort the
- field names automatically.
-
- NOTE: Examples with Row in pydocs are run with the environment variable
- "PYSPARK_ROW_FIELD_SORTING_ENABLED" set to "true" which results in output
- where fields are sorted.
+ entered.
>>> row = Row(name="Alice", age=11)
>>> row
- Row(age=11, name='Alice')
+ Row(name='Alice', age=11)
>>> row['name'], row['age']
('Alice', 11)
>>> row.name, row.age
@@ -1476,47 +1428,22 @@ class Row(tuple):
Row(name='Alice', age=11)
This form can also be used to create rows as tuple values, i.e. with unnamed
- fields. Beware that such Row objects have different equality semantics:
+ fields.
>>> row1 = Row("Alice", 11)
>>> row2 = Row(name="Alice", age=11)
>>> row1 == row2
- False
- >>> row3 = Row(a="Alice", b=11)
- >>> row1 == row3
True
"""
- # Remove after Python < 3.6 dropped, see SPARK-29748
- _row_field_sorting_enabled = \
- os.environ.get('PYSPARK_ROW_FIELD_SORTING_ENABLED', 'false').lower() == 'true'
-
- if _row_field_sorting_enabled:
- warnings.warn("The environment variable 'PYSPARK_ROW_FIELD_SORTING_ENABLED' "
- "is deprecated and will be removed in future versions of Spark")
-
def __new__(cls, *args, **kwargs):
if args and kwargs:
raise ValueError("Can not use both args "
"and kwargs to create Row")
if kwargs:
- if not Row._row_field_sorting_enabled and sys.version_info[:2] < (3, 6):
- warnings.warn("To use named arguments for Python version < 3.6, Row fields will be "
- "automatically sorted. This warning can be skipped by setting the "
- "environment variable 'PYSPARK_ROW_FIELD_SORTING_ENABLED' to 'true'.")
- Row._row_field_sorting_enabled = True
-
# create row objects
- if Row._row_field_sorting_enabled:
- # Remove after Python < 3.6 dropped, see SPARK-29748
- names = sorted(kwargs.keys())
- row = tuple.__new__(cls, [kwargs[n] for n in names])
- row.__fields__ = names
- row.__from_dict__ = True
- else:
- row = tuple.__new__(cls, list(kwargs.values()))
- row.__fields__ = list(kwargs.keys())
-
+ row = tuple.__new__(cls, list(kwargs.values()))
+ row.__fields__ = list(kwargs.keys())
return row
else:
# create row class or objects
@@ -1537,7 +1464,7 @@ class Row(tuple):
>>> Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11}
True
>>> row = Row(key=1, value=Row(name='a', age=2))
- >>> row.asDict() == {'key': 1, 'value': Row(age=2, name='a')}
+ >>> row.asDict() == {'key': 1, 'value': Row(name='a', age=2)}
True
>>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}}
True
@@ -1600,7 +1527,7 @@ class Row(tuple):
raise AttributeError(item)
def __setattr__(self, key, value):
- if key != '__fields__' and key != "__from_dict__":
+ if key != '__fields__':
raise Exception("Row is read-only")
self.__dict__[key] = value
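With Python 2's `long` and `unicode` gone, the inference table in this file collapses to one entry per built-in type, and `bytes` always maps to binary. A hedged sketch of that shape (the string names below are illustrative stand-ins, not the module's real `DataType` objects):

```python
# Illustrative mapping only: one entry per Python 3 built-in type.
import datetime
import decimal

_type_names = {
    bool: "BooleanType", int: "LongType", float: "DoubleType",
    str: "StringType", bytes: "BinaryType",
    decimal.Decimal: "DecimalType",
    datetime.date: "DateType", datetime.datetime: "TimestampType",
}

assert _type_names[type(b"\x00")] == "BinaryType"
assert _type_names[type(10 ** 30)] == "LongType"  # ints are unbounded; no long
```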
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py
index da68583b04..100481cf12 100644
--- a/python/pyspark/sql/udf.py
+++ b/python/pyspark/sql/udf.py
@@ -21,7 +21,7 @@ import functools
import sys
from pyspark import SparkContext, since
-from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix
+from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string
from pyspark.sql.pandas.types import to_arrow_type
@@ -232,7 +232,6 @@ class UDFRegistration(object):
def __init__(self, sparkSession):
self.sparkSession = sparkSession
- @ignore_unicode_prefix
@since("1.3.1")
def register(self, name, f, returnType=None):
"""Register a Python function (including lambda function) or a user-defined function
@@ -261,10 +260,10 @@ class UDFRegistration(object):
>>> strlen = spark.udf.register("stringLengthString", lambda x: len(x))
>>> spark.sql("SELECT stringLengthString('test')").collect()
- [Row(stringLengthString(test)=u'4')]
+ [Row(stringLengthString(test)='4')]
>>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect()
- [Row(stringLengthString(text)=u'3')]
+ [Row(stringLengthString(text)='3')]
>>> from pyspark.sql.types import IntegerType
>>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
@@ -349,7 +348,6 @@ class UDFRegistration(object):
self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf)
return return_udf
- @ignore_unicode_prefix
@since(2.3)
def registerJavaFunction(self, name, javaClassName, returnType=None):
"""Register a Java user-defined function as a SQL function.
@@ -389,7 +387,6 @@ class UDFRegistration(object):
jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json())
self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt)
- @ignore_unicode_prefix
@since(2.3)
def registerJavaUDAF(self, name, javaClassName):
"""Register a Java user-defined aggregate function as a SQL function.
@@ -403,7 +400,7 @@ class UDFRegistration(object):
>>> df.createOrReplaceTempView("df")
>>> q = "SELECT name, javaUDAF(id) as avg from df group by name order by name desc"
>>> spark.sql(q).collect() # doctest: +SKIP
- [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)]
+ [Row(name='b', avg=102.0), Row(name='a', avg=102.0)]
"""
self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName)
@@ -419,9 +416,6 @@ def _test():
.appName("sql.udf tests")\
.getOrCreate()
globs['spark'] = spark
- # Hack to skip the unit tests in register. These are currently being tested in proper tests.
- # We should reenable this test once we completely drop Python 2.
- del pyspark.sql.udf.UDFRegistration.register
(failure_count, test_count) = doctest.testmod(
pyspark.sql.udf, globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
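The doctest updates in this file are purely cosmetic. A one-line illustration of the repr change they track: in Python 3 every string is unicode, and its repr carries no prefix.

```python
# Python 2: repr(u'4') was "u'4'"; Python 3 has no u'' prefix.
assert repr('4') == "'4'"
```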
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py
index 1d5bc49d25..bd76d88005 100644
--- a/python/pyspark/sql/utils.py
+++ b/python/pyspark/sql/utils.py
@@ -16,22 +16,9 @@
#
import py4j
-import sys
from pyspark import SparkContext
-if sys.version_info.major >= 3:
- unicode = str
- # Disable exception chaining (PEP 3134) in captured exceptions
- # in order to hide JVM stacktace.
- exec("""
-def raise_from(e):
- raise e from None
-""")
-else:
- def raise_from(e):
- raise e
-
class CapturedException(Exception):
def __init__(self, desc, stackTrace, cause=None):
@@ -45,11 +32,7 @@ class CapturedException(Exception):
desc = self.desc
if debug_enabled:
desc = desc + "\n\nJVM stacktrace:\n%s" % self.stackTrace
- # encode unicode instance for python2 for human readable description
- if sys.version_info.major < 3 and isinstance(desc, unicode):
- return str(desc.encode('utf-8'))
- else:
- return str(desc)
+ return str(desc)
class AnalysisException(CapturedException):
@@ -131,7 +114,7 @@ def capture_sql_exception(f):
if not isinstance(converted, UnknownException):
# Hide where the exception came from, since that shows a non-Pythonic
# JVM exception message.
- raise_from(converted)
+ raise converted from None
else:
raise
return deco
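The removed `raise_from` shim existed because `raise ... from ...` is a syntax error on Python 2 and had to be hidden behind `exec`. A minimal sketch of what `raise converted from None` (PEP 409) does: the implicit `__context__` is suppressed when the traceback is printed, which is how the JVM stack trace stays hidden.

```python
def convert_and_raise():
    try:
        raise RuntimeError("JVM stack trace lives here")
    except RuntimeError:
        # 'from None' suppresses the chained traceback (PEP 409).
        raise ValueError("clean, Pythonic message") from None

try:
    convert_and_raise()
except ValueError as e:
    assert e.__cause__ is None and e.__suppress_context__
```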
diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py
index 6199611940..170f0c0ef7 100644
--- a/python/pyspark/streaming/context.py
+++ b/python/pyspark/streaming/context.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
from py4j.java_gateway import java_import, is_instance_of
from pyspark import RDD, SparkConf
diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py
index 60562a6c92..000318588e 100644
--- a/python/pyspark/streaming/dstream.py
+++ b/python/pyspark/streaming/dstream.py
@@ -21,11 +21,6 @@ import time
from itertools import chain
from datetime import datetime
-if sys.version < "3":
- from itertools import imap as map, ifilter as filter
-else:
- long = int
-
from py4j.protocol import Py4JJavaError
from pyspark import RDD
@@ -404,7 +399,7 @@ class DStream(object):
"""
if isinstance(timestamp, datetime):
timestamp = time.mktime(timestamp.timetuple())
- return self._sc._jvm.Time(long(timestamp * 1000))
+ return self._sc._jvm.Time(int(timestamp * 1000))
def slice(self, begin, end):
"""
diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py
index 8f419a5e84..d8aa5f9318 100644
--- a/python/pyspark/taskcontext.py
+++ b/python/pyspark/taskcontext.py
@@ -14,10 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
-from __future__ import print_function
-import json
-
from pyspark.java_gateway import local_connect_and_auth
from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer
diff --git a/python/pyspark/testing/sqlutils.py b/python/pyspark/testing/sqlutils.py
index 085fce6daa..e85cae7dda 100644
--- a/python/pyspark/testing/sqlutils.py
+++ b/python/pyspark/testing/sqlutils.py
@@ -24,7 +24,6 @@ from contextlib import contextmanager
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, DoubleType, UserDefinedType, Row
from pyspark.testing.utils import ReusedPySparkTestCase
-from pyspark.util import _exception_message
pandas_requirement_message = None
@@ -33,7 +32,7 @@ try:
require_minimum_pandas_version()
except ImportError as e:
# If Pandas version requirement is not satisfied, skip related tests.
- pandas_requirement_message = _exception_message(e)
+ pandas_requirement_message = str(e)
pyarrow_requirement_message = None
try:
@@ -41,14 +40,14 @@ try:
require_minimum_pyarrow_version()
except ImportError as e:
# If Arrow version requirement is not satisfied, skip related tests.
- pyarrow_requirement_message = _exception_message(e)
+ pyarrow_requirement_message = str(e)
test_not_compiled_message = None
try:
from pyspark.sql.utils import require_test_compiled
require_test_compiled()
except Exception as e:
- test_not_compiled_message = _exception_message(e)
+ test_not_compiled_message = str(e)
have_pandas = pandas_requirement_message is None
have_pyarrow = pyarrow_requirement_message is None
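In Python 3, `str()` on an exception returns its message, non-ASCII included, which is why a plain `str(e)` can replace the `_exception_message()` helper throughout this file:

```python
try:
    raise ImportError("unicöde in the message is fine")
except ImportError as e:
    # No encode/decode dance needed on Python 3.
    assert str(e) == "unicöde in the message is fine"
```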
diff --git a/python/pyspark/tests/test_profiler.py b/python/pyspark/tests/test_profiler.py
index 04ca5a3896..dbce72a0d3 100644
--- a/python/pyspark/tests/test_profiler.py
+++ b/python/pyspark/tests/test_profiler.py
@@ -19,15 +19,11 @@ import os
import sys
import tempfile
import unittest
+from io import StringIO
from pyspark import SparkConf, SparkContext, BasicProfiler
from pyspark.testing.utils import PySparkTestCase
-if sys.version >= "3":
- from io import StringIO
-else:
- from StringIO import StringIO
-
class ProfilerTests(PySparkTestCase):
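`io.StringIO` is the only home for in-memory text streams on Python 3, so the `StringIO.StringIO` fallback goes away. A small sketch of the generic redirect-and-capture pattern such profiler tests rely on (hedged: this is the pattern, not the test's exact code):

```python
import sys
from io import StringIO

buf, real = StringIO(), sys.stdout
sys.stdout = buf          # capture anything printed
print("captured")
sys.stdout = real         # restore the real stream
assert buf.getvalue() == "captured\n"
```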
diff --git a/python/pyspark/tests/test_rdd.py b/python/pyspark/tests/test_rdd.py
index 6c5b818056..1a580e27ea 100644
--- a/python/pyspark/tests/test_rdd.py
+++ b/python/pyspark/tests/test_rdd.py
@@ -32,9 +32,6 @@ from pyspark.serializers import CloudPickleSerializer, BatchedSerializer, Pickle
MarshalSerializer, UTF8Deserializer, NoOpSerializer
from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME, QuietTest
-if sys.version_info[0] >= 3:
- xrange = range
-
global_func = lambda: "Hi"
@@ -193,15 +190,13 @@ class RDDTests(ReusedPySparkTestCase):
def test_sampling_default_seed(self):
# Test for SPARK-3995 (default seed setting)
- data = self.sc.parallelize(xrange(1000), 1)
+ data = self.sc.parallelize(range(1000), 1)
subset = data.takeSample(False, 10)
self.assertEqual(len(subset), 10)
def test_aggregate_mutable_zero_value(self):
# Test for SPARK-9021; uses aggregate and treeAggregate to build dict
# representing a counter of ints
- # NOTE: dict is used instead of collections.Counter for Python 2.6
- # compatibility
from collections import defaultdict
# Show that single or multiple partitions work
@@ -262,8 +257,6 @@ class RDDTests(ReusedPySparkTestCase):
def test_fold_mutable_zero_value(self):
# Test for SPARK-9021; uses fold to merge an RDD of dict counters into
# a single dict
- # NOTE: dict is used instead of collections.Counter for Python 2.6
- # compatibility
from collections import defaultdict
counts1 = defaultdict(int, dict((i, 1) for i in range(10)))
@@ -439,7 +432,7 @@ class RDDTests(ReusedPySparkTestCase):
def test_large_closure(self):
N = 200000
- data = [float(i) for i in xrange(N)]
+ data = [float(i) for i in range(N)]
rdd = self.sc.parallelize(range(1), 1).map(lambda x: len(data))
self.assertEqual(N, rdd.first())
# regression test for SPARK-6886
@@ -464,8 +457,8 @@ class RDDTests(ReusedPySparkTestCase):
def test_zip_with_different_object_sizes(self):
# regression test for SPARK-5973
- a = self.sc.parallelize(xrange(10000)).map(lambda i: '*' * i)
- b = self.sc.parallelize(xrange(10000, 20000)).map(lambda i: '*' * i)
+ a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i)
+ b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i)
self.assertEqual(10000, a.zip(b).count())
def test_zip_with_different_number_of_items(self):
@@ -487,7 +480,7 @@ class RDDTests(ReusedPySparkTestCase):
self.assertRaises(Exception, lambda: a.zip(b).count())
def test_count_approx_distinct(self):
- rdd = self.sc.parallelize(xrange(1000))
+ rdd = self.sc.parallelize(range(1000))
self.assertTrue(950 < rdd.countApproxDistinct(0.03) < 1050)
self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.03) < 1050)
self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.03) < 1050)
@@ -641,7 +634,7 @@ class RDDTests(ReusedPySparkTestCase):
def test_external_group_by_key(self):
self.sc._conf.set("spark.python.worker.memory", "1m")
N = 2000001
- kv = self.sc.parallelize(xrange(N)).map(lambda x: (x % 3, x))
+ kv = self.sc.parallelize(range(N)).map(lambda x: (x % 3, x))
gkv = kv.groupByKey().cache()
self.assertEqual(3, gkv.count())
filtered = gkv.filter(lambda kv: kv[0] == 1)
@@ -698,7 +691,7 @@ class RDDTests(ReusedPySparkTestCase):
# Regression test for SPARK-6294
def test_take_on_jrdd(self):
- rdd = self.sc.parallelize(xrange(1 << 20)).map(lambda x: str(x))
+ rdd = self.sc.parallelize(range(1 << 20)).map(lambda x: str(x))
rdd._jrdd.first()
def test_sortByKey_uses_all_partitions_not_only_first_and_last(self):
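Python 3's `range` is a lazy sequence like the old `xrange`: constant memory with O(1) `len()`, indexing, and integer membership tests, so the mechanical `xrange` to `range` substitutions in this file do not change behavior:

```python
r = range(1 << 20)   # lazy, constant memory, like Python 2's xrange
assert len(r) == 1 << 20 and r[12345] == 12345 and 999999 in r
```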
diff --git a/python/pyspark/tests/test_readwrite.py b/python/pyspark/tests/test_readwrite.py
index 734b7e4789..faa006c7d8 100644
--- a/python/pyspark/tests/test_readwrite.py
+++ b/python/pyspark/tests/test_readwrite.py
@@ -38,104 +38,6 @@ class InputFormatTests(ReusedPySparkTestCase):
ReusedPySparkTestCase.tearDownClass()
shutil.rmtree(cls.tempdir.name)
- @unittest.skipIf(sys.version >= "3", "serialize array of byte")
- def test_sequencefiles(self):
- basepath = self.tempdir.name
- ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.hadoop.io.Text").collect())
- ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')]
- self.assertEqual(ints, ei)
-
- doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/",
- "org.apache.hadoop.io.DoubleWritable",
- "org.apache.hadoop.io.Text").collect())
- ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')]
- self.assertEqual(doubles, ed)
-
- bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.hadoop.io.BytesWritable").collect())
- ebs = [(1, bytearray('aa', 'utf-8')),
- (1, bytearray('aa', 'utf-8')),
- (2, bytearray('aa', 'utf-8')),
- (2, bytearray('bb', 'utf-8')),
- (2, bytearray('bb', 'utf-8')),
- (3, bytearray('cc', 'utf-8'))]
- self.assertEqual(bytes, ebs)
-
- text = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sftext/",
- "org.apache.hadoop.io.Text",
- "org.apache.hadoop.io.Text").collect())
- et = [(u'1', u'aa'),
- (u'1', u'aa'),
- (u'2', u'aa'),
- (u'2', u'bb'),
- (u'2', u'bb'),
- (u'3', u'cc')]
- self.assertEqual(text, et)
-
- bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.hadoop.io.BooleanWritable").collect())
- eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)]
- self.assertEqual(bools, eb)
-
- nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.hadoop.io.BooleanWritable").collect())
- en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)]
- self.assertEqual(nulls, en)
-
- maps = self.sc.sequenceFile(basepath + "/sftestdata/sfmap/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.hadoop.io.MapWritable").collect()
- em = [(1, {}),
- (1, {3.0: u'bb'}),
- (2, {1.0: u'aa'}),
- (2, {1.0: u'cc'}),
- (3, {2.0: u'dd'})]
- for v in maps:
- self.assertTrue(v in em)
-
- # arrays get pickled to tuples by default
- tuples = sorted(self.sc.sequenceFile(
- basepath + "/sftestdata/sfarray/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.spark.api.python.DoubleArrayWritable").collect())
- et = [(1, ()),
- (2, (3.0, 4.0, 5.0)),
- (3, (4.0, 5.0, 6.0))]
- self.assertEqual(tuples, et)
-
- # with custom converters, primitive arrays can stay as arrays
- arrays = sorted(self.sc.sequenceFile(
- basepath + "/sftestdata/sfarray/",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.spark.api.python.DoubleArrayWritable",
- valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect())
- ea = [(1, array('d')),
- (2, array('d', [3.0, 4.0, 5.0])),
- (3, array('d', [4.0, 5.0, 6.0]))]
- self.assertEqual(arrays, ea)
-
- clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/",
- "org.apache.hadoop.io.Text",
- "org.apache.spark.api.python.TestWritable").collect())
- cname = u'org.apache.spark.api.python.TestWritable'
- ec = [(u'1', {u'__class__': cname, u'double': 1.0, u'int': 1, u'str': u'test1'}),
- (u'2', {u'__class__': cname, u'double': 2.3, u'int': 2, u'str': u'test2'}),
- (u'3', {u'__class__': cname, u'double': 3.1, u'int': 3, u'str': u'test3'}),
- (u'4', {u'__class__': cname, u'double': 4.2, u'int': 4, u'str': u'test4'}),
- (u'5', {u'__class__': cname, u'double': 5.5, u'int': 5, u'str': u'test56'})]
- self.assertEqual(clazz, ec)
-
- unbatched_clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/",
- "org.apache.hadoop.io.Text",
- "org.apache.spark.api.python.TestWritable",
- ).collect())
- self.assertEqual(unbatched_clazz, ec)
-
def test_oldhadoop(self):
basepath = self.tempdir.name
ints = sorted(self.sc.hadoopFile(basepath + "/sftestdata/sfint/",
@@ -249,51 +151,6 @@ class OutputFormatTests(ReusedPySparkTestCase):
def tearDown(self):
shutil.rmtree(self.tempdir.name, ignore_errors=True)
- @unittest.skipIf(sys.version >= "3", "serialize array of byte")
- def test_sequencefiles(self):
- basepath = self.tempdir.name
- ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')]
- self.sc.parallelize(ei).saveAsSequenceFile(basepath + "/sfint/")
- ints = sorted(self.sc.sequenceFile(basepath + "/sfint/").collect())
- self.assertEqual(ints, ei)
-
- ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')]
- self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/")
- doubles = sorted(self.sc.sequenceFile(basepath + "/sfdouble/").collect())
- self.assertEqual(doubles, ed)
-
- ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))]
- self.sc.parallelize(ebs).saveAsSequenceFile(basepath + "/sfbytes/")
- bytes = sorted(self.sc.sequenceFile(basepath + "/sfbytes/").collect())
- self.assertEqual(bytes, ebs)
-
- et = [(u'1', u'aa'),
- (u'2', u'bb'),
- (u'3', u'cc')]
- self.sc.parallelize(et).saveAsSequenceFile(basepath + "/sftext/")
- text = sorted(self.sc.sequenceFile(basepath + "/sftext/").collect())
- self.assertEqual(text, et)
-
- eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)]
- self.sc.parallelize(eb).saveAsSequenceFile(basepath + "/sfbool/")
- bools = sorted(self.sc.sequenceFile(basepath + "/sfbool/").collect())
- self.assertEqual(bools, eb)
-
- en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)]
- self.sc.parallelize(en).saveAsSequenceFile(basepath + "/sfnull/")
- nulls = sorted(self.sc.sequenceFile(basepath + "/sfnull/").collect())
- self.assertEqual(nulls, en)
-
- em = [(1, {}),
- (1, {3.0: u'bb'}),
- (2, {1.0: u'aa'}),
- (2, {1.0: u'cc'}),
- (3, {2.0: u'dd'})]
- self.sc.parallelize(em).saveAsSequenceFile(basepath + "/sfmap/")
- maps = self.sc.sequenceFile(basepath + "/sfmap/").collect()
- for v in maps:
- self.assertTrue(v, em)
-
def test_oldhadoop(self):
basepath = self.tempdir.name
dict_data = [(1, {}),
@@ -361,46 +218,6 @@ class OutputFormatTests(ReusedPySparkTestCase):
conf=input_conf).collect())
self.assertEqual(new_dataset, data)
- @unittest.skipIf(sys.version >= "3", "serialize of array")
- def test_newhadoop_with_array(self):
- basepath = self.tempdir.name
- # use custom ArrayWritable types and converters to handle arrays
- array_data = [(1, array('d')),
- (1, array('d', [1.0, 2.0, 3.0])),
- (2, array('d', [3.0, 4.0, 5.0]))]
- self.sc.parallelize(array_data).saveAsNewAPIHadoopFile(
- basepath + "/newhadoop/",
- "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.spark.api.python.DoubleArrayWritable",
- valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter")
- result = sorted(self.sc.newAPIHadoopFile(
- basepath + "/newhadoop/",
- "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.spark.api.python.DoubleArrayWritable",
- valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect())
- self.assertEqual(result, array_data)
-
- conf = {
- "mapreduce.job.outputformat.class":
- "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
- "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable",
- "mapreduce.job.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable",
- "mapreduce.output.fileoutputformat.outputdir": basepath + "/newdataset/"
- }
- self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset(
- conf,
- valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter")
- input_conf = {"mapreduce.input.fileinputformat.inputdir": basepath + "/newdataset/"}
- new_dataset = sorted(self.sc.newAPIHadoopRDD(
- "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
- "org.apache.hadoop.io.IntWritable",
- "org.apache.spark.api.python.DoubleArrayWritable",
- valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter",
- conf=input_conf).collect())
- self.assertEqual(new_dataset, array_data)
-
def test_newolderror(self):
basepath = self.tempdir.name
rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))
diff --git a/python/pyspark/tests/test_shuffle.py b/python/pyspark/tests/test_shuffle.py
index d50ba632d6..434414618e 100644
--- a/python/pyspark/tests/test_shuffle.py
+++ b/python/pyspark/tests/test_shuffle.py
@@ -23,15 +23,12 @@ from py4j.protocol import Py4JJavaError
from pyspark import shuffle, PickleSerializer, SparkConf, SparkContext
from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter
-if sys.version_info[0] >= 3:
- xrange = range
-
class MergerTests(unittest.TestCase):
def setUp(self):
self.N = 1 << 12
- self.l = [i for i in xrange(self.N)]
+ self.l = [i for i in range(self.N)]
self.data = list(zip(self.l, self.l))
self.agg = Aggregator(lambda x: [x],
lambda x, y: x.append(y) or x,
@@ -42,26 +39,26 @@ class MergerTests(unittest.TestCase):
m.mergeValues(self.data)
self.assertEqual(m.spills, 0)
self.assertEqual(sum(sum(v) for k, v in m.items()),
- sum(xrange(self.N)))
+ sum(range(self.N)))
m = ExternalMerger(self.agg, 1000)
m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), self.data))
self.assertEqual(m.spills, 0)
self.assertEqual(sum(sum(v) for k, v in m.items()),
- sum(xrange(self.N)))
+ sum(range(self.N)))
def test_medium_dataset(self):
m = ExternalMerger(self.agg, 20)
m.mergeValues(self.data)
self.assertTrue(m.spills >= 1)
self.assertEqual(sum(sum(v) for k, v in m.items()),
- sum(xrange(self.N)))
+ sum(range(self.N)))
m = ExternalMerger(self.agg, 10)
m.mergeCombiners(map(lambda x_y2: (x_y2[0], [x_y2[1]]), self.data * 3))
self.assertTrue(m.spills >= 1)
self.assertEqual(sum(sum(v) for k, v in m.items()),
- sum(xrange(self.N)) * 3)
+ sum(range(self.N)) * 3)
def test_huge_dataset(self):
m = ExternalMerger(self.agg, 5, partitions=3)
diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py
index 90e4bcdfad..8c2bedbe4e 100644
--- a/python/pyspark/tests/test_taskcontext.py
+++ b/python/pyspark/tests/test_taskcontext.py
@@ -26,9 +26,6 @@ import unittest
from pyspark import SparkConf, SparkContext, TaskContext, BarrierTaskContext
from pyspark.testing.utils import PySparkTestCase, SPARK_HOME
-if sys.version_info[0] >= 3:
- xrange = range
-
class TaskContextTests(PySparkTestCase):
@@ -251,9 +248,9 @@ class TaskContextTestsWithWorkerReuse(unittest.TestCase):
def test_task_context_correct_with_python_worker_reuse(self):
"""Verify the task context correct when reused python worker"""
# start a normal job first to start all workers and get all worker pids
- worker_pids = self.sc.parallelize(xrange(2), 2).map(lambda x: os.getpid()).collect()
+ worker_pids = self.sc.parallelize(range(2), 2).map(lambda x: os.getpid()).collect()
# the workers will be reused in this barrier job
- rdd = self.sc.parallelize(xrange(10), 2)
+ rdd = self.sc.parallelize(range(10), 2)
def context(iterator):
tp = TaskContext.get().partitionId()
diff --git a/python/pyspark/tests/test_util.py b/python/pyspark/tests/test_util.py
index 81bfb66e70..511d62a51f 100644
--- a/python/pyspark/tests/test_util.py
+++ b/python/pyspark/tests/test_util.py
@@ -61,14 +61,12 @@ class KeywordOnlyTests(unittest.TestCase):
class UtilTests(PySparkTestCase):
- def test_py4j_exception_message(self):
- from pyspark.util import _exception_message
-
+ def test_py4j_str(self):
with self.assertRaises(Py4JJavaError) as context:
# This attempts java.lang.String(null) which throws an NPE.
self.sc._jvm.java.lang.String(None)
- self.assertTrue('NullPointerException' in _exception_message(context.exception))
+ self.assertTrue('NullPointerException' in str(context.exception))
def test_parsing_version_string(self):
from pyspark.util import VersionUtils
diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
index dba9298ee1..3b1848dcfd 100644
--- a/python/pyspark/tests/test_worker.py
+++ b/python/pyspark/tests/test_worker.py
@@ -32,9 +32,6 @@ from py4j.protocol import Py4JJavaError
from pyspark import SparkConf, SparkContext
from pyspark.testing.utils import ReusedPySparkTestCase, PySparkTestCase, QuietTest
-if sys.version_info[0] >= 3:
- xrange = range
-
class WorkerTests(ReusedPySparkTestCase):
def test_cancel_task(self):
@@ -88,13 +85,13 @@ class WorkerTests(ReusedPySparkTestCase):
self.fail("daemon had been killed")
# run a normal job
- rdd = self.sc.parallelize(xrange(100), 1)
+ rdd = self.sc.parallelize(range(100), 1)
self.assertEqual(100, rdd.map(str).count())
def test_after_exception(self):
def raise_exception(_):
raise Exception()
- rdd = self.sc.parallelize(xrange(100), 1)
+ rdd = self.sc.parallelize(range(100), 1)
with QuietTest(self.sc):
self.assertRaises(Exception, lambda: rdd.foreach(raise_exception))
self.assertEqual(100, rdd.map(str).count())
@@ -110,22 +107,22 @@ class WorkerTests(ReusedPySparkTestCase):
with QuietTest(self.sc):
self.assertRaises(Exception, lambda: filtered_data.count())
- rdd = self.sc.parallelize(xrange(100), 1)
+ rdd = self.sc.parallelize(range(100), 1)
self.assertEqual(100, rdd.map(str).count())
def test_accumulator_when_reuse_worker(self):
from pyspark.accumulators import INT_ACCUMULATOR_PARAM
acc1 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM)
- self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc1.add(x))
+ self.sc.parallelize(range(100), 20).foreach(lambda x: acc1.add(x))
self.assertEqual(sum(range(100)), acc1.value)
acc2 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM)
- self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc2.add(x))
+ self.sc.parallelize(range(100), 20).foreach(lambda x: acc2.add(x))
self.assertEqual(sum(range(100)), acc2.value)
self.assertEqual(sum(range(100)), acc1.value)
def test_reuse_worker_after_take(self):
- rdd = self.sc.parallelize(xrange(100000), 1)
+ rdd = self.sc.parallelize(range(100000), 1)
self.assertEqual(0, rdd.first())
def count():
@@ -160,17 +157,13 @@ class WorkerTests(ReusedPySparkTestCase):
self.sc.parallelize([1]).map(lambda x: f()).count()
except Py4JJavaError as e:
- if sys.version_info.major < 3:
- # we have to use unicode here to avoid UnicodeDecodeError
- self.assertRegexpMatches(unicode(e).encode("utf-8"), "exception with 中")
- else:
- self.assertRegexpMatches(str(e), "exception with 中")
+ self.assertRegexpMatches(str(e), "exception with 中")
class WorkerReuseTest(PySparkTestCase):
- def test_reuse_worker_of_parallelize_xrange(self):
- rdd = self.sc.parallelize(xrange(20), 8)
+ def test_reuse_worker_of_parallelize_range(self):
+ rdd = self.sc.parallelize(range(20), 8)
previous_pids = rdd.map(lambda x: os.getpid()).collect()
current_pids = rdd.map(lambda x: os.getpid()).collect()
for pid in current_pids:
@@ -189,7 +182,7 @@ class WorkerMemoryTest(unittest.TestCase):
self.sc = SparkContext('local[4]', class_name, conf=conf)
def test_memory_limit(self):
- rdd = self.sc.parallelize(xrange(1), 1)
+ rdd = self.sc.parallelize(range(1), 1)
def getrlimit():
import resource
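The removed `unicode(e).encode("utf-8")` branch needs no replacement because Python 3's `str()` handles non-ASCII text directly:

```python
e = Exception("exception with 中")
assert "中" in str(e)   # no UnicodeDecodeError on Python 3
```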
diff --git a/python/pyspark/util.py b/python/pyspark/util.py
index d9429372a6..c003586e9c 100644
--- a/python/pyspark/util.py
+++ b/python/pyspark/util.py
@@ -19,52 +19,10 @@
import re
import sys
import traceback
-import os
-import warnings
-import inspect
-from py4j.protocol import Py4JJavaError
__all__ = []
-def _exception_message(excp):
- """Return the message from an exception as either a str or unicode object. Supports both
- Python 2 and Python 3.
-
- >>> msg = "Exception message"
- >>> excp = Exception(msg)
- >>> msg == _exception_message(excp)
- True
-
- >>> msg = u"unicöde"
- >>> excp = Exception(msg)
- >>> msg == _exception_message(excp)
- True
- """
- if isinstance(excp, Py4JJavaError):
- # 'Py4JJavaError' doesn't contain the stack trace available on the Java side in 'message'
- # attribute in Python 2. We should call 'str' function on this exception in general but
- # 'Py4JJavaError' has an issue about addressing non-ascii strings. So, here we work
- # around by the direct call, '__str__()'. Please see SPARK-23517.
- return excp.__str__()
- if hasattr(excp, "message"):
- return excp.message
- return str(excp)
-
-
-def _get_argspec(f):
- """
- Get argspec of a function. Supports both Python 2 and Python 3.
- """
- if sys.version_info[0] < 3:
- argspec = inspect.getargspec(f)
- else:
- # `getargspec` is deprecated since python3.0 (incompatible with function annotations).
- # See SPARK-23569.
- argspec = inspect.getfullargspec(f)
- return argspec
-
-
def print_exec(stream):
ei = sys.exc_info()
traceback.print_exception(ei[0], ei[1], ei[2], None, stream)
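`inspect.getfullargspec` is the Python 3 replacement that the deleted `_get_argspec()` used to select at runtime; unlike the deprecated `getargspec`, it also reports keyword-only arguments and annotations (see SPARK-23569). A quick self-contained check:

```python
from inspect import getfullargspec

def key_func(key, pdf=None, *, strict=True):
    return key

spec = getfullargspec(key_func)
assert spec.args == ['key', 'pdf'] and spec.kwonlyargs == ['strict']
```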
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 5f4a8a2d2d..9b54affb13 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -18,11 +18,11 @@
"""
Worker that receives input from Piped RDD.
"""
-from __future__ import print_function
-from __future__ import absolute_import
import os
import sys
import time
+from inspect import getfullargspec
+import importlib
# 'resource' is a Unix specific module.
has_resource_module = True
try:
@@ -44,14 +44,9 @@ from pyspark.serializers import write_with_length, write_int, read_long, read_bo
from pyspark.sql.pandas.serializers import ArrowStreamPandasUDFSerializer, CogroupUDFSerializer
from pyspark.sql.pandas.types import to_arrow_type
from pyspark.sql.types import StructType
-from pyspark.util import _get_argspec, fail_on_stopiteration
+from pyspark.util import fail_on_stopiteration
from pyspark import shuffle
-if sys.version >= '3':
- basestring = str
-else:
- from itertools import imap as map # use iterator map by default
-
pickleSer = PickleSerializer()
utf8_deserializer = UTF8Deserializer()
@@ -272,10 +267,10 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
elif eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF:
return arg_offsets, wrap_pandas_iter_udf(func, return_type)
elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
- argspec = _get_argspec(chained_func) # signature was lost when wrapping it
+ argspec = getfullargspec(chained_func) # signature was lost when wrapping it
return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec)
elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
- argspec = _get_argspec(chained_func) # signature was lost when wrapping it
+ argspec = getfullargspec(chained_func) # signature was lost when wrapping it
return arg_offsets, wrap_cogrouped_map_pandas_udf(func, return_type, argspec)
elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type)
@@ -342,11 +337,13 @@ def read_udfs(pickleSer, infile, eval_type):
pickleSer, infile, eval_type, runner_conf, udf_index=0)
def func(_, iterator):
- num_input_rows = [0] # TODO(SPARK-29909): Use nonlocal after we drop Python 2.
+ num_input_rows = 0
def map_batch(batch):
+ nonlocal num_input_rows
+
udf_args = [batch[offset] for offset in arg_offsets]
- num_input_rows[0] += len(udf_args[0])
+ num_input_rows += len(udf_args[0])
if len(udf_args) == 1:
return udf_args[0]
else:
@@ -363,7 +360,7 @@ def read_udfs(pickleSer, infile, eval_type):
# by consuming the input iterator in user side. Therefore,
# it's very unlikely the output length is higher than
# input length.
- assert is_map_iter or num_output_rows <= num_input_rows[0], \
+ assert is_map_iter or num_output_rows <= num_input_rows, \
"Pandas SCALAR_ITER UDF outputted more rows than input rows."
yield (result_batch, result_type)
@@ -376,11 +373,11 @@ def read_udfs(pickleSer, infile, eval_type):
raise RuntimeError("pandas iterator UDF should exhaust the input "
"iterator.")
- if num_output_rows != num_input_rows[0]:
+ if num_output_rows != num_input_rows:
raise RuntimeError(
"The length of output in Scalar iterator pandas UDF should be "
"the same with the input's; however, the length of output was %d and the "
- "length of input was %d." % (num_output_rows, num_input_rows[0]))
+ "length of input was %d." % (num_output_rows, num_input_rows))
# profiling is not supported for UDF
return func, None, ser, ser
@@ -548,9 +545,8 @@ def main(infile, outfile):
for _ in range(num_python_includes):
filename = utf8_deserializer.loads(infile)
add_path(os.path.join(spark_files_dir, filename))
- if sys.version > '3':
- import importlib
- importlib.invalidate_caches()
+
+ importlib.invalidate_caches()
# fetch names and values of broadcast variables
needs_broadcast_decryption_server = read_bool(infile)
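The `num_input_rows = [0]` one-element-list trick removed above existed only because Python 2 lacked `nonlocal`. A sketch of the Python 3 closure pattern the worker now uses:

```python
def make_counter():
    count = 0
    def add(n):
        nonlocal count   # rebind the enclosing variable directly
        count += n
        return count
    return add

bump = make_counter()
assert bump(3) == 3 and bump(4) == 7
```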
diff --git a/python/run-tests.py b/python/run-tests.py
index 42510c7642..23076eab1c 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -28,10 +28,7 @@ import tempfile
from threading import Thread, Lock
import time
import uuid
-if sys.version < '3':
- import Queue
-else:
- import queue as Queue
+import queue as Queue
from multiprocessing import Manager
@@ -75,7 +72,6 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
'SPARK_PREPEND_CLASSES': '1',
'PYSPARK_PYTHON': which(pyspark_python),
'PYSPARK_DRIVER_PYTHON': which(pyspark_python),
- 'PYSPARK_ROW_FIELD_SORTING_ENABLED': 'true'
})
# Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
@@ -161,7 +157,8 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
def get_default_python_executables():
- python_execs = [x for x in ["python3.6", "python2.7", "pypy3", "pypy"] if which(x)]
+ # TODO(SPARK-32278): install PyPy3 in Jenkins to test
+ python_execs = [x for x in ["python3.6", "python3.8", "pypy3"] if which(x)]
if "python3.6" not in python_execs:
p = which("python3")
diff --git a/python/setup.py b/python/setup.py
index afbd601b04..c456a32fea 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -16,18 +16,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import print_function
import glob
import os
import sys
from setuptools import setup
from shutil import copyfile, copytree, rmtree
-if sys.version_info < (2, 7):
- print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
- file=sys.stderr)
- sys.exit(-1)
-
try:
exec(open('pyspark/version.py').read())
except IOError:
@@ -217,13 +211,10 @@ try:
'pyarrow>=%s' % _minimum_pyarrow_version,
]
},
+ python_requires='>=3.6',
classifiers=[
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: Apache Software License',
- 'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.4',
- 'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
diff --git a/resource-managers/kubernetes/integration-tests/tests/pyfiles.py b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py
index ba55b75803..51c0160554 100644
--- a/resource-managers/kubernetes/integration-tests/tests/pyfiles.py
+++ b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py
@@ -14,9 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
-from __future__ import print_function
-
import sys
from pyspark.sql import SparkSession
diff --git a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py
index d312a29f38..74559a0b54 100644
--- a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py
+++ b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py
@@ -15,8 +15,6 @@
# limitations under the License.
#
-from __future__ import print_function
-
import resource
import sys
diff --git a/sql/hive/src/test/resources/data/scripts/cat.py b/sql/hive/src/test/resources/data/scripts/cat.py
index aea0362f89..420d9f832a 100644
--- a/sql/hive/src/test/resources/data/scripts/cat.py
+++ b/sql/hive/src/test/resources/data/scripts/cat.py
@@ -16,7 +16,6 @@
# specific language governing permissions and limitations
# under the License.
#
-from __future__ import print_function
import sys
import os
diff --git a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py
index 5b360208d3..f724fdc85b 100644
--- a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py
+++ b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py
@@ -18,12 +18,9 @@
#
import sys
-if sys.version_info[0] >= 3:
- xrange = range
-
-for i in xrange(50):
- for j in xrange(5):
- for k in xrange(20022):
+for i in range(50):
+ for j in range(5):
+ for k in range(20022):
print(20000 * i + k)
for line in sys.stdin: