From 4ad9bfd53b84a6d2497668c73af6899bae14c187 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Tue, 14 Jul 2020 11:22:44 +0900
Subject: [PATCH] [SPARK-32138] Drop Python 2.7, 3.4 and 3.5

### What changes were proposed in this pull request?

This PR aims to drop Python 2.7, 3.4 and 3.5. Roughly speaking, it removes all the widely known Python 2 compatibility workarounds, such as `sys.version` comparisons and `__future__` imports. It also removes Python 2-dedicated code in Spark, such as `ArrayConstructor`.

### Why are the changes needed?

1. Stop supporting EOL Python versions.
2. Reduce maintenance overhead by removing legacy code and hacks for Python 2.
3. PyPy2 has a critical bug that causes a flaky test (SPARK-28358), based on my testing and investigation.
4. Users can use Python type hints with Pandas UDFs without worrying about the Python version.
5. Users can leverage the latest cloudpickle (https://github.com/apache/spark/pull/28950). With Python 3.8+ it can also leverage the C pickle implementation.

### Does this PR introduce _any_ user-facing change?

Yes, users can no longer use Python 2.7, 3.4 and 3.5 in the upcoming Spark version.

### How was this patch tested?

Manually tested and also tested in Jenkins.

Closes #28957 from HyukjinKwon/SPARK-32138.

Authored-by: HyukjinKwon
Signed-off-by: HyukjinKwon
---
 .github/workflows/master.yml | 3 +- .../apache/spark/api/python/SerDeUtil.scala | 66 ----- dev/create-release/releaseutils.py | 7 +- dev/github_jira_sync.py | 11 +- dev/lint-python | 10 +- dev/merge_spark_pr.py | 12 +- dev/run-tests-jenkins.py | 12 +- dev/sparktestsupport/toposort.py | 3 +- docs/configuration.md | 2 +- docs/index.md | 3 +- docs/rdd-programming-guide.md | 12 +- examples/src/main/python/als.py | 2 - examples/src/main/python/avro_inputformat.py | 2 - examples/src/main/python/kmeans.py | 2 - .../src/main/python/logistic_regression.py | 2 - .../main/python/ml/aft_survival_regression.py | 2 - examples/src/main/python/ml/als_example.py | 8 +- .../main/python/ml/anova_selector_example.py | 2 - .../src/main/python/ml/anova_test_example.py | 2 - .../src/main/python/ml/binarizer_example.py | 2 - .../python/ml/bisecting_k_means_example.py | 2 - .../bucketed_random_projection_lsh_example.py | 2 - .../src/main/python/ml/bucketizer_example.py | 2 - .../main/python/ml/chi_square_test_example.py | 2 - .../main/python/ml/chisq_selector_example.py | 2 - .../src/main/python/ml/correlation_example.py | 2 - .../python/ml/count_vectorizer_example.py | 2 - .../src/main/python/ml/cross_validator.py | 2 - .../src/main/python/ml/dataframe_example.py | 2 - examples/src/main/python/ml/dct_example.py | 2 - .../decision_tree_classification_example.py | 2 - .../ml/decision_tree_regression_example.py | 2 - .../python/ml/elementwise_product_example.py | 2 - .../ml/estimator_transformer_param_example.py | 2 - .../main/python/ml/feature_hasher_example.py | 2 - .../main/python/ml/fm_classifier_example.py | 2 - .../main/python/ml/fm_regressor_example.py | 2 - .../main/python/ml/fvalue_selector_example.py | 2 - .../src/main/python/ml/fvalue_test_example.py | 2 - .../python/ml/gaussian_mixture_example.py | 2 - .../generalized_linear_regression_example.py | 2 - ...radient_boosted_tree_classifier_example.py | 2 - ...gradient_boosted_tree_regressor_example.py | 2 - .../main/python/ml/index_to_string_example.py | 2 - .../src/main/python/ml/interaction_example.py | 2 - .../python/ml/isotonic_regression_example.py | 2 - examples/src/main/python/ml/kmeans_example.py | 2 - examples/src/main/python/ml/lda_example.py | 2 -
.../ml/linear_regression_with_elastic_net.py | 2 - examples/src/main/python/ml/linearsvc.py | 2 - .../ml/logistic_regression_summary_example.py | 2 - .../logistic_regression_with_elastic_net.py | 2 - .../main/python/ml/max_abs_scaler_example.py | 2 - .../main/python/ml/min_hash_lsh_example.py | 2 - .../main/python/ml/min_max_scaler_example.py | 2 - ...ss_logistic_regression_with_elastic_net.py | 2 - .../multilayer_perceptron_classification.py | 2 - examples/src/main/python/ml/n_gram_example.py | 2 - .../src/main/python/ml/naive_bayes_example.py | 2 - .../src/main/python/ml/normalizer_example.py | 2 - .../src/main/python/ml/one_vs_rest_example.py | 2 - .../main/python/ml/onehot_encoder_example.py | 2 - examples/src/main/python/ml/pca_example.py | 2 - .../python/ml/polynomial_expansion_example.py | 2 - .../python/ml/quantile_discretizer_example.py | 2 - .../ml/random_forest_classifier_example.py | 2 - .../ml/random_forest_regressor_example.py | 2 - .../src/main/python/ml/rformula_example.py | 2 - .../main/python/ml/robust_scaler_example.py | 2 - .../src/main/python/ml/sql_transformer.py | 2 - .../main/python/ml/standard_scaler_example.py | 2 - .../python/ml/stopwords_remover_example.py | 2 - .../main/python/ml/string_indexer_example.py | 2 - .../src/main/python/ml/summarizer_example.py | 2 - examples/src/main/python/ml/tf_idf_example.py | 2 - .../src/main/python/ml/tokenizer_example.py | 2 - .../ml/variance_threshold_selector_example.py | 2 - .../python/ml/vector_assembler_example.py | 2 - .../main/python/ml/vector_indexer_example.py | 2 - .../python/ml/vector_size_hint_example.py | 2 - .../main/python/ml/vector_slicer_example.py | 2 - .../src/main/python/ml/word2vec_example.py | 2 - .../binary_classification_metrics_example.py | 1 - .../python/mllib/bisecting_k_means_example.py | 2 - .../src/main/python/mllib/correlations.py | 2 - .../main/python/mllib/correlations_example.py | 2 - .../decision_tree_classification_example.py | 2 - .../mllib/decision_tree_regression_example.py | 2 - .../mllib/elementwise_product_example.py | 2 - .../python/mllib/gaussian_mixture_example.py | 2 - .../python/mllib/gaussian_mixture_model.py | 7 +- ...radient_boosting_classification_example.py | 2 - .../gradient_boosting_regression_example.py | 2 - .../mllib/hypothesis_testing_example.py | 2 - ...testing_kolmogorov_smirnov_test_example.py | 2 - .../mllib/isotonic_regression_example.py | 2 - .../src/main/python/mllib/k_means_example.py | 2 - .../kernel_density_estimation_example.py | 2 - examples/src/main/python/mllib/kmeans.py | 2 - .../latent_dirichlet_allocation_example.py | 2 - .../linear_regression_with_sgd_example.py | 2 - .../main/python/mllib/logistic_regression.py | 2 - .../logistic_regression_with_lbfgs_example.py | 2 - .../main/python/mllib/naive_bayes_example.py | 2 - .../main/python/mllib/normalizer_example.py | 2 - .../power_iteration_clustering_example.py | 2 - .../random_forest_classification_example.py | 2 - .../mllib/random_forest_regression_example.py | 2 - .../python/mllib/random_rdd_generation.py | 2 - .../python/mllib/recommendation_example.py | 2 - .../src/main/python/mllib/sampled_rdds.py | 2 - .../python/mllib/standard_scaler_example.py | 2 - .../mllib/stratified_sampling_example.py | 2 - .../python/mllib/streaming_k_means_example.py | 2 - .../streaming_linear_regression_example.py | 2 - .../mllib/summary_statistics_example.py | 2 - .../src/main/python/mllib/tf_idf_example.py | 2 - examples/src/main/python/mllib/word2vec.py | 2 - .../src/main/python/mllib/word2vec_example.py | 2 - 
examples/src/main/python/pagerank.py | 2 - .../src/main/python/parquet_inputformat.py | 2 - examples/src/main/python/pi.py | 2 - examples/src/main/python/sort.py | 2 - examples/src/main/python/sql/arrow.py | 9 - examples/src/main/python/sql/basic.py | 2 - examples/src/main/python/sql/datasource.py | 2 - examples/src/main/python/sql/hive.py | 2 - .../streaming/structured_kafka_wordcount.py | 2 - .../streaming/structured_network_wordcount.py | 2 - .../structured_network_wordcount_windowed.py | 2 - examples/src/main/python/status_api_demo.py | 7 +- .../main/python/streaming/hdfs_wordcount.py | 2 - .../python/streaming/network_wordcount.py | 2 - .../streaming/network_wordjoinsentiments.py | 2 - .../recoverable_network_wordcount.py | 2 - .../python/streaming/sql_network_wordcount.py | 2 - .../streaming/stateful_network_wordcount.py | 2 - .../src/main/python/transitive_closure.py | 2 - examples/src/main/python/wordcount.py | 2 - .../streaming/kinesis_wordcount_asl.py | 2 - python/pyspark/accumulators.py | 5 +- python/pyspark/broadcast.py | 10 +- python/pyspark/conf.py | 25 +- python/pyspark/context.py | 44 +-- python/pyspark/find_spark_home.py | 31 +- python/pyspark/java_gateway.py | 7 +- python/pyspark/ml/classification.py | 8 +- python/pyspark/ml/common.py | 7 +- python/pyspark/ml/feature.py | 39 +-- python/pyspark/ml/fpm.py | 6 +- python/pyspark/ml/image.py | 3 +- python/pyspark/ml/linalg/__init__.py | 33 +-- python/pyspark/ml/param/__init__.py | 25 +- .../ml/param/_shared_params_code_gen.py | 2 - python/pyspark/ml/pipeline.py | 6 +- python/pyspark/ml/tests/test_feature.py | 5 +- python/pyspark/ml/tests/test_param.py | 15 +- .../pyspark/ml/tests/test_training_summary.py | 5 +- python/pyspark/ml/tree.py | 6 +- python/pyspark/ml/tuning.py | 3 +- python/pyspark/ml/util.py | 20 +- python/pyspark/ml/wrapper.py | 12 +- python/pyspark/mllib/__init__.py | 2 - python/pyspark/mllib/clustering.py | 22 +- python/pyspark/mllib/common.py | 7 +- python/pyspark/mllib/feature.py | 19 +- python/pyspark/mllib/fpm.py | 7 +- python/pyspark/mllib/linalg/__init__.py | 32 +-- python/pyspark/mllib/linalg/distributed.py | 35 ++- python/pyspark/mllib/stat/KernelDensity.py | 5 - python/pyspark/mllib/stat/_statistics.py | 14 +- python/pyspark/mllib/tests/test_linalg.py | 5 +- python/pyspark/mllib/tree.py | 2 - python/pyspark/mllib/util.py | 14 +- python/pyspark/rdd.py | 39 +-- python/pyspark/resultiterable.py | 5 +- python/pyspark/serializers.py | 50 +--- python/pyspark/shell.py | 5 +- python/pyspark/sql/__init__.py | 3 - python/pyspark/sql/avro/functions.py | 7 +- python/pyspark/sql/catalog.py | 10 +- python/pyspark/sql/column.py | 80 +++--- python/pyspark/sql/conf.py | 10 +- python/pyspark/sql/context.py | 46 ++- python/pyspark/sql/dataframe.py | 272 ++++++++---------- python/pyspark/sql/functions.py | 212 +++++--------- python/pyspark/sql/group.py | 8 +- python/pyspark/sql/pandas/conversion.py | 24 +- python/pyspark/sql/pandas/functions.py | 43 ++- python/pyspark/sql/pandas/serializers.py | 13 +- python/pyspark/sql/readwriter.py | 39 ++- python/pyspark/sql/session.py | 45 +-- python/pyspark/sql/streaming.py | 28 +- python/pyspark/sql/tests/test_arrow.py | 14 +- python/pyspark/sql/tests/test_column.py | 10 +- python/pyspark/sql/tests/test_context.py | 6 +- python/pyspark/sql/tests/test_functions.py | 4 - .../sql/tests/test_pandas_cogrouped_map.py | 15 +- .../sql/tests/test_pandas_grouped_map.py | 39 ++- python/pyspark/sql/tests/test_pandas_map.py | 3 - .../sql/tests/test_pandas_udf_scalar.py | 27 +- 
.../sql/tests/test_pandas_udf_typehints.py | 254 +++++++--------- python/pyspark/sql/tests/test_types.py | 27 +- python/pyspark/sql/types.py | 101 +------ python/pyspark/sql/udf.py | 14 +- python/pyspark/sql/utils.py | 21 +- python/pyspark/streaming/context.py | 2 - python/pyspark/streaming/dstream.py | 7 +- python/pyspark/taskcontext.py | 4 - python/pyspark/testing/sqlutils.py | 7 +- python/pyspark/tests/test_profiler.py | 6 +- python/pyspark/tests/test_rdd.py | 21 +- python/pyspark/tests/test_readwrite.py | 183 ------------ python/pyspark/tests/test_shuffle.py | 13 +- python/pyspark/tests/test_taskcontext.py | 7 +- python/pyspark/tests/test_util.py | 6 +- python/pyspark/tests/test_worker.py | 27 +- python/pyspark/util.py | 42 --- python/pyspark/worker.py | 32 +-- python/run-tests.py | 9 +- python/setup.py | 11 +- .../integration-tests/tests/pyfiles.py | 3 - .../tests/worker_memory_check.py | 2 - .../src/test/resources/data/scripts/cat.py | 1 - .../resources/data/scripts/dumpdata_script.py | 9 +- 225 files changed, 735 insertions(+), 2033 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index d6458bf44f..5cf00c6ed9 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -133,7 +133,8 @@ jobs: architecture: x64 - name: Install Python 3.6 uses: actions/setup-python@v2 - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + # Yarn has a Python specific test too, for example, YarnClusterSuite. + if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) with: python-version: 3.6 architecture: x64 diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 01e64b6972..5a6fa50796 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -45,71 +45,6 @@ private[spark] object SerDeUtil extends Logging { } } } - // Unpickle array.array generated by Python 2.6 - class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor { - // /* Description of types */ - // static struct arraydescr descriptors[] = { - // {'c', sizeof(char), c_getitem, c_setitem}, - // {'b', sizeof(char), b_getitem, b_setitem}, - // {'B', sizeof(char), BB_getitem, BB_setitem}, - // #ifdef Py_USING_UNICODE - // {'u', sizeof(Py_UNICODE), u_getitem, u_setitem}, - // #endif - // {'h', sizeof(short), h_getitem, h_setitem}, - // {'H', sizeof(short), HH_getitem, HH_setitem}, - // {'i', sizeof(int), i_getitem, i_setitem}, - // {'I', sizeof(int), II_getitem, II_setitem}, - // {'l', sizeof(long), l_getitem, l_setitem}, - // {'L', sizeof(long), LL_getitem, LL_setitem}, - // {'f', sizeof(float), f_getitem, f_setitem}, - // {'d', sizeof(double), d_getitem, d_setitem}, - // {'\0', 0, 0, 0} /* Sentinel */ - // }; - val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { - Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9, - 'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21 - ) - } else { - Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8, - 'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20 - ) - } - override def construct(args: Array[Object]): Object = { - if (args.length == 1) { - construct(args ++ Array("")) - } else if (args.length == 2 && 
args(1).isInstanceOf[String]) { - val typecode = args(0).asInstanceOf[String].charAt(0) - // This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly - val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1) - if (typecode == 'c') { - // It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses - // a string for array data instead of list as Python 2.7, and handles an array of - // typecode 'c' as 1-byte character. - val result = new Array[Char](data.length) - var i = 0 - while (i < data.length) { - result(i) = data(i).toChar - i += 1 - } - result - } else { - construct(typecode, machineCodes(typecode), data) - } - } else if (args.length == 2 && args(0) == "l") { - // On Python 2, an array of typecode 'l' should be handled as long rather than int. - val values = args(1).asInstanceOf[JArrayList[_]] - val result = new Array[Long](values.size) - var i = 0 - while (i < values.size) { - result(i) = values.get(i).asInstanceOf[Number].longValue() - i += 1 - } - result - } else { - super.construct(args) - } - } - } private var initialized = false // This should be called before trying to unpickle array.array from Python @@ -117,7 +52,6 @@ private[spark] object SerDeUtil extends Logging { def initialize(): Unit = { synchronized{ if (!initialized) { - Unpickler.registerConstructor("array", "array", new ArrayConstructor()) Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor()) Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor()) Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor()) diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index a5a26ae8f5..241b7ed539 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -49,8 +49,6 @@ except ImportError: print("Install using 'sudo pip install unidecode'") sys.exit(-1) -if sys.version < '3': - input = raw_input # noqa # Contributors list file name contributors_file_name = "contributors.txt" @@ -152,10 +150,7 @@ def get_commits(tag): if not is_valid_author(author): author = github_username # Guard against special characters - try: # Python 2 - author = unicode(author, "UTF-8") - except NameError: # Python 3 - author = str(author) + author = str(author) author = unidecode.unidecode(author).strip() commit = Commit(_hash, author, title, pr_number) commits.append(commit) diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index b444b74d40..b90afeebc5 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -22,14 +22,9 @@ import json import os import re import sys -if sys.version < '3': - from urllib2 import urlopen - from urllib2 import Request - from urllib2 import HTTPError -else: - from urllib.request import urlopen - from urllib.request import Request - from urllib.error import HTTPError +from urllib.request import urlopen +from urllib.request import Request +from urllib.error import HTTPError try: import jira.client diff --git a/dev/lint-python b/dev/lint-python index d5491f2447..1fddbfa64b 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -168,7 +168,15 @@ function sphinx_test { # Check that the documentation builds acceptably, skip check if sphinx is not installed. if ! hash "$SPHINX_BUILD" 2> /dev/null; then - echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now." + echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now." 
+ echo + return + fi + + # TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines + PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)') + if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then + echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now." echo return fi diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index 967cdace60..b42429d717 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -31,15 +31,9 @@ import re import subprocess import sys import traceback -if sys.version < '3': - input = raw_input # noqa - from urllib2 import urlopen - from urllib2 import Request - from urllib2 import HTTPError -else: - from urllib.request import urlopen - from urllib.request import Request - from urllib.error import HTTPError +from urllib.request import urlopen +from urllib.request import Request +from urllib.error import HTTPError try: import jira.client diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 13be9592d7..4ff5b327e3 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -22,15 +22,9 @@ import sys import json import functools import subprocess -if sys.version < '3': - from urllib2 import urlopen - from urllib2 import Request - from urllib2 import HTTPError, URLError -else: - from urllib.request import urlopen - from urllib.request import Request - from urllib.error import HTTPError, URLError - +from urllib.request import urlopen +from urllib.request import Request +from urllib.error import HTTPError, URLError from sparktestsupport import SPARK_HOME, ERROR_CODES from sparktestsupport.shellutils import run_cmd diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py index 8b2688d200..6785e481b5 100644 --- a/dev/sparktestsupport/toposort.py +++ b/dev/sparktestsupport/toposort.py @@ -24,8 +24,7 @@ # Moved functools import to the top of the file. # Changed assert to a ValueError. # Changed iter[items|keys] to [items|keys], for python 3 -# compatibility. I don't think it matters for python 2 these are -# now lists instead of iterables. +# compatibility. # Copy the input so as to leave it unmodified. # Renamed function from toposort2 to toposort. # Handle empty input. diff --git a/docs/configuration.md b/docs/configuration.md index 42f706b296..abf76105ae 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`: PYSPARK_PYTHON - Python binary executable to use for PySpark in both driver and workers (default is python2.7 if available, otherwise python). + Python binary executable to use for PySpark in both driver and workers (default is python3 if available, otherwise python). Property spark.pyspark.python take precedence if it is set diff --git a/docs/index.md b/docs/index.md index c0771ca170..8fd169e63f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html). Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation. -Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+. +Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+. 
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0. -Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0. For the Scala API, Spark {{site.SPARK_VERSION}} uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version ({{site.SCALA_BINARY_VERSION}}.x). diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md index 70bfefce47..07207f62bb 100644 --- a/docs/rdd-programming-guide.md +++ b/docs/rdd-programming-guide.md @@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;
-Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter, +Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter, so C libraries like NumPy can be used. It also works with PyPy 2.3+. -Note that Python 2 support is deprecated as of Spark 3.0.0. +Python 2, 3.4 and 3.5 supports were removed in Spark 3.1.0. Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as: @@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example: {% highlight bash %} -$ PYSPARK_PYTHON=python3.4 bin/pyspark -$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py +$ PYSPARK_PYTHON=python3.8 bin/pyspark +$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py {% endhighlight %}
@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`. -After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from +After the Jupyter Notebook server is launched, you can create a new notebook from the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of your notebook before you start to try Spark from the Jupyter notebook. @@ -447,7 +447,7 @@ Writables are automatically converted: - + diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index 6d3241876a..511634fd8f 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -21,8 +21,6 @@ pyspark.ml.recommendation.ALS for more conventional use. This example requires numpy (http://www.numpy.org/) """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/avro_inputformat.py b/examples/src/main/python/avro_inputformat.py index a18722c687..49ab37e7b3 100644 --- a/examples/src/main/python/avro_inputformat.py +++ b/examples/src/main/python/avro_inputformat.py @@ -43,8 +43,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \ {u'favorite_color': None, u'name': u'Alyssa'} {u'favorite_color': u'red', u'name': u'Ben'} """ -from __future__ import print_function - import sys from functools import reduce diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index a42d711fc5..022378619c 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -22,8 +22,6 @@ examples/src/main/python/ml/kmeans_example.py. This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/logistic_regression.py b/examples/src/main/python/logistic_regression.py index bcc4e0f4e8..4b83740152 100755 --- a/examples/src/main/python/logistic_regression.py +++ b/examples/src/main/python/logistic_regression.py @@ -22,8 +22,6 @@ to act on batches of input data using efficient matrix operations. In practice, one may prefer to use the LogisticRegression algorithm in ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py. """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py index 0a71f76418..2040a7876c 100644 --- a/examples/src/main/python/ml/aft_survival_regression.py +++ b/examples/src/main/python/ml/aft_survival_regression.py @@ -20,8 +20,6 @@ An example demonstrating aft survival regression. Run with: bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.regression import AFTSurvivalRegression from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py index 8b7ec9c439..b392639784 100644 --- a/examples/src/main/python/ml/als_example.py +++ b/examples/src/main/python/ml/als_example.py @@ -15,12 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - -import sys -if sys.version >= '3': - long = int - from pyspark.sql import SparkSession # $example on$ @@ -39,7 +33,7 @@ if __name__ == "__main__": lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd parts = lines.map(lambda row: row.value.split("::")) ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), - rating=float(p[2]), timestamp=long(p[3]))) + rating=float(p[2]), timestamp=int(p[3]))) ratings = spark.createDataFrame(ratingsRDD) (training, test) = ratings.randomSplit([0.8, 0.2]) diff --git a/examples/src/main/python/ml/anova_selector_example.py b/examples/src/main/python/ml/anova_selector_example.py index f8458f5d6e..da80fa6231 100644 --- a/examples/src/main/python/ml/anova_selector_example.py +++ b/examples/src/main/python/ml/anova_selector_example.py @@ -20,8 +20,6 @@ An example for ANOVASelector. Run with: bin/spark-submit examples/src/main/python/ml/anova_selector_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import ANOVASelector diff --git a/examples/src/main/python/ml/anova_test_example.py b/examples/src/main/python/ml/anova_test_example.py index 4119441cde..451e078f60 100644 --- a/examples/src/main/python/ml/anova_test_example.py +++ b/examples/src/main/python/ml/anova_test_example.py @@ -20,8 +20,6 @@ An example for ANOVA testing. Run with: bin/spark-submit examples/src/main/python/ml/anova_test_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py index 669bb2aeab..5d5ae4122e 100644 --- a/examples/src/main/python/ml/binarizer_example.py +++ b/examples/src/main/python/ml/binarizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import Binarizer diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 82adb338b5..513f80a09e 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -20,8 +20,6 @@ An example demonstrating bisecting k-means clustering. Run with: bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import BisectingKMeans from pyspark.ml.evaluation import ClusteringEvaluator diff --git a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py index 610176ea59..f5836091f3 100644 --- a/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py +++ b/examples/src/main/python/ml/bucketed_random_projection_lsh_example.py @@ -20,8 +20,6 @@ An example demonstrating BucketedRandomProjectionLSH. 
Run with: bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.feature import BucketedRandomProjectionLSH from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py index 742f35093b..5de67f7126 100644 --- a/examples/src/main/python/ml/bucketizer_example.py +++ b/examples/src/main/python/ml/bucketizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import Bucketizer diff --git a/examples/src/main/python/ml/chi_square_test_example.py b/examples/src/main/python/ml/chi_square_test_example.py index 2af7e683cd..bf15a03d9c 100644 --- a/examples/src/main/python/ml/chi_square_test_example.py +++ b/examples/src/main/python/ml/chi_square_test_example.py @@ -20,8 +20,6 @@ An example for Chi-square hypothesis testing. Run with: bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py index 028a9ea9d6..c83a8c1bc7 100644 --- a/examples/src/main/python/ml/chisq_selector_example.py +++ b/examples/src/main/python/ml/chisq_selector_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import ChiSqSelector diff --git a/examples/src/main/python/ml/correlation_example.py b/examples/src/main/python/ml/correlation_example.py index 1f4e402ac1..9006d54149 100644 --- a/examples/src/main/python/ml/correlation_example.py +++ b/examples/src/main/python/ml/correlation_example.py @@ -20,8 +20,6 @@ An example for computing correlation matrix. Run with: bin/spark-submit examples/src/main/python/ml/correlation_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.stat import Correlation diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py index f2e41db77d..b3ddfb128c 100644 --- a/examples/src/main/python/ml/count_vectorizer_example.py +++ b/examples/src/main/python/ml/count_vectorizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import CountVectorizer diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py index 6256d11504..0ad0865486 100644 --- a/examples/src/main/python/ml/cross_validator.py +++ b/examples/src/main/python/ml/cross_validator.py @@ -22,8 +22,6 @@ Run with: bin/spark-submit examples/src/main/python/ml/cross_validator.py """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py index cabc3de68f..d2bf937441 100644 --- a/examples/src/main/python/ml/dataframe_example.py +++ b/examples/src/main/python/ml/dataframe_example.py @@ -19,8 +19,6 @@ An example of how to use DataFrame for ML. Run with:: bin/spark-submit examples/src/main/python/ml/dataframe_example.py """ -from __future__ import print_function - import os import sys import tempfile diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py index c0457f8d0f..37da4f5e8f 100644 --- a/examples/src/main/python/ml/dct_example.py +++ b/examples/src/main/python/ml/dct_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import DCT from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py index d6e2977de0..eb7177b845 100644 --- a/examples/src/main/python/ml/decision_tree_classification_example.py +++ b/examples/src/main/python/ml/decision_tree_classification_example.py @@ -18,8 +18,6 @@ """ Decision Tree Classification Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py index 58d7ad921d..1ed1636a3d 100644 --- a/examples/src/main/python/ml/decision_tree_regression_example.py +++ b/examples/src/main/python/ml/decision_tree_regression_example.py @@ -18,8 +18,6 @@ """ Decision Tree Regression Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import DecisionTreeRegressor diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py index 590053998b..71eec8d432 100644 --- a/examples/src/main/python/ml/elementwise_product_example.py +++ b/examples/src/main/python/ml/elementwise_product_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import ElementwiseProduct from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py index eb21051435..1dcca6c201 100644 --- a/examples/src/main/python/ml/estimator_transformer_param_example.py +++ b/examples/src/main/python/ml/estimator_transformer_param_example.py @@ -18,8 +18,6 @@ """ Estimator Transformer Param Example. 
""" -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.classification import LogisticRegression diff --git a/examples/src/main/python/ml/feature_hasher_example.py b/examples/src/main/python/ml/feature_hasher_example.py index 6cf9ecc396..4fe573d19d 100644 --- a/examples/src/main/python/ml/feature_hasher_example.py +++ b/examples/src/main/python/ml/feature_hasher_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import FeatureHasher diff --git a/examples/src/main/python/ml/fm_classifier_example.py b/examples/src/main/python/ml/fm_classifier_example.py index 6e7c2ccf02..b47bdc5275 100644 --- a/examples/src/main/python/ml/fm_classifier_example.py +++ b/examples/src/main/python/ml/fm_classifier_example.py @@ -18,8 +18,6 @@ """ FMClassifier Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import FMClassifier diff --git a/examples/src/main/python/ml/fm_regressor_example.py b/examples/src/main/python/ml/fm_regressor_example.py index afd7639680..5c8133996a 100644 --- a/examples/src/main/python/ml/fm_regressor_example.py +++ b/examples/src/main/python/ml/fm_regressor_example.py @@ -18,8 +18,6 @@ """ FMRegressor Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import FMRegressor diff --git a/examples/src/main/python/ml/fvalue_selector_example.py b/examples/src/main/python/ml/fvalue_selector_example.py index 3158953a5d..f164af47eb 100644 --- a/examples/src/main/python/ml/fvalue_selector_example.py +++ b/examples/src/main/python/ml/fvalue_selector_example.py @@ -20,8 +20,6 @@ An example for FValueSelector. Run with: bin/spark-submit examples/src/main/python/ml/fvalue_selector_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import FValueSelector diff --git a/examples/src/main/python/ml/fvalue_test_example.py b/examples/src/main/python/ml/fvalue_test_example.py index 410b39e449..dfa8073e5a 100644 --- a/examples/src/main/python/ml/fvalue_test_example.py +++ b/examples/src/main/python/ml/fvalue_test_example.py @@ -20,8 +20,6 @@ An example for FValue testing. Run with: bin/spark-submit examples/src/main/python/ml/fvalue_test_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py index 4938a90418..1441faa792 100644 --- a/examples/src/main/python/ml/gaussian_mixture_example.py +++ b/examples/src/main/python/ml/gaussian_mixture_example.py @@ -20,8 +20,6 @@ A simple example demonstrating Gaussian Mixture Model (GMM). 
Run with: bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import GaussianMixture # $example off$ diff --git a/examples/src/main/python/ml/generalized_linear_regression_example.py b/examples/src/main/python/ml/generalized_linear_regression_example.py index a52f4650c1..06a8a5a2e9 100644 --- a/examples/src/main/python/ml/generalized_linear_regression_example.py +++ b/examples/src/main/python/ml/generalized_linear_regression_example.py @@ -20,8 +20,6 @@ An example demonstrating generalized linear regression. Run with: bin/spark-submit examples/src/main/python/ml/generalized_linear_regression_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.regression import GeneralizedLinearRegression diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py index c2042fd7b7..a7efa2170a 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Tree Classifier Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import GBTClassifier diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py index cc96c973e4..5e09b96c1e 100644 --- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py +++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Tree Regressor Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import GBTRegressor diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py index 33d104e8e3..98bdb89ce3 100644 --- a/examples/src/main/python/ml/index_to_string_example.py +++ b/examples/src/main/python/ml/index_to_string_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import IndexToString, StringIndexer # $example off$ diff --git a/examples/src/main/python/ml/interaction_example.py b/examples/src/main/python/ml/interaction_example.py index 4b63227191..ac365179b0 100644 --- a/examples/src/main/python/ml/interaction_example.py +++ b/examples/src/main/python/ml/interaction_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Interaction, VectorAssembler # $example off$ diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py index 89cba9dfc7..d7b893894f 100644 --- a/examples/src/main/python/ml/isotonic_regression_example.py +++ b/examples/src/main/python/ml/isotonic_regression_example.py @@ -21,8 +21,6 @@ Isotonic Regression Example. 
Run with: bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.regression import IsotonicRegression # $example off$ diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 80a878af67..47223fd953 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -22,8 +22,6 @@ Run with: This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import KMeans from pyspark.ml.evaluation import ClusteringEvaluator diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py index 97d1a042d1..a47dfa383c 100644 --- a/examples/src/main/python/ml/lda_example.py +++ b/examples/src/main/python/ml/lda_example.py @@ -20,8 +20,6 @@ An example demonstrating LDA. Run with: bin/spark-submit examples/src/main/python/ml/lda_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.clustering import LDA # $example off$ diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py index 6639e9160a..864fc76cff 100644 --- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.regression import LinearRegression # $example off$ diff --git a/examples/src/main/python/ml/linearsvc.py b/examples/src/main/python/ml/linearsvc.py index 9b79abbf96..61d726cf3f 100644 --- a/examples/src/main/python/ml/linearsvc.py +++ b/examples/src/main/python/ml/linearsvc.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LinearSVC # $example off$ diff --git a/examples/src/main/python/ml/logistic_regression_summary_example.py b/examples/src/main/python/ml/logistic_regression_summary_example.py index 2274ff707b..6d045108da 100644 --- a/examples/src/main/python/ml/logistic_regression_summary_example.py +++ b/examples/src/main/python/ml/logistic_regression_summary_example.py @@ -20,8 +20,6 @@ An example demonstrating Logistic Regression Summary. Run with: bin/spark-submit examples/src/main/python/ml/logistic_regression_summary_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ diff --git a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py index d095fbd373..916fdade27 100644 --- a/examples/src/main/python/ml/logistic_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/logistic_regression_with_elastic_net.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py index 45eda3cdad..d7ff3561ce 100644 --- a/examples/src/main/python/ml/max_abs_scaler_example.py +++ b/examples/src/main/python/ml/max_abs_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - # $example on$ from pyspark.ml.feature import MaxAbsScaler from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/min_hash_lsh_example.py b/examples/src/main/python/ml/min_hash_lsh_example.py index 93136e6ae3..683f97a055 100644 --- a/examples/src/main/python/ml/min_hash_lsh_example.py +++ b/examples/src/main/python/ml/min_hash_lsh_example.py @@ -20,8 +20,6 @@ An example demonstrating MinHashLSH. Run with: bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.feature import MinHashLSH from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py index b5f272e59b..cd74243699 100644 --- a/examples/src/main/python/ml/min_max_scaler_example.py +++ b/examples/src/main/python/ml/min_max_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import MinMaxScaler from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py index bec9860c79..3bb4a72864 100644 --- a/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py +++ b/examples/src/main/python/ml/multiclass_logistic_regression_with_elastic_net.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression # $example off$ diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py index 88fc69f753..74f5321935 100644 --- a/examples/src/main/python/ml/multilayer_perceptron_classification.py +++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import MultilayerPerceptronClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py index 31676e076a..8c8031b939 100644 --- a/examples/src/main/python/ml/n_gram_example.py +++ b/examples/src/main/python/ml/n_gram_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import NGram # $example off$ diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py index 7290ab81cd..8d1777c6f9 100644 --- a/examples/src/main/python/ml/naive_bayes_example.py +++ b/examples/src/main/python/ml/naive_bayes_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.classification import NaiveBayes from pyspark.ml.evaluation import MulticlassClassificationEvaluator diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py index 510bd825fd..2aa012961a 100644 --- a/examples/src/main/python/ml/normalizer_example.py +++ b/examples/src/main/python/ml/normalizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Normalizer from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 956e94ae4a..4cae1a9980 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -21,8 +21,6 @@ using Logistic Regression as the base classifier. Run with: bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ -from __future__ import print_function - # $example on$ from pyspark.ml.classification import LogisticRegression, OneVsRest from pyspark.ml.evaluation import MulticlassClassificationEvaluator diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py index 73775b79e3..6deb84ed78 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import OneHotEncoder # $example off$ diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py index 38746aced0..03fb709c8e 100644 --- a/examples/src/main/python/ml/pca_example.py +++ b/examples/src/main/python/ml/pca_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import PCA from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py index 40bcb7b13a..75f436e768 100644 --- a/examples/src/main/python/ml/polynomial_expansion_example.py +++ b/examples/src/main/python/ml/polynomial_expansion_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import PolynomialExpansion from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py index 0fc1d1949a..82be3936d2 100644 --- a/examples/src/main/python/ml/quantile_discretizer_example.py +++ b/examples/src/main/python/ml/quantile_discretizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import QuantileDiscretizer # $example off$ diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py index 4eaa94dd7f..8983d1f2e9 100644 --- a/examples/src/main/python/ml/random_forest_classifier_example.py +++ b/examples/src/main/python/ml/random_forest_classifier_example.py @@ -18,8 +18,6 @@ """ Random Forest Classifier Example. """ -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py index a34edff2ec..b9306ddf2f 100644 --- a/examples/src/main/python/ml/random_forest_regressor_example.py +++ b/examples/src/main/python/ml/random_forest_regressor_example.py @@ -18,8 +18,6 @@ """ Random Forest Regressor Example. 
""" -from __future__ import print_function - # $example on$ from pyspark.ml import Pipeline from pyspark.ml.regression import RandomForestRegressor diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py index 6629239db2..25bb6dac56 100644 --- a/examples/src/main/python/ml/rformula_example.py +++ b/examples/src/main/python/ml/rformula_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import RFormula # $example off$ diff --git a/examples/src/main/python/ml/robust_scaler_example.py b/examples/src/main/python/ml/robust_scaler_example.py index 435e9ccb80..9f7c6d6507 100644 --- a/examples/src/main/python/ml/robust_scaler_example.py +++ b/examples/src/main/python/ml/robust_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import RobustScaler # $example off$ diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py index 0bf8f35720..c8ac5c46aa 100644 --- a/examples/src/main/python/ml/sql_transformer.py +++ b/examples/src/main/python/ml/sql_transformer.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import SQLTransformer # $example off$ diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py index c0027480e6..9021c10075 100644 --- a/examples/src/main/python/ml/standard_scaler_example.py +++ b/examples/src/main/python/ml/standard_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import StandardScaler # $example off$ diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py index 3b8e7855e3..832a7c7d0a 100644 --- a/examples/src/main/python/ml/stopwords_remover_example.py +++ b/examples/src/main/python/ml/stopwords_remover_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import StopWordsRemover # $example off$ diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py index 2255bfb9c1..f2ac63eabd 100644 --- a/examples/src/main/python/ml/string_indexer_example.py +++ b/examples/src/main/python/ml/string_indexer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import StringIndexer # $example off$ diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py index 8835f189a1..4982746450 100644 --- a/examples/src/main/python/ml/summarizer_example.py +++ b/examples/src/main/python/ml/summarizer_example.py @@ -20,8 +20,6 @@ An example for summarizer. 
Run with: bin/spark-submit examples/src/main/python/ml/summarizer_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.stat import Summarizer diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py index d43244fa68..b4bb0dfa31 100644 --- a/examples/src/main/python/ml/tf_idf_example.py +++ b/examples/src/main/python/ml/tf_idf_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import HashingTF, IDF, Tokenizer # $example off$ diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py index 5c65c5c9f8..c6b5fac227 100644 --- a/examples/src/main/python/ml/tokenizer_example.py +++ b/examples/src/main/python/ml/tokenizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Tokenizer, RegexTokenizer from pyspark.sql.functions import col, udf diff --git a/examples/src/main/python/ml/variance_threshold_selector_example.py b/examples/src/main/python/ml/variance_threshold_selector_example.py index b7edb86653..0a996e0e28 100644 --- a/examples/src/main/python/ml/variance_threshold_selector_example.py +++ b/examples/src/main/python/ml/variance_threshold_selector_example.py @@ -20,8 +20,6 @@ An example for VarianceThresholdSelector. Run with: bin/spark-submit examples/src/main/python/ml/variance_threshold_selector_example.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on$ from pyspark.ml.feature import VarianceThresholdSelector diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py index 98de1d5ea7..0ce31cf0ea 100644 --- a/examples/src/main/python/ml/vector_assembler_example.py +++ b/examples/src/main/python/ml/vector_assembler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.feature import VectorAssembler diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py index 5c2956077d..51a4191606 100644 --- a/examples/src/main/python/ml/vector_indexer_example.py +++ b/examples/src/main/python/ml/vector_indexer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import VectorIndexer # $example off$ diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py index fb77dacec6..355d85aee8 100644 --- a/examples/src/main/python/ml/vector_size_hint_example.py +++ b/examples/src/main/python/ml/vector_size_hint_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.linalg import Vectors from pyspark.ml.feature import (VectorSizeHint, VectorAssembler) diff --git a/examples/src/main/python/ml/vector_slicer_example.py b/examples/src/main/python/ml/vector_slicer_example.py index 68c8cfe27e..86e089d152 100644 --- a/examples/src/main/python/ml/vector_slicer_example.py +++ b/examples/src/main/python/ml/vector_slicer_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - # $example on$ from pyspark.ml.feature import VectorSlicer from pyspark.ml.linalg import Vectors diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py index 77f8951df0..0eabeda3dc 100644 --- a/examples/src/main/python/ml/word2vec_example.py +++ b/examples/src/main/python/ml/word2vec_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from pyspark.ml.feature import Word2Vec # $example off$ diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py index d14ce7982e..741746e6e3 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -17,7 +17,6 @@ """ Binary Classification Metrics Example. """ -from __future__ import print_function from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS diff --git a/examples/src/main/python/mllib/bisecting_k_means_example.py b/examples/src/main/python/mllib/bisecting_k_means_example.py index 36e36fc689..d7b6ad9d42 100644 --- a/examples/src/main/python/mllib/bisecting_k_means_example.py +++ b/examples/src/main/python/mllib/bisecting_k_means_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from numpy import array # $example off$ diff --git a/examples/src/main/python/mllib/correlations.py b/examples/src/main/python/mllib/correlations.py index 089504fa70..27d07b22a5 100755 --- a/examples/src/main/python/mllib/correlations.py +++ b/examples/src/main/python/mllib/correlations.py @@ -18,8 +18,6 @@ """ Correlations using MLlib. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index 66d18f6e5d..bb71b96868 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import numpy as np from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/decision_tree_classification_example.py b/examples/src/main/python/mllib/decision_tree_classification_example.py index 7eecf50058..009e393226 100644 --- a/examples/src/main/python/mllib/decision_tree_classification_example.py +++ b/examples/src/main/python/mllib/decision_tree_classification_example.py @@ -18,8 +18,6 @@ """ Decision Tree Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import DecisionTree, DecisionTreeModel diff --git a/examples/src/main/python/mllib/decision_tree_regression_example.py b/examples/src/main/python/mllib/decision_tree_regression_example.py index acf9e25fdf..71dfbf0790 100644 --- a/examples/src/main/python/mllib/decision_tree_regression_example.py +++ b/examples/src/main/python/mllib/decision_tree_regression_example.py @@ -18,8 +18,6 @@ """ Decision Tree Regression Example. 
""" -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import DecisionTree, DecisionTreeModel diff --git a/examples/src/main/python/mllib/elementwise_product_example.py b/examples/src/main/python/mllib/elementwise_product_example.py index 8ae9afb1dc..15e6a43f73 100644 --- a/examples/src/main/python/mllib/elementwise_product_example.py +++ b/examples/src/main/python/mllib/elementwise_product_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import ElementwiseProduct diff --git a/examples/src/main/python/mllib/gaussian_mixture_example.py b/examples/src/main/python/mllib/gaussian_mixture_example.py index a60e799d62..3b19478f45 100644 --- a/examples/src/main/python/mllib/gaussian_mixture_example.py +++ b/examples/src/main/python/mllib/gaussian_mixture_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from numpy import array # $example off$ diff --git a/examples/src/main/python/mllib/gaussian_mixture_model.py b/examples/src/main/python/mllib/gaussian_mixture_model.py index 6b46e27dda..96ce6b6f6a 100644 --- a/examples/src/main/python/mllib/gaussian_mixture_model.py +++ b/examples/src/main/python/mllib/gaussian_mixture_model.py @@ -18,11 +18,6 @@ """ A Gaussian Mixture Model clustering program using MLlib. """ -from __future__ import print_function - -import sys -if sys.version >= '3': - long = int import random import argparse @@ -53,7 +48,7 @@ if __name__ == "__main__": parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold') parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations') parser.add_argument('--seed', default=random.getrandbits(19), - type=long, help='Random seed') + type=int, help='Random seed') args = parser.parse_args() conf = SparkConf().setAppName("GMM") diff --git a/examples/src/main/python/mllib/gradient_boosting_classification_example.py b/examples/src/main/python/mllib/gradient_boosting_classification_example.py index 65a03572be..eb12f20619 100644 --- a/examples/src/main/python/mllib/gradient_boosting_classification_example.py +++ b/examples/src/main/python/mllib/gradient_boosting_classification_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Trees Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel diff --git a/examples/src/main/python/mllib/gradient_boosting_regression_example.py b/examples/src/main/python/mllib/gradient_boosting_regression_example.py index 877f8ab461..eb59a992df 100644 --- a/examples/src/main/python/mllib/gradient_boosting_regression_example.py +++ b/examples/src/main/python/mllib/gradient_boosting_regression_example.py @@ -18,8 +18,6 @@ """ Gradient Boosted Trees Regression Example. 
""" -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 21a5584fd6..321be8b76f 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.linalg import Matrices, Vectors diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index ef380dee79..12a186900e 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import Statistics diff --git a/examples/src/main/python/mllib/isotonic_regression_example.py b/examples/src/main/python/mllib/isotonic_regression_example.py index f5322d79c4..a5a0cfeae9 100644 --- a/examples/src/main/python/mllib/isotonic_regression_example.py +++ b/examples/src/main/python/mllib/isotonic_regression_example.py @@ -18,8 +18,6 @@ """ Isotonic Regression Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ import math diff --git a/examples/src/main/python/mllib/k_means_example.py b/examples/src/main/python/mllib/k_means_example.py index d6058f4502..ead1e56de5 100644 --- a/examples/src/main/python/mllib/k_means_example.py +++ b/examples/src/main/python/mllib/k_means_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - # $example on$ from numpy import array from math import sqrt diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index 3e8f7241a4..22d1917160 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import KernelDensity diff --git a/examples/src/main/python/mllib/kmeans.py b/examples/src/main/python/mllib/kmeans.py index 1bdb3e9b4a..2560384b6a 100755 --- a/examples/src/main/python/mllib/kmeans.py +++ b/examples/src/main/python/mllib/kmeans.py @@ -20,8 +20,6 @@ A K-means clustering program using MLlib. This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - import sys import numpy as np diff --git a/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py b/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py index 2a1bef5f20..f82a28aadc 100644 --- a/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py +++ b/examples/src/main/python/mllib/latent_dirichlet_allocation_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.clustering import LDA, LDAModel diff --git a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py index 6744463d40..cb67396332 100644 --- a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py +++ b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py @@ -18,8 +18,6 @@ """ Linear Regression With SGD Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py index 87efe17375..7b90615a53 100755 --- a/examples/src/main/python/mllib/logistic_regression.py +++ b/examples/src/main/python/mllib/logistic_regression.py @@ -20,8 +20,6 @@ Logistic regression using MLlib. This example requires NumPy (http://www.numpy.org/). """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py index c9b768b314..ac5ab1d1b5 100644 --- a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py +++ b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py @@ -18,8 +18,6 @@ """ Logistic Regression With LBFGS Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py index a29fcccac5..74d18233d5 100644 --- a/examples/src/main/python/mllib/naive_bayes_example.py +++ b/examples/src/main/python/mllib/naive_bayes_example.py @@ -22,8 +22,6 @@ Usage: `spark-submit --master local[4] examples/src/main/python/mllib/naive_bayes_example.py` """ -from __future__ import print_function - import shutil from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/normalizer_example.py b/examples/src/main/python/mllib/normalizer_example.py index a4e028ca9a..d46110d9a0 100644 --- a/examples/src/main/python/mllib/normalizer_example.py +++ b/examples/src/main/python/mllib/normalizer_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import Normalizer diff --git a/examples/src/main/python/mllib/power_iteration_clustering_example.py b/examples/src/main/python/mllib/power_iteration_clustering_example.py index ca19c0ccb6..60eedef5fa 100644 --- a/examples/src/main/python/mllib/power_iteration_clustering_example.py +++ b/examples/src/main/python/mllib/power_iteration_clustering_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel diff --git a/examples/src/main/python/mllib/random_forest_classification_example.py b/examples/src/main/python/mllib/random_forest_classification_example.py index 5ac67520da..a929c10d5a 100644 --- a/examples/src/main/python/mllib/random_forest_classification_example.py +++ b/examples/src/main/python/mllib/random_forest_classification_example.py @@ -18,8 +18,6 @@ """ Random Forest Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import RandomForest, RandomForestModel diff --git a/examples/src/main/python/mllib/random_forest_regression_example.py b/examples/src/main/python/mllib/random_forest_regression_example.py index 7e986a0d30..4e05937768 100644 --- a/examples/src/main/python/mllib/random_forest_regression_example.py +++ b/examples/src/main/python/mllib/random_forest_regression_example.py @@ -18,8 +18,6 @@ """ Random Forest Regression Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.tree import RandomForest, RandomForestModel diff --git a/examples/src/main/python/mllib/random_rdd_generation.py b/examples/src/main/python/mllib/random_rdd_generation.py index 9a429b5f8a..49afcfe939 100755 --- a/examples/src/main/python/mllib/random_rdd_generation.py +++ b/examples/src/main/python/mllib/random_rdd_generation.py @@ -18,8 +18,6 @@ """ Randomly generated RDDs. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/recommendation_example.py b/examples/src/main/python/mllib/recommendation_example.py index 00e683c3ae..719f3f904b 100644 --- a/examples/src/main/python/mllib/recommendation_example.py +++ b/examples/src/main/python/mllib/recommendation_example.py @@ -18,8 +18,6 @@ """ Collaborative Filtering Classification Example. """ -from __future__ import print_function - from pyspark import SparkContext # $example on$ diff --git a/examples/src/main/python/mllib/sampled_rdds.py b/examples/src/main/python/mllib/sampled_rdds.py index 00e7cf4bbc..9095c2b2d7 100755 --- a/examples/src/main/python/mllib/sampled_rdds.py +++ b/examples/src/main/python/mllib/sampled_rdds.py @@ -18,8 +18,6 @@ """ Randomly sampled RDDs. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py index 11ed34427d..c8fd64dfbb 100644 --- a/examples/src/main/python/mllib/standard_scaler_example.py +++ b/examples/src/main/python/mllib/standard_scaler_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import StandardScaler diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index a13f8f08dd..2d29f74a19 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext if __name__ == "__main__": diff --git a/examples/src/main/python/mllib/streaming_k_means_example.py b/examples/src/main/python/mllib/streaming_k_means_example.py index e82509ad3f..4904a9ebcf 100644 --- a/examples/src/main/python/mllib/streaming_k_means_example.py +++ b/examples/src/main/python/mllib/streaming_k_means_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext from pyspark.streaming import StreamingContext # $example on$ diff --git a/examples/src/main/python/mllib/streaming_linear_regression_example.py b/examples/src/main/python/mllib/streaming_linear_regression_example.py index 714c9a0de7..1d52e00fbf 100644 --- a/examples/src/main/python/mllib/streaming_linear_regression_example.py +++ b/examples/src/main/python/mllib/streaming_linear_regression_example.py @@ -18,8 +18,6 @@ """ Streaming Linear Regression Example. """ -from __future__ import print_function - # $example on$ import sys # $example off$ diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index d55d1a2c2d..d86e841145 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ import numpy as np diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py index b66412b233..4449066f5b 100644 --- a/examples/src/main/python/mllib/tf_idf_example.py +++ b/examples/src/main/python/mllib/tf_idf_example.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import HashingTF, IDF diff --git a/examples/src/main/python/mllib/word2vec.py b/examples/src/main/python/mllib/word2vec.py index 4e7d4f7610..3e5720b4df 100644 --- a/examples/src/main/python/mllib/word2vec.py +++ b/examples/src/main/python/mllib/word2vec.py @@ -23,8 +23,6 @@ # grep -o -E '\w+(\W+\w+){0,15}' text8 > text8_lines # This was done so that the example can be run in local mode -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/mllib/word2vec_example.py b/examples/src/main/python/mllib/word2vec_example.py index ad1090c77e..d37a6e7137 100644 --- a/examples/src/main/python/mllib/word2vec_example.py +++ b/examples/src/main/python/mllib/word2vec_example.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark import SparkContext # $example on$ from pyspark.mllib.feature import Word2Vec diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py index 2c19e8700a..0ab7249a82 100755 --- a/examples/src/main/python/pagerank.py +++ b/examples/src/main/python/pagerank.py @@ -22,8 +22,6 @@ Please refer to PageRank implementation provided by graphx Example Usage: bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10 """ -from __future__ import print_function - import re import sys from operator import add diff --git a/examples/src/main/python/parquet_inputformat.py b/examples/src/main/python/parquet_inputformat.py index 83041f0040..ca8dd25e6d 100644 --- a/examples/src/main/python/parquet_inputformat.py +++ b/examples/src/main/python/parquet_inputformat.py @@ -29,8 +29,6 @@ $ ./bin/spark-submit --driver-class-path /path/to/example/jar \\ {u'favorite_color': u'red', u'name': u'Ben', u'favorite_numbers': []} <...more log output...> """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py index 5839cc2874..e646722533 100755 --- a/examples/src/main/python/pi.py +++ b/examples/src/main/python/pi.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import sys from random import random from operator import add diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py index d3cd985d19..9efb00a6f1 100755 --- a/examples/src/main/python/sort.py +++ b/examples/src/main/python/sort.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py index b7d8467172..e46449dbef 100644 --- a/examples/src/main/python/sql/arrow.py +++ b/examples/src/main/python/sql/arrow.py @@ -21,21 +21,12 @@ Run with: ./bin/spark-submit examples/src/main/python/sql/arrow.py """ -from __future__ import print_function - -import sys - from pyspark.sql import SparkSession from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version require_minimum_pandas_version() require_minimum_pyarrow_version() -if sys.version_info < (3, 6): - raise Exception( - "Running this example file requires Python 3.6+; however, " - "your Python version was:\n %s" % sys.version) - def dataframe_with_arrow_example(spark): # $example on:dataframe_with_arrow$ diff --git a/examples/src/main/python/sql/basic.py b/examples/src/main/python/sql/basic.py index c8fb25d053..eba8e6ad99 100644 --- a/examples/src/main/python/sql/basic.py +++ b/examples/src/main/python/sql/basic.py @@ -20,8 +20,6 @@ A simple example demonstrating basic Spark SQL features. Run with: ./bin/spark-submit examples/src/main/python/sql/basic.py """ -from __future__ import print_function - # $example on:init_session$ from pyspark.sql import SparkSession # $example off:init_session$ diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py index 265f135e1e..94a41a7e5e 100644 --- a/examples/src/main/python/sql/datasource.py +++ b/examples/src/main/python/sql/datasource.py @@ -20,8 +20,6 @@ A simple example demonstrating Spark SQL data sources. 
Run with: ./bin/spark-submit examples/src/main/python/sql/datasource.py """ -from __future__ import print_function - from pyspark.sql import SparkSession # $example on:schema_merging$ from pyspark.sql import Row diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py index e96a8af71a..bc23dcd9bd 100644 --- a/examples/src/main/python/sql/hive.py +++ b/examples/src/main/python/sql/hive.py @@ -20,8 +20,6 @@ A simple example demonstrating Spark SQL Hive integration. Run with: ./bin/spark-submit examples/src/main/python/sql/hive.py """ -from __future__ import print_function - # $example on:spark_hive$ from os.path import join, abspath diff --git a/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py b/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py index 9210678913..40a955a46c 100644 --- a/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py +++ b/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py @@ -36,8 +36,6 @@ `$ bin/spark-submit examples/src/main/python/sql/streaming/structured_kafka_wordcount.py \ host1:port1,host2:port2 subscribe topic1,topic2` """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount.py b/examples/src/main/python/sql/streaming/structured_network_wordcount.py index 9ac3921647..c8f43c9dcf 100644 --- a/examples/src/main/python/sql/streaming/structured_network_wordcount.py +++ b/examples/src/main/python/sql/streaming/structured_network_wordcount.py @@ -27,8 +27,6 @@ r""" `$ bin/spark-submit examples/src/main/python/sql/streaming/structured_network_wordcount.py localhost 9999` """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py index c4e3bbf44c..cc39d8afa6 100644 --- a/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py +++ b/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py @@ -39,8 +39,6 @@ r""" One recommended <window duration>, <slide duration> pair is 10, 5 """ -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/examples/src/main/python/status_api_demo.py b/examples/src/main/python/status_api_demo.py index 8cc8cc820c..7b408c8726 100644 --- a/examples/src/main/python/status_api_demo.py +++ b/examples/src/main/python/status_api_demo.py @@ -15,15 +15,10 @@ # limitations under the License. # -from __future__ import print_function - import time import threading import sys -if sys.version >= '3': - import queue as Queue -else: - import Queue +import queue as Queue from pyspark import SparkConf, SparkContext diff --git a/examples/src/main/python/streaming/hdfs_wordcount.py b/examples/src/main/python/streaming/hdfs_wordcount.py index f9a5c43a8e..fac07727b7 100644 --- a/examples/src/main/python/streaming/hdfs_wordcount.py +++ b/examples/src/main/python/streaming/hdfs_wordcount.py @@ -25,8 +25,6 @@ Then create a text file in `localdir` and the words in the file will get counted.
""" -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/network_wordcount.py b/examples/src/main/python/streaming/network_wordcount.py index f3099d2517..b57f4e9e38 100644 --- a/examples/src/main/python/streaming/network_wordcount.py +++ b/examples/src/main/python/streaming/network_wordcount.py @@ -25,8 +25,6 @@ r""" and then run the example `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/network_wordjoinsentiments.py b/examples/src/main/python/streaming/network_wordjoinsentiments.py index 2b5434c0c8..5b03546fb4 100644 --- a/examples/src/main/python/streaming/network_wordjoinsentiments.py +++ b/examples/src/main/python/streaming/network_wordjoinsentiments.py @@ -30,8 +30,6 @@ r""" localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/recoverable_network_wordcount.py b/examples/src/main/python/streaming/recoverable_network_wordcount.py index a39c4d0b5b..8424556e88 100644 --- a/examples/src/main/python/streaming/recoverable_network_wordcount.py +++ b/examples/src/main/python/streaming/recoverable_network_wordcount.py @@ -35,8 +35,6 @@ checkpoint data exists in ~/checkpoint/, then it will create StreamingContext from the checkpoint data. """ -from __future__ import print_function - import os import sys diff --git a/examples/src/main/python/streaming/sql_network_wordcount.py b/examples/src/main/python/streaming/sql_network_wordcount.py index ab3cfc0679..59a8a11a45 100644 --- a/examples/src/main/python/streaming/sql_network_wordcount.py +++ b/examples/src/main/python/streaming/sql_network_wordcount.py @@ -27,8 +27,6 @@ r""" and then run the example `$ bin/spark-submit examples/src/main/python/streaming/sql_network_wordcount.py localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/streaming/stateful_network_wordcount.py b/examples/src/main/python/streaming/stateful_network_wordcount.py index d5d1eba6c5..7a45be663a 100644 --- a/examples/src/main/python/streaming/stateful_network_wordcount.py +++ b/examples/src/main/python/streaming/stateful_network_wordcount.py @@ -29,8 +29,6 @@ r""" `$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \ localhost 9999` """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/examples/src/main/python/transitive_closure.py b/examples/src/main/python/transitive_closure.py index 49551d4085..9f543daecd 100755 --- a/examples/src/main/python/transitive_closure.py +++ b/examples/src/main/python/transitive_closure.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import sys from random import Random diff --git a/examples/src/main/python/wordcount.py b/examples/src/main/python/wordcount.py index a05e24ff3f..037c1e8aa3 100755 --- a/examples/src/main/python/wordcount.py +++ b/examples/src/main/python/wordcount.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - import sys from operator import add diff --git a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py index 5370b79389..df8c64e531 100644 --- a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py +++ b/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py @@ -55,8 +55,6 @@ See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on the Kinesis Spark Streaming integration. """ -from __future__ import print_function - import sys from pyspark import SparkContext diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index a5d513262b..2a19d233bc 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -89,10 +89,7 @@ TypeError:... import sys import select import struct -if sys.version < '3': - import SocketServer -else: - import socketserver as SocketServer +import socketserver as SocketServer import threading from pyspark.serializers import read_int, PickleSerializer diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index 803d857055..c2daf7600f 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -20,16 +20,12 @@ import os import sys from tempfile import NamedTemporaryFile import threading +import pickle from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import ChunkedStream, pickle_protocol -from pyspark.util import _exception_message, print_exec +from pyspark.util import print_exec -if sys.version < '3': - import cPickle as pickle -else: - import pickle - unicode = str __all__ = ['Broadcast'] @@ -113,7 +109,7 @@ class Broadcast(object): raise except Exception as e: msg = "Could not serialize broadcast: %s: %s" \ - % (e.__class__.__name__, _exception_message(e)) + % (e.__class__.__name__, str(e)) print_exec(sys.stderr) raise pickle.PicklingError(msg) f.close() diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index 2024260868..efd8b6d633 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -22,14 +22,14 @@ >>> conf.setMaster("local").setAppName("My app") >>> conf.get("spark.master") -u'local' +'local' >>> conf.get("spark.app.name") -u'My app' +'My app' >>> sc = SparkContext(conf=conf) >>> sc.master -u'local' +'local' >>> sc.appName -u'My app' +'My app' >>> sc.sparkHome is None True @@ -37,21 +37,21 @@ True >>> conf.setSparkHome("/path") >>> conf.get("spark.home") -u'/path' +'/path' >>> conf.setExecutorEnv("VAR1", "value1") >>> conf.setExecutorEnv(pairs = [("VAR3", "value3"), ("VAR4", "value4")]) >>> conf.get("spark.executorEnv.VAR1") -u'value1' +'value1' >>> print(conf.toDebugString()) spark.executorEnv.VAR1=value1 spark.executorEnv.VAR3=value3 spark.executorEnv.VAR4=value4 spark.home=/path >>> sorted(conf.getAll(), key=lambda p: p[0]) -[(u'spark.executorEnv.VAR1', u'value1'), (u'spark.executorEnv.VAR3', u'value3'), \ -(u'spark.executorEnv.VAR4', u'value4'), (u'spark.home', u'/path')] +[('spark.executorEnv.VAR1', 'value1'), ('spark.executorEnv.VAR3', 'value3'), \ +('spark.executorEnv.VAR4', 'value4'), ('spark.home', '/path')] >>> conf._jconf.setExecutorEnv("VAR5", "value5") JavaObject id... 
>>> print(conf.toDebugString()) @@ -65,11 +65,6 @@ spark.home=/path __all__ = ['SparkConf'] import sys -import re - -if sys.version > '3': - unicode = str - __doc__ = re.sub(r"(\W|^)[uU](['])", r'\1\2', __doc__) class SparkConf(object): @@ -124,9 +119,9 @@ class SparkConf(object): """Set a configuration property.""" # Try to set self._jconf first if JVM is created, set self._conf if JVM is not created yet. if self._jconf is not None: - self._jconf.set(key, unicode(value)) + self._jconf.set(key, str(value)) else: - self._conf[key] = unicode(value) + self._conf[key] = str(value) return self def setIfMissing(self, key, value): diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 6d58e1d144..2e105cc382 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -21,6 +21,7 @@ import signal import sys import threading import warnings +import importlib from threading import RLock from tempfile import NamedTemporaryFile @@ -37,15 +38,12 @@ from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deseria PairDeserializer, AutoBatchedSerializer, NoOpSerializer, ChunkedStream from pyspark.storagelevel import StorageLevel from pyspark.resource.information import ResourceInformation -from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix +from pyspark.rdd import RDD, _load_from_socket from pyspark.taskcontext import TaskContext from pyspark.traceback_utils import CallSite, first_spark_call from pyspark.status import StatusTracker from pyspark.profiler import ProfilerCollector, BasicProfiler -if sys.version > '3': - xrange = range - __all__ = ['SparkContext'] @@ -213,15 +211,6 @@ class SparkContext(object): self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') self.pythonVer = "%d.%d" % sys.version_info[:2] - if sys.version_info < (3, 6): - with warnings.catch_warnings(): - warnings.simplefilter("once") - warnings.warn( - "Support for Python 2 and Python 3 prior to version 3.6 is deprecated as " - "of Spark 3.0. See also the plan for dropping Python 2 support at " - "https://spark.apache.org/news/plan-for-dropping-python-2-support.html.", - DeprecationWarning) - # Broadcast's __reduce__ method stores Broadcast instances here. # This allows other code to determine which Broadcast instances have # been pickled, so it can determine which Java broadcast objects to @@ -398,7 +387,6 @@ class SparkContext(object): return self._jsc.version() @property - @ignore_unicode_prefix def applicationId(self): """ A unique identifier for the Spark application. @@ -408,7 +396,7 @@ class SparkContext(object): * in case of YARN something like 'application_1433865536131_34483' >>> sc.applicationId # doctest: +ELLIPSIS - u'local-...' + 'local-...' """ return self._jsc.sc().applicationId() @@ -490,20 +478,20 @@ class SparkContext(object): end = start start = 0 - return self.parallelize(xrange(start, end, step), numSlices) + return self.parallelize(range(start, end, step), numSlices) def parallelize(self, c, numSlices=None): """ - Distribute a local Python collection to form an RDD. Using xrange + Distribute a local Python collection to form an RDD. Using range is recommended if the input represents a range for performance. 
>>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() [[0], [2], [3], [4], [6]] - >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect() + >>> sc.parallelize(range(0, 6, 2), 5).glom().collect() [[], [0], [], [2], [4]] """ numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism - if isinstance(c, xrange): + if isinstance(c, range): size = len(c) if size == 0: return self.parallelize([], numSlices) @@ -522,7 +510,7 @@ class SparkContext(object): # the empty iterator to a list, thus make sure worker reuse takes effect. # See more details in SPARK-26549. assert len(list(iterator)) == 0 - return xrange(getStart(split), getStart(split + 1), step) + return range(getStart(split), getStart(split + 1), step) return self.parallelize([], numSlices).mapPartitionsWithIndex(f) @@ -591,7 +579,6 @@ class SparkContext(object): minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.objectFile(name, minPartitions), self) - @ignore_unicode_prefix def textFile(self, name, minPartitions=None, use_unicode=True): """ Read a text file from HDFS, a local file system (available on all @@ -608,13 +595,12 @@ class SparkContext(object): ... _ = testFile.write("Hello world!") >>> textFile = sc.textFile(path) >>> textFile.collect() - [u'Hello world!'] + ['Hello world!'] """ minPartitions = minPartitions or min(self.defaultParallelism, 2) return RDD(self._jsc.textFile(name, minPartitions), self, UTF8Deserializer(use_unicode)) - @ignore_unicode_prefix def wholeTextFiles(self, path, minPartitions=None, use_unicode=True): """ Read a directory of text files from HDFS, a local file system @@ -658,7 +644,7 @@ class SparkContext(object): ... _ = file2.write("2") >>> textFiles = sc.wholeTextFiles(dirPath) >>> sorted(textFiles.collect()) - [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] + [('.../1.txt', '1'), ('.../2.txt', '2')] """ minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.wholeTextFiles(path, minPartitions), self, @@ -846,7 +832,6 @@ class SparkContext(object): jrdd = self._jsc.checkpointFile(name) return RDD(jrdd, self, input_deserializer) - @ignore_unicode_prefix def union(self, rdds): """ Build the union of a list of RDDs. @@ -860,10 +845,10 @@ class SparkContext(object): ... _ = testFile.write("Hello") >>> textFile = sc.textFile(path) >>> textFile.collect() - [u'Hello'] + ['Hello'] >>> parallelized = sc.parallelize(["World!"]) >>> sorted(sc.union([textFile, parallelized]).collect()) - [u'Hello', 'World!'] + ['Hello', 'World!'] """ first_jrdd_deserializer = rdds[0]._jrdd_deserializer if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds): @@ -959,9 +944,8 @@ class SparkContext(object): self._python_includes.append(filename) # for tests in local mode sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) - if sys.version > '3': - import importlib - importlib.invalidate_caches() + + importlib.invalidate_caches() def setCheckpointDir(self, dirName): """ diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index 52f6ea9a37..920c04009d 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -20,7 +20,6 @@ # This script attempt to determine the correct setting for SPARK_HOME given # that Spark may have been installed on the system with pip. 
-from __future__ import print_function import os import sys @@ -41,26 +40,15 @@ def _find_spark_home(): # Add the path of the PySpark module if it exists import_error_raised = False - if sys.version < "3": - import imp - try: - module_home = imp.find_module("pyspark")[1] - paths.append(module_home) - # If we are installed in edit mode also look two dirs up - paths.append(os.path.join(module_home, "../../")) - except ImportError: - # Not pip installed no worries - import_error_raised = True - else: - from importlib.util import find_spec - try: - module_home = os.path.dirname(find_spec("pyspark").origin) - paths.append(module_home) - # If we are installed in edit mode also look two dirs up - paths.append(os.path.join(module_home, "../../")) - except ImportError: - # Not pip installed no worries - import_error_raised = True + from importlib.util import find_spec + try: + module_home = os.path.dirname(find_spec("pyspark").origin) + paths.append(module_home) + # If we are installed in edit mode also look two dirs up + paths.append(os.path.join(module_home, "../../")) + except ImportError: + # Not pip installed no worries + import_error_raised = True # Normalize the paths paths = [os.path.abspath(p) for p in paths] @@ -84,5 +72,6 @@ def _find_spark_home(): "'PYSPARK_PYTHON=python3 pyspark'.\n", file=sys.stderr) sys.exit(-1) + if __name__ == "__main__": print(_find_spark_home()) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 0daf09b17a..fba92a96ae 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -17,7 +17,6 @@ import atexit import os -import sys import signal import shlex import shutil @@ -27,14 +26,10 @@ import tempfile import time from subprocess import Popen, PIPE -if sys.version >= '3': - xrange = range - from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters from py4j.clientserver import ClientServer, JavaParameters, PythonParameters from pyspark.find_spark_home import _find_spark_home from pyspark.serializers import read_int, write_with_length, UTF8Deserializer -from pyspark.util import _exception_message def launch_gateway(conf=None, popen_kwargs=None): @@ -197,7 +192,7 @@ def local_connect_and_auth(port, auth_secret): _do_server_auth(sockfile, auth_secret) return (sockfile, sock) except socket.error as e: - emsg = _exception_message(e) + emsg = str(e) errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) sock.close() sock = None diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index cc8ce0567b..7c8cbe3a9f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -16,20 +16,20 @@ # import operator -import sys +import warnings from abc import ABCMeta, abstractmethod, abstractproperty from multiprocessing.pool import ThreadPool -from pyspark import since, keyword_only +from pyspark import keyword_only from pyspark.ml import Estimator, Predictor, PredictionModel, Model from pyspark.ml.param.shared import * from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \ _TreeEnsembleModel, _RandomForestParams, _GBTParams, \ - _HasVarianceImpurity, _TreeClassifierParams, _TreeEnsembleParams + _HasVarianceImpurity, _TreeClassifierParams from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel from pyspark.ml.util import * from pyspark.ml.base import _PredictorParams -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ +from 
pyspark.ml.wrapper import JavaParams, \ JavaPredictor, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc, _java2py, _py2java from pyspark.ml.linalg import Vectors diff --git a/python/pyspark/ml/common.py b/python/pyspark/ml/common.py index 387c5d7309..4e1d7f93ae 100644 --- a/python/pyspark/ml/common.py +++ b/python/pyspark/ml/common.py @@ -15,11 +15,6 @@ # limitations under the License. # -import sys -if sys.version >= '3': - long = int - unicode = str - import py4j.protocol from py4j.protocol import Py4JJavaError from py4j.java_gateway import JavaObject @@ -79,7 +74,7 @@ def _py2java(sc, obj): obj = [_py2java(sc, x) for x in obj] elif isinstance(obj, JavaObject): pass - elif isinstance(obj, (int, long, float, bool, bytes, unicode)): + elif isinstance(obj, (int, float, bool, bytes, str)): pass else: data = bytearray(PickleSerializer().dumps(obj)) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 498629cea8..c52ea62686 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -15,12 +15,7 @@ # limitations under the License. # -import sys -if sys.version > '3': - basestring = str - from pyspark import since, keyword_only, SparkContext -from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.linalg import _convert_to_vector from pyspark.ml.param.shared import * from pyspark.ml.util import JavaMLReadable, JavaMLWritable @@ -2178,7 +2173,6 @@ class MinMaxScalerModel(JavaModel, _MinMaxScalerParams, JavaMLReadable, JavaMLWr @inherit_doc -@ignore_unicode_prefix class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A feature transformer that converts the input array of strings into an array of n-grams. Null @@ -2196,15 +2190,15 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr >>> ngram.setOutputCol("nGrams") NGram... >>> ngram.transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b', 'b c', 'c d', 'd e']) >>> # Change n-gram length >>> ngram.setParams(n=4).transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e']) >>> # Temporarily modify output column. >>> ngram.transform(df, {ngram.outputCol: "output"}).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], output=['a b c d', 'b c d e']) >>> ngram.transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e']) + Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e']) >>> # Must use keyword arguments to specify params. >>> ngram.setParams("text") Traceback (most recent call last): @@ -3082,7 +3076,6 @@ class RobustScalerModel(JavaModel, _RobustScalerParams, JavaMLReadable, JavaMLWr @inherit_doc -@ignore_unicode_prefix class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A regex based tokenizer that extracts tokens either by using the @@ -3099,15 +3092,15 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, >>> reTokenizer.setOutputCol("words") RegexTokenizer... >>> reTokenizer.transform(df).head() - Row(text=u'A B c', words=[u'a', u'b', u'c']) + Row(text='A B c', words=['a', 'b', 'c']) >>> # Change a parameter. 
>>> reTokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'A B c', tokens=[u'a', u'b', u'c']) + Row(text='A B c', tokens=['a', 'b', 'c']) >>> # Temporarily modify a parameter. >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head() - Row(text=u'A B c', words=[u'a', u'b', u'c']) + Row(text='A B c', words=['a', 'b', 'c']) >>> reTokenizer.transform(df).head() - Row(text=u'A B c', tokens=[u'a', u'b', u'c']) + Row(text='A B c', tokens=['a', 'b', 'c']) >>> # Must use keyword arguments to specify params. >>> reTokenizer.setParams("text") Traceback (most recent call last): @@ -3935,7 +3928,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, @inherit_doc -@ignore_unicode_prefix class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A tokenizer that converts the input string to lowercase and then @@ -3946,15 +3938,15 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java >>> tokenizer.setInputCol("text") Tokenizer... >>> tokenizer.transform(df).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) + Row(text='a b c', words=['a', 'b', 'c']) >>> # Change a parameter. >>> tokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + Row(text='a b c', tokens=['a', 'b', 'c']) >>> # Temporarily modify a parameter. >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) + Row(text='a b c', words=['a', 'b', 'c']) >>> tokenizer.transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + Row(text='a b c', tokens=['a', 'b', 'c']) >>> # Must use keyword arguments to specify params. >>> tokenizer.setParams("text") Traceback (most recent call last): @@ -4476,7 +4468,6 @@ class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCo @inherit_doc -@ignore_unicode_prefix class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): """ Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further @@ -4505,7 +4496,7 @@ class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): +----+--------------------+ ... >>> model.findSynonymsArray("a", 2) - [(u'b', 0.015859870240092278), (u'c', -0.5680795907974243)] + [('b', 0.015859870240092278), ('c', -0.5680795907974243)] >>> from pyspark.sql.functions import format_number as fmt >>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show() +----+----------+ @@ -4668,7 +4659,7 @@ class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable): Returns a dataframe with two fields word and similarity (which gives the cosine similarity). """ - if not isinstance(word, basestring): + if not isinstance(word, str): word = _convert_to_vector(word) return self._call_java("findSynonyms", word, num) @@ -4680,7 +4671,7 @@ class Word2VecModel(JavaModel, _Word2VecParams, JavaMLReadable, JavaMLWritable): Returns an array with two fields word and similarity (which gives the cosine similarity). 
""" - if not isinstance(word, basestring): + if not isinstance(word, str): word = _convert_to_vector(word) tuples = self._java_obj.findSynonymsArray(word, num) return list(map(lambda st: (st._1(), st._2()), list(tuples))) diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index 7a5591f3fb..b91788a82c 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -15,8 +15,7 @@ # limitations under the License. # -from pyspark import keyword_only, since -from pyspark.rdd import ignore_unicode_prefix +from pyspark import keyword_only from pyspark.sql import DataFrame from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams @@ -132,7 +131,6 @@ class FPGrowthModel(JavaModel, _FPGrowthParams, JavaMLWritable, JavaMLReadable): return self._call_java("associationRules") -@ignore_unicode_prefix class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable): r""" A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in @@ -193,7 +191,7 @@ class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable): ... >>> new_data = spark.createDataFrame([(["t", "s"], )], ["items"]) >>> sorted(fpm.transform(new_data).first().newPrediction) - [u'x', u'y', u'z'] + ['x', 'y', 'z'] .. versionadded:: 2.2.0 """ diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index 4fb1036fba..20b24559b1 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -25,14 +25,13 @@ """ import sys -import warnings import numpy as np from distutils.version import LooseVersion from pyspark import SparkContext from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string -from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import SparkSession __all__ = ["ImageSchema"] diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index a79d5e5dcb..8be440da4f 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -27,18 +27,8 @@ import sys import array import struct -if sys.version >= '3': - basestring = str - xrange = range - import copyreg as copy_reg - long = int -else: - from itertools import izip as zip - import copy_reg - import numpy as np -from pyspark import since from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ IntegerType, ByteType, BooleanType @@ -47,13 +37,6 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] -if sys.version_info[:2] == (2, 7): - # speed up pickling array in Python 2.7 - def fast_pickle_array(ar): - return array.array, (ar.typecode, ar.tostring()) - copy_reg.pickle(array.array, fast_pickle_array) - - # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. 
@@ -68,7 +51,7 @@ except: def _convert_to_vector(l): if isinstance(l, Vector): return l - elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): + elif type(l) in (array.array, np.array, np.ndarray, list, tuple, range): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" @@ -102,7 +85,7 @@ def _vector_size(v): """ if isinstance(v, Vector): return len(v) - elif type(v) in (array.array, list, tuple, xrange): + elif type(v) in (array.array, list, tuple, range): return len(v) elif type(v) == np.ndarray: if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): @@ -415,7 +398,7 @@ class DenseVector(Vector): elif isinstance(other, SparseVector): if len(self) != other.size: return False - return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return Vectors._equals(list(range(len(self))), self.array, other.indices, other.values) return False def __ne__(self, other): @@ -520,7 +503,7 @@ class SparseVector(Vector): self.indices = np.array(args[0], dtype=np.int32) self.values = np.array(args[1], dtype=np.float64) assert len(self.indices) == len(self.values), "index and value arrays not same length" - for i in xrange(len(self.indices) - 1): + for i in range(len(self.indices) - 1): if self.indices[i] >= self.indices[i + 1]: raise TypeError( "Indices %s and %s are not strictly increasing" @@ -699,7 +682,7 @@ class SparseVector(Vector): inds = self.indices vals = self.values entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) - for i in xrange(len(inds))]) + for i in range(len(inds))]) return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): @@ -709,7 +692,7 @@ class SparseVector(Vector): elif isinstance(other, DenseVector): if self.size != len(other): return False - return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array) return False def __getitem__(self, index): @@ -791,7 +774,7 @@ class Vectors(object): >>> Vectors.dense(1.0, 2.0) DenseVector([1.0, 2.0]) """ - if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): + if len(elements) == 1 and not isinstance(elements[0], (float, int)): # it's list, numpy.array or other iterable object. 
elements = elements[0] return DenseVector(elements) @@ -1124,7 +1107,7 @@ class SparseMatrix(Matrix): Return a numpy.ndarray """ A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') - for k in xrange(self.colPtrs.size - 1): + for k in range(self.colPtrs.size - 1): startptr = self.colPtrs[k] endptr = self.colPtrs[k + 1] if self.isTransposed: diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 1be8755c7b..96b07bfa5f 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -16,15 +16,10 @@ # import array import sys -if sys.version > '3': - basestring = str - xrange = range - unicode = str - from abc import ABCMeta import copy -import numpy as np +import numpy as np from py4j.java_gateway import JavaObject from pyspark.ml.linalg import DenseVector, Vector, Matrix @@ -93,12 +88,12 @@ class TypeConverters(object): @staticmethod def _can_convert_to_list(value): vtype = type(value) - return vtype in [list, np.ndarray, tuple, xrange, array.array] or isinstance(value, Vector) + return vtype in [list, np.ndarray, tuple, range, array.array] or isinstance(value, Vector) @staticmethod def _can_convert_to_string(value): vtype = type(value) - return isinstance(value, basestring) or vtype in [np.unicode_, np.string_, np.str_] + return isinstance(value, str) or vtype in [np.unicode_, np.string_, np.str_] @staticmethod def identity(value): @@ -114,7 +109,7 @@ class TypeConverters(object): """ if type(value) == list: return value - elif type(value) in [np.ndarray, tuple, xrange, array.array]: + elif type(value) in [np.ndarray, tuple, range, array.array]: return list(value) elif isinstance(value, Vector): return list(value.toArray()) @@ -211,12 +206,10 @@ class TypeConverters(object): """ Convert a value to a string, if possible. """ - if isinstance(value, basestring): + if isinstance(value, str): return value - elif type(value) in [np.string_, np.str_]: + elif type(value) in [np.string_, np.str_, np.unicode_]: return str(value) - elif type(value) == np.unicode_: - return unicode(value) else: raise TypeError("Could not convert %s to string type" % type(value)) @@ -338,7 +331,7 @@ class Params(Identifiable): Tests whether this instance contains a param with a given (string) name. """ - if isinstance(paramName, basestring): + if isinstance(paramName, str): p = getattr(self, paramName, None) return isinstance(p, Param) else: @@ -421,7 +414,7 @@ class Params(Identifiable): if isinstance(param, Param): self._shouldOwn(param) return param - elif isinstance(param, basestring): + elif isinstance(param, str): return self.getParam(param) else: raise ValueError("Cannot resolve %r as a param." % param) @@ -510,7 +503,7 @@ class Params(Identifiable): :return: same instance, but with the uid and Param.parent values updated, including within param maps """ - newUid = unicode(newUid) + newUid = str(newUid) self.uid = newUid newDefaultParamMap = dict() newParamMap = dict() diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 2086e831f4..bc1ea87ad6 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - header = """# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 53d07ec966..eacb8b82b5 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -16,12 +16,8 @@ # import sys -import os -if sys.version > '3': - basestring = str - -from pyspark import since, keyword_only, SparkContext +from pyspark import keyword_only from pyspark.ml.base import Estimator, Model, Transformer from pyspark.ml.param import Param, Params from pyspark.ml.util import * diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 4c6bfa696b..7856a317c2 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -19,9 +19,6 @@ import sys import unittest -if sys.version > '3': - basestring = str - from pyspark.ml.feature import Binarizer, CountVectorizer, CountVectorizerModel, HashingTF, IDF, \ NGram, RFormula, StopWordsRemover, StringIndexer, StringIndexerModel, VectorSizeHint from pyspark.ml.linalg import DenseVector, SparseVector, Vectors @@ -91,7 +88,7 @@ class FeatureTests(SparkSessionTestCase): transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) self.assertEqual(type(stopWordRemover.getStopWords()), list) - self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring)) + self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str)) # Custom stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 1b2b1914cc..e1abd59a2d 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -35,10 +35,6 @@ from pyspark.ml.wrapper import JavaParams from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase -if sys.version > '3': - xrange = range - - class ParamTypeConversionTests(PySparkTestCase): """ Test that param type conversion happens. @@ -67,14 +63,14 @@ class ParamTypeConversionTests(PySparkTestCase): def test_list(self): l = [0, 1] for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l), - pyarray.array('l', l), xrange(2), tuple(l)]: + pyarray.array('l', l), range(2), tuple(l)]: converted = TypeConverters.toList(lst_like) self.assertEqual(type(converted), list) self.assertListEqual(converted, l) def test_list_int(self): for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]), - SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0), + SparseVector(2, {0: 1.0, 1: 2.0}), range(1, 3), (1.0, 2.0), pyarray.array('d', [1.0, 2.0])]: vs = VectorSlicer(indices=indices) self.assertListEqual(vs.getIndices(), [1, 2]) @@ -200,12 +196,7 @@ class ParamTests(SparkSessionTestCase): self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter) self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter) - if sys.version_info[0] >= 3: - # In Python 3, it is allowed to get/set attributes with non-ascii characters. 
- e_cls = AttributeError - else: - e_cls = UnicodeEncodeError - self.assertRaises(e_cls, lambda: testParams._resolveParam(u"아")) + self.assertRaises(AttributeError, lambda: testParams._resolveParam(u"아")) def test_params(self): testParams = TestParams() diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index 7d90579318..15e9ebb0f5 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ b/python/pyspark/ml/tests/test_training_summary.py @@ -18,9 +18,6 @@ import sys import unittest -if sys.version > '3': - basestring = str - from pyspark.ml.classification import BinaryLogisticRegressionSummary, LinearSVC, \ LinearSVCSummary, BinaryRandomForestClassificationSummary, LogisticRegression, \ LogisticRegressionSummary, RandomForestClassificationSummary, \ @@ -101,7 +98,7 @@ class TrainingSummaryTest(SparkSessionTestCase): self.assertEqual(s.residualDegreeOfFreedom, 1) self.assertEqual(s.residualDegreeOfFreedomNull, 2) self.assertEqual(s.rank, 1) - self.assertTrue(isinstance(s.solver, basestring)) + self.assertTrue(isinstance(s.solver, str)) self.assertTrue(isinstance(s.aic, float)) self.assertTrue(isinstance(s.deviance, float)) self.assertTrue(isinstance(s.nullDeviance, float)) diff --git a/python/pyspark/ml/tree.py b/python/pyspark/ml/tree.py index a13b27ec8a..460c76fabc 100644 --- a/python/pyspark/ml/tree.py +++ b/python/pyspark/ml/tree.py @@ -15,12 +15,10 @@ # limitations under the License. # -from pyspark import since, keyword_only from pyspark.ml.param.shared import * from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ - JavaPredictor, JavaPredictionModel -from pyspark.ml.common import inherit_doc, _java2py, _py2java +from pyspark.ml.wrapper import JavaPredictionModel +from pyspark.ml.common import inherit_doc @inherit_doc diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index e00753b2ff..7f3d942e2e 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -15,12 +15,11 @@ # limitations under the License. # import itertools -import sys from multiprocessing.pool import ThreadPool import numpy as np -from pyspark import since, keyword_only +from pyspark import keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.common import _py2java, _java2py from pyspark.ml.param import Params, Param, TypeConverters diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index aac2b38d3f..9ab6bfa9ba 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -20,12 +20,6 @@ import sys import os import time import uuid -import warnings - -if sys.version > '3': - basestring = str - unicode = str - long = int from pyspark import SparkContext, since from pyspark.ml.common import inherit_doc @@ -60,10 +54,10 @@ class Identifiable(object): @classmethod def _randomUID(cls): """ - Generate a unique unicode id for the object. The default implementation + Generate a unique string id for the object. The default implementation concatenates the class name, "_", and 12 random hex chars. 
""" - return unicode(cls.__name__ + "_" + uuid.uuid4().hex[-12:]) + return str(cls.__name__ + "_" + uuid.uuid4().hex[-12:]) @inherit_doc @@ -170,8 +164,8 @@ class JavaMLWriter(MLWriter): def save(self, path): """Save the ML instance to the input path.""" - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) self._jwrite.save(path) def overwrite(self): @@ -275,8 +269,8 @@ class JavaMLReader(MLReader): def load(self, path): """Load the ML instance from the input path.""" - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) java_obj = self._jread.load(path) if not hasattr(self._clazz, "_from_java"): raise NotImplementedError("This Java ML type cannot be loaded into Python currently: %r" @@ -430,7 +424,7 @@ class DefaultParamsWriter(MLWriter): for p in instance._defaultParamMap: jsonDefaultParams[p.name] = instance._defaultParamMap[p] - basicMetadata = {"class": cls, "timestamp": long(round(time.time() * 1000)), + basicMetadata = {"class": cls, "timestamp": int(round(time.time() * 1000)), "sparkVersion": sc.version, "uid": uid, "paramMap": jsonParams, "defaultParamMap": jsonDefaultParams} if extraMetadata is not None: diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index e59c6c7b25..c1d060a51c 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -16,9 +16,6 @@ # from abc import ABCMeta, abstractmethod -import sys -if sys.version >= '3': - xrange = range from pyspark import since from pyspark import SparkContext @@ -26,7 +23,6 @@ from pyspark.sql import DataFrame from pyspark.ml import Estimator, Predictor, PredictionModel, Transformer, Model from pyspark.ml.base import _PredictorParams from pyspark.ml.param import Params -from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol from pyspark.ml.util import _jvm from pyspark.ml.common import inherit_doc, _java2py, _py2java @@ -99,15 +95,15 @@ class JavaWrapper(object): # If pylist is a 2D array, then a 2D java array will be created. # The 2D array is a square, non-jagged 2D array that is big enough for all elements. inner_array_length = 0 - for i in xrange(len(pylist)): + for i in range(len(pylist)): inner_array_length = max(inner_array_length, len(pylist[i])) java_array = sc._gateway.new_array(java_class, len(pylist), inner_array_length) - for i in xrange(len(pylist)): - for j in xrange(len(pylist[i])): + for i in range(len(pylist)): + for j in range(len(pylist[i])): java_array[i][j] = pylist[i][j] else: java_array = sc._gateway.new_array(java_class, len(pylist)) - for i in xrange(len(pylist)): + for i in range(len(pylist)): java_array[i] = pylist[i] return java_array diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index ae26521ea9..6067693111 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -21,8 +21,6 @@ RDD-based machine learning APIs for Python (in maintenance mode). The `pyspark.mllib` package is in maintenance mode as of the Spark 2.0.0 release to encourage migration to the DataFrame-based APIs under the `pyspark.ml` package. 
""" -from __future__ import absolute_import - # MLlib currently needs NumPy 1.4+, so complain if lower import numpy diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index e41e5c9cc8..85cfe583fd 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -17,20 +17,13 @@ import sys import array as pyarray -import warnings - -if sys.version > '3': - xrange = range - basestring = str - from math import exp, log +from collections import namedtuple from numpy import array, random, tile -from collections import namedtuple - from pyspark import SparkContext, since -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector from pyspark.mllib.stat.distribution import MultivariateGaussian @@ -257,7 +250,7 @@ class KMeansModel(Saveable, Loader): return x.map(self.predict) x = _convert_to_vector(x) - for i in xrange(len(self.centers)): + for i in range(len(self.centers)): distance = x.squared_distance(self.centers[i]) if distance < best_distance: best = i @@ -708,7 +701,7 @@ class StreamingKMeansModel(KMeansModel): >>> stkm = StreamingKMeansModel(initCenters, initWeights) >>> data = sc.parallelize([[-0.1, -0.1], [0.1, 0.1], ... [0.9, 0.9], [1.1, 1.1]]) - >>> stkm = stkm.update(data, 1.0, u"batches") + >>> stkm = stkm.update(data, 1.0, "batches") >>> stkm.centers array([[ 0., 0.], [ 1., 1.]]) @@ -720,7 +713,7 @@ class StreamingKMeansModel(KMeansModel): [3.0, 3.0] >>> decayFactor = 0.0 >>> data = sc.parallelize([DenseVector([1.5, 1.5]), DenseVector([0.2, 0.2])]) - >>> stkm = stkm.update(data, 0.0, u"batches") + >>> stkm = stkm.update(data, 0.0, "batches") >>> stkm.centers array([[ 0.2, 0.2], [ 1.5, 1.5]]) @@ -743,7 +736,6 @@ class StreamingKMeansModel(KMeansModel): """Return the cluster weights.""" return self._clusterWeights - @ignore_unicode_prefix @since('1.5.0') def update(self, data, decayFactor, timeUnit): """Update the centroids, according to data @@ -979,8 +971,8 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader): """ if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) model = callMLlibFunc("loadLDAModel", sc, path) return LDAModel(model) diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index bac8f35056..24e2f19825 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -15,11 +15,6 @@ # limitations under the License. 
# -import sys -if sys.version >= '3': - long = int - unicode = str - import py4j.protocol from py4j.protocol import Py4JJavaError from py4j.java_gateway import JavaObject @@ -81,7 +76,7 @@ def _py2java(sc, obj): obj = [_py2java(sc, x) for x in obj] elif isinstance(obj, JavaObject): pass - elif isinstance(obj, (int, long, float, bool, bytes, unicode)): + elif isinstance(obj, (int, float, bool, bytes, str)): pass else: data = bytearray(PickleSerializer().dumps(obj)) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 3efae6ff0e..80a197eaa7 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -18,21 +18,15 @@ """ Python package for feature in MLlib. """ -from __future__ import absolute_import - import sys import warnings -if sys.version >= '3': - basestring = str - unicode = str - from py4j.protocol import Py4JJavaError from pyspark import since -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import ( - Vector, Vectors, DenseVector, SparseVector, _convert_to_vector) + Vectors, DenseVector, SparseVector, _convert_to_vector) from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import JavaLoader, JavaSaveable @@ -616,7 +610,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): .. note:: Local use only """ - if not isinstance(word, basestring): + if not isinstance(word, str): word = _convert_to_vector(word) words, similarity = self.call("findSynonyms", word, num) return zip(words, similarity) @@ -640,7 +634,6 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): return Word2VecModel(model) -@ignore_unicode_prefix class Word2Vec(object): """Word2Vec creates vector representation of words in a text corpus. The algorithm first constructs a vocabulary from the corpus @@ -668,7 +661,7 @@ class Word2Vec(object): >>> syms = model.findSynonyms("a", 2) >>> [s[0] for s in syms] - [u'b', u'c'] + ['b', 'c'] But querying for synonyms of a vector may return the word whose representation is that vector: @@ -676,7 +669,7 @@ class Word2Vec(object): >>> vec = model.transform("a") >>> syms = model.findSynonyms(vec, 2) >>> [s[0] for s in syms] - [u'a', u'b'] + ['a', 'b'] >>> import os, tempfile >>> path = tempfile.mkdtemp() @@ -686,7 +679,7 @@ class Word2Vec(object): True >>> syms = sameModel.findSynonyms("a", 2) >>> [s[0] for s in syms] - [u'b', u'c'] + ['b', 'c'] >>> from shutil import rmtree >>> try: ... rmtree(path) diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index 373a141456..cbbd7b351b 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -20,7 +20,6 @@ import sys from collections import namedtuple from pyspark import since -from pyspark.rdd import ignore_unicode_prefix from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc from pyspark.mllib.util import JavaSaveable, JavaLoader, inherit_doc @@ -28,7 +27,6 @@ __all__ = ['FPGrowth', 'FPGrowthModel', 'PrefixSpan', 'PrefixSpanModel'] @inherit_doc -@ignore_unicode_prefix class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ A FP-Growth model for mining frequent itemsets @@ -38,7 +36,7 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> rdd = sc.parallelize(data, 2) >>> model = FPGrowth.train(rdd, 0.6, 2) >>> sorted(model.freqItemsets().collect()) - [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ... 
+ [FreqItemset(items=['a'], freq=4), FreqItemset(items=['c'], freq=3), ... >>> model_path = temp_path + "/fpm" >>> model.save(sc, model_path) >>> sameModel = FPGrowthModel.load(sc, model_path) @@ -101,7 +99,6 @@ class FPGrowth(object): @inherit_doc -@ignore_unicode_prefix class PrefixSpanModel(JavaModelWrapper): """ Model fitted by PrefixSpan @@ -114,7 +111,7 @@ class PrefixSpanModel(JavaModelWrapper): >>> rdd = sc.parallelize(data, 2) >>> model = PrefixSpan.train(rdd) >>> sorted(model.freqSequences().collect()) - [FreqSequence(sequence=[[u'a']], freq=3), FreqSequence(sequence=[[u'a'], [u'a']], freq=1), ... + [FreqSequence(sequence=[['a']], freq=3), FreqSequence(sequence=[['a'], ['a']], freq=1), ... .. versionadded:: 1.6.0 """ diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index cd09621b13..c1402fb98a 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -27,15 +27,6 @@ import sys import array import struct -if sys.version >= '3': - basestring = str - xrange = range - import copyreg as copy_reg - long = int -else: - from itertools import izip as zip - import copy_reg - import numpy as np from pyspark import since @@ -49,13 +40,6 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'QRDecomposition'] -if sys.version_info[:2] == (2, 7): - # speed up pickling array in Python 2.7 - def fast_pickle_array(ar): - return array.array, (ar.typecode, ar.tostring()) - copy_reg.pickle(array.array, fast_pickle_array) - - # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. @@ -70,7 +54,7 @@ except: def _convert_to_vector(l): if isinstance(l, Vector): return l - elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): + elif type(l) in (array.array, np.array, np.ndarray, list, tuple, range): return DenseVector(l) elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" @@ -104,7 +88,7 @@ def _vector_size(v): """ if isinstance(v, Vector): return len(v) - elif type(v) in (array.array, list, tuple, xrange): + elif type(v) in (array.array, list, tuple, range): return len(v) elif type(v) == np.ndarray: if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): @@ -459,7 +443,7 @@ class DenseVector(Vector): elif isinstance(other, SparseVector): if len(self) != other.size: return False - return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return Vectors._equals(list(range(len(self))), self.array, other.indices, other.values) return False def __ne__(self, other): @@ -556,7 +540,7 @@ class SparseVector(Vector): self.indices = np.array(args[0], dtype=np.int32) self.values = np.array(args[1], dtype=np.float64) assert len(self.indices) == len(self.values), "index and value arrays not same length" - for i in xrange(len(self.indices) - 1): + for i in range(len(self.indices) - 1): if self.indices[i] >= self.indices[i + 1]: raise TypeError( "Indices %s and %s are not strictly increasing" @@ -788,7 +772,7 @@ class SparseVector(Vector): inds = self.indices vals = self.values entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) - for i in xrange(len(inds))]) + for i in range(len(inds))]) return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): @@ -798,7 +782,7 @@ class SparseVector(Vector): elif isinstance(other, DenseVector): if self.size != len(other): 
return False - return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array) return False def __getitem__(self, index): @@ -880,7 +864,7 @@ class Vectors(object): >>> Vectors.dense(1.0, 2.0) DenseVector([1.0, 2.0]) """ - if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): + if len(elements) == 1 and not isinstance(elements[0], (float, int)): # it's list, numpy.array or other iterable object. elements = elements[0] return DenseVector(elements) @@ -1279,7 +1263,7 @@ class SparseMatrix(Matrix): Return an numpy.ndarray """ A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') - for k in xrange(self.colPtrs.size - 1): + for k in range(self.colPtrs.size - 1): startptr = self.colPtrs[k] endptr = self.colPtrs[k + 1] if self.isTransposed: diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 56701758c8..603d31d3d7 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -21,9 +21,6 @@ Package for distributed linear algebra. import sys -if sys.version >= '3': - long = int - from py4j.java_gateway import JavaObject from pyspark import RDD, since @@ -95,9 +92,9 @@ class RowMatrix(DistributedMatrix): """ if isinstance(rows, RDD): rows = rows.map(_convert_to_vector) - java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols)) + java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols)) elif isinstance(rows, DataFrame): - java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols)) + java_matrix = callMLlibFunc("createRowMatrix", rows, int(numRows), int(numCols)) elif (isinstance(rows, JavaObject) and rows.getClass().getSimpleName() == "RowMatrix"): java_matrix = rows @@ -439,13 +436,13 @@ class IndexedRow(object): """ Represents a row of an IndexedRowMatrix. - Just a wrapper over a (long, vector) tuple. + Just a wrapper over a (int, vector) tuple. :param index: The index for the given row. :param vector: The row in the matrix at the given index. """ def __init__(self, index, vector): - self.index = long(index) + self.index = int(index) self.vector = _convert_to_vector(vector) def __repr__(self): @@ -465,8 +462,8 @@ class IndexedRowMatrix(DistributedMatrix): """ Represents a row-oriented distributed Matrix with indexed rows. - :param rows: An RDD of IndexedRows or (long, vector) tuples or a DataFrame consisting of a - long typed column of indices and a vector typed column. + :param rows: An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a + int typed column of indices and a vector typed column. :param numRows: Number of rows in the matrix. A non-positive value means unknown, at which point the number of rows will be determined by the max row @@ -510,14 +507,14 @@ class IndexedRowMatrix(DistributedMatrix): # both be easily serialized. We will convert back to # IndexedRows on the Scala side. 
java_matrix = callMLlibFunc("createIndexedRowMatrix", rows.toDF(), - long(numRows), int(numCols)) + int(numRows), int(numCols)) elif isinstance(rows, DataFrame): - java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, long(numRows), int(numCols)) + java_matrix = callMLlibFunc("createIndexedRowMatrix", rows, int(numRows), int(numCols)) elif (isinstance(rows, JavaObject) and rows.getClass().getSimpleName() == "IndexedRowMatrix"): java_matrix = rows else: - raise TypeError("rows should be an RDD of IndexedRows or (long, vector) tuples, " + raise TypeError("rows should be an RDD of IndexedRows or (int, vector) tuples, " "got %s" % type(rows)) self._java_matrix_wrapper = JavaModelWrapper(java_matrix) @@ -731,15 +728,15 @@ class MatrixEntry(object): """ Represents an entry of a CoordinateMatrix. - Just a wrapper over a (long, long, float) tuple. + Just a wrapper over a (int, int, float) tuple. :param i: The row index of the matrix. :param j: The column index of the matrix. :param value: The (i, j)th entry of the matrix, as a float. """ def __init__(self, i, j, value): - self.i = long(i) - self.j = long(j) + self.i = int(i) + self.j = int(j) self.value = float(value) def __repr__(self): @@ -760,7 +757,7 @@ class CoordinateMatrix(DistributedMatrix): Represents a matrix in coordinate format. :param entries: An RDD of MatrixEntry inputs or - (long, long, float) tuples. + (int, int, float) tuples. :param numRows: Number of rows in the matrix. A non-positive value means unknown, at which point the number of rows will be determined by the max row @@ -804,13 +801,13 @@ class CoordinateMatrix(DistributedMatrix): # each be easily serialized. We will convert back to # MatrixEntry inputs on the Scala side. java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(), - long(numRows), long(numCols)) + int(numRows), int(numCols)) elif (isinstance(entries, JavaObject) and entries.getClass().getSimpleName() == "CoordinateMatrix"): java_matrix = entries else: raise TypeError("entries should be an RDD of MatrixEntry entries or " - "(long, long, float) tuples, got %s" % type(entries)) + "(int, int, float) tuples, got %s" % type(entries)) self._java_matrix_wrapper = JavaModelWrapper(java_matrix) @@ -1044,7 +1041,7 @@ class BlockMatrix(DistributedMatrix): # the Scala side. java_matrix = callMLlibFunc("createBlockMatrix", blocks.toDF(), int(rowsPerBlock), int(colsPerBlock), - long(numRows), long(numCols)) + int(numRows), int(numCols)) elif (isinstance(blocks, JavaObject) and blocks.getClass().getSimpleName() == "BlockMatrix"): java_matrix = blocks diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py index 7250eab670..56444c152f 100644 --- a/python/pyspark/mllib/stat/KernelDensity.py +++ b/python/pyspark/mllib/stat/KernelDensity.py @@ -15,11 +15,6 @@ # limitations under the License. 
# -import sys - -if sys.version > '3': - xrange = range - import numpy as np from pyspark.mllib.common import callMLlibFunc diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index d49f741a2f..43454ba518 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -16,10 +16,8 @@ # import sys -if sys.version >= '3': - basestring = str -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import Matrix, _convert_to_vector from pyspark.mllib.regression import LabeledPoint @@ -157,7 +155,6 @@ class Statistics(object): return callMLlibFunc("corr", x.map(float), y.map(float), method) @staticmethod - @ignore_unicode_prefix def chiSqTest(observed, expected=None): """ If `observed` is Vector, conduct Pearson's chi-squared goodness @@ -199,9 +196,9 @@ class Statistics(object): >>> print(round(pearson.pValue, 4)) 0.8187 >>> pearson.method - u'pearson' + 'pearson' >>> pearson.nullHypothesis - u'observed follows the same distribution as expected.' + 'observed follows the same distribution as expected.' >>> observed = Vectors.dense([21, 38, 43, 80]) >>> expected = Vectors.dense([3, 5, 7, 20]) @@ -242,7 +239,6 @@ class Statistics(object): return ChiSqTestResult(jmodel) @staticmethod - @ignore_unicode_prefix def kolmogorovSmirnovTest(data, distName="norm", *params): """ Performs the Kolmogorov-Smirnov (KS) test for data sampled from @@ -282,7 +278,7 @@ class Statistics(object): >>> print(round(ksmodel.statistic, 3)) 0.175 >>> ksmodel.nullHypothesis - u'Sample follows theoretical distribution' + 'Sample follows theoretical distribution' >>> data = sc.parallelize([2.0, 3.0, 4.0]) >>> ksmodel = kstest(data, "norm", 3.0, 1.0) @@ -293,7 +289,7 @@ class Statistics(object): """ if not isinstance(data, RDD): raise TypeError("data should be an RDD, got %s." % type(data)) - if not isinstance(distName, basestring): + if not isinstance(distName, str): raise TypeError("distName should be a string, got %s." % type(distName)) params = [float(param) for param in params] diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py index 312730e8af..21c2bb422a 100644 --- a/python/pyspark/mllib/tests/test_linalg.py +++ b/python/pyspark/mllib/tests/test_linalg.py @@ -31,9 +31,6 @@ from pyspark.sql import Row from pyspark.testing.mllibutils import MLlibTestCase from pyspark.testing.utils import have_scipy -if sys.version >= '3': - long = int - class VectorTests(MLlibTestCase): @@ -447,7 +444,7 @@ class VectorUDTTests(MLlibTestCase): def test_indexed_row_matrix_from_dataframe(self): from pyspark.sql.utils import IllegalArgumentException - df = self.spark.createDataFrame([Row(long(0), Vectors.dense(1))]) + df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))]) matrix = IndexedRowMatrix(df) self.assertEqual(matrix.numRows(), 1) self.assertEqual(matrix.numCols(), 1) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 2d8df461ac..e05dfdb953 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import absolute_import - import sys import random diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index f0f9cda467..a0be29a82e 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -18,10 +18,6 @@ import sys import numpy as np -if sys.version > '3': - xrange = range - basestring = str - from pyspark import SparkContext, since from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector @@ -46,7 +42,7 @@ class MLUtils(object): nnz = len(items) - 1 indices = np.zeros(nnz, dtype=np.int32) values = np.zeros(nnz) - for i in xrange(nnz): + for i in range(nnz): index, value = items[1 + i].split(":") indices[i] = int(index) - 1 values[i] = float(value) @@ -61,10 +57,10 @@ class MLUtils(object): v = _convert_to_vector(p.features) if isinstance(v, SparseVector): nnz = len(v.indices) - for i in xrange(nnz): + for i in range(nnz): items.append(str(v.indices[i] + 1) + ":" + str(v.values[i])) else: - for i in xrange(len(v)): + for i in range(len(v)): items.append(str(i + 1) + ":" + str(v[i])) return " ".join(items) @@ -396,8 +392,8 @@ class JavaSaveable(Saveable): """Save this model to the given path.""" if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) + if not isinstance(path, str): + raise TypeError("path should be a string, got type %s" % type(path)) self._java_model.save(sc._jsc.sc(), path) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index db0c1971cd..437b2c4465 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -33,15 +33,10 @@ from itertools import chain from functools import reduce from math import sqrt, log, isinf, isnan, pow, ceil -if sys.version > '3': - basestring = unicode = str -else: - from itertools import imap as map, ifilter as filter - from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import AutoBatchedSerializer, BatchedSerializer, NoOpSerializer, \ CartesianDeserializer, CloudPickleSerializer, PairDeserializer, PickleSerializer, \ - UTF8Deserializer, pack_long, read_int, write_int + pack_long, read_int, write_int from pyspark.join import python_join, python_left_outer_join, \ python_right_outer_join, python_full_outer_join, python_cogroup from pyspark.statcounter import StatCounter @@ -93,7 +88,7 @@ def portable_hash(x): 219750521 """ - if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ: + if 'PYTHONHASHSEED' not in os.environ: raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED") if x is None: @@ -204,19 +199,6 @@ def _local_iterator_from_socket(sock_info, serializer): return iter(PyLocalIterable(sock_info, serializer)) -def ignore_unicode_prefix(f): - """ - Ignore the 'u' prefix of string in doc tests, to make it works - in both python 2 and 3 - """ - if sys.version >= '3': - # the representation of unicode string in Python 3 does not have prefix 'u', - # so remove the prefix 'u' for doc tests - literal_re = re.compile(r"(\W|^)[uU](['])", re.UNICODE) - f.__doc__ = literal_re.sub(r'\1\2', f.__doc__) - return f - - class Partitioner(object): def __init__(self, numPartitions, partitionFunc): self.numPartitions = numPartitions @@ -797,13 +779,12 @@ class RDD(object): """ return self.map(lambda x: (f(x), x)).groupByKey(numPartitions, partitionFunc) - 
@ignore_unicode_prefix def pipe(self, command, env=None, checkCode=False): """ Return an RDD created by piping elements to a forked external process. >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect() - [u'1', u'2', u'', u'3'] + ['1', '2', '', '3'] :param checkCode: whether or not to check the return value of the shell command. """ @@ -816,7 +797,7 @@ class RDD(object): def pipe_objs(out): for obj in iterator: - s = unicode(obj).rstrip('\n') + '\n' + s = str(obj).rstrip('\n') + '\n' out.write(s.encode('utf-8')) out.close() Thread(target=pipe_objs, args=[pipe.stdin]).start() @@ -1591,7 +1572,6 @@ class RDD(object): ser = BatchedSerializer(PickleSerializer(), batchSize) self._reserialize(ser)._jrdd.saveAsObjectFile(path) - @ignore_unicode_prefix def saveAsTextFile(self, path, compressionCodecClass=None): """ Save this RDD as a text file, using string representations of elements. @@ -1625,13 +1605,13 @@ class RDD(object): >>> from fileinput import input, hook_compressed >>> result = sorted(input(glob(tempFile3.name + "/part*.gz"), openhook=hook_compressed)) >>> b''.join(result).decode('utf-8') - u'bar\\nfoo\\n' + 'bar\\nfoo\\n' """ def func(split, iterator): for x in iterator: - if not isinstance(x, (unicode, bytes)): - x = unicode(x) - if isinstance(x, unicode): + if not isinstance(x, (str, bytes)): + x = str(x) + if isinstance(x, str): x = x.encode("utf-8") yield x keyed = self.mapPartitionsWithIndex(func) @@ -2281,14 +2261,13 @@ class RDD(object): if n: return n - @ignore_unicode_prefix def setName(self, name): """ Assign a name to this RDD. >>> rdd1 = sc.parallelize([1, 2]) >>> rdd1.setName('RDD1').name() - u'RDD1' + 'RDD1' """ self._jrdd.setName(name) return self diff --git a/python/pyspark/resultiterable.py b/python/pyspark/resultiterable.py index c867b51877..cd2a59513b 100644 --- a/python/pyspark/resultiterable.py +++ b/python/pyspark/resultiterable.py @@ -15,10 +15,7 @@ # limitations under the License. # -try: - from collections.abc import Iterable -except ImportError: - from collections import Iterable +from collections.abc import Iterable __all__ = ["ResultIterable"] diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 49b7cb4546..80ce9b8408 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -58,18 +58,11 @@ import types import collections import zlib import itertools - -if sys.version < '3': - import cPickle as pickle - from itertools import izip as zip, imap as map -else: - import pickle - basestring = unicode = str - xrange = range +import pickle pickle_protocol = pickle.HIGHEST_PROTOCOL from pyspark import cloudpickle -from pyspark.util import _exception_message, print_exec +from pyspark.util import print_exec __all__ = ["PickleSerializer", "MarshalSerializer", "UTF8Deserializer"] @@ -132,11 +125,6 @@ class FramedSerializer(Serializer): where `length` is a 32-bit integer and data is `length` bytes. """ - def __init__(self): - # On Python 2.6, we can't write bytearrays to streams, so we need to convert them - # to strings first. Check if the version number is that old. 
- self._only_write_strings = sys.version_info[0:2] <= (2, 6) - def dump_stream(self, iterator, stream): for obj in iterator: self._write_with_length(obj, stream) @@ -155,10 +143,7 @@ class FramedSerializer(Serializer): if len(serialized) > (1 << 31): raise ValueError("can not serialize object larger than 2G") write_int(len(serialized), stream) - if self._only_write_strings: - stream.write(str(serialized)) - else: - stream.write(serialized) + stream.write(serialized) def _read_with_length(self, stream): length = read_int(stream) @@ -204,7 +189,7 @@ class BatchedSerializer(Serializer): yield list(iterator) elif hasattr(iterator, "__len__") and hasattr(iterator, "__getslice__"): n = len(iterator) - for i in xrange(0, n, self.batchSize): + for i in range(0, n, self.batchSize): yield iterator[i: i + self.batchSize] else: items = [] @@ -395,23 +380,8 @@ def _hijack_namedtuple(): return types.FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__) - def _kwdefaults(f): - # __kwdefaults__ contains the default values of keyword-only arguments which are - # introduced from Python 3. The possible cases for __kwdefaults__ in namedtuple - # are as below: - # - # - Does not exist in Python 2. - # - Returns None in <= Python 3.5.x. - # - Returns a dictionary containing the default values to the keys from Python 3.6.x - # (See https://bugs.python.org/issue25628). - kargs = getattr(f, "__kwdefaults__", None) - if kargs is None: - return {} - else: - return kargs - _old_namedtuple = _copy_func(collections.namedtuple) - _old_namedtuple_kwdefaults = _kwdefaults(collections.namedtuple) + _old_namedtuple_kwdefaults = collections.namedtuple.__kwdefaults__ def namedtuple(*args, **kwargs): for k, v in _old_namedtuple_kwdefaults.items(): @@ -453,12 +423,8 @@ class PickleSerializer(FramedSerializer): def dumps(self, obj): return pickle.dumps(obj, pickle_protocol) - if sys.version >= '3': - def loads(self, obj, encoding="bytes"): - return pickle.loads(obj, encoding=encoding) - else: - def loads(self, obj, encoding=None): - return pickle.loads(obj) + def loads(self, obj, encoding="bytes"): + return pickle.loads(obj, encoding=encoding) class CloudPickleSerializer(PickleSerializer): @@ -469,7 +435,7 @@ class CloudPickleSerializer(PickleSerializer): except pickle.PickleError: raise except Exception as e: - emsg = _exception_message(e) + emsg = str(e) if "'i' format requires" in emsg: msg = "Object too large to serialize: %s" % emsg else: diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 65e3bdbc05..cde163bd2d 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -26,11 +26,8 @@ import os import platform import warnings -import py4j - -from pyspark import SparkConf from pyspark.context import SparkContext -from pyspark.sql import SparkSession, SQLContext +from pyspark.sql import SparkSession if os.environ.get("SPARK_EXECUTOR_URI"): SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py index c28cb8c3b9..af32469e82 100644 --- a/python/pyspark/sql/__init__.py +++ b/python/pyspark/sql/__init__.py @@ -39,9 +39,6 @@ Important classes of Spark SQL and DataFrames: - :class:`pyspark.sql.Window` For working with window functions. 
""" -from __future__ import absolute_import - - from pyspark.sql.types import Row from pyspark.sql.context import SQLContext, HiveContext, UDFRegistration from pyspark.sql.session import SparkSession diff --git a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py index ed62a72d6c..974412ee4e 100644 --- a/python/pyspark/sql/avro/functions.py +++ b/python/pyspark/sql/avro/functions.py @@ -21,12 +21,10 @@ A collections of builtin avro functions from pyspark import since, SparkContext -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import Column, _to_java_column from pyspark.util import _print_missing_jar -@ignore_unicode_prefix @since(3.0) def from_avro(data, jsonFormatSchema, options={}): """ @@ -45,7 +43,7 @@ def from_avro(data, jsonFormatSchema, options={}): >>> from pyspark.sql import Row >>> from pyspark.sql.avro.functions import from_avro, to_avro - >>> data = [(1, Row(name='Alice', age=2))] + >>> data = [(1, Row(age=2, name='Alice'))] >>> df = spark.createDataFrame(data, ("key", "value")) >>> avroDf = df.select(to_avro(df.value).alias("avro")) >>> avroDf.collect() @@ -55,7 +53,7 @@ def from_avro(data, jsonFormatSchema, options={}): ... "fields":[{"name":"age","type":["long","null"]}, ... {"name":"name","type":["string","null"]}]},"null"]}]}''' >>> avroDf.select(from_avro(avroDf.avro, jsonFormatSchema).alias("value")).collect() - [Row(value=Row(avro=Row(age=2, name=u'Alice')))] + [Row(value=Row(avro=Row(age=2, name='Alice')))] """ sc = SparkContext._active_spark_context @@ -69,7 +67,6 @@ def from_avro(data, jsonFormatSchema, options={}): return Column(jc) -@ignore_unicode_prefix @since(3.0) def to_avro(data, jsonFormatSchema=""): """ diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 974251f63b..25fc696dac 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -20,10 +20,8 @@ import warnings from collections import namedtuple from pyspark import since -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType from pyspark.sql.dataframe import DataFrame -from pyspark.sql.udf import UserDefinedFunction -from pyspark.sql.types import IntegerType, StringType, StructType +from pyspark.sql.types import StructType Database = namedtuple("Database", "name description locationUri") @@ -44,19 +42,16 @@ class Catalog(object): self._jsparkSession = sparkSession._jsparkSession self._jcatalog = sparkSession._jsparkSession.catalog() - @ignore_unicode_prefix @since(2.0) def currentDatabase(self): """Returns the current default database in this session.""" return self._jcatalog.currentDatabase() - @ignore_unicode_prefix @since(2.0) def setCurrentDatabase(self, dbName): """Sets the current default database in this session.""" return self._jcatalog.setCurrentDatabase(dbName) - @ignore_unicode_prefix @since(2.0) def listDatabases(self): """Returns a list of databases available across all sessions.""" @@ -70,7 +65,6 @@ class Catalog(object): locationUri=jdb.locationUri())) return databases - @ignore_unicode_prefix @since(2.0) def listTables(self, dbName=None): """Returns a list of tables/views in the specified database. @@ -92,7 +86,6 @@ class Catalog(object): isTemporary=jtable.isTemporary())) return tables - @ignore_unicode_prefix @since(2.0) def listFunctions(self, dbName=None): """Returns a list of functions registered in the specified database. 
@@ -113,7 +106,6 @@ class Catalog(object): isTemporary=jfunction.isTemporary())) return functions - @ignore_unicode_prefix @since(2.0) def listColumns(self, tableName, dbName=None): """Returns a list of columns for the given table/view in the specified database. diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index ef4944c912..bd4c355762 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -19,15 +19,8 @@ import sys import json import warnings -if sys.version >= '3': - basestring = str - long = int - -from py4j.java_gateway import is_instance_of - from pyspark import copy_func, since from pyspark.context import SparkContext -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.types import * __all__ = ["Column"] @@ -46,7 +39,7 @@ def _create_column_from_name(name): def _to_java_column(col): if isinstance(col, Column): jcol = col._jc - elif isinstance(col, basestring): + elif isinstance(col, str): jcol = _create_column_from_name(col) else: raise TypeError( @@ -359,7 +352,7 @@ class Column(object): :param other: string in line >>> df.filter(df.name.contains('o')).collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] """ _rlike_doc = """ SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex @@ -368,7 +361,7 @@ class Column(object): :param other: an extended regex expression >>> df.filter(df.name.rlike('ice$')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ _like_doc = """ SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. @@ -378,7 +371,7 @@ class Column(object): See :func:`rlike` for a regex version >>> df.filter(df.name.like('Al%')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ _startswith_doc = """ String starts with. Returns a boolean :class:`Column` based on a string match. 
@@ -386,7 +379,7 @@ class Column(object): :param other: string at start of line (do not use a regex `^`) >>> df.filter(df.name.startswith('Al')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.filter(df.name.startswith('^Al')).collect() [] """ @@ -396,18 +389,17 @@ class Column(object): :param other: string at end of line (do not use a regex `$`) >>> df.filter(df.name.endswith('ice')).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.filter(df.name.endswith('ice$')).collect() [] """ - contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc)) - rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc)) - like = ignore_unicode_prefix(_bin_op("like", _like_doc)) - startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc)) - endswith = ignore_unicode_prefix(_bin_op("endsWith", _endswith_doc)) + contains = _bin_op("contains", _contains_doc) + rlike = _bin_op("rlike", _rlike_doc) + like = _bin_op("like", _like_doc) + startswith = _bin_op("startsWith", _startswith_doc) + endswith = _bin_op("endsWith", _endswith_doc) - @ignore_unicode_prefix @since(1.3) def substr(self, startPos, length): """ @@ -417,7 +409,7 @@ class Column(object): :param length: length of the substring (int or Column) >>> df.select(df.name.substr(1, 3).alias("col")).collect() - [Row(col=u'Ali'), Row(col=u'Bob')] + [Row(col='Ali'), Row(col='Bob')] """ if type(startPos) != type(length): raise TypeError( @@ -435,7 +427,6 @@ class Column(object): raise TypeError("Unexpected type: %s" % type(startPos)) return Column(jc) - @ignore_unicode_prefix @since(1.5) def isin(self, *cols): """ @@ -443,9 +434,9 @@ class Column(object): expression is contained by the evaluated values of the arguments. >>> df[df.name.isin("Bob", "Mike")].collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df[df.age.isin([1, 2, 3])].collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ if len(cols) == 1 and isinstance(cols[0], (list, set)): cols = cols[0] @@ -461,7 +452,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.asc()).collect() - [Row(name=u'Alice'), Row(name=u'Tom')] + [Row(name='Alice'), Row(name='Tom')] """ _asc_nulls_first_doc = """ Returns a sort expression based on ascending order of the column, and null values @@ -470,7 +461,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect() - [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')] + [Row(name=None), Row(name='Alice'), Row(name='Tom')] .. versionadded:: 2.4 """ @@ -481,7 +472,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect() - [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)] + [Row(name='Alice'), Row(name='Tom'), Row(name=None)] .. 
versionadded:: 2.4 """ @@ -491,7 +482,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.desc()).collect() - [Row(name=u'Tom'), Row(name=u'Alice')] + [Row(name='Tom'), Row(name='Alice')] """ _desc_nulls_first_doc = """ Returns a sort expression based on the descending order of the column, and null values @@ -500,7 +491,7 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect() - [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')] + [Row(name=None), Row(name='Tom'), Row(name='Alice')] .. versionadded:: 2.4 """ @@ -511,37 +502,37 @@ class Column(object): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect() - [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)] + [Row(name='Tom'), Row(name='Alice'), Row(name=None)] .. versionadded:: 2.4 """ - asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc)) - asc_nulls_first = ignore_unicode_prefix(_unary_op("asc_nulls_first", _asc_nulls_first_doc)) - asc_nulls_last = ignore_unicode_prefix(_unary_op("asc_nulls_last", _asc_nulls_last_doc)) - desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc)) - desc_nulls_first = ignore_unicode_prefix(_unary_op("desc_nulls_first", _desc_nulls_first_doc)) - desc_nulls_last = ignore_unicode_prefix(_unary_op("desc_nulls_last", _desc_nulls_last_doc)) + asc = _unary_op("asc", _asc_doc) + asc_nulls_first = _unary_op("asc_nulls_first", _asc_nulls_first_doc) + asc_nulls_last = _unary_op("asc_nulls_last", _asc_nulls_last_doc) + desc = _unary_op("desc", _desc_doc) + desc_nulls_first = _unary_op("desc_nulls_first", _desc_nulls_first_doc) + desc_nulls_last = _unary_op("desc_nulls_last", _desc_nulls_last_doc) _isNull_doc = """ True if the current expression is null. >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) >>> df.filter(df.height.isNull()).collect() - [Row(height=None, name=u'Alice')] + [Row(name='Alice', height=None)] """ _isNotNull_doc = """ True if the current expression is NOT null. >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) >>> df.filter(df.height.isNotNull()).collect() - [Row(height=80, name=u'Tom')] + [Row(name='Tom', height=80)] """ - isNull = ignore_unicode_prefix(_unary_op("isNull", _isNull_doc)) - isNotNull = ignore_unicode_prefix(_unary_op("isNotNull", _isNotNull_doc)) + isNull = _unary_op("isNull", _isNull_doc) + isNotNull = _unary_op("isNotNull", _isNotNull_doc) @since(1.3) def alias(self, *alias, **kwargs): @@ -581,17 +572,16 @@ class Column(object): name = copy_func(alias, sinceversion=2.0, doc=":func:`name` is an alias for :func:`alias`.") - @ignore_unicode_prefix @since(1.3) def cast(self, dataType): """ Convert the column into type ``dataType``. 
>>> df.select(df.age.cast("string").alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] + [Row(ages='2'), Row(ages='5')] >>> df.select(df.age.cast(StringType()).alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] + [Row(ages='2'), Row(ages='5')] """ - if isinstance(dataType, basestring): + if isinstance(dataType, str): jc = self._jc.cast(dataType) elif isinstance(dataType, DataType): from pyspark.sql import SparkSession diff --git a/python/pyspark/sql/conf.py b/python/pyspark/sql/conf.py index 71ea163171..eab084a1fa 100644 --- a/python/pyspark/sql/conf.py +++ b/python/pyspark/sql/conf.py @@ -18,10 +18,6 @@ import sys from pyspark import since, _NoValue -from pyspark.rdd import ignore_unicode_prefix - -if sys.version_info[0] >= 3: - basestring = str class RuntimeConfig(object): @@ -34,13 +30,11 @@ class RuntimeConfig(object): """Create a new RuntimeConfig that wraps the underlying JVM object.""" self._jconf = jconf - @ignore_unicode_prefix @since(2.0) def set(self, key, value): """Sets the given Spark runtime configuration property.""" self._jconf.set(key, value) - @ignore_unicode_prefix @since(2.0) def get(self, key, default=_NoValue): """Returns the value of Spark runtime configuration property for the given key, @@ -54,7 +48,6 @@ class RuntimeConfig(object): self._checkType(default, "default") return self._jconf.get(key, default) - @ignore_unicode_prefix @since(2.0) def unset(self, key): """Resets the configuration property for the given key.""" @@ -62,11 +55,10 @@ class RuntimeConfig(object): def _checkType(self, obj, identifier): """Assert that an object is of type str.""" - if not isinstance(obj, basestring): + if not isinstance(obj, str): raise TypeError("expected %s '%s' to be a string (was '%s')" % (identifier, obj, type(obj).__name__)) - @ignore_unicode_prefix @since(2.4) def isModifiable(self, key): """Indicates whether the configuration property with the given key diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 956343a231..7fbcf85cb1 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -15,15 +15,10 @@ # limitations under the License. # -from __future__ import print_function import sys import warnings -if sys.version >= '3': - basestring = unicode = str - from pyspark import since, _NoValue -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.session import _monkey_patch_RDD, SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader @@ -52,7 +47,6 @@ class SQLContext(object): _instantiatedContext = None - @ignore_unicode_prefix def __init__(self, sparkContext, sparkSession=None, jsqlContext=None): """Creates a new SQLContext. @@ -70,7 +64,7 @@ class SQLContext(object): [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \ dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)] >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] + [(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] """ warnings.warn( "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.", @@ -142,7 +136,6 @@ class SQLContext(object): """ self.sparkSession.conf.set(key, value) - @ignore_unicode_prefix @since(1.3) def getConf(self, key, defaultValue=_NoValue): """Returns the value of Spark SQL configuration property for the given key. 
@@ -152,12 +145,12 @@ class SQLContext(object): the system default value. >>> sqlContext.getConf("spark.sql.shuffle.partitions") - u'200' - >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10") - u'10' - >>> sqlContext.setConf("spark.sql.shuffle.partitions", u"50") - >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10") - u'50' + '200' + >>> sqlContext.getConf("spark.sql.shuffle.partitions", "10") + '10' + >>> sqlContext.setConf("spark.sql.shuffle.partitions", "50") + >>> sqlContext.getConf("spark.sql.shuffle.partitions", "10") + '50' """ return self.sparkSession.conf.get(key, defaultValue) @@ -229,7 +222,6 @@ class SQLContext(object): return self.sparkSession._inferSchema(rdd, samplingRatio) @since(1.3) - @ignore_unicode_prefix def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. @@ -274,27 +266,27 @@ class SQLContext(object): >>> l = [('Alice', 1)] >>> sqlContext.createDataFrame(l).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> sqlContext.createDataFrame(l, ['name', 'age']).collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> sqlContext.createDataFrame(d).collect() - [Row(age=1, name=u'Alice')] + [Row(age=1, name='Alice')] >>> rdd = sc.parallelize(l) >>> sqlContext.createDataFrame(rdd).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> df = sqlContext.createDataFrame(rdd, ['name', 'age']) >>> df.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = sqlContext.createDataFrame(person) >>> df2.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ @@ -302,15 +294,15 @@ class SQLContext(object): ... StructField("age", IntegerType(), True)]) >>> df3 = sqlContext.createDataFrame(rdd, schema) >>> df3.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> sqlContext.createDataFrame(df.toPandas()).collect() # doctest: +SKIP - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> sqlContext.createDataFrame(rdd, "a: string, b: int").collect() - [Row(a=u'Alice', b=1)] + [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> sqlContext.createDataFrame(rdd, "int").collect() [Row(value=1)] @@ -358,7 +350,6 @@ class SQLContext(object): return self.sparkSession.catalog.createExternalTable( tableName, path, source, schema, **options) - @ignore_unicode_prefix @since(1.0) def sql(self, sqlQuery): """Returns a :class:`DataFrame` representing the result of the given query. @@ -368,7 +359,7 @@ class SQLContext(object): >>> sqlContext.registerDataFrameAsTable(df, "table1") >>> df2 = sqlContext.sql("SELECT field1 AS f1, field2 as f2 from table1") >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] + [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')] """ return self.sparkSession.sql(sqlQuery) @@ -385,7 +376,6 @@ class SQLContext(object): """ return self.sparkSession.table(tableName) - @ignore_unicode_prefix @since(1.3) def tables(self, dbName=None): """Returns a :class:`DataFrame` containing names of tables in the given database. 
@@ -401,7 +391,7 @@ class SQLContext(object): >>> sqlContext.registerDataFrameAsTable(df, "table1") >>> df2 = sqlContext.tables() >>> df2.filter("tableName = 'table1'").first() - Row(database=u'', tableName=u'table1', isTemporary=True) + Row(database='', tableName='table1', isTemporary=True) """ if dbName is None: return DataFrame(self._ssql_ctx.tables(), self) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 3ad899bcc3..023fbeabcb 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -17,21 +17,12 @@ import sys import random - -if sys.version >= '3': - basestring = unicode = str - long = int - from functools import reduce - from html import escape as html_escape -else: - from itertools import imap as map - from cgi import escape as html_escape - import warnings +from functools import reduce +from html import escape as html_escape from pyspark import copy_func, since, _NoValue -from pyspark.rdd import RDD, _load_from_socket, _local_iterator_from_socket, \ - ignore_unicode_prefix +from pyspark.rdd import RDD, _load_from_socket, _local_iterator_from_socket from pyspark.serializers import BatchedSerializer, PickleSerializer, \ UTF8Deserializer from pyspark.storagelevel import StorageLevel @@ -109,7 +100,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """ return DataFrameStatFunctions(self) - @ignore_unicode_prefix @since(1.3) def toJSON(self, use_unicode=True): """Converts a :class:`DataFrame` into a :class:`RDD` of string. @@ -117,7 +107,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Each row is turned into a JSON document as one element in the returned RDD. >>> df.toJSON().first() - u'{"age":2,"name":"Alice"}' + '{"age":2,"name":"Alice"}' """ rdd = self._jdf.toJSON() return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) @@ -330,11 +320,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): # For the case when extended is mode: # df.explain("formatted") - is_extended_as_mode = isinstance(extended, basestring) and mode is None + is_extended_as_mode = isinstance(extended, str) and mode is None # For the mode specified: # df.explain(mode="formatted") - is_mode_case = extended is None and isinstance(mode, basestring) + is_mode_case = extended is None and isinstance(mode, str) if not (is_no_argument or is_extended_case or is_extended_as_mode or is_mode_case): argtypes = [ @@ -568,7 +558,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if not isinstance(name, str): raise TypeError("name should be provided as str, got {0}".format(type(name))) - allowed_types = (basestring, list, float, int) + allowed_types = (str, list, float, int) for p in parameters: if not isinstance(p, allowed_types): raise TypeError( @@ -587,19 +577,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """ return int(self._jdf.count()) - @ignore_unicode_prefix @since(1.3) def collect(self): """Returns all the records as a list of :class:`Row`. >>> df.collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """ with SCCallSiteSync(self._sc) as css: sock_info = self._jdf.collectToPython() return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) - @ignore_unicode_prefix @since(2.0) def toLocalIterator(self, prefetchPartitions=False): """ @@ -612,36 +600,33 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): before it is needed. 
>>> list(df.toLocalIterator()) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """ with SCCallSiteSync(self._sc) as css: sock_info = self._jdf.toPythonIterator(prefetchPartitions) return _local_iterator_from_socket(sock_info, BatchedSerializer(PickleSerializer())) - @ignore_unicode_prefix @since(1.3) def limit(self, num): """Limits the result count to the number specified. >>> df.limit(1).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.limit(0).collect() [] """ jdf = self._jdf.limit(num) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def take(self, num): """Returns the first ``num`` rows as a :class:`list` of :class:`Row`. >>> df.take(2) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """ return self.limit(num).collect() - @ignore_unicode_prefix @since(3.0) def tail(self, num): """ @@ -651,7 +636,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): a very large ``num`` can crash the driver process with OutOfMemoryError. >>> df.tail(1) - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] """ with SCCallSiteSync(self._sc): sock_info = self._jdf.tailToPython(num) @@ -818,7 +803,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): else: return DataFrame( self._jdf.repartition(numPartitions, self._jcols(*cols)), self.sql_ctx) - elif isinstance(numPartitions, (basestring, Column)): + elif isinstance(numPartitions, (str, Column)): cols = (numPartitions, ) + cols return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sql_ctx) else: @@ -869,7 +854,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): else: return DataFrame( self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), self.sql_ctx) - elif isinstance(numPartitions, (basestring, Column)): + elif isinstance(numPartitions, (str, Column)): cols = (numPartitions,) + cols return DataFrame(self._jdf.repartitionByRange(self._jcols(*cols)), self.sql_ctx) else: @@ -944,7 +929,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): fraction = withReplacement withReplacement = None - seed = long(seed) if seed is not None else None + seed = int(seed) if seed is not None else None args = [arg for arg in [withReplacement, fraction, seed] if arg is not None] jdf = self._jdf.sample(*args) return DataFrame(jdf, self.sql_ctx) @@ -978,15 +963,15 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): .. versionchanged:: 3.0 Added sampling by a column of :class:`Column` """ - if isinstance(col, basestring): + if isinstance(col, str): col = Column(col) elif not isinstance(col, Column): raise ValueError("col must be a string or a column, but got %r" % type(col)) if not isinstance(fractions, dict): raise ValueError("fractions must be a dict but got %r" % type(fractions)) for k, v in fractions.items(): - if not isinstance(k, (float, int, long, basestring)): - raise ValueError("key must be float, int, long, or string, but got %r" % type(k)) + if not isinstance(k, (float, int, str)): + raise ValueError("key must be float, int, or string, but got %r" % type(k)) fractions[k] = float(v) col = col._jc seed = seed if seed is not None else random.randint(0, sys.maxsize) @@ -1011,7 +996,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if w < 0.0: raise ValueError("Weights must be positive. 
Found weight value: %s" % w) seed = seed if seed is not None else random.randint(0, sys.maxsize) - rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), long(seed)) + rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), int(seed)) return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array] @property @@ -1052,12 +1037,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 3| +----+ """ - if not isinstance(colName, basestring): + if not isinstance(colName, str): raise ValueError("colName should be provided as string") jc = self._jdf.colRegex(colName) return Column(jc) - @ignore_unicode_prefix @since(1.3) def alias(self, alias): """Returns a new :class:`DataFrame` with an alias set. @@ -1070,12 +1054,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner') >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age") \ .sort(desc("df_as1.name")).collect() - [Row(name=u'Bob', name=u'Bob', age=5), Row(name=u'Alice', name=u'Alice', age=2)] + [Row(name='Bob', name='Bob', age=5), Row(name='Alice', name='Alice', age=2)] """ - assert isinstance(alias, basestring), "alias should be a string" + assert isinstance(alias, str), "alias should be a string" return DataFrame(getattr(self._jdf, "as")(alias), self.sql_ctx) - @ignore_unicode_prefix @since(2.1) def crossJoin(self, other): """Returns the cartesian product with another :class:`DataFrame`. @@ -1083,18 +1066,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param other: Right side of the cartesian product. >>> df.select("age", "name").collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df2.select("name", "height").collect() - [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85)] + [Row(name='Tom', height=80), Row(name='Bob', height=85)] >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect() - [Row(age=2, name=u'Alice', height=80), Row(age=2, name=u'Alice', height=85), - Row(age=5, name=u'Bob', height=80), Row(age=5, name=u'Bob', height=85)] + [Row(age=2, name='Alice', height=80), Row(age=2, name='Alice', height=85), + Row(age=5, name='Bob', height=80), Row(age=5, name='Bob', height=85)] """ jdf = self._jdf.crossJoin(other._jdf) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def join(self, other, on=None, how=None): """Joins with another :class:`DataFrame`, using the given join expression. 
@@ -1113,27 +1095,27 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> from pyspark.sql.functions import desc >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height) \ .sort(desc("name")).collect() - [Row(name=u'Bob', height=85), Row(name=u'Alice', height=None), Row(name=None, height=80)] + [Row(name='Bob', height=85), Row(name='Alice', height=None), Row(name=None, height=80)] >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).collect() - [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] + [Row(name='Tom', height=80), Row(name='Bob', height=85), Row(name='Alice', height=None)] >>> cond = [df.name == df3.name, df.age == df3.age] >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df.join(df2, 'name').select(df.name, df2.height).collect() - [Row(name=u'Bob', height=85)] + [Row(name='Bob', height=85)] >>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect() - [Row(name=u'Bob', age=5)] + [Row(name='Bob', age=5)] """ if on is not None and not isinstance(on, list): on = [on] if on is not None: - if isinstance(on[0], basestring): + if isinstance(on[0], str): on = self._jseq(on) else: assert isinstance(on[0], Column), "on should be Column or list of Column" @@ -1147,7 +1129,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): how = "inner" if on is None: on = self._jseq([]) - assert isinstance(how, basestring), "how should be basestring" + assert isinstance(how, str), "how should be a string" jdf = self._jdf.join(other._jdf, on, how) return DataFrame(jdf, self.sql_ctx) @@ -1171,7 +1153,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def sort(self, *cols, **kwargs): """Returns a new :class:`DataFrame` sorted by the specified column(s). @@ -1182,18 +1163,18 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): If a list is specified, length of the list must equal length of the `cols`. >>> df.sort(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.sort("age", ascending=False).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.orderBy(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> from pyspark.sql.functions import * >>> df.sort(asc("age")).collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df.orderBy(desc("age"), "name").collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.orderBy(["age", "name"], ascending=[0, 1]).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + [Row(age=5, name='Bob'), Row(age=2, name='Alice')] """ jdf = self._jdf.sort(self._sort_cols(cols, kwargs)) return DataFrame(jdf, self.sql_ctx) @@ -1333,7 +1314,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.summary(self._jseq(statistics)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def head(self, n=None): """Returns the first ``n`` rows. 
@@ -1346,26 +1326,24 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): If n is 1, return a single Row. >>> df.head() - Row(age=2, name=u'Alice') + Row(age=2, name='Alice') >>> df.head(1) - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ if n is None: rs = self.head(1) return rs[0] if rs else None return self.take(n) - @ignore_unicode_prefix @since(1.3) def first(self): """Returns the first row as a :class:`Row`. >>> df.first() - Row(age=2, name=u'Alice') + Row(age=2, name='Alice') """ return self.head() - @ignore_unicode_prefix @since(1.3) def __getitem__(self, item): """Returns the column as a :class:`Column`. @@ -1373,13 +1351,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> df.select(df['age']).collect() [Row(age=2), Row(age=5)] >>> df[ ["name", "age"]].collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df[ df.age > 3 ].collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df[df[0] > 3].collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] """ - if isinstance(item, basestring): + if isinstance(item, str): jc = self._jdf.apply(item) return Column(jc) elif isinstance(item, Column): @@ -1405,7 +1383,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jc = self._jdf.apply(name) return Column(jc) - @ignore_unicode_prefix @since(1.3) def select(self, *cols): """Projects a set of expressions and returns a new :class:`DataFrame`. @@ -1415,11 +1392,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): in the current :class:`DataFrame`. >>> df.select('*').collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df.select('name', 'age').collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df.select(df.name, (df.age + 10).alias('age')).collect() - [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)] + [Row(name='Alice', age=12), Row(name='Bob', age=15)] """ jdf = self._jdf.select(self._jcols(*cols)) return DataFrame(jdf, self.sql_ctx) @@ -1438,7 +1415,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.selectExpr(self._jseq(expr)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def filter(self, condition): """Filters rows using the given condition. @@ -1449,16 +1425,16 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): or a string of SQL expression. 
>>> df.filter(df.age > 3).collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df.where(df.age == 2).collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] >>> df.filter("age > 3").collect() - [Row(age=5, name=u'Bob')] + [Row(age=5, name='Bob')] >>> df.where("age = 2").collect() - [Row(age=2, name=u'Alice')] + [Row(age=2, name='Alice')] """ - if isinstance(condition, basestring): + if isinstance(condition, str): jdf = self._jdf.filter(condition) elif isinstance(condition, Column): jdf = self._jdf.filter(condition._jc) @@ -1466,7 +1442,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise TypeError("condition should be string or Column") return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def groupBy(self, *cols): """Groups the :class:`DataFrame` using the specified columns, @@ -1481,11 +1456,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> df.groupBy().avg().collect() [Row(avg(age)=3.5)] >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect()) - [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] + [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)] >>> sorted(df.groupBy(df.name).avg().collect()) - [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] + [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)] >>> sorted(df.groupBy(['name', df.age]).count().collect()) - [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)] + [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)] """ jgd = self._jdf.groupBy(self._jcols(*cols)) from pyspark.sql.group import GroupedData @@ -1655,19 +1630,19 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): ... Row(name='Alice', age=5, height=80), \\ ... Row(name='Alice', age=10, height=80)]).toDF() >>> df.dropDuplicates().show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 5| 80|Alice| - | 10| 80|Alice| - +---+------+-----+ + +-----+---+------+ + | name|age|height| + +-----+---+------+ + |Alice| 5| 80| + |Alice| 10| 80| + +-----+---+------+ >>> df.dropDuplicates(['name', 'height']).show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 5| 80|Alice| - +---+------+-----+ + +-----+---+------+ + | name|age|height| + +-----+---+------+ + |Alice| 5| 80| + +-----+---+------+ """ if subset is None: jdf = self._jdf.dropDuplicates() @@ -1700,7 +1675,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if subset is None: subset = self.columns - elif isinstance(subset, basestring): + elif isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): raise ValueError("subset should be a list or tuple of column names") @@ -1715,11 +1690,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """Replace null values, alias for ``na.fill()``. :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other. - :param value: int, long, float, string, bool or dict. + :param value: int, float, string, bool or dict. Value to replace null values with. If the value is a dict, then `subset` is ignored and `value` must be a mapping from column name (string) to replacement value. The replacement value must be - an int, long, float, boolean, or string. + an int, float, boolean, or string. :param subset: optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. 
For example, if `value` is a string, and subset contains a non-string column, @@ -1754,13 +1729,13 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 50| null|unknown| +---+------+-------+ """ - if not isinstance(value, (float, int, long, basestring, bool, dict)): - raise ValueError("value should be a float, int, long, string, bool or dict") + if not isinstance(value, (float, int, str, bool, dict)): + raise ValueError("value should be a float, int, string, bool or dict") # Note that bool validates isinstance(int), but we don't want to # convert bools to floats - if not isinstance(value, bool) and isinstance(value, (int, long)): + if not isinstance(value, bool) and isinstance(value, int): value = float(value) if isinstance(value, dict): @@ -1768,7 +1743,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): elif subset is None: return DataFrame(self._jdf.na().fill(value), self.sql_ctx) else: - if isinstance(subset, basestring): + if isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): raise ValueError("subset should be a list or tuple of column names") @@ -1787,12 +1762,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`) and arbitrary replacement will be used. - :param to_replace: bool, int, long, float, string, list or dict. + :param to_replace: bool, int, float, string, list or dict. Value to be replaced. If the value is a dict, then `value` is ignored or can be omitted, and `to_replace` must be a mapping between a value and a replacement. - :param value: bool, int, long, float, string, list or None. - The replacement value must be a bool, int, long, float, string or None. If `value` is a + :param value: bool, int, float, string, list or None. + The replacement value must be a bool, int, float, string or None. If `value` is a list, `value` should be of the same length and type as `to_replace`. If `value` is a scalar and `to_replace` is a sequence, then `value` is used as a replacement for each item in `to_replace`. @@ -1854,7 +1829,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): >>> all_of(bool)([True, False]) True - >>> all_of(basestring)(["a", 1]) + >>> all_of(str)(["a", 1]) False """ def all_of_(xs): @@ -1862,20 +1837,20 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): return all_of_ all_of_bool = all_of(bool) - all_of_str = all_of(basestring) - all_of_numeric = all_of((float, int, long)) + all_of_str = all_of(str) + all_of_numeric = all_of((float, int)) # Validate input types - valid_types = (bool, float, int, long, basestring, list, tuple) + valid_types = (bool, float, int, str, list, tuple) if not isinstance(to_replace, valid_types + (dict, )): raise ValueError( - "to_replace should be a bool, float, int, long, string, list, tuple, or dict. " + "to_replace should be a bool, float, int, string, list, tuple, or dict. " "Got {0}".format(type(to_replace))) if not isinstance(value, valid_types) and value is not None \ and not isinstance(to_replace, dict): raise ValueError("If to_replace is not a dict, value should be " - "a bool, float, int, long, string, list, tuple or None. " + "a bool, float, int, string, list, tuple or None. " "Got {0}".format(type(value))) if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): @@ -1883,12 +1858,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise ValueError("to_replace and value lists should be of the same length. 
" "Got {0} and {1}".format(len(to_replace), len(value))) - if not (subset is None or isinstance(subset, (list, tuple, basestring))): + if not (subset is None or isinstance(subset, (list, tuple, str))): raise ValueError("subset should be a list or tuple of column names, " "column name or None. Got {0}".format(type(subset))) # Reshape input arguments if necessary - if isinstance(to_replace, (float, int, long, basestring)): + if isinstance(to_replace, (float, int, str)): to_replace = [to_replace] if isinstance(to_replace, dict): @@ -1896,11 +1871,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if value is not None: warnings.warn("to_replace is a dict and value is not None. value will be ignored.") else: - if isinstance(value, (float, int, long, basestring)) or value is None: + if isinstance(value, (float, int, str)) or value is None: value = [value for _ in range(len(to_replace))] rep_dict = dict(zip(to_replace, value)) - if isinstance(subset, basestring): + if isinstance(subset, str): subset = [subset] # Verify we were not passed in mixed type generics. @@ -1957,10 +1932,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Added support for multiple columns. """ - if not isinstance(col, (basestring, list, tuple)): + if not isinstance(col, (str, list, tuple)): raise ValueError("col should be a string, list or tuple, but got %r" % type(col)) - isStr = isinstance(col, basestring) + isStr = isinstance(col, str) if isinstance(col, tuple): col = list(col) @@ -1968,7 +1943,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): col = [col] for c in col: - if not isinstance(c, basestring): + if not isinstance(c, str): raise ValueError("columns should be strings, but got %r" % type(c)) col = _to_list(self._sc, col) @@ -1977,12 +1952,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(probabilities, tuple): probabilities = list(probabilities) for p in probabilities: - if not isinstance(p, (float, int, long)) or p < 0 or p > 1: - raise ValueError("probabilities should be numerical (float, int, long) in [0,1].") + if not isinstance(p, (float, int)) or p < 0 or p > 1: + raise ValueError("probabilities should be numerical (float, int) in [0,1].") probabilities = _to_list(self._sc, probabilities) - if not isinstance(relativeError, (float, int, long)) or relativeError < 0: - raise ValueError("relativeError should be numerical (float, int, long) >= 0.") + if not isinstance(relativeError, (float, int)) or relativeError < 0: + raise ValueError("relativeError should be numerical (float, int) >= 0.") relativeError = float(relativeError) jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) @@ -2000,9 +1975,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param col2: The name of the second column :param method: The correlation method. 
Currently only supports "pearson" """ - if not isinstance(col1, basestring): + if not isinstance(col1, str): raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): + if not isinstance(col2, str): raise ValueError("col2 should be a string.") if not method: method = "pearson" @@ -2020,9 +1995,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param col1: The name of the first column :param col2: The name of the second column """ - if not isinstance(col1, basestring): + if not isinstance(col1, str): raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): + if not isinstance(col2, str): raise ValueError("col2 should be a string.") return self._jdf.stat().cov(col1, col2) @@ -2042,9 +2017,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param col2: The name of the second column. Distinct items will make the column names of the :class:`DataFrame`. """ - if not isinstance(col1, basestring): + if not isinstance(col1, str): raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): + if not isinstance(col2, str): raise ValueError("col2 should be a string.") return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx) @@ -2073,7 +2048,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): support = 0.01 return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def withColumn(self, colName, col): """ @@ -2092,13 +2066,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): To avoid this, use :func:`select` with the multiple columns at once. >>> df.withColumn('age2', df.age + 2).collect() - [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] + [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)] """ assert isinstance(col, Column), "col should be Column" return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) - @ignore_unicode_prefix @since(1.3) def withColumnRenamed(self, existing, new): """Returns a new :class:`DataFrame` by renaming an existing column. @@ -2108,12 +2081,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :param new: string, new name of the column. >>> df.withColumnRenamed('age', 'age2').collect() - [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] + [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')] """ return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx) @since(1.4) - @ignore_unicode_prefix def drop(self, *cols): """Returns a new :class:`DataFrame` that drops the specified column. This is a no-op if schema doesn't contain the given column name(s). @@ -2122,23 +2094,23 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :class:`Column` to drop, or a list of string name of the columns to drop. 
>>> df.drop('age').collect() - [Row(name=u'Alice'), Row(name=u'Bob')] + [Row(name='Alice'), Row(name='Bob')] >>> df.drop(df.age).collect() - [Row(name=u'Alice'), Row(name=u'Bob')] + [Row(name='Alice'), Row(name='Bob')] >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect() - [Row(age=5, height=85, name=u'Bob')] + [Row(age=5, height=85, name='Bob')] >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect() - [Row(age=5, name=u'Bob', height=85)] + [Row(age=5, name='Bob', height=85)] >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect() - [Row(name=u'Bob')] + [Row(name='Bob')] """ if len(cols) == 1: col = cols[0] - if isinstance(col, basestring): + if isinstance(col, str): jdf = self._jdf.drop(col) elif isinstance(col, Column): jdf = self._jdf.drop(col._jc) @@ -2146,20 +2118,19 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise TypeError("col should be a string or a Column") else: for col in cols: - if not isinstance(col, basestring): + if not isinstance(col, str): raise TypeError("each col in the param list should be a string") jdf = self._jdf.drop(self._jseq(cols)) return DataFrame(jdf, self.sql_ctx) - @ignore_unicode_prefix def toDF(self, *cols): """Returns a new :class:`DataFrame` that with new specified column names :param cols: list of new column names (string) >>> df.toDF('f1', 'f2').collect() - [Row(f1=2, f2=u'Alice'), Row(f1=5, f2=u'Bob')] + [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')] """ jdf = self._jdf.toDF(self._jseq(cols)) return DataFrame(jdf, self.sql_ctx) @@ -2347,7 +2318,6 @@ def _test(): from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext, SparkSession import pyspark.sql.dataframe - from pyspark.sql.functions import from_unixtime globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc @@ -2356,16 +2326,16 @@ def _test(): globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) - globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() - globs['df3'] = sc.parallelize([Row(name='Alice', age=2), - Row(name='Bob', age=5)]).toDF() - globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80), - Row(name='Bob', age=5, height=None), - Row(name='Tom', age=None, height=None), - Row(name=None, age=None, height=None)]).toDF() - globs['df5'] = sc.parallelize([Row(name='Alice', spy=False, age=10), - Row(name='Bob', spy=None, age=5), - Row(name='Mallory', spy=True, age=None)]).toDF() + globs['df2'] = sc.parallelize([Row(height=80, name='Tom'), Row(height=85, name='Bob')]).toDF() + globs['df3'] = sc.parallelize([Row(age=2, name='Alice'), + Row(age=5, name='Bob')]).toDF() + globs['df4'] = sc.parallelize([Row(age=10, height=80, name='Alice'), + Row(age=5, height=None, name='Bob'), + Row(age=None, height=None, name='Tom'), + Row(age=None, height=None, name=None)]).toDF() + globs['df5'] = sc.parallelize([Row(age=10, name='Alice', spy=False), + Row(age=5, name='Bob', spy=None), + Row(age=None, name='Mallory', spy=True)]).toDF() globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846), Row(name='Bob', time=1479442946)]).toDF() diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index b5a7c18904..63b049999f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -22,14 +22,8 @@ import sys import functools import warnings -if sys.version < "3": - from itertools 
import imap as map - -if sys.version >= '3': - basestring = str - from pyspark import since, SparkContext -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType +from pyspark.rdd import PythonEvalType from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_from_literal, \ _create_column_from_name from pyspark.sql.dataframe import DataFrame @@ -88,14 +82,14 @@ def _create_binary_mathfunction(name, doc=""): # if they are not columns or strings. if isinstance(col1, Column): arg1 = col1._jc - elif isinstance(col1, basestring): + elif isinstance(col1, str): arg1 = _create_column_from_name(col1) else: arg1 = float(col1) if isinstance(col2, Column): arg2 = col2._jc - elif isinstance(col2, basestring): + elif isinstance(col2, str): arg2 = _create_column_from_name(col2) else: arg2 = float(col2) @@ -648,7 +642,6 @@ def percentile_approx(col, percentage, accuracy=10000): return Column(sc._jvm.functions.percentile_approx(_to_java_column(col), percentage, accuracy)) -@ignore_unicode_prefix @since(1.4) def rand(seed=None): """Generates a random column with independent and identically distributed (i.i.d.) samples @@ -657,8 +650,8 @@ def rand(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('rand', rand(seed=42) * 3).collect() - [Row(age=2, name=u'Alice', rand=2.4052597283576684), - Row(age=5, name=u'Bob', rand=2.3913904055683974)] + [Row(age=2, name='Alice', rand=2.4052597283576684), + Row(age=5, name='Bob', rand=2.3913904055683974)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -668,7 +661,6 @@ def rand(seed=None): return Column(jc) -@ignore_unicode_prefix @since(1.4) def randn(seed=None): """Generates a column with independent and identically distributed (i.i.d.) samples from @@ -677,8 +669,8 @@ def randn(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('randn', randn(seed=42)).collect() - [Row(age=2, name=u'Alice', randn=1.1027054481455365), - Row(age=5, name=u'Bob', randn=0.7400395449950132)] + [Row(age=2, name='Alice', randn=1.1027054481455365), + Row(age=5, name='Bob', randn=0.7400395449950132)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -774,7 +766,6 @@ def expr(str): return Column(sc._jvm.functions.expr(str)) -@ignore_unicode_prefix @since(1.4) def struct(*cols): """Creates a new struct column. @@ -782,9 +773,9 @@ def struct(*cols): :param cols: list of column names (string) or list of :class:`Column` expressions >>> df.select(struct('age', 'name').alias("struct")).collect() - [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))] + [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))] >>> df.select(struct([df.age, df.name]).alias("struct")).collect() - [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))] + [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))] """ sc = SparkContext._active_spark_context if len(cols) == 1 and isinstance(cols[0], (list, set)): @@ -879,14 +870,13 @@ def log2(col): @since(1.5) -@ignore_unicode_prefix def conv(col, fromBase, toBase): """ Convert a number in a string column from one base to another. 
>>> df = spark.createDataFrame([("010101",)], ['n']) >>> df.select(conv(df.n, 2, 16).alias('hex')).collect() - [Row(hex=u'15')] + [Row(hex='15')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase)) @@ -976,7 +966,6 @@ def current_timestamp(): return Column(sc._jvm.functions.current_timestamp()) -@ignore_unicode_prefix @since(1.5) def date_format(date, format): """ @@ -992,7 +981,7 @@ def date_format(date, format): >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect() - [Row(date=u'04/08/2015')] + [Row(date='04/08/2015')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.date_format(_to_java_column(date), format)) @@ -1310,7 +1299,6 @@ def last_day(date): return Column(sc._jvm.functions.last_day(_to_java_column(date))) -@ignore_unicode_prefix @since(1.5) def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): """ @@ -1321,7 +1309,7 @@ def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time']) >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect() - [Row(ts=u'2015-04-08 00:00:00')] + [Row(ts='2015-04-08 00:00:00')] >>> spark.conf.unset("spark.sql.session.timeZone") """ sc = SparkContext._active_spark_context @@ -1447,7 +1435,6 @@ def timestamp_seconds(col): @since(2.0) -@ignore_unicode_prefix def window(timeColumn, windowDuration, slideDuration=None, startTime=None): """Bucketize rows into one or more time windows given a timestamp specifying column. Window starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window @@ -1471,7 +1458,7 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None): >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) >>> w.select(w.window.start.cast("string").alias("start"), ... w.window.end.cast("string").alias("end"), "sum").collect() - [Row(start=u'2016-03-11 09:00:05', end=u'2016-03-11 09:00:10', sum=1)] + [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)] """ def check_string_field(field, fieldName): if not field or type(field) is not str: @@ -1498,7 +1485,6 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None): # ---------------------------- misc functions ---------------------------------- @since(1.5) -@ignore_unicode_prefix def crc32(col): """ Calculates the cyclic redundancy check value (CRC32) of a binary column and @@ -1511,33 +1497,30 @@ def crc32(col): return Column(sc._jvm.functions.crc32(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def md5(col): """Calculates the MD5 digest and returns the value as a 32 character hex string. >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() - [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')] + [Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.md5(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def sha1(col): """Returns the hex string result of SHA-1. 
>>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() - [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] + [Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.sha1(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def sha2(col, numBits): """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, @@ -1546,9 +1529,9 @@ def sha2(col, numBits): >>> digests = df.select(sha2(df.name, 256).alias('s')).collect() >>> digests[0] - Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') + Row(s='3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') >>> digests[1] - Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') + Row(s='cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) @@ -1600,7 +1583,6 @@ del _name, _doc @since(1.5) -@ignore_unicode_prefix def concat_ws(sep, *cols): """ Concatenates multiple input string columns together into a single string column, @@ -1608,7 +1590,7 @@ def concat_ws(sep, *cols): >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect() - [Row(s=u'abcd-123')] + [Row(s='abcd-123')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column))) @@ -1634,7 +1616,6 @@ def encode(col, charset): return Column(sc._jvm.functions.encode(_to_java_column(col), charset)) -@ignore_unicode_prefix @since(1.5) def format_number(col, d): """ @@ -1645,13 +1626,12 @@ def format_number(col, d): :param d: the N decimal places >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() - [Row(v=u'5.0000')] + [Row(v='5.0000')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.format_number(_to_java_column(col), d)) -@ignore_unicode_prefix @since(1.5) def format_string(format, *cols): """ @@ -1663,7 +1643,7 @@ def format_string(format, *cols): >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b']) >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() - [Row(v=u'5 hello')] + [Row(v='5 hello')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column))) @@ -1721,7 +1701,6 @@ def overlay(src, replace, pos, len=-1): @since(1.5) -@ignore_unicode_prefix def substring(str, pos, len): """ Substring starts at `pos` and is of length `len` when str is String type or @@ -1732,14 +1711,13 @@ def substring(str, pos, len): >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() - [Row(s=u'ab')] + [Row(s='ab')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len)) @since(1.5) -@ignore_unicode_prefix def substring_index(str, delim, count): """ Returns the substring from string str before count occurrences of the delimiter delim. 
@@ -1749,15 +1727,14 @@ def substring_index(str, delim, count): >>> df = spark.createDataFrame([('a.b.c.d',)], ['s']) >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect() - [Row(s=u'a.b')] + [Row(s='a.b')] >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() - [Row(s=u'b.c.d')] + [Row(s='b.c.d')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count)) -@ignore_unicode_prefix @since(1.5) def levenshtein(left, right): """Computes the Levenshtein distance of the two given strings. @@ -1792,49 +1769,45 @@ def locate(substr, str, pos=1): @since(1.5) -@ignore_unicode_prefix def lpad(col, len, pad): """ Left-pad the string column to width `len` with `pad`. >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(lpad(df.s, 6, '#').alias('s')).collect() - [Row(s=u'##abcd')] + [Row(s='##abcd')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad)) @since(1.5) -@ignore_unicode_prefix def rpad(col, len, pad): """ Right-pad the string column to width `len` with `pad`. >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(rpad(df.s, 6, '#').alias('s')).collect() - [Row(s=u'abcd##')] + [Row(s='abcd##')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.rpad(_to_java_column(col), len, pad)) @since(1.5) -@ignore_unicode_prefix def repeat(col, n): """ Repeats a string column n times, and returns it as a new string column. >>> df = spark.createDataFrame([('ab',)], ['s',]) >>> df.select(repeat(df.s, 3).alias('s')).collect() - [Row(s=u'ababab')] + [Row(s='ababab')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.repeat(_to_java_column(col), n)) @since(1.5) -@ignore_unicode_prefix def split(str, pattern, limit=-1): """ Splits str around matches of the given pattern. @@ -1855,15 +1828,14 @@ def split(str, pattern, limit=-1): >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect() - [Row(s=[u'one', u'twoBthreeC'])] + [Row(s=['one', 'twoBthreeC'])] >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect() - [Row(s=[u'one', u'two', u'three', u''])] + [Row(s=['one', 'two', 'three', ''])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.split(_to_java_column(str), pattern, limit)) -@ignore_unicode_prefix @since(1.5) def regexp_extract(str, pattern, idx): r"""Extract a specific group matched by a Java regex, from the specified string column. @@ -1871,73 +1843,68 @@ def regexp_extract(str, pattern, idx): >>> df = spark.createDataFrame([('100-200',)], ['str']) >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect() - [Row(d=u'100')] + [Row(d='100')] >>> df = spark.createDataFrame([('foo',)], ['str']) >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect() - [Row(d=u'')] + [Row(d='')] >>> df = spark.createDataFrame([('aaaac',)], ['str']) >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() - [Row(d=u'')] + [Row(d='')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx) return Column(jc) -@ignore_unicode_prefix @since(1.5) def regexp_replace(str, pattern, replacement): r"""Replace all substrings of the specified string value that match regexp with rep. 
>>> df = spark.createDataFrame([('100-200',)], ['str']) >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect() - [Row(d=u'-----')] + [Row(d='-----')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement) return Column(jc) -@ignore_unicode_prefix @since(1.5) def initcap(col): """Translate the first letter of each word to upper case in the sentence. >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect() - [Row(v=u'Ab Cd')] + [Row(v='Ab Cd')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.initcap(_to_java_column(col))) @since(1.5) -@ignore_unicode_prefix def soundex(col): """ Returns the SoundEx encoding for a string >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name']) >>> df.select(soundex(df.name).alias("soundex")).collect() - [Row(soundex=u'P362'), Row(soundex=u'U612')] + [Row(soundex='P362'), Row(soundex='U612')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.soundex(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def bin(col): """Returns the string representation of the binary value of the given column. >>> df.select(bin(df.age).alias('c')).collect() - [Row(c=u'10'), Row(c=u'101')] + [Row(c='10'), Row(c='101')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.bin(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def hex(col): """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`, @@ -1945,14 +1912,13 @@ def hex(col): :class:`pyspark.sql.types.LongType`. >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() - [Row(hex(a)=u'414243', hex(b)=u'3')] + [Row(hex(a)='414243', hex(b)='3')] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.hex(_to_java_column(col)) return Column(jc) -@ignore_unicode_prefix @since(1.5) def unhex(col): """Inverse of hex. Interprets each pair of characters as a hexadecimal number @@ -1965,7 +1931,6 @@ def unhex(col): return Column(sc._jvm.functions.unhex(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def length(col): """Computes the character length of string data or number of bytes of binary data. @@ -1979,7 +1944,6 @@ def length(col): return Column(sc._jvm.functions.length(_to_java_column(col))) -@ignore_unicode_prefix @since(1.5) def translate(srcCol, matching, replace): """A function translate any character in the `srcCol` by a character in `matching`. @@ -1989,7 +1953,7 @@ def translate(srcCol, matching, replace): >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\ ... .alias('r')).collect() - [Row(r=u'1a2s3ae')] + [Row(r='1a2s3ae')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace)) @@ -1997,7 +1961,6 @@ def translate(srcCol, matching, replace): # ---------------------- Collection functions ------------------------------ -@ignore_unicode_prefix @since(2.0) def create_map(*cols): """Creates a new map column. @@ -2006,9 +1969,9 @@ def create_map(*cols): grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...). 
>>> df.select(create_map('name', 'age').alias("map")).collect() - [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] + [Row(map={'Alice': 2}), Row(map={'Bob': 5})] >>> df.select(create_map([df.name, df.age]).alias("map")).collect() - [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] + [Row(map={'Alice': 2}), Row(map={'Bob': 5})] """ sc = SparkContext._active_spark_context if len(cols) == 1 and isinstance(cols[0], (list, set)): @@ -2108,7 +2071,6 @@ def slice(x, start, length): return Column(sc._jvm.functions.slice(_to_java_column(x), start, length)) -@ignore_unicode_prefix @since(2.4) def array_join(col, delimiter, null_replacement=None): """ @@ -2117,9 +2079,9 @@ def array_join(col, delimiter, null_replacement=None): >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data']) >>> df.select(array_join(df.data, ",").alias("joined")).collect() - [Row(joined=u'a,b,c'), Row(joined=u'a')] + [Row(joined='a,b,c'), Row(joined='a')] >>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect() - [Row(joined=u'a,b,c'), Row(joined=u'a,NULL')] + [Row(joined='a,b,c'), Row(joined='a,NULL')] """ sc = SparkContext._active_spark_context if null_replacement is None: @@ -2130,7 +2092,6 @@ def array_join(col, delimiter, null_replacement=None): @since(1.5) -@ignore_unicode_prefix def concat(*cols): """ Concatenates multiple input columns together into a single column. @@ -2138,7 +2099,7 @@ def concat(*cols): >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) >>> df.select(concat(df.s, df.d).alias('s')).collect() - [Row(s=u'abcd123')] + [Row(s='abcd123')] >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect() @@ -2165,7 +2126,6 @@ def array_position(col, value): return Column(sc._jvm.functions.array_position(_to_java_column(col), value)) -@ignore_unicode_prefix @since(2.4) def element_at(col, extraction): """ @@ -2179,7 +2139,7 @@ def element_at(col, extraction): >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) >>> df.select(element_at(df.data, 1)).collect() - [Row(element_at(data, 1)=u'a'), Row(element_at(data, 1)=None)] + [Row(element_at(data, 1)='a'), Row(element_at(data, 1)=None)] >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({},)], ['data']) >>> df.select(element_at(df.data, lit("a"))).collect() @@ -2221,7 +2181,6 @@ def array_distinct(col): return Column(sc._jvm.functions.array_distinct(_to_java_column(col))) -@ignore_unicode_prefix @since(2.4) def array_intersect(col1, col2): """ @@ -2234,13 +2193,12 @@ def array_intersect(col1, col2): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) >>> df.select(array_intersect(df.c1, df.c2)).collect() - [Row(array_intersect(c1, c2)=[u'a', u'c'])] + [Row(array_intersect(c1, c2)=['a', 'c'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_intersect(_to_java_column(col1), _to_java_column(col2))) -@ignore_unicode_prefix @since(2.4) def array_union(col1, col2): """ @@ -2253,13 +2211,12 @@ def array_union(col1, col2): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) >>> df.select(array_union(df.c1, df.c2)).collect() - [Row(array_union(c1, c2)=[u'b', u'a', u'c', u'd', u'f'])] + [Row(array_union(c1, c2)=['b', 'a', 'c', 'd', 'f'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_union(_to_java_column(col1), 
_to_java_column(col2))) -@ignore_unicode_prefix @since(2.4) def array_except(col1, col2): """ @@ -2272,7 +2229,7 @@ def array_except(col1, col2): >>> from pyspark.sql import Row >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) >>> df.select(array_except(df.c1, df.c2)).collect() - [Row(array_except(c1, c2)=[u'b'])] + [Row(array_except(c1, c2)=['b'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_except(_to_java_column(col1), _to_java_column(col2))) @@ -2397,7 +2354,6 @@ def posexplode_outer(col): return Column(jc) -@ignore_unicode_prefix @since(1.6) def get_json_object(col, path): """ @@ -2411,14 +2367,13 @@ def get_json_object(col, path): >>> df = spark.createDataFrame(data, ("key", "jstring")) >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\ ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect() - [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)] + [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.get_json_object(_to_java_column(col), path) return Column(jc) -@ignore_unicode_prefix @since(1.6) def json_tuple(col, *fields): """Creates a new row for a json column according to the given field names. @@ -2429,14 +2384,13 @@ def json_tuple(col, *fields): >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] >>> df = spark.createDataFrame(data, ("key", "jstring")) >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect() - [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)] + [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields)) return Column(jc) -@ignore_unicode_prefix @since(2.1) def from_json(col, schema, options={}): """ @@ -2460,7 +2414,7 @@ def from_json(col, schema, options={}): >>> df.select(from_json(df.value, "a INT").alias("json")).collect() [Row(json=Row(a=1))] >>> df.select(from_json(df.value, "MAP").alias("json")).collect() - [Row(json={u'a': 1})] + [Row(json={'a': 1})] >>> data = [(1, '''[{"a": 1}]''')] >>> schema = ArrayType(StructType([StructField("a", IntegerType())])) >>> df = spark.createDataFrame(data, ("key", "value")) @@ -2485,7 +2439,6 @@ def from_json(col, schema, options={}): return Column(jc) -@ignore_unicode_prefix @since(2.1) def to_json(col, options={}): """ @@ -2499,26 +2452,26 @@ def to_json(col, options={}): >>> from pyspark.sql import Row >>> from pyspark.sql.types import * - >>> data = [(1, Row(name='Alice', age=2))] + >>> data = [(1, Row(age=2, name='Alice'))] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'{"age":2,"name":"Alice"}')] - >>> data = [(1, [Row(name='Alice', age=2), Row(name='Bob', age=3)])] + [Row(json='{"age":2,"name":"Alice"}')] + >>> data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')] + [Row(json='[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')] >>> data = [(1, {"name": "Alice"})] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'{"name":"Alice"}')] + 
[Row(json='{"name":"Alice"}')] >>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')] + [Row(json='[{"name":"Alice"},{"name":"Bob"}]')] >>> data = [(1, ["Alice", "Bob"])] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'["Alice","Bob"]')] + [Row(json='["Alice","Bob"]')] """ sc = SparkContext._active_spark_context @@ -2526,7 +2479,6 @@ def to_json(col, options={}): return Column(jc) -@ignore_unicode_prefix @since(2.4) def schema_of_json(json, options={}): """ @@ -2540,12 +2492,12 @@ def schema_of_json(json, options={}): >>> df = spark.range(1) >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect() - [Row(json=u'struct')] + [Row(json='struct')] >>> schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'}) >>> df.select(schema.alias("json")).collect() - [Row(json=u'struct')] + [Row(json='struct')] """ - if isinstance(json, basestring): + if isinstance(json, str): col = _create_column_from_literal(json) elif isinstance(json, Column): col = _to_java_column(json) @@ -2557,7 +2509,6 @@ def schema_of_json(json, options={}): return Column(jc) -@ignore_unicode_prefix @since(3.0) def schema_of_csv(csv, options={}): """ @@ -2568,11 +2519,11 @@ def schema_of_csv(csv, options={}): >>> df = spark.range(1) >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect() - [Row(csv=u'struct<_c0:int,_c1:string>')] + [Row(csv='struct<_c0:int,_c1:string>')] >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect() - [Row(csv=u'struct<_c0:int,_c1:string>')] + [Row(csv='struct<_c0:int,_c1:string>')] """ - if isinstance(csv, basestring): + if isinstance(csv, str): col = _create_column_from_literal(csv) elif isinstance(csv, Column): col = _to_java_column(csv) @@ -2584,7 +2535,6 @@ def schema_of_csv(csv, options={}): return Column(jc) -@ignore_unicode_prefix @since(3.0) def to_csv(col, options={}): """ @@ -2595,10 +2545,10 @@ def to_csv(col, options={}): :param options: options to control converting. accepts the same options as the CSV datasource. >>> from pyspark.sql import Row - >>> data = [(1, Row(name='Alice', age=2))] + >>> data = [(1, Row(age=2, name='Alice'))] >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(to_csv(df.value).alias("csv")).collect() - [Row(csv=u'2,Alice')] + [Row(csv='2,Alice')] """ sc = SparkContext._active_spark_context @@ -2705,7 +2655,6 @@ def shuffle(col): @since(1.5) -@ignore_unicode_prefix def reverse(col): """ Collection function: returns a reversed string or an array with reverse order of elements. 
@@ -2714,7 +2663,7 @@ def reverse(col): >>> df = spark.createDataFrame([('Spark SQL',)], ['data']) >>> df.select(reverse(df.data).alias('s')).collect() - [Row(s=u'LQS krapS')] + [Row(s='LQS krapS')] >>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data']) >>> df.select(reverse(df.data).alias('r')).collect() [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])] @@ -2820,7 +2769,6 @@ def map_from_entries(col): return Column(sc._jvm.functions.map_from_entries(_to_java_column(col))) -@ignore_unicode_prefix @since(2.4) def array_repeat(col, count): """ @@ -2828,7 +2776,7 @@ def array_repeat(col, count): >>> df = spark.createDataFrame([('ab',)], ['data']) >>> df.select(array_repeat(df.data, 3).alias('r')).collect() - [Row(r=[u'ab', u'ab', u'ab'])] + [Row(r=['ab', 'ab', 'ab'])] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_repeat( @@ -2898,7 +2846,6 @@ def sequence(start, stop, step=None): _to_java_column(start), _to_java_column(stop), _to_java_column(step))) -@ignore_unicode_prefix @since(3.0) def from_csv(col, schema, options={}): """ @@ -2920,11 +2867,11 @@ def from_csv(col, schema, options={}): >>> df = spark.createDataFrame(data, ("value",)) >>> options = {'ignoreLeadingWhiteSpace': True} >>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect() - [Row(csv=Row(s=u'abc'))] + [Row(csv=Row(s='abc'))] """ sc = SparkContext._active_spark_context - if isinstance(schema, basestring): + if isinstance(schema, str): schema = _create_column_from_literal(schema) elif isinstance(schema, Column): schema = _to_java_column(schema) @@ -2984,20 +2931,6 @@ def _get_lambda_parameters(f): return parameters -def _get_lambda_parameters_legacy(f): - # TODO (SPARK-29909) Remove once 2.7 support is dropped - import inspect - - spec = inspect.getargspec(f) - if not 1 <= len(spec.args) <= 3 or spec.varargs or spec.keywords: - raise ValueError( - "f should take between 1 and 3 arguments, but provided function takes {}".format( - spec - ) - ) - return spec.args - - def _create_lambda(f): """ Create `o.a.s.sql.expressions.LambdaFunction` corresponding @@ -3008,10 +2941,7 @@ def _create_lambda(f): - (Column, Column) -> Column: ... - (Column, Column, Column) -> Column: ... 
""" - if sys.version_info >= (3, 3): - parameters = _get_lambda_parameters(f) - else: - parameters = _get_lambda_parameters_legacy(f) + parameters = _get_lambda_parameters(f) sc = SparkContext._active_spark_context expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions @@ -3481,7 +3411,7 @@ def udf(f=None, returnType=StringType()): evalType=PythonEvalType.SQL_BATCHED_UDF) -blacklist = ['map', 'since', 'ignore_unicode_prefix'] +blacklist = ['map', 'since'] __all__ = [k for k, v in globals().items() if not k.startswith('_') and k[0].islower() and callable(v) and k not in blacklist] __all__ += ["PandasUDFType"] @@ -3500,7 +3430,7 @@ def _test(): sc = spark.sparkContext globs['sc'] = sc globs['spark'] = spark - globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)]) + globs['df'] = spark.createDataFrame([Row(age=2, name='Alice'), Row(age=5, name='Bob')]) (failure_count, test_count) = doctest.testmod( pyspark.sql.functions, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index ac826bc64a..83e2baa8f0 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -18,7 +18,6 @@ import sys from pyspark import since -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import Column, _to_seq from pyspark.sql.dataframe import DataFrame from pyspark.sql.pandas.group_ops import PandasGroupedOpsMixin @@ -60,7 +59,6 @@ class GroupedData(PandasGroupedOpsMixin): self._df = df self.sql_ctx = df.sql_ctx - @ignore_unicode_prefix @since(1.3) def agg(self, *exprs): """Compute aggregates and returns the result as a :class:`DataFrame`. @@ -91,18 +89,18 @@ class GroupedData(PandasGroupedOpsMixin): >>> gdf = df.groupBy(df.name) >>> sorted(gdf.agg({"*": "count"}).collect()) - [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)] + [Row(name='Alice', count(1)=1), Row(name='Bob', count(1)=1)] >>> from pyspark.sql import functions as F >>> sorted(gdf.agg(F.min(df.age)).collect()) - [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)] + [Row(name='Alice', min(age)=2), Row(name='Bob', min(age)=5)] >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG) # doctest: +SKIP ... def min_udf(v): ... 
return v.min() >>> sorted(gdf.agg(min_udf(df.age)).collect()) # doctest: +SKIP - [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)] + [Row(name='Alice', min_udf(age)=2), Row(name='Bob', min_udf(age)=5)] """ assert exprs, "exprs should not be empty" if len(exprs) == 1 and isinstance(exprs[0], dict): diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index e6d8e9f24a..3842bc2357 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -16,11 +16,6 @@ # import sys import warnings -if sys.version >= '3': - basestring = unicode = str - xrange = range -else: - from itertools import izip as zip from collections import Counter from pyspark import since @@ -29,7 +24,6 @@ from pyspark.sql.pandas.serializers import ArrowCollectSerializer from pyspark.sql.types import IntegralType from pyspark.sql.types import * from pyspark.traceback_utils import SCCallSiteSync -from pyspark.util import _exception_message class PandasConversionMixin(object): @@ -84,7 +78,7 @@ class PandasConversionMixin(object): "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " - "true." % _exception_message(e)) + "true." % str(e)) warnings.warn(msg) use_arrow = False else: @@ -93,7 +87,7 @@ class PandasConversionMixin(object): "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " - "false.\n %s" % _exception_message(e)) + "false.\n %s" % str(e)) warnings.warn(msg) raise @@ -130,7 +124,7 @@ class PandasConversionMixin(object): "reached the error below and can not continue. Note that " "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an " "effect on failures in the middle of " - "computation.\n %s" % _exception_message(e)) + "computation.\n %s" % str(e)) warnings.warn(msg) raise @@ -268,7 +262,7 @@ class SparkConversionMixin(object): # If no schema supplied by user then get the names of columns only if schema is None: - schema = [str(x) if not isinstance(x, basestring) else + schema = [str(x) if not isinstance(x, str) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns] @@ -276,8 +270,6 @@ class SparkConversionMixin(object): try: return self._create_from_pandas_with_arrow(data, schema, timezone) except Exception as e: - from pyspark.util import _exception_message - if self._wrapped._conf.arrowPySparkFallbackEnabled(): msg = ( "createDataFrame attempted Arrow optimization because " @@ -285,7 +277,7 @@ class SparkConversionMixin(object): "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " - "true." % _exception_message(e)) + "true." 
% str(e)) warnings.warn(msg) else: msg = ( @@ -293,7 +285,7 @@ class SparkConversionMixin(object): "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic " "fallback with 'spark.sql.execution.arrow.pyspark.fallback.enabled' " - "has been set to false.\n %s" % _exception_message(e)) + "has been set to false.\n %s" % str(e)) warnings.warn(msg) raise data = self._convert_from_pandas(data, schema, timezone) @@ -358,7 +350,7 @@ class SparkConversionMixin(object): col_names = cur_dtypes.names record_type_list = [] has_rec_fix = False - for i in xrange(len(cur_dtypes)): + for i in range(len(cur_dtypes)): curr_type = cur_dtypes[i] # If type is a datetime64 timestamp, convert to microseconds # NOTE: if dtype is datetime[ns] then np.record.tolist() will output values as longs, @@ -413,7 +405,7 @@ class SparkConversionMixin(object): # Slice the DataFrame to be batched step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up - pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step)) + pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step)) # Create list of Arrow (columns, type) for serializer dump_stream arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)] diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 094dc357b6..ba4dec82d4 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -18,6 +18,7 @@ import functools import sys import warnings +from inspect import getfullargspec from pyspark import since from pyspark.rdd import PythonEvalType @@ -25,7 +26,6 @@ from pyspark.sql.pandas.typehints import infer_eval_type from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version from pyspark.sql.types import DataType from pyspark.sql.udf import _create_udf -from pyspark.util import _get_argspec class PandasUDFType(object): @@ -371,30 +371,29 @@ def pandas_udf(f=None, returnType=None, functionType=None): def _create_pandas_udf(f, returnType, evalType): - argspec = _get_argspec(f) + argspec = getfullargspec(f) # pandas UDF by type hints. - if sys.version_info >= (3, 6): - from inspect import signature + from inspect import signature - if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF, - PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, - PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: - warnings.warn( - "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for " - "pandas UDF instead of specifying pandas UDF type which will be deprecated " - "in the future releases. See SPARK-28264 for more details.", UserWarning) - elif evalType in [PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, - PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]: - # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered - # at `apply` instead. - # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the - # evaluation type will always be set. 
- pass - elif len(argspec.annotations) > 0: - evalType = infer_eval_type(signature(f)) - assert evalType is not None + if evalType in [PythonEvalType.SQL_SCALAR_PANDAS_UDF, + PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF, + PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: + warnings.warn( + "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for " + "pandas UDF instead of specifying pandas UDF type which will be deprecated " + "in the future releases. See SPARK-28264 for more details.", UserWarning) + elif evalType in [PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, + PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, + PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF]: + # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered + # at `apply` instead. + # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the + # evaluation type will always be set. + pass + elif len(argspec.annotations) > 0: + evalType = infer_eval_type(signature(f)) + assert evalType is not None if evalType is None: # Set default is scalar UDF. diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index 42562e1fb9..4b91c6a0f8 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -19,13 +19,6 @@ Serializers for PyArrow and pandas conversions. See `pyspark.serializers` for more details. """ -import sys -if sys.version < '3': - from itertools import izip as zip -else: - basestring = unicode = str - xrange = range - from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer @@ -67,7 +60,7 @@ class ArrowCollectSerializer(Serializer): raise RuntimeError("An error occurred while calling " "ArrowCollectSerializer.load_stream: {}".format(error_msg)) batch_order = [] - for i in xrange(num): + for i in range(num): index = read_int(stream) batch_order.append(index) yield batch_order @@ -180,7 +173,7 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer): if len(s) == 0 and len(s.columns) == 0: arrs_names = [(pa.array([], type=field.type), field.name) for field in t] # Assign result columns by schema name if user labeled with strings - elif self._assign_cols_by_name and any(isinstance(name, basestring) + elif self._assign_cols_by_name and any(isinstance(name, str) for name in s.columns): arrs_names = [(create_array(s[field.name], field.type), field.name) for field in t] @@ -194,7 +187,7 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer): else: arrs.append(create_array(s, t)) - return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) + return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in range(len(arrs))]) def dump_stream(self, iterator, stream): """ diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 336345e383..a83aece2e4 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -15,15 +15,9 @@ # limitations under the License. 
# -import sys - -if sys.version >= '3': - basestring = unicode = str - from py4j.java_gateway import JavaClass from pyspark import RDD, since -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import _to_seq from pyspark.sql.types import * from pyspark.sql import utils @@ -94,7 +88,7 @@ class DataFrameReader(OptionUtils): if isinstance(schema, StructType): jschema = spark._jsparkSession.parseDataType(schema.json()) self._jreader = self._jreader.schema(jschema) - elif isinstance(schema, basestring): + elif isinstance(schema, str): self._jreader = self._jreader.schema(schema) else: raise TypeError("schema should be StructType or string") @@ -174,7 +168,7 @@ class DataFrameReader(OptionUtils): if schema is not None: self.schema(schema) self.options(**options) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.load(path)) elif path is not None: if type(path) != list: @@ -294,16 +288,16 @@ class DataFrameReader(OptionUtils): allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding, locale=locale, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): path = [path] if type(path) == list: return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path))) elif isinstance(path, RDD): def func(iterator): for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): + if not isinstance(x, str): + x = str(x) + if isinstance(x, str): x = x.encode("utf-8") yield x keyed = path.mapPartitions(func) @@ -352,7 +346,6 @@ class DataFrameReader(OptionUtils): recursiveFileLookup=recursiveFileLookup) return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths))) - @ignore_unicode_prefix @since(1.6) def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None): @@ -376,15 +369,15 @@ class DataFrameReader(OptionUtils): >>> df = spark.read.text('python/test_support/sql/text-test.txt') >>> df.collect() - [Row(value=u'hello'), Row(value=u'this')] + [Row(value='hello'), Row(value='this')] >>> df = spark.read.text('python/test_support/sql/text-test.txt', wholetext=True) >>> df.collect() - [Row(value=u'hello\\nthis')] + [Row(value='hello\\nthis')] """ self._set_opts( wholetext=wholetext, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths))) @@ -529,16 +522,16 @@ class DataFrameReader(OptionUtils): charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): path = [path] if type(path) == list: return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) elif isinstance(path, RDD): def func(iterator): for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): + if not isinstance(x, str): + x = str(x) + if isinstance(x, str): x = x.encode("utf-8") yield x keyed = path.mapPartitions(func) @@ -574,7 +567,7 @@ class DataFrameReader(OptionUtils): """ self._set_opts(mergeSchema=mergeSchema, 
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): path = [path] return self._df(self._jreader.orc(_to_seq(self._spark._sc, path))) @@ -763,7 +756,7 @@ class DataFrameWriter(OptionUtils): col, cols = col[0], col[1:] - if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)): + if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)): raise TypeError("all names should be `str`") self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols)) @@ -788,7 +781,7 @@ class DataFrameWriter(OptionUtils): col, cols = col[0], col[1:] - if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)): + if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)): raise TypeError("all names should be `str`") self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols)) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 61891c478d..a5d102712d 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -15,22 +15,13 @@ # limitations under the License. # -# To disallow implicit relative import. Remove this once we drop Python 2. -from __future__ import absolute_import -from __future__ import print_function import sys import warnings from functools import reduce from threading import RLock -if sys.version >= '3': - basestring = unicode = str - xrange = range -else: - from itertools import imap as map - from pyspark import since -from pyspark.rdd import RDD, ignore_unicode_prefix +from pyspark.rdd import RDD from pyspark.sql.conf import RuntimeConfig from pyspark.sql.dataframe import DataFrame from pyspark.sql.pandas.conversion import SparkConversionMixin @@ -56,7 +47,7 @@ def _monkey_patch_RDD(sparkSession): :return: a DataFrame >>> rdd.toDF().collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] """ return sparkSession.createDataFrame(self, schema, sampleRatio) @@ -197,7 +188,6 @@ class SparkSession(SparkConversionMixin): _instantiatedSession = None _activeSession = None - @ignore_unicode_prefix def __init__(self, sparkContext, jsparkSession=None): """Creates a new SparkSession. @@ -213,7 +203,7 @@ class SparkSession(SparkConversionMixin): [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \ dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)] >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] + [(1, 'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] """ from pyspark.sql.context import SQLContext self._sc = sparkContext @@ -492,7 +482,6 @@ class SparkSession(SparkConversionMixin): return SparkSession.builder.getOrCreate() @since(2.0) - @ignore_unicode_prefix def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. @@ -530,34 +519,29 @@ class SparkSession(SparkConversionMixin): .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. - .. note:: When Arrow optimization is enabled, strings inside Pandas DataFrame in Python - 2 are converted into bytes as they are bytes in Python 2 whereas regular strings are - left as strings. 
When using strings in Python 2, use unicode `u""` as Python standard - practice. - >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() - [Row(age=1, name=u'Alice')] + [Row(age=1, name='Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() - [Row(_1=u'Alice', _2=1)] + [Row(_1='Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ @@ -565,15 +549,15 @@ class SparkSession(SparkConversionMixin): ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP - [Row(name=u'Alice', age=1)] + [Row(name='Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() - [Row(a=u'Alice', b=1)] + [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] @@ -587,7 +571,7 @@ class SparkSession(SparkConversionMixin): if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") - if isinstance(schema, basestring): + if isinstance(schema, str): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names @@ -634,7 +618,6 @@ class SparkSession(SparkConversionMixin): df._schema = schema return df - @ignore_unicode_prefix @since(2.0) def sql(self, sqlQuery): """Returns a :class:`DataFrame` representing the result of the given query. 
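The readwriter, session, streaming and conversion hunks all repeat one mechanical migration: the per-module guard of the form "if sys.version >= '3': basestring = unicode = str; xrange = range" is deleted, and the surviving isinstance(..., basestring), unicode(...) and xrange(...) call sites fall back to the builtins str and range. A hedged sketch of the resulting shape, using a hypothetical helper name rather than Spark's actual methods:

    def _to_str_list(paths):
        # Illustrative only: accept a single path or an iterable of paths and
        # normalize to a list of str, mirroring the post-Python-2 form of the
        # isinstance(path, basestring) checks seen throughout this diff.
        if isinstance(paths, str):          # was: isinstance(paths, basestring)
            return [paths]
        return [str(p) for p in paths]      # was: unicode(p)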
@@ -644,7 +627,7 @@ class SparkSession(SparkConversionMixin): >>> df.createOrReplaceTempView("table1") >>> df2 = spark.sql("SELECT field1 AS f1, field2 as f2 from table1") >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] + [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')] """ return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped) diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 2450a4c93c..5c528c1d54 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -18,13 +18,9 @@ import sys import json -if sys.version >= '3': - basestring = str - from py4j.java_gateway import java_import from pyspark import since, keyword_only -from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import _to_seq from pyspark.sql.readwriter import OptionUtils, to_str from pyspark.sql.types import * @@ -204,7 +200,6 @@ class StreamingQueryManager(object): self._jsqm = jsqm @property - @ignore_unicode_prefix @since(2.0) def active(self): """Returns a list of active queries associated with this SQLContext @@ -213,12 +208,11 @@ class StreamingQueryManager(object): >>> sqm = spark.streams >>> # get the list of active streaming queries >>> [q.name for q in sqm.active] - [u'this_query'] + ['this_query'] >>> sq.stop() """ return [StreamingQuery(jsq) for jsq in self._jsqm.active()] - @ignore_unicode_prefix @since(2.0) def get(self, id): """Returns an active query from this SQLContext or throws exception if an active query @@ -226,7 +220,7 @@ class StreamingQueryManager(object): >>> sq = sdf.writeStream.format('memory').queryName('this_query').start() >>> sq.name - u'this_query' + 'this_query' >>> sq = spark.streams.get(sq.id) >>> sq.isActive True @@ -328,7 +322,7 @@ class DataStreamReader(OptionUtils): if isinstance(schema, StructType): jschema = spark._jsparkSession.parseDataType(schema.json()) self._jreader = self._jreader.schema(jschema) - elif isinstance(schema, basestring): + elif isinstance(schema, str): self._jreader = self._jreader.schema(schema) else: raise TypeError("schema should be StructType or string") @@ -527,7 +521,7 @@ class DataStreamReader(OptionUtils): allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, locale=locale, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.json(path)) else: raise TypeError("path can be only a single string") @@ -555,7 +549,7 @@ class DataStreamReader(OptionUtils): """ self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.orc(path)) else: raise TypeError("path can be only a single string") @@ -585,12 +579,11 @@ class DataStreamReader(OptionUtils): """ self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.parquet(path)) else: raise TypeError("path can be only a single string") - @ignore_unicode_prefix @since(2.0) def text(self, path, wholetext=False, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None): @@ -623,7 +616,7 @@ class DataStreamReader(OptionUtils): self._set_opts( wholetext=wholetext, lineSep=lineSep, 
pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.text(path)) else: raise TypeError("path can be only a single string") @@ -762,7 +755,7 @@ class DataStreamReader(OptionUtils): charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) - if isinstance(path, basestring): + if isinstance(path, str): return self._df(self._jreader.csv(path)) else: raise TypeError("path can be only a single string") @@ -1153,7 +1146,6 @@ class DataStreamWriter(object): ensure_callback_server_started(gw) return self - @ignore_unicode_prefix @since(2.0) def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None, **options): @@ -1186,14 +1178,14 @@ class DataStreamWriter(object): >>> sq.isActive True >>> sq.name - u'this_query' + 'this_query' >>> sq.stop() >>> sq.isActive False >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start( ... queryName='that_query', outputMode="append", format='memory') >>> sq.name - u'that_query' + 'that_query' >>> sq.isActive True >>> sq.stop() diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index a96354e3ec..90fc983aec 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -21,9 +21,6 @@ import threading import time import unittest import warnings -import sys -if sys.version >= '3': - basestring = unicode = str from pyspark import SparkContext, SparkConf from pyspark.sql import Row, SparkSession @@ -32,7 +29,6 @@ from pyspark.sql.types import * from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message from pyspark.testing.utils import QuietTest -from pyspark.util import _exception_message if have_pandas: import pandas as pd @@ -130,7 +126,7 @@ class ArrowTests(ReusedSQLTestCase): warn.message for warn in warns if isinstance(warn.message, UserWarning)] self.assertTrue(len(user_warns) > 0) self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) + "Attempting non-optimization" in str(user_warns[-1])) assert_frame_equal(pdf, pd.DataFrame({u'map': [{u'a': 1}]})) def test_toPandas_fallback_disabled(self): @@ -358,7 +354,7 @@ class ArrowTests(ReusedSQLTestCase): warn.message for warn in warns if isinstance(warn.message, UserWarning)] self.assertTrue(len(user_warns) > 0) self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) + "Attempting non-optimization" in str(user_warns[-1])) self.assertEqual(df.collect(), [Row(a={u'a': 1})]) def test_createDataFrame_fallback_disabled(self): @@ -438,12 +434,12 @@ class ArrowTests(ReusedSQLTestCase): assert_frame_equal(result_spark, result_arrow) # ensure original category elements are string - self.assertIsInstance(category_first_element, basestring) + self.assertIsInstance(category_first_element, str) # spark data frame and arrow execution mode enabled data frame type must match pandas self.assertEqual(spark_type, 'string') self.assertEqual(arrow_type, 'string') - self.assertIsInstance(arrow_first_category_element, basestring) - self.assertIsInstance(spark_first_category_element, basestring) + self.assertIsInstance(arrow_first_category_element, str) + self.assertIsInstance(spark_first_category_element, str) def 
test_createDataFrame_with_float_index(self): # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index 58bf896a10..e0b8bf45a2 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -16,8 +16,6 @@ # limitations under the License. # -import sys - from pyspark.sql import Column, Row from pyspark.sql.types import * from pyspark.sql.utils import AnalysisException @@ -109,12 +107,8 @@ class ColumnTests(ReusedSQLTestCase): self.assertRaises(TypeError, lambda: df[{}]) def test_column_name_with_non_ascii(self): - if sys.version >= '3': - columnName = "数量" - self.assertTrue(isinstance(columnName, str)) - else: - columnName = unicode("数量", "utf-8") - self.assertTrue(isinstance(columnName, unicode)) + columnName = "数量" + self.assertTrue(isinstance(columnName, str)) schema = StructType([StructField(columnName, LongType(), True)]) df = self.spark.createDataFrame([(1,)], schema) self.assertEqual(schema, df.schema) diff --git a/python/pyspark/sql/tests/test_context.py b/python/pyspark/sql/tests/test_context.py index 3b1b638ed4..ff953ba4b4 100644 --- a/python/pyspark/sql/tests/test_context.py +++ b/python/pyspark/sql/tests/test_context.py @@ -19,11 +19,7 @@ import shutil import sys import tempfile import unittest -try: - from importlib import reload # Python 3.4+ only. -except ImportError: - # Otherwise, we will stick to Python 2's built-in reload. - pass +from importlib import reload import py4j diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 52ae74df5d..7dcc19f3ba 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -167,10 +167,6 @@ class FunctionsTests(ReusedSQLTestCase): TypeError, "must be the same type", lambda: df.select(col('name').substr(0, lit(1)))) - if sys.version_info.major == 2: - self.assertRaises( - TypeError, - lambda: df.select(col('name').substr(long(0), long(1)))) for name in _string_functions.keys(): self.assertEqual( diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py index c1cb30c3ca..24a73918d8 100644 --- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py @@ -32,11 +32,6 @@ if have_pyarrow: import pyarrow as pa -# Tests below use pd.DataFrame.assign that will infer mixed types (unicode/str) for column names -# From kwargs w/ Python 2, so need to set check_column_type=False and avoid this check -_check_column_type = sys.version >= '3' - - @unittest.skipIf( not have_pandas or not have_pyarrow, pandas_requirement_message or pyarrow_requirement_message) @@ -109,7 +104,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): 'v2': [90, 100, 110] }) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_empty_group_by(self): left = self.data1 @@ -130,7 +125,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): .merge(left, right, on=['id', 'k']) \ .sort_values(by=['id', 'k']) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_mixed_scalar_udfs_followed_by_cogrouby_apply(self): df = self.spark.range(0, 10).toDF('v1') @@ -173,7 +168,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): expected = self.data1.toPandas() 
expected = expected.assign(key=expected.id % 2 == 0) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_wrong_return_type(self): # Test that we get a sensible exception invalid values passed to apply @@ -224,7 +219,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): expected = left.toPandas() if isLeft else right.toPandas() expected = expected.assign(key=expected.id) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) @staticmethod def _test_merge(left, right, output_schema='id long, k int, v int, v2 int'): @@ -246,7 +241,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): .merge(left, right, on=['id', 'k']) \ .sort_values(by=['id', 'k']) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index cc6167e619..00cc9b3a64 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -38,11 +38,6 @@ if have_pyarrow: import pyarrow as pa -# Tests below use pd.DataFrame.assign that will infer mixed types (unicode/str) for column names -# from kwargs w/ Python 2, so need to set check_column_type=False and avoid this check -_check_column_type = sys.version >= '3' - - @unittest.skipIf( not have_pandas or not have_pyarrow, pandas_requirement_message or pyarrow_requirement_message) @@ -139,9 +134,9 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result3 = df.groupby('id').apply(udf3).sort('id').toPandas() expected3 = expected1 - assert_frame_equal(expected1, result1, check_column_type=_check_column_type) - assert_frame_equal(expected2, result2, check_column_type=_check_column_type) - assert_frame_equal(expected3, result3, check_column_type=_check_column_type) + assert_frame_equal(expected1, result1) + assert_frame_equal(expected2, result2) + assert_frame_equal(expected3, result3) def test_array_type_correct(self): df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id") @@ -159,7 +154,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_register_grouped_map_udf(self): foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) @@ -181,7 +176,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(foo).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_coerce(self): df = self.data @@ -195,7 +190,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(foo).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) expected = expected.assign(v=expected.v.astype('float64')) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_complex_groupby(self): df = self.data @@ -213,7 +208,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): expected = 
pdf.groupby(pdf['id'] % 2 == 0, as_index=False).apply(normalize.func) expected = expected.sort_values(['id', 'v']).reset_index(drop=True) expected = expected.assign(norm=expected.norm.astype('float64')) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_empty_groupby(self): df = self.data @@ -231,7 +226,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): expected = normalize.func(pdf) expected = expected.sort_values(['id', 'v']).reset_index(drop=True) expected = expected.assign(norm=expected.norm.astype('float64')) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_datatype_string(self): df = self.data @@ -244,7 +239,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): result = df.groupby('id').apply(foo_udf).sort('id').toPandas() expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) def test_wrong_return_type(self): with QuietTest(self.sc): @@ -301,7 +296,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): df = self.spark.createDataFrame(dt, 'timestamp').toDF('time') foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP) result = df.groupby('time').apply(foo_udf).sort('time') - assert_frame_equal(df.toPandas(), result.toPandas(), check_column_type=_check_column_type) + assert_frame_equal(df.toPandas(), result.toPandas()) def test_udf_with_key(self): import numpy as np @@ -355,26 +350,26 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): expected1 = pdf.groupby('id', as_index=False)\ .apply(lambda x: udf1.func((x.id.iloc[0],), x))\ .sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected1, result1, check_column_type=_check_column_type) + assert_frame_equal(expected1, result1) # Test groupby expression result2 = df.groupby(df.id % 2).apply(udf1).sort('id', 'v').toPandas() expected2 = pdf.groupby(pdf.id % 2, as_index=False)\ .apply(lambda x: udf1.func((x.id.iloc[0] % 2,), x))\ .sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected2, result2, check_column_type=_check_column_type) + assert_frame_equal(expected2, result2) # Test complex groupby result3 = df.groupby(df.id, df.v % 2).apply(udf2).sort('id', 'v').toPandas() expected3 = pdf.groupby([pdf.id, pdf.v % 2], as_index=False)\ .apply(lambda x: udf2.func((x.id.iloc[0], (x.v % 2).iloc[0],), x))\ .sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected3, result3, check_column_type=_check_column_type) + assert_frame_equal(expected3, result3) # Test empty groupby result4 = df.groupby().apply(udf3).sort('id', 'v').toPandas() expected4 = udf3.func((), pdf) - assert_frame_equal(expected4, result4, check_column_type=_check_column_type) + assert_frame_equal(expected4, result4) def test_column_order(self): @@ -407,7 +402,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): .select('id', 'u', 'v').toPandas() pd_result = grouped_pdf.apply(change_col_order) expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) # Function returns a pdf with positional columns, indexed by range def range_col_order(pdf): @@ -426,7 +421,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): pd_result = grouped_pdf.apply(range_col_order) 
rename_pdf(pd_result, ['id', 'u', 'v']) expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) # Function returns a pdf with columns indexed with integers def int_index(pdf): @@ -444,7 +439,7 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): pd_result = grouped_pdf.apply(int_index) rename_pdf(pd_result, ['id', 'u', 'v']) expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - assert_frame_equal(expected, result, check_column_type=_check_column_type) + assert_frame_equal(expected, result) @pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP) def column_name_typo(pdf): diff --git a/python/pyspark/sql/tests/test_pandas_map.py b/python/pyspark/sql/tests/test_pandas_map.py index f1956a2523..02ae6a86f9 100644 --- a/python/pyspark/sql/tests/test_pandas_map.py +++ b/python/pyspark/sql/tests/test_pandas_map.py @@ -19,9 +19,6 @@ import sys import time import unittest -if sys.version >= '3': - unicode = str - from pyspark.sql.functions import pandas_udf, PandasUDFType from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py index 2d38efd39f..75e2a0929e 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py @@ -22,10 +22,6 @@ import sys import tempfile import time import unittest - -if sys.version >= '3': - unicode = str - from datetime import date, datetime from decimal import Decimal @@ -319,7 +315,7 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): StructField('str', StringType())]) def scalar_func(id): - return pd.DataFrame({'id': id, 'str': id.apply(unicode)}) + return pd.DataFrame({'id': id, 'str': id.apply(str)}) def iter_func(it): for id in it: @@ -486,14 +482,14 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): @pandas_udf(return_type) def scalar_f(id): - return pd.DataFrame({'id': id, 'str': id.apply(unicode)}) + return pd.DataFrame({'id': id, 'str': id.apply(str)}) scalar_g = pandas_udf(lambda x: x, return_type) @pandas_udf(return_type, PandasUDFType.SCALAR_ITER) def iter_f(it): for id in it: - yield pd.DataFrame({'id': id, 'str': id.apply(unicode)}) + yield pd.DataFrame({'id': id, 'str': id.apply(str)}) iter_g = pandas_udf(lambda x: x, return_type, PandasUDFType.SCALAR_ITER) @@ -915,21 +911,12 @@ class ScalarPandasUDFTests(ReusedSQLTestCase): # Check result of column 'B' must be equal to column 'A' in type and values pd.testing.assert_series_equal(result_spark["A"], result_spark["B"], check_names=False) - @unittest.skipIf(sys.version_info[:2] < (3, 5), "Type hints are supported from Python 3.5.") def test_type_annotation(self): # Regression test to check if type hints can be used. See SPARK-23569. - # Note that it throws an error during compilation in lower Python versions if 'exec' - # is not used. Also, note that we explicitly use another dictionary to avoid modifications - # in the current 'locals()'. - # - # Hyukjin: I think it's an ugly way to test issues about syntax specific in - # higher versions of Python, which we shouldn't encourage. This was the last resort - # I could come up with at that time. 
- _locals = {} - exec( - "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col", - _locals) - df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], returnType='bigint')('id')) + def noop(col: pd.Series) -> pd.Series: + return col + + df = self.spark.range(1).select(pandas_udf(f=noop, returnType='bigint')('id')) self.assertEqual(df.first()[0], 0) def test_mixed_udf(self): diff --git a/python/pyspark/sql/tests/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/test_pandas_udf_typehints.py index 2582080056..618164fa84 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_typehints.py +++ b/python/pyspark/sql/tests/test_pandas_udf_typehints.py @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import sys import unittest import inspect +from typing import Union, Iterator, Tuple from pyspark.sql.functions import mean, lit from pyspark.testing.sqlutils import ReusedSQLTestCase, \ @@ -24,209 +24,162 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase, \ pyarrow_requirement_message from pyspark.sql.pandas.typehints import infer_eval_type from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType +from pyspark.sql import Row if have_pandas: import pandas as pd + import numpy as np from pandas.util.testing import assert_frame_equal -python_requirement_message = "pandas UDF with type hints are supported with Python 3.6+." - @unittest.skipIf( - not have_pandas or not have_pyarrow or sys.version_info[:2] < (3, 6), - pandas_requirement_message or pyarrow_requirement_message or python_requirement_message) + not have_pandas or not have_pyarrow, + pandas_requirement_message or pyarrow_requirement_message) class PandasUDFTypeHintsTests(ReusedSQLTestCase): - # Note that, we should remove `exec` once we drop Python 2 in this class. 
- - def setUp(self): - self.local = {'pd': pd} - def test_type_annotation_scalar(self): - exec( - "def func(col: pd.Series) -> pd.Series: pass", - self.local) + def func(col: pd.Series) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: pass", - self.local) + def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: pass", - self.local) + def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:\n" - " pass", - self.local) + def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:\n" - " pass", - self.local) + def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) - exec( - "from typing import Union\n" - "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:\n" - " pass", - self.local) + def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR) def test_type_annotation_scalar_iter(self): - exec( - "from typing import Iterator\n" - "def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: pass", - self.local) + def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) - exec( - "from typing import Iterator, Tuple\n" - "def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:\n" - " pass", - self.local) + def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) - exec( - "from typing import Iterator, Tuple\n" - "def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: pass", - self.local) + def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) - exec( - "from typing import Iterator, Tuple, Union\n" - "def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]])" - " -> 
Iterator[pd.Series]: pass", - self.local) + def func( + iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]] + ) -> Iterator[pd.Series]: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.SCALAR_ITER) + infer_eval_type(inspect.signature(func)), PandasUDFType.SCALAR_ITER) def test_type_annotation_group_agg(self): - exec( - "def func(col: pd.Series) -> str: pass", - self.local) - self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) - exec( - "def func(col: pd.DataFrame, col1: pd.Series) -> int: pass", - self.local) + def func(col: pd.Series) -> str: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "from pyspark.sql import Row\n" - "def func(col: pd.DataFrame, *args: pd.Series) -> Row: pass", - self.local) + def func(col: pd.DataFrame, col1: pd.Series) -> int: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:\n" - " pass", - self.local) + def func(col: pd.DataFrame, *args: pd.Series) -> Row: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "def func(col: pd.Series, *, col2: pd.DataFrame) -> float:\n" - " pass", - self.local) + def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) - exec( - "from typing import Union\n" - "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:\n" - " pass", - self.local) + def func(col: pd.Series, *, col2: pd.DataFrame) -> float: + pass self.assertEqual( - infer_eval_type(inspect.signature(self.local['func'])), PandasUDFType.GROUPED_AGG) + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) + + def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float: + pass + self.assertEqual( + infer_eval_type(inspect.signature(func)), PandasUDFType.GROUPED_AGG) def test_type_annotation_negative(self): - exec( - "def func(col: str) -> pd.Series: pass", - self.local) + + def func(col: str) -> pd.Series: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*str", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col: pd.DataFrame, col1: int) -> pd.DataFrame: pass", - self.local) + def func(col: pd.DataFrame, col1: int) -> pd.DataFrame: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*int", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "from typing import Union\n" - "def func(col: Union[pd.DataFrame, str], col1: int) -> pd.DataFrame: pass", - self.local) + def func(col: Union[pd.DataFrame, str], col1: int) -> pd.DataFrame: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*str", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "from typing import Tuple\n" - 
"def func(col: pd.Series) -> Tuple[pd.DataFrame]: pass", - self.local) + def func(col: pd.Series) -> Tuple[pd.DataFrame]: + pass self.assertRaisesRegex( NotImplementedError, "Unsupported signature.*Tuple", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col, *args: pd.Series) -> pd.Series: pass", - self.local) + def func(col, *args: pd.Series) -> pd.Series: + pass self.assertRaisesRegex( ValueError, "should be specified.*Series", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame):\n" - " pass", - self.local) + def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame): + pass self.assertRaisesRegex( ValueError, "should be specified.*Series", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) - exec( - "def func(col: pd.Series, *, col2) -> pd.DataFrame:\n" - " pass", - self.local) + def func(col: pd.Series, *, col2) -> pd.DataFrame: + pass self.assertRaisesRegex( ValueError, "should be specified.*Series", - infer_eval_type, inspect.signature(self.local['func'])) + infer_eval_type, inspect.signature(func)) def test_scalar_udf_type_hint(self): df = self.spark.range(10).selectExpr("id", "id as v") - exec( - "import typing\n" - "def plus_one(v: typing.Union[pd.Series, pd.DataFrame]) -> pd.Series:\n" - " return v + 1", - self.local) - - plus_one = pandas_udf("long")(self.local["plus_one"]) + def plus_one(v: Union[pd.Series, pd.DataFrame]) -> pd.Series: + return v + 1 + plus_one = pandas_udf("long")(plus_one) actual = df.select(plus_one(df.v).alias("plus_one")) expected = df.selectExpr("(v + 1) as plus_one") assert_frame_equal(expected.toPandas(), actual.toPandas()) @@ -234,14 +187,11 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_scalar_iter_udf_type_hint(self): df = self.spark.range(10).selectExpr("id", "id as v") - exec( - "import typing\n" - "def plus_one(itr: typing.Iterator[pd.Series]) -> typing.Iterator[pd.Series]:\n" - " for s in itr:\n" - " yield s + 1", - self.local) + def plus_one(itr: Iterator[pd.Series]) -> Iterator[pd.Series]: + for s in itr: + yield s + 1 - plus_one = pandas_udf("long")(self.local["plus_one"]) + plus_one = pandas_udf("long")(plus_one) actual = df.select(plus_one(df.v).alias("plus_one")) expected = df.selectExpr("(v + 1) as plus_one") @@ -249,13 +199,11 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_group_agg_udf_type_hint(self): df = self.spark.range(10).selectExpr("id", "id as v") - exec( - "import numpy as np\n" - "def weighted_mean(v: pd.Series, w: pd.Series) -> float:\n" - " return np.average(v, weights=w)", - self.local) - weighted_mean = pandas_udf("double")(self.local["weighted_mean"]) + def weighted_mean(v: pd.Series, w: pd.Series) -> float: + return np.average(v, weights=w) + + weighted_mean = pandas_udf("double")(weighted_mean) actual = df.groupby('id').agg(weighted_mean(df.v, lit(1.0))).sort('id') expected = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id') @@ -263,12 +211,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_ignore_type_hint_in_group_apply_in_pandas(self): df = self.spark.range(10) - exec( - "def pandas_plus_one(v: pd.DataFrame) -> pd.DataFrame:\n" - " return v + 1", - self.local) - pandas_plus_one = self.local["pandas_plus_one"] + def pandas_plus_one(v: pd.DataFrame) -> pd.DataFrame: + return v + 1 actual = 
df.groupby('id').applyInPandas(pandas_plus_one, schema=df.schema).sort('id') expected = df.selectExpr("id + 1 as id") @@ -276,12 +221,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_ignore_type_hint_in_cogroup_apply_in_pandas(self): df = self.spark.range(10) - exec( - "def pandas_plus_one(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:\n" - " return left + 1", - self.local) - pandas_plus_one = self.local["pandas_plus_one"] + def pandas_plus_one(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame: + return left + 1 actual = df.groupby('id').cogroup( self.spark.range(10).groupby("id") @@ -291,13 +233,9 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase): def test_ignore_type_hint_in_map_in_pandas(self): df = self.spark.range(10) - exec( - "from typing import Iterator\n" - "def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:\n" - " return map(lambda v: v + 1, iter)", - self.local) - pandas_plus_one = self.local["pandas_plus_one"] + def pandas_plus_one(iter: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]: + return map(lambda v: v + 1, iter) actual = df.mapInPandas(pandas_plus_one, schema=df.schema) expected = df.selectExpr("id + 1 as id") diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 016cafd669..051c8bde50 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -56,7 +56,7 @@ class TypesTests(ReusedSQLTestCase): self.assertEqual(10, df3.count()) def test_apply_schema_to_dict_and_rows(self): - schema = StructType().add("b", StringType()).add("a", IntegerType()) + schema = StructType().add("a", IntegerType()).add("b", StringType()) input = [{"a": 1}, {"b": "coffee"}] rdd = self.sc.parallelize(input) for verify in [False, True]: @@ -72,7 +72,6 @@ class TypesTests(ReusedSQLTestCase): self.assertEqual(10, df4.count()) def test_create_dataframe_schema_mismatch(self): - input = [Row(a=1)] rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i)) schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())]) df = self.spark.createDataFrame(rdd, schema) @@ -540,7 +539,6 @@ class TypesTests(ReusedSQLTestCase): self.assertEqual(_infer_type(2**61), LongType()) self.assertEqual(_infer_type(2**71), LongType()) - @unittest.skipIf(sys.version < "3", "only Python 3 infers bytes as binary type") def test_infer_binary_type(self): binaryrow = [Row(f1='a', f2=b"abcd")] df = self.sc.parallelize(binaryrow).toDF() @@ -665,10 +663,6 @@ class TypesTests(ReusedSQLTestCase): supported_string_types += ['u'] # test unicode assertCollectSuccess('u', u'a') - if sys.version_info[0] < 3: - supported_string_types += ['c'] - # test string - assertCollectSuccess('c', 'a') # supported float and double # @@ -721,12 +715,8 @@ class TypesTests(ReusedSQLTestCase): # # Keys in _array_type_mappings is a complete list of all supported types, # and types not in _array_type_mappings are considered unsupported. - # `array.typecodes` are not supported in python 2. - if sys.version_info[0] < 3: - all_types = set(['c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd']) - else: - # PyPy seems not having array.typecodes. - all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd']) + # PyPy seems not having array.typecodes. 
+ all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd']) unsupported_types = all_types - set(supported_types) # test unsupported types for t in unsupported_types: @@ -767,10 +757,7 @@ class DataTypeTests(unittest.TestCase): self.assertEqual(repr(row), "") # test __repr__ with unicode values - if sys.version_info.major >= 3: - self.assertEqual(repr(Row("数", "量")), "") - else: - self.assertEqual(repr(Row(u"数", u"量")), r"") + self.assertEqual(repr(Row("数", "量")), "") def test_empty_row(self): row = Row() @@ -888,7 +875,6 @@ class DataTypeVerificationTests(unittest.TestCase): ({"s": "a", "f": 1.0}, schema), (Row(s="a", i=1), schema), (Row(s="a", i=None), schema), - (Row(s="a", i=1, f=1.0), schema), (["a", 1], schema), (["a", None], schema), (("a", 1), schema), @@ -973,18 +959,13 @@ class DataTypeVerificationTests(unittest.TestCase): with self.assertRaises(exp, msg=msg): _make_type_verifier(data_type, nullable=False)(obj) - @unittest.skipIf(sys.version_info[:2] < (3, 6), "Create Row without sorting fields") def test_row_without_field_sorting(self): - sorting_enabled_tmp = Row._row_field_sorting_enabled - Row._row_field_sorting_enabled = False - r = Row(b=1, a=2) TestRow = Row("b", "a") expected = TestRow(1, 2) self.assertEqual(r, expected) self.assertEqual(repr(r), "Row(b=1, a=2)") - Row._row_field_sorting_enabled = sorting_enabled_tmp if __name__ == "__main__": diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 320a68dffe..cc08482c73 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -15,7 +15,6 @@ # limitations under the License. # -import os import sys import decimal import time @@ -26,11 +25,6 @@ import re import base64 from array import array import ctypes -import warnings - -if sys.version >= "3": - long = int - basestring = unicode = str from py4j.protocol import register_input_converter from py4j.java_gateway import JavaClass @@ -409,9 +403,7 @@ class StructField(DataType): """ assert isinstance(dataType, DataType),\ "dataType %s should be an instance of %s" % (dataType, DataType) - assert isinstance(name, basestring), "field name %s should be string" % (name) - if not isinstance(name, str): - name = name.encode('utf-8') + assert isinstance(name, str), "field name %s should be a string" % (name) self.name = name self.dataType = dataType self.nullable = nullable @@ -613,8 +605,6 @@ class StructType(DataType): else: if isinstance(obj, dict): return tuple(obj.get(n) for n in self.names) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - return tuple(obj[n] for n in self.names) elif isinstance(obj, (list, tuple)): return tuple(obj) elif hasattr(obj, "__dict__"): @@ -904,19 +894,9 @@ _type_mappings = { datetime.date: DateType, datetime.datetime: TimestampType, datetime.time: TimestampType, + bytes: BinaryType, } -if sys.version < "3": - _type_mappings.update({ - unicode: StringType, - long: LongType, - }) - -if sys.version >= "3": - _type_mappings.update({ - bytes: BinaryType, - }) - # Mapping Python array types to Spark SQL DataType # We should be careful here. The size of these types in python depends on C # implementation. 
We need to make sure that this conversion does not lose any @@ -990,20 +970,6 @@ for _typecode in _array_unsigned_int_typecode_ctype_mappings.keys(): if sys.version_info[0] < 4: _array_type_mappings['u'] = StringType -# Type code 'c' are only available at python 2 -if sys.version_info[0] < 3: - _array_type_mappings['c'] = StringType - -# SPARK-21465: -# In python2, array of 'L' happened to be mistakenly, just partially supported. To -# avoid breaking user's code, we should keep this partial support. Below is a -# dirty hacking to keep this partial support and pass the unit test. -import platform -if sys.version_info[0] < 3 and platform.python_implementation() != 'PyPy': - if 'L' not in _array_type_mappings.keys(): - _array_type_mappings['L'] = LongType - _array_unsigned_int_typecode_ctype_mappings['L'] = ctypes.c_uint - def _infer_type(obj): """Infer the DataType from obj @@ -1187,14 +1153,14 @@ def _create_converter(dataType): _acceptable_types = { BooleanType: (bool,), - ByteType: (int, long), - ShortType: (int, long), - IntegerType: (int, long), - LongType: (int, long), + ByteType: (int,), + ShortType: (int,), + IntegerType: (int,), + LongType: (int,), FloatType: (float,), DoubleType: (float,), DecimalType: (decimal.Decimal,), - StringType: (str, unicode), + StringType: (str,), BinaryType: (bytearray, bytes), DateType: (datetime.date, datetime.datetime), TimestampType: (datetime.datetime,), @@ -1376,10 +1342,6 @@ def _make_type_verifier(dataType, nullable=True, name=None): if isinstance(obj, dict): for f, verifier in verifiers: verifier(obj.get(f)) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - # the order in obj could be different than dataType.fields - for f, verifier in verifiers: - verifier(obj[f]) elif isinstance(obj, (tuple, list)): if len(obj) != len(verifiers): raise ValueError( @@ -1438,21 +1400,11 @@ class Row(tuple): NOTE: As of Spark 3.0.0, Rows created from named arguments no longer have field names sorted alphabetically and will be ordered in the position as - entered. To enable sorting for Rows compatible with Spark 2.x, set the - environment variable "PYSPARK_ROW_FIELD_SORTING_ENABLED" to "true". This - option is deprecated and will be removed in future versions of Spark. For - Python versions < 3.6, the order of named arguments is not guaranteed to - be the same as entered, see https://www.python.org/dev/peps/pep-0468. In - this case, a warning will be issued and the Row will fallback to sort the - field names automatically. - - NOTE: Examples with Row in pydocs are run with the environment variable - "PYSPARK_ROW_FIELD_SORTING_ENABLED" set to "true" which results in output - where fields are sorted. + entered. >>> row = Row(name="Alice", age=11) >>> row - Row(age=11, name='Alice') + Row(name='Alice', age=11) >>> row['name'], row['age'] ('Alice', 11) >>> row.name, row.age @@ -1476,47 +1428,22 @@ class Row(tuple): Row(name='Alice', age=11) This form can also be used to create rows as tuple values, i.e. with unnamed - fields. Beware that such Row objects have different equality semantics: + fields. 
>>> row1 = Row("Alice", 11) >>> row2 = Row(name="Alice", age=11) >>> row1 == row2 - False - >>> row3 = Row(a="Alice", b=11) - >>> row1 == row3 True """ - # Remove after Python < 3.6 dropped, see SPARK-29748 - _row_field_sorting_enabled = \ - os.environ.get('PYSPARK_ROW_FIELD_SORTING_ENABLED', 'false').lower() == 'true' - - if _row_field_sorting_enabled: - warnings.warn("The environment variable 'PYSPARK_ROW_FIELD_SORTING_ENABLED' " - "is deprecated and will be removed in future versions of Spark") - def __new__(cls, *args, **kwargs): if args and kwargs: raise ValueError("Can not use both args " "and kwargs to create Row") if kwargs: - if not Row._row_field_sorting_enabled and sys.version_info[:2] < (3, 6): - warnings.warn("To use named arguments for Python version < 3.6, Row fields will be " - "automatically sorted. This warning can be skipped by setting the " - "environment variable 'PYSPARK_ROW_FIELD_SORTING_ENABLED' to 'true'.") - Row._row_field_sorting_enabled = True - # create row objects - if Row._row_field_sorting_enabled: - # Remove after Python < 3.6 dropped, see SPARK-29748 - names = sorted(kwargs.keys()) - row = tuple.__new__(cls, [kwargs[n] for n in names]) - row.__fields__ = names - row.__from_dict__ = True - else: - row = tuple.__new__(cls, list(kwargs.values())) - row.__fields__ = list(kwargs.keys()) - + row = tuple.__new__(cls, list(kwargs.values())) + row.__fields__ = list(kwargs.keys()) return row else: # create row class or objects @@ -1537,7 +1464,7 @@ class Row(tuple): >>> Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11} True >>> row = Row(key=1, value=Row(name='a', age=2)) - >>> row.asDict() == {'key': 1, 'value': Row(age=2, name='a')} + >>> row.asDict() == {'key': 1, 'value': Row(name='a', age=2)} True >>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}} True @@ -1600,7 +1527,7 @@ class Row(tuple): raise AttributeError(item) def __setattr__(self, key, value): - if key != '__fields__' and key != "__from_dict__": + if key != '__fields__': raise Exception("Row is read-only") self.__dict__[key] = value diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index da68583b04..100481cf12 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -21,7 +21,7 @@ import functools import sys from pyspark import SparkContext, since -from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix +from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string from pyspark.sql.pandas.types import to_arrow_type @@ -232,7 +232,6 @@ class UDFRegistration(object): def __init__(self, sparkSession): self.sparkSession = sparkSession - @ignore_unicode_prefix @since("1.3.1") def register(self, name, f, returnType=None): """Register a Python function (including lambda function) or a user-defined function @@ -261,10 +260,10 @@ class UDFRegistration(object): >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x)) >>> spark.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] + [Row(stringLengthString(test)='4')] >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] + [Row(stringLengthString(text)='3')] >>> from pyspark.sql.types import IntegerType >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) @@ -349,7 +348,6 @@ class 
UDFRegistration(object): self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf) return return_udf - @ignore_unicode_prefix @since(2.3) def registerJavaFunction(self, name, javaClassName, returnType=None): """Register a Java user-defined function as a SQL function. @@ -389,7 +387,6 @@ class UDFRegistration(object): jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) - @ignore_unicode_prefix @since(2.3) def registerJavaUDAF(self, name, javaClassName): """Register a Java user-defined aggregate function as a SQL function. @@ -403,7 +400,7 @@ class UDFRegistration(object): >>> df.createOrReplaceTempView("df") >>> q = "SELECT name, javaUDAF(id) as avg from df group by name order by name desc" >>> spark.sql(q).collect() # doctest: +SKIP - [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] + [Row(name='b', avg=102.0), Row(name='a', avg=102.0)] """ self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) @@ -419,9 +416,6 @@ def _test(): .appName("sql.udf tests")\ .getOrCreate() globs['spark'] = spark - # Hack to skip the unit tests in register. These are currently being tested in proper tests. - # We should reenable this test once we completely drop Python 2. - del pyspark.sql.udf.UDFRegistration.register (failure_count, test_count) = doctest.testmod( pyspark.sql.udf, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 1d5bc49d25..bd76d88005 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -16,22 +16,9 @@ # import py4j -import sys from pyspark import SparkContext -if sys.version_info.major >= 3: - unicode = str - # Disable exception chaining (PEP 3134) in captured exceptions - # in order to hide JVM stacktace. - exec(""" -def raise_from(e): - raise e from None -""") -else: - def raise_from(e): - raise e - class CapturedException(Exception): def __init__(self, desc, stackTrace, cause=None): @@ -45,11 +32,7 @@ class CapturedException(Exception): desc = self.desc if debug_enabled: desc = desc + "\n\nJVM stacktrace:\n%s" % self.stackTrace - # encode unicode instance for python2 for human readable description - if sys.version_info.major < 3 and isinstance(desc, unicode): - return str(desc.encode('utf-8')) - else: - return str(desc) + return str(desc) class AnalysisException(CapturedException): @@ -131,7 +114,7 @@ def capture_sql_exception(f): if not isinstance(converted, UnknownException): # Hide where the exception came from that shows a non-Pythonic # JVM exception message. - raise_from(converted) + raise converted from None else: raise return deco diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index 6199611940..170f0c0ef7 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from py4j.java_gateway import java_import, is_instance_of from pyspark import RDD, SparkConf diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 60562a6c92..000318588e 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -21,11 +21,6 @@ import time from itertools import chain from datetime import datetime -if sys.version < "3": - from itertools import imap as map, ifilter as filter -else: - long = int - from py4j.protocol import Py4JJavaError from pyspark import RDD @@ -404,7 +399,7 @@ class DStream(object): """ if isinstance(timestamp, datetime): timestamp = time.mktime(timestamp.timetuple()) - return self._sc._jvm.Time(long(timestamp * 1000)) + return self._sc._jvm.Time(int(timestamp * 1000)) def slice(self, begin, end): """ diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index 8f419a5e84..d8aa5f9318 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -14,10 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -from __future__ import print_function -import json - from pyspark.java_gateway import local_connect_and_auth from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer diff --git a/python/pyspark/testing/sqlutils.py b/python/pyspark/testing/sqlutils.py index 085fce6daa..e85cae7dda 100644 --- a/python/pyspark/testing/sqlutils.py +++ b/python/pyspark/testing/sqlutils.py @@ -24,7 +24,6 @@ from contextlib import contextmanager from pyspark.sql import SparkSession from pyspark.sql.types import ArrayType, DoubleType, UserDefinedType, Row from pyspark.testing.utils import ReusedPySparkTestCase -from pyspark.util import _exception_message pandas_requirement_message = None @@ -33,7 +32,7 @@ try: require_minimum_pandas_version() except ImportError as e: # If Pandas version requirement is not satisfied, skip related tests. - pandas_requirement_message = _exception_message(e) + pandas_requirement_message = str(e) pyarrow_requirement_message = None try: @@ -41,14 +40,14 @@ try: require_minimum_pyarrow_version() except ImportError as e: # If Arrow version requirement is not satisfied, skip related tests. 
- pyarrow_requirement_message = _exception_message(e) + pyarrow_requirement_message = str(e) test_not_compiled_message = None try: from pyspark.sql.utils import require_test_compiled require_test_compiled() except Exception as e: - test_not_compiled_message = _exception_message(e) + test_not_compiled_message = str(e) have_pandas = pandas_requirement_message is None have_pyarrow = pyarrow_requirement_message is None diff --git a/python/pyspark/tests/test_profiler.py b/python/pyspark/tests/test_profiler.py index 04ca5a3896..dbce72a0d3 100644 --- a/python/pyspark/tests/test_profiler.py +++ b/python/pyspark/tests/test_profiler.py @@ -19,15 +19,11 @@ import os import sys import tempfile import unittest +from io import StringIO from pyspark import SparkConf, SparkContext, BasicProfiler from pyspark.testing.utils import PySparkTestCase -if sys.version >= "3": - from io import StringIO -else: - from StringIO import StringIO - class ProfilerTests(PySparkTestCase): diff --git a/python/pyspark/tests/test_rdd.py b/python/pyspark/tests/test_rdd.py index 6c5b818056..1a580e27ea 100644 --- a/python/pyspark/tests/test_rdd.py +++ b/python/pyspark/tests/test_rdd.py @@ -32,9 +32,6 @@ from pyspark.serializers import CloudPickleSerializer, BatchedSerializer, Pickle MarshalSerializer, UTF8Deserializer, NoOpSerializer from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME, QuietTest -if sys.version_info[0] >= 3: - xrange = range - global_func = lambda: "Hi" @@ -193,15 +190,13 @@ class RDDTests(ReusedPySparkTestCase): def test_sampling_default_seed(self): # Test for SPARK-3995 (default seed setting) - data = self.sc.parallelize(xrange(1000), 1) + data = self.sc.parallelize(range(1000), 1) subset = data.takeSample(False, 10) self.assertEqual(len(subset), 10) def test_aggregate_mutable_zero_value(self): # Test for SPARK-9021; uses aggregate and treeAggregate to build dict # representing a counter of ints - # NOTE: dict is used instead of collections.Counter for Python 2.6 - # compatibility from collections import defaultdict # Show that single or multiple partitions work @@ -262,8 +257,6 @@ class RDDTests(ReusedPySparkTestCase): def test_fold_mutable_zero_value(self): # Test for SPARK-9021; uses fold to merge an RDD of dict counters into # a single dict - # NOTE: dict is used instead of collections.Counter for Python 2.6 - # compatibility from collections import defaultdict counts1 = defaultdict(int, dict((i, 1) for i in range(10))) @@ -439,7 +432,7 @@ class RDDTests(ReusedPySparkTestCase): def test_large_closure(self): N = 200000 - data = [float(i) for i in xrange(N)] + data = [float(i) for i in range(N)] rdd = self.sc.parallelize(range(1), 1).map(lambda x: len(data)) self.assertEqual(N, rdd.first()) # regression test for SPARK-6886 @@ -464,8 +457,8 @@ class RDDTests(ReusedPySparkTestCase): def test_zip_with_different_object_sizes(self): # regress test for SPARK-5973 - a = self.sc.parallelize(xrange(10000)).map(lambda i: '*' * i) - b = self.sc.parallelize(xrange(10000, 20000)).map(lambda i: '*' * i) + a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i) + b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i) self.assertEqual(10000, a.zip(b).count()) def test_zip_with_different_number_of_items(self): @@ -487,7 +480,7 @@ class RDDTests(ReusedPySparkTestCase): self.assertRaises(Exception, lambda: a.zip(b).count()) def test_count_approx_distinct(self): - rdd = self.sc.parallelize(xrange(1000)) + rdd = self.sc.parallelize(range(1000)) self.assertTrue(950 < rdd.countApproxDistinct(0.03) 
< 1050) self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.03) < 1050) self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.03) < 1050) @@ -641,7 +634,7 @@ class RDDTests(ReusedPySparkTestCase): def test_external_group_by_key(self): self.sc._conf.set("spark.python.worker.memory", "1m") N = 2000001 - kv = self.sc.parallelize(xrange(N)).map(lambda x: (x % 3, x)) + kv = self.sc.parallelize(range(N)).map(lambda x: (x % 3, x)) gkv = kv.groupByKey().cache() self.assertEqual(3, gkv.count()) filtered = gkv.filter(lambda kv: kv[0] == 1) @@ -698,7 +691,7 @@ class RDDTests(ReusedPySparkTestCase): # Regression test for SPARK-6294 def test_take_on_jrdd(self): - rdd = self.sc.parallelize(xrange(1 << 20)).map(lambda x: str(x)) + rdd = self.sc.parallelize(range(1 << 20)).map(lambda x: str(x)) rdd._jrdd.first() def test_sortByKey_uses_all_partitions_not_only_first_and_last(self): diff --git a/python/pyspark/tests/test_readwrite.py b/python/pyspark/tests/test_readwrite.py index 734b7e4789..faa006c7d8 100644 --- a/python/pyspark/tests/test_readwrite.py +++ b/python/pyspark/tests/test_readwrite.py @@ -38,104 +38,6 @@ class InputFormatTests(ReusedPySparkTestCase): ReusedPySparkTestCase.tearDownClass() shutil.rmtree(cls.tempdir.name) - @unittest.skipIf(sys.version >= "3", "serialize array of byte") - def test_sequencefiles(self): - basepath = self.tempdir.name - ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.assertEqual(ints, ei) - - doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/", - "org.apache.hadoop.io.DoubleWritable", - "org.apache.hadoop.io.Text").collect()) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] - self.assertEqual(doubles, ed) - - bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BytesWritable").collect()) - ebs = [(1, bytearray('aa', 'utf-8')), - (1, bytearray('aa', 'utf-8')), - (2, bytearray('aa', 'utf-8')), - (2, bytearray('bb', 'utf-8')), - (2, bytearray('bb', 'utf-8')), - (3, bytearray('cc', 'utf-8'))] - self.assertEqual(bytes, ebs) - - text = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sftext/", - "org.apache.hadoop.io.Text", - "org.apache.hadoop.io.Text").collect()) - et = [(u'1', u'aa'), - (u'1', u'aa'), - (u'2', u'aa'), - (u'2', u'bb'), - (u'2', u'bb'), - (u'3', u'cc')] - self.assertEqual(text, et) - - bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BooleanWritable").collect()) - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] - self.assertEqual(bools, eb) - - nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BooleanWritable").collect()) - en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)] - self.assertEqual(nulls, en) - - maps = self.sc.sequenceFile(basepath + "/sftestdata/sfmap/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable").collect() - em = [(1, {}), - (1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (2, {1.0: u'cc'}), - (3, {2.0: u'dd'})] - for v in maps: - self.assertTrue(v in em) - - # arrays get pickled to tuples by default - tuples = sorted(self.sc.sequenceFile( - basepath + 
"/sftestdata/sfarray/", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable").collect()) - et = [(1, ()), - (2, (3.0, 4.0, 5.0)), - (3, (4.0, 5.0, 6.0))] - self.assertEqual(tuples, et) - - # with custom converters, primitive arrays can stay as arrays - arrays = sorted(self.sc.sequenceFile( - basepath + "/sftestdata/sfarray/", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) - ea = [(1, array('d')), - (2, array('d', [3.0, 4.0, 5.0])), - (3, array('d', [4.0, 5.0, 6.0]))] - self.assertEqual(arrays, ea) - - clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable").collect()) - cname = u'org.apache.spark.api.python.TestWritable' - ec = [(u'1', {u'__class__': cname, u'double': 1.0, u'int': 1, u'str': u'test1'}), - (u'2', {u'__class__': cname, u'double': 2.3, u'int': 2, u'str': u'test2'}), - (u'3', {u'__class__': cname, u'double': 3.1, u'int': 3, u'str': u'test3'}), - (u'4', {u'__class__': cname, u'double': 4.2, u'int': 4, u'str': u'test4'}), - (u'5', {u'__class__': cname, u'double': 5.5, u'int': 5, u'str': u'test56'})] - self.assertEqual(clazz, ec) - - unbatched_clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable", - ).collect()) - self.assertEqual(unbatched_clazz, ec) - def test_oldhadoop(self): basepath = self.tempdir.name ints = sorted(self.sc.hadoopFile(basepath + "/sftestdata/sfint/", @@ -249,51 +151,6 @@ class OutputFormatTests(ReusedPySparkTestCase): def tearDown(self): shutil.rmtree(self.tempdir.name, ignore_errors=True) - @unittest.skipIf(sys.version >= "3", "serialize array of byte") - def test_sequencefiles(self): - basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.sc.parallelize(ei).saveAsSequenceFile(basepath + "/sfint/") - ints = sorted(self.sc.sequenceFile(basepath + "/sfint/").collect()) - self.assertEqual(ints, ei) - - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] - self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/") - doubles = sorted(self.sc.sequenceFile(basepath + "/sfdouble/").collect()) - self.assertEqual(doubles, ed) - - ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))] - self.sc.parallelize(ebs).saveAsSequenceFile(basepath + "/sfbytes/") - bytes = sorted(self.sc.sequenceFile(basepath + "/sfbytes/").collect()) - self.assertEqual(bytes, ebs) - - et = [(u'1', u'aa'), - (u'2', u'bb'), - (u'3', u'cc')] - self.sc.parallelize(et).saveAsSequenceFile(basepath + "/sftext/") - text = sorted(self.sc.sequenceFile(basepath + "/sftext/").collect()) - self.assertEqual(text, et) - - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] - self.sc.parallelize(eb).saveAsSequenceFile(basepath + "/sfbool/") - bools = sorted(self.sc.sequenceFile(basepath + "/sfbool/").collect()) - self.assertEqual(bools, eb) - - en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)] - self.sc.parallelize(en).saveAsSequenceFile(basepath + "/sfnull/") - nulls = sorted(self.sc.sequenceFile(basepath + "/sfnull/").collect()) - self.assertEqual(nulls, en) - - em = [(1, {}), - (1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (2, {1.0: u'cc'}), - (3, {2.0: u'dd'})] - 
self.sc.parallelize(em).saveAsSequenceFile(basepath + "/sfmap/") - maps = self.sc.sequenceFile(basepath + "/sfmap/").collect() - for v in maps: - self.assertTrue(v, em) - def test_oldhadoop(self): basepath = self.tempdir.name dict_data = [(1, {}), @@ -361,46 +218,6 @@ class OutputFormatTests(ReusedPySparkTestCase): conf=input_conf).collect()) self.assertEqual(new_dataset, data) - @unittest.skipIf(sys.version >= "3", "serialize of array") - def test_newhadoop_with_array(self): - basepath = self.tempdir.name - # use custom ArrayWritable types and converters to handle arrays - array_data = [(1, array('d')), - (1, array('d', [1.0, 2.0, 3.0])), - (2, array('d', [3.0, 4.0, 5.0]))] - self.sc.parallelize(array_data).saveAsNewAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - result = sorted(self.sc.newAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) - self.assertEqual(result, array_data) - - conf = { - "mapreduce.job.outputformat.class": - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/newdataset/" - } - self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset( - conf, - valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - input_conf = {"mapreduce.input.fileinputformat.inputdir": basepath + "/newdataset/"} - new_dataset = sorted(self.sc.newAPIHadoopRDD( - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter", - conf=input_conf).collect()) - self.assertEqual(new_dataset, array_data) - def test_newolderror(self): basepath = self.tempdir.name rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)) diff --git a/python/pyspark/tests/test_shuffle.py b/python/pyspark/tests/test_shuffle.py index d50ba632d6..434414618e 100644 --- a/python/pyspark/tests/test_shuffle.py +++ b/python/pyspark/tests/test_shuffle.py @@ -23,15 +23,12 @@ from py4j.protocol import Py4JJavaError from pyspark import shuffle, PickleSerializer, SparkConf, SparkContext from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter -if sys.version_info[0] >= 3: - xrange = range - class MergerTests(unittest.TestCase): def setUp(self): self.N = 1 << 12 - self.l = [i for i in xrange(self.N)] + self.l = [i for i in range(self.N)] self.data = list(zip(self.l, self.l)) self.agg = Aggregator(lambda x: [x], lambda x, y: x.append(y) or x, @@ -42,26 +39,26 @@ class MergerTests(unittest.TestCase): m.mergeValues(self.data) self.assertEqual(m.spills, 0) self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) + sum(range(self.N))) m = ExternalMerger(self.agg, 1000) m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), self.data)) self.assertEqual(m.spills, 0) self.assertEqual(sum(sum(v) for k, v in 
m.items()), - sum(xrange(self.N))) + sum(range(self.N))) def test_medium_dataset(self): m = ExternalMerger(self.agg, 20) m.mergeValues(self.data) self.assertTrue(m.spills >= 1) self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) + sum(range(self.N))) m = ExternalMerger(self.agg, 10) m.mergeCombiners(map(lambda x_y2: (x_y2[0], [x_y2[1]]), self.data * 3)) self.assertTrue(m.spills >= 1) self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N)) * 3) + sum(range(self.N)) * 3) def test_huge_dataset(self): m = ExternalMerger(self.agg, 5, partitions=3) diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py index 90e4bcdfad..8c2bedbe4e 100644 --- a/python/pyspark/tests/test_taskcontext.py +++ b/python/pyspark/tests/test_taskcontext.py @@ -26,9 +26,6 @@ import unittest from pyspark import SparkConf, SparkContext, TaskContext, BarrierTaskContext from pyspark.testing.utils import PySparkTestCase, SPARK_HOME -if sys.version_info[0] >= 3: - xrange = range - class TaskContextTests(PySparkTestCase): @@ -251,9 +248,9 @@ class TaskContextTestsWithWorkerReuse(unittest.TestCase): def test_task_context_correct_with_python_worker_reuse(self): """Verify the task context correct when reused python worker""" # start a normal job first to start all workers and get all worker pids - worker_pids = self.sc.parallelize(xrange(2), 2).map(lambda x: os.getpid()).collect() + worker_pids = self.sc.parallelize(range(2), 2).map(lambda x: os.getpid()).collect() # the worker will reuse in this barrier job - rdd = self.sc.parallelize(xrange(10), 2) + rdd = self.sc.parallelize(range(10), 2) def context(iterator): tp = TaskContext.get().partitionId() diff --git a/python/pyspark/tests/test_util.py b/python/pyspark/tests/test_util.py index 81bfb66e70..511d62a51f 100644 --- a/python/pyspark/tests/test_util.py +++ b/python/pyspark/tests/test_util.py @@ -61,14 +61,12 @@ class KeywordOnlyTests(unittest.TestCase): class UtilTests(PySparkTestCase): - def test_py4j_exception_message(self): - from pyspark.util import _exception_message - + def test_py4j_str(self): with self.assertRaises(Py4JJavaError) as context: # This attempts java.lang.String(null) which throws an NPE. 
self.sc._jvm.java.lang.String(None) - self.assertTrue('NullPointerException' in _exception_message(context.exception)) + self.assertTrue('NullPointerException' in str(context.exception)) def test_parsing_version_string(self): from pyspark.util import VersionUtils diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index dba9298ee1..3b1848dcfd 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -32,9 +32,6 @@ from py4j.protocol import Py4JJavaError from pyspark import SparkConf, SparkContext from pyspark.testing.utils import ReusedPySparkTestCase, PySparkTestCase, QuietTest -if sys.version_info[0] >= 3: - xrange = range - class WorkerTests(ReusedPySparkTestCase): def test_cancel_task(self): @@ -88,13 +85,13 @@ class WorkerTests(ReusedPySparkTestCase): self.fail("daemon had been killed") # run a normal job - rdd = self.sc.parallelize(xrange(100), 1) + rdd = self.sc.parallelize(range(100), 1) self.assertEqual(100, rdd.map(str).count()) def test_after_exception(self): def raise_exception(_): raise Exception() - rdd = self.sc.parallelize(xrange(100), 1) + rdd = self.sc.parallelize(range(100), 1) with QuietTest(self.sc): self.assertRaises(Exception, lambda: rdd.foreach(raise_exception)) self.assertEqual(100, rdd.map(str).count()) @@ -110,22 +107,22 @@ class WorkerTests(ReusedPySparkTestCase): with QuietTest(self.sc): self.assertRaises(Exception, lambda: filtered_data.count()) - rdd = self.sc.parallelize(xrange(100), 1) + rdd = self.sc.parallelize(range(100), 1) self.assertEqual(100, rdd.map(str).count()) def test_accumulator_when_reuse_worker(self): from pyspark.accumulators import INT_ACCUMULATOR_PARAM acc1 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM) - self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc1.add(x)) + self.sc.parallelize(range(100), 20).foreach(lambda x: acc1.add(x)) self.assertEqual(sum(range(100)), acc1.value) acc2 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM) - self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc2.add(x)) + self.sc.parallelize(range(100), 20).foreach(lambda x: acc2.add(x)) self.assertEqual(sum(range(100)), acc2.value) self.assertEqual(sum(range(100)), acc1.value) def test_reuse_worker_after_take(self): - rdd = self.sc.parallelize(xrange(100000), 1) + rdd = self.sc.parallelize(range(100000), 1) self.assertEqual(0, rdd.first()) def count(): @@ -160,17 +157,13 @@ class WorkerTests(ReusedPySparkTestCase): self.sc.parallelize([1]).map(lambda x: f()).count() except Py4JJavaError as e: - if sys.version_info.major < 3: - # we have to use unicode here to avoid UnicodeDecodeError - self.assertRegexpMatches(unicode(e).encode("utf-8"), "exception with 中") - else: - self.assertRegexpMatches(str(e), "exception with 中") + self.assertRegexpMatches(str(e), "exception with 中") class WorkerReuseTest(PySparkTestCase): - def test_reuse_worker_of_parallelize_xrange(self): - rdd = self.sc.parallelize(xrange(20), 8) + def test_reuse_worker_of_parallelize_range(self): + rdd = self.sc.parallelize(range(20), 8) previous_pids = rdd.map(lambda x: os.getpid()).collect() current_pids = rdd.map(lambda x: os.getpid()).collect() for pid in current_pids: @@ -189,7 +182,7 @@ class WorkerMemoryTest(unittest.TestCase): self.sc = SparkContext('local[4]', class_name, conf=conf) def test_memory_limit(self): - rdd = self.sc.parallelize(xrange(1), 1) + rdd = self.sc.parallelize(range(1), 1) def getrlimit(): import resource diff --git a/python/pyspark/util.py b/python/pyspark/util.py index 
d9429372a6..c003586e9c 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -19,52 +19,10 @@ import re import sys import traceback -import os -import warnings -import inspect -from py4j.protocol import Py4JJavaError __all__ = [] -def _exception_message(excp): - """Return the message from an exception as either a str or unicode object. Supports both - Python 2 and Python 3. - - >>> msg = "Exception message" - >>> excp = Exception(msg) - >>> msg == _exception_message(excp) - True - - >>> msg = u"unicöde" - >>> excp = Exception(msg) - >>> msg == _exception_message(excp) - True - """ - if isinstance(excp, Py4JJavaError): - # 'Py4JJavaError' doesn't contain the stack trace available on the Java side in 'message' - # attribute in Python 2. We should call 'str' function on this exception in general but - # 'Py4JJavaError' has an issue about addressing non-ascii strings. So, here we work - # around by the direct call, '__str__()'. Please see SPARK-23517. - return excp.__str__() - if hasattr(excp, "message"): - return excp.message - return str(excp) - - -def _get_argspec(f): - """ - Get argspec of a function. Supports both Python 2 and Python 3. - """ - if sys.version_info[0] < 3: - argspec = inspect.getargspec(f) - else: - # `getargspec` is deprecated since python3.0 (incompatible with function annotations). - # See SPARK-23569. - argspec = inspect.getfullargspec(f) - return argspec - - def print_exec(stream): ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, stream) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 5f4a8a2d2d..9b54affb13 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -18,11 +18,11 @@ """ Worker that receives input from Piped RDD. """ -from __future__ import print_function -from __future__ import absolute_import import os import sys import time +from inspect import getfullargspec +import importlib # 'resource' is a Unix specific module. 
has_resource_module = True try: @@ -44,14 +44,9 @@ from pyspark.serializers import write_with_length, write_int, read_long, read_bo from pyspark.sql.pandas.serializers import ArrowStreamPandasUDFSerializer, CogroupUDFSerializer from pyspark.sql.pandas.types import to_arrow_type from pyspark.sql.types import StructType -from pyspark.util import _get_argspec, fail_on_stopiteration +from pyspark.util import fail_on_stopiteration from pyspark import shuffle -if sys.version >= '3': - basestring = str -else: - from itertools import imap as map # use iterator map by default - pickleSer = PickleSerializer() utf8_deserializer = UTF8Deserializer() @@ -272,10 +267,10 @@ def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index): elif eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF: return arg_offsets, wrap_pandas_iter_udf(func, return_type) elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - argspec = _get_argspec(chained_func) # signature was lost when wrapping it + argspec = getfullargspec(chained_func) # signature was lost when wrapping it return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec) elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: - argspec = _get_argspec(chained_func) # signature was lost when wrapping it + argspec = getfullargspec(chained_func) # signature was lost when wrapping it return arg_offsets, wrap_cogrouped_map_pandas_udf(func, return_type, argspec) elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type) @@ -342,11 +337,13 @@ def read_udfs(pickleSer, infile, eval_type): pickleSer, infile, eval_type, runner_conf, udf_index=0) def func(_, iterator): - num_input_rows = [0] # TODO(SPARK-29909): Use nonlocal after we drop Python 2. + num_input_rows = 0 def map_batch(batch): + nonlocal num_input_rows + udf_args = [batch[offset] for offset in arg_offsets] - num_input_rows[0] += len(udf_args[0]) + num_input_rows += len(udf_args[0]) if len(udf_args) == 1: return udf_args[0] else: @@ -363,7 +360,7 @@ def read_udfs(pickleSer, infile, eval_type): # by consuming the input iterator in user side. Therefore, # it's very unlikely the output length is higher than # input length. - assert is_map_iter or num_output_rows <= num_input_rows[0], \ + assert is_map_iter or num_output_rows <= num_input_rows, \ "Pandas SCALAR_ITER UDF outputted more rows than input rows." yield (result_batch, result_type) @@ -376,11 +373,11 @@ def read_udfs(pickleSer, infile, eval_type): raise RuntimeError("pandas iterator UDF should exhaust the input " "iterator.") - if num_output_rows != num_input_rows[0]: + if num_output_rows != num_input_rows: raise RuntimeError( "The length of output in Scalar iterator pandas UDF should be " "the same with the input's; however, the length of output was %d and the " - "length of input was %d." % (num_output_rows, num_input_rows[0])) + "length of input was %d." 
% (num_output_rows, num_input_rows)) # profiling is not supported for UDF return func, None, ser, ser @@ -548,9 +545,8 @@ def main(infile, outfile): for _ in range(num_python_includes): filename = utf8_deserializer.loads(infile) add_path(os.path.join(spark_files_dir, filename)) - if sys.version > '3': - import importlib - importlib.invalidate_caches() + + importlib.invalidate_caches() # fetch names and values of broadcast variables needs_broadcast_decryption_server = read_bool(infile) diff --git a/python/run-tests.py b/python/run-tests.py index 42510c7642..23076eab1c 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -28,10 +28,7 @@ import tempfile from threading import Thread, Lock import time import uuid -if sys.version < '3': - import Queue -else: - import queue as Queue +import queue as Queue from multiprocessing import Manager @@ -75,7 +72,6 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): 'SPARK_PREPEND_CLASSES': '1', 'PYSPARK_PYTHON': which(pyspark_python), 'PYSPARK_DRIVER_PYTHON': which(pyspark_python), - 'PYSPARK_ROW_FIELD_SORTING_ENABLED': 'true' }) # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is @@ -161,7 +157,8 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "python2.7", "pypy3", "pypy"] if which(x)] + # TODO(SPARK-32278): install PyPy3 in Jenkins to test + python_execs = [x for x in ["python3.6", "python3.8", "pypy3"] if which(x)] if "python3.6" not in python_execs: p = which("python3") diff --git a/python/setup.py b/python/setup.py index afbd601b04..c456a32fea 100755 --- a/python/setup.py +++ b/python/setup.py @@ -16,18 +16,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import glob import os import sys from setuptools import setup from shutil import copyfile, copytree, rmtree -if sys.version_info < (2, 7): - print("Python versions prior to 2.7 are not supported for pip installed PySpark.", - file=sys.stderr) - sys.exit(-1) - try: exec(open('pyspark/version.py').read()) except IOError: @@ -217,13 +211,10 @@ try: 'pyarrow>=%s' % _minimum_pyarrow_version, ] }, + python_requires='>=3.6', classifiers=[ 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', diff --git a/resource-managers/kubernetes/integration-tests/tests/pyfiles.py b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py index ba55b75803..51c0160554 100644 --- a/resource-managers/kubernetes/integration-tests/tests/pyfiles.py +++ b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py @@ -14,9 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -from __future__ import print_function - import sys from pyspark.sql import SparkSession diff --git a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py index d312a29f38..74559a0b54 100644 --- a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py +++ b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - import resource import sys diff --git a/sql/hive/src/test/resources/data/scripts/cat.py b/sql/hive/src/test/resources/data/scripts/cat.py index aea0362f89..420d9f832a 100644 --- a/sql/hive/src/test/resources/data/scripts/cat.py +++ b/sql/hive/src/test/resources/data/scripts/cat.py @@ -16,7 +16,6 @@ # specific language governing permissions and limitations # under the License. # -from __future__ import print_function import sys import os diff --git a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py index 5b360208d3..f724fdc85b 100644 --- a/sql/hive/src/test/resources/data/scripts/dumpdata_script.py +++ b/sql/hive/src/test/resources/data/scripts/dumpdata_script.py @@ -18,12 +18,9 @@ # import sys -if sys.version_info[0] >= 3: - xrange = range - -for i in xrange(50): - for j in xrange(5): - for k in xrange(20022): +for i in range(50): + for j in range(5): + for k in range(20022): print(20000 * i + k) for line in sys.stdin:
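For readers skimming the diff, the recurring pattern is that Python 2 shims (functions compiled via exec() strings so type annotations would not be a syntax error, xrange aliases, one-element-list closure counters, the raise_from hack) are replaced by their plain Python 3 equivalents. As a minimal sketch of the type-hinted pandas UDF style the updated tests now exercise directly -- assuming an active SparkSession named `spark` plus pandas and PyArrow, and not taken verbatim from this patch:

    import pandas as pd
    from pyspark.sql.functions import pandas_udf

    # The evaluation type is inferred from the type hints (pd.Series -> pd.Series),
    # so the function can be declared at module level instead of inside an exec() string.
    @pandas_udf("long")
    def plus_one(v: pd.Series) -> pd.Series:
        return v + 1

    df = spark.range(10).selectExpr("id", "id as v")   # `spark` is assumed to exist
    df.select(plus_one(df.v).alias("plus_one")).show()

Under Python 2 such annotations are a syntax error, which is why the tests previously built these functions from strings; that indirection is exactly what the earlier test hunks delete.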
 <tr><th>Writable Type</th><th>Python Type</th></tr>
-<tr><td>Text</td><td>unicode str</td></tr>
+<tr><td>Text</td><td>str</td></tr>
 <tr><td>IntWritable</td><td>int</td></tr>
 <tr><td>FloatWritable</td><td>float</td></tr>
 <tr><td>DoubleWritable</td><td>float</td></tr>
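The table rows just above come from the Writable-to-Python type mapping in the RDD programming guide; the only edit is that Text now maps to plain str, since the unicode type is gone. A small illustrative read under assumed conditions (hypothetical HDFS path, an existing SparkContext named `sc`, and data stored as (IntWritable, Text) pairs):

    # Hypothetical path and data; key/value classes are the standard Hadoop writables.
    pairs = sc.sequenceFile(
        "hdfs:///tmp/sftext/",
        "org.apache.hadoop.io.IntWritable",
        "org.apache.hadoop.io.Text")
    key, value = pairs.first()
    print(type(value))   # <class 'str'> on Python 3; previously documented as "unicode str"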