From 688d016c7acc4b9d96d75b40123be9f40b7b2693 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Thu, 24 Sep 2020 14:49:58 +0900
Subject: [PATCH] [SPARK-32982][BUILD] Remove hive-1.2 profiles in PIP installation option

### What changes were proposed in this pull request?

This PR removes the Hive 1.2 option from the PIP installation (and therefore the `HIVE_VERSION` environment variable as well).

### Why are the changes needed?

Hive 1.2 is a forked version. We shouldn't encourage users to use it.

### Does this PR introduce _any_ user-facing change?

Nope. `HIVE_VERSION` and Hive 1.2 are removed, but this is a new experimental feature in master only.

### How was this patch tested?

Manually tested:

```bash
SPARK_VERSION=3.0.1 HADOOP_VERSION=3.2 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=2.7 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=invalid pip install pyspark-3.1.0.dev0.tar.gz -v
```

Closes #29858 from HyukjinKwon/SPARK-32981.

Authored-by: HyukjinKwon
Signed-off-by: HyukjinKwon
---
 dev/create-release/release-build.sh          |  2 +-
 .../docs/source/getting_started/install.rst  | 24 +++++--------------
 python/pyspark/install.py                    | 16 ++++---------
 python/pyspark/tests/test_install_spark.py   | 13 ++++------
 python/setup.py                              |  2 ++
 5 files changed, 18 insertions(+), 39 deletions(-)

diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh
index c47469a2f6..c7fee13d39 100755
--- a/dev/create-release/release-build.sh
+++ b/dev/create-release/release-build.sh
@@ -276,7 +276,7 @@ if [[ "$1" == "package" ]]; then
   # list of packages to be built, so it's ok for things to be missing in BINARY_PKGS_EXTRA.
 
   # NOTE: Don't forget to update the valid combinations of distributions at
-  #   'python/pyspark.install.py' and 'python/docs/source/getting_started/installation.rst'
+  #   'python/pyspark/install.py' and 'python/docs/source/getting_started/install.rst'
   #   if you're changing them.
   declare -A BINARY_PKGS_ARGS
   BINARY_PKGS_ARGS["hadoop3.2"]="-Phadoop-3.2 $HIVE_PROFILES"
diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index d915e9c734..4039698d39 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -48,40 +48,28 @@ If you want to install extra dependencies for a specific componenet, you can ins
 
     pip install pyspark[sql]
 
-For PySpark with different Hadoop and/or Hive, you can install it by using ``HIVE_VERSION`` and ``HADOOP_VERSION`` environment variables as below:
+For PySpark with a different Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below:
 
 .. code-block:: bash
 
-    HIVE_VERSION=2.3 pip install pyspark
     HADOOP_VERSION=2.7 pip install pyspark
-    HIVE_VERSION=1.2 HADOOP_VERSION=2.7 pip install pyspark
 
-The default distribution has built-in Hadoop 3.2 and Hive 2.3. If users specify different versions, the pip installation automatically
+The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
 downloads a different version and use it in PySpark. Downloading it can take a while depending on
-the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror
-for faster downloading.
+the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror for faster downloading.
 
 .. code-block:: bash
 
     PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org HADOOP_VERSION=2.7 pip install
 
-It is recommended to use `-v` option in `pip` to track the installation and download status.
+It is recommended to use ``-v`` option in ``pip`` to track the installation and download status.
 
 .. code-block:: bash
 
     HADOOP_VERSION=2.7 pip install pyspark -v
 
-Supported versions are as below:
-
-====================================== ====================================== ======================================
-``HADOOP_VERSION`` \\ ``HIVE_VERSION`` 1.2                                    2.3 (default)
-====================================== ====================================== ======================================
-**2.7**                                O                                      O
-**3.2 (default)**                      X                                      O
-**without**                            X                                      O
-====================================== ====================================== ======================================
-
-Note that this installation of PySpark with different versions of Hadoop and Hive is experimental. It can change or be removed between minor releases.
+Supported versions of Hadoop are ``HADOOP_VERSION=2.7`` and ``HADOOP_VERSION=3.2`` (default).
+Note that this installation of PySpark with a different version of Hadoop is experimental. It can change or be removed between minor releases.
 
 
 Using Conda
diff --git a/python/pyspark/install.py b/python/pyspark/install.py
index 89573577cd..84dd2c9964 100644
--- a/python/pyspark/install.py
+++ b/python/pyspark/install.py
@@ -26,18 +26,13 @@ from shutil import rmtree
 DEFAULT_HADOOP = "hadoop3.2"
 DEFAULT_HIVE = "hive2.3"
 SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
-SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+SUPPORTED_HIVE_VERSIONS = ["hive2.3"]
 UNSUPPORTED_COMBINATIONS = [
-    ("without-hadoop", "hive1.2"),
-    ("hadoop3.2", "hive1.2"),
 ]
 
 
 def checked_package_name(spark_version, hadoop_version, hive_version):
-    if hive_version == "hive1.2":
-        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
-    else:
-        return "%s-bin-%s" % (spark_version, hadoop_version)
+    return "%s-bin-%s" % (spark_version, hadoop_version)
 
 
 def checked_versions(spark_version, hadoop_version, hive_version):
@@ -48,7 +43,7 @@ def checked_versions(spark_version, hadoop_version, hive_version):
     :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
         'without' and 'without-hadoop' are supported as special keywords for Hadoop free
         distribution.
-    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+    :param hive_version: Hive version. It should be X.X such as '2.3' or 'hive2.3'.
 
     :return it returns fully-qualified versions of Spark, Hadoop and Hive in a tuple.
         For example, spark-3.0.0, hadoop3.2 and hive2.3.
@@ -80,9 +75,6 @@ def checked_versions(spark_version, hadoop_version, hive_version):
             "one of [%s]" % (hive_version, ", ".join(
                 SUPPORTED_HADOOP_VERSIONS)))
 
-    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
-        raise RuntimeError("Hive 1.2 should only be with Hadoop 2.7.")
-
     return spark_version, hadoop_version, hive_version
 
 
@@ -95,7 +87,7 @@ def install_spark(dest, spark_version, hadoop_version, hive_version):
     :param spark_version: Spark version. It should be spark-X.X.X form.
     :param hadoop_version: Hadoop version. It should be hadoopX.X such as 'hadoop2.7' or
         'without-hadoop'.
-    :param hive_version: Hive version. It should be hiveX.X such as 'hive1.2'.
+    :param hive_version: Hive version. It should be hiveX.X such as 'hive2.3'.
""" package_name = checked_package_name(spark_version, hadoop_version, hive_version) diff --git a/python/pyspark/tests/test_install_spark.py b/python/pyspark/tests/test_install_spark.py index b215cf6b01..6f9949aa8b 100644 --- a/python/pyspark/tests/test_install_spark.py +++ b/python/pyspark/tests/test_install_spark.py @@ -41,9 +41,6 @@ class SparkInstallationTestCase(unittest.TestCase): self.assertTrue(os.path.exists("%s/RELEASE" % tmp_dir)) def test_package_name(self): - self.assertEqual( - "spark-3.0.0-bin-hadoop3.2-hive1.2", - checked_package_name("spark-3.0.0", "hadoop3.2", "hive1.2")) self.assertEqual( "spark-3.0.0-bin-hadoop3.2", checked_package_name("spark-3.0.0", "hadoop3.2", "hive2.3")) @@ -53,12 +50,12 @@ class SparkInstallationTestCase(unittest.TestCase): # Positive test cases self.assertEqual( - ("spark-3.0.0", "hadoop2.7", "hive1.2"), - checked_versions("spark-3.0.0", "hadoop2.7", "hive1.2")) + ("spark-3.0.0", "hadoop2.7", "hive2.3"), + checked_versions("spark-3.0.0", "hadoop2.7", "hive2.3")) self.assertEqual( - ("spark-3.0.0", "hadoop2.7", "hive1.2"), - checked_versions("3.0.0", "2.7", "1.2")) + ("spark-3.0.0", "hadoop2.7", "hive2.3"), + checked_versions("3.0.0", "2.7", "2.3")) self.assertEqual( ("spark-2.4.1", "without-hadoop", "hive2.3"), @@ -94,7 +91,7 @@ class SparkInstallationTestCase(unittest.TestCase): hadoop_version=DEFAULT_HADOOP, hive_version="malformed") - with self.assertRaisesRegex(RuntimeError, "Hive 1.2 should only be with Hadoop 2.7"): + with self.assertRaisesRegex(RuntimeError, "Spark distribution of hive1.2 is not supported"): checked_versions( spark_version=test_version, hadoop_version="hadoop3.2", diff --git a/python/setup.py b/python/setup.py index 2067653893..8d9cf2ee54 100755 --- a/python/setup.py +++ b/python/setup.py @@ -127,6 +127,8 @@ class InstallCommand(install): if ("HADOOP_VERSION" in os.environ) or ("HIVE_VERSION" in os.environ): # Note that SPARK_VERSION environment is just a testing purpose. + # HIVE_VERSION environment variable is also internal for now in case + # we support another version of Hive in the future. spark_version, hadoop_version, hive_version = install_module.checked_versions( os.environ.get("SPARK_VERSION", VERSION).lower(), os.environ.get("HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),