[SPARK-32982][BUILD] Remove hive-1.2 profiles in PIP installation option

### What changes were proposed in this pull request?

This PR removes the Hive 1.2 option from the PIP installation (and, with it, the documented `HIVE_VERSION` environment variable).

### Why are the changes needed?

Hive 1.2 is a fork version. We shouldn't encourage users to use it.

### Does this PR introduce _any_ user-facing change?

No. `HIVE_VERSION` and the Hive 1.2 option are removed, but this is a new experimental feature that exists only in master.

### How was this patch tested?

Manually tested:

```bash
SPARK_VERSION=3.0.1 HADOOP_VERSION=3.2 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=2.7 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=invalid pip install pyspark-3.1.0.dev0.tar.gz -v
```

Closes #29858 from HyukjinKwon/SPARK-32981.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Commit 688d016c7a (parent 31a16fbb40), 2020-09-24 14:49:58 +09:00
5 changed files with 18 additions and 39 deletions

```diff
@@ -276,7 +276,7 @@ if [[ "$1" == "package" ]]; then
   # list of packages to be built, so it's ok for things to be missing in BINARY_PKGS_EXTRA.
   # NOTE: Don't forget to update the valid combinations of distributions at
-  # 'python/pyspark.install.py' and 'python/docs/source/getting_started/installation.rst'
+  # 'python/pyspark/install.py' and 'python/docs/source/getting_started/install.rst'
   # if you're changing them.
   declare -A BINARY_PKGS_ARGS
   BINARY_PKGS_ARGS["hadoop3.2"]="-Phadoop-3.2 $HIVE_PROFILES"
```

```diff
@@ -48,40 +48,28 @@ If you want to install extra dependencies for a specific componenet, you can ins
     pip install pyspark[sql]
 
-For PySpark with different Hadoop and/or Hive, you can install it by using ``HIVE_VERSION`` and ``HADOOP_VERSION`` environment variables as below:
+For PySpark with a different Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below:
 
 .. code-block:: bash
 
-    HIVE_VERSION=2.3 pip install pyspark
     HADOOP_VERSION=2.7 pip install pyspark
-    HIVE_VERSION=1.2 HADOOP_VERSION=2.7 pip install pyspark
 
-The default distribution has built-in Hadoop 3.2 and Hive 2.3. If users specify different versions, the pip installation automatically
+The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
 downloads a different version and use it in PySpark. Downloading it can take a while depending on
-the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror
-for faster downloading.
+the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror for faster downloading.
 
 .. code-block:: bash
 
     PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org HADOOP_VERSION=2.7 pip install
 
-It is recommended to use `-v` option in `pip` to track the installation and download status.
+It is recommended to use ``-v`` option in ``pip`` to track the installation and download status.
 
 .. code-block:: bash
 
     HADOOP_VERSION=2.7 pip install pyspark -v
 
-Supported versions are as below:
-
-====================================== ====================================== ======================================
-``HADOOP_VERSION`` \\ ``HIVE_VERSION`` 1.2                                    2.3 (default)
-====================================== ====================================== ======================================
-**2.7**                                O                                      O
-**3.2 (default)**                      X                                      O
-**without**                            X                                      O
-====================================== ====================================== ======================================
-
-Note that this installation of PySpark with different versions of Hadoop and Hive is experimental. It can change or be removed between minor releases.
+Supported versions of Hadoop are ``HADOOP_VERSION=2.7`` and ``HADOOP_VERSION=3.2`` (default).
+
+Note that this installation of PySpark with a different version of Hadoop is experimental. It can change or be removed between minor releases.
 
 Using Conda
```
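
For reference, the documented flow can also be driven from Python rather than an interactive shell. The following is a minimal sketch (not part of this patch) that exports `HADOOP_VERSION` and `PYSPARK_RELEASE_MIRROR` and invokes `pip` with `-v`:

```python
# A minimal sketch (illustration only, not part of this patch) of driving the
# documented environment-variable based installation from Python.
import os
import subprocess
import sys

env = dict(os.environ)
# Only Hadoop is selectable after this change; Hive is fixed at 2.3.
env["HADOOP_VERSION"] = "2.7"
# Optional: pick a closer mirror for the Spark distribution download.
env["PYSPARK_RELEASE_MIRROR"] = "http://mirror.apache-kr.org"

# -v makes pip show the download and installation progress of the distribution.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "pyspark", "-v"],
    env=env,
)
```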

```diff
@@ -26,18 +26,13 @@ from shutil import rmtree
 DEFAULT_HADOOP = "hadoop3.2"
 DEFAULT_HIVE = "hive2.3"
 SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
-SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+SUPPORTED_HIVE_VERSIONS = ["hive2.3"]
-UNSUPPORTED_COMBINATIONS = [
-    ("without-hadoop", "hive1.2"),
-    ("hadoop3.2", "hive1.2"),
-]
 
 
 def checked_package_name(spark_version, hadoop_version, hive_version):
-    if hive_version == "hive1.2":
-        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
-    else:
-        return "%s-bin-%s" % (spark_version, hadoop_version)
+    return "%s-bin-%s" % (spark_version, hadoop_version)
 
 
 def checked_versions(spark_version, hadoop_version, hive_version):
@@ -48,7 +43,7 @@ def checked_versions(spark_version, hadoop_version, hive_version):
     :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
         'without' and 'without-hadoop' are supported as special keywords for Hadoop free
         distribution.
-    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+    :param hive_version: Hive version. It should be X.X such as '2.3' or 'hive2.3'.
 
     :return it returns fully-qualified versions of Spark, Hadoop and Hive in a tuple.
         For example, spark-3.0.0, hadoop3.2 and hive2.3.
@@ -80,9 +75,6 @@ def checked_versions(spark_version, hadoop_version, hive_version):
             "one of [%s]" % (hive_version, ", ".join(
                 SUPPORTED_HADOOP_VERSIONS)))
 
-    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
-        raise RuntimeError("Hive 1.2 should only be with Hadoop 2.7.")
-
     return spark_version, hadoop_version, hive_version
@@ -95,7 +87,7 @@ def install_spark(dest, spark_version, hadoop_version, hive_version):
     :param spark_version: Spark version. It should be spark-X.X.X form.
     :param hadoop_version: Hadoop version. It should be hadoopX.X
         such as 'hadoop2.7' or 'without-hadoop'.
-    :param hive_version: Hive version. It should be hiveX.X such as 'hive1.2'.
+    :param hive_version: Hive version. It should be hiveX.X such as 'hive2.3'.
     """
     package_name = checked_package_name(spark_version, hadoop_version, hive_version)
```
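
Taken together, the hunks above reduce `python/pyspark/install.py` to a single supported Hive build. The sketch below approximates the resulting behaviour of `checked_package_name` and `checked_versions`; the normalization code itself is not part of this diff, so treat it as an illustration based on the docstrings and the tests:

```python
# A simplified sketch of the behaviour left in python/pyspark/install.py after
# this change. The exact normalization code is not shown in the diff, so this
# is an approximation inferred from the docstrings and the test cases.
SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
SUPPORTED_HIVE_VERSIONS = ["hive2.3"]


def checked_package_name(spark_version, hadoop_version, hive_version):
    # Hive 2.3 is the only build left, so the package name no longer encodes Hive.
    return "%s-bin-%s" % (spark_version, hadoop_version)


def checked_versions(spark_version, hadoop_version, hive_version):
    # Accept short forms such as '3.0.0', '2.7' and '2.3' and expand them.
    if not spark_version.startswith("spark-"):
        spark_version = "spark-%s" % spark_version
    if not hadoop_version.startswith(("hadoop", "without")):
        hadoop_version = "hadoop%s" % hadoop_version
    if hadoop_version == "without":
        hadoop_version = "without-hadoop"
    if not hive_version.startswith("hive"):
        hive_version = "hive%s" % hive_version

    if hadoop_version not in SUPPORTED_HADOOP_VERSIONS:
        raise RuntimeError("Spark distribution of %s is not supported." % hadoop_version)
    if hive_version not in SUPPORTED_HIVE_VERSIONS:
        # e.g. hive1.2 now fails here instead of in a combination check.
        raise RuntimeError("Spark distribution of %s is not supported." % hive_version)
    return spark_version, hadoop_version, hive_version


# Mirrors the positive test case kept in the test suite below.
assert checked_versions("3.0.0", "2.7", "2.3") == ("spark-3.0.0", "hadoop2.7", "hive2.3")
```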

```diff
@@ -41,9 +41,6 @@ class SparkInstallationTestCase(unittest.TestCase):
         self.assertTrue(os.path.exists("%s/RELEASE" % tmp_dir))
 
     def test_package_name(self):
-        self.assertEqual(
-            "spark-3.0.0-bin-hadoop3.2-hive1.2",
-            checked_package_name("spark-3.0.0", "hadoop3.2", "hive1.2"))
         self.assertEqual(
             "spark-3.0.0-bin-hadoop3.2",
             checked_package_name("spark-3.0.0", "hadoop3.2", "hive2.3"))
@@ -53,12 +50,12 @@ class SparkInstallationTestCase(unittest.TestCase):
         # Positive test cases
         self.assertEqual(
-            ("spark-3.0.0", "hadoop2.7", "hive1.2"),
-            checked_versions("spark-3.0.0", "hadoop2.7", "hive1.2"))
+            ("spark-3.0.0", "hadoop2.7", "hive2.3"),
+            checked_versions("spark-3.0.0", "hadoop2.7", "hive2.3"))
         self.assertEqual(
-            ("spark-3.0.0", "hadoop2.7", "hive1.2"),
-            checked_versions("3.0.0", "2.7", "1.2"))
+            ("spark-3.0.0", "hadoop2.7", "hive2.3"),
+            checked_versions("3.0.0", "2.7", "2.3"))
         self.assertEqual(
             ("spark-2.4.1", "without-hadoop", "hive2.3"),
@@ -94,7 +91,7 @@ class SparkInstallationTestCase(unittest.TestCase):
                 hadoop_version=DEFAULT_HADOOP,
                 hive_version="malformed")
 
-        with self.assertRaisesRegex(RuntimeError, "Hive 1.2 should only be with Hadoop 2.7"):
+        with self.assertRaisesRegex(RuntimeError, "Spark distribution of hive1.2 is not supported"):
             checked_versions(
                 spark_version=test_version,
                 hadoop_version="hadoop3.2",
```

```diff
@@ -127,6 +127,8 @@ class InstallCommand(install):
         if ("HADOOP_VERSION" in os.environ) or ("HIVE_VERSION" in os.environ):
             # Note that SPARK_VERSION environment is just a testing purpose.
+            # HIVE_VERSION environment variable is also internal for now in case
+            # we support another version of Hive in the future.
             spark_version, hadoop_version, hive_version = install_module.checked_versions(
                 os.environ.get("SPARK_VERSION", VERSION).lower(),
                 os.environ.get("HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
```