From 688d016c7acc4b9d96d75b40123be9f40b7b2693 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Thu, 24 Sep 2020 14:49:58 +0900
Subject: [PATCH] [SPARK-32982][BUILD] Remove hive-1.2 profiles in PIP installation option

### What changes were proposed in this pull request?

This PR removes the Hive 1.2 option from the PIP installation (and therefore the `HIVE_VERSION` environment variable as well).

### Why are the changes needed?

Hive 1.2 is a forked version. We shouldn't encourage users to use it.

### Does this PR introduce _any_ user-facing change?

Nope. `HIVE_VERSION` and Hive 1.2 are removed, but this is a new experimental feature in master only.

### How was this patch tested?

Manually tested:

```bash
SPARK_VERSION=3.0.1 HADOOP_VERSION=3.2 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=2.7 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=invalid pip install pyspark-3.1.0.dev0.tar.gz -v
```

Closes #29858 from HyukjinKwon/SPARK-32981.

Authored-by: HyukjinKwon
Signed-off-by: HyukjinKwon
---
 dev/create-release/release-build.sh          |  2 +-
 .../docs/source/getting_started/install.rst  | 24 +++++--------------
 python/pyspark/install.py                    | 16 ++++---------
 python/pyspark/tests/test_install_spark.py   | 13 ++++------
 python/setup.py                              |  2 ++
 5 files changed, 18 insertions(+), 39 deletions(-)

diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh
index c47469a2f6..c7fee13d39 100755
--- a/dev/create-release/release-build.sh
+++ b/dev/create-release/release-build.sh
@@ -276,7 +276,7 @@ if [[ "$1" == "package" ]]; then
   # list of packages to be built, so it's ok for things to be missing in BINARY_PKGS_EXTRA.
 
   # NOTE: Don't forget to update the valid combinations of distributions at
-  #   'python/pyspark.install.py' and 'python/docs/source/getting_started/installation.rst'
+  #   'python/pyspark/install.py' and 'python/docs/source/getting_started/install.rst'
   #   if you're changing them.
   declare -A BINARY_PKGS_ARGS
   BINARY_PKGS_ARGS["hadoop3.2"]="-Phadoop-3.2 $HIVE_PROFILES"
diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index d915e9c734..4039698d39 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -48,40 +48,28 @@ If you want to install extra dependencies for a specific componenet, you can ins
 
     pip install pyspark[sql]
 
-For PySpark with different Hadoop and/or Hive, you can install it by using ``HIVE_VERSION`` and ``HADOOP_VERSION`` environment variables as below:
+For PySpark with a different Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below:
 
 .. code-block:: bash
 
-    HIVE_VERSION=2.3 pip install pyspark
     HADOOP_VERSION=2.7 pip install pyspark
-    HIVE_VERSION=1.2 HADOOP_VERSION=2.7 pip install pyspark
 
-The default distribution has built-in Hadoop 3.2 and Hive 2.3. If users specify different versions, the pip installation automatically
+The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
 downloads a different version and use it in PySpark. Downloading it can take a while depending on
-the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror
-for faster downloading.
+the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror for faster downloading.
 
 .. code-block:: bash
 
     PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org HADOOP_VERSION=2.7 pip install
 
-It is recommended to use `-v` option in `pip` to track the installation and download status.
+It is recommended to use ``-v`` option in ``pip`` to track the installation and download status.
 
 .. code-block:: bash
 
     HADOOP_VERSION=2.7 pip install pyspark -v
 
-Supported versions are as below:
-
-====================================== ====================================== ======================================
-``HADOOP_VERSION`` \\ ``HIVE_VERSION`` 1.2                                    2.3 (default)
-====================================== ====================================== ======================================
-**2.7**                                O                                      O
-**3.2 (default)**                      X                                      O
-**without**                            X                                      O
-====================================== ====================================== ======================================
-
-Note that this installation of PySpark with different versions of Hadoop and Hive is experimental. It can change or be removed between minor releases.
+Supported versions of Hadoop are ``HADOOP_VERSION=2.7`` and ``HADOOP_VERSION=3.2`` (default).
+Note that this installation of PySpark with a different version of Hadoop is experimental. It can change or be removed between minor releases.
 
 
 Using Conda
diff --git a/python/pyspark/install.py b/python/pyspark/install.py
index 89573577cd..84dd2c9964 100644
--- a/python/pyspark/install.py
+++ b/python/pyspark/install.py
@@ -26,18 +26,13 @@ from shutil import rmtree
 DEFAULT_HADOOP = "hadoop3.2"
 DEFAULT_HIVE = "hive2.3"
 SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
-SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+SUPPORTED_HIVE_VERSIONS = ["hive2.3"]
 UNSUPPORTED_COMBINATIONS = [
-    ("without-hadoop", "hive1.2"),
-    ("hadoop3.2", "hive1.2"),
 ]
 
 
 def checked_package_name(spark_version, hadoop_version, hive_version):
-    if hive_version == "hive1.2":
-        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
-    else:
-        return "%s-bin-%s" % (spark_version, hadoop_version)
+    return "%s-bin-%s" % (spark_version, hadoop_version)
 
 
 def checked_versions(spark_version, hadoop_version, hive_version):
@@ -48,7 +43,7 @@ def checked_versions(spark_version, hadoop_version, hive_version):
     :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
         'without' and 'without-hadoop' are supported as special keywords for Hadoop free
         distribution.
-    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+    :param hive_version: Hive version. It should be X.X such as '2.3' or 'hive2.3'.
 
     :return it returns fully-qualified versions of Spark, Hadoop and Hive in a tuple.
         For example, spark-3.0.0, hadoop3.2 and hive2.3.
@@ -80,9 +75,6 @@ def checked_versions(spark_version, hadoop_version, hive_version):
             "one of [%s]" % (hive_version, ", ".join(
                 SUPPORTED_HADOOP_VERSIONS)))
 
-    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
-        raise RuntimeError("Hive 1.2 should only be with Hadoop 2.7.")
-
     return spark_version, hadoop_version, hive_version
 
 
@@ -95,7 +87,7 @@ def install_spark(dest, spark_version, hadoop_version, hive_version):
     :param spark_version: Spark version. It should be spark-X.X.X form.
     :param hadoop_version: Hadoop version. It should be hadoopX.X such as 'hadoop2.7' or
         'without-hadoop'.
-    :param hive_version: Hive version. It should be hiveX.X such as 'hive1.2'.
+    :param hive_version: Hive version. It should be hiveX.X such as 'hive2.3'.
""" package_name = checked_package_name(spark_version, hadoop_version, hive_version) diff --git a/python/pyspark/tests/test_install_spark.py b/python/pyspark/tests/test_install_spark.py index b215cf6b01..6f9949aa8b 100644 --- a/python/pyspark/tests/test_install_spark.py +++ b/python/pyspark/tests/test_install_spark.py @@ -41,9 +41,6 @@ class SparkInstallationTestCase(unittest.TestCase): self.assertTrue(os.path.exists("%s/RELEASE" % tmp_dir)) def test_package_name(self): - self.assertEqual( - "spark-3.0.0-bin-hadoop3.2-hive1.2", - checked_package_name("spark-3.0.0", "hadoop3.2", "hive1.2")) self.assertEqual( "spark-3.0.0-bin-hadoop3.2", checked_package_name("spark-3.0.0", "hadoop3.2", "hive2.3")) @@ -53,12 +50,12 @@ class SparkInstallationTestCase(unittest.TestCase): # Positive test cases self.assertEqual( - ("spark-3.0.0", "hadoop2.7", "hive1.2"), - checked_versions("spark-3.0.0", "hadoop2.7", "hive1.2")) + ("spark-3.0.0", "hadoop2.7", "hive2.3"), + checked_versions("spark-3.0.0", "hadoop2.7", "hive2.3")) self.assertEqual( - ("spark-3.0.0", "hadoop2.7", "hive1.2"), - checked_versions("3.0.0", "2.7", "1.2")) + ("spark-3.0.0", "hadoop2.7", "hive2.3"), + checked_versions("3.0.0", "2.7", "2.3")) self.assertEqual( ("spark-2.4.1", "without-hadoop", "hive2.3"), @@ -94,7 +91,7 @@ class SparkInstallationTestCase(unittest.TestCase): hadoop_version=DEFAULT_HADOOP, hive_version="malformed") - with self.assertRaisesRegex(RuntimeError, "Hive 1.2 should only be with Hadoop 2.7"): + with self.assertRaisesRegex(RuntimeError, "Spark distribution of hive1.2 is not supported"): checked_versions( spark_version=test_version, hadoop_version="hadoop3.2", diff --git a/python/setup.py b/python/setup.py index 2067653893..8d9cf2ee54 100755 --- a/python/setup.py +++ b/python/setup.py @@ -127,6 +127,8 @@ class InstallCommand(install): if ("HADOOP_VERSION" in os.environ) or ("HIVE_VERSION" in os.environ): # Note that SPARK_VERSION environment is just a testing purpose. + # HIVE_VERSION environment variable is also internal for now in case + # we support another version of Hive in the future. spark_version, hadoop_version, hive_version = install_module.checked_versions( os.environ.get("SPARK_VERSION", VERSION).lower(), os.environ.get("HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),