[SPARK-32982][BUILD] Remove hive-1.2 profiles in PIP installation option

### What changes were proposed in this pull request?

This PR removes the Hive 1.2 option from the PIP installation (and, with it, the documented `HIVE_VERSION` environment variable).

### Why are the changes needed?

Hive 1.2 is a fork version. We shouldn't encourage users to use it.

### Does this PR introduce _any_ user-facing change?

No. `HIVE_VERSION` and the Hive 1.2 option are removed, but this is a new experimental feature that exists only in master.

### How was this patch tested?

Manually tested:

```bash
SPARK_VERSION=3.0.1 HADOOP_VERSION=3.2 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=2.7 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=invalid pip install pyspark-3.1.0.dev0.tar.gz -v
```

Closes #29858 from HyukjinKwon/SPARK-32981.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Commit 688d016c7a (parent 31a16fbb40), 2020-09-24 14:49:58 +09:00
5 changed files with 18 additions and 39 deletions

```diff
@@ -276,7 +276,7 @@ if [[ "$1" == "package" ]]; then
   # list of packages to be built, so it's ok for things to be missing in BINARY_PKGS_EXTRA.
   # NOTE: Don't forget to update the valid combinations of distributions at
-  # 'python/pyspark.install.py' and 'python/docs/source/getting_started/installation.rst'
+  # 'python/pyspark/install.py' and 'python/docs/source/getting_started/install.rst'
   # if you're changing them.
   declare -A BINARY_PKGS_ARGS
   BINARY_PKGS_ARGS["hadoop3.2"]="-Phadoop-3.2 $HIVE_PROFILES"
```

```diff
@@ -48,40 +48,28 @@ If you want to install extra dependencies for a specific componenet, you can ins
     pip install pyspark[sql]
 
-For PySpark with different Hadoop and/or Hive, you can install it by using ``HIVE_VERSION`` and ``HADOOP_VERSION`` environment variables as below:
+For PySpark with a different Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below:
 
 .. code-block:: bash
 
-    HIVE_VERSION=2.3 pip install pyspark
     HADOOP_VERSION=2.7 pip install pyspark
-    HIVE_VERSION=1.2 HADOOP_VERSION=2.7 pip install pyspark
 
-The default distribution has built-in Hadoop 3.2 and Hive 2.3. If users specify different versions, the pip installation automatically
+The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
 downloads a different version and use it in PySpark. Downloading it can take a while depending on
-the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror
-for faster downloading.
+the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror for faster downloading.
 
 .. code-block:: bash
 
     PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org HADOOP_VERSION=2.7 pip install
 
-It is recommended to use `-v` option in `pip` to track the installation and download status.
+It is recommended to use ``-v`` option in ``pip`` to track the installation and download status.
 
 .. code-block:: bash
 
     HADOOP_VERSION=2.7 pip install pyspark -v
 
-Supported versions are as below:
-
-====================================== ====================================== ======================================
-``HADOOP_VERSION`` \\ ``HIVE_VERSION`` 1.2                                    2.3 (default)
-====================================== ====================================== ======================================
-**2.7**                                O                                      O
-**3.2 (default)**                      X                                      O
-**without**                            X                                      O
-====================================== ====================================== ======================================
-
-Note that this installation of PySpark with different versions of Hadoop and Hive is experimental. It can change or be removed between minor releases.
+Supported versions of Hadoop are ``HADOOP_VERSION=2.7`` and ``HADOOP_VERSION=3.2`` (default).
+
+Note that this installation of PySpark with a different version of Hadoop is experimental. It can change or be removed between minor releases.
 
 Using Conda
```
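
For reference, the documented flow can also be driven from Python rather than an interactive shell. The following is a minimal sketch (not part of this patch) that exports `HADOOP_VERSION` and `PYSPARK_RELEASE_MIRROR` and invokes `pip` with `-v`:

```python
# A minimal sketch (illustration only, not part of this patch) of driving the
# documented environment-variable based installation from Python.
import os
import subprocess
import sys

env = dict(os.environ)
# Only Hadoop is selectable after this change; Hive is fixed at 2.3.
env["HADOOP_VERSION"] = "2.7"
# Optional: pick a closer mirror for the Spark distribution download.
env["PYSPARK_RELEASE_MIRROR"] = "http://mirror.apache-kr.org"

# -v makes pip show the download and installation progress of the distribution.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "pyspark", "-v"],
    env=env,
)
```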

```diff
@@ -26,18 +26,13 @@ from shutil import rmtree
 DEFAULT_HADOOP = "hadoop3.2"
 DEFAULT_HIVE = "hive2.3"
 SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
-SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+SUPPORTED_HIVE_VERSIONS = ["hive2.3"]
-UNSUPPORTED_COMBINATIONS = [
-    ("without-hadoop", "hive1.2"),
-    ("hadoop3.2", "hive1.2"),
-]
 
 
 def checked_package_name(spark_version, hadoop_version, hive_version):
-    if hive_version == "hive1.2":
-        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
-    else:
-        return "%s-bin-%s" % (spark_version, hadoop_version)
+    return "%s-bin-%s" % (spark_version, hadoop_version)
 
 
 def checked_versions(spark_version, hadoop_version, hive_version):
@@ -48,7 +43,7 @@ def checked_versions(spark_version, hadoop_version, hive_version):
     :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
         'without' and 'without-hadoop' are supported as special keywords for Hadoop free
         distribution.
-    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+    :param hive_version: Hive version. It should be X.X such as '2.3' or 'hive2.3'.
 
     :return it returns fully-qualified versions of Spark, Hadoop and Hive in a tuple.
         For example, spark-3.0.0, hadoop3.2 and hive2.3.
@@ -80,9 +75,6 @@ def checked_versions(spark_version, hadoop_version, hive_version):
             "one of [%s]" % (hive_version, ", ".join(
                 SUPPORTED_HADOOP_VERSIONS)))
 
-    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
-        raise RuntimeError("Hive 1.2 should only be with Hadoop 2.7.")
-
     return spark_version, hadoop_version, hive_version
@@ -95,7 +87,7 @@ def install_spark(dest, spark_version, hadoop_version, hive_version):
     :param spark_version: Spark version. It should be spark-X.X.X form.
     :param hadoop_version: Hadoop version. It should be hadoopX.X
         such as 'hadoop2.7' or 'without-hadoop'.
-    :param hive_version: Hive version. It should be hiveX.X such as 'hive1.2'.
+    :param hive_version: Hive version. It should be hiveX.X such as 'hive2.3'.
     """
     package_name = checked_package_name(spark_version, hadoop_version, hive_version)
```
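
Taken together, the hunks above reduce `python/pyspark/install.py` to a single supported Hive build. The sketch below approximates the resulting behaviour of `checked_package_name` and `checked_versions`; the normalization code itself is not part of this diff, so treat it as an illustration based on the docstrings and the tests:

```python
# A simplified sketch of the behaviour left in python/pyspark/install.py after
# this change. The exact normalization code is not shown in the diff, so this
# is an approximation inferred from the docstrings and the test cases.
SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
SUPPORTED_HIVE_VERSIONS = ["hive2.3"]


def checked_package_name(spark_version, hadoop_version, hive_version):
    # Hive 2.3 is the only build left, so the package name no longer encodes Hive.
    return "%s-bin-%s" % (spark_version, hadoop_version)


def checked_versions(spark_version, hadoop_version, hive_version):
    # Accept short forms such as '3.0.0', '2.7' and '2.3' and expand them.
    if not spark_version.startswith("spark-"):
        spark_version = "spark-%s" % spark_version
    if not hadoop_version.startswith(("hadoop", "without")):
        hadoop_version = "hadoop%s" % hadoop_version
    if hadoop_version == "without":
        hadoop_version = "without-hadoop"
    if not hive_version.startswith("hive"):
        hive_version = "hive%s" % hive_version

    if hadoop_version not in SUPPORTED_HADOOP_VERSIONS:
        raise RuntimeError("Spark distribution of %s is not supported." % hadoop_version)
    if hive_version not in SUPPORTED_HIVE_VERSIONS:
        # e.g. hive1.2 now fails here instead of in a combination check.
        raise RuntimeError("Spark distribution of %s is not supported." % hive_version)
    return spark_version, hadoop_version, hive_version


# Mirrors the positive test case kept in the test suite below.
assert checked_versions("3.0.0", "2.7", "2.3") == ("spark-3.0.0", "hadoop2.7", "hive2.3")
```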

```diff
@@ -41,9 +41,6 @@ class SparkInstallationTestCase(unittest.TestCase):
         self.assertTrue(os.path.exists("%s/RELEASE" % tmp_dir))
 
     def test_package_name(self):
-        self.assertEqual(
-            "spark-3.0.0-bin-hadoop3.2-hive1.2",
-            checked_package_name("spark-3.0.0", "hadoop3.2", "hive1.2"))
         self.assertEqual(
             "spark-3.0.0-bin-hadoop3.2",
             checked_package_name("spark-3.0.0", "hadoop3.2", "hive2.3"))
@@ -53,12 +50,12 @@ class SparkInstallationTestCase(unittest.TestCase):
         # Positive test cases
         self.assertEqual(
-            ("spark-3.0.0", "hadoop2.7", "hive1.2"),
-            checked_versions("spark-3.0.0", "hadoop2.7", "hive1.2"))
+            ("spark-3.0.0", "hadoop2.7", "hive2.3"),
+            checked_versions("spark-3.0.0", "hadoop2.7", "hive2.3"))
         self.assertEqual(
-            ("spark-3.0.0", "hadoop2.7", "hive1.2"),
-            checked_versions("3.0.0", "2.7", "1.2"))
+            ("spark-3.0.0", "hadoop2.7", "hive2.3"),
+            checked_versions("3.0.0", "2.7", "2.3"))
         self.assertEqual(
             ("spark-2.4.1", "without-hadoop", "hive2.3"),
@@ -94,7 +91,7 @@ class SparkInstallationTestCase(unittest.TestCase):
                 hadoop_version=DEFAULT_HADOOP,
                 hive_version="malformed")
 
-        with self.assertRaisesRegex(RuntimeError, "Hive 1.2 should only be with Hadoop 2.7"):
+        with self.assertRaisesRegex(RuntimeError, "Spark distribution of hive1.2 is not supported"):
             checked_versions(
                 spark_version=test_version,
                 hadoop_version="hadoop3.2",
```

```diff
@@ -127,6 +127,8 @@ class InstallCommand(install):
         if ("HADOOP_VERSION" in os.environ) or ("HIVE_VERSION" in os.environ):
             # Note that SPARK_VERSION environment is just a testing purpose.
+            # HIVE_VERSION environment variable is also internal for now in case
+            # we support another version of Hive in the future.
             spark_version, hadoop_version, hive_version = install_module.checked_versions(
                 os.environ.get("SPARK_VERSION", VERSION).lower(),
                 os.environ.get("HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
```