[SPARK-32982][BUILD] Remove hive-1.2 profiles in PIP installation option
### What changes were proposed in this pull request?

This PR removes the Hive 1.2 option (and therefore the `HIVE_VERSION` environment variable as well).

### Why are the changes needed?

Hive 1.2 is a fork version. We shouldn't encourage users to use it.

### Does this PR introduce _any_ user-facing change?

No. `HIVE_VERSION` and the Hive 1.2 option are removed, but this is a new experimental feature in master only.

### How was this patch tested?

Manually tested:

```bash
SPARK_VERSION=3.0.1 HADOOP_VERSION=3.2 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=2.7 pip install pyspark-3.1.0.dev0.tar.gz -v
SPARK_VERSION=3.0.1 HADOOP_VERSION=invalid pip install pyspark-3.1.0.dev0.tar.gz -v
```

Closes #29858 from HyukjinKwon/SPARK-32981.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Commit 688d016c7a (parent 31a16fbb40)
```diff
@@ -276,7 +276,7 @@ if [[ "$1" == "package" ]]; then
   # list of packages to be built, so it's ok for things to be missing in BINARY_PKGS_EXTRA.
 
   # NOTE: Don't forget to update the valid combinations of distributions at
-  # 'python/pyspark.install.py' and 'python/docs/source/getting_started/installation.rst'
+  # 'python/pyspark/install.py' and 'python/docs/source/getting_started/install.rst'
   # if you're changing them.
   declare -A BINARY_PKGS_ARGS
   BINARY_PKGS_ARGS["hadoop3.2"]="-Phadoop-3.2 $HIVE_PROFILES"
```
```diff
@@ -48,40 +48,28 @@ If you want to install extra dependencies for a specific componenet, you can ins
 
     pip install pyspark[sql]
 
-For PySpark with different Hadoop and/or Hive, you can install it by using ``HIVE_VERSION`` and ``HADOOP_VERSION`` environment variables as below:
+For PySpark with a different Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below:
 
 .. code-block:: bash
 
-    HIVE_VERSION=2.3 pip install pyspark
     HADOOP_VERSION=2.7 pip install pyspark
-    HIVE_VERSION=1.2 HADOOP_VERSION=2.7 pip install pyspark
 
-The default distribution has built-in Hadoop 3.2 and Hive 2.3. If users specify different versions, the pip installation automatically
+The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
 downloads a different version and use it in PySpark. Downloading it can take a while depending on
-the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror
-for faster downloading.
+the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manually choose the mirror for faster downloading.
 
 .. code-block:: bash
 
     PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org HADOOP_VERSION=2.7 pip install
 
-It is recommended to use `-v` option in `pip` to track the installation and download status.
+It is recommended to use ``-v`` option in ``pip`` to track the installation and download status.
 
 .. code-block:: bash
 
     HADOOP_VERSION=2.7 pip install pyspark -v
 
-Supported versions are as below:
-
-====================================== ====================================== ======================================
-``HADOOP_VERSION`` \\ ``HIVE_VERSION`` 1.2                                    2.3 (default)
-====================================== ====================================== ======================================
-**2.7**                                O                                      O
-**3.2 (default)**                      X                                      O
-**without**                            X                                      O
-====================================== ====================================== ======================================
-
-Note that this installation of PySpark with different versions of Hadoop and Hive is experimental. It can change or be removed between minor releases.
+Supported versions of Hadoop are ``HADOOP_VERSION=2.7`` and ``HADOOP_VERSION=3.2`` (default).
+
+Note that this installation of PySpark with a different version of Hadoop is experimental. It can change or be removed between minor releases.
 
 
 Using Conda
```
```diff
@@ -26,18 +26,13 @@ from shutil import rmtree
 DEFAULT_HADOOP = "hadoop3.2"
 DEFAULT_HIVE = "hive2.3"
 SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
-SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+SUPPORTED_HIVE_VERSIONS = ["hive2.3"]
-UNSUPPORTED_COMBINATIONS = [
-    ("without-hadoop", "hive1.2"),
-    ("hadoop3.2", "hive1.2"),
-]
 
 
 def checked_package_name(spark_version, hadoop_version, hive_version):
-    if hive_version == "hive1.2":
-        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
-    else:
-        return "%s-bin-%s" % (spark_version, hadoop_version)
+    return "%s-bin-%s" % (spark_version, hadoop_version)
 
 
 def checked_versions(spark_version, hadoop_version, hive_version):
```
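With the Hive 1.2 branch gone, every package name collapses to a single layout. A minimal sketch of the simplified helper's behavior, with expected values taken from the test assertions updated later in this commit:

```python
# Sketch only: module path as referenced in the release-build.sh comment above.
from pyspark.install import checked_package_name

# The Hive version no longer appears in the package name; every distribution
# resolves to the plain "<spark>-bin-<hadoop>" form.
assert checked_package_name("spark-3.0.0", "hadoop3.2", "hive2.3") == "spark-3.0.0-bin-hadoop3.2"
assert checked_package_name("spark-3.0.0", "hadoop2.7", "hive2.3") == "spark-3.0.0-bin-hadoop2.7"
```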
```diff
@@ -48,7 +43,7 @@ def checked_versions(spark_version, hadoop_version, hive_version):
     :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
         'without' and 'without-hadoop' are supported as special keywords for Hadoop free
         distribution.
-    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+    :param hive_version: Hive version. It should be X.X such as '2.3' or 'hive2.3'.
 
     :return it returns fully-qualified versions of Spark, Hadoop and Hive in a tuple.
         For example, spark-3.0.0, hadoop3.2 and hive2.3.
```
```diff
@@ -80,9 +75,6 @@ def checked_versions(spark_version, hadoop_version, hive_version):
                 "one of [%s]" % (hive_version, ", ".join(
                     SUPPORTED_HADOOP_VERSIONS)))
 
-    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
-        raise RuntimeError("Hive 1.2 should only be with Hadoop 2.7.")
-
     return spark_version, hadoop_version, hive_version
```
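For reference, the normalization that `checked_versions` still performs, mirroring the positive test cases further down: bare version strings expand to fully-qualified forms before validation.

```python
# Sketch: behavior mirrored from the updated positive test cases below.
from pyspark.install import checked_versions

# Bare "3.0.0", "2.7", "2.3" normalize to fully-qualified forms.
assert checked_versions("3.0.0", "2.7", "2.3") == ("spark-3.0.0", "hadoop2.7", "hive2.3")
```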
```diff
@@ -95,7 +87,7 @@ def install_spark(dest, spark_version, hadoop_version, hive_version):
     :param spark_version: Spark version. It should be spark-X.X.X form.
     :param hadoop_version: Hadoop version. It should be hadoopX.X
         such as 'hadoop2.7' or 'without-hadoop'.
-    :param hive_version: Hive version. It should be hiveX.X such as 'hive1.2'.
+    :param hive_version: Hive version. It should be hiveX.X such as 'hive2.3'.
     """
 
     package_name = checked_package_name(spark_version, hadoop_version, hive_version)
```
```diff
@@ -41,9 +41,6 @@ class SparkInstallationTestCase(unittest.TestCase):
         self.assertTrue(os.path.exists("%s/RELEASE" % tmp_dir))
 
     def test_package_name(self):
-        self.assertEqual(
-            "spark-3.0.0-bin-hadoop3.2-hive1.2",
-            checked_package_name("spark-3.0.0", "hadoop3.2", "hive1.2"))
         self.assertEqual(
             "spark-3.0.0-bin-hadoop3.2",
             checked_package_name("spark-3.0.0", "hadoop3.2", "hive2.3"))
```
```diff
@@ -53,12 +50,12 @@ class SparkInstallationTestCase(unittest.TestCase):
 
         # Positive test cases
         self.assertEqual(
-            ("spark-3.0.0", "hadoop2.7", "hive1.2"),
-            checked_versions("spark-3.0.0", "hadoop2.7", "hive1.2"))
+            ("spark-3.0.0", "hadoop2.7", "hive2.3"),
+            checked_versions("spark-3.0.0", "hadoop2.7", "hive2.3"))
 
         self.assertEqual(
-            ("spark-3.0.0", "hadoop2.7", "hive1.2"),
-            checked_versions("3.0.0", "2.7", "1.2"))
+            ("spark-3.0.0", "hadoop2.7", "hive2.3"),
+            checked_versions("3.0.0", "2.7", "2.3"))
 
         self.assertEqual(
             ("spark-2.4.1", "without-hadoop", "hive2.3"),
```
```diff
@@ -94,7 +91,7 @@ class SparkInstallationTestCase(unittest.TestCase):
                 hadoop_version=DEFAULT_HADOOP,
                 hive_version="malformed")
 
-        with self.assertRaisesRegex(RuntimeError, "Hive 1.2 should only be with Hadoop 2.7"):
+        with self.assertRaisesRegex(RuntimeError, "Spark distribution of hive1.2 is not supported"):
             checked_versions(
                 spark_version=test_version,
                 hadoop_version="hadoop3.2",
```
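After this change, an unsupported Hive version fails the `SUPPORTED_HIVE_VERSIONS` check directly instead of the removed combination check; a quick sketch of the new failure mode, with the message pattern taken from the updated assertion above:

```python
# Sketch: hive1.2 is now rejected outright by the supported-versions check.
from pyspark.install import checked_versions

try:
    checked_versions("3.0.0", "3.2", "1.2")
except RuntimeError as error:
    print(error)  # matches "Spark distribution of hive1.2 is not supported"
```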
```diff
@@ -127,6 +127,8 @@ class InstallCommand(install):
 
         if ("HADOOP_VERSION" in os.environ) or ("HIVE_VERSION" in os.environ):
             # Note that SPARK_VERSION environment is just a testing purpose.
+            # HIVE_VERSION environment variable is also internal for now in case
+            # we support another version of Hive in the future.
             spark_version, hadoop_version, hive_version = install_module.checked_versions(
                 os.environ.get("SPARK_VERSION", VERSION).lower(),
                 os.environ.get("HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
```