diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index a90f5fe159..c5485424da 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -48,11 +48,11 @@ If you want to install extra dependencies for a specific component, you can inst
 
     pip install pyspark[sql]
 
-For PySpark with/without a specific Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below:
+For PySpark with/without a specific Hadoop version, you can install it by using the ``PYSPARK_HADOOP_VERSION`` environment variable as below:
 
 .. code-block:: bash
 
-    HADOOP_VERSION=2.7 pip install pyspark
+    PYSPARK_HADOOP_VERSION=2.7 pip install pyspark
 
 The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip
 installation automatically downloads a different version and use it in PySpark. Downloading it can take a while depending on
@@ -60,15 +60,15 @@ the network and the mirror chosen. ``PYSPARK_RELEASE_MIRROR`` can be set to manu
 
 .. code-block:: bash
 
-    PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org HADOOP_VERSION=2.7 pip install
+    PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org PYSPARK_HADOOP_VERSION=2.7 pip install
 
 It is recommended to use ``-v`` option in ``pip`` to track the installation and download status.
 
 .. code-block:: bash
 
-    HADOOP_VERSION=2.7 pip install pyspark -v
+    PYSPARK_HADOOP_VERSION=2.7 pip install pyspark -v
 
-Supported values in ``HADOOP_VERSION`` are:
+Supported values in ``PYSPARK_HADOOP_VERSION`` are:
 
 - ``without``: Spark pre-built with user-provided Apache Hadoop
 - ``2.7``: Spark pre-built for Apache Hadoop 2.7
diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py
index 4521a36503..62a36d42eb 100755
--- a/python/pyspark/find_spark_home.py
+++ b/python/pyspark/find_spark_home.py
@@ -36,7 +36,7 @@ def _find_spark_home():
                 (os.path.isdir(os.path.join(path, "jars")) or
                  os.path.isdir(os.path.join(path, "assembly"))))
 
-    # Spark distribution can be downloaded when HADOOP_VERSION environment variable is set.
+    # Spark distribution can be downloaded when the PYSPARK_HADOOP_VERSION environment variable is set.
     # We should look up this directory first, see also SPARK-32017.
     spark_dist_dir = "spark-distribution"
     paths = [
diff --git a/python/setup.py b/python/setup.py
index 7bb8a00171..c7f195b89a 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -125,16 +125,16 @@ class InstallCommand(install):
         spark_dist = os.path.join(self.install_lib, "pyspark", "spark-distribution")
         rmtree(spark_dist, ignore_errors=True)
 
-        if ("HADOOP_VERSION" in os.environ) or ("HIVE_VERSION" in os.environ):
-            # Note that SPARK_VERSION environment is just a testing purpose.
-            # HIVE_VERSION environment variable is also internal for now in case
+        if ("PYSPARK_HADOOP_VERSION" in os.environ) or ("PYSPARK_HIVE_VERSION" in os.environ):
+            # Note that the PYSPARK_VERSION environment variable is just for testing purposes.
+            # The PYSPARK_HIVE_VERSION environment variable is also internal for now in case
             # we support another version of Hive in the future.
             spark_version, hadoop_version, hive_version = install_module.checked_versions(
-                os.environ.get("SPARK_VERSION", VERSION).lower(),
-                os.environ.get("HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
-                os.environ.get("HIVE_VERSION", install_module.DEFAULT_HIVE).lower())
+                os.environ.get("PYSPARK_VERSION", VERSION).lower(),
+                os.environ.get("PYSPARK_HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
+                os.environ.get("PYSPARK_HIVE_VERSION", install_module.DEFAULT_HIVE).lower())
 
-            if ("SPARK_VERSION" not in os.environ and
+            if ("PYSPARK_VERSION" not in os.environ and
                 ((install_module.DEFAULT_HADOOP, install_module.DEFAULT_HIVE) ==
                     (hadoop_version, hive_version))):
                 # Do not download and install if they are same as default.
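
As a quick check of the renamed variables, a minimal shell sketch of the resulting installation flow (assuming a POSIX shell; the mirror URL is the one already used in the docs above, and PYSPARK_HIVE_VERSION stays internal per the setup.py comment):

    # Pick the Hadoop build and, optionally, a faster download mirror.
    PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org \
    PYSPARK_HADOOP_VERSION=2.7 \
    pip install pyspark -v

    # Leaving PYSPARK_HADOOP_VERSION unset, or set to the defaults
    # (Hadoop 3.2, Hive 2.3), skips the extra distribution download.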