#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib.util
import glob
import os
import sys
from setuptools import setup
from setuptools.command.install import install
from shutil import copyfile, copytree, rmtree

try:
    exec(open('pyspark/version.py').read())
except IOError:
    print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.",
          file=sys.stderr)
    sys.exit(-1)
try:
    spec = importlib.util.spec_from_file_location("install", "pyspark/install.py")
    install_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(install_module)
except IOError:
    print("Failed to load the install module (pyspark/install.py) which must be "
          "packaged together.", file=sys.stderr)
    sys.exit(-1)

VERSION = __version__  # noqa
# A temporary path so we can access above the Python project root and fetch scripts and jars we need
TEMP_PATH = "deps"
SPARK_HOME = os.path.abspath("../")

# Provide guidance about how to use setup.py
incorrect_invocation_message = """
If you are installing pyspark from spark source, you must first build Spark and
run sdist.

    To build Spark with maven you can run:
      ./build/mvn -DskipTests clean package
    Building the source dist is done in the Python directory:
      cd python
      python setup.py sdist
      pip install dist/*.tar.gz"""

# Figure out where the jars we need to package with PySpark are.
JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/"))

if len(JARS_PATH) == 1:
    JARS_PATH = JARS_PATH[0]
elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1):
    # Release mode puts the jars in a jars directory
    JARS_PATH = os.path.join(SPARK_HOME, "jars")
elif len(JARS_PATH) > 1:
    print("Assembly jars exist for multiple Scala versions ({0}), "
          "please clean up assembly/target".format(JARS_PATH), file=sys.stderr)
    sys.exit(-1)
elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
    print(incorrect_invocation_message, file=sys.stderr)
    sys.exit(-1)

EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
USER_SCRIPTS_PATH = os.path.join(SPARK_HOME, "sbin")
DATA_PATH = os.path.join(SPARK_HOME, "data")
LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")

SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
USER_SCRIPTS_TARGET = os.path.join(TEMP_PATH, "sbin")
JARS_TARGET = os.path.join(TEMP_PATH, "jars")
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
DATA_TARGET = os.path.join(TEMP_PATH, "data")
LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
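
# For reference, once the symlink farm below is built, the layout under TEMP_PATH is expected
# to look roughly like this (a sketch based on the *_TARGET constants above; actual jar and
# script names depend on the Spark build):
#
#   deps/
#     bin/       -> SPARK_HOME/bin        (launcher scripts such as spark-submit)
#     sbin/      -> SPARK_HOME/sbin
#     jars/      -> the assembly jars found at JARS_PATH
#     examples/  -> SPARK_HOME/examples/src/main/python
#     data/      -> SPARK_HOME/data
#     licenses/  -> SPARK_HOME/licenses
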
# Check and see if we are under the spark path in which case we need to build the symlink farm.
# This is important because we only want to build the symlink farm while under Spark; otherwise we
# want to use the symlink farm. And if the symlink farm exists while we are under Spark (e.g. a
# partially built sdist) we should error out and have the user sort it out.
in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or
            (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1))


def _supports_symlinks():
    """Check if the system supports symlinks (e.g. *nix) or not."""
    return getattr(os, "symlink", None) is not None


if (in_spark):
    # Construct links for setup
    try:
        os.mkdir(TEMP_PATH)
    except OSError:
        print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH),
              file=sys.stderr)
        sys.exit(-1)

# If you are changing the versions here, please also change ./python/pyspark/sql/pandas/utils.py
# For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
# binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
# Also don't forget to update python/docs/source/getting_started/install.rst.
_minimum_pandas_version = "0.23.2"
_minimum_pyarrow_version = "1.0.0"


class InstallCommand(install):
    # TODO(SPARK-32837) leverage pip's custom options

    def run(self):
        install.run(self)

        # Make sure the destination is always clean.
        spark_dist = os.path.join(self.install_lib, "pyspark", "spark-distribution")
        rmtree(spark_dist, ignore_errors=True)

        if ("PYSPARK_HADOOP_VERSION" in os.environ) or ("PYSPARK_HIVE_VERSION" in os.environ):
            # Note that the PYSPARK_VERSION environment variable is for testing purposes only.
            # The PYSPARK_HIVE_VERSION environment variable is also internal for now, in case
            # we support another version of Hive in the future.
            spark_version, hadoop_version, hive_version = install_module.checked_versions(
                os.environ.get("PYSPARK_VERSION", VERSION).lower(),
                os.environ.get("PYSPARK_HADOOP_VERSION", install_module.DEFAULT_HADOOP).lower(),
                os.environ.get("PYSPARK_HIVE_VERSION", install_module.DEFAULT_HIVE).lower())

            if ("PYSPARK_VERSION" not in os.environ and
                    ((install_module.DEFAULT_HADOOP, install_module.DEFAULT_HIVE) ==
                        (hadoop_version, hive_version))):
                # Do not download and install if they are the same as the defaults.
                return

            install_module.install_spark(
                dest=spark_dist,
                spark_version=spark_version,
                hadoop_version=hadoop_version,
                hive_version=hive_version)

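
# For illustration: the InstallCommand above lets a user choose which Spark distribution
# (Hadoop/Hive build) is downloaded at pip install time, for example:
#
#   PYSPARK_HADOOP_VERSION=3.2 pip install pyspark-<version>.tar.gz
#
# The "3.2" above is only an example value; the accepted version strings are defined and
# validated by checked_versions() in pyspark/install.py. When the requested versions match
# install_module.DEFAULT_HADOOP and DEFAULT_HIVE (and PYSPARK_VERSION is unset), the extra
# download is skipped.
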
try:
    # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts
    # find it where expected. The rest of the files aren't copied because they are accessed
    # using Python imports instead, which will be resolved correctly.
    try:
        os.makedirs("pyspark/python/pyspark")
    except OSError:
        # Don't worry if the directory already exists.
        pass
    copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py")

    if (in_spark):
        # Construct the symlink farm - this is necessary since we can't refer to the path above
        # the package root and we need to copy the jars and scripts which are up above the
        # python root.
        if _supports_symlinks():
            os.symlink(JARS_PATH, JARS_TARGET)
            os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
            os.symlink(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
            os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
            os.symlink(DATA_PATH, DATA_TARGET)
            os.symlink(LICENSES_PATH, LICENSES_TARGET)
        else:
            # For Windows, fall back to the slower copytree.
            copytree(JARS_PATH, JARS_TARGET)
            copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
            copytree(USER_SCRIPTS_PATH, USER_SCRIPTS_TARGET)
            copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
            copytree(DATA_PATH, DATA_TARGET)
            copytree(LICENSES_PATH, LICENSES_TARGET)
    else:
        # If we are not inside of SPARK_HOME, verify we have the required symlink farm.
        if not os.path.exists(JARS_TARGET):
            print("To build packaging you must be in the python directory under SPARK_HOME.",
                  file=sys.stderr)

        if not os.path.isdir(SCRIPTS_TARGET):
            print(incorrect_invocation_message, file=sys.stderr)
            sys.exit(-1)

    # The scripts directive requires a list of each script path and does not take wild cards.
    script_names = os.listdir(SCRIPTS_TARGET)
    scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names))
    # We add find_spark_home.py to the bin directory we install so that pip installed PySpark
    # will search for SPARK_HOME with Python.
    scripts.append("pyspark/find_spark_home.py")

    with open('README.md') as f:
        long_description = f.read()

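    # A note on the metadata below: package_dir maps the deps/ symlink farm (and lib/) into
    # subpackages of pyspark, so a pip-installed PySpark ships its own jars, scripts, data,
    # licenses and examples inside the pyspark package and can locate them via
    # find_spark_home.py instead of requiring an external SPARK_HOME.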
    setup(
        name='pyspark',
        version=VERSION,
        description='Apache Spark Python API',
        long_description=long_description,
        long_description_content_type="text/markdown",
        author='Spark Developers',
        author_email='dev@spark.apache.org',
        url='https://github.com/apache/spark/tree/master/python',
        packages=['pyspark',
                  'pyspark.cloudpickle',
                  'pyspark.mllib',
                  'pyspark.mllib.linalg',
                  'pyspark.mllib.stat',
                  'pyspark.ml',
                  'pyspark.ml.linalg',
                  'pyspark.ml.param',
                  'pyspark.sql',
                  'pyspark.sql.avro',
                  'pyspark.sql.pandas',
                  'pyspark.streaming',
                  'pyspark.bin',
                  'pyspark.sbin',
                  'pyspark.jars',
                  'pyspark.python.pyspark',
                  'pyspark.python.lib',
                  'pyspark.data',
                  'pyspark.licenses',
                  'pyspark.resource',
                  'pyspark.examples.src.main.python'],
        include_package_data=True,
        package_dir={
            'pyspark.jars': 'deps/jars',
            'pyspark.bin': 'deps/bin',
            'pyspark.sbin': 'deps/sbin',
            'pyspark.python.lib': 'lib',
            'pyspark.data': 'deps/data',
            'pyspark.licenses': 'deps/licenses',
            'pyspark.examples.src.main.python': 'deps/examples',
        },
        package_data={
            'pyspark.jars': ['*.jar'],
            'pyspark.bin': ['*'],
            'pyspark.sbin': ['spark-config.sh', 'spark-daemon.sh',
                             'start-history-server.sh',
                             'stop-history-server.sh', ],
            'pyspark.python.lib': ['*.zip'],
            'pyspark.data': ['*.txt', '*.data'],
            'pyspark.licenses': ['*.txt'],
            'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
        scripts=scripts,
        license='http://www.apache.org/licenses/LICENSE-2.0',
        # Don't forget to update python/docs/source/getting_started/install.rst
        # if you're updating the versions or dependencies.
        install_requires=['py4j==0.10.9.2'],
        extras_require={
            'ml': ['numpy>=1.7'],
            'mllib': ['numpy>=1.7'],
            'sql': [
                'pandas>=%s' % _minimum_pandas_version,
                'pyarrow>=%s' % _minimum_pyarrow_version,
            ]
        },
        python_requires='>=3.6',
        classifiers=[
            'Development Status :: 5 - Production/Stable',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: 3.9',
            'Programming Language :: Python :: Implementation :: CPython',
            'Programming Language :: Python :: Implementation :: PyPy',
            'Typing :: Typed'],
        cmdclass={
            'install': InstallCommand,
        },
    )
finally:
    # We only clean up the symlink farm if we were in Spark; otherwise we are installing rather
    # than packaging.
    if (in_spark):
        # Depending on symlink support, remove either the symlink farm or the copied trees.
        if _supports_symlinks():
            os.remove(os.path.join(TEMP_PATH, "jars"))
            os.remove(os.path.join(TEMP_PATH, "bin"))
            os.remove(os.path.join(TEMP_PATH, "sbin"))
            os.remove(os.path.join(TEMP_PATH, "examples"))
            os.remove(os.path.join(TEMP_PATH, "data"))
            os.remove(os.path.join(TEMP_PATH, "licenses"))
        else:
            rmtree(os.path.join(TEMP_PATH, "jars"))
            rmtree(os.path.join(TEMP_PATH, "bin"))
            rmtree(os.path.join(TEMP_PATH, "sbin"))
            rmtree(os.path.join(TEMP_PATH, "examples"))
            rmtree(os.path.join(TEMP_PATH, "data"))
            rmtree(os.path.join(TEMP_PATH, "licenses"))
        os.rmdir(TEMP_PATH)
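
# The extras declared in extras_require above let users install the optional Python
# dependencies together with PySpark, for example:
#
#   pip install pyspark[sql]   # pulls in pandas and pyarrow at or above the declared minimums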