[SPARK-4890] Upgrade Boto to 2.34.0; automatically download Boto from PyPi instead of packaging it

This patch upgrades `spark-ec2`'s Boto version to 2.34.0, since the old version was blocking several features. Newer versions of Boto don't work properly when they're loaded from a zipfile, since they try to read a JSON file from a path relative to the Boto library sources. Therefore, this patch also changes spark-ec2 to automatically download Boto from PyPI if it's not present in `SPARK_EC2_DIR/lib`, similar to what we do in the `sbt/sbt` script. This shouldn't be an issue for users, since they already need an internet connection to launch an EC2 cluster. By performing the download in spark_ec2.py instead of the Bash script, this should also work for Windows users. I've tested this with Python 2.6, too.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #3737 from JoshRosen/update-boto and squashes the following commits:

0aa43cc [Josh Rosen] Remove unused setup_standalone_cluster() method.
f02935d [Josh Rosen] Enable Python deprecation warnings and fix one Boto warning.
587ae89 [Josh Rosen] [SPARK-4890] Upgrade Boto to 2.34.0; automatically download Boto from PyPi instead of packaging it
This commit is contained in:
parent 7981f96976
commit c28083f468
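The zipfile problem the message describes is a generic zipimport limitation: a module imported from a zip archive has a `__file__` that points inside the archive, so a plain `open()` on a path built from it fails, and newer Boto loads a JSON data file in exactly that way. A minimal, self-contained sketch of the failure mode, using a hypothetical package name:

# Hypothetical package "mylib": open() cannot read a data file whose path is
# derived from __file__ when the module is imported from a zip archive.
import os
import sys
import tempfile
import textwrap
import zipfile

tmp_dir = tempfile.mkdtemp()
zip_path = os.path.join(tmp_dir, "mylib.zip")
archive = zipfile.ZipFile(zip_path, "w")
archive.writestr("mylib/__init__.py", textwrap.dedent("""\
    import os
    _here = os.path.dirname(__file__)        # ".../mylib.zip/mylib" is not a real directory
    open(os.path.join(_here, "data.json"))   # raises IOError under zipimport
"""))
archive.writestr("mylib/data.json", "{}")
archive.close()

sys.path.insert(0, zip_path)
import mylib  # fails with IOError even though data.json is inside the archive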
ec2/spark-ec2

@@ -22,5 +22,4 @@
 # the underlying Python script.
 SPARK_EC2_DIR="$(dirname $0)"
 
-PYTHONPATH="${SPARK_EC2_DIR}/third_party/boto-2.4.1.zip/boto-2.4.1:$PYTHONPATH" \
-  python "${SPARK_EC2_DIR}/spark_ec2.py" "$@"
+python -Wdefault "${SPARK_EC2_DIR}/spark_ec2.py" "$@"
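The new `-Wdefault` flag is what the "Enable Python deprecation warnings" squash commit refers to: CPython 2.7+ silences `DeprecationWarning` by default, and `-Wdefault` restores the "default" action of printing each unique warning once per source location. A minimal sketch (the script and function names are made up for illustration):

# deprecation_demo.py: silent when run as `python deprecation_demo.py`,
# prints the warning once when run as `python -Wdefault deprecation_demo.py`.
import warnings

def old_api():
    warnings.warn("old_api() is deprecated", DeprecationWarning)

old_api()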
ec2/spark_ec2.py

@@ -21,6 +21,7 @@
 from __future__ import with_statement
 
+import hashlib
 import logging
 import os
 import pipes
@@ -29,6 +30,7 @@ import shutil
 import string
 import subprocess
 import sys
+import tarfile
 import tempfile
 import time
 import urllib2
@@ -36,9 +38,6 @@ import warnings
 from datetime import datetime
 from optparse import OptionParser
 from sys import stderr
-import boto
-from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType
-from boto import ec2
 
 DEFAULT_SPARK_VERSION = "1.1.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -48,6 +47,39 @@ MESOS_SPARK_EC2_BRANCH = "v4"
 AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH)
 
 
+def setup_boto():
+    # Download Boto if it's not already present in the SPARK_EC2_DIR/lib folder:
+    version = "boto-2.34.0"
+    md5 = "5556223d2d0cc4d06dd4829e671dcecd"
+    url = "https://pypi.python.org/packages/source/b/boto/%s.tar.gz" % version
+    lib_dir = os.path.join(SPARK_EC2_DIR, "lib")
+    if not os.path.exists(lib_dir):
+        os.mkdir(lib_dir)
+    boto_lib_dir = os.path.join(lib_dir, version)
+    if not os.path.isdir(boto_lib_dir):
+        tgz_file_path = os.path.join(lib_dir, "%s.tar.gz" % version)
+        print "Downloading Boto from PyPi"
+        download_stream = urllib2.urlopen(url)
+        with open(tgz_file_path, "wb") as tgz_file:
+            tgz_file.write(download_stream.read())
+        with open(tgz_file_path) as tar:
+            if hashlib.md5(tar.read()).hexdigest() != md5:
+                print >> stderr, "ERROR: Got wrong md5sum for Boto"
+                sys.exit(1)
+        tar = tarfile.open(tgz_file_path)
+        tar.extractall(path=lib_dir)
+        tar.close()
+        os.remove(tgz_file_path)
+        print "Finished downloading Boto"
+    sys.path.insert(0, boto_lib_dir)
+
+
+setup_boto()
+import boto
+from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType
+from boto import ec2
+
+
 class UsageError(Exception):
     pass
 
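One caveat in the hunk above: the checksum pass reopens the tarball with a plain `open(tgz_file_path)`, i.e. text mode, which is harmless on POSIX but could alter the bytes (and therefore the md5) on Windows, where the commit message says this script should now also run. A hedged sketch of the same verify step in binary mode, hashing in fixed-size chunks instead of slurping the whole file; the helper name is an assumption, not part of the patch:

# Sketch only: chunked, binary-mode md5 verification of a downloaded file.
import hashlib

def file_md5_matches(path, expected_md5, chunk_size=65536):
    digest = hashlib.md5()
    with open(path, "rb") as f:   # "rb" keeps the bytes intact on every platform
        chunk = f.read(chunk_size)
        while chunk:
            digest.update(chunk)
            chunk = f.read(chunk_size)
    return digest.hexdigest() == expected_md5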
@@ -452,7 +484,7 @@ def launch_cluster(conn, opts, cluster_name):
                 active_instance_ids.append(id_to_req[i].instance_id)
         if len(active_instance_ids) == opts.slaves:
             print "All %d slaves granted" % opts.slaves
-            reservations = conn.get_all_instances(active_instance_ids)
+            reservations = conn.get_all_reservations(active_instance_ids)
             slave_nodes = []
             for r in reservations:
                 slave_nodes += r.instances
@@ -541,7 +573,7 @@
 
 def get_existing_cluster(conn, opts, cluster_name, die_on_error=True):
     print "Searching for existing cluster " + cluster_name + "..."
-    reservations = conn.get_all_instances()
+    reservations = conn.get_all_reservations()
     master_nodes = []
     slave_nodes = []
     for res in reservations:
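Both one-line replacements above silence the same Boto deprecation (the "one Boto warning" from the squash commits): `get_all_instances` returns `Reservation` objects rather than instances despite its name, so Boto 2.x deprecated it in favor of the identically behaved but accurately named `get_all_reservations`. A minimal usage sketch; the region and credential handling here are assumptions, not part of the patch:

# Sketch: flattening reservations into a list of instances with the
# non-deprecated Boto 2.x call.
from boto import ec2

conn = ec2.connect_to_region("us-east-1")  # assumes credentials in env or boto config
instances = [inst
             for reservation in conn.get_all_reservations()
             for inst in reservation.instances]
print "Found %d instances" % len(instances)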
@@ -618,12 +650,6 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
     print "Done!"
 
 
-def setup_standalone_cluster(master, slave_nodes, opts):
-    slave_ips = '\n'.join([i.public_dns_name for i in slave_nodes])
-    ssh(master, opts, "echo \"%s\" > spark/conf/slaves" % (slave_ips))
-    ssh(master, opts, "/root/spark/sbin/start-all.sh")
-
-
 def setup_spark_cluster(master, opts):
     ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
     ssh(master, opts, "spark-ec2/setup.sh")
BIN  ec2/third_party/boto-2.4.1.zip (vendored): binary file removed; contents not shown.