Skip to content

Commit b884daa

Browse files
florianverhein authored and srowen committed
[SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py
and, by extension, the ami-list.

Useful for using alternate spark-ec2 repos or branches.

Author: Florian Verhein <[email protected]>

Closes apache#4385 from florianverhein/master and squashes the following commits:

7e2b4be [Florian Verhein] [SPARK-5611] [EC2] typo
8b653dc [Florian Verhein] [SPARK-5611] [EC2] Enforce only supporting spark-ec2 forks from github, log improvement
bc4b0ed [Florian Verhein] [SPARK-5611] allow spark-ec2 repos with different names
8b5c551 [Florian Verhein] improve option naming, fix logging, fix lint failing, add guard to enforce spark-ec2
7724308 [Florian Verhein] [SPARK-5611] [EC2] fixes
b42b68c [Florian Verhein] [SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py
1 parent f48199e commit b884daa

File tree

1 file changed

+32
-5
lines changed

1 file changed

+32
-5
lines changed

ec2/spark_ec2.py

Lines changed: 32 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -62,10 +62,10 @@
6262

6363
DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
6464
DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark"
65-
MESOS_SPARK_EC2_BRANCH = "branch-1.3"
6665

67-
# A URL prefix from which to fetch AMI information
68-
AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH)
66+
# Default location to get the spark-ec2 scripts (and ami-list) from
67+
DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2"
68+
DEFAULT_SPARK_EC2_BRANCH = "branch-1.3"
6969

7070

7171
def setup_boto():
@@ -147,6 +147,14 @@ def parse_args():
147147
"--spark-git-repo",
148148
default=DEFAULT_SPARK_GITHUB_REPO,
149149
help="Github repo from which to checkout supplied commit hash (default: %default)")
150+
parser.add_option(
151+
"--spark-ec2-git-repo",
152+
default=DEFAULT_SPARK_EC2_GITHUB_REPO,
153+
help="Github repo from which to checkout spark-ec2 (default: %default)")
154+
parser.add_option(
155+
"--spark-ec2-git-branch",
156+
default=DEFAULT_SPARK_EC2_BRANCH,
157+
help="Github repo branch of spark-ec2 to use (default: %default)")
150158
parser.add_option(
151159
"--hadoop-major-version", default="1",
152160
help="Major version of Hadoop (default: %default)")
@@ -333,7 +341,12 @@ def get_spark_ami(opts):
333341
print >> stderr,\
334342
"Don't recognize %s, assuming type is pvm" % opts.instance_type
335343

336-
ami_path = "%s/%s/%s" % (AMI_PREFIX, opts.region, instance_type)
344+
# URL prefix from which to fetch AMI information
345+
ami_prefix = "{r}/{b}/ami-list".format(
346+
r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1),
347+
b=opts.spark_ec2_git_branch)
348+
349+
ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type)
337350
try:
338351
ami = urllib2.urlopen(ami_path).read().strip()
339352
print "Spark AMI: " + ami
@@ -650,12 +663,15 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
650663

651664
# NOTE: We should clone the repository before running deploy_files to
652665
# prevent ec2-variables.sh from being overwritten
666+
print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format(
667+
r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)
653668
ssh(
654669
host=master,
655670
opts=opts,
656671
command="rm -rf spark-ec2"
657672
+ " && "
658-
+ "git clone https://github.com/mesos/spark-ec2.git -b {b}".format(b=MESOS_SPARK_EC2_BRANCH)
673+
+ "git clone {r} -b {b} spark-ec2".format(r=opts.spark_ec2_git_repo,
674+
b=opts.spark_ec2_git_branch)
659675
)
660676

661677
print "Deploying files to master..."
@@ -1038,6 +1054,17 @@ def real_main():
10381054
print >> stderr, "ebs-vol-num cannot be greater than 8"
10391055
sys.exit(1)
10401056

1057+
# Prevent breaking ami_prefix (/, .git and startswith checks)
1058+
# Prevent forks with non spark-ec2 names for now.
1059+
if opts.spark_ec2_git_repo.endswith("/") or \
1060+
opts.spark_ec2_git_repo.endswith(".git") or \
1061+
not opts.spark_ec2_git_repo.startswith("https://github.com") or \
1062+
not opts.spark_ec2_git_repo.endswith("spark-ec2"):
1063+
print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \
1064+
"trailing / or .git. " \
1065+
"Furthermore, we currently only support forks named spark-ec2."
1066+
sys.exit(1)
1067+
10411068
try:
10421069
conn = ec2.connect_to_region(opts.region)
10431070
except Exception as e:

0 commit comments

Comments
 (0)