2ddb6f262f
There is one issue with the Mimir hostname not being resolved from Spark workers; may switch to a stateful set for Mimir to resolve.
FROM docker.mimirdb.info/alpine_oraclejdk8

# Metadata
LABEL base.image="docker.mimirdb.info/alpine_oraclejdk8"
LABEL version="0.1"
LABEL software="Spark"
LABEL software.version="0.1.201801"
LABEL description="Spark image"

RUN apk add --update curl bash sed perl grep openssh
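# curl, bash, sed, perl, and grep are presumably what base_entry.sh uses for
# config templating at startup; openssh provides the sshd launched by the
# generated master.sh/worker.sh scripts below.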

#dnsmasq drill dhclient

#download hadoop
#ARG HADOOP_ARCHIVE=http://www.eu.apache.org/dist/hadoop/common/hadoop-2.7.6/hadoop-2.7.6.tar.gz
#RUN curl -sL $HADOOP_ARCHIVE | gunzip | tar -x -C /usr/local/

#or copy it
COPY hadoop-2.8.2.tar.gz /
RUN gunzip -c /hadoop-2.8.2.tar.gz | tar -x -C /usr/local/ && rm /hadoop-2.8.2.tar.gz

RUN cd /usr/local && ln -s ./hadoop-2.8.2 hadoop

#set environment variables
ENV HADOOP_PREFIX /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
ENV HADOOP_HOME=/usr/local/hadoop
ENV CLUSTER_NAME=test
ENV MASTER_IP=0
ENV AWS_ECS=true
ENV HDFS_HOST=namenode
ENV HDFS_DATA_HOST=datanode
ENV CORE_CONF_fs_defaultFS=hdfs://namenode:8020
ENV CORE_CONF_hadoop_http_staticuser_user=root
ENV CORE_CONF_hadoop_proxyuser_hue_hosts=*
ENV CORE_CONF_hadoop_proxyuser_hue_groups=*
ENV HDFS_CONF_dfs_webhdfs_enabled=true
ENV HDFS_CONF_dfs_permissions_enabled=false
ENV HDFS_CONF_dfs_client_use_datanode_hostname=true
ENV HDFS_CONF_dfs_datanode_use_datanode_hostname=true
ENV HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
ENV HDFS_CONF_dfs_datanode_address=0.0.0.0:50010
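# The CORE_CONF_* / HDFS_CONF_* variables above are assumed to be expanded into
# core-site.xml / hdfs-site.xml by base_entry.sh at container start, following
# the usual docker-hadoop convention: strip the prefix, then rewrite "___" -> "-",
# "__" -> "_", and "_" -> "." in the property name. Illustrative examples:
#   CORE_CONF_fs_defaultFS=hdfs://namenode:8020
#     -> core-site.xml: fs.defaultFS = hdfs://namenode:8020
#   HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
#     -> hdfs-site.xml: dfs.namenode.datanode.registration.ip-hostname-check = false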

#env for namenode
ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
RUN mkdir -p /hadoop/dfs/name
VOLUME /hadoop/dfs/name

# env for datanode
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data
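# Both the namenode and datanode directories get volumes because this single
# image serves both roles: master.sh (below) runs the namenode and worker.sh
# runs a datanode, so in practice only one of the two paths is populated per container.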

#VOLUME ["type=volume,source=mimir-vol,target=\/usr\/local\/source\/"]

#download spark
#ARG SPARK_ARCHIVE=http://supergsego.com/apache/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
#RUN curl -sL $SPARK_ARCHIVE | gunzip | tar -x -C /usr/local/

#or copy it
COPY spark-2.4.0-bin-without-hadoop.tgz /
RUN gunzip -c /spark-2.4.0-bin-without-hadoop.tgz | tar -x -C /usr/local/ && rm /spark-2.4.0-bin-without-hadoop.tgz
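# Note: the "-bin-without-hadoop" Spark build ships no Hadoop jars; it relies on
# SPARK_DIST_CLASSPATH being set (done in master.sh/worker.sh below via
# `$HADOOP_HOME/bin/hadoop classpath`) to pick up the Hadoop 2.8.2 install above.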

COPY hadoop-aws-2.8.2.jar aws-java-sdk-1.11.234.jar aws-java-sdk-core-1.11.234.jar aws-java-sdk-kms-1.11.234.jar \
     aws-java-sdk-s3-1.11.234.jar httpclient-4.5.3.jar joda-time-2.9.9.jar /usr/local/spark-2.4.0-bin-without-hadoop/jars/
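# These jars add s3a:// filesystem support, which the "without-hadoop" Spark
# build does not bundle; the AWS SDK jars need to be compatible with
# hadoop-aws-2.8.2. Illustrative use, once fs.s3a.* credentials are configured
# (bucket/path are placeholders):
#   spark.read.csv("s3a://some-bucket/data.csv")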

ENV SPARK_HOME /usr/local/spark-2.4.0-bin-without-hadoop
ENV PATH $PATH:$SPARK_HOME/bin
ENV INSTANCE_TYPE worker

# for high availability, e.g. ZooKeeper-based leader election
# COPY ha.conf $SPARK_HOME/conf

#For dhclient and dnsmasq
#RUN echo "option rfc3442-classless-static-routes code 121 = array of unsigned integer 8;" > /etc/dhcp/dhclient.conf && \
#    echo "" >> /etc/dhcp/dhclient.conf && \
#    echo "send host-name = gethostname();" >> /etc/dhcp/dhclient.conf && \
#    echo "request subnet-mask, broadcast-address, time-offset, routers," >> /etc/dhcp/dhclient.conf && \
#    echo " domain-name, domain-name-servers, domain-search, host-name," >> /etc/dhcp/dhclient.conf && \
#    echo " dhcp6.name-servers, dhcp6.domain-search, dhcp6.fqdn, dhcp6.sntp-servers," >> /etc/dhcp/dhclient.conf && \
#    echo " netbios-name-servers, netbios-scope, interface-mtu," >> /etc/dhcp/dhclient.conf && \
#    echo " rfc3442-classless-static-routes, ntp-servers;" >> /etc/dhcp/dhclient.conf && \
#    echo "" >> /etc/dhcp/dhclient.conf && \
#    echo "prepend domain-name-servers 127.0.0.1" >> /etc/dhcp/dhclient.conf && \
#    echo "nameserver 127.0.0.1" > /etc/resolv.conf

COPY base_entry.sh $SPARK_HOME/
COPY conf/master/spark-defaults.conf $SPARK_HOME/spark-master-defaults.conf
COPY conf/worker/spark-defaults.conf $SPARK_HOME/spark-worker-defaults.conf

EXPOSE 4040 6066 7001 7005 7015 7077 8080 8020 40000-50000 50070 22
#53 53/udp
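# Port map (assumed from the configuration above plus Spark/Hadoop defaults):
#   4040 Spark application UI, 6066 standalone REST submission, 7077 Spark master
#   RPC, 8080 Spark master web UI, 8020 HDFS namenode RPC (fs.defaultFS),
#   50070 namenode web UI, 22 sshd. 7001/7005/7015 and 40000-50000 are presumably
#   fixed driver/block-manager ports and an executor port range set in the
#   spark-defaults.conf files copied above.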

#setup ssh
RUN echo 'root:odinlab' | chpasswd
RUN sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
 && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config \
 && ssh-keygen -f /etc/ssh/ssh_host_rsa_key -N '' -t rsa \
 && ssh-keygen -f /etc/ssh/ssh_host_dsa_key -N '' -t dsa \
 && ssh-keygen -f /etc/ssh/ssh_host_ecdsa_key -N '' -t ecdsa \
 && ssh-keygen -f /etc/ssh/ssh_host_ed25519_key -N '' -t ed25519

RUN chmod u+x $SPARK_HOME/base_entry.sh && \
    echo "#!/bin/sh" > $SPARK_HOME/master.sh && \
    echo "$SPARK_HOME/base_entry.sh \$HDFS_HOST" >> $SPARK_HOME/master.sh && \
    echo "cp $SPARK_HOME/spark-master-defaults.conf $SPARK_HOME/conf/spark-defaults.conf" >> $SPARK_HOME/master.sh && \
    echo "if [ -f $SPARK_HOME/.initComplete ] || [ -f /hadoop/dfs/name/current/fsimage_0000000000000000000 ]" >> $SPARK_HOME/master.sh && \
    echo "then" >> $SPARK_HOME/master.sh && \
    echo "  echo 'already initialized...'" >> $SPARK_HOME/master.sh && \
    echo "else" >> $SPARK_HOME/master.sh && \
    echo "  $HADOOP_HOME/bin/hdfs --config $HADOOP_HOME/etc/hadoop/ namenode -format $CLUSTER_NAME -force" >> $SPARK_HOME/master.sh && \
    echo "  touch $SPARK_HOME/.initComplete" >> $SPARK_HOME/master.sh && \
    echo "fi" >> $SPARK_HOME/master.sh && \
    echo "echo 'hdfs configured'" >> $SPARK_HOME/master.sh && \
    echo "export SPARK_DIST_CLASSPATH=\$($HADOOP_HOME/bin/hadoop classpath)" >> $SPARK_HOME/master.sh && \
    echo "(nohup $HADOOP_HOME/bin/hdfs --config $HADOOP_HOME/etc/hadoop/ namenode) > hdfs.log 2>&1 &" >> $SPARK_HOME/master.sh && \
#echo "sleep 5" >> $SPARK_HOME/master.sh && \
    echo "(sleep 5 && nohup bin/spark-class org.apache.spark.deploy.master.Master -h \$HDFS_HOST) > spark.log 2>&1 &" >> $SPARK_HOME/master.sh && \
#echo "sleep 30" >> $SPARK_HOME/master.sh && \
#echo 'while true; do echo "$(cat spark.log)"; sleep 10; done;' >> $SPARK_HOME/master.sh && \
    echo "/usr/sbin/sshd &" >> $SPARK_HOME/master.sh && \
    echo 'while true; do uptime; sleep 300; done;' >> $SPARK_HOME/master.sh && \
    chmod u+x $SPARK_HOME/master.sh && \
    echo "#!/bin/sh" > $SPARK_HOME/worker.sh && \
    echo "./base_entry.sh \$HDFS_HOST" >> $SPARK_HOME/worker.sh && \
    echo "cp $SPARK_HOME/spark-worker-defaults.conf $SPARK_HOME/conf/spark-defaults.conf" >> $SPARK_HOME/worker.sh && \
    echo "export SPARK_DIST_CLASSPATH=\$($HADOOP_HOME/bin/hadoop classpath)" >> $SPARK_HOME/worker.sh && \
    echo "(nohup $HADOOP_HOME/bin/hdfs --config $HADOOP_HOME/etc/hadoop/ datanode) > hdfs.log 2>&1 &" >> $SPARK_HOME/worker.sh && \
#echo "sleep 5" >> $SPARK_HOME/worker.sh && \
    echo "(sleep 5 && nohup bin/spark-class org.apache.spark.deploy.worker.Worker spark://\$HDFS_HOST:7077) > spark.log 2>&1 &" >> $SPARK_HOME/worker.sh && \
    echo "/usr/sbin/sshd &" >> $SPARK_HOME/worker.sh && \
    echo 'while true; do uptime; sleep 300; done;' >> $SPARK_HOME/worker.sh && \
    chmod u+x $SPARK_HOME/worker.sh
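# For reference, after build-time variable expansion master.sh should look
# roughly like this (paths abbreviated, illustrative only):
#   #!/bin/sh
#   /usr/local/spark-2.4.0-bin-without-hadoop/base_entry.sh $HDFS_HOST
#   cp .../spark-master-defaults.conf .../conf/spark-defaults.conf
#   ... format the namenode once, guarded by .initComplete / an existing fsimage ...
#   export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
#   (nohup /usr/local/hadoop/bin/hdfs --config ... namenode) > hdfs.log 2>&1 &
#   (sleep 5 && nohup bin/spark-class org.apache.spark.deploy.master.Master -h $HDFS_HOST) > spark.log 2>&1 &
#   /usr/sbin/sshd &
#   while true; do uptime; sleep 300; done;
# worker.sh is analogous: it starts a datanode plus a Worker pointed at spark://$HDFS_HOST:7077.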

WORKDIR $SPARK_HOME
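# No ENTRYPOINT/CMD is set: the orchestrator is expected to launch either
# $SPARK_HOME/master.sh or $SPARK_HOME/worker.sh, presumably selected via the
# INSTANCE_TYPE variable defined above (default "worker").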