Python cell execution sandboxing Docker image; fixes for Spark image Hadoop version mismatch

master
Michael Brachmann 2020-04-10 09:45:37 -04:00
parent 10350f6c3f
commit 328a0b1447
11 changed files with 239 additions and 9 deletions

View File

@ -4,6 +4,6 @@ sudo docker build -t docker.mimirdb.info/vizier-auth ./ --build-arg UI_BRANCH=ma
#sudo docker build -t docker.mimirdb.info/spark-hadoop --build-arg SPARK_VERSION="spark-2.4.0-bin-without-hadoop" ./
sudo docker build -t docker.mimirdb.info/spark-hadoop --build-arg SPARK_VERSION="spark-2.4.4-bin-without-hadoop-scala-2.12" ./
sudo docker build -t docker.mimirdb.info/spark-hadoop-scala-2.12 --build-arg SPARK_VERSION="spark-2.4.4-bin-without-hadoop-scala-2.12" ./
sudo docker build -t docker.mimirdb.info/vizier-proxy --build-arg VIZIER_CONFIG="vizier_auth.conf" ./

View File

@ -0,0 +1,54 @@
FROM docker.mimirdb.info/alpine_openjdk8
# Image metadata
LABEL software="vizier-python-executor"
LABEL software.version="0.2.20200202"
LABEL version="0.7"
# Build-time configuration
ARG VIZIERSERVER_PYTHON_EXECUTOR_PORT=5005
ARG API_BRANCH=master
# NOTE(review): "latest" is not reproducible -- consider pinning a concrete
# Miniconda release for deterministic builds.
ARG CONDA_VERSION="latest"
# Runtime configuration
ENV PYEXECUTOR_DEBUG=False
ENV MIMIR_URL=http://vizier-auth:8089/api/v2/
ENV VIZIERSERVER_PYTHON_EXECUTOR_PORT=$VIZIERSERVER_PYTHON_EXECUTOR_PORT
ENV WSGI_LOG_LEVEL=debug
# Base OS packages (bash is needed by run_executor.sh and the conda
# installer invocations below).
RUN apk add --update --no-cache curl bash git supervisor ca-certificates
# Install Miniconda (replacement for the older full Anaconda install).
RUN curl -OsL "https://repo.anaconda.com/miniconda/Miniconda3-$CONDA_VERSION-Linux-x86_64.sh" \
 && /bin/bash Miniconda3-$CONDA_VERSION-Linux-x86_64.sh -b -p /opt/conda \
 && rm Miniconda3-$CONDA_VERSION-Linux-x86_64.sh \
 && echo 'export PATH=/opt/conda/bin:$PATH' >> /etc/profile.d/conda.sh
# Set up the web-api source tree inside a dedicated conda env.
# All extra pip packages are installed in ONE invocation instead of eight:
# fewer layers and a single dependency-resolution pass.
RUN mkdir -p /usr/local/source/ \
 && cd /usr/local/source/ \
 && /opt/conda/bin/conda create --name vizierasync python=3.8 pip \
 && source /opt/conda/bin/activate vizierasync \
 && git clone https://github.com/VizierDB/web-api-async.git web-api \
 && cd /usr/local/source/web-api \
 && git checkout -b local_$API_BRANCH origin/$API_BRANCH \
 && pip install -r requirements.txt \
 && pip install \
        gunicorn \
        futures \
        matplotlib \
        bokeh \
        geopandas \
        pandas \
        numpy \
        shapely \
 && pip install -e .
COPY main.py /usr/local/source/web-api/vizier/main.py
COPY run_executor.sh /usr/local/source/run_executor.sh
COPY supervisord.conf /etc/supervisord.conf
RUN chmod +x /usr/local/source/run_executor.sh
WORKDIR /usr/local/source
# EXPOSE is documentation only: executor HTTP port + supervisord web UI.
EXPOSE $VIZIERSERVER_PYTHON_EXECUTOR_PORT 9001
# Exec form so supervisord runs as PID 1 and receives SIGTERM on
# `docker stop` (shell form wraps it in /bin/sh -c and breaks signal
# delivery); pass the config path explicitly.
ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"]

90
python-executor/main.py Normal file
View File

@ -0,0 +1,90 @@
import sys
import os
from flask import Flask
from flask import request
from vizier.engine.packages.stream import OutputStream
from vizier.engine.packages.pycell.client.base import VizierDBClient
from vizier.engine.packages.pycell.plugins import python_cell_preload
from vizier.datastore.mimir.store import MimirDatastore
from vizier.datastore.fs.base import FileSystemDatastore
from vizier.engine.packages.pycell.processor import VARS_DBCLIENT
from multiprocessing import Process, Pipe
# Flask application object; `application` is the alias name conventionally
# looked up by WSGI servers (gunicorn serves this module as `main`).
app = Flask(__name__)
application = app
# Port for the executor HTTP service. os.environ values are strings, so
# coerce to int: without the cast the value is an int (5005) only when the
# variable is unset, and a str otherwise, which breaks app.run(port=...).
VIZIERSERVER_PYTHON_EXECUTOR_PORT = int(os.environ.get('VIZIERSERVER_PYTHON_EXECUTOR_PORT', 5005))
def set2list(obj):
    """Convert a set to a list so it can be JSON-serialized.

    Any non-set value is passed through unchanged.
    """
    return list(obj) if isinstance(obj, set) else obj
def execute_python(conn, obj):
    """Execute a user Python cell in this (child) process and send the
    result dict back over ``conn`` (a multiprocessing Pipe connection).

    Keys of ``obj`` read here:
      'datastore'   -- name of a datastore class resolved from this
                       module's globals() (e.g. 'MimirDatastore' or
                       'FileSystemDatastore')
      'basepath'    -- constructor argument for that datastore
      'datasets', 'dataobjects', 'source' -- forwarded to VizierDBClient;
                       'source' is also the code string that is exec'd
    """
    # Save the real streams so redirection can be undone in `finally`.
    out = sys.stdout
    err = sys.stderr
    stream = list()
    # Resolve the datastore class by caller-supplied name.
    # NOTE(review): trusts the request payload; an unknown name raises
    # KeyError here, before anything is sent on `conn`.
    dsklass = globals()[obj['datastore']]
    datastore = dsklass(obj['basepath'])
    client = VizierDBClient(
        datastore=datastore,
        datasets=obj['datasets'],
        source=obj['source'],
        dataobjects=obj['dataobjects']
    )
    variables = {VARS_DBCLIENT: client}
    # Redirect stdout/stderr into tagged in-memory streams so the cell's
    # output can be captured and returned to the caller.
    sys.stdout = OutputStream(tag='out', stream=stream)
    sys.stderr = OutputStream(tag='err', stream=stream)
    # Keep track of exception that is thrown by the code
    exception = None
    python_cell_preload(variables)
    # Run the Python code
    try:
        exec(obj['source'], variables)
    except Exception as ex:
        exception = ex
    finally:
        # Make sure to reverse redirection of output streams
        sys.stdout = out
        sys.stderr = err
    # Set module outputs
    # NOTE(review): prints "None" on success too -- looks like a debugging
    # leftover.
    print(str(exception))
    stdout = []
    stderr = []
    is_success = (exception is None)
    # Partition captured output by tag; any stderr output marks the run
    # as failed even if no exception was raised.
    for tag, text in stream:
        text = ''.join(text).strip()
        if tag == 'out':
            stdout.append(text)
        else:
            stderr.append(text)
            is_success = False
    if not is_success:
        stderr.append(str(exception))
    # Provenance sets are converted to lists (set2list) for JSON transport.
    conn.send({'success':is_success,
               'stdout':stdout,
               'stderr':stderr,
               'provenance':
                   {'read':set2list(client.read),
                    'write':set2list(client.write),
                    'delete':set2list(client.delete)},
               'datasets':client.datasets,
               'dataobjects':client.dataobjects})
    conn.close()
@app.route("/", methods=['POST'])
def home():
    """Execute a Python cell payload in an isolated child process.

    Expects a JSON body (see execute_python for the keys read) and returns
    the execution-result dict. The cell runs in a separate process so a
    crashing or misbehaving cell cannot take down this web server.

    Raises ValueError (-> HTTP 500) when the request body is not JSON.
    """
    if not request.json:
        raise ValueError("not json")
    obj = request.json
    print(str(obj))
    parent_conn, child_conn = Pipe()
    p = Process(target=execute_python, args=(child_conn, obj))
    p.start()
    # Wait for the result WITHOUT blocking forever: a bare recv() hangs
    # this request thread if the child dies before sending anything
    # (e.g. KeyError resolving the datastore name, or a native crash).
    return_val = None
    while True:
        if parent_conn.poll(0.1):
            return_val = parent_conn.recv()
            break
        if not p.is_alive():
            # Child exited; drain anything it managed to buffer.
            if parent_conn.poll(0):
                return_val = parent_conn.recv()
            break
    p.join()
    if return_val is None:
        # Synthesize a failure result so the caller gets a well-formed
        # response instead of a hung or half-closed connection.
        return_val = {'success': False,
                      'stdout': [],
                      'stderr': ['python executor worker terminated '
                                 'unexpectedly (exit code %s)' % p.exitcode],
                      'provenance': {'read': [], 'write': [], 'delete': []},
                      'datasets': {},
                      'dataobjects': {}}
    print(return_val)
    return return_val
# Direct execution path: Flask development server only (used when
# PYEXECUTOR_DEBUG=True in run_executor.sh); production runs under
# gunicorn, which serves `application` instead.
if __name__ == "__main__":
    app.run(debug=True, port=VIZIERSERVER_PYTHON_EXECUTOR_PORT)

View File

@ -0,0 +1,12 @@
#!/bin/bash
# Launch the python-executor web service inside the vizierasync conda env.
# PYEXECUTOR_DEBUG=True runs the Flask dev server; anything else (or
# unset) runs gunicorn bound to VIZIERSERVER_PYTHON_EXECUTOR_PORT.
echo 'activating virtualenv...'
cd /usr/local/source/web-api/
source /opt/conda/bin/activate vizierasync
cd vizier
echo 'running wsgi server...'
# Quote the variable and use POSIX `=`: the unquoted `$PYEXECUTOR_DEBUG ==`
# form is a syntax error in `[` when the variable is unset or empty.
if [ "$PYEXECUTOR_DEBUG" = "True" ]
then
    python3 main.py
else
    gunicorn -w 1 --access-logfile - --error-logfile - --log-level "$WSGI_LOG_LEVEL" --threads 8 --bind "0.0.0.0:$VIZIERSERVER_PYTHON_EXECUTOR_PORT" main
fi

View File

@ -0,0 +1,22 @@
; Supervisor configuration for the python-executor container.
[supervisord]
; Run in the foreground so supervisord stays PID 1 inside the container.
nodaemon=true
; The executor service itself, started via the conda-activating wrapper.
[program:pyexec]
command=/usr/local/source/run_executor.sh
stdout_events_enabled=true
stderr_events_enabled=true
; Forward child output to the container's stdout/stderr (docker logs);
; maxbytes=0 disables rotation, which would fail on the /dev streams.
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
; NOTE(review): web UI bound on all interfaces with a hard-coded password
; checked into source control -- consider binding 127.0.0.1 or injecting
; credentials via the environment.
[inet_http_server]
port=0.0.0.0:9001
username=root
password=odinlab
; supervisorctl talks to the HTTP server above on the loopback interface.
[supervisorctl]
serverurl=http://127.0.0.1:9001
username=root
password=odinlab

View File

@ -43,6 +43,10 @@ sudo docker rm vizier-ui
sudo docker stop vizier-proxy
sudo docker rm vizier-proxy
#proxy
#auth
sudo docker stop vizier-auth
sudo docker rm vizier-auth
#auth
sudo docker stop python-executor
sudo docker rm python-executor

View File

@ -34,9 +34,11 @@ S3_AWS_SECRET_ACCESS_KEY="dL79qJGyLkUFyYvmmg3hEn8bIklSaTkrfG0IXuki"
S3_BUCKET_NAME="vizier-data-test"
VIZIER_DATA_VOLUME="vizier-data"
#python-executor for sandboxing python cell code execution
sudo docker run -d -h python-executor --name python-executor --network spark-net -p 5005:5005 -p 9003:9001 -v $VIZIER_DATA_VOLUME:/usr/local/source/vizier-api-auth/vizier-data -e MIMIR_URL=http://vizier-auth:8089/api/v2/ docker.mimirdb.info/python-executor
#vizier-auth
sudo docker run -d -v $VIZIER_DATA_VOLUME:/usr/local/source/vizier-api-auth/vizier-data -p 5000:5000 -p 9002:9001 --expose 9000 --expose 4041 --expose 8089 --network spark-net -h vizier-auth --name vizier-auth -e DATA_STAGING_TYPE="hdfs" -e MIMIR_DATA_DIR="/usr/local/source/web-api/vizier/.vizierdb/mimir" -e REMOTE_SPARK=true -e USE_S3_VOLUME=false -e MIMIR_HOST="vizier-auth" -e SPARK_HOST=$MASTER_HOSTNAME -e RESTORE_BACKUP=false -e PULL_MIMIR=false -e AWS_ACCESS_KEY_ID=$S3_AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY=$S3_AWS_SECRET_ACCESS_KEY -e S3_BUCKET_NAME="$S3_BUCKET_NAME" -e API_SERVER="demo.$VIZIER_DOMAIN" -e VIZIERSERVER_SERVER_PORT=443 -e VIZIERSERVER_BASE_URL="https://demo.$VIZIER_DOMAIN" -e VIZIERAUTH_OAUTH_ID=e070fef69a20f246bcbc16ebc49c584dadde4753d88de0dac42eeea1cf2a2e48 -e VIZIERAUTH_OAUTH_SECRET=d5ed682921d6cb73d9a7b190173662403483bb2197f3960c9b0de325af624072 docker.mimirdb.info/vizier-auth
sudo docker run -d -v $VIZIER_DATA_VOLUME:/usr/local/source/vizier-api-auth/vizier-data -p 5000:5000 -p 9002:9001 --expose 9000 --expose 4041 --expose 8089 --network spark-net -h vizier-auth --name vizier-auth -e DATA_STAGING_TYPE="hdfs" -e MIMIR_DATA_DIR="/usr/local/source/web-api/vizier/.vizierdb/mimir" -e REMOTE_SPARK=true -e USE_S3_VOLUME=false -e MIMIR_HOST="vizier-auth" -e SPARK_HOST=$MASTER_HOSTNAME -e RESTORE_BACKUP=false -e PULL_MIMIR=false -e S3_AWS_ACCESS_KEY_ID=$S3_AWS_ACCESS_KEY_ID -e S3_AWS_SECRET_ACCESS_KEY=$S3_AWS_SECRET_ACCESS_KEY -e AWS_ACCESS_KEY_ID=$S3_AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY=$S3_AWS_SECRET_ACCESS_KEY -e S3_BUCKET_NAME="$S3_BUCKET_NAME" -e API_SERVER="demo.$VIZIER_DOMAIN" -e VIZIERSERVER_SERVER_PORT=443 -e VIZIERSERVER_BASE_URL="https://demo.$VIZIER_DOMAIN" -e AUTHSERVER_AUTH_CLIENTS="GenericOAuth20Client" -e VIZIERAUTH_OAUTH_ID=62dbc5e3ce67547a8ed874e5907f1798956f9a4403af6d20b33be2a8e460219a -e VIZIERAUTH_OAUTH_SECRET=b1005dfa188919c0ce56406fef1203c70daae50759973c5c59d826dc41c069b0 docker.mimirdb.info/vizier-auth
#proxy
sudo docker run -d -p 80:80 -p 443:443 -p 9001:9001 -h vizier-proxy --name vizier-proxy --network spark-net -e VIZIER_CONFIG="vizier_auth.conf" -e VIZIER_API_APP_PATH="/vizier-db/api/v1/" -e VIZIER_DOMAIN="$VIZIER_DOMAIN" docker.mimirdb.info/vizier-proxy

View File

@ -0,0 +1,42 @@
#!/bin/sh
SPARK_VERSION="spark-2.4.4-bin-without-hadoop-scala-2.12"
SPARK_CONTAINER="spark-hadoop-scala-2.12"
MASTER_HOSTNAME="namenode"
MASTER_CONTAINER=`sudo docker run --restart always -d -v data-auth:/tmp/data --name $MASTER_HOSTNAME -h $MASTER_HOSTNAME --network spark-net -p 222:22 -p 4040:4040 -p 6066:6066 -p 7077:7077 -p 8020:8020 -p 8080:8080 -p 50070:50070 --expose 7001 --expose 7002 --expose 7003 --expose 7004 --expose 7005 --expose 7006 --expose 7077 --expose 6066 --expose 4040 --expose 8020 --expose 50070 -e "MASTER=spark://namenode:7077" -e "SPARK_CONF_DIR=/conf" -e "SPARK_PUBLIC_DNS=127.0.0.1" -e "LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/" -e "SPARK_EXECUTOR_MEMORY=8g" -e "SPARK_DAEMON_MEMORY=8g" -e "SPARK_DRIVER_MEMORY=8g" -e "SPARK_WORKER_MEMORY=8g" -e "HDFS_CONF_dfs_client_use_datanode_hostname=true" -e "AWS_ECS=false" docker.mimirdb.info/$SPARK_CONTAINER /usr/local/$SPARK_VERSION/master.sh`
echo "master container id: $MASTER_CONTAINER"
START_PORT=7001
END_PORT=7006
WORKER_PORT=8882
WORKER_WEBUI_PORT=8082
HOSTNAME="datanode"
DATANODE_PORT=50010
sudo docker run --restart always -d -v data-auth:/tmp/data -h $HOSTNAME --name $HOSTNAME --network spark-net --link $MASTER_CONTAINER -p $WORKER_WEBUI_PORT:8082 --expose $WORKER_PORT --expose $DATANODE_PORT -e "SPARK_CONF_DIR=/conf" -e "SPARK_PUBLIC_DNS=127.0.0.1" -e "SPARK_WORKER_CORES=4" -e "SPARK_WORKER_PORT=$WORKER_PORT" -e "SPARK_WORKER_WEBUI_PORT=$WORKER_WEBUI_PORT" -e "LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/" -e "HDFS_DATA_HOST=$HOSTNAME" -e "HDFS_HOST=namenode" -e "HDFS_CONF_dfs_datanode_address=0.0.0.0:$DATANODE_PORT" -e "SPARK_EXECUTOR_MEMORY=8g" -e "SPARK_DAEMON_MEMORY=8g" -e "SPARK_DRIVER_MEMORY=8g" -e "SPARK_WORKER_MEMORY=8g" -e "HDFS_CONF_dfs_client_use_datanode_hostname=true" -e "AWS_ECS=false" docker.mimirdb.info/$SPARK_CONTAINER /usr/local/$SPARK_VERSION/worker.sh
WORKER_WEBUI_PORT=8083
HOSTNAME="datanode2"
sudo docker run --restart always -d -v data-auth:/tmp/data -h $HOSTNAME --name $HOSTNAME --network spark-net --link $MASTER_CONTAINER -p $WORKER_WEBUI_PORT:8082 --expose $WORKER_PORT --expose $DATANODE_PORT -e "SPARK_CONF_DIR=/conf" -e "SPARK_PUBLIC_DNS=127.0.0.1" -e "SPARK_WORKER_CORES=4" -e "SPARK_WORKER_PORT=$WORKER_PORT" -e "SPARK_WORKER_WEBUI_PORT=$WORKER_WEBUI_PORT" -e "LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/" -e "HDFS_DATA_HOST=$HOSTNAME" -e "HDFS_HOST=namenode" -e "HDFS_CONF_dfs_datanode_address=0.0.0.0:$DATANODE_PORT" -e "SPARK_EXECUTOR_MEMORY=8g" -e "SPARK_DAEMON_MEMORY=8g" -e "SPARK_DRIVER_MEMORY=8g" -e "SPARK_WORKER_MEMORY=8g" -e "HDFS_CONF_dfs_client_use_datanode_hostname=true" -e "AWS_ECS=false" docker.mimirdb.info/$SPARK_CONTAINER /usr/local/$SPARK_VERSION/worker.sh
WORKER_WEBUI_PORT=8084
HOSTNAME="datanode3"
sudo docker run --restart always -d -v data-auth:/tmp/data -h $HOSTNAME --name $HOSTNAME --network spark-net --link $MASTER_CONTAINER -p $WORKER_WEBUI_PORT:8082 --expose $WORKER_PORT --expose $DATANODE_PORT -e "SPARK_CONF_DIR=/conf" -e "SPARK_PUBLIC_DNS=127.0.0.1" -e "SPARK_WORKER_CORES=4" -e "SPARK_WORKER_PORT=$WORKER_PORT" -e "SPARK_WORKER_WEBUI_PORT=$WORKER_WEBUI_PORT" -e "LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/" -e "HDFS_DATA_HOST=$HOSTNAME" -e "HDFS_HOST=namenode" -e "HDFS_CONF_dfs_datanode_address=0.0.0.0:$DATANODE_PORT" -e "SPARK_EXECUTOR_MEMORY=8g" -e "SPARK_DAEMON_MEMORY=8g" -e "SPARK_DRIVER_MEMORY=8g" -e "SPARK_WORKER_MEMORY=8g" -e "HDFS_CONF_dfs_client_use_datanode_hostname=true" -e "AWS_ECS=false" docker.mimirdb.info/$SPARK_CONTAINER /usr/local/$SPARK_VERSION/worker.sh
WORKER_WEBUI_PORT=8085
HOSTNAME="datanode4"
sudo docker run --restart always -d -v data-auth:/tmp/data -h $HOSTNAME --name $HOSTNAME --network spark-net --link $MASTER_CONTAINER -p $WORKER_WEBUI_PORT:8082 --expose $WORKER_PORT --expose $DATANODE_PORT -e "SPARK_CONF_DIR=/conf" -e "SPARK_PUBLIC_DNS=127.0.0.1" -e "SPARK_WORKER_CORES=4" -e "SPARK_WORKER_PORT=$WORKER_PORT" -e "SPARK_WORKER_WEBUI_PORT=$WORKER_WEBUI_PORT" -e "LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/" -e "HDFS_DATA_HOST=$HOSTNAME" -e "HDFS_HOST=namenode" -e "HDFS_CONF_dfs_datanode_address=0.0.0.0:$DATANODE_PORT" -e "SPARK_EXECUTOR_MEMORY=8g" -e "SPARK_DAEMON_MEMORY=8g" -e "SPARK_DRIVER_MEMORY=8g" -e "SPARK_WORKER_MEMORY=8g" -e "HDFS_CONF_dfs_client_use_datanode_hostname=true" -e "AWS_ECS=false" docker.mimirdb.info/$SPARK_CONTAINER /usr/local/$SPARK_VERSION/worker.sh
VIZIER_DOMAIN="vizierdb.info"
S3_AWS_ACCESS_KEY_ID="AKIAJ7MLFSPYLYG47ARQ"
S3_AWS_SECRET_ACCESS_KEY="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"
S3_BUCKET_NAME="vizier-data-ub"
#python-executor for sandboxing python cell code execution
sudo docker run --restart always -d -h python-executor --name python-executor --network spark-net -p 5005:5005 -p 9003:9001 --mount type=bind,source=/home/csestaff/mrb24/docker-mounts/vizier-api-auth,target=/usr/local/source/vizier-api-auth/vizier-data -e MIMIR_URL=http://vizier-auth:8089/api/v2/ docker.mimirdb.info/python-executor
#vizier-auth
sudo docker run --restart always -d --mount type=bind,source=/home/csestaff/mrb24/docker-mounts/vizier-api-auth,target=/usr/local/source/vizier-api-auth/vizier-data -p 5000:5000 -p 9002:9001 --expose 9000 --expose 4041 --expose 8089 --network spark-net -h vizier-auth --name vizier-auth -e DATA_STAGING_TYPE="hdfs" -e MIMIR_DATA_DIR="/usr/local/source/vizier-api-auth/vizier-data" -e REMOTE_SPARK=true -e USE_S3_VOLUME=false -e MIMIR_HOST="vizier-auth" -e SPARK_HOST=$MASTER_HOSTNAME -e RESTORE_BACKUP=false -e PULL_MIMIR=false -e AWS_ACCESS_KEY_ID=$S3_AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY=$S3_AWS_SECRET_ACCESS_KEY -e AWS_ACCESS_KEY_ID=$S3_AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY=$S3_AWS_SECRET_ACCESS_KEY -e S3_BUCKET_NAME="$S3_BUCKET_NAME" -e API_SERVER="demo.$VIZIER_DOMAIN" -e VIZIERSERVER_SERVER_PORT=443 -e VIZIERSERVER_BASE_URL="https://demo.$VIZIER_DOMAIN" -e AUTHSERVER_AUTH_CLIENTS="GenericOAuth20Client" -e VIZIERAUTH_OAUTH_ID=e070fef69a20f246bcbc16ebc49c584dadde4753d88de0dac42eeea1cf2a2e48 -e VIZIERAUTH_OAUTH_SECRET=d5ed682921d6cb73d9a7b190173662403483bb2197f3960c9b0de325af624072 docker.mimirdb.info/vizier-auth
#runBackup --restore --sparkHost namenode --dataStagingType s3 --overwriteJars -X LOG LOGM remoteSpark

View File

@ -3,9 +3,9 @@ FROM docker.mimirdb.info/alpine_openjdk8
# Metadata
LABEL base.image="docker.mimirdb.info/alpine_openjdk8"
LABEL version="0.1"
LABEL version="0.4"
LABEL software="Spark"
LABEL software.version="0.1.201801"
LABEL software.version="0.1.202004"
LABEL description="Spark image"
RUN apk add --update curl bash sed perl grep openssh
@ -67,8 +67,6 @@ ENV SPARK_VERSION=$SPARK_VERSION
#or copy it
COPY $SPARK_VERSION.tgz /
RUN gunzip -c /$SPARK_VERSION.tgz | tar -x -C /usr/local/ && rm /$SPARK_VERSION.tgz
COPY hadoop-aws-2.8.2.jar aws-java-sdk-1.11.234.jar aws-java-sdk-core-1.11.234.jar aws-java-sdk-kms-1.11.234.jar \
aws-java-sdk-s3-1.11.234.jar hadoop-aws-2.8.2.jar httpclient-4.5.3.jar joda-time-2.9.9.jar /usr/local/$SPARK_VERSION/jars/
ENV SPARK_HOME /usr/local/$SPARK_VERSION
ENV PATH $PATH:$SPARK_HOME/bin

View File

@ -3,7 +3,7 @@ FROM docker.mimirdb.info/alpine_openjdk8
# Metadata
LABEL base.image="docker.mimirdb.info/alpine_openjdk8"
LABEL version="0.4"
LABEL version="0.5.8"
LABEL software="Vizier Auth"
LABEL software.version="0.2.20200202"
LABEL description="an open source, provenance aware, iterative data cleaning tool"
@ -54,6 +54,12 @@ ENV REMOTE_SPARK=false
#gram
ENV VIZIERAUTH_OAUTH_ID=e554e37483640ccc73324b5620376601843aadfa37d972f094ea13d02df90a0f
ENV VIZIERAUTH_OAUTH_SECRET=f385531e40fb5268397d222c6c26a611cdd906510d3ee4ed8ad013d33b2c4102
#shibboleth and gitlab
ENV AUTHSERVER_AUTH_CLIENTS="SAML2Client,GenericOAuth20Client"
ENV SANDBOX_PYTHON_EXECUTION=True
ENV SANDBOX_PYTHON_URL=http://python-executor:5005/
ENV GITLAB_OAUTH_HOST="gitlab.odin.cse.buffalo.edu"
#have vizier-auth scala code run web-api process (true)
ENV RUN_WEB_API=false
@ -198,7 +204,7 @@ RUN cd /usr/local/source/ \
&& pip install -e . \
&& mkdir -p /usr/local/source/web-api/.vizierdb
LABEL pullui="1"
#setup production web-ui branch
RUN mkdir -p /usr/local/source/ \
&& cd /usr/local/source/ \