# syntax=docker/dockerfile:1

# HADOOP_VERSION selects the tag of the Hadoop base image. Declared before FROM,
# so it is visible only in the FROM line unless redeclared inside the stage.
ARG HADOOP_VERSION
FROM registry.gitlab.com/rychly-edu/docker/docker-hadoop-base:${HADOOP_VERSION:-alpine-latest}

# MAINTAINER is deprecated (hadolint DL4000); use the conventional OCI label instead.
LABEL maintainer="Marek Rychly <marek.rychly@gmail.com>"

# Spark release to install, see https://spark.apache.org/downloads.html
ARG SPARK_VERSION=2.4.3

# Optional host-side directory with pre-downloaded artefacts (tarball, .asc, KEYS);
# when set, downloads below are replaced by copies from this cache.
ARG DOWNLOAD_CACHE
#ARG APACHE_ORIG="http://www-eu.apache.org/dist"	# mirror disabled as it does not have the old versions, just the latest
# APACHE_ORIG serves the GPG KEYS file and the .asc signatures; APACHE_MIRROR serves
# the release tarballs. Both use HTTPS — fetching the trust anchors (KEYS) over
# plaintext HTTP would defeat the signature verification performed below.
ARG APACHE_ORIG="https://archive.apache.org/dist"
ARG APACHE_MIRROR="https://archive.apache.org/dist"

# Spark installation prefix (populated from the release tarball below).
ENV SPARK_HOME="/opt/spark"
# Declared in a separate ENV instruction so that ${SPARK_HOME} is already expanded.
ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
# SPARK_SLAVES_CONF: file listing worker hosts; SPARK_DEF_CONF: spark-defaults.conf;
# SPARK_ENV_CONF: environment script generated in the RUN layer below;
# SPARK_USER: unprivileged account created below that owns ${SPARK_HOME};
# ARROW_LIBHDFS_DIR: where Apache Arrow looks for the native libhdfs library
# (HADOOP_HOME is provided by the base image — not visible here, confirm there).
ENV \
SPARK_SLAVES_CONF="${SPARK_CONF_DIR}/slaves" \
SPARK_DEF_CONF="${SPARK_CONF_DIR}/spark-defaults.conf" \
SPARK_ENV_CONF="${SPARK_CONF_DIR}/spark-env.sh" \
SPARK_USER="spark" \
ARROW_LIBHDFS_DIR="${HADOOP_HOME}/lib/native"

# Helper scripts copied to /, including /entrypoint.sh and /healthcheck.sh
# referenced at the end of this Dockerfile; made executable in the RUN layer below.
COPY scripts /

# Single RUN layer by design: install build/runtime packages, download and
# GPG-verify Spark, wire up Python/Arrow, create the runtime user, and clean up
# — all in one layer so temporary files and gnupg never persist in the image.
RUN true \
# make the scripts executable
&& chmod 755 /*.sh \
# sed: scripts to set Spark properties in files require GNU sed (the usage of busybox sed may result into incorrect outputs)
# procps: Spark shell scripts require ps, they are not compatible with Busybox ps (unknown parameter 'p')
# coreutils: Spark shell scripts require nohup, they are not compatible with Busybox nohup (cannot interpret parameter '--')
# python2, python3: Spark workers require the python binary to execute PySpark tasks (a particular python binary can be set by the spark.pyspark.python Spark conf property)
# py2-pip py3-pip: Spark workers may require to install additional Python packages (also see manylinux compatibility below)
# boost-filesystem boost-regex py2-futures py-enum34 py2-six py2-numpy py3-six py3-numpy: Dependencies of Apache Arrow and PyArrows for Python 2 and Python 3
# ca-certificates: for wget to be able to use https://
&& apk add --no-cache --update \
ca-certificates gnupg attr sed procps coreutils python2 py2-pip python3 py3-pip \
boost-filesystem boost-regex py2-futures py-enum34 py2-six py2-numpy py3-six py3-numpy \
# wait-for: Spark history server needs to wait for a HDFS namenode to load/save the history
# NOTE(review): fetched from the master branch with no pinned revision or checksum,
# so rebuilds are not reproducible — consider pinning a commit SHA.
&& wget -O /usr/bin/wait-for "https://raw.githubusercontent.com/eficode/wait-for/master/wait-for" && chmod 755 /usr/bin/wait-for \
\
# make Python compatible with manylinux wheels and create some useful symlinks that are expected to exist for default Python version
# see https://github.com/insightfulsystems/alpine-python and https://issues.apache.org/jira/browse/ARROW-2058
# PYTHON_VER becomes the "major.minor" of the first python binary found on PATH
&& PYTHON_VER=$(`which python python2 python3 | head -1` --version 2>&1 | sed 's/^.*\([0-9]\+\.[0-9]\+\)\..*$/\1/') \
# claim manylinux1 compatibility so pip accepts manylinux wheels on Alpine/musl
&& echo "manylinux1_compatible = True" | tee $(ls -d /usr/lib/python2.* | head -1)/_manylinux.py $(ls -d /usr/lib/python3.* | head -1)/_manylinux.py \
&& ln -sf locale.h /usr/include/xlocale.h \
&& ln -sf easy_install-${PYTHON_VER} /usr/bin/easy_install \
&& ln -sf idle${PYTHON_VER} /usr/bin/idle \
&& ln -sf pydoc${PYTHON_VER} /usr/bin/pydoc \
&& ln -sf python${PYTHON_VER} /usr/bin/python \
&& ln -sf python-config${PYTHON_VER} /usr/bin/python-config \
&& ln -sf pip${PYTHON_VER} /usr/bin/pip \
\
# Apache Arrow and PyArrow, see https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html
# prebuilt Alpine Arrow/PyArrow artefacts are fetched from a GitLab CI job, once per Python major version
&& wget -O /tmp/pyarrow-python2.zip "https://gitlab.com/rychly-edu/pyarrow-alpine/-/jobs/artifacts/master/download?job=build:python2" && unzip /tmp/pyarrow-python2.zip -d /tmp \
&& tar xzf /tmp/distributions/alpine*-python2-arrow-*.linux-x86_64.tar.gz -C / \
&& python2 -m easy_install /tmp/distributions/alpine*-pyarrow-*-py2.?-linux-x86_64.egg \
# suffix the per-interpreter native library so the python3 variant does not overwrite it
&& for I in /usr/local/lib/libarrow_python.so.*.?.?; do mv -v ${I} ${I}-python2; done \
&& wget -O /tmp/pyarrow-python3.zip "https://gitlab.com/rychly-edu/pyarrow-alpine/-/jobs/artifacts/master/download?job=build:python3" && unzip /tmp/pyarrow-python3.zip -d /tmp \
&& tar xzf /tmp/distributions/alpine*-python3-arrow-*.linux-x86_64.tar.gz -C / \
&& python3 -m easy_install /tmp/distributions/alpine*-pyarrow-*-py3.?-linux-x86_64.egg \
&& for I in /usr/local/lib/libarrow_python.so.*.?.?; do mv -v ${I} ${I}-python3; done \
# default will be PyArrow for Python 2 (it is necessary to symlink both libarrow_python.so.14.1.0 and libarrow_python.so.14 as the second points to *-python3 for some reasons)
&& for I in /usr/local/lib/libarrow_python.so.*-python2; do ln -vfs ${I##*/} ${I%-python?}; ln -vfs ${I##*/} ${I%.?.?-python?}; done \
\
# download keys and trust them (from DOWNLOAD_CACHE when provided, otherwise from APACHE_ORIG)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/spark.KEYS" /tmp \
	|| wget -O /tmp/spark.KEYS "${APACHE_ORIG}/spark/KEYS" ) \
&& gpg --import /tmp/spark.KEYS \
# "trust-model always" skips the web-of-trust ownertrust check so the freshly
# imported release keys can be used; the cryptographic signature validity itself
# is still checked by "gpg --verify" below
&& echo "trust-model always" > ~/.gnupg/gpg.conf \
\
# download the package
&& echo "Downloading ${APACHE_MIRROR}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz ..." \
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" /tmp \
	|| wget -O /tmp/spark-${SPARK_VERSION}-bin-without-hadoop.tgz "${APACHE_MIRROR}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" ) \
\
# download and verify signature (a failed verification aborts the build here)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz.asc" /tmp \
	|| wget -O /tmp/spark-${SPARK_VERSION}-bin-without-hadoop.tgz.asc "${APACHE_ORIG}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz.asc" ) \
&& for SIG in /tmp/*.asc; do gpg --verify "${SIG}" "${SIG%.asc}"; done \
\
# extract the package and remove garbage (Windows .cmd scripts, sample data and examples)
&& mkdir -p "${SPARK_HOME}" \
&& tar -xzf "/tmp/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" -C "${SPARK_HOME}" --strip-components 1 \
&& find "${SPARK_HOME}" -name '*.cmd' -delete \
&& rm -rf "${SPARK_HOME}/data" "${SPARK_HOME}/examples" \
\
# fix ISSUE: os::commit_memory failed; error=Operation not permitted
# (AUFS does not support xattr, so we need to set the flag once again after execution of the container in its entrypoint)
# https://en.wikibooks.org/wiki/Grsecurity/Application-specific_Settings#Java
&& setfattr -n user.pax.flags -v em "${JAVA_HOME}/bin/java" "${JAVA_HOME}/jre/bin/java" \
\
# integrate with Java and Hadoop: generate spark-env.sh so Spark finds the JVM
# and puts the Hadoop classpath (from the base image) on its classpath
&& echo '#!/bin/sh' > ${SPARK_ENV_CONF} \
&& echo "export JAVA_HOME=${JAVA_HOME}" >> ${SPARK_ENV_CONF} \
&& echo "export SPARK_DIST_CLASSPATH=\$(${HADOOP_HOME}/bin/hadoop classpath)" >> ${SPARK_ENV_CONF} \
&& chmod 755 ${SPARK_ENV_CONF} \
\
# set up permissions: system user/group without password or a real home directory,
# owning the Spark installation
&& addgroup -S "${SPARK_USER}" \
&& adduser -h "${SPARK_HOME}" -g "Apache Spark" -s /bin/sh -G "${SPARK_USER}" -S -D -H "${SPARK_USER}" \
&& chown -R "${SPARK_USER}:${SPARK_USER}" "${SPARK_HOME}" \
\
# set path for the shell
&& echo '#!/bin/sh' > /etc/profile.d/path-spark.sh \
&& echo "export PATH=\"\${PATH}:${SPARK_HOME}/bin\"" >> /etc/profile.d/path-spark.sh \
&& chmod 755 /etc/profile.d/path-spark.sh \
\
# clean up in the same layer: gnupg was only needed for signature verification
&& apk del gnupg \
&& rm -rf /tmp/* /var/tmp/* /var/cache/apk/* /root/.gnupg

# Exec form: the entrypoint runs directly (no /bin/sh -c wrapper), so it is
# PID 1 and receives signals such as SIGTERM from "docker stop".
# NOTE(review): no USER directive — the container starts as root; presumably
# /entrypoint.sh drops privileges to ${SPARK_USER} (it also needs root to
# re-apply the xattr PAX flags mentioned above) — confirm in scripts/.
ENTRYPOINT ["/entrypoint.sh"]

# Exec form for the probe as well (BuildKit check JSONArgsRecommended);
# default interval/timeout/retries are deliberately left unchanged.
HEALTHCHECK CMD ["/healthcheck.sh"]
