# HADOOP_VERSION must be declared before FROM to be usable in the FROM line
# (an ARG before FROM is visible only to FROM; defaults to the latest base tag).
ARG HADOOP_VERSION
FROM registry.gitlab.com/rychly-edu/docker/docker-hadoop-base:${HADOOP_VERSION:-latest}

# MAINTAINER is deprecated (hadolint DL4000); LABEL is the supported replacement.
LABEL maintainer="Marek Rychly <marek.rychly@gmail.com>"

# Spark release to install, see https://spark.apache.org/downloads.html
ARG SPARK_VERSION=2.4.3

# Optional build-context directory with pre-downloaded KEYS/tarball/signature
# files; when set, downloads below are skipped in favour of the cached copies.
ARG DOWNLOAD_CACHE
#ARG APACHE_ORIG="http://www-eu.apache.org/dist"	# mirror disabled as it does not have the old versions, just the latest
# APACHE_ORIG serves the KEYS and .asc signatures (must be the canonical
# archive, which keeps old releases); APACHE_MIRROR serves the large tarball.
ARG APACHE_ORIG="http://archive.apache.org/dist"
ARG APACHE_MIRROR="https://archive.apache.org/dist"

# Spark installation layout. SPARK_HOME and SPARK_CONF_DIR are set in
# separate ENV instructions on purpose: values defined within a single ENV
# instruction are not visible to other keys of that same instruction, so the
# later variables can only expand ${SPARK_HOME}/${SPARK_CONF_DIR} because
# those were committed by the preceding ENV lines.
ENV SPARK_HOME="/opt/spark"
ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
ENV \
SPARK_SLAVES_CONF="${SPARK_CONF_DIR}/slaves" \
SPARK_DEF_CONF="${SPARK_CONF_DIR}/spark-defaults.conf" \
SPARK_ENV_CONF="${SPARK_CONF_DIR}/spark-env.sh" \
SPARK_USER="spark" \
ARROW_LIBHDFS_DIR="${HADOOP_HOME}/lib/native"

# Entry-point/health-check/helper scripts are placed in the image root;
# they are made executable in the RUN step below.
COPY scripts /

# Single RUN layer: install tooling, download + GPG-verify Spark, integrate it
# with the base image's Hadoop, create the service user, and clean up in the
# same layer so none of the temporary files persist in the image.
RUN true \
# make the scripts executable
&& chmod 755 /*.sh \
# python(2)-minimal, python3-minimal: Spark workers require the python binary to execute PySpark tasks (a particular python binary can be set by the spark.pyspark.python Spark conf property)
# python(2)-pip python3-pip: Spark workers may require to install additional Python packages
# netcat: required for wait-for, otherwise it won't be able to call it and will fail on timeout
# --no-install-recommends (hadolint DL3015): keep the image minimal; every
# package actually needed is listed explicitly
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends gnupg \
python-minimal python-pip python3-minimal python3-pip netcat \
# wait-for: Spark history server needs to wait for a HDFS namenode to load/save the history
&& curl -LfsSo /usr/bin/wait-for "https://raw.githubusercontent.com/eficode/wait-for/master/wait-for" && chmod 755 /usr/bin/wait-for \
\
# Apache Arrow and PyArrow, see https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html
# (pyarrow is not debian's python-arrow or python3-arrow, we must install it by pip)
&& python2 -m pip install --no-cache-dir pyarrow \
&& python3 -m pip install --no-cache-dir pyarrow \
\
# download keys and trust them
# (cache-or-download idiom: use the cached file when DOWNLOAD_CACHE is set,
# fall back to curl otherwise; note a failing cp also falls through to curl)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/spark.KEYS" /tmp \
	|| curl -LfsSo /tmp/spark.KEYS "${APACHE_ORIG}/spark/KEYS" ) \
&& gpg --import /tmp/spark.KEYS \
&& echo "trust-model always" > ~/.gnupg/gpg.conf \
\
# download the package
&& echo "Downloading ${APACHE_MIRROR}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz ..." \
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" /tmp \
	|| curl -LfsSo /tmp/spark-${SPARK_VERSION}-bin-without-hadoop.tgz "${APACHE_MIRROR}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" ) \
\
# download and verify signature (fails the build on a bad signature)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz.asc" /tmp \
	|| curl -LfsSo /tmp/spark-${SPARK_VERSION}-bin-without-hadoop.tgz.asc "${APACHE_ORIG}/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz.asc" ) \
&& for SIG in /tmp/*.asc; do gpg --verify "${SIG}" "${SIG%.asc}"; done \
\
# extract the package and remove garbage (Windows .cmd scripts, sample data)
&& mkdir -p "${SPARK_HOME}" \
&& tar -xzf "/tmp/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" -C "${SPARK_HOME}" --strip-components 1 \
&& find "${SPARK_HOME}" -name '*.cmd' -delete \
&& rm -rf "${SPARK_HOME}/data" "${SPARK_HOME}/examples" \
\
# integrate with Java and Hadoop: the "without-hadoop" build needs
# SPARK_DIST_CLASSPATH populated from the base image's hadoop binary
&& echo '#!/bin/sh' > "${SPARK_ENV_CONF}" \
&& echo "export JAVA_HOME=${JAVA_HOME}" >> "${SPARK_ENV_CONF}" \
&& echo "export SPARK_DIST_CLASSPATH=\$(${HADOOP_HOME}/bin/hadoop classpath)" >> "${SPARK_ENV_CONF}" \
&& chmod 755 "${SPARK_ENV_CONF}" \
\
# set up permissions for the dedicated system user/group
&& addgroup --system "${SPARK_USER}" \
&& adduser --system --home "${SPARK_HOME}" --gecos "Apache Spark" --shell /bin/sh --ingroup "${SPARK_USER}" --disabled-login --no-create-home "${SPARK_USER}" \
&& chown -R "${SPARK_USER}:${SPARK_USER}" "${SPARK_HOME}" \
\
# set path for the shell
&& echo '#!/bin/sh' > /etc/profile.d/path-spark.sh \
&& echo "export PATH=\"\${PATH}:${SPARK_HOME}/bin\"" >> /etc/profile.d/path-spark.sh \
&& chmod 755 /etc/profile.d/path-spark.sh \
\
# clean up: gnupg was only needed for signature verification; purge it
# (including config files) together with its now-unused dependencies
&& apt-get -y purge --auto-remove gnupg \
&& apt-get clean \
&& rm -rf /tmp/* /var/tmp/* /var/lib/apt/lists/* /root/.gnupg

# Exec-form entrypoint: /entrypoint.sh (from the COPY'd scripts) runs as PID 1
# and receives signals directly from `docker stop`.
# NOTE(review): no USER directive — the container starts as root; presumably
# /entrypoint.sh drops privileges to ${SPARK_USER}, but that script is not
# visible here — confirm before hardening.
ENTRYPOINT ["/entrypoint.sh"]

# Liveness probe script from the COPY'd scripts; uses Docker's default
# interval/timeout/retries.
HEALTHCHECK CMD /healthcheck.sh
