# https://gitlab.com/rychly-edu/docker/docker-adoptopenjdk
# the latest custom image based on Alpine 3.10; later images or official images are based on Alpine 3.11 or 3.12 that do not provide all Python2 packages required by PySpark apps
# NOTE(review): the tag suffix reads "slime" rather than "slim" -- confirm this is the actual tag published by the upstream registry
FROM registry.gitlab.com/rychly-edu/docker/docker-adoptopenjdk:jdk-8u-alpine-3.10-slime

# MAINTAINER is deprecated (hadolint DL4000); record the maintainer as a label instead
LABEL maintainer="Marek Rychly <marek.rychly@gmail.com>"

# Hadoop release to install; see https://hadoop.apache.org/releases.html
ARG HADOOP_VERSION=3.2.0

# optional build-context directory with pre-downloaded artifacts (hadoop.KEYS,
# release tarball, and its .asc signature); when non-empty, the RUN step below
# copies from it instead of downloading from the Apache sites
ARG DOWNLOAD_CACHE
# primary Apache distribution site: used for the KEYS file and the .asc signature
# (signatures must come from the original site, not a mirror)
ARG APACHE_ORIG="http://www-eu.apache.org/dist"
#ARG APACHE_MIRROR="ftp://mirror.hosting90.cz/apache"
# mirror/archive used for the (large) release tarball itself
ARG APACHE_MIRROR="http://archive.apache.org/dist"

# Hadoop installation root; the release tarball is extracted here (see RUN below)
ENV HADOOP_HOME="/opt/hadoop"
# kept as a separate ENV instruction: a variable assigned in one ENV instruction
# cannot be referenced by other assignments within that same instruction
ENV HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
# absolute paths of the individual Hadoop configuration files; presumably consumed
# by the copied /scripts helpers and by images derived from this one -- verify in callers
ENV \
HADOOP_CORE_CONF="${HADOOP_CONF_DIR}/core-site.xml" \
HADOOP_HDFS_CONF="${HADOOP_CONF_DIR}/hdfs-site.xml" \
HADOOP_YARN_CONF="${HADOOP_CONF_DIR}/yarn-site.xml" \
HADOOP_MAPRED_CONF="${HADOOP_CONF_DIR}/mapred-site.xml" \
HADOOP_HTTPFS_CONF="${HADOOP_CONF_DIR}/httpfs-site.xml" \
HADOOP_KMS_CONF="${HADOOP_CONF_DIR}/kms-site.xml" \
HADOOP_ENV_CONF="${HADOOP_CONF_DIR}/hadoop-env.sh"

# helper shell scripts are placed directly in the filesystem root; they are made
# executable by the chmod in the RUN step below
COPY scripts /

# single layer: install prerequisites, download + GPG-verify + extract Hadoop,
# add glibc native libraries, fix permissions/logging, then clean up in the SAME
# layer so none of the temporary files persist in the image
RUN true \
# make the scripts executable
&& chmod 755 /*.sh \
# bash: hadoop shell scripts require BASH, they are not compatible with Busybox ASH/SH
# sed: scripts to set Hadoop properties in XML files require GNU sed (the usage of busybox sed results into incorrect outputs)
# ca-certificates: for wget to be able to use https://
&& apk add --no-cache --update ca-certificates gnupg attr sed bash \
# wait-for: various images derived from this may need to wait for some services (e.g., Spark history server needs to wait for a HDFS namenode to load/save the history)
# NOTE(review): fetched from the master branch with no pinned revision or checksum -- not reproducible; consider pinning a commit
&& wget -O /usr/bin/wait-for "https://raw.githubusercontent.com/eficode/wait-for/master/wait-for" && chmod 755 /usr/bin/wait-for \
\
# download keys and trust them
# (use the local DOWNLOAD_CACHE copy when provided, otherwise fetch from the original Apache site)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/hadoop.KEYS" /tmp \
	|| wget -O /tmp/hadoop.KEYS "${APACHE_ORIG}/hadoop/common/KEYS" ) \
&& gpg --import /tmp/hadoop.KEYS \
# ~/.gnupg exists only after the import above; "trust-model always" makes the
# following gpg --verify accept signatures from the imported (otherwise untrusted) keys
&& echo "trust-model always" > ~/.gnupg/gpg.conf \
\
# download the package
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/hadoop-${HADOOP_VERSION}.tar.gz" /tmp \
	|| wget -O /tmp/hadoop-${HADOOP_VERSION}.tar.gz "${APACHE_MIRROR}/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" ) \
\
# download and verify signature (the .asc comes from the original site, not the mirror)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/hadoop-${HADOOP_VERSION}.tar.gz.asc" /tmp \
	|| wget -O /tmp/hadoop-${HADOOP_VERSION}.tar.gz.asc "${APACHE_ORIG}/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz.asc" ) \
# verify each downloaded .asc against the file it signs (${SIG%.asc} strips the suffix);
# gpg --verify exits non-zero on failure, aborting the build
&& for SIG in /tmp/*.asc; do gpg --verify "${SIG}" "${SIG%.asc}"; done \
\
# extract the package and remove garbage (Windows .cmd scripts, sources/tests jars, docs)
&& mkdir -p "${HADOOP_HOME}" \
&& tar -xzf "/tmp/hadoop-${HADOOP_VERSION}.tar.gz" -C "${HADOOP_HOME}" --strip-components 1 \
&& find "${HADOOP_HOME}" \( -name '*.cmd' -o -name '*-sources.jar' -o -name '*-tests.jar' \) -delete \
&& rm -rf "${HADOOP_HOME}/share/doc" \
\
# adoptopenjdk/openjdk8:alpine-slim base image contains glibc including zlib, so glibc-based native Hadoop libraries should be ok
# now, let us install also bzip2's libbz2.so, zstd's libzstd.so, snappy's libsnappy.so, openssl's v1.0 libcrypto.so (EVP_CIPHER_CTX_cleanup required, removed in v1.1)
# check the glibc-based native Hadoop libraries by: /opt/hadoop/bin/hadoop checknative -a
# NOTE(review): these pipelines run under /bin/sh without pipefail, so a wget failure
# may be masked if tar still exits 0; the Arch packages are also not signature-verified
&& wget -O - https://archive.archlinux.org/packages/b/bzip2/bzip2-1.0.8-2-x86_64.pkg.tar.xz | tar -xJ -C /tmp \
&& wget -O - https://archive.archlinux.org/packages/z/zstd/zstd-1.4.0-1-x86_64.pkg.tar.xz | tar -xJ -C /tmp \
&& wget -O - https://archive.archlinux.org/packages/s/snappy/snappy-1.1.7-1-x86_64.pkg.tar.xz | tar -xJ -C /tmp \
&& wget -O - https://archive.archlinux.org/packages/o/openssl-1.0/openssl-1.0-1.0.2.s-1-x86_64.pkg.tar.xz | tar -xJ -C /tmp \
&& mv /tmp/usr/lib/libbz2.so* /tmp/usr/lib/libzstd.so* /tmp/usr/lib/libsnappy.so* /tmp/usr/lib/libcrypto.so* /usr/glibc-compat/lib \
&& ln -s libcrypto.so.1.0.0 /usr/glibc-compat/lib/libcrypto.so \
\
# fix ISSUE: os::commit_memory failed; error=Operation not permitted
# (AUFS does not support xattr, so we need to set the flag once again after execution of the container in its entrypoint)
# https://en.wikibooks.org/wiki/Grsecurity/Application-specific_Settings#Java
&& setfattr -n user.pax.flags -v em "${JAVA_HOME}/bin/java" "${JAVA_HOME}/jre/bin/java" \
\
# create a log directory (must be writable also by Hadoop apps such as Spark, hence the
# deliberately permissive 777) and point log4j's hadoop.log.dir at it
&& mkdir -p ${HADOOP_HOME}/logs \
&& chmod 777 ${HADOOP_HOME}/logs \
&& sed -i "s|^\(hadoop\.log\.dir=\).*\$|\1${HADOOP_HOME}/logs|g" ${HADOOP_HOME}/etc/hadoop/log4j.properties \
\
# integrate with Java (JAVA_HOME is inherited from the base image)
&& echo "export JAVA_HOME=${JAVA_HOME}" >> ${HADOOP_ENV_CONF} \
\
# set up permissions: system user/group "hadoop" with HADOOP_HOME as home (no dir created, -H)
&& addgroup -S hadoop \
&& adduser -h ${HADOOP_HOME} -g "Apache Hadoop" -s /bin/sh -G hadoop -S -D -H hadoop \
&& chown -R hadoop:hadoop ${HADOOP_HOME} \
\
# set path for the shell (login shells source /etc/profile.d/*.sh)
&& echo '#!/bin/sh' > /etc/profile.d/path-hadoop.sh \
&& echo "export PATH=\"\${PATH}:${HADOOP_HOME}/bin\"" >> /etc/profile.d/path-hadoop.sh \
&& chmod 755 /etc/profile.d/path-hadoop.sh \
\
# clean up in this same layer: gnupg was only needed for signature verification above
&& apk del gnupg \
&& rm -rf /tmp/* /var/tmp/* /var/cache/apk/*

# placeholder default command: this image is only a base for derived images.
# exec (JSON) form is preferred (hadolint DL3025); /bin/sh -c is kept explicitly
# because the stderr redirection requires a shell
CMD ["/bin/sh", "-c", "echo \"This is just a base image for building another docker images!\" >&2"]
