# https://hub.docker.com/r/adoptopenjdk/openjdk8
# https://github.com/AdoptOpenJDK/openjdk-docker/blob/master/8/jdk/ubuntu/Dockerfile.hotspot.releases.slim
FROM adoptopenjdk/openjdk8:slim

# MAINTAINER is deprecated (hadolint DL4000); use the standard OCI label instead
LABEL org.opencontainers.image.authors="Marek Rychly <marek.rychly@gmail.com>"

# Hadoop release to install, see https://hadoop.apache.org/releases.html
ARG HADOOP_VERSION=3.2.0

# optional directory with pre-downloaded artifacts (hadoop.KEYS, tarball, *.asc);
# when non-empty, network downloads are skipped in favor of copies from this path
ARG DOWNLOAD_CACHE
# canonical Apache distribution site; KEYS and detached signatures MUST come from
# here, never from a mirror (www-eu.apache.org/dist was retired; use downloads.apache.org over TLS)
ARG APACHE_ORIG="https://downloads.apache.org"
#ARG APACHE_MIRROR="ftp://mirror.hosting90.cz/apache"
# the archive keeps every historical release, including versions dropped from live mirrors
ARG APACHE_MIRROR="https://archive.apache.org/dist"

# Hadoop installation root; HADOOP_CONF_DIR is declared in a separate ENV
# instruction because an ENV cannot reference a variable set in the same instruction.
ENV HADOOP_HOME="/opt/hadoop"
ENV HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"

# convenience pointers to the individual Hadoop configuration files
ENV HADOOP_ENV_CONF="${HADOOP_CONF_DIR}/hadoop-env.sh" \
    HADOOP_CORE_CONF="${HADOOP_CONF_DIR}/core-site.xml" \
    HADOOP_HDFS_CONF="${HADOOP_CONF_DIR}/hdfs-site.xml" \
    HADOOP_YARN_CONF="${HADOOP_CONF_DIR}/yarn-site.xml" \
    HADOOP_MAPRED_CONF="${HADOOP_CONF_DIR}/mapred-site.xml" \
    HADOOP_HTTPFS_CONF="${HADOOP_CONF_DIR}/httpfs-site.xml" \
    HADOOP_KMS_CONF="${HADOOP_CONF_DIR}/kms-site.xml"

# helper/entrypoint shell scripts placed at the image root (made executable in the RUN below)
COPY scripts /

RUN true \
# make the copied helper scripts executable
&& chmod 755 /*.sh \
# netcat: required for wait-for, otherwise it won't be able to call it and will fail on timeout
# --no-install-recommends keeps recommended-but-unneeded packages out of the image (hadolint DL3015)
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends gnupg netcat \
# wait-for: various images derived from this may need to wait for some services (e.g., Spark history server needs to wait for a HDFS namenode to load/save the history)
&& curl -LfsSo /usr/bin/wait-for "https://raw.githubusercontent.com/eficode/wait-for/master/wait-for" && chmod 755 /usr/bin/wait-for \
\
# import the Apache release-signing keys and trust them for the signature check below
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/hadoop.KEYS" /tmp \
	|| curl -LfsSo /tmp/hadoop.KEYS "${APACHE_ORIG}/hadoop/common/KEYS" ) \
&& gpg --import /tmp/hadoop.KEYS \
&& echo "trust-model always" > ~/.gnupg/gpg.conf \
\
# download the binary distribution (a mirror is acceptable here: authenticity is established by the signature, not the mirror)
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/hadoop-${HADOOP_VERSION}.tar.gz" /tmp \
	|| curl -LfsSo /tmp/hadoop-${HADOOP_VERSION}.tar.gz "${APACHE_MIRROR}/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" ) \
\
# fetch the detached signature from the canonical site and verify the tarball against it
&& ( [ -n "${DOWNLOAD_CACHE}" ] && cp -v "${DOWNLOAD_CACHE}/hadoop-${HADOOP_VERSION}.tar.gz.asc" /tmp \
	|| curl -LfsSo /tmp/hadoop-${HADOOP_VERSION}.tar.gz.asc "${APACHE_ORIG}/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz.asc" ) \
&& for SIG in /tmp/*.asc; do gpg --verify "${SIG}" "${SIG%.asc}"; done \
\
# extract the package and remove garbage (Windows .cmd scripts, sources/tests jars, docs)
&& mkdir -p "${HADOOP_HOME}" \
&& tar -xzf "/tmp/hadoop-${HADOOP_VERSION}.tar.gz" -C "${HADOOP_HOME}" --strip-components 1 \
&& find "${HADOOP_HOME}" \( -name '*.cmd' -o -name '*-sources.jar' -o -name '*-tests.jar' \) -delete \
&& rm -rf "${HADOOP_HOME}/share/doc" \
\
# adoptopenjdk/openjdk8:slim base image contains glibc including zlib's libz.so, bzip2's libbz2.so, zstd's libzstd.so
# now, let us install also snappy's libsnappy.so, openssl's v1.0 libcrypto.so (EVP_CIPHER_CTX_cleanup required, removed in v1.1)
# check the glibc-based native Hadoop libraries by: /opt/hadoop/bin/hadoop checknative -a
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends libsnappy1v5 libssl1.0-dev \
\
# create a log directory (must be writable also by Hadoop apps such as Spark) and point log4j at it
&& mkdir -p "${HADOOP_HOME}/logs" \
&& chmod 777 "${HADOOP_HOME}/logs" \
&& sed -i "s|^\(hadoop\.log\.dir=\).*\$|\1${HADOOP_HOME}/logs|g" "${HADOOP_CONF_DIR}/log4j.properties" \
\
# record JAVA_HOME in hadoop-env.sh so the Hadoop launcher scripts find the JDK
&& echo "export JAVA_HOME=${JAVA_HOME}" >> "${HADOOP_ENV_CONF}" \
\
# dedicated system user/group owning the installation
# (no USER directive in this image on purpose: derived images typically still need root
#  for their own installs and should switch to the hadoop user themselves)
&& addgroup --system hadoop \
&& adduser --system --home "${HADOOP_HOME}" --gecos "Apache Hadoop" --shell /bin/sh --ingroup hadoop --disabled-login --no-create-home hadoop \
&& chown -R hadoop:hadoop "${HADOOP_HOME}" \
\
# put the Hadoop binaries on the PATH of login shells
&& echo '#!/bin/sh' > /etc/profile.d/path-hadoop.sh \
&& echo "export PATH=\"\${PATH}:${HADOOP_HOME}/bin\"" >> /etc/profile.d/path-hadoop.sh \
&& chmod 755 /etc/profile.d/path-hadoop.sh \
\
# clean up in the same layer: gnupg was build-time only; drop apt caches and temporary downloads
&& apt-get -y autoremove gnupg \
&& apt-get clean \
&& rm -rf /tmp/* /var/tmp/* /var/lib/apt/lists/*

# exec (JSON) form is preferred over shell form (build-check JSONArgsRecommended);
# the shell is still needed here for the stderr redirection
CMD ["/bin/sh", "-c", "echo 'This is just a base image for building other docker images!' >&2"]
