From 6410663e8a78150d30c3b513f9b903e3af6b7bd2 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Fri, 5 Jun 2020 18:49:58 +0200 Subject: [PATCH 01/10] update to latest jupyter image which uses bionic in place of xenial --- Dockerfile | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 93668da..f1faaca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG PYTHON2_IMG="saagie/python:2.7.202005.84" ARG PYTHON3_IMG="saagie/python:3.6.202005.84" # FIXME should use a minimal image and add libs after + update to latest available -ARG BASE_CONTAINER="jupyter/scipy-notebook:c7fb6660d096" +ARG BASE_CONTAINER="jupyter/scipy-notebook:76402a27fd13" FROM $PYTHON2_IMG AS PYTHON2 FROM $PYTHON3_IMG AS PYTHON3 @@ -25,8 +25,8 @@ USER root RUN apt-get update && apt-get install -y --no-install-recommends \ libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ - libpng3 libfreetype6-dev libatlas-base-dev gfortran \ - libgdal1-dev sasl2-bin libsasl2-2 libsasl2-dev \ + libpng16-16 libfreetype6-dev libatlas-base-dev gfortran \ + libgdal-dev sasl2-bin libsasl2-2 libsasl2-dev \ libsasl2-modules unixodbc-dev python3-tk \ qt5-default \ libqt5webkit5-dev \ @@ -121,15 +121,11 @@ ENV CUDNN_VERSION 7.6.0.64 LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates apt-transport-https gnupg-curl && \ + ca-certificates apt-transport-https gnupg2 curl && \ rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a apt-get update && apt-get install -y --no-install-recommends \ cuda-cudart-$CUDA_PKG_VERSION \ From 1f0170c3d8ef6e7aa9811652813a2bf5b94b9e2e Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Wed, 8 Jul 2020 15:08:26 +0200 Subject: [PATCH 02/10] [scipy] move jupyter official scipy as base in own folder --- Dockerfile => scipy/Dockerfile | 0 Jenkinsfile => scipy/Jenkinsfile | 0 build.sh => scipy/build.sh | 0 entrypoint.sh => scipy/entrypoint.sh | 0 python2_lib_test.py => scipy/python2_lib_test.py | 0 python3_lib_test.py => scipy/python3_lib_test.py | 0 requirements_conda2.txt => scipy/requirements_conda2.txt | 0 requirements_conda3.txt => scipy/requirements_conda3.txt 
| 0 requirements_pip2.txt => scipy/requirements_pip2.txt | 0 requirements_pip3.txt => scipy/requirements_pip3.txt | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename Dockerfile => scipy/Dockerfile (100%) rename Jenkinsfile => scipy/Jenkinsfile (100%) rename build.sh => scipy/build.sh (100%) rename entrypoint.sh => scipy/entrypoint.sh (100%) rename python2_lib_test.py => scipy/python2_lib_test.py (100%) rename python3_lib_test.py => scipy/python3_lib_test.py (100%) rename requirements_conda2.txt => scipy/requirements_conda2.txt (100%) rename requirements_conda3.txt => scipy/requirements_conda3.txt (100%) rename requirements_pip2.txt => scipy/requirements_pip2.txt (100%) rename requirements_pip3.txt => scipy/requirements_pip3.txt (100%) diff --git a/Dockerfile b/scipy/Dockerfile similarity index 100% rename from Dockerfile rename to scipy/Dockerfile diff --git a/Jenkinsfile b/scipy/Jenkinsfile similarity index 100% rename from Jenkinsfile rename to scipy/Jenkinsfile diff --git a/build.sh b/scipy/build.sh similarity index 100% rename from build.sh rename to scipy/build.sh diff --git a/entrypoint.sh b/scipy/entrypoint.sh similarity index 100% rename from entrypoint.sh rename to scipy/entrypoint.sh diff --git a/python2_lib_test.py b/scipy/python2_lib_test.py similarity index 100% rename from python2_lib_test.py rename to scipy/python2_lib_test.py diff --git a/python3_lib_test.py b/scipy/python3_lib_test.py similarity index 100% rename from python3_lib_test.py rename to scipy/python3_lib_test.py diff --git a/requirements_conda2.txt b/scipy/requirements_conda2.txt similarity index 100% rename from requirements_conda2.txt rename to scipy/requirements_conda2.txt diff --git a/requirements_conda3.txt b/scipy/requirements_conda3.txt similarity index 100% rename from requirements_conda3.txt rename to scipy/requirements_conda3.txt diff --git a/requirements_pip2.txt b/scipy/requirements_pip2.txt similarity index 100% rename from requirements_pip2.txt rename to scipy/requirements_pip2.txt diff --git a/requirements_pip3.txt b/scipy/requirements_pip3.txt similarity index 100% rename from requirements_pip3.txt rename to scipy/requirements_pip3.txt From b26c340dc31c21e303b97bb31ec8502b91523380 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 25 Jun 2020 11:41:53 +0200 Subject: [PATCH 03/10] [16] use jupyter/minimal-notebook as base, drop python2.x support, update to latest saagie/python image in a separate folder --- .gitignore | 1 + README.md | 4 +- minimal/Dockerfile | 142 ++++++++++++++++++++++++++++++++ minimal/Jenkinsfile | 34 ++++++++ minimal/build.sh | 25 ++++++ minimal/entrypoint.sh | 4 + minimal/python3_lib_test.py | 103 +++++++++++++++++++++++ minimal/requirements_conda3.txt | 3 + minimal/requirements_pip3.txt | 2 + 9 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 minimal/Dockerfile create mode 100644 minimal/Jenkinsfile create mode 100755 minimal/build.sh create mode 100755 minimal/entrypoint.sh create mode 100644 minimal/python3_lib_test.py create mode 100644 minimal/requirements_conda3.txt create mode 100644 minimal/requirements_pip3.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..485dee6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea diff --git a/README.md b/README.md index e90dfac..0c958c3 100644 --- a/README.md +++ b/README.md @@ -44,5 +44,5 @@ ### For python 3 !pip install libraryName -### For python 2 - !pip2 install libraryName +/!\ Python2 support dropped + diff --git a/minimal/Dockerfile 
b/minimal/Dockerfile new file mode 100644 index 0000000..07985bd --- /dev/null +++ b/minimal/Dockerfile @@ -0,0 +1,142 @@ +ARG PYTHON3_IMG="saagie/python:3.6.202005.84" + +# use latest image with ubuntu 16.04 Xenial for CDH5 compatibility +# see (https://github.com/jupyter/docker-stacks/commits/master?after=04f7f60d34a674a2964d96a6cb97c57a7870a828+664) +ARG BASE_CONTAINER="jupyter/minimal-notebook:f9e77e3ddd6f" + +FROM $PYTHON3_IMG AS PYTHON3 +FROM $BASE_CONTAINER + +MAINTAINER Saagie + +ENV PATH="${PATH}:/home/$NB_USER/.local/bin" + + +# Starts by cleaning useless npm cache & other files +RUN npm cache clean --force \ + && conda clean -ay \ + && rm -rf $CONDA_DIR/share/jupyter/lab/staging +# Not necessary to apt-get clean it seems + +########################## LIBS PART BEGIN ########################## +USER root +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + # replaces libpng3 for bionic + libpng16-16 \ + # replaces libdal6 for bionic + libgdal-dev \ + # needed to compile psycopg2 + libpq-dev \ + curl \ + libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ + flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ + libfreetype6-dev libatlas-base-dev gfortran \ + sasl2-bin libsasl2-2 libsasl2-dev \ + libsasl2-modules unixodbc-dev python3-tk \ + qt5-default \ + libqt5webkit5-dev \ + libcurl4-openssl-dev \ + && rm -rf /var/lib/apt/lists/* +########################## LIBS PART END ########################## + + +################ Kernels / Conda envs / requirements PART BEGIN ################ +USER $NB_USER +# Uninstall python3 kernel +RUN jupyter kernelspec remove -f python3 + +# Update conda to latest version +#RUN conda update -n root conda \ +RUN conda clean -ay + +# seems there's sometimesa problem with pyzmq so need to reinstall it... +RUN conda create -n py36 python=3.6 \ + && bash -c "source activate py36 && pip uninstall pyzmq -y && pip install pyzmq && conda install notebook ipykernel -y && ipython kernel install --user --name py36 --display-name 'Python 3.6'" \ + && conda clean -ay \ + && rm -rf ~/.cache/pip + +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +SHELL ["/bin/bash", "-c"] +# Add libs for python 3.6 env +# inherited from saagie/python:3.6 image +# installed via pip only +# installed via conda +COPY requirements_conda3.txt requirements_conda3.txt +COPY --from=PYTHON3 /requirements.txt ./requirements_python3.txt +COPY requirements_pip3.txt requirements_pip3.txt +RUN conda install -n py36 --quiet --yes --file requirements_conda3.txt \ + # Some installed library (scikit-learn) could not be removed so use --ignore-installed \ + && sed -n '/scikit-learn/p' requirements_python3.txt >> requirements_python3_ignore-installed.txt \ + && sed -i '/scikit-learn/d' requirements_python3.txt \ + && . 
activate py36 \ + && python -m pip install --no-cache-dir --ignore-installed -r requirements_python3_ignore-installed.txt \ + && python -m pip install --no-cache-dir -r requirements_python3.txt \ + && python -m pip install --no-cache-dir -r requirements_pip3.txt \ + && conda deactivate \ + && conda clean -ay \ + && rm -rf ~/.cache/pip +################ Kernels / Conda envs / requirements PART ENDS ################# + + +########################## CUDA PART BEGIN ########################## +USER root + +ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" + +ENV CUDA_VERSION 10.0.130 +ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 +ENV NCCL_VERSION 2.4.2 +ENV CUDNN_VERSION 7.6.0.64 + +LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ + echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a + apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-$CUDA_PKG_VERSION \ + cuda-libraries-$CUDA_PKG_VERSION \ + cuda-nvtx-$CUDA_PKG_VERSION \ + cuda-compat-10-0 && \ + libnccl2=$NCCL_VERSION-1+cuda10.0 \ + libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ + && apt-mark hold libnccl2 libcudnn7 \ + && ln -s cuda-10.0 /usr/local/cuda \ + && rm -rf /var/lib/apt/lists/* \ + # Path doesn't exists... 
here for compatibility it seems https://gitlab.com/nvidia/container-images/cuda/issues/27 + && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf +########################## CUDA PART END ########################## + + +########################## NOTEBOOKS DIR ########################## +USER root +# Create default workdir (useful if no volume mounted) +RUN mkdir /notebooks-dir && chown 1000:100 /notebooks-dir +# Define default workdir +WORKDIR /notebooks-dir +########################## NOTEBOOKS DIR END ########################## + +#Add entrypoint.sh +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# Should run as $NB_USER +USER $NB_USER + +# Default: run without authentication +CMD ["/entrypoint.sh"] diff --git a/minimal/Jenkinsfile b/minimal/Jenkinsfile new file mode 100644 index 0000000..106c636 --- /dev/null +++ b/minimal/Jenkinsfile @@ -0,0 +1,34 @@ +buildVersion = new Date().format("yyyyMMddHHmmss") + +pipeline { + agent { node { label 'docker_image' } } + + options { + disableConcurrentBuilds() + } + + stages { + stage('Build Jupyter images') { + steps { + script { + sh "docker build -t saagie/jupyter-python-nbk:v2_$buildVersion ." + } + } + } + + stage('Push techno images') { + steps { + script { + withCredentials( + [usernamePassword(credentialsId: '8fc4964e-30c6-4bb9-8a19-69e37ea905b6', + usernameVariable: 'USERNAME', + passwordVariable: 'PASSWORD')]) { + + sh "docker login -u $USERNAME -p $PASSWORD" + sh "docker push saagie/jupyter-python-nbk:v2_$buildVersion" + } + } + } + } + } +} diff --git a/minimal/build.sh b/minimal/build.sh new file mode 100755 index 0000000..d46fd2e --- /dev/null +++ b/minimal/build.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -euxo pipefail + +NO_CACHE="" +export DOCKER_BUILDKIT=0 + +while (( $# )); do + case $1 in + --no-cache) NO_CACHE="--no-cache" + ;; + --buildkit) export DOCKER_BUILDKIT=1 + ;; + --*) echo "Bad Option $1" + ;; + *) TYPE=$1 + ;; + *) break + ;; + esac + shift +done + +docker build $NO_CACHE \ + -t $TYPE \ + . 
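A quick usage sketch for the minimal/build.sh script above. This is an illustrative invocation only; the tag name jupyter-minimal-local is a placeholder, not a tag used by this repository:

    # from the minimal/ folder: build with BuildKit enabled and without cache;
    # the positional argument ends up as the -t value passed to docker build
    cd minimal
    ./build.sh --buildkit --no-cache jupyter-minimal-local

Note that the script uses set -u, so omitting the positional tag argument makes the final docker build line fail on the unset $TYPE variable.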
diff --git a/minimal/entrypoint.sh b/minimal/entrypoint.sh new file mode 100755 index 0000000..682c6eb --- /dev/null +++ b/minimal/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/bash +chown -R jovyan /notebooks-dir + +start-notebook.sh --KernelSpecManager.ensure_native_kernel=False --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.base_url=$SAAGIE_BASE_PATH diff --git a/minimal/python3_lib_test.py b/minimal/python3_lib_test.py new file mode 100644 index 0000000..b414cb4 --- /dev/null +++ b/minimal/python3_lib_test.py @@ -0,0 +1,103 @@ +import sys +print(sys.executable) +print(sys.version) +print(sys.version_info) + +### +### Test conda install +### FIXME find a way to test those installs +#from hdfs.hfile import Hfile +#import hdf5 + +### +### Test Jupyter specific +### +from PIL import Image +from google.protobuf import descriptor_pb2 + +### +### Test imports from python3 +### +import addok +import apiclient +import bs4 +import bokeh +import bs4 +from confluent_kafka import Producer +import crypto +import cython +import django +import dryscrape +import elasticsearch +import excel +from fastparquet import ParquetFile +import fiona +import folium +import gensim +import geopandas +import geopy +import graphviz +import h5py +import hdfs +import autosklearn.classification +import thrift_sasl +from pybrain.tools.shortcuts import buildNetwork +import ibis +from imblearn.over_sampling import RandomOverSampler +from impala.dbapi import connect +import ipywidgets +import jellyfish +import joblib +from kafka import KafkaConsumer +from keras.layers import Dense +import lime +import lxml +import matplotlib +import mpld3 +import mysql.connector +from neo4j import GraphDatabase +import networkx +import nltk +from numba import jit +import numpy +import cv2 +import openpyxl +import pandas +from pdfminer.psparser import * +import psycopg2 +from Crypto.Hash import SHA256 +import pycurl +import pydotplus +import pymongo +import pyodbc +import shapefile +import pytesseract +from Levenshtein import _levenshtein +from requests_kerberos import * +from skimage import data +from sklearn import datasets +import scipy +import scrapy +import seaborn +import shap +import shapely +import simplejson +import six +import spacy +from sqlalchemy import create_engine +import statsmodels +import tabula +import tensorflow as tf +print('Num GPUs Available: ', len(tf.config.experimental.list_physical_devices('GPU'))) +import tensorflow +import textract +import theano.tensor +import tika +import tokenizer +import torch +import torchvision +import tpot +import umap +from wand.image import Image +import xgboost +import xlwt diff --git a/minimal/requirements_conda3.txt b/minimal/requirements_conda3.txt new file mode 100644 index 0000000..6ce3105 --- /dev/null +++ b/minimal/requirements_conda3.txt @@ -0,0 +1,3 @@ +hdf5==1.10.1 +python-hdfs==2.0.16 +pycurl>=7.43,<7.44 diff --git a/minimal/requirements_pip3.txt b/minimal/requirements_pip3.txt new file mode 100644 index 0000000..1e39f6d --- /dev/null +++ b/minimal/requirements_pip3.txt @@ -0,0 +1,2 @@ +pillow==4.3.0 +protobuf==3.6.1 From 53d272c33f63aa71978e4ecb2c1faaa50470b90c Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 9 Jul 2020 12:30:25 +0200 Subject: [PATCH 04/10] [16] update README for new images --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0c958c3..5a59436 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,54 @@ # Jupyter Datascience Notebook for python +## 
Images + +Jupyter notebook for Python is available as several images: + + * saagie/jupyter-python-nbk:v2-minimal + * saagie/jupyter-python-nbk:v2-base / saagie/jupyter-python-nbk:v2 + * saagie/jupyter-python-nbk:v2-scipy + +### saagie/jupyter-python-nbk:v2-minimal +This image is based on the **jupyter/minimal-notebook** one, + +=> adapted to run smoothly on Saagie's platform + +=> with no additional data science libs: it's up to you to add your own. + +### saagie/jupyter-python-nbk:v2-base +This is the official and main image, based on **jupyter/minimal-notebook** + +=> it comes with a bunch of additional libraries + +=> and is quite similar to **jupyter/scipy-notebook** with even more features. + +This image is the same as **saagie/jupyter-python-nbk:v2**. + +### saagie/jupyter-python-nbk:v2-scipy +This is the legacy @deprecated v2 image, initially based on **jupyter/scipy-notebook** + +=> it comes with a bunch of additional libraries + +=> but is now **deprecated** in favor of ***saagie/jupyter-python-nbk:v2-base*** + + ## Run with : - docker run -p 8888:8888 -v /path/to/data/notebooks/dir:/notebooks-dir saagie/jupyter-python-nbk:latest + +### Standalone image + + docker run -p 8888:8888 -v /path/to/data/notebooks/dir:/notebooks-dir saagie/jupyter-python-nbk:v2 Mounting volume is optional (-v /path/to/data/notebooks/dir:/notebooks-dir) but if you want to do it: * create your local directory with: `mkdir -P /path/to/data/notebooks/dir` * make Jovyan (Jupyter notebook default user) the owner of this directory with: `chown -R 1000:100 /path/to/data/notebooks/dir` +### On Saagie's platform + + * use port 8888 + * define the SAAGIE_BASE_PATH env var (used as the notebook base URL) + * do not activate "rewrite url" + * optionally, you can add a volume mapped to the /notebooks-dir folder + ## Libraries : * Data Processing * numpy From f0b25c099dbbf16fb4bd9781c62b7a87c932f141 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 9 Jul 2020 22:39:47 +0200 Subject: [PATCH 05/10] [16] make minimal minimal, and create base image inherited from minimal --- minimal/Jenkinsfile => Jenkinsfile | 7 +- base/Dockerfile | 100 ++++++++++++++++++++++ base/build.sh | 25 ++++++ {minimal => base}/python3_lib_test.py | 0 {minimal => base}/requirements_conda3.txt | 0 {minimal => base}/requirements_pip3.txt | 0 minimal/Dockerfile | 88 +------------------ scipy/Jenkinsfile | 34 -------- 8 files changed, 132 insertions(+), 122 deletions(-) rename minimal/Jenkinsfile => Jenkinsfile (56%) create mode 100644 base/Dockerfile create mode 100755 base/build.sh rename {minimal => base}/python3_lib_test.py (100%) rename {minimal => base}/requirements_conda3.txt (100%) rename {minimal => base}/requirements_pip3.txt (100%) delete mode 100644 scipy/Jenkinsfile diff --git a/minimal/Jenkinsfile b/Jenkinsfile similarity index 56% rename from minimal/Jenkinsfile rename to Jenkinsfile index 106c636..7a465eb 100644 --- a/minimal/Jenkinsfile +++ b/Jenkinsfile @@ -11,7 +11,9 @@ pipeline { stage('Build Jupyter images') { steps { script { - sh "docker build -t saagie/jupyter-python-nbk:v2_$buildVersion ." + sh "cd minimal && docker build -t saagie/jupyter-python-nbk:v2-minimal_$buildVersion ." + sh "cd base && docker build -t saagie/jupyter-python-nbk:v2-base_$buildVersion -t saagie/jupyter-python-nbk:v2_$buildVersion ." + sh "cd scipy && docker build -t saagie/jupyter-python-nbk:v2-scipy_$buildVersion ."
} } } @@ -25,6 +27,9 @@ pipeline { passwordVariable: 'PASSWORD')]) { sh "docker login -u $USERNAME -p $PASSWORD" + sh "docker push saagie/jupyter-python-nbk:v2-minimal_$buildVersion" + sh "docker push saagie/jupyter-python-nbk:v2-base_$buildVersion" + sh "docker push saagie/jupyter-python-nbk:v2-scipy_$buildVersion" sh "docker push saagie/jupyter-python-nbk:v2_$buildVersion" } } diff --git a/base/Dockerfile b/base/Dockerfile new file mode 100644 index 0000000..b3ae1a6 --- /dev/null +++ b/base/Dockerfile @@ -0,0 +1,100 @@ +ARG PYTHON3_IMG="saagie/python:3.6.202005.84" + +ARG BASE_CONTAINER="saagie/jupyter-python-nbk:v2-minimal" + +FROM $PYTHON3_IMG AS PYTHON3 +FROM $BASE_CONTAINER + +MAINTAINER Saagie + +########################## LIBS PART BEGIN ########################## +USER root +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + # replaces libpng3 for bionic + libpng16-16 \ + # replaces libdal6 for bionic + libgdal-dev \ + # needed to compile psycopg2 + libpq-dev \ + libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ + flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ + libfreetype6-dev libatlas-base-dev gfortran \ + sasl2-bin libsasl2-2 libsasl2-dev \ + libsasl2-modules unixodbc-dev python3-tk \ + qt5-default \ + libqt5webkit5-dev \ + libcurl4-openssl-dev \ + && rm -rf /var/lib/apt/lists/* +########################## LIBS PART END ########################## + + +################ Kernels / Conda envs / requirements PART BEGIN ################ +USER $NB_USER +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +SHELL ["/bin/bash", "-c"] +# Add libs for python 3.6 env +# inherited from saagie/python:3.6 image +# installed via pip only +# installed via conda +COPY requirements_conda3.txt requirements_conda3.txt +COPY --from=PYTHON3 /requirements.txt ./requirements_python3.txt +COPY requirements_pip3.txt requirements_pip3.txt +RUN conda install -n py36 --quiet --yes --file requirements_conda3.txt \ + # Some installed library (scikit-learn) could not be removed so use --ignore-installed \ + && sed -n '/scikit-learn/p' requirements_python3.txt >> requirements_python3_ignore-installed.txt \ + && sed -i '/scikit-learn/d' requirements_python3.txt \ + && . 
activate py36 \ + && python -m pip install --no-cache-dir --ignore-installed -r requirements_python3_ignore-installed.txt \ + && python -m pip install --no-cache-dir -r requirements_python3.txt \ + && python -m pip install --no-cache-dir -r requirements_pip3.txt \ + && conda deactivate \ + && conda clean -ay \ + && rm -rf ~/.cache/pip +################ Kernels / Conda envs / requirements PART ENDS ################# + + +########################## CUDA PART BEGIN ########################## +USER root + +ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" + +ENV CUDA_VERSION 10.0.130 +ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 +ENV NCCL_VERSION 2.4.2 +ENV CUDNN_VERSION 7.6.0.64 + +LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ + echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a + apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-$CUDA_PKG_VERSION \ + cuda-libraries-$CUDA_PKG_VERSION \ + cuda-nvtx-$CUDA_PKG_VERSION \ + cuda-compat-10-0 \ + libnccl2=$NCCL_VERSION-1+cuda10.0 \ + libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ + && apt-mark hold libnccl2 libcudnn7 \ + && ln -s cuda-10.0 /usr/local/cuda \ + && rm -rf /var/lib/apt/lists/* \ + # Path doesn't exist... here for compatibility it seems https://gitlab.com/nvidia/container-images/cuda/issues/27 + && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf +########################## CUDA PART END ########################## + +USER $NB_USER diff --git a/base/build.sh b/base/build.sh new file mode 100755 index 0000000..d46fd2e --- /dev/null +++ b/base/build.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -euxo pipefail + +NO_CACHE="" +export DOCKER_BUILDKIT=0 + +while (( $# )); do + case $1 in + --no-cache) NO_CACHE="--no-cache" + ;; + --buildkit) export DOCKER_BUILDKIT=1 + ;; + --*) echo "Bad Option $1" + ;; + *) TYPE=$1 + ;; + *) break + ;; + esac + shift +done + +docker build $NO_CACHE \ + -t $TYPE \ + .
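A minimal smoke test for the base image defined above, assuming a locally built tag; saagie/jupyter-python-nbk:v2-base_local below is a placeholder, not a tag produced by the Jenkinsfile:

    # the py36 kernel registered in the minimal layer should be listed
    docker run --rm saagie/jupyter-python-nbk:v2-base_local jupyter kernelspec list

    # the NVIDIA packages pinned in the CUDA section should be installed and on hold
    docker run --rm saagie/jupyter-python-nbk:v2-base_local bash -c "apt-mark showhold && dpkg -l libcudnn7 libnccl2"

Actually using the GPU still requires an NVIDIA driver and container runtime on the host (for example docker run --gpus all).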
diff --git a/minimal/python3_lib_test.py b/base/python3_lib_test.py similarity index 100% rename from minimal/python3_lib_test.py rename to base/python3_lib_test.py diff --git a/minimal/requirements_conda3.txt b/base/requirements_conda3.txt similarity index 100% rename from minimal/requirements_conda3.txt rename to base/requirements_conda3.txt diff --git a/minimal/requirements_pip3.txt b/base/requirements_pip3.txt similarity index 100% rename from minimal/requirements_pip3.txt rename to base/requirements_pip3.txt diff --git a/minimal/Dockerfile b/minimal/Dockerfile index 07985bd..0e782fa 100644 --- a/minimal/Dockerfile +++ b/minimal/Dockerfile @@ -1,17 +1,11 @@ -ARG PYTHON3_IMG="saagie/python:3.6.202005.84" - # use latest image with ubuntu 16.04 Xenial for CDH5 compatibility # see (https://github.com/jupyter/docker-stacks/commits/master?after=04f7f60d34a674a2964d96a6cb97c57a7870a828+664) -ARG BASE_CONTAINER="jupyter/minimal-notebook:f9e77e3ddd6f" - -FROM $PYTHON3_IMG AS PYTHON3 -FROM $BASE_CONTAINER +FROM jupyter/minimal-notebook:f9e77e3ddd6f MAINTAINER Saagie ENV PATH="${PATH}:/home/$NB_USER/.local/bin" - # Starts by cleaning useless npm cache & other files RUN npm cache clean --force \ && conda clean -ay \ @@ -22,21 +16,7 @@ RUN npm cache clean --force \ USER root # TODO check if all necessary seems there are duplicate from jupyter/scipy image RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ - # replaces libpng3 for bionic - libpng16-16 \ - # replaces libdal6 for bionic - libgdal-dev \ - # needed to compile psycopg2 - libpq-dev \ curl \ - libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ - flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ - libfreetype6-dev libatlas-base-dev gfortran \ - sasl2-bin libsasl2-2 libsasl2-dev \ - libsasl2-modules unixodbc-dev python3-tk \ - qt5-default \ - libqt5webkit5-dev \ - libcurl4-openssl-dev \ && rm -rf /var/lib/apt/lists/* ########################## LIBS PART END ########################## @@ -55,74 +35,8 @@ RUN conda create -n py36 python=3.6 \ && bash -c "source activate py36 && pip uninstall pyzmq -y && pip install pyzmq && conda install notebook ipykernel -y && ipython kernel install --user --name py36 --display-name 'Python 3.6'" \ && conda clean -ay \ && rm -rf ~/.cache/pip - -# TODO check if all necessary seems there are duplicate from jupyter/scipy image -SHELL ["/bin/bash", "-c"] -# Add libs for python 3.6 env -# inherited from saagie/python:3.6 image -# installed via pip only -# installed via conda -COPY requirements_conda3.txt requirements_conda3.txt -COPY --from=PYTHON3 /requirements.txt ./requirements_python3.txt -COPY requirements_pip3.txt requirements_pip3.txt -RUN conda install -n py36 --quiet --yes --file requirements_conda3.txt \ - # Some installed library (scikit-learn) could not be removed so use --ignore-installed \ - && sed -n '/scikit-learn/p' requirements_python3.txt >> requirements_python3_ignore-installed.txt \ - && sed -i '/scikit-learn/d' requirements_python3.txt \ - && . 
activate py36 \ - && python -m pip install --no-cache-dir --ignore-installed -r requirements_python3_ignore-installed.txt \ - && python -m pip install --no-cache-dir -r requirements_python3.txt \ - && python -m pip install --no-cache-dir -r requirements_pip3.txt \ - && conda deactivate \ - && conda clean -ay \ - && rm -rf ~/.cache/pip ################ Kernels / Conda envs / requirements PART ENDS ################# - -########################## CUDA PART BEGIN ########################## -USER root - -ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" - -ENV CUDA_VERSION 10.0.130 -ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 -ENV NCCL_VERSION 2.4.2 -ENV CUDNN_VERSION 7.6.0.64 - -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a - apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - cuda-compat-10-0 && \ - libnccl2=$NCCL_VERSION-1+cuda10.0 \ - libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ - && apt-mark hold libnccl2 libcudnn7 \ - && ln -s cuda-10.0 /usr/local/cuda \ - && rm -rf /var/lib/apt/lists/* \ - # Path doesn't exists... here for compatibility it seems https://gitlab.com/nvidia/container-images/cuda/issues/27 - && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ - && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf -########################## CUDA PART END ########################## - - ########################## NOTEBOOKS DIR ########################## USER root # Create default workdir (useful if no volume mounted) diff --git a/scipy/Jenkinsfile b/scipy/Jenkinsfile deleted file mode 100644 index 106c636..0000000 --- a/scipy/Jenkinsfile +++ /dev/null @@ -1,34 +0,0 @@ -buildVersion = new Date().format("yyyyMMddHHmmss") - -pipeline { - agent { node { label 'docker_image' } } - - options { - disableConcurrentBuilds() - } - - stages { - stage('Build Jupyter images') { - steps { - script { - sh "docker build -t saagie/jupyter-python-nbk:v2_$buildVersion ." 
- } - } - } - - stage('Push techno images') { - steps { - script { - withCredentials( - [usernamePassword(credentialsId: '8fc4964e-30c6-4bb9-8a19-69e37ea905b6', - usernameVariable: 'USERNAME', - passwordVariable: 'PASSWORD')]) { - - sh "docker login -u $USERNAME -p $PASSWORD" - sh "docker push saagie/jupyter-python-nbk:v2_$buildVersion" - } - } - } - } - } -} From 32bdc597fe2a8eb4af45c497915d0ee2c84d7a31 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Wed, 17 Jun 2020 16:11:11 +0200 Subject: [PATCH 06/10] [jupyter-spark] add spark to jupyter v2 - initial commit --- spark/Dockerfile | 72 +++++++++++++ spark/build.sh | 32 ++++++ spark/entrypoint.sh | 14 +++ spark/resources/cloudera.list | 4 + spark/resources/core-site.xml | 137 +++++++++++++++++++++++++ spark/resources/requirements_conda.txt | 1 + spark/resources/spark-defaults.conf | 7 ++ spark/resources/spark-env.sh | 44 ++++++++ spark/resources/test.ipynb | 80 +++++++++++++++ spark/resources/test.ipynb.txt | 24 +++++ spark/resources/test2.ipynb | 54 ++++++++++ 11 files changed, 469 insertions(+) create mode 100644 spark/Dockerfile create mode 100644 spark/build.sh create mode 100644 spark/entrypoint.sh create mode 100644 spark/resources/cloudera.list create mode 100644 spark/resources/core-site.xml create mode 100644 spark/resources/requirements_conda.txt create mode 100644 spark/resources/spark-defaults.conf create mode 100644 spark/resources/spark-env.sh create mode 100644 spark/resources/test.ipynb create mode 100644 spark/resources/test.ipynb.txt create mode 100644 spark/resources/test2.ipynb diff --git a/spark/Dockerfile b/spark/Dockerfile new file mode 100644 index 0000000..d95a6cf --- /dev/null +++ b/spark/Dockerfile @@ -0,0 +1,72 @@ +ARG BASE_CONTAINER="ypetit/test:jupyter-minimal_20200403.54" + +FROM $BASE_CONTAINER + +MAINTAINER Saagie + +ENV DEBIAN_FRONTEND noninteractive + +# SAAGIE Spark dependencies +ENV SPARK_VERSION 2.4.5 +ENV HADOOP_VERSION 2.6 + +USER root + +# Install tools +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + vim nano gnupg2 && \ + rm -rf /var/lib/apt/lists/*; + +# Installing Java +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends -y \ + openjdk-8-jre-headless ca-certificates-java && \ + rm -rf /var/lib/apt/lists/* + +# Install Kerberos & ACL for Saagie +COPY resources/cloudera.list /etc/apt/sources.list.d/cloudera.list +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + krb5-user acl \ +# apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 327574EE02A818DD + freeipa-client sentry-hdfs-plugin \ + && rm -rf /var/lib/apt/lists/*; + +# Spark config +ENV PORT0 4040 +ENV SPARK_HOME /usr/local/spark +ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip +ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info +ENV HADOOP_CONF_DIR="/home/jovyan/hadoop" + +RUN mkdir $HADOOP_CONF_DIR +COPY resources/core-site.xml /home/jovyan/hadoop/core-site.xml + +RUN mkdir -p /usr/lib/impala/lib/ && chown $NB_UID /usr/lib/impala/lib +RUN sed -i '2iln -s /etc/hadoop/conf/sentry-libs/hive-hcatalog-core.jar /usr/lib/impala/lib/hive-hcatalog-core.jar' /usr/local/bin/start-notebook.sh + +#Installing Spark +RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -P /tmp \ + && tar -zxf /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local/ \ + && ln -s 
/usr/local/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/ /usr/local/spark \ + && rm /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz + +# Install conda libs (pyarrow) +# TODO copy alsewhere (here it's in /notebook-dirs) +USER $NB_UID +COPY resources/requirements_conda.txt /home/$NB-USER/requirements_conda.txt +RUN conda install -n py27 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ + conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ + conda clean -afy + +USER root +COPY resources/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf.orig +COPY resources/spark-env.sh $SPARK_HOME/conf/spark-env.sh +COPY resources/test.ipynb /home/$NB_USER/work/test.ipynb +COPY resources/test2.ipynb /home/$NB_USER/work/test2.ipynb +RUN chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-defaults.conf.orig \ + && chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-env.sh \ + && chown $NB_USER:$NB_UID /home/$NB_USER/work/test.ipynb \ + && chown $NB_USER:$NB_UID /home/$NB_USER/work/test2.ipynb \ + && chmod +x $SPARK_HOME/conf/spark-env.sh + +USER $NB_UID +WORKDIR /home/jovyan/work diff --git a/spark/build.sh b/spark/build.sh new file mode 100644 index 0000000..f00794a --- /dev/null +++ b/spark/build.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -euxo pipefail + +NO_CACHE="" +export DOCKER_BUILDKIT=0 +#BASE_CONTAINER="jupyter/scipy-notebook:c7fb6660d096" +#PYTHON2_IMG="saagie/python:2.7.202003.76" +#PYTHON3_IMG="saagie/python:3.6.202003.76" + + +while (( $# )); do + case $1 in + --no-cache) NO_CACHE="--no-cache" + ;; + --buildkit) export DOCKER_BUILDKIT=1 + ;; + --*) echo "Bad Option $1" + ;; + *) TYPE=$1 + ;; + *) break + ;; + esac + shift +done + +docker build $NO_CACHE \ + -t $TYPE \ + . + # --build-arg BASE_CONTAINER=$BASE_CONTAINER \ + # --build-arg PYTHON2_IMG=$PYTHON2_IMG \ + # --build-arg PYTHON3_IMG=$PYTHON3_IMG \ diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh new file mode 100644 index 0000000..a2f287b --- /dev/null +++ b/spark/entrypoint.sh @@ -0,0 +1,14 @@ +#!/bin/bash +chown -R jovyan /notebooks-dir + +TARGET_NAMESPACE="saagie1-projectc5aa8432-f94a-4707-bb9e-79e183e8b107" +FILE_KERNEL_PY27="/home/jovyan/.local/share/jupyter/kernels/py27/kernel.json" +FILE_KERNEL_PY36="/home/jovyan/.local/share/jupyter/kernels/py36/kernel.json" + +# get line conatining env +# sed -n '/"env"/p' $FILE_KERNEL_PY36 +# if line exists insert + + + +start-notebook.sh --KernelSpecManager.ensure_native_kernel=False --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.base_url=$SAAGIE_BASE_PATH diff --git a/spark/resources/cloudera.list b/spark/resources/cloudera.list new file mode 100644 index 0000000..78f7d94 --- /dev/null +++ b/spark/resources/cloudera.list @@ -0,0 +1,4 @@ +# Modified https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/cloudera.list file +# To reference as trusted despite the expired / weak key +deb [arch=amd64, trusted=yes] https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh5 contrib +deb-src [trusted=yes] https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh5 contrib diff --git a/spark/resources/core-site.xml b/spark/resources/core-site.xml new file mode 100644 index 0000000..ea8ed4e --- /dev/null +++ b/spark/resources/core-site.xml @@ -0,0 +1,137 @@ + + + + + + + fs.defaultFS + hdfs://cluster + + + + + dfs.permissions.superusergroup + hadoop + + + + hadoop.proxyuser.mapred.groups + * + + + + hadoop.proxyuser.mapred.hosts + * + + + + hadoop.proxyuser.hue.hosts + nn1.p01.saagie1.a36152.saagie + + + + 
hadoop.proxyuser.hue.groups + * + + + + hadoop.proxyuser.hdfs.groups + * + + + + hadoop.proxyuser.hdfs.hosts + * + + + + hadoop.proxyuser.httpfs.hosts + * + + + + hadoop.proxyuser.httpfs.groups + * + + + + hadoop.proxyuser.oozie.hosts + * + + + + hadoop.proxyuser.oozie.groups + * + + + + hadoop.proxyuser.impala.hosts + * + + + + hadoop.proxyuser.impala.groups + * + + + + hadoop.proxyuser.hive.hosts + * + + + + hadoop.proxyuser.hive.groups + * + + + + hadoop.proxyuser.yarn.groups + * + + + + hadoop.proxyuser.yarn.hosts + * + + + + hadoop.proxyuser.m2m.hosts + * + + + + hadoop.proxyuser.m2m.groups + * + + + + + hadoop.security.group.mapping + org.apache.hadoop.security.CompositeGroupsMapping + + + + hadoop.security.group.mapping.providers + shell4services + + + + hadoop.security.group.mapping.providers.combined + true + + + + hadoop.security.group.mapping.provider.shell4services + org.apache.hadoop.security.ShellBasedUnixGroupsMapping + + + + + hadoop.security.auth_to_local + + RULE:[2:$1] + DEFAULT + + + + diff --git a/spark/resources/requirements_conda.txt b/spark/resources/requirements_conda.txt new file mode 100644 index 0000000..19d8363 --- /dev/null +++ b/spark/resources/requirements_conda.txt @@ -0,0 +1 @@ +pyarrow diff --git a/spark/resources/spark-defaults.conf b/spark/resources/spark-defaults.conf new file mode 100644 index 0000000..ace4a1e --- /dev/null +++ b/spark/resources/spark-defaults.conf @@ -0,0 +1,7 @@ +#spark.submit.deployMode cluster +spark.master k8s\://https\://kubernetes.default.svc\:443 +spark.kubernetes.driver.label.io.saagie/type job +spark.kubernetes.driver.label.io.saagie/spark-role driver +spark.kubernetes.executor.label.io.saagie/type job +spark.kubernetes.authenticate.driver.serviceAccountName spark-driver +#spark.defaults TRUE diff --git a/spark/resources/spark-env.sh b/spark/resources/spark-env.sh new file mode 100644 index 0000000..eb065e7 --- /dev/null +++ b/spark/resources/spark-env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +shuffle=false +#if [[ $APACHE_SPARK_VERSION == "2.1.0" ]]; then +# shuffle=true +#elif [[ $SPARK_VERSION == "2.1.0" ]]; then +# shuffle=true +#fi + +if [ -z "$SPARK_VERSION" ]; then + export SPARK_VERSION=$APACHE_SPARK_VERSION +fi + +export HADOOP_CONF_DIR=/etc/hadoop/conf/ + +#this a hack +#export SPARK_CONF_DIR=$(mktemp -d) +export SPARK_CONF_DIR=$SPARK_HOME/conf +#cp /tmp/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf +cp $SPARK_CONF_DIR/spark-defaults.conf.orig $SPARK_CONF_DIR/spark-defaults.conf +echo "spark.ui.port $PORT0" >> $SPARK_CONF_DIR/spark-defaults.conf + +if [ -z "$PYTHON_VERSION" ]; then + echo "spark.mesos.executor.docker.image saagie/spark:java-$JAVA_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf +else + echo "spark.mesos.executor.docker.image saagie/spark:python-$PYTHON_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf +fi + +echo "spark.shuffle.service.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf +echo "spark.dynamicAllocation.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf + +# hack for notebook +if [ -z "$PORT1" ]; then + PORT1=$PORT0 +fi + +echo "spark.driver.port $PORT1" >> $SPARK_CONF_DIR/spark-defaults.conf +cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new1 + +echo "spark.kubernetes.namespace $(cat /etc/hostname)" >> $SPARK_CONF_DIR/spark-defaults.conf + +export PYSPARK_PYTHON=/opt/conda/envs/py36/bin/python + + +cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new2 diff 
--git a/spark/resources/test.ipynb b/spark/resources/test.ipynb new file mode 100644 index 0000000..afed899 --- /dev/null +++ b/spark/resources/test.ipynb @@ -0,0 +1,80 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import os\n", + "#import pyspark\n", + "from pyspark.sql import SparkSession\n", + "#pyspark.SparkConf().setAll([('spark.submit.deployMode', 'cluster'), ('spark.master', 'k8s://https://kubernetes.default.svc:443')])\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"File lines count\") \\\n", + " .config(\"spark.submit.deployMode\", \"cluster\") \\\n", + " .config(\"spark.master\", \"k8s://https://kubernetes.default.svc:443\") \\\n", + " .config(\"spark.kubernetes.driver.label.io.saagie/type\", \"job\") \\\n", + " .config(\"spark.kubernetes.driver.label.io.saagie/spark-role\", \"driver\") \\\n", + " .config(\"spark.kubernetes.executor.label.io.saagie/type\", \"job\") \\\n", + " .config(\"spark.kubernetes.authenticate.driver.serviceAccountName\", \"spark-driver\") \\\n", + " .config(\"spark.kubernetes.namespace\", \"POD_NAMESPACE\") \\\n", + " .getOrCreate() \n", + "\n", + "#Doit en principe retourner l'URL de K8S\n", + "spark.sparkContext.getConf().getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "Py4JJavaError", + "evalue": "An error occurred while calling o264.count.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 1, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat 
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat scala.Option.foreach(Option.scala:257)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:385)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:989)\n\tat org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2836)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2835)\n\tat org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)\n\tat org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)\n\tat org.apache.spark.sql.Dataset.count(Dataset.scala:2835)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 
1 more\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mlist_a\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'aa'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'aaze'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdfa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreateDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Nombre d'éléments : \"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mdfa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/pyspark/sql/dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 522\u001b[0m \"\"\"\n\u001b[0;32m--> 523\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 524\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 326\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m raise Py4JError(\n", + "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o264.count.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 1, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat scala.Option.foreach(Option.scala:257)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:385)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:989)\n\tat org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2836)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2835)\n\tat org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)\n\tat org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)\n\tat 
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)\n\tat org.apache.spark.sql.Dataset.count(Dataset.scala:2835)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 
1 more\n" + ] + } + ], + "source": [ + "list_a=[('aa',3), ('aaze',3)]\n", + "dfa = spark.createDataFrame(list_a)\n", + "print(\"Nombre d'éléments : \"+str(dfa.count()))\n", + "dfa.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "py36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/spark/resources/test.ipynb.txt b/spark/resources/test.ipynb.txt new file mode 100644 index 0000000..5461693 --- /dev/null +++ b/spark/resources/test.ipynb.txt @@ -0,0 +1,24 @@ +import os +#import pyspark +from pyspark.sql import SparkSession +#pyspark.SparkConf().setAll([('spark.submit.deployMode', 'cluster'), ('spark.master', 'k8s://https://kubernetes.default.svc:443')]) + +spark = SparkSession \ + .builder \ + .appName("File lines count") \ + .config("spark.submit.deployMode", "cluster") \ + .config("spark.master", "k8s://https://kubernetes.default.svc:443") \ + .config("spark.kubernetes.driver.label.io.saagie/type", "job") \ + .config("spark.kubernetes.driver.label.io.saagie/spark-role", "driver") \ + .config("spark.kubernetes.executor.label.io.saagie/type", "job") \ + .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark-driver") \ + .config("spark.kubernetes.namespace", "POD_NAMESPACE") \ + .getOrCreate() + +#Doit en principe retourner l'URL de K8S +spark.sparkContext.getConf().getAll() + +list_a=[('aa',3), ('aaze',3)] +dfa = spark.createDataFrame(list_a) +print("Nombre d'éléments : "+str(dfa.count())) +dfa.show() diff --git a/spark/resources/test2.ipynb b/spark/resources/test2.ipynb new file mode 100644 index 0000000..0f5fd87 --- /dev/null +++ b/spark/resources/test2.ipynb @@ -0,0 +1,54 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"File lines count\") \\\n", + " .getOrCreate() \n", + "\n", + "#Doit en principe retourner l'URL de K8S\n", + "spark.sparkContext.getConf().getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list_a=[('aa',3), ('aaze',3)]\n", + "dfa = spark.createDataFrame(list_a)\n", + "print(\"Nombre d'éléments : \"+str(dfa.count()))\n", + "dfa.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "py36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 6055c908d300af0de54b265b70b0342dc4bb3e48 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 25 Jun 2020 13:43:58 +0200 Subject: [PATCH 07/10] [jupyter-spark] update image and base image to use latest xenial based jupyter/minimal image and remove python 2.7 --- spark/Dockerfile | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/spark/Dockerfile b/spark/Dockerfile index d95a6cf..69b6071 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -1,4 +1,4 @@ -ARG 
BASE_CONTAINER="ypetit/test:jupyter-minimal_20200403.54" +ARG BASE_CONTAINER="ypetit/test:jupyter-python-notebook_minimal_20200625_02" FROM $BASE_CONTAINER @@ -24,9 +24,9 @@ RUN apt-get update -qq && apt-get install -yqq --no-install-recommends -y \ # Install Kerberos & ACL for Saagie COPY resources/cloudera.list /etc/apt/sources.list.d/cloudera.list -RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ +RUN apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 327574EE02A818DD \ + && apt-get update -qq && apt-get install -yqq --no-install-recommends \ krb5-user acl \ -# apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 327574EE02A818DD freeipa-client sentry-hdfs-plugin \ && rm -rf /var/lib/apt/lists/*; @@ -53,9 +53,8 @@ RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SP # TODO copy alsewhere (here it's in /notebook-dirs) USER $NB_UID COPY resources/requirements_conda.txt /home/$NB-USER/requirements_conda.txt -RUN conda install -n py27 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ - conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ - conda clean -afy +RUN conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ + conda clean -ay USER root COPY resources/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf.orig From f9c3e4de6d6bc9b21d1b8b142edcb7e9d5347131 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 25 Jun 2020 14:18:58 +0200 Subject: [PATCH 08/10] [jupyter-spark] remove spark-default.conf and kubernetes references --- spark/Dockerfile | 4 +--- spark/resources/spark-defaults.conf | 7 ------- spark/resources/spark-env.sh | 20 -------------------- 3 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 spark/resources/spark-defaults.conf diff --git a/spark/Dockerfile b/spark/Dockerfile index 69b6071..8e51701 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -57,12 +57,10 @@ RUN conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda conda clean -ay USER root -COPY resources/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf.orig COPY resources/spark-env.sh $SPARK_HOME/conf/spark-env.sh COPY resources/test.ipynb /home/$NB_USER/work/test.ipynb COPY resources/test2.ipynb /home/$NB_USER/work/test2.ipynb -RUN chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-defaults.conf.orig \ - && chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-env.sh \ +RUN chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-env.sh \ && chown $NB_USER:$NB_UID /home/$NB_USER/work/test.ipynb \ && chown $NB_USER:$NB_UID /home/$NB_USER/work/test2.ipynb \ && chmod +x $SPARK_HOME/conf/spark-env.sh diff --git a/spark/resources/spark-defaults.conf b/spark/resources/spark-defaults.conf deleted file mode 100644 index ace4a1e..0000000 --- a/spark/resources/spark-defaults.conf +++ /dev/null @@ -1,7 +0,0 @@ -#spark.submit.deployMode cluster -spark.master k8s\://https\://kubernetes.default.svc\:443 -spark.kubernetes.driver.label.io.saagie/type job -spark.kubernetes.driver.label.io.saagie/spark-role driver -spark.kubernetes.executor.label.io.saagie/type job -spark.kubernetes.authenticate.driver.serviceAccountName spark-driver -#spark.defaults TRUE diff --git a/spark/resources/spark-env.sh b/spark/resources/spark-env.sh index eb065e7..f0492c0 100644 --- a/spark/resources/spark-env.sh +++ b/spark/resources/spark-env.sh @@ -1,10 +1,5 @@ #!/usr/bin/env bash shuffle=false -#if [[ $APACHE_SPARK_VERSION == "2.1.0" ]]; then -# shuffle=true -#elif [[ $SPARK_VERSION 
== "2.1.0" ]]; then -# shuffle=true -#fi if [ -z "$SPARK_VERSION" ]; then export SPARK_VERSION=$APACHE_SPARK_VERSION @@ -13,18 +8,9 @@ fi export HADOOP_CONF_DIR=/etc/hadoop/conf/ #this a hack -#export SPARK_CONF_DIR=$(mktemp -d) export SPARK_CONF_DIR=$SPARK_HOME/conf -#cp /tmp/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf -cp $SPARK_CONF_DIR/spark-defaults.conf.orig $SPARK_CONF_DIR/spark-defaults.conf echo "spark.ui.port $PORT0" >> $SPARK_CONF_DIR/spark-defaults.conf -if [ -z "$PYTHON_VERSION" ]; then - echo "spark.mesos.executor.docker.image saagie/spark:java-$JAVA_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf -else - echo "spark.mesos.executor.docker.image saagie/spark:python-$PYTHON_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf -fi - echo "spark.shuffle.service.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf echo "spark.dynamicAllocation.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf @@ -34,11 +20,5 @@ if [ -z "$PORT1" ]; then fi echo "spark.driver.port $PORT1" >> $SPARK_CONF_DIR/spark-defaults.conf -cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new1 - -echo "spark.kubernetes.namespace $(cat /etc/hostname)" >> $SPARK_CONF_DIR/spark-defaults.conf export PYSPARK_PYTHON=/opt/conda/envs/py36/bin/python - - -cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new2 From c6eb1f01c76298a10b2ff8db635535c86c2501c1 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Wed, 8 Jul 2020 14:57:15 +0200 Subject: [PATCH 09/10] [jupyter-spark] remove useless namespace env var --- spark/entrypoint.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh index a2f287b..65a67e5 100644 --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -1,7 +1,6 @@ #!/bin/bash chown -R jovyan /notebooks-dir -TARGET_NAMESPACE="saagie1-projectc5aa8432-f94a-4707-bb9e-79e183e8b107" FILE_KERNEL_PY27="/home/jovyan/.local/share/jupyter/kernels/py27/kernel.json" FILE_KERNEL_PY36="/home/jovyan/.local/share/jupyter/kernels/py36/kernel.json" From 1f6ba4d5e80c2b3e3542942bfee3649fd9ca0206 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Fri, 10 Jul 2020 00:43:21 +0200 Subject: [PATCH 10/10] [jupyter-spark] use official base image, refacto --- spark/Dockerfile | 4 +--- spark/build.sh | 7 ------- spark/entrypoint.sh | 7 ------- 3 files changed, 1 insertion(+), 17 deletions(-) mode change 100644 => 100755 spark/build.sh mode change 100644 => 100755 spark/entrypoint.sh diff --git a/spark/Dockerfile b/spark/Dockerfile index 8e51701..52cc2be 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -1,6 +1,4 @@ -ARG BASE_CONTAINER="ypetit/test:jupyter-python-notebook_minimal_20200625_02" - -FROM $BASE_CONTAINER +FROM saagie/jupyter-python-nbk:v2-base MAINTAINER Saagie diff --git a/spark/build.sh b/spark/build.sh old mode 100644 new mode 100755 index f00794a..d46fd2e --- a/spark/build.sh +++ b/spark/build.sh @@ -3,10 +3,6 @@ set -euxo pipefail NO_CACHE="" export DOCKER_BUILDKIT=0 -#BASE_CONTAINER="jupyter/scipy-notebook:c7fb6660d096" -#PYTHON2_IMG="saagie/python:2.7.202003.76" -#PYTHON3_IMG="saagie/python:3.6.202003.76" - while (( $# )); do case $1 in @@ -27,6 +23,3 @@ done docker build $NO_CACHE \ -t $TYPE \ . 
- # --build-arg BASE_CONTAINER=$BASE_CONTAINER \ - # --build-arg PYTHON2_IMG=$PYTHON2_IMG \ - # --build-arg PYTHON3_IMG=$PYTHON3_IMG \ diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh old mode 100644 new mode 100755 index 65a67e5..5e3c37a --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -1,13 +1,6 @@ #!/bin/bash chown -R jovyan /notebooks-dir -FILE_KERNEL_PY27="/home/jovyan/.local/share/jupyter/kernels/py27/kernel.json" FILE_KERNEL_PY36="/home/jovyan/.local/share/jupyter/kernels/py36/kernel.json" -# get line conatining env -# sed -n '/"env"/p' $FILE_KERNEL_PY36 -# if line exists insert - - - start-notebook.sh --KernelSpecManager.ensure_native_kernel=False --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.base_url=$SAAGIE_BASE_PATH
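
A note on the failure captured in spark/resources/test.ipynb: the Py4JJavaError there is the usual driver/worker mismatch ("Python in worker has different version 3.7 than that in driver 3.6"), which the spark-env.sh kept by these patches addresses by exporting PYSPARK_PYTHON=/opt/conda/envs/py36/bin/python. The snippet below is a minimal sanity check and not part of the patch series; it assumes it is run from the py36 kernel of the built image with a Spark master reachable through the default builder settings.

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("python-version-check").getOrCreate()

def worker_python_version(_):
    # Runs on an executor, which Spark launches from PYSPARK_PYTHON
    import sys
    return "%d.%d" % sys.version_info[:2]

driver_ver = "%d.%d" % sys.version_info[:2]
worker_ver = spark.sparkContext.parallelize([None], 1).map(worker_python_version).first()

print("driver:", driver_ver, "worker:", worker_ver)
assert driver_ver == worker_ver, "PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON must point at the same minor version"

# The cell that failed in test.ipynb should now run cleanly:
dfa = spark.createDataFrame([('aa', 3), ('aaze', 3)])
print("Nombre d'éléments : " + str(dfa.count()))
dfa.show()

If the assertion fails, the environment is still picking up a different interpreter than the one pinned in spark-env.sh, and the count()/show() calls will raise the same PythonException seen in the committed notebook output.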