From 6410663e8a78150d30c3b513f9b903e3af6b7bd2 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Fri, 5 Jun 2020 18:49:58 +0200 Subject: [PATCH 01/10] update to latest jupyter image which uses bionic in place of xenial --- Dockerfile | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 93668da..f1faaca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG PYTHON2_IMG="saagie/python:2.7.202005.84" ARG PYTHON3_IMG="saagie/python:3.6.202005.84" # FIXME should use a minimal image and add libs after + update to latest available -ARG BASE_CONTAINER="jupyter/scipy-notebook:c7fb6660d096" +ARG BASE_CONTAINER="jupyter/scipy-notebook:76402a27fd13" FROM $PYTHON2_IMG AS PYTHON2 FROM $PYTHON3_IMG AS PYTHON3 @@ -25,8 +25,8 @@ USER root RUN apt-get update && apt-get install -y --no-install-recommends \ libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ - libpng3 libfreetype6-dev libatlas-base-dev gfortran \ - libgdal1-dev sasl2-bin libsasl2-2 libsasl2-dev \ + libpng16-16 libfreetype6-dev libatlas-base-dev gfortran \ + libgdal-dev sasl2-bin libsasl2-2 libsasl2-dev \ libsasl2-modules unixodbc-dev python3-tk \ qt5-default \ libqt5webkit5-dev \ @@ -121,15 +121,11 @@ ENV CUDNN_VERSION 7.6.0.64 LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates apt-transport-https gnupg-curl && \ + ca-certificates apt-transport-https gnupg2 curl && \ rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a apt-get update && apt-get install -y --no-install-recommends \ cuda-cudart-$CUDA_PKG_VERSION \ From 1f0170c3d8ef6e7aa9811652813a2bf5b94b9e2e Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Wed, 8 Jul 2020 15:08:26 +0200 Subject: [PATCH 02/10] [scipy] move jupyter official scipy as base in own folder --- Dockerfile => scipy/Dockerfile | 0 Jenkinsfile => scipy/Jenkinsfile | 0 build.sh => scipy/build.sh | 0 entrypoint.sh => scipy/entrypoint.sh | 0 python2_lib_test.py => scipy/python2_lib_test.py | 0 python3_lib_test.py => scipy/python3_lib_test.py | 0 requirements_conda2.txt => scipy/requirements_conda2.txt | 0 requirements_conda3.txt => scipy/requirements_conda3.txt 
| 0 requirements_pip2.txt => scipy/requirements_pip2.txt | 0 requirements_pip3.txt => scipy/requirements_pip3.txt | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename Dockerfile => scipy/Dockerfile (100%) rename Jenkinsfile => scipy/Jenkinsfile (100%) rename build.sh => scipy/build.sh (100%) rename entrypoint.sh => scipy/entrypoint.sh (100%) rename python2_lib_test.py => scipy/python2_lib_test.py (100%) rename python3_lib_test.py => scipy/python3_lib_test.py (100%) rename requirements_conda2.txt => scipy/requirements_conda2.txt (100%) rename requirements_conda3.txt => scipy/requirements_conda3.txt (100%) rename requirements_pip2.txt => scipy/requirements_pip2.txt (100%) rename requirements_pip3.txt => scipy/requirements_pip3.txt (100%) diff --git a/Dockerfile b/scipy/Dockerfile similarity index 100% rename from Dockerfile rename to scipy/Dockerfile diff --git a/Jenkinsfile b/scipy/Jenkinsfile similarity index 100% rename from Jenkinsfile rename to scipy/Jenkinsfile diff --git a/build.sh b/scipy/build.sh similarity index 100% rename from build.sh rename to scipy/build.sh diff --git a/entrypoint.sh b/scipy/entrypoint.sh similarity index 100% rename from entrypoint.sh rename to scipy/entrypoint.sh diff --git a/python2_lib_test.py b/scipy/python2_lib_test.py similarity index 100% rename from python2_lib_test.py rename to scipy/python2_lib_test.py diff --git a/python3_lib_test.py b/scipy/python3_lib_test.py similarity index 100% rename from python3_lib_test.py rename to scipy/python3_lib_test.py diff --git a/requirements_conda2.txt b/scipy/requirements_conda2.txt similarity index 100% rename from requirements_conda2.txt rename to scipy/requirements_conda2.txt diff --git a/requirements_conda3.txt b/scipy/requirements_conda3.txt similarity index 100% rename from requirements_conda3.txt rename to scipy/requirements_conda3.txt diff --git a/requirements_pip2.txt b/scipy/requirements_pip2.txt similarity index 100% rename from requirements_pip2.txt rename to scipy/requirements_pip2.txt diff --git a/requirements_pip3.txt b/scipy/requirements_pip3.txt similarity index 100% rename from requirements_pip3.txt rename to scipy/requirements_pip3.txt From b26c340dc31c21e303b97bb31ec8502b91523380 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 25 Jun 2020 11:41:53 +0200 Subject: [PATCH 03/10] [16] use jupyter/minimal-notebook as base, drop python2.x support, update to latest saagie/python image in a separate folder --- .gitignore | 1 + README.md | 4 +- minimal/Dockerfile | 142 ++++++++++++++++++++++++++++++++ minimal/Jenkinsfile | 34 ++++++++ minimal/build.sh | 25 ++++++ minimal/entrypoint.sh | 4 + minimal/python3_lib_test.py | 103 +++++++++++++++++++++++ minimal/requirements_conda3.txt | 3 + minimal/requirements_pip3.txt | 2 + 9 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 minimal/Dockerfile create mode 100644 minimal/Jenkinsfile create mode 100755 minimal/build.sh create mode 100755 minimal/entrypoint.sh create mode 100644 minimal/python3_lib_test.py create mode 100644 minimal/requirements_conda3.txt create mode 100644 minimal/requirements_pip3.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..485dee6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea diff --git a/README.md b/README.md index e90dfac..0c958c3 100644 --- a/README.md +++ b/README.md @@ -44,5 +44,5 @@ ### For python 3 !pip install libraryName -### For python 2 - !pip2 install libraryName +/!\ Python2 support dropped + diff --git a/minimal/Dockerfile 
b/minimal/Dockerfile new file mode 100644 index 0000000..07985bd --- /dev/null +++ b/minimal/Dockerfile @@ -0,0 +1,142 @@ +ARG PYTHON3_IMG="saagie/python:3.6.202005.84" + +# use latest image with ubuntu 16.04 Xenial for CDH5 compatibility +# see (https://github.com/jupyter/docker-stacks/commits/master?after=04f7f60d34a674a2964d96a6cb97c57a7870a828+664) +ARG BASE_CONTAINER="jupyter/minimal-notebook:f9e77e3ddd6f" + +FROM $PYTHON3_IMG AS PYTHON3 +FROM $BASE_CONTAINER + +MAINTAINER Saagie + +ENV PATH="${PATH}:/home/$NB_USER/.local/bin" + + +# Starts by cleaning useless npm cache & other files +RUN npm cache clean --force \ + && conda clean -ay \ + && rm -rf $CONDA_DIR/share/jupyter/lab/staging +# Not necessary to apt-get clean it seems + +########################## LIBS PART BEGIN ########################## +USER root +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + # replaces libpng3 for bionic + libpng16-16 \ + # replaces libdal6 for bionic + libgdal-dev \ + # needed to compile psycopg2 + libpq-dev \ + curl \ + libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ + flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ + libfreetype6-dev libatlas-base-dev gfortran \ + sasl2-bin libsasl2-2 libsasl2-dev \ + libsasl2-modules unixodbc-dev python3-tk \ + qt5-default \ + libqt5webkit5-dev \ + libcurl4-openssl-dev \ + && rm -rf /var/lib/apt/lists/* +########################## LIBS PART END ########################## + + +################ Kernels / Conda envs / requirements PART BEGIN ################ +USER $NB_USER +# Uninstall python3 kernel +RUN jupyter kernelspec remove -f python3 + +# Update conda to latest version +#RUN conda update -n root conda \ +RUN conda clean -ay + +# seems there's sometimesa problem with pyzmq so need to reinstall it... +RUN conda create -n py36 python=3.6 \ + && bash -c "source activate py36 && pip uninstall pyzmq -y && pip install pyzmq && conda install notebook ipykernel -y && ipython kernel install --user --name py36 --display-name 'Python 3.6'" \ + && conda clean -ay \ + && rm -rf ~/.cache/pip + +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +SHELL ["/bin/bash", "-c"] +# Add libs for python 3.6 env +# inherited from saagie/python:3.6 image +# installed via pip only +# installed via conda +COPY requirements_conda3.txt requirements_conda3.txt +COPY --from=PYTHON3 /requirements.txt ./requirements_python3.txt +COPY requirements_pip3.txt requirements_pip3.txt +RUN conda install -n py36 --quiet --yes --file requirements_conda3.txt \ + # Some installed library (scikit-learn) could not be removed so use --ignore-installed \ + && sed -n '/scikit-learn/p' requirements_python3.txt >> requirements_python3_ignore-installed.txt \ + && sed -i '/scikit-learn/d' requirements_python3.txt \ + && . 
activate py36 \ + && python -m pip install --no-cache-dir --ignore-installed -r requirements_python3_ignore-installed.txt \ + && python -m pip install --no-cache-dir -r requirements_python3.txt \ + && python -m pip install --no-cache-dir -r requirements_pip3.txt \ + && conda deactivate \ + && conda clean -ay \ + && rm -rf ~/.cache/pip +################ Kernels / Conda envs / requirements PART ENDS ################# + + +########################## CUDA PART BEGIN ########################## +USER root + +ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" + +ENV CUDA_VERSION 10.0.130 +ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 +ENV NCCL_VERSION 2.4.2 +ENV CUDNN_VERSION 7.6.0.64 + +LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ + echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a + apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-$CUDA_PKG_VERSION \ + cuda-libraries-$CUDA_PKG_VERSION \ + cuda-nvtx-$CUDA_PKG_VERSION \ + cuda-compat-10-0 && \ + libnccl2=$NCCL_VERSION-1+cuda10.0 \ + libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ + && apt-mark hold libnccl2 libcudnn7 \ + && ln -s cuda-10.0 /usr/local/cuda \ + && rm -rf /var/lib/apt/lists/* \ + # Path doesn't exists... 
here for compatibility it seems https://gitlab.com/nvidia/container-images/cuda/issues/27 + && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf +########################## CUDA PART END ########################## + + +########################## NOTEBOOKS DIR ########################## +USER root +# Create default workdir (useful if no volume mounted) +RUN mkdir /notebooks-dir && chown 1000:100 /notebooks-dir +# Define default workdir +WORKDIR /notebooks-dir +########################## NOTEBOOKS DIR END ########################## + +#Add entrypoint.sh +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# Should run as $NB_USER +USER $NB_USER + +# Default: run without authentication +CMD ["/entrypoint.sh"] diff --git a/minimal/Jenkinsfile b/minimal/Jenkinsfile new file mode 100644 index 0000000..106c636 --- /dev/null +++ b/minimal/Jenkinsfile @@ -0,0 +1,34 @@ +buildVersion = new Date().format("yyyyMMddHHmmss") + +pipeline { + agent { node { label 'docker_image' } } + + options { + disableConcurrentBuilds() + } + + stages { + stage('Build Jupyter images') { + steps { + script { + sh "docker build -t saagie/jupyter-python-nbk:v2_$buildVersion ." + } + } + } + + stage('Push techno images') { + steps { + script { + withCredentials( + [usernamePassword(credentialsId: '8fc4964e-30c6-4bb9-8a19-69e37ea905b6', + usernameVariable: 'USERNAME', + passwordVariable: 'PASSWORD')]) { + + sh "docker login -u $USERNAME -p $PASSWORD" + sh "docker push saagie/jupyter-python-nbk:v2_$buildVersion" + } + } + } + } + } +} diff --git a/minimal/build.sh b/minimal/build.sh new file mode 100755 index 0000000..d46fd2e --- /dev/null +++ b/minimal/build.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -euxo pipefail + +NO_CACHE="" +export DOCKER_BUILDKIT=0 + +while (( $# )); do + case $1 in + --no-cache) NO_CACHE="--no-cache" + ;; + --buildkit) export DOCKER_BUILDKIT=1 + ;; + --*) echo "Bad Option $1" + ;; + *) TYPE=$1 + ;; + *) break + ;; + esac + shift +done + +docker build $NO_CACHE \ + -t $TYPE \ + . 
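A quick usage sketch for the minimal/build.sh script above. This is an illustrative invocation only; the tag name jupyter-minimal-local is a placeholder, not a tag used by this repository:

    # from the minimal/ folder: build with BuildKit enabled and without cache;
    # the positional argument ends up as the -t value passed to docker build
    cd minimal
    ./build.sh --buildkit --no-cache jupyter-minimal-local

Note that the script uses set -u, so omitting the positional tag argument makes the final docker build line fail on the unset $TYPE variable.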
diff --git a/minimal/entrypoint.sh b/minimal/entrypoint.sh new file mode 100755 index 0000000..682c6eb --- /dev/null +++ b/minimal/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/bash +chown -R jovyan /notebooks-dir + +start-notebook.sh --KernelSpecManager.ensure_native_kernel=False --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.base_url=$SAAGIE_BASE_PATH diff --git a/minimal/python3_lib_test.py b/minimal/python3_lib_test.py new file mode 100644 index 0000000..b414cb4 --- /dev/null +++ b/minimal/python3_lib_test.py @@ -0,0 +1,103 @@ +import sys +print(sys.executable) +print(sys.version) +print(sys.version_info) + +### +### Test conda install +### FIXME find a way to test those installs +#from hdfs.hfile import Hfile +#import hdf5 + +### +### Test Jupyter specific +### +from PIL import Image +from google.protobuf import descriptor_pb2 + +### +### Test imports from python3 +### +import addok +import apiclient +import bs4 +import bokeh +import bs4 +from confluent_kafka import Producer +import crypto +import cython +import django +import dryscrape +import elasticsearch +import excel +from fastparquet import ParquetFile +import fiona +import folium +import gensim +import geopandas +import geopy +import graphviz +import h5py +import hdfs +import autosklearn.classification +import thrift_sasl +from pybrain.tools.shortcuts import buildNetwork +import ibis +from imblearn.over_sampling import RandomOverSampler +from impala.dbapi import connect +import ipywidgets +import jellyfish +import joblib +from kafka import KafkaConsumer +from keras.layers import Dense +import lime +import lxml +import matplotlib +import mpld3 +import mysql.connector +from neo4j import GraphDatabase +import networkx +import nltk +from numba import jit +import numpy +import cv2 +import openpyxl +import pandas +from pdfminer.psparser import * +import psycopg2 +from Crypto.Hash import SHA256 +import pycurl +import pydotplus +import pymongo +import pyodbc +import shapefile +import pytesseract +from Levenshtein import _levenshtein +from requests_kerberos import * +from skimage import data +from sklearn import datasets +import scipy +import scrapy +import seaborn +import shap +import shapely +import simplejson +import six +import spacy +from sqlalchemy import create_engine +import statsmodels +import tabula +import tensorflow as tf +print('Num GPUs Available: ', len(tf.config.experimental.list_physical_devices('GPU'))) +import tensorflow +import textract +import theano.tensor +import tika +import tokenizer +import torch +import torchvision +import tpot +import umap +from wand.image import Image +import xgboost +import xlwt diff --git a/minimal/requirements_conda3.txt b/minimal/requirements_conda3.txt new file mode 100644 index 0000000..6ce3105 --- /dev/null +++ b/minimal/requirements_conda3.txt @@ -0,0 +1,3 @@ +hdf5==1.10.1 +python-hdfs==2.0.16 +pycurl>=7.43,<7.44 diff --git a/minimal/requirements_pip3.txt b/minimal/requirements_pip3.txt new file mode 100644 index 0000000..1e39f6d --- /dev/null +++ b/minimal/requirements_pip3.txt @@ -0,0 +1,2 @@ +pillow==4.3.0 +protobuf==3.6.1 From 53d272c33f63aa71978e4ecb2c1faaa50470b90c Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 9 Jul 2020 12:30:25 +0200 Subject: [PATCH 04/10] [16] update README for new images --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0c958c3..5a59436 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,54 @@ # Jupyter Datascience Notebook for python +## 
Images + +Jupyter notebook for Python is available as several images: + + * saagie/jupyter-python-nbk:v2-minimal + * saagie/jupyter-python-nbk:v2-base / saagie/jupyter-python-nbk:v2 + * saagie/jupyter-python-nbk:v2-scipy + +### saagie/jupyter-python-nbk:v2-minimal +This image is based on the **jupyter/minimal-notebook** one, + +=> adapted to run smoothly on Saagie's platform + +=> with no additional data science libs: it's up to you to add your own. + +### saagie/jupyter-python-nbk:v2-base +This is the official and main image, based on **jupyter/minimal-notebook** + +=> it comes with a bunch of additional libraries + +=> and is quite similar to **jupyter/scipy-notebook** with even more features. + +This image is the same as **saagie/jupyter-python-nbk:v2**. + +### saagie/jupyter-python-nbk:v2-scipy +This is the legacy @deprecated v2 image, initially based on **jupyter/scipy-notebook** + +=> it comes with a bunch of additional libraries + +=> but is now **deprecated** in favor of ***saagie/jupyter-python-nbk:v2-base*** + + ## Run with : - docker run -p 8888:8888 -v /path/to/data/notebooks/dir:/notebooks-dir saagie/jupyter-python-nbk:latest + +### Standalone image + + docker run -p 8888:8888 -v /path/to/data/notebooks/dir:/notebooks-dir saagie/jupyter-python-nbk:v2 Mounting volume is optional (-v /path/to/data/notebooks/dir:/notebooks-dir) but if you want to do it: * create your local directory with: `mkdir -P /path/to/data/notebooks/dir` * make Jovyan (Jupyter notebook default user) the owner of this directory with: `chown -R 1000:100 /path/to/data/notebooks/dir` +### On Saagie's platform + + * use port 8888 + * define the SAAGIE_BASE_PATH env var (used as the notebook base URL) + * do not activate "rewrite url" + * optionally, you can add a volume mapped to the /notebooks-dir folder + ## Libraries : * Data Processing * numpy From f0b25c099dbbf16fb4bd9781c62b7a87c932f141 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 9 Jul 2020 22:39:47 +0200 Subject: [PATCH 05/10] [16] make minimal minimal, and create base image inherited from minimal --- minimal/Jenkinsfile => Jenkinsfile | 7 +- base/Dockerfile | 100 ++++++++++++++++++++++ base/build.sh | 25 ++++++ {minimal => base}/python3_lib_test.py | 0 {minimal => base}/requirements_conda3.txt | 0 {minimal => base}/requirements_pip3.txt | 0 minimal/Dockerfile | 88 +------------------ scipy/Jenkinsfile | 34 -------- 8 files changed, 132 insertions(+), 122 deletions(-) rename minimal/Jenkinsfile => Jenkinsfile (56%) create mode 100644 base/Dockerfile create mode 100755 base/build.sh rename {minimal => base}/python3_lib_test.py (100%) rename {minimal => base}/requirements_conda3.txt (100%) rename {minimal => base}/requirements_pip3.txt (100%) delete mode 100644 scipy/Jenkinsfile diff --git a/minimal/Jenkinsfile b/Jenkinsfile similarity index 56% rename from minimal/Jenkinsfile rename to Jenkinsfile index 106c636..7a465eb 100644 --- a/minimal/Jenkinsfile +++ b/Jenkinsfile @@ -11,7 +11,9 @@ pipeline { stage('Build Jupyter images') { steps { script { - sh "docker build -t saagie/jupyter-python-nbk:v2_$buildVersion ." + sh "cd minimal && docker build -t saagie/jupyter-python-nbk:v2-minimal_$buildVersion ." + sh "cd base && docker build -t saagie/jupyter-python-nbk:v2-base_$buildVersion -t saagie/jupyter-python-nbk:v2_$buildVersion ." + sh "cd scipy && docker build -t saagie/jupyter-python-nbk:v2-scipy_$buildVersion ."
} } } @@ -25,6 +27,9 @@ pipeline { passwordVariable: 'PASSWORD')]) { sh "docker login -u $USERNAME -p $PASSWORD" + sh "docker push saagie/jupyter-python-nbk:v2-minimal_$buildVersion" + sh "docker push saagie/jupyter-python-nbk:v2-base_$buildVersion" + sh "docker push saagie/jupyter-python-nbk:v2-scipy_$buildVersion" sh "docker push saagie/jupyter-python-nbk:v2_$buildVersion" } } diff --git a/base/Dockerfile b/base/Dockerfile new file mode 100644 index 0000000..b3ae1a6 --- /dev/null +++ b/base/Dockerfile @@ -0,0 +1,100 @@ +ARG PYTHON3_IMG="saagie/python:3.6.202005.84" + +ARG BASE_CONTAINER="saagie/jupyter-python-nbk:v2-minimal" + +FROM $PYTHON3_IMG AS PYTHON3 +FROM $BASE_CONTAINER + +MAINTAINER Saagie + +########################## LIBS PART BEGIN ########################## +USER root +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + # replaces libpng3 for bionic + libpng16-16 \ + # replaces libdal6 for bionic + libgdal-dev \ + # needed to compile psycopg2 + libpq-dev \ + libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ + flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ + libfreetype6-dev libatlas-base-dev gfortran \ + sasl2-bin libsasl2-2 libsasl2-dev \ + libsasl2-modules unixodbc-dev python3-tk \ + qt5-default \ + libqt5webkit5-dev \ + libcurl4-openssl-dev \ + && rm -rf /var/lib/apt/lists/* +########################## LIBS PART END ########################## + + +################ Kernels / Conda envs / requirements PART BEGIN ################ +USER $NB_USER +# TODO check if all necessary seems there are duplicate from jupyter/scipy image +SHELL ["/bin/bash", "-c"] +# Add libs for python 3.6 env +# inherited from saagie/python:3.6 image +# installed via pip only +# installed via conda +COPY requirements_conda3.txt requirements_conda3.txt +COPY --from=PYTHON3 /requirements.txt ./requirements_python3.txt +COPY requirements_pip3.txt requirements_pip3.txt +RUN conda install -n py36 --quiet --yes --file requirements_conda3.txt \ + # Some installed library (scikit-learn) could not be removed so use --ignore-installed \ + && sed -n '/scikit-learn/p' requirements_python3.txt >> requirements_python3_ignore-installed.txt \ + && sed -i '/scikit-learn/d' requirements_python3.txt \ + && . 
activate py36 \ + && python -m pip install --no-cache-dir --ignore-installed -r requirements_python3_ignore-installed.txt \ + && python -m pip install --no-cache-dir -r requirements_python3.txt \ + && python -m pip install --no-cache-dir -r requirements_pip3.txt \ + && conda deactivate \ + && conda clean -ay \ + && rm -rf ~/.cache/pip +################ Kernels / Conda envs / requirements PART ENDS ################# + + +########################## CUDA PART BEGIN ########################## +USER root + +ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" + +ENV CUDA_VERSION 10.0.130 +ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 +ENV NCCL_VERSION 2.4.2 +ENV CUDNN_VERSION 7.6.0.64 + +LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ + echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a + apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-$CUDA_PKG_VERSION \ + cuda-libraries-$CUDA_PKG_VERSION \ + cuda-nvtx-$CUDA_PKG_VERSION \ + cuda-compat-10-0 \ + libnccl2=$NCCL_VERSION-1+cuda10.0 \ + libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ + && apt-mark hold libnccl2 libcudnn7 \ + && ln -s cuda-10.0 /usr/local/cuda \ + && rm -rf /var/lib/apt/lists/* \ + # Path doesn't exist... here for compatibility it seems https://gitlab.com/nvidia/container-images/cuda/issues/27 + && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf +########################## CUDA PART END ########################## + +USER $NB_USER diff --git a/base/build.sh b/base/build.sh new file mode 100755 index 0000000..d46fd2e --- /dev/null +++ b/base/build.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -euxo pipefail + +NO_CACHE="" +export DOCKER_BUILDKIT=0 + +while (( $# )); do + case $1 in + --no-cache) NO_CACHE="--no-cache" + ;; + --buildkit) export DOCKER_BUILDKIT=1 + ;; + --*) echo "Bad Option $1" + ;; + *) TYPE=$1 + ;; + *) break + ;; + esac + shift +done + +docker build $NO_CACHE \ + -t $TYPE \ + .
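A minimal smoke test for the base image defined above, assuming a locally built tag; saagie/jupyter-python-nbk:v2-base_local below is a placeholder, not a tag produced by the Jenkinsfile:

    # the py36 kernel registered in the minimal layer should be listed
    docker run --rm saagie/jupyter-python-nbk:v2-base_local jupyter kernelspec list

    # the NVIDIA packages pinned in the CUDA section should be installed and on hold
    docker run --rm saagie/jupyter-python-nbk:v2-base_local bash -c "apt-mark showhold && dpkg -l libcudnn7 libnccl2"

Actually using the GPU still requires an NVIDIA driver and container runtime on the host (for example docker run --gpus all).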
diff --git a/minimal/python3_lib_test.py b/base/python3_lib_test.py similarity index 100% rename from minimal/python3_lib_test.py rename to base/python3_lib_test.py diff --git a/minimal/requirements_conda3.txt b/base/requirements_conda3.txt similarity index 100% rename from minimal/requirements_conda3.txt rename to base/requirements_conda3.txt diff --git a/minimal/requirements_pip3.txt b/base/requirements_pip3.txt similarity index 100% rename from minimal/requirements_pip3.txt rename to base/requirements_pip3.txt diff --git a/minimal/Dockerfile b/minimal/Dockerfile index 07985bd..0e782fa 100644 --- a/minimal/Dockerfile +++ b/minimal/Dockerfile @@ -1,17 +1,11 @@ -ARG PYTHON3_IMG="saagie/python:3.6.202005.84" - # use latest image with ubuntu 16.04 Xenial for CDH5 compatibility # see (https://github.com/jupyter/docker-stacks/commits/master?after=04f7f60d34a674a2964d96a6cb97c57a7870a828+664) -ARG BASE_CONTAINER="jupyter/minimal-notebook:f9e77e3ddd6f" - -FROM $PYTHON3_IMG AS PYTHON3 -FROM $BASE_CONTAINER +FROM jupyter/minimal-notebook:f9e77e3ddd6f MAINTAINER Saagie ENV PATH="${PATH}:/home/$NB_USER/.local/bin" - # Starts by cleaning useless npm cache & other files RUN npm cache clean --force \ && conda clean -ay \ @@ -22,21 +16,7 @@ RUN npm cache clean --force \ USER root # TODO check if all necessary seems there are duplicate from jupyter/scipy image RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ - # replaces libpng3 for bionic - libpng16-16 \ - # replaces libdal6 for bionic - libgdal-dev \ - # needed to compile psycopg2 - libpq-dev \ curl \ - libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr \ - flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig redis-server libpulse-dev \ - libfreetype6-dev libatlas-base-dev gfortran \ - sasl2-bin libsasl2-2 libsasl2-dev \ - libsasl2-modules unixodbc-dev python3-tk \ - qt5-default \ - libqt5webkit5-dev \ - libcurl4-openssl-dev \ && rm -rf /var/lib/apt/lists/* ########################## LIBS PART END ########################## @@ -55,74 +35,8 @@ RUN conda create -n py36 python=3.6 \ && bash -c "source activate py36 && pip uninstall pyzmq -y && pip install pyzmq && conda install notebook ipykernel -y && ipython kernel install --user --name py36 --display-name 'Python 3.6'" \ && conda clean -ay \ && rm -rf ~/.cache/pip - -# TODO check if all necessary seems there are duplicate from jupyter/scipy image -SHELL ["/bin/bash", "-c"] -# Add libs for python 3.6 env -# inherited from saagie/python:3.6 image -# installed via pip only -# installed via conda -COPY requirements_conda3.txt requirements_conda3.txt -COPY --from=PYTHON3 /requirements.txt ./requirements_python3.txt -COPY requirements_pip3.txt requirements_pip3.txt -RUN conda install -n py36 --quiet --yes --file requirements_conda3.txt \ - # Some installed library (scikit-learn) could not be removed so use --ignore-installed \ - && sed -n '/scikit-learn/p' requirements_python3.txt >> requirements_python3_ignore-installed.txt \ - && sed -i '/scikit-learn/d' requirements_python3.txt \ - && . 
activate py36 \ - && python -m pip install --no-cache-dir --ignore-installed -r requirements_python3_ignore-installed.txt \ - && python -m pip install --no-cache-dir -r requirements_python3.txt \ - && python -m pip install --no-cache-dir -r requirements_pip3.txt \ - && conda deactivate \ - && conda clean -ay \ - && rm -rf ~/.cache/pip ################ Kernels / Conda envs / requirements PART ENDS ################# - -########################## CUDA PART BEGIN ########################## -USER root - -ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin" -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64" -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" - -ENV CUDA_VERSION 10.0.130 -ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 -ENV NCCL_VERSION 2.4.2 -ENV CUDNN_VERSION 7.6.0.64 - -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a - apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - cuda-compat-10-0 && \ - libnccl2=$NCCL_VERSION-1+cuda10.0 \ - libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ - && apt-mark hold libnccl2 libcudnn7 \ - && ln -s cuda-10.0 /usr/local/cuda \ - && rm -rf /var/lib/apt/lists/* \ - # Path doesn't exists... here for compatibility it seems https://gitlab.com/nvidia/container-images/cuda/issues/27 - && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ - && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf -########################## CUDA PART END ########################## - - ########################## NOTEBOOKS DIR ########################## USER root # Create default workdir (useful if no volume mounted) diff --git a/scipy/Jenkinsfile b/scipy/Jenkinsfile deleted file mode 100644 index 106c636..0000000 --- a/scipy/Jenkinsfile +++ /dev/null @@ -1,34 +0,0 @@ -buildVersion = new Date().format("yyyyMMddHHmmss") - -pipeline { - agent { node { label 'docker_image' } } - - options { - disableConcurrentBuilds() - } - - stages { - stage('Build Jupyter images') { - steps { - script { - sh "docker build -t saagie/jupyter-python-nbk:v2_$buildVersion ." 
- } - } - } - - stage('Push techno images') { - steps { - script { - withCredentials( - [usernamePassword(credentialsId: '8fc4964e-30c6-4bb9-8a19-69e37ea905b6', - usernameVariable: 'USERNAME', - passwordVariable: 'PASSWORD')]) { - - sh "docker login -u $USERNAME -p $PASSWORD" - sh "docker push saagie/jupyter-python-nbk:v2_$buildVersion" - } - } - } - } - } -} From 32bdc597fe2a8eb4af45c497915d0ee2c84d7a31 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Wed, 17 Jun 2020 16:11:11 +0200 Subject: [PATCH 06/10] [jupyter-spark] add spark to jupyter v2 - initial commit --- spark/Dockerfile | 72 +++++++++++++ spark/build.sh | 32 ++++++ spark/entrypoint.sh | 14 +++ spark/resources/cloudera.list | 4 + spark/resources/core-site.xml | 137 +++++++++++++++++++++++++ spark/resources/requirements_conda.txt | 1 + spark/resources/spark-defaults.conf | 7 ++ spark/resources/spark-env.sh | 44 ++++++++ spark/resources/test.ipynb | 80 +++++++++++++++ spark/resources/test.ipynb.txt | 24 +++++ spark/resources/test2.ipynb | 54 ++++++++++ 11 files changed, 469 insertions(+) create mode 100644 spark/Dockerfile create mode 100644 spark/build.sh create mode 100644 spark/entrypoint.sh create mode 100644 spark/resources/cloudera.list create mode 100644 spark/resources/core-site.xml create mode 100644 spark/resources/requirements_conda.txt create mode 100644 spark/resources/spark-defaults.conf create mode 100644 spark/resources/spark-env.sh create mode 100644 spark/resources/test.ipynb create mode 100644 spark/resources/test.ipynb.txt create mode 100644 spark/resources/test2.ipynb diff --git a/spark/Dockerfile b/spark/Dockerfile new file mode 100644 index 0000000..d95a6cf --- /dev/null +++ b/spark/Dockerfile @@ -0,0 +1,72 @@ +ARG BASE_CONTAINER="ypetit/test:jupyter-minimal_20200403.54" + +FROM $BASE_CONTAINER + +MAINTAINER Saagie + +ENV DEBIAN_FRONTEND noninteractive + +# SAAGIE Spark dependencies +ENV SPARK_VERSION 2.4.5 +ENV HADOOP_VERSION 2.6 + +USER root + +# Install tools +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + vim nano gnupg2 && \ + rm -rf /var/lib/apt/lists/*; + +# Installing Java +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends -y \ + openjdk-8-jre-headless ca-certificates-java && \ + rm -rf /var/lib/apt/lists/* + +# Install Kerberos & ACL for Saagie +COPY resources/cloudera.list /etc/apt/sources.list.d/cloudera.list +RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ + krb5-user acl \ +# apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 327574EE02A818DD + freeipa-client sentry-hdfs-plugin \ + && rm -rf /var/lib/apt/lists/*; + +# Spark config +ENV PORT0 4040 +ENV SPARK_HOME /usr/local/spark +ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip +ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info +ENV HADOOP_CONF_DIR="/home/jovyan/hadoop" + +RUN mkdir $HADOOP_CONF_DIR +COPY resources/core-site.xml /home/jovyan/hadoop/core-site.xml + +RUN mkdir -p /usr/lib/impala/lib/ && chown $NB_UID /usr/lib/impala/lib +RUN sed -i '2iln -s /etc/hadoop/conf/sentry-libs/hive-hcatalog-core.jar /usr/lib/impala/lib/hive-hcatalog-core.jar' /usr/local/bin/start-notebook.sh + +#Installing Spark +RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -P /tmp \ + && tar -zxf /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local/ \ + && ln -s 
/usr/local/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/ /usr/local/spark \ + && rm /tmp/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz + +# Install conda libs (pyarrow) +# TODO copy alsewhere (here it's in /notebook-dirs) +USER $NB_UID +COPY resources/requirements_conda.txt /home/$NB-USER/requirements_conda.txt +RUN conda install -n py27 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ + conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ + conda clean -afy + +USER root +COPY resources/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf.orig +COPY resources/spark-env.sh $SPARK_HOME/conf/spark-env.sh +COPY resources/test.ipynb /home/$NB_USER/work/test.ipynb +COPY resources/test2.ipynb /home/$NB_USER/work/test2.ipynb +RUN chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-defaults.conf.orig \ + && chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-env.sh \ + && chown $NB_USER:$NB_UID /home/$NB_USER/work/test.ipynb \ + && chown $NB_USER:$NB_UID /home/$NB_USER/work/test2.ipynb \ + && chmod +x $SPARK_HOME/conf/spark-env.sh + +USER $NB_UID +WORKDIR /home/jovyan/work diff --git a/spark/build.sh b/spark/build.sh new file mode 100644 index 0000000..f00794a --- /dev/null +++ b/spark/build.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -euxo pipefail + +NO_CACHE="" +export DOCKER_BUILDKIT=0 +#BASE_CONTAINER="jupyter/scipy-notebook:c7fb6660d096" +#PYTHON2_IMG="saagie/python:2.7.202003.76" +#PYTHON3_IMG="saagie/python:3.6.202003.76" + + +while (( $# )); do + case $1 in + --no-cache) NO_CACHE="--no-cache" + ;; + --buildkit) export DOCKER_BUILDKIT=1 + ;; + --*) echo "Bad Option $1" + ;; + *) TYPE=$1 + ;; + *) break + ;; + esac + shift +done + +docker build $NO_CACHE \ + -t $TYPE \ + . + # --build-arg BASE_CONTAINER=$BASE_CONTAINER \ + # --build-arg PYTHON2_IMG=$PYTHON2_IMG \ + # --build-arg PYTHON3_IMG=$PYTHON3_IMG \ diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh new file mode 100644 index 0000000..a2f287b --- /dev/null +++ b/spark/entrypoint.sh @@ -0,0 +1,14 @@ +#!/bin/bash +chown -R jovyan /notebooks-dir + +TARGET_NAMESPACE="saagie1-projectc5aa8432-f94a-4707-bb9e-79e183e8b107" +FILE_KERNEL_PY27="/home/jovyan/.local/share/jupyter/kernels/py27/kernel.json" +FILE_KERNEL_PY36="/home/jovyan/.local/share/jupyter/kernels/py36/kernel.json" + +# get line conatining env +# sed -n '/"env"/p' $FILE_KERNEL_PY36 +# if line exists insert + + + +start-notebook.sh --KernelSpecManager.ensure_native_kernel=False --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.base_url=$SAAGIE_BASE_PATH diff --git a/spark/resources/cloudera.list b/spark/resources/cloudera.list new file mode 100644 index 0000000..78f7d94 --- /dev/null +++ b/spark/resources/cloudera.list @@ -0,0 +1,4 @@ +# Modified https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/cloudera.list file +# To reference as trusted despite the expired / weak key +deb [arch=amd64, trusted=yes] https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh5 contrib +deb-src [trusted=yes] https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh xenial-cdh5 contrib diff --git a/spark/resources/core-site.xml b/spark/resources/core-site.xml new file mode 100644 index 0000000..ea8ed4e --- /dev/null +++ b/spark/resources/core-site.xml @@ -0,0 +1,137 @@ + + + + + + + fs.defaultFS + hdfs://cluster + + + + + dfs.permissions.superusergroup + hadoop + + + + hadoop.proxyuser.mapred.groups + * + + + + hadoop.proxyuser.mapred.hosts + * + + + + hadoop.proxyuser.hue.hosts + nn1.p01.saagie1.a36152.saagie + + + + 
hadoop.proxyuser.hue.groups + * + + + + hadoop.proxyuser.hdfs.groups + * + + + + hadoop.proxyuser.hdfs.hosts + * + + + + hadoop.proxyuser.httpfs.hosts + * + + + + hadoop.proxyuser.httpfs.groups + * + + + + hadoop.proxyuser.oozie.hosts + * + + + + hadoop.proxyuser.oozie.groups + * + + + + hadoop.proxyuser.impala.hosts + * + + + + hadoop.proxyuser.impala.groups + * + + + + hadoop.proxyuser.hive.hosts + * + + + + hadoop.proxyuser.hive.groups + * + + + + hadoop.proxyuser.yarn.groups + * + + + + hadoop.proxyuser.yarn.hosts + * + + + + hadoop.proxyuser.m2m.hosts + * + + + + hadoop.proxyuser.m2m.groups + * + + + + + hadoop.security.group.mapping + org.apache.hadoop.security.CompositeGroupsMapping + + + + hadoop.security.group.mapping.providers + shell4services + + + + hadoop.security.group.mapping.providers.combined + true + + + + hadoop.security.group.mapping.provider.shell4services + org.apache.hadoop.security.ShellBasedUnixGroupsMapping + + + + + hadoop.security.auth_to_local + + RULE:[2:$1] + DEFAULT + + + + diff --git a/spark/resources/requirements_conda.txt b/spark/resources/requirements_conda.txt new file mode 100644 index 0000000..19d8363 --- /dev/null +++ b/spark/resources/requirements_conda.txt @@ -0,0 +1 @@ +pyarrow diff --git a/spark/resources/spark-defaults.conf b/spark/resources/spark-defaults.conf new file mode 100644 index 0000000..ace4a1e --- /dev/null +++ b/spark/resources/spark-defaults.conf @@ -0,0 +1,7 @@ +#spark.submit.deployMode cluster +spark.master k8s\://https\://kubernetes.default.svc\:443 +spark.kubernetes.driver.label.io.saagie/type job +spark.kubernetes.driver.label.io.saagie/spark-role driver +spark.kubernetes.executor.label.io.saagie/type job +spark.kubernetes.authenticate.driver.serviceAccountName spark-driver +#spark.defaults TRUE diff --git a/spark/resources/spark-env.sh b/spark/resources/spark-env.sh new file mode 100644 index 0000000..eb065e7 --- /dev/null +++ b/spark/resources/spark-env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +shuffle=false +#if [[ $APACHE_SPARK_VERSION == "2.1.0" ]]; then +# shuffle=true +#elif [[ $SPARK_VERSION == "2.1.0" ]]; then +# shuffle=true +#fi + +if [ -z "$SPARK_VERSION" ]; then + export SPARK_VERSION=$APACHE_SPARK_VERSION +fi + +export HADOOP_CONF_DIR=/etc/hadoop/conf/ + +#this a hack +#export SPARK_CONF_DIR=$(mktemp -d) +export SPARK_CONF_DIR=$SPARK_HOME/conf +#cp /tmp/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf +cp $SPARK_CONF_DIR/spark-defaults.conf.orig $SPARK_CONF_DIR/spark-defaults.conf +echo "spark.ui.port $PORT0" >> $SPARK_CONF_DIR/spark-defaults.conf + +if [ -z "$PYTHON_VERSION" ]; then + echo "spark.mesos.executor.docker.image saagie/spark:java-$JAVA_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf +else + echo "spark.mesos.executor.docker.image saagie/spark:python-$PYTHON_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf +fi + +echo "spark.shuffle.service.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf +echo "spark.dynamicAllocation.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf + +# hack for notebook +if [ -z "$PORT1" ]; then + PORT1=$PORT0 +fi + +echo "spark.driver.port $PORT1" >> $SPARK_CONF_DIR/spark-defaults.conf +cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new1 + +echo "spark.kubernetes.namespace $(cat /etc/hostname)" >> $SPARK_CONF_DIR/spark-defaults.conf + +export PYSPARK_PYTHON=/opt/conda/envs/py36/bin/python + + +cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new2 diff 
--git a/spark/resources/test.ipynb b/spark/resources/test.ipynb new file mode 100644 index 0000000..afed899 --- /dev/null +++ b/spark/resources/test.ipynb @@ -0,0 +1,80 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import os\n", + "#import pyspark\n", + "from pyspark.sql import SparkSession\n", + "#pyspark.SparkConf().setAll([('spark.submit.deployMode', 'cluster'), ('spark.master', 'k8s://https://kubernetes.default.svc:443')])\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"File lines count\") \\\n", + " .config(\"spark.submit.deployMode\", \"cluster\") \\\n", + " .config(\"spark.master\", \"k8s://https://kubernetes.default.svc:443\") \\\n", + " .config(\"spark.kubernetes.driver.label.io.saagie/type\", \"job\") \\\n", + " .config(\"spark.kubernetes.driver.label.io.saagie/spark-role\", \"driver\") \\\n", + " .config(\"spark.kubernetes.executor.label.io.saagie/type\", \"job\") \\\n", + " .config(\"spark.kubernetes.authenticate.driver.serviceAccountName\", \"spark-driver\") \\\n", + " .config(\"spark.kubernetes.namespace\", \"POD_NAMESPACE\") \\\n", + " .getOrCreate() \n", + "\n", + "#Doit en principe retourner l'URL de K8S\n", + "spark.sparkContext.getConf().getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "Py4JJavaError", + "evalue": "An error occurred while calling o264.count.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 1, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat 
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat scala.Option.foreach(Option.scala:257)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:385)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:989)\n\tat org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2836)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2835)\n\tat org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)\n\tat org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)\n\tat org.apache.spark.sql.Dataset.count(Dataset.scala:2835)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 
1 more\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mlist_a\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'aa'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'aaze'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdfa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreateDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Nombre d'éléments : \"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mdfa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/pyspark/sql/dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 522\u001b[0m \"\"\"\n\u001b[0;32m--> 523\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 524\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 326\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m raise Py4JError(\n", + "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o264.count.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 1, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)\n\tat scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)\n\tat scala.Option.foreach(Option.scala:257)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)\n\tat org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:385)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:989)\n\tat org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2836)\n\tat org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2835)\n\tat org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)\n\tat org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)\n\tat 
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)\n\tat org.apache.spark.sql.Dataset.count(Dataset.scala:2835)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):\n File \"/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py\", line 267, in main\n (\"%d.%d\" % sys.version_info[:2], version))\nException: Python in worker has different version 3.7 than that in driver 3.6, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.\n\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)\n\tat org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)\n\tat org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)\n\tat org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)\n\tat scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)\n\tat org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)\n\tat scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)\n\tat org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:123)\n\tat org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 
1 more\n" + ] + } + ], + "source": [ + "list_a=[('aa',3), ('aaze',3)]\n", + "dfa = spark.createDataFrame(list_a)\n", + "print(\"Nombre d'éléments : \"+str(dfa.count()))\n", + "dfa.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "py36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/spark/resources/test.ipynb.txt b/spark/resources/test.ipynb.txt new file mode 100644 index 0000000..5461693 --- /dev/null +++ b/spark/resources/test.ipynb.txt @@ -0,0 +1,24 @@ +import os +#import pyspark +from pyspark.sql import SparkSession +#pyspark.SparkConf().setAll([('spark.submit.deployMode', 'cluster'), ('spark.master', 'k8s://https://kubernetes.default.svc:443')]) + +spark = SparkSession \ + .builder \ + .appName("File lines count") \ + .config("spark.submit.deployMode", "cluster") \ + .config("spark.master", "k8s://https://kubernetes.default.svc:443") \ + .config("spark.kubernetes.driver.label.io.saagie/type", "job") \ + .config("spark.kubernetes.driver.label.io.saagie/spark-role", "driver") \ + .config("spark.kubernetes.executor.label.io.saagie/type", "job") \ + .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark-driver") \ + .config("spark.kubernetes.namespace", "POD_NAMESPACE") \ + .getOrCreate() + +#Doit en principe retourner l'URL de K8S +spark.sparkContext.getConf().getAll() + +list_a=[('aa',3), ('aaze',3)] +dfa = spark.createDataFrame(list_a) +print("Nombre d'éléments : "+str(dfa.count())) +dfa.show() diff --git a/spark/resources/test2.ipynb b/spark/resources/test2.ipynb new file mode 100644 index 0000000..0f5fd87 --- /dev/null +++ b/spark/resources/test2.ipynb @@ -0,0 +1,54 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"File lines count\") \\\n", + " .getOrCreate() \n", + "\n", + "#Doit en principe retourner l'URL de K8S\n", + "spark.sparkContext.getConf().getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list_a=[('aa',3), ('aaze',3)]\n", + "dfa = spark.createDataFrame(list_a)\n", + "print(\"Nombre d'éléments : \"+str(dfa.count()))\n", + "dfa.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "py36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 6055c908d300af0de54b265b70b0342dc4bb3e48 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 25 Jun 2020 13:43:58 +0200 Subject: [PATCH 07/10] [jupyter-spark] update image and base image to use latest xenial based jupyter/minimal image and remove python 2.7 --- spark/Dockerfile | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/spark/Dockerfile b/spark/Dockerfile index d95a6cf..69b6071 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -1,4 +1,4 @@ -ARG 
BASE_CONTAINER="ypetit/test:jupyter-minimal_20200403.54" +ARG BASE_CONTAINER="ypetit/test:jupyter-python-notebook_minimal_20200625_02" FROM $BASE_CONTAINER @@ -24,9 +24,9 @@ RUN apt-get update -qq && apt-get install -yqq --no-install-recommends -y \ # Install Kerberos & ACL for Saagie COPY resources/cloudera.list /etc/apt/sources.list.d/cloudera.list -RUN apt-get update -qq && apt-get install -yqq --no-install-recommends \ +RUN apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 327574EE02A818DD \ + && apt-get update -qq && apt-get install -yqq --no-install-recommends \ krb5-user acl \ -# apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 327574EE02A818DD freeipa-client sentry-hdfs-plugin \ && rm -rf /var/lib/apt/lists/*; @@ -53,9 +53,8 @@ RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SP # TODO copy alsewhere (here it's in /notebook-dirs) USER $NB_UID COPY resources/requirements_conda.txt /home/$NB-USER/requirements_conda.txt -RUN conda install -n py27 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ - conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ - conda clean -afy +RUN conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda.txt && \ + conda clean -ay USER root COPY resources/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf.orig From f9c3e4de6d6bc9b21d1b8b142edcb7e9d5347131 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Thu, 25 Jun 2020 14:18:58 +0200 Subject: [PATCH 08/10] [jupyter-spark] remove spark-default.conf and kubernetes references --- spark/Dockerfile | 4 +--- spark/resources/spark-defaults.conf | 7 ------- spark/resources/spark-env.sh | 20 -------------------- 3 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 spark/resources/spark-defaults.conf diff --git a/spark/Dockerfile b/spark/Dockerfile index 69b6071..8e51701 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -57,12 +57,10 @@ RUN conda install -n py36 --quiet --yes --file /home/$NB-USER/requirements_conda conda clean -ay USER root -COPY resources/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf.orig COPY resources/spark-env.sh $SPARK_HOME/conf/spark-env.sh COPY resources/test.ipynb /home/$NB_USER/work/test.ipynb COPY resources/test2.ipynb /home/$NB_USER/work/test2.ipynb -RUN chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-defaults.conf.orig \ - && chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-env.sh \ +RUN chown $NB_USER:$NB_UID $SPARK_HOME/conf/spark-env.sh \ && chown $NB_USER:$NB_UID /home/$NB_USER/work/test.ipynb \ && chown $NB_USER:$NB_UID /home/$NB_USER/work/test2.ipynb \ && chmod +x $SPARK_HOME/conf/spark-env.sh diff --git a/spark/resources/spark-defaults.conf b/spark/resources/spark-defaults.conf deleted file mode 100644 index ace4a1e..0000000 --- a/spark/resources/spark-defaults.conf +++ /dev/null @@ -1,7 +0,0 @@ -#spark.submit.deployMode cluster -spark.master k8s\://https\://kubernetes.default.svc\:443 -spark.kubernetes.driver.label.io.saagie/type job -spark.kubernetes.driver.label.io.saagie/spark-role driver -spark.kubernetes.executor.label.io.saagie/type job -spark.kubernetes.authenticate.driver.serviceAccountName spark-driver -#spark.defaults TRUE diff --git a/spark/resources/spark-env.sh b/spark/resources/spark-env.sh index eb065e7..f0492c0 100644 --- a/spark/resources/spark-env.sh +++ b/spark/resources/spark-env.sh @@ -1,10 +1,5 @@ #!/usr/bin/env bash shuffle=false -#if [[ $APACHE_SPARK_VERSION == "2.1.0" ]]; then -# shuffle=true -#elif [[ $SPARK_VERSION 
== "2.1.0" ]]; then -# shuffle=true -#fi if [ -z "$SPARK_VERSION" ]; then export SPARK_VERSION=$APACHE_SPARK_VERSION @@ -13,18 +8,9 @@ fi export HADOOP_CONF_DIR=/etc/hadoop/conf/ #this a hack -#export SPARK_CONF_DIR=$(mktemp -d) export SPARK_CONF_DIR=$SPARK_HOME/conf -#cp /tmp/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf -cp $SPARK_CONF_DIR/spark-defaults.conf.orig $SPARK_CONF_DIR/spark-defaults.conf echo "spark.ui.port $PORT0" >> $SPARK_CONF_DIR/spark-defaults.conf -if [ -z "$PYTHON_VERSION" ]; then - echo "spark.mesos.executor.docker.image saagie/spark:java-$JAVA_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf -else - echo "spark.mesos.executor.docker.image saagie/spark:python-$PYTHON_VERSION-$SPARK_VERSION-1.3.1-centos" >> $SPARK_CONF_DIR/spark-defaults.conf -fi - echo "spark.shuffle.service.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf echo "spark.dynamicAllocation.enabled $shuffle" >> $SPARK_CONF_DIR/spark-defaults.conf @@ -34,11 +20,5 @@ if [ -z "$PORT1" ]; then fi echo "spark.driver.port $PORT1" >> $SPARK_CONF_DIR/spark-defaults.conf -cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new1 - -echo "spark.kubernetes.namespace $(cat /etc/hostname)" >> $SPARK_CONF_DIR/spark-defaults.conf export PYSPARK_PYTHON=/opt/conda/envs/py36/bin/python - - -cp $SPARK_CONF_DIR/spark-defaults.conf $SPARK_CONF_DIR/spark-defaults.conf.new2 From c6eb1f01c76298a10b2ff8db635535c86c2501c1 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Wed, 8 Jul 2020 14:57:15 +0200 Subject: [PATCH 09/10] [jupyter-spark] remove useless namespace env var --- spark/entrypoint.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh index a2f287b..65a67e5 100644 --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -1,7 +1,6 @@ #!/bin/bash chown -R jovyan /notebooks-dir -TARGET_NAMESPACE="saagie1-projectc5aa8432-f94a-4707-bb9e-79e183e8b107" FILE_KERNEL_PY27="/home/jovyan/.local/share/jupyter/kernels/py27/kernel.json" FILE_KERNEL_PY36="/home/jovyan/.local/share/jupyter/kernels/py36/kernel.json" From 1f6ba4d5e80c2b3e3542942bfee3649fd9ca0206 Mon Sep 17 00:00:00 2001 From: Yann PETIT Date: Fri, 10 Jul 2020 00:43:21 +0200 Subject: [PATCH 10/10] [jupyter-spark] use official base image, refacto --- spark/Dockerfile | 4 +--- spark/build.sh | 7 ------- spark/entrypoint.sh | 7 ------- 3 files changed, 1 insertion(+), 17 deletions(-) mode change 100644 => 100755 spark/build.sh mode change 100644 => 100755 spark/entrypoint.sh diff --git a/spark/Dockerfile b/spark/Dockerfile index 8e51701..52cc2be 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -1,6 +1,4 @@ -ARG BASE_CONTAINER="ypetit/test:jupyter-python-notebook_minimal_20200625_02" - -FROM $BASE_CONTAINER +FROM saagie/jupyter-python-nbk:v2-base MAINTAINER Saagie diff --git a/spark/build.sh b/spark/build.sh old mode 100644 new mode 100755 index f00794a..d46fd2e --- a/spark/build.sh +++ b/spark/build.sh @@ -3,10 +3,6 @@ set -euxo pipefail NO_CACHE="" export DOCKER_BUILDKIT=0 -#BASE_CONTAINER="jupyter/scipy-notebook:c7fb6660d096" -#PYTHON2_IMG="saagie/python:2.7.202003.76" -#PYTHON3_IMG="saagie/python:3.6.202003.76" - while (( $# )); do case $1 in @@ -27,6 +23,3 @@ done docker build $NO_CACHE \ -t $TYPE \ . 
- # --build-arg BASE_CONTAINER=$BASE_CONTAINER \ - # --build-arg PYTHON2_IMG=$PYTHON2_IMG \ - # --build-arg PYTHON3_IMG=$PYTHON3_IMG \ diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh old mode 100644 new mode 100755 index 65a67e5..5e3c37a --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -1,13 +1,6 @@ #!/bin/bash chown -R jovyan /notebooks-dir -FILE_KERNEL_PY27="/home/jovyan/.local/share/jupyter/kernels/py27/kernel.json" FILE_KERNEL_PY36="/home/jovyan/.local/share/jupyter/kernels/py36/kernel.json" -# get line conatining env -# sed -n '/"env"/p' $FILE_KERNEL_PY36 -# if line exists insert - - - start-notebook.sh --KernelSpecManager.ensure_native_kernel=False --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.base_url=$SAAGIE_BASE_PATH
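
A note on the failure captured in spark/resources/test.ipynb: the Py4JJavaError there is the usual driver/worker mismatch ("Python in worker has different version 3.7 than that in driver 3.6"), which the spark-env.sh kept by these patches addresses by exporting PYSPARK_PYTHON=/opt/conda/envs/py36/bin/python. The snippet below is a minimal sanity check and not part of the patch series; it assumes it is run from the py36 kernel of the built image with a Spark master reachable through the default builder settings.

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("python-version-check").getOrCreate()

def worker_python_version(_):
    # Runs on an executor, which Spark launches from PYSPARK_PYTHON
    import sys
    return "%d.%d" % sys.version_info[:2]

driver_ver = "%d.%d" % sys.version_info[:2]
worker_ver = spark.sparkContext.parallelize([None], 1).map(worker_python_version).first()

print("driver:", driver_ver, "worker:", worker_ver)
assert driver_ver == worker_ver, "PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON must point at the same minor version"

# The cell that failed in test.ipynb should now run cleanly:
dfa = spark.createDataFrame([('aa', 3), ('aaze', 3)])
print("Nombre d'éléments : " + str(dfa.count()))
dfa.show()

If the assertion fails, the environment is still picking up a different interpreter than the one pinned in spark-env.sh, and the count()/show() calls will raise the same PythonException seen in the committed notebook output.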