From 9e8d3a4546639ffd610f3405a1d88603c0a24444 Mon Sep 17 00:00:00 2001 From: renhuanyu <43604335+renhuanyu@users.noreply.github.com> Date: Mon, 28 Jun 2021 23:05:50 +0800 Subject: [PATCH 1/3] Update README.md Add demo about paddlepaddle on valcano. --- README.md | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/README.md b/README.md index 5750434..c1e0fb1 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,225 @@ Check paddle job status $ kubectl -n paddle-system get pdj ``` +### Run demo paddlejob on volcano + +Volcano is a batch scheduling system based on kubernetes. Using it for task scheduling results in better performance.Volcano installation, you can refer to [this website](https://volcano.sh/en/docs/installation/). + +Edit ctr-Volcano. yaml in the node. + +``` +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + name: ctr-volcano +spec: + minAvailable: 4 + schedulerName: volcano + policies: + - event: PodEvicted + action: RestartJob + - event: PodFailed + action: RestartJob + tasks: + - replicas: 2 + name: pserver + template: + metadata: + labels: + paddle-job-pserver: fluid-ctr + spec: + imagePullSecrets: + - name: default-secret + volumes: + - hostPath: + path: /home/work/ + type: "" + name: seqdata + containers: + - image: volcanosh/edlctr:v1 + command: + - paddle_k8s + - start_fluid + imagePullPolicy: IfNotPresent + name: pserver + volumeMounts: + - mountPath: /mnt/seqdata + name: seqdata + resources: + limits: + cpu: 10 + memory: 30Gi + ephemeral-storage: 10Gi + requests: + cpu: 1 + memory: 100M + ephemeral-storage: 1Gi + env: + - name: GLOG_v + value: "0" + - name: GLOG_logtostderr + value: "1" + - name: TOPOLOGY + value: "" + - name: TRAINER_PACKAGE + value: /workspace + - name: NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: POD_NAME + valueFrom: + 
fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: PADDLE_CURRENT_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: PADDLE_JOB_NAME + value: fluid-ctr + - name: PADDLE_IS_LOCAL + value: "0" + - name: PADDLE_TRAINERS_NUM + value: "2" + - name: PADDLE_PSERVERS_NUM + value: "2" + - name: FLAGS_rpc_deadline + value: "36000000" + - name: ENTRY + value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 + - name: PADDLE_PORT + value: "30236" + - name: LD_LIBRARY_PATH + value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind + - name: PADDLE_TRAINING_ROLE + value: PSERVER + - name: TRAINING_ROLE + value: PSERVER + restartPolicy: OnFailure + - replicas: 2 + policies: + - event: TaskCompleted + action: CompleteJob + name: trainer + template: + metadata: + labels: + paddle-job: fluid-ctr + spec: + imagePullSecrets: + - name: default-secret + volumes: + - hostPath: + path: /home/work/ + type: "" + name: seqdata + containers: + - image: volcanosh/edlctr:v1 + command: + - paddle_k8s + - start_fluid + imagePullPolicy: IfNotPresent + name: trainer + volumeMounts: + - mountPath: /mnt/seqdata + name: seqdata + resources: + limits: + cpu: 10 + memory: 30Gi + ephemeral-storage: 10Gi + requests: + cpu: 1 + memory: 100M + ephemeral-storage: 10Gi + env: + - name: GLOG_v + value: "0" + - name: GLOG_logtostderr + value: "1" + - name: TOPOLOGY + - name: TRAINER_PACKAGE + value: /workspace + - name: CPU_NUM + value: "2" + - name: NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: PADDLE_CURRENT_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: PADDLE_JOB_NAME + value: fluid-ctr + - name: PADDLE_IS_LOCAL + value: "0" + - name: FLAGS_rpc_deadline + 
value: "36000000" + - name: PADDLE_PORT + value: "30236" + - name: PADDLE_PSERVERS_NUM + value: "2" + - name: PADDLE_TRAINERS_NUM + value: "2" + - name: PADDLE_TRAINING_ROLE + value: TRAINER + - name: TRAINING_ROLE + value: TRAINER + - name: LD_LIBRARY_PATH + value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind + - name: ENTRY + value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 + restartPolicy: OnFailure + +``` + +Deployment of job. + +``` +kubectl apply -f ctr-volcano.yaml +``` + +Query if the job is running properly. + +``` +kubectl get pods | grep ctr-volcano +``` + +You can select a PServer task to view the log. + +``` +kubectl logs ctr-volcano-pserver-0 +``` + +Select a Tariner task to view the log. + +``` +kubectl logs ctr-volcano-trainer-0 +``` + ### Uninstall + Simply ```shell $ kubectl delete -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/main/deploy/v1/crd.yaml -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/main/deploy/v1/operator.yaml From cdd171642979ab1f771eabde6d69d5aeb2ff52d4 Mon Sep 17 00:00:00 2001 From: renhuanyu <43604335+renhuanyu@users.noreply.github.com> Date: Tue, 29 Jun 2021 09:20:43 +0800 Subject: [PATCH 2/3] Update README.md Add demo about paddlepaddle on volcano. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c1e0fb1..aaaf83e 100644 --- a/README.md +++ b/README.md @@ -64,9 +64,9 @@ $ kubectl -n paddle-system get pdj ### Run demo paddlejob on volcano -Volcano is a batch scheduling system based on kubernetes. Using it for task scheduling results in better performance.Volcano installation, you can refer to [this website](https://volcano.sh/en/docs/installation/). +Volcano is a batch scheduling system based on kubernetes. Using it for jobs scheduling results in better performance.Volcano installation, you can refer to [this website](https://volcano.sh/en/docs/installation/). 
-Edit ctr-Volcano. yaml in the node. +Edit `ctr-Volcano. yaml` in the node. ``` apiVersion: batch.volcano.sh/v1alpha1 From b657ef6a823ea4ca14bce75858bf0b78c46b5662 Mon Sep 17 00:00:00 2001 From: renhuanyu <43604335+renhuanyu@users.noreply.github.com> Date: Tue, 29 Jun 2021 20:59:59 +0800 Subject: [PATCH 3/3] Add an example about paddlepaddle on volcano. Add an example about paddlepaddle on volcano. --- README.md | 217 -------------------- deploy/examples/paddlepaddle_on_volcano.md | 220 +++++++++++++++++++++ 2 files changed, 220 insertions(+), 217 deletions(-) create mode 100644 deploy/examples/paddlepaddle_on_volcano.md diff --git a/README.md b/README.md index aaaf83e..dd49771 100644 --- a/README.md +++ b/README.md @@ -62,223 +62,6 @@ Check paddle job status $ kubectl -n paddle-system get pdj ``` -### Run demo paddlejob on volcano - -Volcano is a batch scheduling system based on kubernetes. Using it for jobs scheduling results in better performance.Volcano installation, you can refer to [this website](https://volcano.sh/en/docs/installation/). - -Edit `ctr-Volcano. yaml` in the node. 
- -``` -apiVersion: batch.volcano.sh/v1alpha1 -kind: Job -metadata: - name: ctr-volcano -spec: - minAvailable: 4 - schedulerName: volcano - policies: - - event: PodEvicted - action: RestartJob - - event: PodFailed - action: RestartJob - tasks: - - replicas: 2 - name: pserver - template: - metadata: - labels: - paddle-job-pserver: fluid-ctr - spec: - imagePullSecrets: - - name: default-secret - volumes: - - hostPath: - path: /home/work/ - type: "" - name: seqdata - containers: - - image: volcanosh/edlctr:v1 - command: - - paddle_k8s - - start_fluid - imagePullPolicy: IfNotPresent - name: pserver - volumeMounts: - - mountPath: /mnt/seqdata - name: seqdata - resources: - limits: - cpu: 10 - memory: 30Gi - ephemeral-storage: 10Gi - requests: - cpu: 1 - memory: 100M - ephemeral-storage: 1Gi - env: - - name: GLOG_v - value: "0" - - name: GLOG_logtostderr - value: "1" - - name: TOPOLOGY - value: "" - - name: TRAINER_PACKAGE - value: /workspace - - name: NAMESPACE - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - - name: POD_IP - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: PADDLE_CURRENT_IP - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: PADDLE_JOB_NAME - value: fluid-ctr - - name: PADDLE_IS_LOCAL - value: "0" - - name: PADDLE_TRAINERS_NUM - value: "2" - - name: PADDLE_PSERVERS_NUM - value: "2" - - name: FLAGS_rpc_deadline - value: "36000000" - - name: ENTRY - value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 - - name: PADDLE_PORT - value: "30236" - - name: LD_LIBRARY_PATH - value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind - - name: PADDLE_TRAINING_ROLE - value: PSERVER - - name: TRAINING_ROLE - value: PSERVER - restartPolicy: OnFailure - - replicas: 2 - policies: - - event: TaskCompleted - action: CompleteJob - name: trainer - 
template: - metadata: - labels: - paddle-job: fluid-ctr - spec: - imagePullSecrets: - - name: default-secret - volumes: - - hostPath: - path: /home/work/ - type: "" - name: seqdata - containers: - - image: volcanosh/edlctr:v1 - command: - - paddle_k8s - - start_fluid - imagePullPolicy: IfNotPresent - name: trainer - volumeMounts: - - mountPath: /mnt/seqdata - name: seqdata - resources: - limits: - cpu: 10 - memory: 30Gi - ephemeral-storage: 10Gi - requests: - cpu: 1 - memory: 100M - ephemeral-storage: 10Gi - env: - - name: GLOG_v - value: "0" - - name: GLOG_logtostderr - value: "1" - - name: TOPOLOGY - - name: TRAINER_PACKAGE - value: /workspace - - name: CPU_NUM - value: "2" - - name: NAMESPACE - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - - name: POD_IP - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: PADDLE_CURRENT_IP - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: PADDLE_JOB_NAME - value: fluid-ctr - - name: PADDLE_IS_LOCAL - value: "0" - - name: FLAGS_rpc_deadline - value: "36000000" - - name: PADDLE_PORT - value: "30236" - - name: PADDLE_PSERVERS_NUM - value: "2" - - name: PADDLE_TRAINERS_NUM - value: "2" - - name: PADDLE_TRAINING_ROLE - value: TRAINER - - name: TRAINING_ROLE - value: TRAINER - - name: LD_LIBRARY_PATH - value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind - - name: ENTRY - value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 - restartPolicy: OnFailure - -``` - -Deployment of job. - -``` -kubectl apply -f ctr-volcano.yaml -``` - -Query if the job is running properly. - -``` -kubectl get pods | grep ctr-volcano -``` - -You can select a PServer task to view the log. - -``` -kubectl logs ctr-volcano-pserver-0 -``` - -Select a Tariner task to view the log. 
- -``` -kubectl logs ctr-volcano-trainer-0 -``` - ### Uninstall Simply diff --git a/deploy/examples/paddlepaddle_on_volcano.md b/deploy/examples/paddlepaddle_on_volcano.md new file mode 100644 index 0000000..7addbe2 --- /dev/null +++ b/deploy/examples/paddlepaddle_on_volcano.md @@ -0,0 +1,220 @@ +### Run demo paddlejob on volcano + +Volcano is a batch scheduling system based on Kubernetes. Using it for job scheduling can result in better performance. To install Volcano, refer to [the Volcano installation guide](https://volcano.sh/en/docs/installation/). + +Edit `ctr-volcano.yaml` on the node. + +``` +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + name: ctr-volcano +spec: + minAvailable: 4 + schedulerName: volcano + policies: + - event: PodEvicted + action: RestartJob + - event: PodFailed + action: RestartJob + tasks: + - replicas: 2 + name: pserver + template: + metadata: + labels: + paddle-job-pserver: fluid-ctr + spec: + imagePullSecrets: + - name: default-secret + volumes: + - hostPath: + path: /home/work/ + type: "" + name: seqdata + containers: + - image: volcanosh/edlctr:v1 + command: + - paddle_k8s + - start_fluid + imagePullPolicy: IfNotPresent + name: pserver + volumeMounts: + - mountPath: /mnt/seqdata + name: seqdata + resources: + limits: + cpu: 10 + memory: 30Gi + ephemeral-storage: 10Gi + requests: + cpu: 1 + memory: 100M + ephemeral-storage: 1Gi + env: + - name: GLOG_v + value: "0" + - name: GLOG_logtostderr + value: "1" + - name: TOPOLOGY + value: "" + - name: TRAINER_PACKAGE + value: /workspace + - name: NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: PADDLE_CURRENT_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: PADDLE_JOB_NAME + value: fluid-ctr + - name: PADDLE_IS_LOCAL + value: "0" + - name: 
PADDLE_TRAINERS_NUM + value: "2" + - name: PADDLE_PSERVERS_NUM + value: "2" + - name: FLAGS_rpc_deadline + value: "36000000" + - name: ENTRY + value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 + - name: PADDLE_PORT + value: "30236" + - name: LD_LIBRARY_PATH + value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind + - name: PADDLE_TRAINING_ROLE + value: PSERVER + - name: TRAINING_ROLE + value: PSERVER + restartPolicy: OnFailure + - replicas: 2 + policies: + - event: TaskCompleted + action: CompleteJob + name: trainer + template: + metadata: + labels: + paddle-job: fluid-ctr + spec: + imagePullSecrets: + - name: default-secret + volumes: + - hostPath: + path: /home/work/ + type: "" + name: seqdata + containers: + - image: volcanosh/edlctr:v1 + command: + - paddle_k8s + - start_fluid + imagePullPolicy: IfNotPresent + name: trainer + volumeMounts: + - mountPath: /mnt/seqdata + name: seqdata + resources: + limits: + cpu: 10 + memory: 30Gi + ephemeral-storage: 10Gi + requests: + cpu: 1 + memory: 100M + ephemeral-storage: 10Gi + env: + - name: GLOG_v + value: "0" + - name: GLOG_logtostderr + value: "1" + - name: TOPOLOGY + - name: TRAINER_PACKAGE + value: /workspace + - name: CPU_NUM + value: "2" + - name: NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: PADDLE_CURRENT_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: PADDLE_JOB_NAME + value: fluid-ctr + - name: PADDLE_IS_LOCAL + value: "0" + - name: FLAGS_rpc_deadline + value: "36000000" + - name: PADDLE_PORT + value: "30236" + - name: PADDLE_PSERVERS_NUM + value: "2" + - name: PADDLE_TRAINERS_NUM + value: "2" + - name: PADDLE_TRAINING_ROLE + value: TRAINER + - name: TRAINING_ROLE + value: TRAINER + - name: 
LD_LIBRARY_PATH + value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind + - name: ENTRY + value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 + restartPolicy: OnFailure + +``` + +Deploy the job. + +``` +kubectl apply -f ctr-volcano.yaml +``` + +Check whether the job is running properly. If the PodGroup cannot meet the scheduling conditions, check that the cluster has sufficient resources available. + +``` +kubectl get podgroup +kubectl describe podgroup ctr-volcano +kubectl get pods | grep ctr-volcano +``` + +You can select a PServer task to view the log. + +``` +kubectl logs ctr-volcano-pserver-0 +``` + +Select a Trainer task to view the log. + +``` +kubectl logs ctr-volcano-trainer-0 +``` + +###