From c0d0e929ad0e6ba461a6be165975d52297d63116 Mon Sep 17 00:00:00 2001
From: jdp8
Date: Sat, 30 Dec 2023 15:49:52 -0400
Subject: [PATCH] Add Pose Estimation & Semantic Segmentation

---
 examples/react/src/App.tsx                    | 148 ++++++++++++---
 .../react/src/annotator_helper_functions.ts   | 171 ++++++++++++++++++
 2 files changed, 295 insertions(+), 24 deletions(-)
 create mode 100644 examples/react/src/annotator_helper_functions.ts

diff --git a/examples/react/src/App.tsx b/examples/react/src/App.tsx
index 0b316e1..de6a477 100644
--- a/examples/react/src/App.tsx
+++ b/examples/react/src/App.tsx
@@ -24,6 +24,7 @@ import { FAQ } from './components/FAQ'
 import { Tensor } from '@xenova/transformers'
 import cv from '@techstark/opencv-js'
 import { StableDiffusionControlNetPipeline } from '../../../dist/pipelines/StableDiffusionControlNetPipeline';
+import { getBlobFromImage, generateColors, segArgmax, posePostProcess, loadAnnotatorFile } from './annotator_helper_functions'
 
 const darkTheme = createTheme({
   palette: {
@@ -39,6 +40,7 @@ interface SelectedPipeline {
   steps: number
   hasImg2Img: boolean
   hasControlNet: boolean
+  controlnet?: string
 }
 
 const pipelines = [
@@ -91,7 +93,32 @@ const pipelines = [
     height: 512,
     steps: 20,
     hasImg2Img: true,
-    hasControlNet: true
+    hasControlNet: true,
+    controlnet: 'canny'
+  },
+  {
+    name: 'StableDiffusion 1.5 Base FP16 Semantic Segmentation (2.9GB)',
+    repo: 'jdp8/stable-diffusion-1-5-seg-v11p-onnx',
+    revision: 'main',
+    fp16: true,
+    width: 512,
+    height: 512,
+    steps: 20,
+    hasImg2Img: true,
+    hasControlNet: true,
+    controlnet: 'semantic_segmentation'
+  },
+  {
+    name: 'StableDiffusion 1.5 Base FP16 OpenPose (2.9GB)',
+    repo: 'jdp8/stable-diffusion-1-5-openpose-v11p-onnx',
+    revision: 'main',
+    fp16: true,
+    width: 512,
+    height: 512,
+    steps: 20,
+    hasImg2Img: true,
+    hasControlNet: true,
+    controlnet: 'openpose'
   },
 ]
@@ -110,6 +137,8 @@ function App() {
   const [inputImage, setInputImage] = useState<Float32Array>();
   const [strength, setStrength] = useState(0.8);
   const [controlNetImage, setControlNetImage] = useState<Float32Array>();
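+  // Annotator network files are registered in OpenCV.js' in-memory filesystem
+  // by loadAnnotatorFile(); these hold the resulting file names (OpenPose also
+  // needs a .prototxt config alongside the .caffemodel weights).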
+  const [annotator_model, setAnnotatorModel] = useState('');
+  const [annotator_config, setAnnotatorConfig] = useState('');
   const [runVaeOnEachStep, setRunVaeOnEachStep] = useState(false);
   useEffect(() => {
     setModelCacheDir('models')
@@ -201,7 +230,7 @@
    * @param type Pipeline of the input image.
    * @returns void
    */
-  function uploadImage(e: any, type: 'controlnet'|'img2img') {
+  function uploadImage(e: any, type: 'controlnet'|'img2img', controlnet='canny') {
     if(!e.target.files[0]) {
       // No image uploaded
       return;
     }
@@ -225,29 +254,70 @@
         setInputImage(rgb_array);
       }
       else if(type == 'controlnet') {
-        // For now only Canny Edge Detection is available
         const cvImg = cv.imread(uploadedImage); // RGBA Image | 4 Channels
-        const imgGray = new cv.Mat();
-        cv.cvtColor(cvImg, imgGray, cv.COLOR_RGBA2GRAY); // Gray Image | 1 Channel
-        const imgCanny = new cv.Mat();
-        cv.Canny(imgGray, imgCanny, 100, 200, 3, false); // Canny Image | 1 Channel
-        const rgbaCanny = new cv.Mat();
-        cv.cvtColor(imgCanny, rgbaCanny, cv.COLOR_GRAY2RGBA, 0); // RGBA Canny Image | 4 Channels
-        /**
-         * The canny data can be accessed as so:
-         * cannyEdges.data -> UInt8Array
-         * cannyEdges.data8S -> Int8Array
-         * cannyEdges.data16S -> Int16Array
-         * cannyEdges.data16U -> UInt16Array
-         * cannyEdges.data32F -> Float32Array
-         * cannyEdges.data32S -> Int32Array
-         * cannyEdges.data64F -> Float64Array
-         */
+        if(controlnet == 'semantic_segmentation') {
+          const inputSize = [513, 513];
+          const mean = [127.5, 127.5, 127.5];
+          const std = 0.007843;
+          const swapRB = false;
+
+          const input = getBlobFromImage(inputSize, mean, std, swapRB, cvImg);
+          const net = cv.readNet(annotator_model);
+          net.setInput(input);
+          const result = net.forward();
+          const colors = generateColors(result);
+          const output = segArgmax(result, colors);
+          const resizedOutput = new cv.Mat();
+          const dsize = new cv.Size(512, 512);
+          cv.resize(output, resizedOutput, dsize, 0, 0, cv.INTER_AREA);
+          const rgbSem = getRgbData(Uint8ClampedArray.from(resizedOutput.data), false);
+          setControlNetImage(rgbSem);
+          cvImg.delete();input.delete();net.delete();result.delete();resizedOutput.delete();
+        }
+        else if(controlnet == 'openpose') {
+          // inputSize can be changed; the original is [368, 368]. The larger the size, the slower the annotation, and vice versa.
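+          // Preprocessing constants for cv.blobFromImage: mean [0, 0, 0] means no
+          // mean subtraction, and std = 0.00392 ≈ 1/255 rescales pixel values from
+          // [0, 255] to [0, 1].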
+          const inputSize = [125, 125];
+          const mean = [0, 0, 0];
+          const std = 0.00392;
+          const swapRB = false;
+          const threshold = 0.1;
+
+          // Selects the keypoint set; can be "COCO", "MPI" or "BODY_25"
+          let dataset = '';
-
+          if(annotator_model == 'pose_iter_584000.caffemodel') {
+            dataset = 'BODY_25'
+          }
+          else if(annotator_model == 'pose_iter_440000.caffemodel') {
+            dataset = 'COCO'
+          }
+          else if (annotator_model == 'pose_iter_160000.caffemodel') {
+            dataset = 'MPI'
+          }
+
+          const input = getBlobFromImage(inputSize, mean, std, swapRB, cvImg);
+          let net = cv.readNet(annotator_model, annotator_config);
+          net.setInput(input);
+          const result = net.forward();
+          const output = posePostProcess(result, dataset, threshold, 512, 512);
+          const rgbaPose = new cv.Mat();
+          cv.cvtColor(output, rgbaPose, cv.COLOR_RGB2RGBA, 0); // RGBA Pose Image | 4 Channels
+          const rgbPose = getRgbData(Uint8ClampedArray.from(rgbaPose.data), false);
+          setControlNetImage(rgbPose);
+          cvImg.delete();input.delete();net.delete();result.delete();rgbaPose.delete();
+        }
+        else if(controlnet == 'canny') {
+          const imgGray = new cv.Mat();
+          cv.cvtColor(cvImg, imgGray, cv.COLOR_RGBA2GRAY); // Gray Image | 1 Channel
+          const imgCanny = new cv.Mat();
+          cv.Canny(imgGray, imgCanny, 100, 200, 3, false); // Canny Image | 1 Channel
+          const rgbaCanny = new cv.Mat();
+          cv.cvtColor(imgCanny, rgbaCanny, cv.COLOR_GRAY2RGBA, 0); // RGBA Canny Image | 4 Channels
+          const rgbCanny = getRgbData(Uint8ClampedArray.from(rgbaCanny.data), false);
+          setControlNetImage(rgbCanny);
+          cvImg.delete();imgGray.delete();imgCanny.delete();rgbaCanny.delete();
+        }
       }
     });
     uploadedImage.src = file.target.result;
@@ -333,13 +403,43 @@
           {selectedPipeline?.hasControlNet && (
             <>
+              <input
+                type="file"
+                onChange={async (e) => {
+                  const fileName = await loadAnnotatorFile(e)
+                  //@ts-ignore
+                  setAnnotatorModel(fileName)
+                }}
+              />
+
+              {selectedPipeline?.controlnet == 'openpose' &&
+                (
+                  <>
+                    <input
+                      type="file"
+                      onChange={async (e) => {
+                        const fileName = await loadAnnotatorFile(e)
+                        //@ts-ignore
+                        setAnnotatorConfig(fileName)
+                      }}
+                    />
+                  </>
+                )}
+
               <input
                 type="file"
-                onChange={(e) => uploadImage(e, "controlnet")}
+                onChange={(e) => uploadImage(e, "controlnet", selectedPipeline.controlnet)}
               />
             </>
           )}
diff --git a/examples/react/src/annotator_helper_functions.ts b/examples/react/src/annotator_helper_functions.ts
new file mode 100644
index 0000000..ae329f2
--- /dev/null
+++ b/examples/react/src/annotator_helper_functions.ts
@@ -0,0 +1,171 @@
+import cv from '@techstark/opencv-js'
+
+export const getBlobFromImage = function(inputSize: Array<number>, mean: Array<number>, std: number, swapRB: boolean, cvImg: any) {
+  const matC3 = new cv.Mat(cvImg.matSize[0], cvImg.matSize[1], cv.CV_8UC3);
+  cv.cvtColor(cvImg, matC3, cv.COLOR_RGBA2BGR);
+  // std is passed to cv.blobFromImage as its scalefactor; mean is subtracted per channel
+  const input = cv.blobFromImage(matC3, std, new cv.Size(inputSize[0], inputSize[1]), new cv.Scalar(mean[0], mean[1], mean[2]), swapRB);
+
+  matC3.delete();
+  return input;
+}
+
+export const loadAnnotatorFile = async (e: any) => {
+  if(!e.target.files[0]) {
+    return;
+  }
+
+  return new Promise((resolve) => {
+    let file = e.target.files[0];
+    let path = file.name;
+    let reader = new FileReader();
+    reader.readAsArrayBuffer(file);
+    reader.onload = function(ev) {
+      if(reader.readyState === 2) {
+        let buffer: any = reader.result;
+        let data = new Uint8Array(buffer);
+        cv.FS_createDataFile('/', path, data, true, false, false);
+        resolve(path);
+      }
+    }
+  });
+}
+
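+// Builds a flat [r, g, b, r, g, b, ...] palette with one color per class
+// channel of the segmentation output (result.matSize[1] classes). Class 0
+// stays black; each new channel value is averaged with the value three entries
+// back, so consecutive class colors stay related but distinguishable.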
+export const generateColors = function(result: any) {
+  const numClasses = result.matSize[1];
+  let colors = [0, 0, 0];
+  while(colors.length < numClasses * 3) {
+    colors.push(Math.round((Math.random() * 255 + colors[colors.length - 3]) / 2));
+  }
+  return colors;
+}
+
+export const segArgmax = function(result: any, colors: Array<number>) {
+  const C = result.matSize[1];
+  const H = result.matSize[2];
+  const W = result.matSize[3];
+  const resultData = result.data32F;
+  const imgSize = H*W;
+
+  let classId = [];
+  let i, j;
+  for(i = 0; i < imgSize; ++i) {
+    let id = 0;
+    for(j = 0; j < C; ++j) {
+      if(resultData[j*imgSize+i] > resultData[id*imgSize+i]) {
+        id = j;
+      }
+    }
+    classId.push(colors[id*3]);
+    classId.push(colors[id*3+1]);
+    classId.push(colors[id*3+2]);
+    classId.push(255);
+  }
+
+  const output = cv.matFromArray(H, W, cv.CV_8UC4, classId);
+  return output;
+}
+
+export const posePostProcess = function(result: any, dataset: string, threshold: number, outputWidth: number, outputHeight: number) {
+  const resultData = result.data32F;
+  const matSize = result.matSize;
+  // const size1 = matSize[1];
+  const size2 = matSize[2];
+  const size3 = matSize[3];
+  const mapSize = size2 * size3;
+
+  let output = cv.Mat.zeros(outputHeight, outputWidth, cv.CV_8UC3); // rows = height, cols = width
+
+  let BODY_PARTS: any = {};
+  let POSE_PAIRS: any = [];
+
+  if(dataset === 'COCO') {
+    BODY_PARTS = { "Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
+                   "LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
+                   "RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "REye": 14,
+                   "LEye": 15, "REar": 16, "LEar": 17, "Background": 18 };
+
+    POSE_PAIRS = [ ["Neck", "RShoulder"], ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
+                   ["RElbow", "RWrist"], ["LShoulder", "LElbow"], ["LElbow", "LWrist"],
+                   ["Neck", "RHip"], ["RHip", "RKnee"], ["RKnee", "RAnkle"], ["Neck", "LHip"],
+                   ["LHip", "LKnee"], ["LKnee", "LAnkle"], ["Neck", "Nose"], ["Nose", "REye"],
+                   ["REye", "REar"], ["Nose", "LEye"], ["LEye", "LEar"] ]
+  }
+  else if (dataset === 'MPI') {
+    BODY_PARTS = { "Head": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
+                   "LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
+                   "RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "Chest": 14,
+                   "Background": 15 }
+
+    POSE_PAIRS = [ ["Head", "Neck"], ["Neck", "RShoulder"], ["RShoulder", "RElbow"],
+                   ["RElbow", "RWrist"], ["Neck", "LShoulder"], ["LShoulder", "LElbow"],
+                   ["LElbow", "LWrist"], ["Neck", "Chest"], ["Chest", "RHip"], ["RHip", "RKnee"],
+                   ["RKnee", "RAnkle"], ["Chest", "LHip"], ["LHip", "LKnee"], ["LKnee", "LAnkle"] ]
+  }
+  else if (dataset === 'BODY_25') {
+    BODY_PARTS = { "Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
+                   "LShoulder": 5, "LElbow": 6, "LWrist": 7, "MidHip": 8, "RHip": 9,
+                   "RKnee": 10, "RAnkle": 11, "LHip": 12, "LKnee": 13, "LAnkle": 14,
+                   "REye": 15, "LEye": 16, "REar": 17, "LEar": 18, "LBigToe": 19,
+                   "LSmallToe": 20, "LHeel": 21, "RBigToe": 22, "RSmallToe": 23,
+                   "RHeel": 24, "Background": 25 }
+
+    POSE_PAIRS = [ ["Neck", "Nose"], ["Neck", "RShoulder"],
+                   ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
+                   ["RElbow", "RWrist"], ["LShoulder", "LElbow"],
+                   ["LElbow", "LWrist"], ["Nose", "REye"],
+                   ["REye", "REar"], ["Nose", "LEye"],
+                   ["LEye", "LEar"], ["Neck", "MidHip"],
+                   ["MidHip", "RHip"], ["RHip", "RKnee"],
+                   ["RKnee", "RAnkle"], ["RAnkle", "RBigToe"],
+                   ["RBigToe", "RSmallToe"], ["RAnkle", "RHeel"],
+                   ["MidHip", "LHip"], ["LHip", "LKnee"],
+                   ["LKnee", "LAnkle"], ["LAnkle", "LBigToe"],
+                   ["LBigToe", "LSmallToe"], ["LAnkle", "LHeel"] ]
+  }
+
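+  // Each of the first N channels of the network output is a heatmap for one
+  // body part, where N is the number of BODY_PARTS entries. For every part,
+  // the strongest activation is kept only if its confidence clears the
+  // threshold; parts left undefined below are skipped when drawing.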
+  // Get the position of the keypoints from the output
+  let points: Array<Array<number>> = [];
+  let i;
+  for(i = 0; i < Object.keys(BODY_PARTS).length; ++i) {
+    let heatMap = resultData.slice(i * mapSize, (i+1) * mapSize);
+    let maxIndex = 0;
+    let maxConf = heatMap[0];
+    // Plain index loop; for...in over a typed array yields string keys,
+    // which breaks the arithmetic on maxIndex below
+    for(let index = 1; index < heatMap.length; ++index) {
+      if(heatMap[index] > maxConf) {
+        maxIndex = index;
+        maxConf = heatMap[index];
+      }
+    }
+
+    if(maxConf > threshold) {
+      const indexX = maxIndex % size3;
+      const indexY = Math.floor(maxIndex / size3);
+
+      const x = outputWidth * indexX / size3;
+      const y = outputHeight * indexY / size2;
+
+      points[i] = [Math.round(x), Math.round(y)];
+    }
+  }
+
+  // Draw the points and lines into the image
+  for(const pair of POSE_PAIRS) {
+    const partFrom = pair[0];
+    const partTo = pair[1];
+    const idFrom = BODY_PARTS[partFrom];
+    const idTo = BODY_PARTS[partTo];
+    const pointFrom = points[idFrom];
+    const pointTo = points[idTo];
+
+    if(pointFrom && pointTo) {
+      cv.line(output, new cv.Point(pointFrom[0], pointFrom[1]),
+              new cv.Point(pointTo[0], pointTo[1]), new cv.Scalar(0, 255, 0), 3);
+      cv.ellipse(output, new cv.Point(pointFrom[0], pointFrom[1]), new cv.Size(3, 3), 0, 0, 360,
+                 new cv.Scalar(0, 0, 255), cv.FILLED);
+      cv.ellipse(output, new cv.Point(pointTo[0], pointTo[1]), new cv.Size(3, 3), 0, 0, 360,
+                 new cv.Scalar(0, 0, 255), cv.FILLED);
+    }
+  }
+  return output;
+}
\ No newline at end of file