From c0d0e929ad0e6ba461a6be165975d52297d63116 Mon Sep 17 00:00:00 2001
From: jdp8
Date: Sat, 30 Dec 2023 15:49:52 -0400
Subject: [PATCH] Add Pose Estimation & Semantic Segmentation

---
 examples/react/src/App.tsx                    | 148 ++++++++++++---
 .../react/src/annotator_helper_functions.ts   | 171 ++++++++++++++++++
 2 files changed, 295 insertions(+), 24 deletions(-)
 create mode 100644 examples/react/src/annotator_helper_functions.ts

diff --git a/examples/react/src/App.tsx b/examples/react/src/App.tsx
index 0b316e1..de6a477 100644
--- a/examples/react/src/App.tsx
+++ b/examples/react/src/App.tsx
@@ -24,6 +24,7 @@ import { FAQ } from './components/FAQ'
 import { Tensor } from '@xenova/transformers'
 import cv from '@techstark/opencv-js'
 import { StableDiffusionControlNetPipeline } from '../../../dist/pipelines/StableDiffusionControlNetPipeline';
+import { getBlobFromImage, generateColors, segArgmax, posePostProcess, loadAnnotatorFile } from './annotator_helper_functions'
 
 const darkTheme = createTheme({
   palette: {
@@ -39,6 +40,7 @@ interface SelectedPipeline {
   steps: number
   hasImg2Img: boolean
   hasControlNet: boolean
+  controlnet?: string
 }
 
 const pipelines = [
@@ -91,7 +93,32 @@ const pipelines = [
     height: 512,
     steps: 20,
     hasImg2Img: true,
-    hasControlNet: true
+    hasControlNet: true,
+    controlnet: 'canny'
+  },
+  {
+    name: 'StableDiffusion 1.5 Base FP16 Semantic Segmentation (2.9GB)',
+    repo: 'jdp8/stable-diffusion-1-5-seg-v11p-onnx',
+    revision: 'main',
+    fp16: true,
+    width: 512,
+    height: 512,
+    steps: 20,
+    hasImg2Img: true,
+    hasControlNet: true,
+    controlnet: 'semantic_segmentation'
+  },
+  {
+    name: 'StableDiffusion 1.5 Base FP16 OpenPose (2.9GB)',
+    repo: 'jdp8/stable-diffusion-1-5-openpose-v11p-onnx',
+    revision: 'main',
+    fp16: true,
+    width: 512,
+    height: 512,
+    steps: 20,
+    hasImg2Img: true,
+    hasControlNet: true,
+    controlnet: 'openpose'
   },
 ]
@@ -110,6 +137,8 @@ function App() {
   const [inputImage, setInputImage] = useState<Float32Array>();
   const [strength, setStrength] = useState(0.8);
   const [controlNetImage, setControlNetImage] = useState<Float32Array>();
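+  // Annotator network files are registered in OpenCV.js' in-memory filesystem
+  // by loadAnnotatorFile(); these hold the resulting file names (OpenPose also
+  // needs a .prototxt config alongside the .caffemodel weights).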
+  const [annotator_model, setAnnotatorModel] = useState('');
+  const [annotator_config, setAnnotatorConfig] = useState('');
   const [runVaeOnEachStep, setRunVaeOnEachStep] = useState(false);
   useEffect(() => {
     setModelCacheDir('models')
@@ -201,7 +230,7 @@
    * @param type Pipeline of the input image.
    * @returns void
    */
-  function uploadImage(e: any, type: 'controlnet'|'img2img') {
+  function uploadImage(e: any, type: 'controlnet'|'img2img', controlnet='canny') {
     if(!e.target.files[0]) {
       // No image uploaded
       return;
     }
@@ -225,29 +254,70 @@
         setInputImage(rgb_array);
       }
       else if(type == 'controlnet') {
-        // For now only Canny Edge Detection is available
         const cvImg = cv.imread(uploadedImage); // RGBA Image | 4 Channels
-        const imgGray = new cv.Mat();
-        cv.cvtColor(cvImg, imgGray, cv.COLOR_RGBA2GRAY); // Gray Image | 1 Channel
-        const imgCanny = new cv.Mat();
-        cv.Canny(imgGray, imgCanny, 100, 200, 3, false); // Canny Image | 1 Channel
-        const rgbaCanny = new cv.Mat();
-        cv.cvtColor(imgCanny, rgbaCanny, cv.COLOR_GRAY2RGBA, 0); // RGBA Canny Image | 4 Channels
-        /**
-         * The canny data can be accessed as so:
-         * cannyEdges.data -> UInt8Array
-         * cannyEdges.data8S -> Int8Array
-         * cannyEdges.data16S -> Int16Array
-         * cannyEdges.data16U -> UInt16Array
-         * cannyEdges.data32F -> Float32Array
-         * cannyEdges.data32S -> Int32Array
-         * cannyEdges.data64F -> Float64Array
-         */
+        if(controlnet == 'semantic_segmentation') {
+          const inputSize = [513, 513];
+          const mean = [127.5, 127.5, 127.5];
+          const std = 0.007843;
+          const swapRB = false;
+
+          const input = getBlobFromImage(inputSize, mean, std, swapRB, cvImg);
+          const net = cv.readNet(annotator_model);
+          net.setInput(input);
+          const result = net.forward();
+          const colors = generateColors(result);
+          const output = segArgmax(result, colors);
+          const resizedOutput = new cv.Mat();
+          const dsize = new cv.Size(512, 512);
+          cv.resize(output, resizedOutput, dsize, 0, 0, cv.INTER_AREA);
+          const rgbSem = getRgbData(Uint8ClampedArray.from(resizedOutput.data), false);
+          setControlNetImage(rgbSem);
+          cvImg.delete();input.delete();net.delete();result.delete();resizedOutput.delete();
+        }
+        else if(controlnet == 'openpose') {
+          // inputSize can be changed; the original is [368, 368]. The larger the size, the slower the annotation, and vice versa.
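+          // Preprocessing constants for cv.blobFromImage: mean [0, 0, 0] means no
+          // mean subtraction, and std = 0.00392 ≈ 1/255 rescales pixel values from
+          // [0, 255] to [0, 1].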
+          const inputSize = [125, 125];
+          const mean = [0, 0, 0];
+          const std = 0.00392;
+          const swapRB = false;
+          const threshold = 0.1;
+
+          // Selects the keypoint set; can be "COCO", "MPI" or "BODY_25"
+          let dataset = '';
-
+          if(annotator_model == 'pose_iter_584000.caffemodel') {
+            dataset = 'BODY_25'
+          }
+          else if(annotator_model == 'pose_iter_440000.caffemodel') {
+            dataset = 'COCO'
+          }
+          else if (annotator_model == 'pose_iter_160000.caffemodel') {
+            dataset = 'MPI'
+          }
+
+          const input = getBlobFromImage(inputSize, mean, std, swapRB, cvImg);
+          let net = cv.readNet(annotator_model, annotator_config);
+          net.setInput(input);
+          const result = net.forward();
+          const output = posePostProcess(result, dataset, threshold, 512, 512);
+          const rgbaPose = new cv.Mat();
+          cv.cvtColor(output, rgbaPose, cv.COLOR_RGB2RGBA, 0); // RGBA Pose Image | 4 Channels
+          const rgbPose = getRgbData(Uint8ClampedArray.from(rgbaPose.data), false);
+          setControlNetImage(rgbPose);
+          cvImg.delete();input.delete();net.delete();result.delete();rgbaPose.delete();
+        }
+        else if(controlnet == 'canny') {
+          const imgGray = new cv.Mat();
+          cv.cvtColor(cvImg, imgGray, cv.COLOR_RGBA2GRAY); // Gray Image | 1 Channel
+          const imgCanny = new cv.Mat();
+          cv.Canny(imgGray, imgCanny, 100, 200, 3, false); // Canny Image | 1 Channel
+          const rgbaCanny = new cv.Mat();
+          cv.cvtColor(imgCanny, rgbaCanny, cv.COLOR_GRAY2RGBA, 0); // RGBA Canny Image | 4 Channels
+          const rgbCanny = getRgbData(Uint8ClampedArray.from(rgbaCanny.data), false);
+          setControlNetImage(rgbCanny);
+          cvImg.delete();imgGray.delete();imgCanny.delete();rgbaCanny.delete();
+        }
       }
     });
     uploadedImage.src = file.target.result;
@@ -333,13 +403,43 @@
           {selectedPipeline?.hasControlNet && (
             <>
+              <input
+                type="file"
+                onChange={async (e) => {
+                  const fileName = await loadAnnotatorFile(e)
+                  //@ts-ignore
+                  setAnnotatorModel(fileName)
+                }}
+              />
+
+              {selectedPipeline?.controlnet == 'openpose' &&
+                (
+                  <>
+                    <input
+                      type="file"
+                      onChange={async (e) => {
+                        const fileName = await loadAnnotatorFile(e)
+                        //@ts-ignore
+                        setAnnotatorConfig(fileName)
+                      }}
+                    />
+                  </>
+                )}
+
               <input
                 type="file"
-                onChange={(e) => uploadImage(e, "controlnet")}
+                onChange={(e) => uploadImage(e, "controlnet", selectedPipeline.controlnet)}
               />
             </>
           )}
diff --git a/examples/react/src/annotator_helper_functions.ts b/examples/react/src/annotator_helper_functions.ts
new file mode 100644
index 0000000..ae329f2
--- /dev/null
+++ b/examples/react/src/annotator_helper_functions.ts
@@ -0,0 +1,171 @@
+import cv from '@techstark/opencv-js'
+
+export const getBlobFromImage = function(inputSize: Array<number>, mean: Array<number>, std: number, swapRB: boolean, cvImg: any) {
+  const matC3 = new cv.Mat(cvImg.matSize[0], cvImg.matSize[1], cv.CV_8UC3);
+  cv.cvtColor(cvImg, matC3, cv.COLOR_RGBA2BGR);
+  // std is passed to cv.blobFromImage as its scalefactor; mean is subtracted per channel
+  const input = cv.blobFromImage(matC3, std, new cv.Size(inputSize[0], inputSize[1]), new cv.Scalar(mean[0], mean[1], mean[2]), swapRB);
+
+  matC3.delete();
+  return input;
+}
+
+export const loadAnnotatorFile = async (e: any) => {
+  if(!e.target.files[0]) {
+    return;
+  }
+
+  return new Promise((resolve) => {
+    let file = e.target.files[0];
+    let path = file.name;
+    let reader = new FileReader();
+    reader.readAsArrayBuffer(file);
+    reader.onload = function(ev) {
+      if(reader.readyState === 2) {
+        let buffer: any = reader.result;
+        let data = new Uint8Array(buffer);
+        cv.FS_createDataFile('/', path, data, true, false, false);
+        resolve(path);
+      }
+    }
+  });
+}
+
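+// Builds a flat [r, g, b, r, g, b, ...] palette with one color per class
+// channel of the segmentation output (result.matSize[1] classes). Class 0
+// stays black; each new channel value is averaged with the value three entries
+// back, so consecutive class colors stay related but distinguishable.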
+export const generateColors = function(result: any) {
+  const numClasses = result.matSize[1];
+  let colors = [0, 0, 0];
+  while(colors.length < numClasses * 3) {
+    colors.push(Math.round((Math.random() * 255 + colors[colors.length - 3]) / 2));
+  }
+  return colors;
+}
+
+export const segArgmax = function(result: any, colors: Array<number>) {
+  const C = result.matSize[1];
+  const H = result.matSize[2];
+  const W = result.matSize[3];
+  const resultData = result.data32F;
+  const imgSize = H*W;
+
+  let classId = [];
+  let i, j;
+  for(i = 0; i < imgSize; ++i) {
+    let id = 0;
+    for(j = 0; j < C; ++j) {
+      if(resultData[j*imgSize+i] > resultData[id*imgSize+i]) {
+        id = j;
+      }
+    }
+    classId.push(colors[id*3]);
+    classId.push(colors[id*3+1]);
+    classId.push(colors[id*3+2]);
+    classId.push(255);
+  }
+
+  const output = cv.matFromArray(H, W, cv.CV_8UC4, classId);
+  return output;
+}
+
+export const posePostProcess = function(result: any, dataset: string, threshold: number, outputWidth: number, outputHeight: number) {
+  const resultData = result.data32F;
+  const matSize = result.matSize;
+  // const size1 = matSize[1];
+  const size2 = matSize[2];
+  const size3 = matSize[3];
+  const mapSize = size2 * size3;
+
+  let output = cv.Mat.zeros(outputHeight, outputWidth, cv.CV_8UC3); // rows = height, cols = width
+
+  let BODY_PARTS: any = {};
+  let POSE_PAIRS: any = [];
+
+  if(dataset === 'COCO') {
+    BODY_PARTS = { "Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
+                   "LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
+                   "RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "REye": 14,
+                   "LEye": 15, "REar": 16, "LEar": 17, "Background": 18 };
+
+    POSE_PAIRS = [ ["Neck", "RShoulder"], ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
+                   ["RElbow", "RWrist"], ["LShoulder", "LElbow"], ["LElbow", "LWrist"],
+                   ["Neck", "RHip"], ["RHip", "RKnee"], ["RKnee", "RAnkle"], ["Neck", "LHip"],
+                   ["LHip", "LKnee"], ["LKnee", "LAnkle"], ["Neck", "Nose"], ["Nose", "REye"],
+                   ["REye", "REar"], ["Nose", "LEye"], ["LEye", "LEar"] ]
+  }
+  else if (dataset === 'MPI') {
+    BODY_PARTS = { "Head": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
+                   "LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
+                   "RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "Chest": 14,
+                   "Background": 15 }
+
+    POSE_PAIRS = [ ["Head", "Neck"], ["Neck", "RShoulder"], ["RShoulder", "RElbow"],
+                   ["RElbow", "RWrist"], ["Neck", "LShoulder"], ["LShoulder", "LElbow"],
+                   ["LElbow", "LWrist"], ["Neck", "Chest"], ["Chest", "RHip"], ["RHip", "RKnee"],
+                   ["RKnee", "RAnkle"], ["Chest", "LHip"], ["LHip", "LKnee"], ["LKnee", "LAnkle"] ]
+  }
+  else if (dataset === 'BODY_25') {
+    BODY_PARTS = { "Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
+                   "LShoulder": 5, "LElbow": 6, "LWrist": 7, "MidHip": 8, "RHip": 9,
+                   "RKnee": 10, "RAnkle": 11, "LHip": 12, "LKnee": 13, "LAnkle": 14,
+                   "REye": 15, "LEye": 16, "REar": 17, "LEar": 18, "LBigToe": 19,
+                   "LSmallToe": 20, "LHeel": 21, "RBigToe": 22, "RSmallToe": 23,
+                   "RHeel": 24, "Background": 25 }
+
+    POSE_PAIRS = [ ["Neck", "Nose"], ["Neck", "RShoulder"],
+                   ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
+                   ["RElbow", "RWrist"], ["LShoulder", "LElbow"],
+                   ["LElbow", "LWrist"], ["Nose", "REye"],
+                   ["REye", "REar"], ["Nose", "LEye"],
+                   ["LEye", "LEar"], ["Neck", "MidHip"],
+                   ["MidHip", "RHip"], ["RHip", "RKnee"],
+                   ["RKnee", "RAnkle"], ["RAnkle", "RBigToe"],
+                   ["RBigToe", "RSmallToe"], ["RAnkle", "RHeel"],
+                   ["MidHip", "LHip"], ["LHip", "LKnee"],
+                   ["LKnee", "LAnkle"], ["LAnkle", "LBigToe"],
+                   ["LBigToe", "LSmallToe"], ["LAnkle", "LHeel"] ]
+  }
+
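+  // Each of the first N channels of the network output is a heatmap for one
+  // body part, where N is the number of BODY_PARTS entries. For every part,
+  // the strongest activation is kept only if its confidence clears the
+  // threshold; parts left undefined below are skipped when drawing.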
+  // Get the position of the keypoints from the output
+  let points: Array<Array<number>> = [];
+  let i;
+  for(i = 0; i < Object.keys(BODY_PARTS).length; ++i) {
+    let heatMap = resultData.slice(i * mapSize, (i+1) * mapSize);
+    let maxIndex = 0;
+    let maxConf = heatMap[0];
+    // Plain index loop; for...in over a typed array yields string keys,
+    // which breaks the arithmetic on maxIndex below
+    for(let index = 1; index < heatMap.length; ++index) {
+      if(heatMap[index] > maxConf) {
+        maxIndex = index;
+        maxConf = heatMap[index];
+      }
+    }
+
+    if(maxConf > threshold) {
+      const indexX = maxIndex % size3;
+      const indexY = Math.floor(maxIndex / size3);
+
+      const x = outputWidth * indexX / size3;
+      const y = outputHeight * indexY / size2;
+
+      points[i] = [Math.round(x), Math.round(y)];
+    }
+  }
+
+  // Draw the points and lines into the image
+  for(const pair of POSE_PAIRS) {
+    const partFrom = pair[0];
+    const partTo = pair[1];
+    const idFrom = BODY_PARTS[partFrom];
+    const idTo = BODY_PARTS[partTo];
+    const pointFrom = points[idFrom];
+    const pointTo = points[idTo];
+
+    if(pointFrom && pointTo) {
+      cv.line(output, new cv.Point(pointFrom[0], pointFrom[1]),
+              new cv.Point(pointTo[0], pointTo[1]), new cv.Scalar(0, 255, 0), 3);
+      cv.ellipse(output, new cv.Point(pointFrom[0], pointFrom[1]), new cv.Size(3, 3), 0, 0, 360,
+                 new cv.Scalar(0, 0, 255), cv.FILLED);
+      cv.ellipse(output, new cv.Point(pointTo[0], pointTo[1]), new cv.Size(3, 3), 0, 0, 360,
+                 new cv.Scalar(0, 0, 255), cv.FILLED);
+    }
+  }
+  return output;
+}
\ No newline at end of file