diff --git a/.gitignore b/.gitignore index dd10da52..0cc93f06 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ ipch/ *.cachefile *.VC.db *.VC.VC.opendb +guetzli.vcxproj.user +clguetzli/clguetzli.cu.ptx* diff --git a/.travis.sh b/.travis.sh index a30f38e5..905889ff 100755 --- a/.travis.sh +++ b/.travis.sh @@ -14,6 +14,7 @@ case "$1" in "bazel") case "${TRAVIS_OS_NAME}" in "linux") + sudo apt-get remove oracle-java9-installer wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel_0.4.5-linux-x86_64.deb echo 'b494d0a413e4703b6cd5312403bea4d92246d6425b3be68c9bfbeb8cc4db8a55 bazel_0.4.5-linux-x86_64.deb' | sha256sum -c --strict || exit 1 sudo dpkg -i bazel_0.4.5-linux-x86_64.deb diff --git a/.travis.yml b/.travis.yml index 39e1caaa..85db2b53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,8 @@ matrix: packages: - wget - libjpeg-progs + - netpbm + - oracle-java8-installer - os: osx env: BUILD_SYSTEM=bazel @@ -29,6 +31,7 @@ matrix: - libpng-dev - pkg-config - libjpeg-progs + - netpbm - os: osx env: BUILD_SYSTEM=make diff --git a/BUILD b/BUILD index 05bfc0da..c88d3890 100644 --- a/BUILD +++ b/BUILD @@ -8,6 +8,9 @@ cc_library( "guetzli/*.h", "guetzli/*.cc", "guetzli/*.inc", + "clguetzli/*.cpp", + "clguetzli/*.h", + "clguetzli/*.hpp" ], exclude = ["guetzli/guetzli.cc"], ), diff --git a/README.md b/README.md index 2ecd1072..37fa4267 100644 --- a/README.md +++ b/README.md @@ -99,3 +99,59 @@ attempts made. Please note that JPEG images do not support alpha channel (transparency). If the input is a PNG with an alpha channel, it will be overlaid on black background before encoding. + +# Extra features + +**Note:** Please make sure that you can build guetzli successfully before adding the following features. + +## Enable CUDA/OpenCL support + +**Note:** Before adding [CUDA](https://developer.nvidia.com/cuda-zone) support, please [check](http://developer.nvidia.com/cuda-gpus) whether your GPU support CUDA or not. 
+ +**Note:** If you don't have an NVIDIA card that support CUDA, you can try [OpenCL](https://www.khronos.org/opencl/) instead. You can install any of the OpenCL SDKs, such as [Intel OpenCL SDK](https://software.intel.com/en-us/intel-opencl), [AMD OpenCL SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/), etc. + +**Note:** The steps for adding OpenCL support is very similar with adding CUDA support, so the following introduction will be only for CUDA. + +### On POSIX systems +1. Follow the [Installation Guide for Linux ](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Linux-pdf) to setup [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). +2. Edit `premake5.lua`, add `defines { "__USE_OPENCL__", "__USE_CUDA__" }` and `links { "OpenCL", "cuda" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. +3. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` at first line. +4. Run `make` and expect the binary to be created in `bin/Release/guetzli`. +5. Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`. + +### On Windows +1. Follow the [Installation Guide for Microsoft Windows](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Windows-pdf) to setup `CUDA Toolkit`. +2. Copy `\VC\bin\amd64\vcvars64.bat` as `\vcvars64.bat` +3. Open the Visual Studio project and edit the project `Property Pages` as follows: + * Add `__USE_OPENCL__` and `__USE_CUDA__` to preprocessor definitions. + * Add `OpenCL.lib` and `cuda.lib` to additional dependencies. + * Add `$(CUDA_PATH)\include` to include directories. + * Add `$(CUDA_PATH)\lib\Win32` or `$(CUDA_PATH)\lib\x64` to library directories. +4. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` at first line. +5. Build it. 
+ +### Usage +```bash +guetzli [--c|--cuda|--opencl] [other options] original.png output.jpg +guetzli [--c|--cuda|--opencl] [other options] original.jpg output.jpg +``` +You can pass a `--c` parameter to enable the procedure optimization or `--cuda` parameter to use the CUDA acceleration or `--opencl` to use the OpenCL acceleration. + +If you have any question about CUDA/OpenCL support, please contact strongtu@tencent.com, ianhuang@tencent.com or chriskzhou@tencent.com. + +## Enable full JPEG format support +### On POSIX systems +1. Install [libjpeg](http://libjpeg.sourceforge.net/). + If using your operating system + package manager, install development versions of the packages if the + distinction exists. + * On Ubuntu, do `apt-get install libjpeg8-dev`. + * On Fedora, do `dnf install libjpeg-devel`. + * On Arch Linux, do `pacman -S libjpeg`. + * On Alpine Linux, do `apk add libjpeg`. +2. Edit `premake5.lua`, add `defines {"__SUPPORT_FULL_JPEG__"}` and `links { "jpeg" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. +3. Run `make` and expect the binary to be created in `bin/Release/guetzli` +### On Windows +1. Install `libjpeg-turbo` using vcpkg: `.\vcpkg install libjpeg-turbo` +2. Open the Visual Studio project and add `__SUPPORT_FULL_JPEG__` to preprocessor definitions in the project `Property Pages`. +3. Build it. 
\ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml index 061ab6d0..97acb3ac 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,7 +15,7 @@ install: - premake5.exe %TOOLSET% - git clone https://github.com/Microsoft/vcpkg - md vcpkg\downloads\nuget-3.5.0 - - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe + - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/v3.5.0/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe - appveyor DownloadFile https://cmake.org/files/v3.8/cmake-3.8.0-rc1-win32-x86.zip -FileName %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - 7z x %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - cd vcpkg diff --git a/clguetzli/cl.hpp b/clguetzli/cl.hpp new file mode 100644 index 00000000..a7043b50 --- /dev/null +++ b/clguetzli/cl.hpp @@ -0,0 +1,322 @@ +#pragma once + +#ifdef __USE_OPENCL__ + +template +inline void clSetKernelArgK(cl_kernel k, int idx, T* t) +{ + clSetKernelArg(k, idx, sizeof(T), t); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, int* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, const int* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, size_t* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, const size_t* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0) +{ + clSetKernelArgK(k, 0, t0); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1) +{ + clSetKernelArgK(k, 1, t1); + clSetKernelArgEx(k, t0); +} + +template +inline void 
clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2) +{ + clSetKernelArgK(k, 2, t2); + clSetKernelArgEx(k, t0, t1); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3) +{ + clSetKernelArgK(k, 3, t3); + clSetKernelArgEx(k, t0, t1, t2); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) +{ + clSetKernelArgK(k, 4, t4); + clSetKernelArgEx(k, t0, t1, t2, t3); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) +{ + clSetKernelArgK(k, 5, t5); + clSetKernelArgEx(k, t0, t1, t2, t3, t4); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) +{ + clSetKernelArgK(k, 6, t6); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) +{ + clSetKernelArgK(k, 7, t7); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) +{ + clSetKernelArgK(k, 8, t8); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) +{ + clSetKernelArgK(k, 9, t9); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10) +{ + clSetKernelArgK(k, 10, t10); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10, T11* t11) +{ + clSetKernelArgK(k, 11, t11); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10); +} + +template + inline 
void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12) +{ + clSetKernelArgK(k, 12, t12); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13) +{ + clSetKernelArgK(k, 13, t13); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, + T14* t14) +{ + clSetKernelArgK(k, 14, t14); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15) +{ + clSetKernelArgK(k, 15, t15); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16) +{ + clSetKernelArgK(k, 16, t16); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15); +} 
+ +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17) +{ + clSetKernelArgK(k, 17, t17); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18) +{ + clSetKernelArgK(k, 18, t18); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19) +{ + clSetKernelArgK(k, 19, t19); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + 
typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20) +{ + clSetKernelArgK(k, 20, t20); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21) +{ + clSetKernelArgK(k, 21, t21); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22) +{ + clSetKernelArgK(k, 22, t22); + 
clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22, T23* t23) +{ + clSetKernelArgK(k, 23, t23); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, typename T24> +inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22, T23* t23, T24* t24) +{ + clSetKernelArgK(k, 24, t24); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23); +} + +#endif // __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp new file mode 100644 index 00000000..d91055d5 --- /dev/null +++ b/clguetzli/clbutter_comparator.cpp @@ -0,0 +1,1813 @@ +/* +* 
OpenCL/CUDA edition implementation of butter_comparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "clbutter_comparator.h" +#include "clguetzli.h" +#include "clguetzli_test.h" + +#include +#include + +namespace butteraugli { + +static const float kInternalGoodQualityThreshold = 14.921561160295326; +static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +inline float DotProductOpt(const float u[3], const float v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} + +// Computes a horizontal convolution and transposes the result. +void ConvolutionOpt(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) { + PROFILER_FUNC; + float weight_no_border = 0; + for (size_t j = 0; j <= 2 * offset; ++j) { + weight_no_border += multipliers[j]; + } + for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { + int minx = x < offset ? 0 : x - offset; + int maxx = std::min(xsize, x + len - offset) - 1; + float weight = 0.0; + for (int j = minx; j <= maxx; ++j) { + weight += multipliers[j - x + offset]; + } + // Interpolate linearly between the no-border scaling and border scaling. + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + for (size_t y = 0; y < ysize; ++y) { + float sum = 0.0; + for (int j = minx; j <= maxx; ++j) { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + result[ox * ysize + y] = static_cast(sum * scale); + } + } +} + +void BlurOpt(size_t xsize, size_t ysize, float* channel, float sigma, + float border_ratio) { + PROFILER_FUNC; + float m = 2.25; // Accuracy increases when m is increased. 
+ const float scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + const int xstep = std::max(1, int(sigma / 3)); + const int ystep = xstep; + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; + std::vector tmp(dxsize * ysize); + ConvolutionOpt(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); + float* output = channel; + std::vector downsampled_output; + if (xstep > 1) { + downsampled_output.resize(dxsize * dysize); + output = downsampled_output.data(); + } + ConvolutionOpt(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); + if (xstep > 1) { + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + // TODO: Use correct rounding. + channel[y * xsize + x] = + downsampled_output[(y / ystep) * dxsize + (x / xstep)]; + } + } + } +} + +// To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. +constexpr size_t kBlockEdge = 8; +constexpr size_t kBlockSize = kBlockEdge * kBlockEdge; +constexpr size_t kBlockEdgeHalf = kBlockEdge / 2; +constexpr size_t kBlockHalf = kBlockEdge * kBlockEdgeHalf; + +// Contrast sensitivity related weights. 
+static const float *GetContrastSensitivityMatrixOpt() { + static float csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, + }; + return &csf8x8[0]; +} + +std::array MakeHighFreqColorDiffDxOpt() { + std::array lut; + static const float off = 11.38708334481672; + static const float inc = 14.550189611520716; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetHighFreqColorDiffDxOpt() { + static const std::array kLut = MakeHighFreqColorDiffDxOpt(); + return kLut.data(); +} + +std::array MakeHighFreqColorDiffDyOpt() { + std::array lut; + static const float off = 1.4103373714040413; + static const float inc = 0.7084088867024; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetHighFreqColorDiffDyOpt() { + static const std::array kLut = MakeHighFreqColorDiffDyOpt(); + return kLut.data(); +} + +std::array MakeLowFreqColorDiffDyOpt() { + std::array lut; + static const float inc = 5.2511644570349185; + lut[0] = 0.0; + for (int i = 1; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetLowFreqColorDiffDyOpt() { + static const std::array kLut = MakeLowFreqColorDiffDyOpt(); + return kLut.data(); +} + +inline float InterpolateOpt(const float *array, int size, float sx) { + float ix = fabs(sx); + assert(ix < 10000); + int baseix = 
static_cast(ix); + float res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + float mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} + +inline float InterpolateClampNegativeOpt(const float *array, + int size, float sx) { + if (sx < 0) { + sx = 0; + } + float ix = fabs(sx); + int baseix = static_cast(ix); + float res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + float mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +void RgbToXybOpt(float r, float g, float b, + float *valx, float *valy, float *valz) { + static const float a0 = 1.01611726948; + static const float a1 = 0.982482243696; + static const float a2 = 1.43571362627; + static const float a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} + +static inline void XybToValsOpt(float x, float y, float z, + float *valx, float *valy, float *valz) { + static const float xmul = 0.758304045695; + static const float ymul = 2.28148649801; + static const float zmul = 1.87816926918; + *valx = InterpolateOpt(GetHighFreqColorDiffDxOpt(), 21, x * xmul); + *valy = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y * ymul); + *valz = zmul * z; +} + +// Rough psychovisual distance to gray for low frequency colors. 
+static void XybLowFreqToValsOpt(float x, float y, float z, + float *valx, float *valy, float *valz) { + static const float xmul = 6.64482198135; + static const float ymul = 0.837846224276; + static const float zmul = 7.34905756986; + static const float y_to_z_mul = 0.0812519812628; + z += y_to_z_mul * y; + *valz = z * zmul; + *valx = x * xmul; + *valy = InterpolateOpt(GetLowFreqColorDiffDyOpt(), 21, y * ymul); +} + +float RemoveRangeAroundZeroOpt(float v, float range) { + if (v >= -range && v < range) { + return 0; + } + if (v < 0) { + return v + range; + } + else { + return v - range; + } +} + +void XybDiffLowFreqSquaredAccumulateOpt(float r0, float g0, float b0, + float r1, float g1, float b1, + float factor, float res[3]) { + float valx0, valy0, valz0; + float valx1, valy1, valz1; + XybLowFreqToValsOpt(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToValsOpt(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. + float valx = valx0 - valx1; + float valy = valy0 - valy1; + float valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} + +struct ComplexOpt { +public: + float real; + float imag; +}; + +inline float abssq(const ComplexOpt& c) { + return c.real * c.real + c.imag * c.imag; +} + +static void TransposeBlock(ComplexOpt data[kBlockSize]) { + for (int i = 0; i < kBlockEdge; i++) { + for (int j = 0; j < i; j++) { + std::swap(data[kBlockEdge * i + j], data[kBlockEdge * j + i]); + } + } +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. 
+inline void FFT4Opt(ComplexOpt* a) { + float t1, t2, t3, t4, t5, t6, t7, t8; + t5 = a[2].real; + t1 = a[0].real - t5; + t7 = a[3].real; + t5 += a[0].real; + t3 = a[1].real - t7; + t7 += a[1].real; + t8 = t5 + t7; + a[0].real = t8; + t5 -= t7; + a[1].real = t5; + t6 = a[2].imag; + t2 = a[0].imag - t6; + t6 += a[0].imag; + t5 = a[3].imag; + a[2].imag = t2 + t3; + t2 -= t3; + a[3].imag = t2; + t4 = a[1].imag - t5; + a[3].real = t1 + t4; + t1 -= t4; + a[2].real = t1; + t5 += a[1].imag; + a[0].imag = t6 + t5; + t6 -= t5; + a[1].imag = t6; +} + +static const float kSqrtHalf = 0.70710678118654752440084436210484903; + +// D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. +void FFT8OptOpt(ComplexOpt* a) { + float t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = a[4].imag; + t4 = a[0].imag - t7; + t7 += a[0].imag; + a[0].imag = t7; + + t8 = a[6].real; + t5 = a[2].real - t8; + t8 += a[2].real; + a[2].real = t8; + + t7 = a[6].imag; + a[6].imag = t4 - t5; + t4 += t5; + a[4].imag = t4; + + t6 = a[2].imag - t7; + t7 += a[2].imag; + a[2].imag = t7; + + t8 = a[4].real; + t3 = a[0].real - t8; + t8 += a[0].real; + a[0].real = t8; + + a[4].real = t3 - t6; + t3 += t6; + a[6].real = t3; + + t7 = a[5].real; + t3 = a[1].real - t7; + t7 += a[1].real; + a[1].real = t7; + + t8 = a[7].imag; + t6 = a[3].imag - t8; + t8 += a[3].imag; + a[3].imag = t8; + t1 = t3 - t6; + t3 += t6; + + t7 = a[5].imag; + t4 = a[1].imag - t7; + t7 += a[1].imag; + a[1].imag = t7; + + t8 = a[7].real; + t5 = a[3].real - t8; + t8 += a[3].real; + a[3].real = t8; + + t2 = t4 - t5; + t4 += t5; + + t6 = t1 - t4; + t8 = kSqrtHalf; + t6 *= t8; + a[5].real = a[4].real - t6; + t1 += t4; + t1 *= t8; + a[5].imag = a[4].imag - t1; + t6 += a[4].real; + a[4].real = t6; + t1 += a[4].imag; + a[4].imag = t1; + + t5 = t2 - t3; + t5 *= t8; + a[7].imag = a[6].imag - t5; + t2 += t3; + t2 *= t8; + a[7].real = a[6].real - t2; + t2 += a[6].real; + a[6].real = t2; + t5 += a[6].imag; + a[6].imag = t5; + + FFT4Opt(a); + + // Reorder to 
the correct output order. + // TODO: Modify the above computation so that this is not needed. + ComplexOpt tmp = a[2]; + a[2] = a[3]; + a[3] = a[5]; + a[5] = a[7]; + a[7] = a[4]; + a[4] = a[1]; + a[1] = a[6]; + a[6] = tmp; +} + +// Same as FFT8, but all inputs are real. +// TODO: Since this does not need to be in-place, maybe there is a +// faster FFT than this one, which is derived from DJB's in-place complex FFT. +void RealFFT8Opt(const float* in, ComplexOpt* out) { + float t1, t2, t3, t5, t6, t7, t8; + t8 = in[6]; + t5 = in[2] - t8; + t8 += in[2]; + out[2].real = t8; + out[6].imag = -t5; + out[4].imag = t5; + t8 = in[4]; + t3 = in[0] - t8; + t8 += in[0]; + out[0].real = t8; + out[4].real = t3; + out[6].real = t3; + t7 = in[5]; + t3 = in[1] - t7; + t7 += in[1]; + out[1].real = t7; + t8 = in[7]; + t5 = in[3] - t8; + t8 += in[3]; + out[3].real = t8; + t2 = -t5; + t6 = t3 - t5; + t8 = kSqrtHalf; + t6 *= t8; + out[5].real = out[4].real - t6; + t1 = t3 + t5; + t1 *= t8; + out[5].imag = out[4].imag - t1; + t6 += out[4].real; + out[4].real = t6; + t1 += out[4].imag; + out[4].imag = t1; + t5 = t2 - t3; + t5 *= t8; + out[7].imag = out[6].imag - t5; + t2 += t3; + t2 *= t8; + out[7].real = out[6].real - t2; + t2 += out[6].real; + out[6].real = t2; + t5 += out[6].imag; + out[6].imag = t5; + t5 = out[2].real; + t1 = out[0].real - t5; + t7 = out[3].real; + t5 += out[0].real; + t3 = out[1].real - t7; + t7 += out[1].real; + t8 = t5 + t7; + out[0].real = t8; + t5 -= t7; + out[1].real = t5; + out[2].imag = t3; + out[3].imag = -t3; + out[3].real = t1; + out[2].real = t1; + out[0].imag = 0; + out[1].imag = 0; + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. + ComplexOpt tmp = out[2]; + out[2] = out[3]; + out[3] = out[5]; + out[5] = out[7]; + out[7] = out[4]; + out[4] = out[1]; + out[1] = out[6]; + out[6] = tmp; +} + +// Fills in block[kBlockEdgeHalf..(kBlockHalf+kBlockEdgeHalf)], and leaves the +// rest unmodified. 
+void ButteraugliFFTSquaredOpt(float block[kBlockSize]) { + float global_mul = 0.000064; + ComplexOpt block_c[kBlockSize]; + assert(kBlockEdge == 8); + for (int y = 0; y < kBlockEdge; ++y) { + RealFFT8Opt(block + y * kBlockEdge, block_c + y * kBlockEdge); + } + TransposeBlock(block_c); + float r0[kBlockEdge]; + float r1[kBlockEdge]; + for (int x = 0; x < kBlockEdge; ++x) { + r0[x] = block_c[x].real; + r1[x] = block_c[kBlockHalf + x].real; + } + RealFFT8Opt(r0, block_c); + RealFFT8Opt(r1, block_c + kBlockHalf); + for (int y = 1; y < kBlockEdgeHalf; ++y) { + FFT8OptOpt(block_c + y * kBlockEdge); + } + for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + block[i] = abssq(block_c[i]); + block[i] *= global_mul; + } +} + +// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared +// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average +// diff on the edges to diff_xyb_edge_dc. +void ButteraugliBlockDiffOpt(float xyb0[3 * kBlockSize], + float xyb1[3 * kBlockSize], + float diff_xyb_dc[3], + float diff_xyb_ac[3], + float diff_xyb_edge_dc[3]) { + PROFILER_FUNC; + const float *csf8x8 = GetContrastSensitivityMatrixOpt(); + + float avgdiff_xyb[3] = { 0.0 }; + float avgdiff_edge[3][4] = { { 0.0 } }; + for (int i = 0; i < 3 * kBlockSize; ++i) { + const float diff_xyb = xyb0[i] - xyb1[i]; + const int c = i / kBlockSize; + avgdiff_xyb[c] += diff_xyb / kBlockSize; + const int k = i % kBlockSize; + const int kx = k % kBlockEdge; + const int ky = k / kBlockEdge; + const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1; + const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 
2 : -1; + if (h_edge_idx >= 0) { + avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge; + } + if (v_edge_idx >= 0) { + avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge; + } + } + XybDiffLowFreqSquaredAccumulateOpt(avgdiff_xyb[0], + avgdiff_xyb[1], + avgdiff_xyb[2], + 0, 0, 0, csf8x8[0], + diff_xyb_dc); + for (int i = 0; i < 4; ++i) { + XybDiffLowFreqSquaredAccumulateOpt(avgdiff_edge[0][i], + avgdiff_edge[1][i], + avgdiff_edge[2][i], + 0, 0, 0, csf8x8[0], + diff_xyb_edge_dc); + } + + float* xyb_avg = xyb0; + float* xyb_halfdiff = xyb1; + for (int i = 0; i < 3 * kBlockSize; ++i) { + float avg = (xyb0[i] + xyb1[i]) / 2; + float halfdiff = (xyb0[i] - xyb1[i]) / 2; + xyb_avg[i] = avg; + xyb_halfdiff[i] = halfdiff; + } + float *y_avg = &xyb_avg[kBlockSize]; + float *x_halfdiff_squared = &xyb_halfdiff[0]; + float *y_halfdiff = &xyb_halfdiff[kBlockSize]; + float *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize]; + ButteraugliFFTSquaredOpt(y_avg); + ButteraugliFFTSquaredOpt(x_halfdiff_squared); + ButteraugliFFTSquaredOpt(y_halfdiff); + ButteraugliFFTSquaredOpt(z_halfdiff_squared); + + static const float xmul = 64.8; + static const float ymul = 1.753123908348329; + static const float ymul2 = 1.51983458269; + static const float zmul = 2.4; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + float d = csf8x8[i]; + diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i]; + diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i]; + + y_avg[i] = sqrt(y_avg[i]); + y_halfdiff[i] = sqrt(y_halfdiff[i]); + float y0 = y_avg[i] - y_halfdiff[i]; + float y1 = y_avg[i] + y_halfdiff[i]; + // Remove the impact of small absolute values. + // This improves the behavior with flat noise. 
+ static const float ylimit = 0.04; + y0 = RemoveRangeAroundZeroOpt(y0, ylimit); + y1 = RemoveRangeAroundZeroOpt(y1, ylimit); + if (y0 != y1) { + float valy0 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y0 * ymul2); + float valy1 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y1 * ymul2); + float valy = ymul * (valy0 - valy1); + diff_xyb_ac[1] += d * valy * valy; + } + } +} + +// Low frequency edge detectors. +// Two edge detectors are applied in each corner of the 8x8 square. +// The squared 3-dimensional error vector is added to diff_xyb. +void Butteraugli8x8CornerEdgeDetectorDiffOpt( + const size_t pos_x, + const size_t pos_y, + const size_t xsize, + const size_t ysize, + const std::vector > &blurred0, + const std::vector > &blurred1, + float diff_xyb[3]) { + PROFILER_FUNC; + int local_count = 0; + float local_xyb[3] = { 0 }; + static const float w = 0.711100840192; + for (int k = 0; k < 4; ++k) { + size_t step = 3; + size_t offset[4][2] = { { 0, 0 },{ 0, 7 },{ 7, 0 },{ 7, 7 } }; + size_t x = pos_x + offset[k][0]; + size_t y = pos_y + offset[k][1]; + if (x >= step && x + step < xsize) { + size_t ix = y * xsize + (x - step); + size_t ix2 = ix + 2 * step; + XybDiffLowFreqSquaredAccumulateOpt( + w * (blurred0[0][ix] - blurred0[0][ix2]), + w * (blurred0[1][ix] - blurred0[1][ix2]), + w * (blurred0[2][ix] - blurred0[2][ix2]), + w * (blurred1[0][ix] - blurred1[0][ix2]), + w * (blurred1[1][ix] - blurred1[1][ix2]), + w * (blurred1[2][ix] - blurred1[2][ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= step && y + step < ysize) { + size_t ix = (y - step) * xsize + x; + size_t ix2 = ix + 2 * step * xsize; + XybDiffLowFreqSquaredAccumulateOpt( + w * (blurred0[0][ix] - blurred0[0][ix2]), + w * (blurred0[1][ix] - blurred0[1][ix2]), + w * (blurred0[2][ix] - blurred0[2][ix2]), + w * (blurred1[0][ix] - blurred1[0][ix2]), + w * (blurred1[1][ix] - blurred1[1][ix2]), + w * (blurred1[2][ix] - blurred1[2][ix2]), + 1.0, local_xyb); + ++local_count; + } + } + static 
const float weight = 0.01617112696; + const float mul = weight * 8.0 / local_count; + for (int i = 0; i < 3; ++i) { + diff_xyb[i] += mul * local_xyb[i]; + } +} + +// https://en.wikipedia.org/wiki/Photopsin absordance modeling. +const float *GetOpsinAbsorbanceOpt() { + static const float kMix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, + }; + return &kMix[0]; +} + +void OpsinAbsorbanceOpt(const float in[3], float out[3]) { + const float *mix = GetOpsinAbsorbanceOpt(); + out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; + out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7]; + out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11]; +} + +float GammaMinArgOpt() { + float in[3] = { 0.0, 0.0, 0.0 }; + float out[3]; + OpsinAbsorbanceOpt(in, out); + return std::min(out[0], std::min(out[1], out[2])); +} + +float GammaMaxArgOpt() { + float in[3] = { 255.0, 255.0, 255.0 }; + float out[3]; + OpsinAbsorbanceOpt(in, out); + return std::max(out[0], std::max(out[1], out[2])); +} + +void MaskHighIntensityChangeOpt( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1) { + PROFILER_FUNC; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t ix = y * xsize + x; + const float ave[3] = { + static_cast((c0[0][ix] + c1[0][ix]) * 0.5), + static_cast((c0[1][ix] + c1[1][ix]) * 0.5), + static_cast((c0[2][ix] + c1[2][ix]) * 0.5), + }; + float sqr_max_diff = -1; + { + int offset[4] = + { -1, 1, -static_cast(xsize), static_cast(xsize) }; + int border[4] = + { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + float diff = 0.5 * (c0[1][ix2] + c1[1][ix2]) 
- ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + static const float kReductionX = 275.19165240059317; + static const float kReductionY = 18599.41286306991; + static const float kReductionZ = 410.8995306951065; + static const float kChromaBalance = 106.95800948271017; + float chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const float mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + for (int i = 0; i < 3; ++i) { + xyb0[i][ix] = static_cast(mix[i] * c0[i][ix] + (1 - mix[i]) * ave[i]); + xyb1[i][ix] = static_cast(mix[i] * c1[i][ix] + (1 - mix[i]) * ave[i]); + } + } + } +} + +float SimpleGammaOpt(float v) { + static const float kGamma = 0.387494322593; + static const float limit = 43.01745241042018; + float bright = v - limit; + if (bright >= 0) { + static const float mul = 0.0383723643799; + v -= bright * mul; + } + static const float limit2 = 94.68634353321337; + float bright2 = v - limit2; + if (bright2 >= 0) { + static const float mul = 0.22885405968; + v -= bright2 * mul; + } + static const float offset = 0.156775786057; + static const float scale = 8.898059160493739; + float retval = scale * (offset + pow(v, kGamma)); + return retval; +} + +// Polynomial evaluation via Clenshaw's scheme (similar to Horner's). +// Template enables compile-time unrolling of the recursion, but must reside +// outside of a class due to the specialization. 
+template +static inline void ClenshawRecursionOpt(const float x, const float *coefficients, + float *b1, float *b2) { + const float x_b1 = x * (*b1); + const float t = (x_b1 + x_b1) - (*b2) + coefficients[INDEX]; + *b2 = *b1; + *b1 = t; + + ClenshawRecursionOpt(x, coefficients, b1, b2); +} + +// Base case +template <> +inline void ClenshawRecursionOpt<0>(const float x, const float *coefficients, + float *b1, float *b2) { + const float x_b1 = x * (*b1); + // The final iteration differs - no 2 * x_b1 here. + *b1 = x_b1 - (*b2) + coefficients[0]; +} + +// Rational polynomial := dividing two polynomial evaluations. These are easier +// to find than minimax polynomials. +struct RationalPolynomialOpt { + template + static float EvaluatePolynomial(const float x, + const float(&coefficients)[N]) { + float b1 = 0.0; + float b2 = 0.0; + ClenshawRecursionOpt(x, coefficients, &b1, &b2); + return b1; + } + + // Evaluates the polynomial at x (in [min_value, max_value]). + inline float operator()(const float x) const { + // First normalize to [0, 1]. + const float x01 = (x - min_value) / (max_value - min_value); + // And then to [-1, 1] domain of Chebyshev polynomials. + const float xc = 2.0 * x01 - 1.0; + + const float yp = EvaluatePolynomial(xc, p); + const float yq = EvaluatePolynomial(xc, q); + if (yq == 0.0) return 0.0; + return static_cast(yp / yq); + } + + // Domain of the polynomials; they are undefined elsewhere. + float min_value; + float max_value; + + // Coefficients of T_n (Chebyshev polynomials of the first kind). + // Degree 5/5 is a compromise between accuracy (0.1%) and numerical stability. + float p[5 + 1]; + float q[5 + 1]; +}; + +static inline float GammaPolynomialOpt(float value) { + // Generated by gamma_polynomial.m from equispaced x/gamma(x) samples. 
+ static const RationalPolynomialOpt r = { + 0.770000000000000, 274.579999999999984, + { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, + }, + { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, + } }; + return static_cast(r(value)); +} + +static inline float GammaOpt(float v) { + // return SimpleGamma(v); + return GammaPolynomialOpt(static_cast(v)); +} + +void OpsinDynamicsImageOpt(size_t xsize, size_t ysize, + std::vector > &rgb) { + PROFILER_FUNC; + std::vector > blurred = rgb; + static const float kSigma = 1.1; + for (int i = 0; i < 3; ++i) { + BlurOpt(xsize, ysize, blurred[i].data(), kSigma, 0.0); + } + for (size_t i = 0; i < rgb[0].size(); ++i) { + float sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. + float pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; + float pre_mixed[3]; + OpsinAbsorbanceOpt(pre_rgb, pre_mixed); + sensitivity[0] = GammaOpt(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = GammaOpt(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = GammaOpt(pre_mixed[2]) / pre_mixed[2]; + } + float cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; + float cur_mixed[3]; + OpsinAbsorbanceOpt(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + float x, y, z; + RgbToXybOpt(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + rgb[0][i] = static_cast(x); + rgb[1][i] = static_cast(y); + rgb[2][i] = static_cast(z); + } +} + +void ScaleImageOpt(float scale, std::vector *result) { + PROFILER_FUNC; + for (size_t i = 0; i < result->size(); ++i) { + (*result)[i] *= static_cast(scale); + } +} + +// Making a cluster of local errors to be more impactful than +// just a single error. 
+void CalculateDiffmapOpt(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap) { + PROFILER_FUNC; + // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5 + // pixels distance over the original image. The border of 2 pixels on top and + // left side and 3 pixels on right and bottom side are zeroed, but these + // values have no meaning, they only exist to keep the result map the same + // size as the input images. + int s2 = (8 - step) / 2; + { + // Upsample and take square root. + std::vector diffmap_out(xsize * ysize); + const size_t res_xsize = (xsize + step - 1) / step; + for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) { + for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) { + size_t res_ix = (res_y * res_xsize + res_x) / step; + float orig_val = (*diffmap)[res_ix]; + constexpr float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + float val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : std::sqrt(orig_val); + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(res_y + off_y + s2) * xsize + + res_x + off_x + s2] = val; + } + } + } + } + *diffmap = diffmap_out; + } + { + static const float kSigma = 8.8510880283; + static const float mul1 = 24.8235314874; + static const float scale = 1.0 / (1.0 + mul1); + const int s = 8 - step; + std::vector blurred((xsize - s) * (ysize - s)); + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2]; + } + } + static const float border_ratio = 0.03027655136; + BlurOpt(xsize - s, ysize - s, blurred.data(), kSigma, border_ratio); + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + (*diffmap)[(y + s2) * xsize + x + s2] + += static_cast(mul1) * blurred[y * (xsize - s) + x]; + } + } + ScaleImageOpt(scale, diffmap); + } +} + +static std::array MakeMaskOpt( + float extmul, float extoff, + float mul, float offset, + float scaler) { + std::array lut; + for (size_t i = 0; i < lut.size(); ++i) { + const float c = mul / ((0.01 * scaler * i) + offset); + lut[i] = 1.0 + extmul * (c + extoff); + assert(lut[i] >= 0.0); + lut[i] *= lut[i]; + } + return lut; +} + +float MaskXOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.975741017749; + static const float extoff = -4.25328244168; + static const float offset = 0.454909521427; + static const float scaler = 0.0738288224836; + static const float mul = 20.8029176447; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskYOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.373995618954; + static const float extoff = 1.5307267433; + static const float offset = 0.911952641929; + static const float scaler = 1.1731667845; 
+ static const float mul = 16.2447033988; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskBOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.61582234137; + static const float extoff = -4.25376118646; + static const float offset = 1.05105070921; + static const float scaler = 0.47434643535; + static const float mul = 31.1444967089; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcXOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 1.79116943438; + static const float extoff = -3.86797479189; + static const float offset = 0.670960225853; + static const float scaler = 0.486575865525; + static const float mul = 20.4563479139; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcYOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.212223514236; + static const float extoff = -3.65647120524; + static const float offset = 1.73396799447; + static const float scaler = 0.170392660501; + static const float mul = 21.6566724788; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcBOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.349376011816; + static const float extoff = -0.894711072781; + static const float offset = 0.901647926679; + static const float scaler = 0.380086095024; + static const float mul = 18.0373825149; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +// Replaces values[x + y * xsize] with the minimum of the values in 
the +// square_size square with coordinates +// x - offset .. x + square_size - offset - 1, +// y - offset .. y + square_size - offset - 1. +void MinSquareValOpt(size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values) { + PROFILER_FUNC; + // offset is not negative and smaller than square_size. + assert(offset < square_size); + std::vector tmp(xsize * ysize); + for (size_t y = 0; y < ysize; ++y) { + const size_t minh = offset > y ? 0 : y - offset; + const size_t maxh = std::min(ysize, y + square_size - offset); + for (size_t x = 0; x < xsize; ++x) { + float min = values[x + minh * xsize]; + for (size_t j = minh + 1; j < maxh; ++j) { + float tmpf = values[x + j * xsize]; + if (tmpf < min) min = tmpf; + } + tmp[x + y * xsize] = static_cast(min); + } + } + for (size_t x = 0; x < xsize; ++x) { + const size_t minw = offset > x ? 0 : x - offset; + const size_t maxw = std::min(xsize, x + square_size - offset); + for (size_t y = 0; y < ysize; ++y) { + float min = tmp[minw + y * xsize]; + for (size_t j = minw + 1; j < maxw; ++j) { + float tmpf = tmp[j + y * xsize]; + if (tmpf < min) min = tmpf; + } + values[x + y * xsize] = static_cast(min); + } + } +} + +void Average5x5Opt(int xsize, int ysize, std::vector* diffs) { + PROFILER_FUNC; + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + static const float w = 0.679144890667f; + static const float scale = 1.0f / (5.0f + 4 * w); + std::vector result = *diffs; + std::vector tmp0 = *diffs; + std::vector tmp1 = *diffs; + ScaleImage(w, &tmp1); + for (int y = 0; y < ysize; y++) { + const int row0 = y * xsize; + result[row0 + 1] += tmp0[row0]; + result[row0 + 0] += tmp0[row0 + 1]; + result[row0 + 2] += tmp0[row0 + 1]; + for (int x = 2; x < xsize - 2; ++x) { + result[row0 + x - 1] += tmp0[row0 + x]; + result[row0 + x + 1] += tmp0[row0 + x]; + } + result[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; + if (y > 0) { + const int rowd1 = row0 - xsize; + result[rowd1 + 1] += tmp1[row0]; + result[rowd1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowd1 + x + 1] += tmp1[row0 + x]; + result[rowd1 + x + 0] += tmp0[row0 + x]; + result[rowd1 + x - 1] += tmp1[row0 + x]; + } + result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + result[rowu1 + 1] += tmp1[row0]; + result[rowu1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowu1 + x + 1] += tmp1[row0 + x]; + result[rowu1 + x + 0] += tmp0[row0 + x]; + result[rowu1 + x - 1] += tmp1[row0 + x]; + } + result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + } + *diffs = result; + ScaleImageOpt(scale, diffs); +} + +void DiffPrecomputeOpt( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask) { + PROFILER_FUNC; + mask->resize(3, std::vector(xyb0[0].size())); + float valsh0[3] = { 0.0 }; + float valsv0[3] = { 0.0 }; + float valsh1[3] = { 0.0 }; + float valsv1[3] = { 0.0 }; + int ix2; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t ix = x + xsize * 
y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + float x0 = (xyb0[0][ix] - xyb0[0][ix2]); + float y0 = (xyb0[1][ix] - xyb0[1][ix2]); + float z0 = (xyb0[2][ix] - xyb0[2][ix2]); + XybToValsOpt(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + float x1 = (xyb1[0][ix] - xyb1[0][ix2]); + float y1 = (xyb1[1][ix] - xyb1[1][ix2]); + float z1 = (xyb1[2][ix] - xyb1[2][ix2]); + XybToValsOpt(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + float x0 = (xyb0[0][ix] - xyb0[0][ix2]); + float y0 = (xyb0[1][ix] - xyb0[1][ix2]); + float z0 = (xyb0[2][ix] - xyb0[2][ix2]); + XybToValsOpt(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + float x1 = (xyb1[0][ix] - xyb1[0][ix2]); + float y1 = (xyb1[1][ix] - xyb1[1][ix2]); + float z1 = (xyb1[2][ix] - xyb1[2][ix2]); + XybToValsOpt(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } + for (int i = 0; i < 3; ++i) { + float sup0 = fabs(valsh0[i]) + fabs(valsv0[i]); + float sup1 = fabs(valsh1[i]) + fabs(valsv1[i]); + float m = std::min(sup0, sup1); + (*mask)[i][ix] = static_cast(m); + } + } + } +} + +void MaskOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc) { + PROFILER_FUNC; + mask->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask)[i].resize(xsize * ysize); + } + DiffPrecomputeOpt(xyb0, xyb1, xsize, ysize, mask); + for (int i = 0; i < 3; ++i) { + _Average5x5(xsize, ysize, &(*mask)[i]); + MinSquareValOpt(4, 0, xsize, ysize, (*mask)[i].data()); + static const float sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + BlurOpt(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); + } + static const float w00 = 232.206464018; + static const float w11 = 22.9455222245; + static const float w22 = 503.962310606; + + mask_dc->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask_dc)[i].resize(xsize * ysize); + } + for (size_t y 
= 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + const size_t idx = y * xsize + x; + const float s0 = (*mask)[0][idx]; + const float s1 = (*mask)[1][idx]; + const float s2 = (*mask)[2][idx]; + const float p0 = w00 * s0; + const float p1 = w11 * s1; + const float p2 = w22 * s2; + + (*mask)[0][idx] = static_cast(MaskXOpt(p0)); + (*mask)[1][idx] = static_cast(MaskYOpt(p1)); + (*mask)[2][idx] = static_cast(MaskBOpt(p2)); + (*mask_dc)[0][idx] = static_cast(MaskDcXOpt(p0)); + (*mask_dc)[1][idx] = static_cast(MaskDcYOpt(p1)); + (*mask_dc)[2][idx] = static_cast(MaskDcBOpt(p2)); + } + } + for (int i = 0; i < 3; ++i) { + ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask)[i]); + ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); + } +} + +} + +namespace butteraugli +{ + clButteraugliComparator::clButteraugliComparator(size_t xsize, size_t ysize, int step) + : ButteraugliComparator(xsize, ysize, step) + { + + } + + void clButteraugliComparator::DiffmapOpsinDynamicsImage( + std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result) + { + if (MODE_CPU_OPT == g_mathMode) + { + DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + cuDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); + } +#endif + else + { + ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); + } + } + + + void clButteraugliComparator::BlockDiffMap(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* 
block_diff_dc, + std::vector* block_diff_ac) + { + ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac); +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*block_diff_dc).data(), (*block_diff_ac).data()); + } +#endif + } + + void clButteraugliComparator::EdgeDetectorMap(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* edge_detector_map) + { + ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map); +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*edge_detector_map).data()); + } +#endif + } + + void clButteraugliComparator::EdgeDetectorLowFreq(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_ac) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + std::vector orign_ac = *block_diff_ac; + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); + tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + orign_ac.data(), (*block_diff_ac).data()); + } + else +#endif + { + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); + } + } + + void clButteraugliComparator::CombineChannels(const std::vector >& mask_xyb, + const std::vector >& mask_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + std::vector temp = *result; + temp.resize(res_xsize_ * res_ysize_); + 
ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); + tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), + mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), + block_diff_dc.data(), + block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); + } + else +#endif + { + ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); + } + } + + void clButteraugliComparator::DiffmapOpsinDynamicsImageOpt( + std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result) + { + if (xsize_ < 8 || ysize_ < 8) return; + { + auto xyb0_c = xyb0; + auto xyb1_c = xyb1; + MaskHighIntensityChangeOpt(xsize_, ysize_, xyb0_c, xyb1_c, xyb0, xyb1); + } + assert(8 <= xsize_); + for (int i = 0; i < 3; i++) { + assert(xyb0[i].size() == num_pixels_); + assert(xyb1[i].size() == num_pixels_); + } + std::vector edge_detector_map(3 * res_xsize_ * res_ysize_); + EdgeDetectorMapOpt(xyb0, xyb1, &edge_detector_map); + std::vector block_diff_dc(3 * res_xsize_ * res_ysize_); + std::vector block_diff_ac(3 * res_xsize_ * res_ysize_); + BlockDiffMapOpt(xyb0, xyb1, &block_diff_dc, &block_diff_ac); + EdgeDetectorLowFreqOpt(xyb0, xyb1, &block_diff_ac); + { + std::vector > mask_xyb(3); + std::vector > mask_xyb_dc(3); + MaskOpt(xyb0, xyb1, xsize_, ysize_, &mask_xyb, &mask_xyb_dc); + CombineChannelsOpt(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, + edge_detector_map, &result); + } + CalculateDiffmapOpt(xsize_, ysize_, step_, &result); + } + + void clButteraugliComparator::BlockDiffMapOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac) + { + for (size_t res_y = 0; res_y + (kBlockEdge - step_ - 1) < ysize_; + res_y += step_) { + for (size_t res_x = 0; res_x + (kBlockEdge - step_ - 1) < xsize_; + res_x += 
step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + size_t offset = (std::min(res_y, ysize_ - 8) * xsize_ + + std::min(res_x, xsize_ - 8)); + float block0[3 * kBlockEdge * kBlockEdge]; + float block1[3 * kBlockEdge * kBlockEdge]; + for (int i = 0; i < 3; ++i) { + float *m0 = &block0[i * kBlockEdge * kBlockEdge]; + float *m1 = &block1[i * kBlockEdge * kBlockEdge]; + for (size_t y = 0; y < kBlockEdge; y++) { + for (size_t x = 0; x < kBlockEdge; x++) { + m0[kBlockEdge * y + x] = xyb0[i][offset + y * xsize_ + x]; + m1[kBlockEdge * y + x] = xyb1[i][offset + y * xsize_ + x]; + } + } + } + float diff_xyb_dc[3] = { 0.0 }; + float diff_xyb_ac[3] = { 0.0 }; + float diff_xyb_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiffOpt(block0, block1, + diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + for (int i = 0; i < 3; ++i) { + (*block_diff_dc)[3 * res_ix + i] = static_cast(diff_xyb_dc[i]); + (*block_diff_ac)[3 * res_ix + i] = static_cast(diff_xyb_ac[i]); + } + } + } + } + + void clButteraugliComparator::EdgeDetectorMapOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* edge_detector_map) + { + static const float kSigma[3] = { + 1.5, + 0.586, + 0.4, + }; + std::vector > blurred0(xyb0); + std::vector > blurred1(xyb1); + for (int i = 0; i < 3; i++) { + BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma[i], 0.0); + BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma[i], 0.0); + } + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + float diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiffOpt(std::min(res_x, xsize_ - 8), + std::min(res_y, ysize_ - 8), + xsize_, ysize_, + blurred0, blurred1, + diff_xyb); + for (int i = 0; i < 3; ++i) { + (*edge_detector_map)[3 * res_ix + i] = static_cast(diff_xyb[i]); + } + } + } + } + + void clButteraugliComparator::EdgeDetectorLowFreqOpt(const std::vector > 
&xyb0, + const std::vector > &xyb1, + std::vector* block_diff_ac) + { + static const float kSigma = 14; + static const float kMul = 10; + std::vector > blurred0(xyb0); + std::vector > blurred1(xyb1); + for (int i = 0; i < 3; i++) { + BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma, 0.0); + BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma, 0.0); + } + const int step = 8; + for (size_t y = 0; y + step < ysize_; y += step_) { + int resy = y / step_; + int resx = step / step_; + for (size_t x = 0; x + step < xsize_; x += step_, resx++) { + const int ix = y * xsize_ + x; + const int res_ix = resy * res_xsize_ + resx; + float diff[4][3]; + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize_; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize_ + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize_ - 6; + diff[3][i] = x < step ? 
0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + float max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + float diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulateOpt(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = std::max(max_diff_xyb[i], diff_xyb[i]); + } + } + for (int i = 0; i < 3; ++i) { + (*block_diff_ac)[3 * res_ix + i] += static_cast(kMul * max_diff_xyb[i]); + } + } + } + } + + void clButteraugliComparator::CombineChannelsOpt(const std::vector >& mask_xyb, + const std::vector >& mask_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result) + { + result->resize(res_xsize_ * res_ysize_); + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + float mask[3]; + float dc_mask[3]; + for (int i = 0; i < 3; ++i) { + mask[i] = mask_xyb[i][(res_y + 3) * xsize_ + (res_x + 3)]; + dc_mask[i] = mask_xyb_dc[i][(res_y + 3) * xsize_ + (res_x + 3)]; + } + (*result)[res_ix] = static_cast( + DotProductOpt(&block_diff_dc[3 * res_ix], dc_mask) + + DotProductOpt(&block_diff_ac[3 * res_ix], mask) + + DotProductOpt(&edge_detector_map[3 * res_ix], mask)); + } + } + } + + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector img; + img.resize(xsize * ysize); + memcpy(img.data(), values, xsize * ysize * sizeof(float)); + _MinSquareVal(square_size, offset, xsize, ysize, values); + tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); + } + else +#endif + { + _MinSquareVal(square_size, offset, xsize, ysize, values); + } + } + + void Average5x5(int xsize, int ysize, 
std::vector* diffs) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector diffs_org = *diffs; + _Average5x5(xsize, ysize, diffs); + tclAverage5x5(xsize, ysize, diffs_org, *diffs); + } + else +#endif + { + _Average5x5(xsize, ysize, diffs); + } + } + + void DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask) + { + _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + } +#endif + } + + void Mask(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc) + { + if (MODE_CPU_OPT == g_mathMode) + { + MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) + { + mask->resize(3); + mask_dc->resize(3); + for (int i = 0; i < 3; i++) + { + (*mask)[i].resize(xsize * ysize); + (*mask_dc)[i].resize(xsize * ysize); + } + clMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data() + ); + } + else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); + tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) + { + mask->resize(3); + mask_dc->resize(3); + for (int i = 0; i < 3; i++) + { + (*mask)[i].resize(xsize * ysize); + 
(*mask_dc)[i].resize(xsize * ysize); + } + cuMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data() + ); + } +#endif + else + { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } + } + + void CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector diffmap_org = *diffmap; + _CalculateDiffmap(xsize, ysize, step, diffmap); + tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); + } + else +#endif + { + _CalculateDiffmap(xsize, ysize, step, diffmap); + } + } + + void MaskHighIntensityChange( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); + tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), + c1[0].data(), c1[1].data(), c1[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + } + else +#endif + if (MODE_CPU_OPT == g_mathMode) + { + MaskHighIntensityChangeOpt(xsize, ysize, c0, c1, xyb0, xyb1); + } + else + { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); + } + } + + void ScaleImage(double scale, std::vector *result) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && result->size() > 64) + { + std::vector result_org = *result; + _ScaleImage(scale, result); + tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); + } + else +#endif + { + _ScaleImage(scale, result); + } + } + + void Convolution(size_t xsize, size_t 
ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) + { +#ifdef __USE_OPENCL__ + _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + } +#endif + } + + void Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector orignChannel; + orignChannel.resize(xsize * ysize); + memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + _Blur(xsize, ysize, channel, sigma, border_ratio); + tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); + } + else +#endif + { + _Blur(xsize, ysize, channel, sigma, border_ratio); + } + } + + void OpsinDynamicsImage(size_t xsize, size_t ysize, + std::vector > &rgb) + { + if (MODE_CPU_OPT == g_mathMode) + { + OpsinDynamicsImageOpt(xsize, ysize, rgb); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + clOpsinDynamicsImage(r, g, b, xsize, ysize); + } + else if (MODE_CHECKCL == g_mathMode && xsize > 8 & ysize > 8) + { + std::vector< std::vector> orig_rgb = rgb; + _OpsinDynamicsImage(xsize, ysize, rgb); + tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), + xsize, ysize, + rgb[0].data(), rgb[1].data(), rgb[2].data()); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + cuOpsinDynamicsImage(r, g, b, xsize, ysize); + } +#endif + else + { + _OpsinDynamicsImage(xsize, 
ysize, rgb); + } + } +} \ No newline at end of file diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h new file mode 100644 index 00000000..76380785 --- /dev/null +++ b/clguetzli/clbutter_comparator.h @@ -0,0 +1,116 @@ +/* +* OpenCL/CUDA edition implementation of butter_comparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include +#include "butteraugli/butteraugli.h" + +#define __restrict__ + +namespace butteraugli { + + class clButteraugliComparator : public ButteraugliComparator + { + public: + clButteraugliComparator(size_t xsize, size_t ysize, int step); + + virtual void DiffmapOpsinDynamicsImage(std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result); + + virtual void DiffmapOpsinDynamicsImageOpt(std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result); + + virtual void BlockDiffMap(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac); + + virtual void BlockDiffMapOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac); + + virtual void EdgeDetectorMap(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* edge_detector_map); + + virtual void EdgeDetectorMapOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* edge_detector_map); + + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_ac); + + virtual void EdgeDetectorLowFreqOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_ac); + + virtual void CombineChannels(const std::vector >& scale_xyb, + const std::vector >& scale_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result); + + virtual void CombineChannelsOpt(const std::vector >& scale_xyb, + const 
std::vector >& scale_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result); + }; + + void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + void _Average5x5(int xsize, int ysize, std::vector* diffs); + void _DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask); + void _Mask(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc); + void _CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap); + void _OpsinDynamicsImage(size_t xsize, size_t ysize, + std::vector > &rgb); + void _MaskHighIntensityChange( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1); + void _ScaleImage(double scale, std::vector *result); + void _Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + double border_ratio, + float* __restrict__ result); + void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + void Average5x5(int xsize, int ysize, std::vector* diffs); + void DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask); + void ScaleImage(double scale, std::vector *result); + void Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result); + void Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + void 
CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap); +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl new file mode 100644 index 00000000..2a8eb527 --- /dev/null +++ b/clguetzli/clguetzli.cl @@ -0,0 +1,3420 @@ +/* +* OpenCL Kernels +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#ifdef __USE_OPENCL__ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#include "clguetzli/clguetzli.cl.h" + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +#define kBlockEdge 8 +#define kBlockSize (kBlockEdge * kBlockEdge) +#define kDCTBlockSize (kBlockEdge * kBlockEdge) +#define kBlockEdgeHalf (kBlockEdge / 2) +#define kBlockHalf (kBlockEdge * kBlockEdgeHalf) +#define kComputeBlockSize (kBlockSize * 3) + +// IntFloatPair: opencl version of output_order/input_order +typedef struct __IntFloatPair +{ + int idx; + float err; +}IntFloatPair, DCTScoreData, CoeffData; + +typedef struct __IntFloatPairList +{ + int size; + IntFloatPair *pData; +}IntFloatPairList; + +__device__ void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); +__device__ double InterpolateClampNegative(__global const double *array, int size, double sx); +__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]); +__device__ double DotProduct(__global const float u[3], const double v[3]); +__device__ void OpsinAbsorbance(const double in[3], double out[3]); +__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz); +__device__ double Gamma(double v); +__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], + __private double xyb1[3 * kBlockSize], + double diff_xyb_dc[3], + double diff_xyb_ac[3], + double diff_xyb_edge_dc[3]); +__device__ void Butteraugli8x8CornerEdgeDetectorDiff( + int pos_x, + int 
pos_y, + int xsize, + int ysize, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + double* diff_xyb); + +__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order); + +__device__ double Factor2(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height); + +__device__ double CompareBlockFactor1(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height); + +__device__ double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor); + +__device__ void floatcopy(float *dst, const float *src, int size); +__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size); +__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size); +__device__ int list_erase(IntFloatPairList* list, int idx); +__device__ int list_push_back(IntFloatPairList* list, int i, float f); + +__kernel void clConvolutionEx( + __global float* result, + __global const float* inp, const int xsize, + __global const float* multipliers, const int len, + const int xstep, const int offset, const float border_ratio) +{ + const int ox = get_global_id(0); + const int y = get_global_id(1); + + const int oxsize = get_global_size(0); + const int ysize = get_global_size(1); + + const int x = ox * xstep; + + float weight_no_border = 0; + 
for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset); + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[ox * ysize + y] = sum * scale; +} + +__kernel void clConvolutionXEx( + __global float* result, + const int xsize, const int ysize, + __global const float* inp, + __global const float* multipliers, const int len, + const int step, const int offset, const float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + + if (x % step != 0) return; + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset); + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[y * xsize + x] = sum * scale; +} + +__kernel void clConvolutionYEx( + __global float* result, + const int xsize, const int ysize, + __global const float* inp, + __global const float* multipliers, const int len, + const int step, const int offset, const float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + if (x % step != 0) return; + if (y % step != 0) return; + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int miny = y < offset ? 0 : y - offset; + int maxy = min(ysize, y + len - offset); + + float weight = 0.0; + for (int j = miny; j < maxy; j++) + { + weight += multipliers[j - y + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = miny; j < maxy; j++) + { + sum += inp[j * xsize + x] * multipliers[j - y + offset]; + } + + result[y * xsize + x] = sum * scale; +} + +__kernel void clSquareSampleEx( + __global float* result, + const int xsize, const int ysize, + __global const float* image, + const int xstep, const int ystep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + int x_sample = x - x % xstep; + int y_sample = y - y % ystep; + + if (x_sample == x && y_sample == y) return; + + result[y * xsize + x] = image[y_sample * xsize + x_sample]; +} + +__kernel void clOpsinDynamicsImageEx( + __global float *r, __global float *g, __global float 
*b, + const int size, + __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred) +{ + const int i = get_global_id(0); + if (i >= size) return; + + double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre, pre_mixed); + + double sensitivity[3]; + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = x; + g[i] = y; + b[i] = z; +} + +__kernel void clMaskHighIntensityChangeEx( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + const int xsize, const int ysize, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, + __global const float *c1_x, __global const float *c1_y, __global const float *c1_b +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5f, + (c0_y[ix] + c1_y[ix]) * 0.5f, + (c0_b[ix] + c1_b[ix]) * 0.5f, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 
18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); +} + +__kernel void clEdgeDetectorMapEx( + __global float *result, + const int res_xsize, const int res_ysize, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if (pos_x >= xsize - (8 - step)) return; + if (pos_y >= ysize - (8 - step)) return; + + pos_x = min(pos_x, xsize - 8); + pos_y = min(pos_y, ysize - 8); + + double diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, + r, g, b, + r2, g2, b2, + &diff_xyb[0]); + + int idx = (res_y * res_xsize + res_x) * 3; + result[idx] = diff_xyb[0]; + result[idx + 1] = diff_xyb[1]; + result[idx + 2] = diff_xyb[2]; +} + + +__kernel void clBlockDiffMapEx( + __global float* block_diff_dc, __global float* block_diff_ac, + const int res_xsize, const int 
res_ysize, + __global const float* r, __global const float* g, __global const float* b, + __global const float* r2, __global const float* g2, __global const float* b2, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if ((pos_x + kBlockEdge - step - 1) >= xsize) return; + if ((pos_y + kBlockEdge - step - 1) >= ysize) return; + + size_t res_ix = res_y * res_xsize + res_x; + size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); + + double block0[3 * kBlockEdge * kBlockEdge]; + double block1[3 * kBlockEdge * kBlockEdge]; + + double *block0_r = &block0[0]; + double *block0_g = &block0[kBlockEdge * kBlockEdge]; + double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; + + double *block1_r = &block1[0]; + double *block1_g = &block1[kBlockEdge * kBlockEdge]; + double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; + + for (int y = 0; y < kBlockEdge; y++) + { + for (int x = 0; x < kBlockEdge; x++) + { + block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x]; + block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; + block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; + block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; + block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; + block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; + } + } + + double diff_xyb_dc[3] = { 0.0 }; + double diff_xyb_ac[3] = { 0.0 }; + double diff_xyb_edge_dc[3] = { 0.0 }; + + ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + + for (int i = 0; i < 3; i++) + { + block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; + block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; + } +} + +__kernel void clEdgeDetectorLowFreqEx( + __global float *block_diff_ac, + const int res_xsize, const int res_ysize, + __global const float *r, __global const float *g, 
__global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + int xsize, int ysize, int step_) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; + + const int step = 8; + if (res_x < step / step_) return; + + int x = (res_x - (step / step_)) * step_; + int y = res_y * step_; + + if (x + step >= xsize) return; + if (y + step >= ysize) return; + + int ix = y * xsize + x; + + double diff[4][3]; + __global const float* blurred0[3] = { r, g, b }; + __global const float* blurred1[3] = { r2, g2, b2 }; + + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize - 6; + diff[3][i] = x < step ? 
0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + double max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + double diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); + } + } + + int res_ix = res_y * res_xsize + res_x; + + const double kMul = 10; + + block_diff_ac[res_ix * 3] += max_diff_xyb[0] * kMul; + block_diff_ac[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; + block_diff_ac[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; +} + +__kernel void clDiffPrecomputeEx( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + const int xsize, const int ysize, + __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b, + __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + double valsh0[3] = { 0.0 }; + double valsv0[3] = { 0.0 }; + double valsh1[3] = { 0.0 }; + double valsv1[3] = { 0.0 }; + int ix2; + + int ix = x + xsize * y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + double x1 = (xyb1_x[ix] - xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + double x1 = (xyb1_x[ix] - 
xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } + + double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]); + double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]); + double m = min(sup0, sup1); + mask_x[ix] = (float)(m); + + sup0 = fabs(valsh0[1]) + fabs(valsv0[1]); + sup1 = fabs(valsh1[1]) + fabs(valsv1[1]); + m = min(sup0, sup1); + mask_y[ix] = (float)(m); + + sup0 = fabs(valsh0[2]) + fabs(valsv0[2]); + sup1 = fabs(valsh1[2]) + fabs(valsv1[2]); + m = min(sup0, sup1); + mask_b[ix] = (float)(m); +} + +__kernel void clScaleImageEx(__global float *img, const int size, float scale) +{ + const int i = get_global_id(0); + if (i >= size) return; + + img[i] *= scale; +} + +#define Average5x5_w 0.679144890667f +__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); +__kernel void clAverage5x5Ex(__global float *img, const int xsize, const int ysize, __global const float *img_org) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + const int row0 = y * xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[row0 + x - 1]; + } + if (x + 1 < xsize) { + img[row0 + x] += img_org[row0 + x + 1]; + } + + if (y > 0) { + const int rowd1 = row0 - xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w; + } + img[row0 + x] += img_org[rowd1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w; + } + } + + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w; + } + img[row0 + x] += img_org[rowu1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w; + } + } + + img[row0 + x] *= Average5x5_scale; +} + +__kernel void clMinSquareValEx(__global float* result, const int xsize, const int ysize, __global const float* img, int square_size, int offset) 
+{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + + int minH = offset > y ? 0 : y - offset; + int maxH = min(y + square_size - offset, ysize); + + int minW = offset > x ? 0 : x - offset; + int maxW = min(x + square_size - offset, xsize); + + float minValue = img[minH * xsize + minW]; + + for (int j = minH; j < maxH; j++) + { + for (int i = minW; i < maxW; i++) + { + float tmp = img[j * xsize + i]; + if (tmp < minValue) minValue = tmp; + } + } + + result[y * xsize + x] = minValue; +} + +__kernel void clDoMaskEx( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + const int xsize, const int ysize, + __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, + __global const double *lut_x, __global const double *lut_y, __global const double *lut_b, + __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const double w00 = 232.206464018; + const double w11 = 22.9455222245; + const double w22 = 503.962310606; + + const size_t idx = y * xsize + x; + const double s0 = mask_x[idx]; + const double s1 = mask_y[idx]; + const double s2 = mask_b[idx]; + const double p0 = w00 * s0; + const double p1 = w11 * s1; + const double p2 = w22 * s2; + + mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0)); + mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1)); + mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2)); + mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0)); + mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1)); + mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2)); +} + +__kernel void clCombineChannelsEx( + __global float *result, + __global const float *mask_x, __global const float *mask_y, __global const float *mask_b, + __global const float *mask_dc_x, 
__global const float *mask_dc_y, __global const float *mask_dc_b, + const int xsize, const int ysize, + __global const float *block_diff_dc, + __global const float *block_diff_ac, + __global float *edge_detector_map, + const int res_xsize, + const int step) +{ + const int res_x = get_global_id(0) * step; + const int res_y = get_global_id(1) * step; + + double mask[3]; + double dc_mask[3]; + mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; + + mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; + + mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; + + size_t res_ix = (res_y * res_xsize + res_x) / step; + result[res_ix] = (float)( + DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + + DotProduct(&block_diff_ac[3 * res_ix], mask) + + DotProduct(&edge_detector_map[3 * res_ix], mask)); +} + +__kernel void clUpsampleSquareRootEx(__global float *diffmap_out, __global const float *diffmap, int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + const int pos_x = res_x * step; + const int pos_y = res_y * step; + + if (pos_y + 8 - step >= ysize) return; + if (pos_x + 8 - step >= xsize) return; + + int s2 = (8 - step) / 2; + + // Upsample and take square root. + float orig_val = diffmap[res_y * res_xsize + res_x]; + + const float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : sqrt(orig_val); + + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val; + } + } +} + +__kernel void clRemoveBorderEx(__global float *out, const int xsize, const int ysize, __global const float *in, int s, int s2) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + + out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; +} + +__kernel void clAddBorderEx(__global float *out, const int xsize, const int ysize, int s, int s2, __global const float *in) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize - s || + y >= ysize - s) + { + return; + } + + const double mul1 = 24.8235314874; + out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; + +} + +__kernel void clComputeBlockZeroingOrderEx( + __global const coeff_t *orig_batch_0, // Coeffs of Original image. + __global const coeff_t *orig_batch_1, // Coeffs of Original image. + __global const coeff_t *orig_batch_2, // Coeffs of Original image. + __global const float *orig_image_batch, // pregamma of Original image.. + __global const float *mask_scale, // mask_scale of Original image.. + const int block_xsize, + const int block_ysize, + const int image_width, + const int image_height, + + __global const coeff_t *mayout_batch_0, // Coeffs of output image. + __global const coeff_t *mayout_batch_1, // Coeffs of output image. + __global const coeff_t *mayout_batch_2, // Coeffs of output image. + __global const ushort *mayout_pixel_0, + __global const ushort *mayout_pixel_1, + __global const ushort *mayout_pixel_2, + + const channel_info mayout_channel_0, + const channel_info mayout_channel_1, + const channel_info mayout_channel_2, + const int factor, // Current factor in computing. + const int comp_mask, // Current channel in computing. 
+ const float BlockErrorLimit, + __global CoeffData *output_order_list/*out*/) +{ + const int block_x = get_global_id(0); + const int block_y = get_global_id(1); + + if (block_x >= block_xsize || block_y >= block_ysize) return; + + channel_info orig_channel[3]; + orig_channel[0].coeff = orig_batch_0; + orig_channel[1].coeff = orig_batch_1; + orig_channel[2].coeff = orig_batch_2; + + channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 }; + mayout_channel[0].coeff = mayout_batch_0; + mayout_channel[1].coeff = mayout_batch_1; + mayout_channel[2].coeff = mayout_batch_2; + mayout_channel[0].pixel = mayout_pixel_0; + mayout_channel[1].pixel = mayout_pixel_1; + mayout_channel[2].pixel = mayout_pixel_2; + + int block_idx = 0; + + coeff_t mayout_block[kComputeBlockSize] = { 0 }; + coeff_t orig_block[kComputeBlockSize] = { 0 }; + + for (int c = 0; c < 3; c++) { + if (comp_mask & (1< 0) + { + float best_err = 1e17f; + int best_i = 0; + for (int i = 0; i < min(3, input_order.size); i++) + { + const int idx = input_order.pData[i].idx; + coeff_t old_coeff = mayout_block[idx]; + mayout_block[idx] = 0; + + + float max_err = CompareBlockFactor(mayout_channel, + mayout_block, + block_x, + block_y, + orig_image_batch, + mask_scale, + image_width, + image_height, + factor); + if (max_err < best_err) + { + best_err = max_err; + best_i = i; + } + mayout_block[idx] = old_coeff; + } + + if (best_err >= BlockErrorLimit) + { // The input_order is an ascent vector, break when best_err exceed the error limit. 
+ break; + } + int idx = input_order.pData[best_i].idx; + mayout_block[idx] = 0; + list_erase(&input_order, best_i); + + list_push_back(&output_order, idx, best_err); + } + + float min_err = 1e10; + for (int i = output_order.size - 1; i >= 0; --i) { + min_err = min(min_err, output_order.pData[i].err); + output_order.pData[i].err = min_err; + } + + __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; + + int out_count = 0; + for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) + { + // err exceeding the limit is no need to continue. + if (output_order.pData[i].err <= BlockErrorLimit) + { + output_block[out_count].idx = output_order.pData[i].idx; + output_block[out_count].err = output_order.pData[i].err; + out_count++; + } + } +} + +__device__ void Butteraugli8x8CornerEdgeDetectorDiff( + int pos_x, + int pos_y, + int xsize, + int ysize, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + double* diff_xyb) +{ + int local_count = 0; + double local_xyb[3] = { 0 }; + const double w = 0.711100840192; + + int offset[4][2] = { { 0,0 },{ 0,7 },{ 7,0 },{ 7,7 } }; + int edgeSize = 3; + + for (int k = 0; k < 4; k++) + { + int x = pos_x + offset[k][0]; + int y = pos_y + offset[k][1]; + + if (x >= edgeSize && x + edgeSize < xsize) { + size_t ix = y * xsize + (x - edgeSize); + size_t ix2 = ix + 2 * edgeSize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] - b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= edgeSize && y + edgeSize < ysize) { + size_t ix = (y - edgeSize) * xsize + x; + size_t ix2 = ix + 2 * edgeSize * xsize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] 
- b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + } + + const double weight = 0.01617112696; + const double mul = weight * 8.0 / local_count; + for (int i = 0; i < 3; ++i) { + diff_xyb[i] += mul * local_xyb[i]; + } +} + +__device__ double DotProduct(__global const float u[3], const double v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} + +__device__ double Interpolate(__constant_ex const double *array, const int size, const double sx) { + double ix = fabs(sx); + + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} + +#define XybToVals_off_x 11.38708334481672 +#define XybToVals_inc_x 14.550189611520716 +__constant double XybToVals_lut_x[21] = { + 0, + XybToVals_off_x, + XybToVals_off_x + 1 * XybToVals_inc_x, + XybToVals_off_x + 2 * XybToVals_inc_x, + XybToVals_off_x + 3 * XybToVals_inc_x, + XybToVals_off_x + 4 * XybToVals_inc_x, + XybToVals_off_x + 5 * XybToVals_inc_x, + XybToVals_off_x + 6 * XybToVals_inc_x, + XybToVals_off_x + 7 * XybToVals_inc_x, + XybToVals_off_x + 8 * XybToVals_inc_x, + XybToVals_off_x + 9 * XybToVals_inc_x, + XybToVals_off_x + 10 * XybToVals_inc_x, + XybToVals_off_x + 11 * XybToVals_inc_x, + XybToVals_off_x + 12 * XybToVals_inc_x, + XybToVals_off_x + 13 * XybToVals_inc_x, + XybToVals_off_x + 14 * XybToVals_inc_x, + XybToVals_off_x + 15 * XybToVals_inc_x, + XybToVals_off_x + 16 * XybToVals_inc_x, + XybToVals_off_x + 17 * XybToVals_inc_x, + XybToVals_off_x + 18 * XybToVals_inc_x, + XybToVals_off_x + 19 * XybToVals_inc_x, +}; + +#define XybToVals_off_y 1.4103373714040413 +#define XybToVals_inc_y 0.7084088867024 +__constant double XybToVals_lut_y[21] = { + 0, + XybToVals_off_y, + XybToVals_off_y + 1 * XybToVals_inc_y, + XybToVals_off_y + 2 * XybToVals_inc_y, + XybToVals_off_y + 3 * XybToVals_inc_y, + XybToVals_off_y 
+ 4 * XybToVals_inc_y, + XybToVals_off_y + 5 * XybToVals_inc_y, + XybToVals_off_y + 6 * XybToVals_inc_y, + XybToVals_off_y + 7 * XybToVals_inc_y, + XybToVals_off_y + 8 * XybToVals_inc_y, + XybToVals_off_y + 9 * XybToVals_inc_y, + XybToVals_off_y + 10 * XybToVals_inc_y, + XybToVals_off_y + 11 * XybToVals_inc_y, + XybToVals_off_y + 12 * XybToVals_inc_y, + XybToVals_off_y + 13 * XybToVals_inc_y, + XybToVals_off_y + 14 * XybToVals_inc_y, + XybToVals_off_y + 15 * XybToVals_inc_y, + XybToVals_off_y + 16 * XybToVals_inc_y, + XybToVals_off_y + 17 * XybToVals_inc_y, + XybToVals_off_y + 18 * XybToVals_inc_y, + XybToVals_off_y + 19 * XybToVals_inc_y, +}; + +__device__ void XybToVals( + double x, double y, double z, + double *valx, double *valy, double *valz) +{ + const double xmul = 0.758304045695; + const double ymul = 2.28148649801; + const double zmul = 1.87816926918; + + *valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul); + *valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul); + *valz = zmul * z; +} + +#define XybLowFreqToVals_inc 5.2511644570349185 +__constant double XybLowFreqToVals_lut[21] = { + 0, + 1 * XybLowFreqToVals_inc, + 2 * XybLowFreqToVals_inc, + 3 * XybLowFreqToVals_inc, + 4 * XybLowFreqToVals_inc, + 5 * XybLowFreqToVals_inc, + 6 * XybLowFreqToVals_inc, + 7 * XybLowFreqToVals_inc, + 8 * XybLowFreqToVals_inc, + 9 * XybLowFreqToVals_inc, + 10 * XybLowFreqToVals_inc, + 11 * XybLowFreqToVals_inc, + 12 * XybLowFreqToVals_inc, + 13 * XybLowFreqToVals_inc, + 14 * XybLowFreqToVals_inc, + 15 * XybLowFreqToVals_inc, + 16 * XybLowFreqToVals_inc, + 17 * XybLowFreqToVals_inc, + 18 * XybLowFreqToVals_inc, + 19 * XybLowFreqToVals_inc, + 20 * XybLowFreqToVals_inc, +}; + +__device__ void XybLowFreqToVals(double x, double y, double z, + double *valx, double *valy, double *valz) { + const double xmul = 6.64482198135; + const double ymul = 0.837846224276; + const double zmul = 7.34905756986; + const double y_to_z_mul = 0.0812519812628; + + z += y_to_z_mul * y; + *valz = z 
* zmul; + *valx = x * xmul; + *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); +} + +__device__ double InterpolateClampNegative(__global const double *array, + int size, double sx) { + if (sx < 0) { + sx = 0; + } + double ix = fabs(sx); + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]) { + double valx0, valy0, valz0; + double valx1, valy1, valz1; + XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + //PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. 
+ double valx = valx0 - valx1; + double valy = valy0 - valy1; + double valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} + +typedef struct __Complex +{ + double real; + double imag; +}Complex; + +__constant double kSqrtHalf = 0.70710678118654752440084436210484903; +__device__ void RealFFT8(const double* in, Complex* out) { + double t1, t2, t3, t5, t6, t7, t8; + t8 = in[6]; + t5 = in[2] - t8; + t8 += in[2]; + out[2].real = t8; + out[6].imag = -t5; + out[4].imag = t5; + t8 = in[4]; + t3 = in[0] - t8; + t8 += in[0]; + out[0].real = t8; + out[4].real = t3; + out[6].real = t3; + t7 = in[5]; + t3 = in[1] - t7; + t7 += in[1]; + out[1].real = t7; + t8 = in[7]; + t5 = in[3] - t8; + t8 += in[3]; + out[3].real = t8; + t2 = -t5; + t6 = t3 - t5; + t8 = kSqrtHalf; + t6 *= t8; + out[5].real = out[4].real - t6; + t1 = t3 + t5; + t1 *= t8; + out[5].imag = out[4].imag - t1; + t6 += out[4].real; + out[4].real = t6; + t1 += out[4].imag; + out[4].imag = t1; + t5 = t2 - t3; + t5 *= t8; + out[7].imag = out[6].imag - t5; + t2 += t3; + t2 *= t8; + out[7].real = out[6].real - t2; + t2 += out[6].real; + out[6].real = t2; + t5 += out[6].imag; + out[6].imag = t5; + t5 = out[2].real; + t1 = out[0].real - t5; + t7 = out[3].real; + t5 += out[0].real; + t3 = out[1].real - t7; + t7 += out[1].real; + t8 = t5 + t7; + out[0].real = t8; + t5 -= t7; + out[1].real = t5; + out[2].imag = t3; + out[3].imag = -t3; + out[3].real = t1; + out[2].real = t1; + out[0].imag = 0; + out[1].imag = 0; + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. 
+ Complex tmp = out[2]; + out[2] = out[3]; + out[3] = out[5]; + out[5] = out[7]; + out[7] = out[4]; + out[4] = out[1]; + out[1] = out[6]; + out[6] = tmp; +} + +__device__ void TransposeBlock(Complex data[kBlockSize]) { + for (int i = 0; i < kBlockEdge; i++) { + for (int j = 0; j < i; j++) { + Complex tmp = data[kBlockEdge * i + j]; + data[kBlockEdge * i + j] = data[kBlockEdge * j + i]; + data[kBlockEdge * j + i] = tmp; + } + } +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. +__device__ inline void FFT4(Complex* a) { + double t1, t2, t3, t4, t5, t6, t7, t8; + t5 = a[2].real; + t1 = a[0].real - t5; + t7 = a[3].real; + t5 += a[0].real; + t3 = a[1].real - t7; + t7 += a[1].real; + t8 = t5 + t7; + a[0].real = t8; + t5 -= t7; + a[1].real = t5; + t6 = a[2].imag; + t2 = a[0].imag - t6; + t6 += a[0].imag; + t5 = a[3].imag; + a[2].imag = t2 + t3; + t2 -= t3; + a[3].imag = t2; + t4 = a[1].imag - t5; + a[3].real = t1 + t4; + t1 -= t4; + a[2].real = t1; + t5 += a[1].imag; + a[0].imag = t6 + t5; + t6 -= t5; + a[1].imag = t6; +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. 
+__device__ void FFT8(Complex* a) { + const double kSqrtHalf = 0.70710678118654752440084436210484903; + double t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = a[4].imag; + t4 = a[0].imag - t7; + t7 += a[0].imag; + a[0].imag = t7; + + t8 = a[6].real; + t5 = a[2].real - t8; + t8 += a[2].real; + a[2].real = t8; + + t7 = a[6].imag; + a[6].imag = t4 - t5; + t4 += t5; + a[4].imag = t4; + + t6 = a[2].imag - t7; + t7 += a[2].imag; + a[2].imag = t7; + + t8 = a[4].real; + t3 = a[0].real - t8; + t8 += a[0].real; + a[0].real = t8; + + a[4].real = t3 - t6; + t3 += t6; + a[6].real = t3; + + t7 = a[5].real; + t3 = a[1].real - t7; + t7 += a[1].real; + a[1].real = t7; + + t8 = a[7].imag; + t6 = a[3].imag - t8; + t8 += a[3].imag; + a[3].imag = t8; + t1 = t3 - t6; + t3 += t6; + + t7 = a[5].imag; + t4 = a[1].imag - t7; + t7 += a[1].imag; + a[1].imag = t7; + + t8 = a[7].real; + t5 = a[3].real - t8; + t8 += a[3].real; + a[3].real = t8; + + t2 = t4 - t5; + t4 += t5; + + t6 = t1 - t4; + t8 = kSqrtHalf; + t6 *= t8; + a[5].real = a[4].real - t6; + t1 += t4; + t1 *= t8; + a[5].imag = a[4].imag - t1; + t6 += a[4].real; + a[4].real = t6; + t1 += a[4].imag; + a[4].imag = t1; + + t5 = t2 - t3; + t5 *= t8; + a[7].imag = a[6].imag - t5; + t2 += t3; + t2 *= t8; + a[7].real = a[6].real - t2; + t2 += a[6].real; + a[6].real = t2; + t5 += a[6].imag; + a[6].imag = t5; + + FFT4(a); + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. 
+ Complex tmp = a[2]; + a[2] = a[3]; + a[3] = a[5]; + a[5] = a[7]; + a[7] = a[4]; + a[4] = a[1]; + a[1] = a[6]; + a[6] = tmp; +} + +__device__ double abssq(const Complex c) { + return c.real * c.real + c.imag * c.imag; +} + +__device__ void ButteraugliFFTSquared(__private double block[kBlockSize]) { + double global_mul = 0.000064; + Complex block_c[kBlockSize]; + + for (int y = 0; y < kBlockEdge; ++y) { + RealFFT8(block + y * kBlockEdge, block_c + y * kBlockEdge); + } + TransposeBlock(block_c); + double r0[kBlockEdge]; + double r1[kBlockEdge]; + for (int x = 0; x < kBlockEdge; ++x) { + r0[x] = block_c[x].real; + r1[x] = block_c[kBlockHalf + x].real; + } + RealFFT8(r0, block_c); + RealFFT8(r1, block_c + kBlockHalf); + for (int y = 1; y < kBlockEdgeHalf; ++y) { + FFT8(block_c + y * kBlockEdge); + } + for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + block[i] = abssq(block_c[i]); + block[i] *= global_mul; + } +} + +__device__ double RemoveRangeAroundZero(double v, double range) { + if (v >= -range && v < range) { + return 0; + } + if (v < 0) { + return v + range; + } + else { + return v - range; + } +} + +#define MakeHighFreqColorDiffDy_off 1.4103373714040413 +#define MakeHighFreqColorDiffDy_inc 0.7084088867024 +__constant double MakeHighFreqColorDiffDy_lut[21] = { + 0.0, + MakeHighFreqColorDiffDy_off, + MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, + 
MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, +}; + +__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, +}; + +// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared +// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average +// diff on the edges to diff_xyb_edge_dc. 
+__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], + __private double xyb1[3 * kBlockSize], + double diff_xyb_dc[3], + double diff_xyb_ac[3], + double diff_xyb_edge_dc[3]) { + + double avgdiff_xyb[3] = { 0.0 }; + double avgdiff_edge[3][4] = { { 0.0 } }; + + for (int i = 0; i < 3 * kBlockSize; ++i) { + const double diff_xyb = xyb0[i] - xyb1[i]; + const int c = i / kBlockSize; + avgdiff_xyb[c] += diff_xyb / kBlockSize; + const int k = i % kBlockSize; + const int kx = k % kBlockEdge; + const int ky = k / kBlockEdge; + const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1; + const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 2 : -1; + if (h_edge_idx >= 0) { + avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge; + } + if (v_edge_idx >= 0) { + avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge; + } + } + XybDiffLowFreqSquaredAccumulate(avgdiff_xyb[0], + avgdiff_xyb[1], + avgdiff_xyb[2], + 0, 0, 0, csf8x8[0], + diff_xyb_dc); + for (int i = 0; i < 4; ++i) { + XybDiffLowFreqSquaredAccumulate(avgdiff_edge[0][i], + avgdiff_edge[1][i], + avgdiff_edge[2][i], + 0, 0, 0, csf8x8[0], + diff_xyb_edge_dc); + } + + double* xyb_avg = xyb0; + double* xyb_halfdiff = xyb1; + for (int i = 0; i < 3 * kBlockSize; ++i) { + double avg = (xyb0[i] + xyb1[i]) / 2; + double halfdiff = (xyb0[i] - xyb1[i]) / 2; + xyb_avg[i] = avg; + xyb_halfdiff[i] = halfdiff; + } + double *y_avg = &xyb_avg[kBlockSize]; + double *x_halfdiff_squared = &xyb_halfdiff[0]; + double *y_halfdiff = &xyb_halfdiff[kBlockSize]; + double *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize]; + ButteraugliFFTSquared(y_avg); + ButteraugliFFTSquared(x_halfdiff_squared); + ButteraugliFFTSquared(y_halfdiff); + ButteraugliFFTSquared(z_halfdiff_squared); + + const double xmul = 64.8; + const double ymul = 1.753123908348329; + const double ymul2 = 1.51983458269; + const double zmul = 2.4; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + double d = csf8x8[i]; + diff_xyb_ac[0] += d 
* xmul * x_halfdiff_squared[i]; + diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i]; + + y_avg[i] = sqrt(y_avg[i]); + y_halfdiff[i] = sqrt(y_halfdiff[i]); + double y0 = y_avg[i] - y_halfdiff[i]; + double y1 = y_avg[i] + y_halfdiff[i]; + // Remove the impact of small absolute values. + // This improves the behavior with flat noise. + const double ylimit = 0.04; + y0 = RemoveRangeAroundZero(y0, ylimit); + y1 = RemoveRangeAroundZero(y1, ylimit); + if (y0 != y1) { + double valy0 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y0 * ymul2); + double valy1 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y1 * ymul2); + double valy = ymul * (valy0 - valy1); + diff_xyb_ac[1] += d * valy * valy; + } + } +} + +__constant static float g_mix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, +}; + +__device__ void OpsinAbsorbance(const double in[3], double out[3]) +{ + out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; + out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; + out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; +} + +__device__ double EvaluatePolynomial(const double x, __constant_ex const double *coefficients, int n) +{ + double b1 = 0.0; + double b2 = 0.0; + + for (int i = n - 1; i >= 0; i--) + { + if (i == 0) { + const double x_b1 = x * b1; + b1 = x_b1 - b2 + coefficients[0]; + break; + } + const double x_b1 = x * b1; + const double t = (x_b1 + x_b1) - b2 + coefficients[i]; + b2 = b1; + b1 = t; + } + + return b1; +} + +static __constant double g_gamma_p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, +}; + +static __constant double g_gamma_q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 
4.711532733641639, 0.899112889751053, 0.035662329617191, +}; + +__device__ double Gamma(double v) +{ + const double min_value = 0.770000000000000; + const double max_value = 274.579999999999984; + const double x01 = (v - min_value) / (max_value - min_value); + const double xc = 2.0 * x01 - 1.0; + + const double yp = EvaluatePolynomial(xc, g_gamma_p, 6); + const double yq = EvaluatePolynomial(xc, g_gamma_q, 6); + if (yq == 0.0) return 0.0; + return (float)(yp / yq); +} + +__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) +{ + const double a0 = 1.01611726948; + const double a1 = 0.982482243696; + const double a2 = 1.43571362627; + const double a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} + +__device__ int list_push_back(IntFloatPairList* list, int i, float f) +{ + list->pData[list->size].idx = i; + list->pData[list->size].err = f; + return ++list->size; +} + +__device__ int list_erase(IntFloatPairList* list, int idx) +{ + for (int i = idx; i < list->size - 1; i++) + { + list->pData[i].idx = list->pData[i + 1].idx; + list->pData[i].err = list->pData[i + 1].err; + } + return --list->size; +} + +__device__ int SortInputOrder(DCTScoreData* input_order, int size) +{ + int i, j; + DCTScoreData tmp; + for (j = 1; j < size; j++) { + tmp.idx = input_order[j].idx; + tmp.err = input_order[j].err; + + i = j - 1; + while (i >= 0 && input_order[i].err > tmp.err) { + input_order[i + 1].idx = input_order[i].idx; + input_order[i + 1].err = input_order[i].err; + i--; + } + input_order[i + 1].idx = tmp.idx; + input_order[i + 1].err = tmp.err; + } + return size; +} + +__constant static float csf[192] = { + 0.0f, + 1.71014f, + 0.298711f, + 0.233709f, + 0.223126f, + 0.207072f, + 0.192775f, + 0.161201f, + 2.05807f, + 0.222927f, + 0.203406f, + 0.188465f, + 0.184668f, + 0.169993f, + 0.159142f, + 0.130155f, + 0.430518f, + 0.204939f, + 0.206655f, + 0.192231f, + 0.182941f, + 0.169455f, + 0.157599f, + 
0.127153f, + 0.234757f, + 0.191098f, + 0.192698f, + 0.17425f, + 0.166503f, + 0.142154f, + 0.126182f, + 0.104196f, + 0.226117f, + 0.185373f, + 0.183825f, + 0.166643f, + 0.159414f, + 0.12636f, + 0.108696f, + 0.0911974f, + 0.207463f, + 0.171517f, + 0.170124f, + 0.141582f, + 0.126213f, + 0.103627f, + 0.0882436f, + 0.0751848f, + 0.196436f, + 0.161947f, + 0.159271f, + 0.126938f, + 0.109125f, + 0.0878027f, + 0.0749842f, + 0.0633859f, + 0.165232f, + 0.132905f, + 0.128679f, + 0.105766f, + 0.0906087f, + 0.0751544f, + 0.0641187f, + 0.0529921f, + 0.0f, + 0.147235f, + 0.11264f, + 0.0757892f, + 0.0493929f, + 0.0280663f, + 0.0075012f, + -0.000945567f, + 0.149251f, + 0.0964806f, + 0.0786224f, + 0.05206f, + 0.0292758f, + 0.00353094f, + -0.00277912f, + -0.00404481f, + 0.115551f, + 0.0793142f, + 0.0623735f, + 0.0405019f, + 0.0152656f, + -0.00145742f, + -0.00370369f, + -0.00375106f, + 0.0791547f, + 0.0537506f, + 0.0413634f, + 0.0193486f, + 0.000609066f, + -0.00510923f, + -0.0046452f, + -0.00385187f, + 0.0544534f, + 0.0334066f, + 0.0153899f, + 0.000539088f, + -0.00356085f, + -0.00535661f, + -0.00429145f, + -0.00343131f, + 0.0356439f, + 0.00865645f, + 0.00165229f, + -0.00425931f, + -0.00507324f, + -0.00459083f, + -0.003703f, + -0.00310327f, + 0.0121926f, + -0.0009259f, + -0.00330991f, + -0.00499378f, + -0.00437381f, + -0.00377427f, + -0.00311731f, + -0.00255125f, + -0.000320593f, + -0.00426043f, + -0.00416549f, + -0.00419364f, + -0.00365418f, + -0.00317499f, + -0.00255932f, + -0.00217917f, + 0.0f, + 0.143471f, + 0.124336f, + 0.0947465f, + 0.0814066f, + 0.0686776f, + 0.0588122f, + 0.0374415f, + 0.146315f, + 0.105334f, + 0.0949415f, + 0.0784241f, + 0.0689064f, + 0.0588304f, + 0.0495961f, + 0.0202342f, + 0.123818f, + 0.0952654f, + 0.0860556f, + 0.0724158f, + 0.0628307f, + 0.0529965f, + 0.0353941f, + 0.00815821f, + 0.097054f, + 0.080422f, + 0.0731085f, + 0.0636154f, + 0.055606f, + 0.0384127f, + 0.0142879f, + 0.00105195f, + 0.0849312f, + 0.071115f, + 0.0631183f, + 0.0552972f, + 0.0369221f, + 
0.00798314f, + 0.000716374f, + -0.00200948f, + 0.0722298f, + 0.0599559f, + 0.054841f, + 0.0387529f, + 0.0107262f, + 0.000355315f, + -0.00244803f, + -0.00335222f, + 0.0635335f, + 0.0514196f, + 0.0406309f, + 0.0125833f, + 0.00151305f, + -0.00140269f, + -0.00362547f, + -0.00337649f, + 0.0472024f, + 0.0198725f, + 0.0113437f, + 0.00266305f, + -0.00137183f, + -0.00354158f, + -0.00341292f, + -0.00290074f +}; + +__constant static float bias[192] = { + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0 +}; + +__device__ 
coeff_t _abs(coeff_t val) +{ + return val >= 0 ? val : -val; +} + +__device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) +{ + int size = 0; + for (int c = 0; c < 3; ++c) { + for (int k = 1; k < block_size; ++k) { + int idx = c * block_size + k; + if (block[idx] != 0) { + float score = _abs(orig_block[idx]) * csf[idx] + bias[idx]; + size = list_push_back(input_order, idx, score); + } + } + } + return SortInputOrder(input_order->pData, size); +} + +__constant static int kIDCTMatrix[kDCTBlockSize] = { + 8192, 11363, 10703, 9633, 8192, 6437, 4433, 2260, + 8192, 9633, 4433, -2259, -8192, -11362, -10704, -6436, + 8192, 6437, -4433, -11362, -8192, 2261, 10704, 9633, + 8192, 2260, -10703, -6436, 8192, 9633, -4433, -11363, + 8192, -2260, -10703, 6436, 8192, -9633, -4433, 11363, + 8192, -6437, -4433, 11362, -8192, -2261, 10704, -9633, + 8192, -9633, 4433, 2259, -8192, 11362, -10704, 6436, + 8192, -11363, 10703, -9633, 8192, -6437, 4433, -2260, +}; + +// Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]} +__device__ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { + int tmp0, tmp1, tmp2, tmp3, tmp4; + + tmp1 = kIDCTMatrix[0] * in[0]; + out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = tmp1; + + tmp0 = in[stride]; + tmp1 = kIDCTMatrix[1] * tmp0; + tmp2 = kIDCTMatrix[9] * tmp0; + tmp3 = kIDCTMatrix[17] * tmp0; + tmp4 = kIDCTMatrix[25] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[2 * stride]; + tmp1 = kIDCTMatrix[2] * tmp0; + tmp2 = kIDCTMatrix[10] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] -= tmp2; + out[3] -= tmp1; + out[4] -= tmp1; + out[5] -= tmp2; + out[6] += tmp2; + out[7] += tmp1; + + tmp0 = in[3 * stride]; + tmp1 = kIDCTMatrix[3] * tmp0; + tmp2 = kIDCTMatrix[11] * tmp0; + tmp3 = 
kIDCTMatrix[19] * tmp0; + tmp4 = kIDCTMatrix[27] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[4 * stride]; + tmp1 = kIDCTMatrix[4] * tmp0; + out[0] += tmp1; + out[1] -= tmp1; + out[2] -= tmp1; + out[3] += tmp1; + out[4] += tmp1; + out[5] -= tmp1; + out[6] -= tmp1; + out[7] += tmp1; + + tmp0 = in[5 * stride]; + tmp1 = kIDCTMatrix[5] * tmp0; + tmp2 = kIDCTMatrix[13] * tmp0; + tmp3 = kIDCTMatrix[21] * tmp0; + tmp4 = kIDCTMatrix[29] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[6 * stride]; + tmp1 = kIDCTMatrix[6] * tmp0; + tmp2 = kIDCTMatrix[14] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] -= tmp2; + out[3] -= tmp1; + out[4] -= tmp1; + out[5] -= tmp2; + out[6] += tmp2; + out[7] += tmp1; + + tmp0 = in[7 * stride]; + tmp1 = kIDCTMatrix[7] * tmp0; + tmp2 = kIDCTMatrix[15] * tmp0; + tmp3 = kIDCTMatrix[23] * tmp0; + tmp4 = kIDCTMatrix[31] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; +} + +__device__ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) +{ + coeff_t colidcts[kDCTBlockSize]; + const int kColScale = 11; + const int kColRound = 1 << (kColScale - 1); + for (int x = 0; x < 8; ++x) + { + int colbuf[8] = { 0 }; + Compute1dIDCT(&block[x], 8, colbuf); + for (int y = 0; y < 8; ++y) + { + colidcts[8 * y + x] = (colbuf[y] + kColRound) >> kColScale; + } + } + const int kRowScale = 18; + const int kRowRound = 257 << (kRowScale - 1); // includes offset by 128 + for (int y = 0; y < 8; ++y) + { + const int rowidx = 8 * y; + int rowbuf[8] = { 0 }; + Compute1dIDCT(&colidcts[rowidx], 1, rowbuf); + for (int x = 0; x < 8; ++x) { + out[rowidx + x] = max(0, min(255, (rowbuf[x] + kRowRound) >> kRowScale)); + } + } +} 
+ +__device__ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) +{ + const int block_x = 0; + const int block_y = 0; + const int width_ = 8; + const int height_ = 8; + + for (int iy = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix) { + int x = 8 * block_x + ix; + int y = 8 * block_y + iy; + if (x >= width_ || y >= height_) continue; + int p = y * width_ + x; + pixels_[p] = idct[8 * iy + ix] << 4; + } + } +} + +__device__ void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + // Fill in the 10x10 pixel area in the subsampled image that will be the + // basis of the upsampling. This area is enough to hold the 3x3 kernel of + // the fancy upsampler around each pixel. +#define kSubsampledEdgeSize 10 + ushort subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize]; + for (int j = 0; j < kSubsampledEdgeSize; ++j) { + // The order we fill in the rows is: + // 8 rows intersecting the block, row below, row above + const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2); + for (int i = 0; i < kSubsampledEdgeSize; ++i) { + // The order we fill in each row is: + // 8 pixels within the block, left edge, right edge + const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) + + (i < 9 ? i + 1 : 0)); + const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2); + if (x0 < 0) { + subsampled[ix] = subsampled[ix + 1]; + } + else if (y0 < 0) { + subsampled[ix] = subsampled[ix + kSubsampledEdgeSize]; + } + else if (x0 >= width_) { + subsampled[ix] = subsampled[ix - 1]; + } + else if (y0 >= height_) { + subsampled[ix] = subsampled[ix - kSubsampledEdgeSize]; + } + else if (i < 8 && j < 8) { + subsampled[ix] = idct[j * 8 + i] << 4; + } + else { + // Reconstruct the subsampled pixels around the edge of the current + // block by computing the inverse of the fancy upsampler. 
+ const int y1 = max(y0 - 1, 0); + const int x1 = max(x0 - 1, 0); + subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 + + pixel_orig[y1 * width_ + x1] + + pixel_orig[y0 * width_ + x1] * -3 + + pixel_orig[y1 * width_ + x0] * -3) >> 2; + } + } + } + // Determine area to update. + int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0); + int xmax = min(block_x * 16 + 15, width_ - 1); + int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0); + int ymax = min(block_y * 16 + 15, height_ - 1); + + // Apply the fancy upsampler on the subsampled block. + for (int y = ymin; y <= ymax; ++y) { + const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize; + const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize; + for (int x = xmin; x <= xmax; ++x) { + const int x0 = (x & ~1) / 2 - block_x * 8 + 1; + const int dx = (x & 1) * 2 - 1; + const int ix = x0 + y0; + + int out_x = x - xmin; + int out_y = y - ymin; + + pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + + subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4; + } + } +} + +// out = [YUVYUV....YUVYUV] +__device__ void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/) +{ + const int stride = 3; + + for (int y = 0; y < xsize; ++y) { + for (int x = 0; x < ysize; ++x) { + int px = y * xsize + x; + *out = (uchar)((pixels_[px] + 8 - (x & 1)) >> 4); + out += stride; + } + } +} + +__constant static int kCrToRedTable[256] = { + -179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164, + -163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147, + -146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130, + -129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114, + -112, -111, -109, -108, -107, -105, -104, -102, -101, -100, -98, -97, + -95, -94, -93, -91, -90, -88, -87, -86, -84, -83, -81, -80, + -79, -77, -76, -74, -73, -72, -70, -69, -67, -66, -64, -63, + -62, -60, -59, -57, -56, -55, 
-53, -52, -50, -49, -48, -46, + -45, -43, -42, -41, -39, -38, -36, -35, -34, -32, -31, -29, + -28, -27, -25, -24, -22, -21, -20, -18, -17, -15, -14, -13, + -11, -10, -8, -7, -6, -4, -3, -1, 0, 1, 3, 4, + 6, 7, 8, 10, 11, 13, 14, 15, 17, 18, 20, 21, + 22, 24, 25, 27, 28, 29, 31, 32, 34, 35, 36, 38, + 39, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 55, + 56, 57, 59, 60, 62, 63, 64, 66, 67, 69, 70, 72, + 73, 74, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, + 90, 91, 93, 94, 95, 97, 98, 100, 101, 102, 104, 105, + 107, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, + 123, 125, 126, 128, 129, 130, 132, 133, 135, 136, 137, 139, + 140, 142, 143, 144, 146, 147, 149, 150, 151, 153, 154, 156, + 157, 158, 160, 161, 163, 164, 165, 167, 168, 170, 171, 172, + 174, 175, 177, 178 +}; + +__constant static int kCbToBlueTable[256] = { + -227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207, + -206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186, + -184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165, + -163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144, + -142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122, + -120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101, + -99, -97, -96, -94, -92, -90, -89, -87, -85, -83, -82, -80, + -78, -76, -74, -73, -71, -69, -67, -66, -64, -62, -60, -58, + -57, -55, -53, -51, -50, -48, -46, -44, -43, -41, -39, -37, + -35, -34, -32, -30, -28, -27, -25, -23, -21, -19, -18, -16, + -14, -12, -11, -9, -7, -5, -4, -2, 0, 2, 4, 5, + 7, 9, 11, 12, 14, 16, 18, 19, 21, 23, 25, 27, + 28, 30, 32, 34, 35, 37, 39, 41, 43, 44, 46, 48, + 50, 51, 53, 55, 57, 58, 60, 62, 64, 66, 67, 69, + 71, 73, 74, 76, 78, 80, 82, 83, 85, 87, 89, 90, + 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, + 113, 115, 117, 119, 120, 122, 124, 126, 128, 129, 131, 133, + 135, 136, 138, 140, 142, 144, 145, 147, 149, 151, 152, 154, + 156, 158, 159, 161, 163, 165, 167, 168, 170, 172, 
174, 175, + 177, 179, 181, 183, 184, 186, 188, 190, 191, 193, 195, 197, + 198, 200, 202, 204, 206, 207, 209, 211, 213, 214, 216, 218, + 220, 222, 223, 225, +}; + +__constant static int kCrToGreenTable[256] = { + 5990656, 5943854, 5897052, 5850250, 5803448, 5756646, 5709844, 5663042, + 5616240, 5569438, 5522636, 5475834, 5429032, 5382230, 5335428, 5288626, + 5241824, 5195022, 5148220, 5101418, 5054616, 5007814, 4961012, 4914210, + 4867408, 4820606, 4773804, 4727002, 4680200, 4633398, 4586596, 4539794, + 4492992, 4446190, 4399388, 4352586, 4305784, 4258982, 4212180, 4165378, + 4118576, 4071774, 4024972, 3978170, 3931368, 3884566, 3837764, 3790962, + 3744160, 3697358, 3650556, 3603754, 3556952, 3510150, 3463348, 3416546, + 3369744, 3322942, 3276140, 3229338, 3182536, 3135734, 3088932, 3042130, + 2995328, 2948526, 2901724, 2854922, 2808120, 2761318, 2714516, 2667714, + 2620912, 2574110, 2527308, 2480506, 2433704, 2386902, 2340100, 2293298, + 2246496, 2199694, 2152892, 2106090, 2059288, 2012486, 1965684, 1918882, + 1872080, 1825278, 1778476, 1731674, 1684872, 1638070, 1591268, 1544466, + 1497664, 1450862, 1404060, 1357258, 1310456, 1263654, 1216852, 1170050, + 1123248, 1076446, 1029644, 982842, 936040, 889238, 842436, 795634, + 748832, 702030, 655228, 608426, 561624, 514822, 468020, 421218, + 374416, 327614, 280812, 234010, 187208, 140406, 93604, 46802, + 0, -46802, -93604, -140406, -187208, -234010, -280812, -327614, + -374416, -421218, -468020, -514822, -561624, -608426, -655228, -702030, + -748832, -795634, -842436, -889238, -936040, -982842, -1029644, -1076446, + -1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862, + -1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278, + -1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694, + -2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110, + -2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526, + 
-2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942, + -3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358, + -3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774, + -4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190, + -4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606, + -4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022, + -5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438, + -5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854, +}; + +__constant static int kCbToGreenTable[256] = { + 2919680, 2897126, 2874572, 2852018, 2829464, 2806910, 2784356, 2761802, + 2739248, 2716694, 2694140, 2671586, 2649032, 2626478, 2603924, 2581370, + 2558816, 2536262, 2513708, 2491154, 2468600, 2446046, 2423492, 2400938, + 2378384, 2355830, 2333276, 2310722, 2288168, 2265614, 2243060, 2220506, + 2197952, 2175398, 2152844, 2130290, 2107736, 2085182, 2062628, 2040074, + 2017520, 1994966, 1972412, 1949858, 1927304, 1904750, 1882196, 1859642, + 1837088, 1814534, 1791980, 1769426, 1746872, 1724318, 1701764, 1679210, + 1656656, 1634102, 1611548, 1588994, 1566440, 1543886, 1521332, 1498778, + 1476224, 1453670, 1431116, 1408562, 1386008, 1363454, 1340900, 1318346, + 1295792, 1273238, 1250684, 1228130, 1205576, 1183022, 1160468, 1137914, + 1115360, 1092806, 1070252, 1047698, 1025144, 1002590, 980036, 957482, + 934928, 912374, 889820, 867266, 844712, 822158, 799604, 777050, + 754496, 731942, 709388, 686834, 664280, 641726, 619172, 596618, + 574064, 551510, 528956, 506402, 483848, 461294, 438740, 416186, + 393632, 371078, 348524, 325970, 303416, 280862, 258308, 235754, + 213200, 190646, 168092, 145538, 122984, 100430, 77876, 55322, + 32768, 10214, -12340, -34894, -57448, -80002, -102556, -125110, + -147664, -170218, -192772, -215326, -237880, -260434, -282988, -305542, + -328096, 
-350650, -373204, -395758, -418312, -440866, -463420, -485974, + -508528, -531082, -553636, -576190, -598744, -621298, -643852, -666406, + -688960, -711514, -734068, -756622, -779176, -801730, -824284, -846838, + -869392, -891946, -914500, -937054, -959608, -982162, -1004716, -1027270, + -1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702, + -1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134, + -1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566, + -1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998, + -1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430, + -1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862, + -2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294, + -2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726, + -2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158, + -2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590, +}; + +__constant static uchar kRangeLimitLut[4 * 256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +__device__ void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/) +{ + __constant_ex uchar* kRangeLimit = kRangeLimitLut + 384; + + for (int i = 0; i < size; i++) + { + uchar *pixel = &pixelBlock[i * 3]; + + int y = pixel[0]; + int cb = pixel[1]; + int cr = pixel[2]; + pixel[0] = kRangeLimit[y + kCrToRedTable[cr]]; + pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)]; + pixel[2] = kRangeLimit[y + 
kCbToBlueTable[cb]]; + } +} + +__constant static double kSrgb8ToLinearTable[256] = { + 0.000000, + 0.077399, + 0.154799, + 0.232198, + 0.309598, + 0.386997, + 0.464396, + 0.541796, + 0.619195, + 0.696594, + 0.773994, + 0.853367, + 0.937509, + 1.026303, + 1.119818, + 1.218123, + 1.321287, + 1.429375, + 1.542452, + 1.660583, + 1.783830, + 1.912253, + 2.045914, + 2.184872, + 2.329185, + 2.478910, + 2.634105, + 2.794824, + 2.961123, + 3.133055, + 3.310673, + 3.494031, + 3.683180, + 3.878171, + 4.079055, + 4.285881, + 4.498698, + 4.717556, + 4.942502, + 5.173584, + 5.410848, + 5.654341, + 5.904108, + 6.160196, + 6.422649, + 6.691512, + 6.966827, + 7.248640, + 7.536993, + 7.831928, + 8.133488, + 8.441715, + 8.756651, + 9.078335, + 9.406810, + 9.742115, + 10.084290, + 10.433375, + 10.789410, + 11.152432, + 11.522482, + 11.899597, + 12.283815, + 12.675174, + 13.073712, + 13.479465, + 13.892470, + 14.312765, + 14.740385, + 15.175366, + 15.617744, + 16.067555, + 16.524833, + 16.989614, + 17.461933, + 17.941824, + 18.429322, + 18.924460, + 19.427272, + 19.937793, + 20.456054, + 20.982090, + 21.515934, + 22.057618, + 22.607175, + 23.164636, + 23.730036, + 24.303404, + 24.884774, + 25.474176, + 26.071642, + 26.677203, + 27.290891, + 27.912736, + 28.542769, + 29.181020, + 29.827520, + 30.482299, + 31.145387, + 31.816813, + 32.496609, + 33.184802, + 33.881422, + 34.586499, + 35.300062, + 36.022139, + 36.752760, + 37.491953, + 38.239746, + 38.996169, + 39.761248, + 40.535013, + 41.317491, + 42.108710, + 42.908697, + 43.717481, + 44.535088, + 45.361546, + 46.196882, + 47.041124, + 47.894297, + 48.756429, + 49.627547, + 50.507676, + 51.396845, + 52.295078, + 53.202402, + 54.118843, + 55.044428, + 55.979181, + 56.923129, + 57.876298, + 58.838712, + 59.810398, + 60.791381, + 61.781686, + 62.781338, + 63.790363, + 64.808784, + 65.836627, + 66.873918, + 67.920679, + 68.976937, + 70.042715, + 71.118037, + 72.202929, + 73.297414, + 74.401516, + 75.515259, + 76.638668, + 77.771765, + 
78.914575, + 80.067122, + 81.229428, + 82.401518, + 83.583415, + 84.775142, + 85.976722, + 87.188178, + 88.409534, + 89.640813, + 90.882037, + 92.133229, + 93.394412, + 94.665609, + 95.946841, + 97.238133, + 98.539506, + 99.850982, + 101.172584, + 102.504334, + 103.846254, + 105.198366, + 106.560693, + 107.933256, + 109.316077, + 110.709177, + 112.112579, + 113.526305, + 114.950375, + 116.384811, + 117.829635, + 119.284868, + 120.750532, + 122.226647, + 123.713235, + 125.210317, + 126.717914, + 128.236047, + 129.764737, + 131.304005, + 132.853871, + 134.414357, + 135.985483, + 137.567270, + 139.159738, + 140.762907, + 142.376799, + 144.001434, + 145.636832, + 147.283012, + 148.939997, + 150.607804, + 152.286456, + 153.975971, + 155.676371, + 157.387673, + 159.109900, + 160.843070, + 162.587203, + 164.342319, + 166.108438, + 167.885578, + 169.673761, + 171.473005, + 173.283330, + 175.104755, + 176.937299, + 178.780982, + 180.635824, + 182.501843, + 184.379058, + 186.267489, + 188.167154, + 190.078073, + 192.000265, + 193.933749, + 195.878543, + 197.834666, + 199.802137, + 201.780975, + 203.771198, + 205.772826, + 207.785876, + 209.810367, + 211.846319, + 213.893748, + 215.952674, + 218.023115, + 220.105089, + 222.198615, + 224.303711, + 226.420395, + 228.548685, + 230.688599, + 232.840156, + 235.003373, + 237.178269, + 239.364861, + 241.563167, + 243.773205, + 245.994993, + 248.228549, + 250.473890, + 252.731035, + 255.000000, +}; + +__device__ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) +{ + YUVToRGB(yuv, xsize * ysize); + +#define lut kSrgb8ToLinearTable +// const __constant double* lut = kSrgb8ToLinearTable; + + for (int i = 0; i < xsize * ysize; i++) + { + r[i] = lut[yuv[3 * i]]; + g[i] = lut[yuv[3 * i + 1]]; + b[i] = lut[yuv[3 * i + 2]]; + } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < xsize; x++) + { + int idx = y * xsize 
+ (inside_x - 1); + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; + } + } + for (int y = inside_y; y < ysize; y++) + { + for (int x = 0; x < xsize; x++) + { + int idx = (inside_y - 1) * xsize + x; + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; + } + } +#undef lut +} + +__device__ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) +{ + uchar idct[3][8 * 8]; + CoeffToIDCT(&block[0], idct[0]); + CoeffToIDCT(&block[8 * 8], idct[1]); + CoeffToIDCT(&block[8 * 8 * 2], idct[2]); + + ushort pixels[3][8 * 8]; + IDCTToPixel8x8(idct[0], pixels[0]); + IDCTToPixel8x8(idct[1], pixels[1]); + IDCTToPixel8x8(idct[2], pixels[2]); + + uchar yuv[8 * 8 * 3]; + PixelToYUV(pixels[0], &yuv[0], 8, 8); + PixelToYUV(pixels[1], &yuv[1], 8, 8); + PixelToYUV(pixels[2], &yuv[2], 8, 8); + + YUVToRGB(yuv, 8 * 8); + + for (int i = 0; i < 8 * 8; i++) + { + r[i] = kSrgb8ToLinearTable[yuv[3 * i]]; + g[i] = kSrgb8ToLinearTable[yuv[3 * i + 1]]; + b[i] = kSrgb8ToLinearTable[yuv[3 * i + 2]]; + } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < 8; x++) + { + int idx = y * 8 + (inside_x - 1); + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } + for (int y = inside_y; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = (inside_y - 1) * 8 + x; + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } +} + +__device__ void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + uchar idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + ushort pixels[16 * 16]; + IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_); + + PixelToYUV(pixels, yuv, 16, 16); +} + +__device__ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global 
const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + coeff_t b[8 * 8]; + for (int i = 0; i < 8 * 8; i++) + { + b[i] = block[i]; + } + CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_); +} + +__device__ void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv) +{ + uchar idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + ushort pixels[8 * 8]; + IDCTToPixel8x8(idct, pixels); + + PixelToYUV(pixels, yuv, 8, 8); +} + +__device__ void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv) +{ + coeff_t b[8 * 8]; + for (int i = 0; i < 8 * 8; i++) + { + b[i] = block[i]; + } + + CoeffToYUV8x8(b, yuv); +} + +__device__ void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv16x16[idx16 * 3] = yuv8x8[idx * 3]; + } + } +} + +__device__ void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv8x8[idx * 3] = yuv16x16[idx16 * 3]; + } + } +} + +__device__ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + r[idx] = rgb16x16[0][idx16]; + g[idx] = rgb16x16[1][idx16]; + b[idx] = rgb16x16[2][idx16]; + } + } +} + +__device__ void Convolution(size_t xsize, size_t ysize, + int xstep, int len, int offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result) +{ + float weight_no_border = 0; + + for (size_t j = 0; j <= 2 * offset; ++j) { + weight_no_border += multipliers[j]; 
+ } + for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset) - 1; + float weight = 0.0; + for (int j = minx; j <= maxx; ++j) { + weight += multipliers[j - x + offset]; + } + // Interpolate linearly between the no-border scaling and border scaling. + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + for (size_t y = 0; y < ysize; ++y) { + float sum = 0.0; + for (int j = minx; j <= maxx; ++j) { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + result[ox * ysize + y] = (float)(sum * scale); + } + } +} + +__device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) +{ + const double sigma = 1.1; + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772 + const int diff = 2; // when sigma=1.1, diff's value is 2. + const int expn_size = 5; // when sigma=1.1, scaler is 5 + float expn[5] = { exp(scaler * (-diff) * (-diff)), + exp(scaler * (-diff + 1) * (-diff + 1)), + exp(scaler * (-diff + 2) * (-diff + 2)), + exp(scaler * (-diff + 3) * (-diff + 3)), + exp(scaler * (-diff + 4) * (-diff + 4))}; + const int xstep = 1; // when sigma=1.1, xstep is 1. 
+ const int ystep = xstep; + + int dxsize = (xsize + xstep - 1) / xstep; + + float tmp[8*8] = { 0 }; + Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp); + Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp, + border_ratio, output); +} + +__device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, + __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred, + int size) +{ + for (size_t i = 0; i < size; ++i) { + double sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. + double pre_rgb[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre_rgb, pre_mixed); + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + } + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = (float)(x); + g[i] = (float)(y); + b[i] = (float)(z); + } +} + +__device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, + float *xyb1_x, float *xyb1_y, float *xyb1_b, + const float *c0_x, const float *c0_y, const float *c0_b, + const float *c1_x, const float *c1_y, const float *c1_b, + int xsize, int ysize) +{ + for (int x = 0; x < xsize; ++x) + { + for (int y = 0; y < ysize; ++y) + { + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5f, + (c0_y[ix] + c1_y[ix]) * 0.5f, + (c0_b[ix] + c1_b[ix]) * 0.5f, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; 
dir < 4; ++dir) { + if (border[dir]) + { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) + { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); + } + } +} + +__device__ void floatcopy(float *dst, const float *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +__device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) +{ + float rgb_blurred[3][kDCTBlockSize]; + for (int i = 0; i < 3; i++) + { + BlurEx(rgb[i], 8, 8, 1.1, 0, rgb_blurred[i]); + } + OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); +} + +__device__ 
double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) +{ + CalcOpsinDynamicsImage(rgb1_c); + + float rgb0[3][kDCTBlockSize]; + float rgb1[3][kDCTBlockSize]; + + floatcopy(&rgb0[0][0], &rgb0_c[0][0], 3 * kDCTBlockSize); + floatcopy(&rgb1[0][0], &rgb1_c[0][0], 3 * kDCTBlockSize); + + MaskHighIntensityChangeBlock(rgb0[0], rgb0[1], rgb0[2], + rgb1[0], rgb1[1], rgb1[2], + rgb0_c[0], rgb0_c[1], rgb0_c[2], + rgb1_c[0], rgb1_c[1], rgb1_c[2], + 8, 8); + + double b0[3 * kDCTBlockSize]; + double b1[3 * kDCTBlockSize]; + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } + } + + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); + + double diff = 0.0; + double diff_edge = 0.0; + + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * mask_scale_block[c]; + diff += diff_xyz_ac[c] * mask_scale_block[c]; + diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c]; + } + const double kEdgeWeight = 0.05; + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); + +} + +// return the count of Non-zero item +__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) +{ + const int block_size = 64; + int size = 0; + for (int c = 0; c < 3; ++c) { + for (int k = 1; k < block_size; ++k) { + int idx = c * block_size + k; + if (block[idx] != 0) { + float score = _abs(orig_block[idx]) * csf[idx] + bias[idx]; + size = list_push_back(input_order, idx, score); + } + } + } + + return SortInputOrder(input_order->pData, size); +} + +__device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], + const __global float *orig_image_batch, + int width_, int height_, + int block_x, int 
block_y, + int factor, + int off_x, int off_y) +{ + int block_xx = block_x * factor + off_x; + int block_yy = block_y * factor + off_y; + if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1; + + const int block8_width = (width_ + 8 - 1) / 8; + + int block_ix = block_yy * block8_width + block_xx; + + __global const float* block_opsin = &orig_image_batch[block_ix * 3 * kDCTBlockSize]; + for (int i = 0; i < 3; i++) { + for (int k = 0; k < kDCTBlockSize; k++) { + rgb0_c[i][k] = block_opsin[i * kDCTBlockSize + k]; + } + } + + return block_ix; +} + +__device__ double CompareBlockFactor1(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height) +{ + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + const coeff_t *coeff_block = candidate_channel[c]; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, + image_width, + image_height); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + } + + { + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, 
image_width, image_height, block_x, block_y, 1, 0, 0); + + int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8; + int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8; + float rgb1_c[3][kDCTBlockSize]; + + YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + } +} + +__device__ double Factor2(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height) +{ + const int factor = 2; + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + ///if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= mayout_channel[c].block_height) + { + continue; + } + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; + CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + else { + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, + image_height); + } + } + + int inside_x = block_x * 16 + 16 > image_width ? 
image_width - block_x * 16 : 16; + int inside_y = block_y * 16 + 16 > image_height ? image_height - block_y * 16 : 16; + + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); + + double max_err = 0; + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + if (block_xx * 8 >= image_width || + block_yy * 8 >= image_height) + { + continue; + } + + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); + + float rgb1_c[3][kDCTBlockSize]; + Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + max_err = max(max_err, err); + } + } + return max_err; +} + +__device__ double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor) +{ + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + if (factor == 1) { + const coeff_t *coeff_block = candidate_channel[c]; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + ///if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= 
mayout_channel[c].block_height) + { + continue; + } + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; + CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + } + else { + if (factor == 1) { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, + image_width, + image_height); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + else { + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, + image_height); + } + } + } + + if (factor == 1) + { + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0); + + int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8; + int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8; + float rgb1_c[3][kDCTBlockSize]; + + YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + } + else + { + int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16; + int inside_y = block_y * 16 + 16 > image_height ? 
image_height - block_y * 16 : 16; + + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); + + double max_err = 0; + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + if (block_xx * 8 >= image_width || + block_yy * 8 >= image_height) + { + continue; + } + + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); + + float rgb1_c[3][kDCTBlockSize]; + Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + max_err = max(max_err, err); + } + } + return max_err; + } +} + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif //__USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp new file mode 100644 index 00000000..619c0cfd --- /dev/null +++ b/clguetzli/clguetzli.cl.cpp @@ -0,0 +1,226 @@ +/* +* OpenCL/CUDA edition implementation of ButteraugliComparator. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include +#include +#include +#include "utils.h" + +#ifdef __USE_OPENCL__ + +using namespace std; + +int g_idvec[10] = { 0 }; +int g_sizevec[10] = { 0 }; + +int get_global_id(int dim) { + return g_idvec[dim]; +} +int get_global_size(int dim) { + return g_sizevec[dim]; +} + +void set_global_id(int dim, int id){ + g_idvec[dim] = id; +} +void set_global_size(int dim, int size){ + g_sizevec[dim] = size; +} + +#define __checkcl +#define abs(exper) fabs((exper)) +#include "clguetzli.h" +#include "clguetzli.cl" +#include "cuguetzli.h" +#include "ocu.h" + +namespace guetzli +{ + ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats) + : ButteraugliComparator(width, height, rgb, target_distance, stats) + { + if (MODE_CPU != g_mathMode) + { + rgb_orig_opsin.resize(3); + rgb_orig_opsin[0].resize(width * height); + rgb_orig_opsin[1].resize(width * height); + rgb_orig_opsin[2].resize(width * height); + +#ifdef __USE_DOUBLE_AS_FLOAT__ + const float* lut = kSrgb8ToLinearTable; +#else + const double* lut = kSrgb8ToLinearTable; +#endif + for (int c = 0; c < 3; ++c) { + for (int y = 0, ix = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x, ++ix) { + rgb_orig_opsin[c][ix] = lut[rgb_orig_[3 * ix + c]]; + } + } + } + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb_orig_opsin); + } + } + + void ButteraugliComparatorEx::Compare(const OutputImage& img) + { + if (MODE_CPU_OPT == g_mathMode) + { + std::vector > rgb0 = rgb_orig_opsin; + + std::vector > rgb(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb); + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); + std::vector().swap(distmap_); + comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_); + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == 
g_mathMode) + { + std::vector > rgb1(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb1); + + const int xsize = width_; + const int ysize = height_; + std::vector().swap(distmap_); + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + + cl_mem mem_result = ocl.allocMem(channel_size); + + clOpsinDynamicsImageEx(xyb1, xsize, ysize); + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step()); + + cl_int err = clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(mem_result); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode) + { + std::vector > rgb1(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb1); + + const int xsize = width_; + const int ysize = height_; + std::vector().swap(distmap_); + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocu_args_d_t &ocu = getOcu(); + ocu_channels xyb0 = ocu.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocu_channels xyb1 = ocu.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + + cu_mem mem_result = ocu.allocMem(channel_size); + + cuOpsinDynamicsImageEx(xyb1, xsize, ysize); + + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step()); + + cuMemcpyDtoH(distmap_.data(), mem_result, channel_size); + + 
ocu.releaseMem(mem_result); + ocu.releaseMemChannels(xyb0); + ocu.releaseMemChannels(xyb1); + + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#endif + else + { + ButteraugliComparator::Compare(img); + } + } + + void ButteraugliComparatorEx::StartBlockComparisons() + { + if (MODE_CPU == g_mathMode) + { + ButteraugliComparator::StartBlockComparisons(); + return; + } + + std::vector > dummy(3); + ::butteraugli::Mask(rgb_orig_opsin, rgb_orig_opsin, width_, height_, &mask_xyz_, &dummy); + + const int width = width_; + const int height = height_; + const int factor_x = 1; + const int factor_y = 1; + + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; +#ifdef __USE_DOUBLE_AS_FLOAT__ + const float* lut = kSrgb8ToLinearTable; +#else + const double* lut = kSrgb8ToLinearTable; +#endif + imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize); + imgMaskXyzScaleBlockList.resize(num_blocks * 3); + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) + { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) + { + float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + float* curG = curR + kDCTBlockSize; + float* curB = curG + kDCTBlockSize; + + for (int iy = 0, i = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix, ++i) { + int x = std::min(8 * block_x + ix, width - 1); + int y = std::min(8 * block_y + iy, height - 1); + int px = y * width + x; + + curR[i] = lut[rgb_orig_[3 * px]]; + curG[i] = lut[rgb_orig_[3 * px + 1]]; + curB[i] = lut[rgb_orig_[3 * px + 2]]; + } + } + + CalcOpsinDynamicsImage((float(*)[64])curR); + + int xmin = block_x * 8; + int ymin = block_y * 8; + + imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin]; + imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin]; + 
imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin]; + } + } + } + + void ButteraugliComparatorEx::FinishBlockComparisons() { + ButteraugliComparator::FinishBlockComparisons(); + + imgOpsinDynamicsBlockList.clear(); + imgMaskXyzScaleBlockList.clear(); + } + + double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const + { + double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + return err; + } +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h new file mode 100644 index 00000000..12543e42 --- /dev/null +++ b/clguetzli/clguetzli.cl.h @@ -0,0 +1,162 @@ +/* +* OpenCL/CUDA edition implementation of ButteraugliComparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#ifndef __CLGUETZLI_CL_H__ +#define __CLGUETZLI_CL_H__ + +#ifdef __USE_OPENCL__ + +#ifdef __cplusplus +#ifndef __CUDACC__ +#include "CL/cl.h" +#include "cuda.h" +#endif +#endif + +#define __USE_DOUBLE_AS_FLOAT__ + +#ifdef __cplusplus +#ifndef __CUDACC__ + #define __kernel + #define __private + #define __global + #define __constant + #define __constant_ex + #define __device__ + + typedef unsigned char uchar; + typedef unsigned short ushort; + typedef CUdeviceptr cu_mem; + + int get_global_id(int dim); + int get_global_size(int dim); + void set_global_id(int dim, int id); + void set_global_size(int dim, int size); + + #ifdef __checkcl + typedef union ocl_channels_t + { + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; + }ocl_channels; + + typedef union ocu_channels_t + { + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; + }ocu_channels; + #else + typedef union ocl_channels_t + { + struct + { + cl_mem r; + cl_mem g; + cl_mem b; + }; + struct + { + cl_mem x; + cl_mem y; + 
cl_mem b_; + }; + union + { + cl_mem ch[3]; + }; + }ocl_channels; + + typedef union ocu_channels_t + { + struct + { + cu_mem r; + cu_mem g; + cu_mem b; + }; + struct + { + cu_mem x; + cu_mem y; + cu_mem b_; + }; + union + { + cu_mem ch[3]; + }; + }ocu_channels; + #endif +#endif /*__CUDACC__*/ +#endif /*__cplusplus*/ + +#ifdef __OPENCL_VERSION__ + #define __constant_ex __constant + #define __device__ + +#endif /*__OPENCL_VERSION__*/ + +#ifdef __CUDACC__ + #define __kernel extern "C" __global__ + #define __private + #define __global + #define __constant __constant__ + #define __constant_ex + typedef unsigned char uchar; + typedef unsigned short ushort; + + __device__ int get_global_id(int dim) + { + switch (dim) + { + case 0: return blockIdx.x * blockDim.x + threadIdx.x; + case 1: return blockIdx.y * blockDim.y + threadIdx.y; + default: return blockIdx.z * blockDim.z + threadIdx.z; + } + } + + __device__ int get_global_size(int dim) + { + switch(dim) + { + case 0: return gridDim.x * blockDim.x; + case 1: return gridDim.y * blockDim.y; + default: return gridDim.z * blockDim.z; + } + } + +#endif /*__CUDACC__*/ + + typedef short coeff_t; + + typedef struct __channel_info_t + { + int factor; + int block_width; + int block_height; + __global const coeff_t *coeff; + __global const ushort *pixel; + }channel_info; + +#endif /*__CLGUETZLI_CL_H__*/ + +#endif // __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp new file mode 100644 index 00000000..52129927 --- /dev/null +++ b/clguetzli/clguetzli.cpp @@ -0,0 +1,841 @@ +/* +* OpenCL edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "clguetzli.h" +#include +#include +#include +#include "cl.hpp" + +extern MATH_MODE g_mathMode = MODE_CPU; + +#ifdef __USE_OPENCL__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + + clOpsinDynamicsImageEx(rgb, xsize, ysize); + + clEnqueueReadBuffer(ocl.commandQueue, rgb.r, false, 0, channel_size, r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, rgb.g, false, 0, channel_size, g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, rgb.b, false, 0, channel_size, b, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); +} + +void clDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem mem_result = ocl.allocMem(channel_size, result); + + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step); + + clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL); + cl_int err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + + clReleaseMemObject(mem_result); +} + +void clComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int 
image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + ocl_args_d_t &ocl = getOcl(); + + cl_mem mem_orig_coeff[3]; + cl_mem mem_mayout_coeff[3]; + cl_mem mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + clSetKernelArgEx(kernel, &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_mask_scale, + &blockf_width, &blockf_height, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + 
&mem_output_order_batch); + + size_t globalWorkSize[2] = { blockf_width, blockf_height }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clEnqueueReadBuffer(ocl.commandQueue, mem_output_order_batch, false, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + for (int c = 0; c < 3; c++) + { + clReleaseMemObject(mem_orig_coeff[c]); + clReleaseMemObject(mem_mayout_coeff[c]); + clReleaseMemObject(mem_mayout_pixel[c]); + } + + clReleaseMemObject(mem_orig_image); + clReleaseMemObject(mem_mask_scale); + clReleaseMemObject(mem_output_order_batch); +} + +void clMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + ocl_args_d_t &ocl = getOcl(); + + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + + clMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); + + clEnqueueReadBuffer(ocl.commandQueue, mask.r, false, 0, channel_size, mask_r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask.g, false, 0, channel_size, mask_g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask.b, false, 0, channel_size, mask_b, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL); 
+ clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + +void clDiffmapOpsinDynamicsImageEx( + cl_mem result, + ocl_channels xyb0, + ocl_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + clCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + + clCalculateDiffmapEx(result, xsize, ysize, step); + + clReleaseMemObject(edge_detector_map); + clReleaseMemObject(block_diff_dc); + clReleaseMemObject(block_diff_ac); +} +void clConvolutionEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocl_args_d_t &ocl = getOcl(); + + size_t oxsize = (xsize + xstep - 1) / xstep; + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; + clSetKernelArgEx(kernel, &result, &inp, &xsize, 
&multipliers, &len, &xstep, &offset, &border_ratio); + + size_t globalWorkSize[2] = { oxsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clConvolutionXEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clConvolutionYEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clSquareSampleEx( + cl_mem result/*out*/, + const cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &image, &xstep, &ystep); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, 
NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cl_mem result/*out, opt*/) +{ + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + clReleaseMemObject(m); + } + else + { + cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? 
result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clReleaseMemObject(m); + } + + clReleaseMemObject(mem_expn); +} + +void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); + + const int size = xsize * ysize; + + clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; + clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); + + size_t globalWorkSize[1] = { xsize * ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(rgb_blurred); +} + +void clMaskHighIntensityChangeEx( + ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + ocl_channels c0 = ocl.allocMemChannels(channel_size); + ocl_channels c1 = ocl.allocMemChannels(channel_size); + + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.r, c0.r, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.g, c0.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.b, c0.b, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL); 
+ clFinish(ocl.commandQueue); + + cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; + clSetKernelArgEx(kernel, + &xyb0.r, &xyb0.g, &xyb0.b, + &xsize, &ysize, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(c0); + ocl.releaseMemChannels(c1); +} + +void clEdgeDetectorMapEx( + cl_mem result/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + clBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; + clSetKernelArgEx(kernel, &result, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step); + + size_t globalWorkSize[2] = { res_xsize, res_ysize}; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void clBlockDiffMapEx( + cl_mem block_diff_dc/*out*/, + cl_mem block_diff_ac/*out*/, + const ocl_channels &rgb, const ocl_channels 
&rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + ocl_args_d_t &ocl = getOcl(); + + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; + clSetKernelArgEx(kernel, &block_diff_dc, &block_diff_ac, + &res_xsize, &res_ysize, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step); + + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clEdgeDetectorLowFreqEx( + cl_mem block_diff_ac/*in,out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + clBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; + clSetKernelArgEx(kernel, &block_diff_ac, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step); + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} 
+ +void clDiffPrecomputeEx( + ocl_channels &mask/*out*/, + const ocl_channels &xyb0, const ocl_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; + clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, + &xsize, &ysize, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) +{ + ocl_args_d_t &ocl = getOcl(); + float fw = w; + + cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; + clSetKernelArgEx(kernel, &img, &size, &fw); + + size_t globalWorkSize[1] = { size }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize) +{ + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + + ocl_args_d_t &ocl = getOcl(); + + size_t len = xsize * ysize * sizeof(float); + cl_mem img_org = ocl.allocMem(len); + + clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); + + cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; + clSetKernelArgEx(kernel, &img, &xsize, &ysize, &img_org); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(img_org); +} + +void clMinSquareValEx( + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + + cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &img, &square_size, &offset); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + clReleaseMemObject(result); +} + +static void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + extoff); + result[i] *= result[i]; + } +} + +static const double kInternalGoodQualityThreshold = 14.921561160295326; +static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + ocl_args_d_t &ocl = getOcl(); + + double extmul = 
0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 0.0738288224836; + double mul = 20.8029176447; + static double lut_x[512]; + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + static double lut_y[512]; + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + static double lut_b[512]; + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + static double lut_dcx[512]; + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + static double lut_dcy[512]; + static bool lutdcy_init = false; + if (!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } + + extmul = 0.349376011816; + extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + static double lut_dcb[512]; + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } + + size_t channel_size = 512 * sizeof(double); + ocl_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, 
lut_dcx, lut_dcy, lut_dcb); + + cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; + clSetKernelArgEx(kernel, &mask.r, &mask.g, &mask.b, + &xsize, &ysize, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(xyb); + ocl.releaseMemChannels(xyb_dc); +} + +void clMaskEx( + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize) +{ + clDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); + for (int i = 0; i < 3; i++) + { + clAverage5x5Ex(mask.ch[i], xsize, ysize); + clMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + clBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } + + clDoMask(mask, mask_dc, xsize, ysize); + + for (int i = 0; i < 3; i++) + { + clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + } +} + +void clCombineChannelsEx( + cl_mem result/*out*/, + const ocl_channels &mask, + const ocl_channels &mask_dc, + const size_t xsize, const size_t ysize, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, + const size_t res_xsize, + const size_t step) +{ + ocl_args_d_t &ocl = getOcl(); + + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; + clSetKernelArgEx(kernel, &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, + 
&edge_detector_map, + &res_xsize, + &step); + + size_t globalWorkSize[2] = { work_xsize, work_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); + + cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; + clSetKernelArgEx(kernel, &diffmap_out, &diffmap, &xsize, &ysize, &step); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clEnqueueCopyBuffer(ocl.commandQueue, diffmap_out, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(diffmap_out); +} + +void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_int cls = 8 - step; + cl_int cls2 = (8 - step) / 2; + + int out_xsize = xsize - cls; + int out_ysize = ysize - cls; + + cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER]; + clSetKernelArgEx(kernel, &out, &out_xsize, &out_ysize, &in, &cls, &cls2); + + size_t globalWorkSize[2] = { out_xsize, out_ysize}; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_int cls = 8 - step; + cl_int cls2 = (8 - 
step) / 2; + cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER]; + clSetKernelArgEx(kernel, &out, &xsize, &ysize, &cls, &cls2, &in); + + size_t globalWorkSize[2] = { xsize, ysize}; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +{ + clUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocl_args_d_t &ocl = getOcl(); + cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + clRemoveBorderEx(blurred, diffmap, xsize, ysize, step); + + static const double border_ratio = 0.03027655136; + clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + + clAddBorderEx(diffmap, xsize, ysize, step, blurred); + clScaleImageEx(diffmap, xsize * ysize, scale); + + clReleaseMemObject(blurred); +} + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu new file mode 100644 index 00000000..2b7a71c4 --- /dev/null +++ b/clguetzli/clguetzli.cu @@ -0,0 +1,8 @@ +/* +* CUDA Kernels +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "clguetzli/clguetzli.cl" diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h new file mode 100644 index 00000000..c4f3961c --- /dev/null +++ b/clguetzli/clguetzli.h @@ -0,0 +1,188 @@ +/* +* OpenCL edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include +#include "guetzli/processor.h" +#include "guetzli/butteraugli_comparator.h" +#include "ocl.h" +#include "clguetzli.cl.h" + +#include "cuguetzli.h" + +enum MATH_MODE +{ + MODE_CPU = 0, + MODE_CPU_OPT, + MODE_OPENCL, + MODE_CUDA, + MODE_CHECKCL, + MODE_CHECKCUDA +}; + +extern MATH_MODE g_mathMode; + +#ifdef __USE_OPENCL__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +void clOpsinDynamicsImage( + float *r, float *g, float *b, + const size_t xsize, const size_t ysize); + +void clDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step); + +void clComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit); + +void clMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); + +void clDiffmapOpsinDynamicsImageEx( + cl_mem result, + ocl_channels xyb0, + ocl_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step); + + +void clConvolutionEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void clConvolutionXEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void clConvolutionYEx( + cl_mem 
result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void clSquareSampleEx( + cl_mem result/*out*/, + const cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep); + +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cl_mem result = nullptr/*out, opt*/); + +void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize); + +void clMaskHighIntensityChangeEx( + ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize); + +void clEdgeDetectorMapEx( + cl_mem result/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clBlockDiffMapEx( + cl_mem block_diff_dc/*out*/, + cl_mem block_diff_ac/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clEdgeDetectorLowFreqEx( + cl_mem block_diff_ac/*in,out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clDiffPrecomputeEx( + ocl_channels &mask/*out*/, + const ocl_channels &xyb0, const ocl_channels &xyb1, + const size_t xsize, const size_t ysize); + +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); + +void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize); + +void clMinSquareValEx( + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset); + +void clMaskEx( + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize); + +void clCombineChannelsEx( + cl_mem result/*out*/, + const ocl_channels &mask, + const ocl_channels &mask_dc, + const size_t xsize, 
const size_t ysize, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, + const size_t res_xsize, + const size_t step); + +void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step); + +void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step); + +void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int step, const cl_mem in); + +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); + +class guetzli::OutputImage; + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +namespace guetzli { + + class ButteraugliComparatorEx : public ButteraugliComparator + { + public: + ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats); + + void Compare(const OutputImage& img) override; + void StartBlockComparisons() override; + void FinishBlockComparisons() override; + + double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; + public: + std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount + std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount + std::vector> rgb_orig_opsin; + }; +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp new file mode 100644 index 00000000..2e5af412 --- /dev/null +++ b/clguetzli/clguetzli_test.cpp @@ -0,0 +1,450 @@ +/* +* OpenCL test cases +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#ifdef __USE_OPENCL__ + +#include +#include +#include +#include +#include "clguetzli_test.h" +#include "clguetzli.h" +#include "ocl.h" +#include "ocu.h" + +#define FLOAT_COMPARE(a, b, c) floatCompare((a), (b), (c), __FUNCTION__, __LINE__ ) + +int 
floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line) +{ + int count = 0; + for (int i = 0; i < size; i++) + { + if (fabs(a[i] - b[i]) > 0.001) + { + count++; + } + } + if (count > 0) + { + LogError("CHK %s(%d) %d:%d\r\n", szFunc, line, count, size); + } + return count; +} + +void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b, + const float* result_r2, const float* result_g2, const float* result_b2) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result_r, r0_r, xsize * ysize); + FLOAT_COMPARE(result_g, r0_g, xsize * ysize); + FLOAT_COMPARE(result_b, r0_b, xsize * ysize); + FLOAT_COMPARE(result_r2, r1_r, xsize * ysize); + FLOAT_COMPARE(result_g2, r1_g, xsize 
* ysize); + FLOAT_COMPARE(result_b2, r1_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); +} + +void tclEdgeDetectorMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result) +{ + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t edgemap_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + cl_mem edge = ocl.allocMem(edgemap_size); + + clEdgeDetectorMapEx(edge, xyb0, xyb1, xsize, ysize, step); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + clReleaseMemObject(edge); +} + +void tclBlockDiffMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc, const float* 
result_diff_ac) +{ + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem block_diff_dc = ocl.allocMem(reschannel_size); + cl_mem block_diff_ac = ocl.allocMem(reschannel_size); + + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + + cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + clReleaseMemObject(block_diff_ac); + clReleaseMemObject(block_diff_dc); +} + +void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* orign_ac, + const float* result_diff_ac) +{ + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = 
ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem block_diff_ac = ocl.allocMem(reschannel_size, orign_ac); + + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + + cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + clReleaseMemObject(block_diff_ac); +} + +void tclMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* mask_r, const float* mask_g, const float* mask_b, + const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + + clMaskEx(mask/*out*/, mask_dc/*out*/, rgb, rgb2, xsize, ysize); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + 
cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(mask_r, r0_r, xsize * ysize); + FLOAT_COMPARE(mask_g, r0_g, xsize * ysize); + FLOAT_COMPARE(mask_b, r0_b, xsize * ysize); + FLOAT_COMPARE(maskdc_r, r1_r, xsize * ysize); + FLOAT_COMPARE(maskdc_g, r1_g, xsize * ysize); + FLOAT_COMPARE(maskdc_b, r1_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + +void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, + const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, + const float *block_diff_dc, const float *block_diff_ac, + const float *edge_detector_map, + size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, + size_t step, + const float *init_result, + const float *result) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + size_t channel_size = xsize * ysize * sizeof(float); + size_t res_channel_size = res_xsize * res_ysize * sizeof(float); + ocl_channels mask = ocl.allocMemChannels(channel_size, mask_xyb_x, mask_xyb_y, mask_xyb_b); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size, mask_xyb_dc_x, 
mask_xyb_dc_y, mask_xyb_dc_b); + cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_channel_size, block_diff_dc); + cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_channel_size, block_diff_ac); + cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_channel_size, edge_detector_map); + cl_mem cl_result = ocl.allocMem(res_channel_size, init_result); + + clCombineChannelsEx(cl_result, mask, mask_dc, xsize, ysize, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, res_xsize, step); + + cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); + + FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + clReleaseMemObject(cl_block_diff_dc); + clReleaseMemObject(cl_block_diff_ac); + clReleaseMemObject(cl_edge_detector_map); + clReleaseMemObject(cl_result); +} + +void tclCalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + const float *diffmap, size_t org_len, + const float *diffmap_cmp) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + size_t length = xsize * ysize * sizeof(float); + cl_mem mem_diffmap = ocl.allocMem(length); + clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL); + clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step); + cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); + clReleaseMemObject(mem_diffmap); +} + +void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result) +{ + 
size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(channel_size, channel); + + clBlurEx(r, xsize, ysize, sigma, border_ratio, r); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); +} + +void tclConvolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result) +{ + int dxsize = (xsize + xstep - 1) / xstep; + size_t result_size = dxsize * ysize * sizeof(float); + size_t inp_size = xsize * ysize * sizeof(float); + size_t multipliers_size = len * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(result_size); + cl_mem i = ocl.allocMem(inp_size, inp); + cl_mem m = ocl.allocMem(multipliers_size, multipliers); + + clConvolutionEx(r, i, xsize, ysize, m, len, xstep, offset, border_ratio); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, dxsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); + clReleaseMemObject(i); + clReleaseMemObject(m); +} + +void tclDiffPrecompute( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + const std::vector > *mask_cmp) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + size_t channel_size = xsize * ysize * sizeof(float); + ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size, xyb0[0].data(), xyb0[1].data(), xyb0[2].data()); + ocl_channels cl_xyb1 = 
ocl.allocMemChannels(channel_size, xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + ocl_channels cl_mask = ocl.allocMemChannels(channel_size); + + clDiffPrecomputeEx(cl_mask, cl_xyb0, cl_xyb1, xsize, ysize); + + cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_y = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.y, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_x, (*mask_cmp)[0].data(), xsize * ysize); + FLOAT_COMPARE(r_y, (*mask_cmp)[1].data(), xsize * ysize); + FLOAT_COMPARE(r_b, (*mask_cmp)[2].data(), xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 0, NULL, NULL); + ocl.releaseMemChannels(cl_xyb0); + ocl.releaseMemChannels(cl_xyb1); + ocl.releaseMemChannels(cl_mask); +} + +void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float), diffs_org.data()); + + clAverage5x5Ex(mem_diff, xsize, ysize); + cl_float *r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diff, true, CL_MAP_READ, 0, xsize * ysize * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + FLOAT_COMPARE(r, diffs_cmp.data(), xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, 0, NULL, NULL); + clReleaseMemObject(mem_diff); +} + +void tclMinSquareVal(const float *img, size_t square_size, size_t offset, + size_t xsize, size_t ysize, + const float *result) +{ + size_t img_size = xsize * ysize * sizeof(float); + 
cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(img_size, img); + + clMinSquareValEx(r, xsize, ysize, square_size, offset); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, img_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); +} + +void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org); + + clScaleImageEx(mem_result_org, length, scale); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_r, result_cmp, length); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL); + clReleaseMemObject(mem_result_org); +} + +void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + + clOpsinDynamicsImageEx(rgb, xsize, ysize); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result_r, r_r, 
xsize * ysize); + FLOAT_COMPARE(result_g, r_g, xsize * ysize); + FLOAT_COMPARE(result_b, r_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h new file mode 100644 index 00000000..dbc3c47a --- /dev/null +++ b/clguetzli/clguetzli_test.h @@ -0,0 +1,79 @@ +/* +* OpenCL test cases +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include "ocl.h" + +void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b, + const float* result_r2, const float* result_g2, const float* result_b2); + +void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result); + +void tclEdgeDetectorMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result); + +void tclBlockDiffMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc, const float* result_diff_ac); + +void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* orign_ac, + const float* result_diff_dc); + +void tclMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t 
ysize, + const float* mask_r, const float* mask_g, const float* mask_b, + const float* maskdc_r, const float* maskdc_g, const float* maskdc_b); + +void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, + const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, + const float *block_diff_dc, const float *block_diff_ac, + const float *edge_detector_map, + size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, + size_t step, + const float *init_result, + const float *result); + +void tclCalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + const float *diffmap, size_t org_len, + const float *diffmap_cmp); + +void tclConvolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result); + +void tclDiffPrecompute( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + const std::vector > *mask_cmp); + +void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp); + +void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length); + +void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b); + +void tclMinSquareVal(const float *img, size_t square_size, size_t offset, + size_t xsize, size_t ysize, + const float *result); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp new file mode 100644 index 00000000..f348edb7 --- /dev/null +++ b/clguetzli/cuguetzli.cpp @@ -0,0 +1,903 @@ +/* +* CUDA edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "cuguetzli.h" +#include +#include "ocu.h" + +#ifdef __USE_CUDA__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +#define cuFinish cuStreamSynchronize +#define BLOCK_SIZE_X 16 +#define BLOCK_SIZE_Y 16 +#define BLOCK_COUNT_X(size) ((size + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X) +#define BLOCK_COUNT_Y(size) ((size + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y) + +void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + + cuOpsinDynamicsImageEx(rgb, xsize, ysize); + + cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + ocu.releaseMemChannels(rgb); +} + +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels xyb0 = ocu.allocMemChannels(channel_size, r, g, b); + ocu_channels xyb1 = ocu.allocMemChannels(channel_size, r2, g2, b2); + + cu_mem mem_result = ocu.allocMem(channel_size, result); + + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step); + + cuMemcpyDtoH(result, mem_result, channel_size); + + ocu.releaseMemChannels(xyb1); + ocu.releaseMemChannels(xyb0); + + ocu.releaseMem(mem_result); +} + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const 
channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + ocu_args_d_t &ocu = getOcu(); + + cu_mem mem_orig_coeff[3]; + cu_mem mem_mayout_coeff[3]; + cu_mem mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocu.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + cu_mem mem_orig_image = ocu.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cu_mem mem_mask_scale = ocu.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + cu_mem mem_output_order_batch = ocu.allocMem(output_order_batch_size, output_order_batch); + + CUfunction kernel = ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_mask_scale, + &blockf_width, &blockf_height, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + &mem_output_order_batch 
}; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(blockf_width), BLOCK_COUNT_Y(blockf_height), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); + + for (int c = 0; c < 3; c++) + { + ocu.releaseMem(mem_orig_coeff[c]); + ocu.releaseMem(mem_mayout_coeff[c]); + ocu.releaseMem(mem_mayout_pixel[c]); + } + + ocu.releaseMem(mem_orig_image); + ocu.releaseMem(mem_mask_scale); + ocu.releaseMem(mem_output_order_batch); +} + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + ocu_args_d_t &ocu = getOcu(); + + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + ocu_channels rgb2 = ocu.allocMemChannels(channel_size, r2, g2, b2); + ocu_channels mask = ocu.allocMemChannels(channel_size); + ocu_channels mask_dc = ocu.allocMemChannels(channel_size); + + cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); + + cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + ocu.releaseMemChannels(rgb); + ocu.releaseMemChannels(rgb2); + ocu.releaseMemChannels(mask); + ocu.releaseMemChannels(mask_dc); +} + +void cuDiffmapOpsinDynamicsImageEx( + cu_mem result, + ocu_channels xyb0, + ocu_channels xyb1, + const size_t 
xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + + cu_mem edge_detector_map = ocu.allocMem(3 * channel_step_size); + cu_mem block_diff_dc = ocu.allocMem(3 * channel_step_size); + cu_mem block_diff_ac = ocu.allocMem(3 * channel_step_size); + + cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocu_channels mask = ocu.allocMemChannels(channel_size); + ocu_channels mask_dc = ocu.allocMemChannels(channel_size); + cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + cuCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocu.releaseMemChannels(mask); + ocu.releaseMemChannels(mask_dc); + } + + cuCalculateDiffmapEx(result, xsize, ysize, step); + + ocu.releaseMem(edge_detector_map); + ocu.releaseMem(block_diff_dc); + ocu.releaseMem(block_diff_ac); +} + +void cuConvolutionEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + size_t oxsize = (xsize + xstep - 1) / xstep; + + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTION]; + const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(kernel, + oxsize, ysize, 1, + 1, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + + +void cuConvolutionXEx( + 
cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONX]; + const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuConvolutionYEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONY]; + const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuSquareSampleEx( + cu_mem result/*out*/, + const cu_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_SQUARESAMPLE]; + const void *args[] = { &result, &xsize, &ysize, &image, &xstep, &ystep }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cu_mem result/*out, opt*/) +{ + double m = 2.25; // Accuracy 
increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + ocu_args_d_t &ocu = getOcu(); + cu_mem mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + ocu.releaseMem(m); + } + else + { + cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? 
result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + ocu.releaseMem(m); + } + + ocu.releaseMem(mem_expn); +} + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); + + const int size = xsize * ysize; + + cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + CUfunction kernel = ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE]; + const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; + + CUresult err = cuLaunchKernel(kernel, +// (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, +// BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + (size + 511) / 512, 1, 1, + 512, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(rgb_blurred); +} + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + + ocu_channels c0 = ocu.allocMemChannels(channel_size); + ocu_channels c1 = ocu.allocMemChannels(channel_size); + + cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + CUfunction kernel = 
ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; + const void *args[] = { + &xyb0.r, &xyb0.g, &xyb0.b, + &xsize, &ysize, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(c0); + ocu.releaseMemChannels(c1); +} + +void cuEdgeDetectorMapEx( + cu_mem result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + + ocu_channels rgb_blured = ocu.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTOR]; + const void *args[] = { &result, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(rgb_blured); + ocu.releaseMemChannels(rgb2_blured); +} + +void cuBlockDiffMapEx( + cu_mem block_diff_dc/*out*/, + cu_mem block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const 
size_t ysize, const size_t step) +{ + ocu_args_d_t &ocu = getOcu(); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_BLOCKDIFFMAP]; + const void *args[] = { &block_diff_dc, &block_diff_ac, + &res_xsize, &res_ysize, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuEdgeDetectorLowFreqEx( + cu_mem block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blured = ocu.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ]; + const void *args[] = { &block_diff_ac, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(rgb_blured); + ocu.releaseMemChannels(rgb2_blured); +} + +void 
cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_DIFFPRECOMPUTE]; + const void *args[] = { &mask.x, &mask.y, &mask.b, + &xsize, &ysize, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) +{ + ocu_args_d_t &ocu = getOcu(); + float fw = w; + + CUfunction kernel = ocu.kernel[KERNEL_SCALEIMAGE]; + const void *args[] = { &img, &size, &fw }; + + CUresult err = cuLaunchKernel(kernel, +// (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + (size + 511) / 512, 1, 1, +// BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + 512, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize) +{ + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + + ocu_args_d_t &ocu = getOcu(); + + size_t len = xsize * ysize * sizeof(float); + cu_mem img_org = ocu.allocMem(len); + + cuMemcpyDtoD(img_org, img, len); + + CUfunction kernel = ocu.kernel[KERNEL_AVERAGE5X5]; + const void *args[] = { &img, &xsize, &ysize, &img_org }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMem(img_org); +} + +void cuMinSquareValEx( + cu_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) +{ + ocu_args_d_t &ocu = getOcu(); + + cu_mem result = ocu.allocMem(sizeof(float) * xsize * ysize); + + CUfunction kernel = ocu.kernel[KERNEL_MINSQUAREVAL]; + const void *args[] = { &result, &xsize, &ysize, &img, &square_size, &offset }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + cuMemcpyDtoD(img, result, sizeof(float) * xsize * ysize); + ocu.releaseMem(result); +} + +static void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + extoff); + result[i] *= result[i]; + } +} + +static const double kInternalGoodQualityThreshold = 14.921561160295326; +static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + ocu_args_d_t &ocu = getOcu(); + + double extmul = 0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 
0.0738288224836; + double mul = 20.8029176447; + static double lut_x[512]; + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + static double lut_y[512]; + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + static double lut_b[512]; + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + static double lut_dcx[512]; + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + static double lut_dcy[512]; + static bool lutdcy_init = false; + if (!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } + + extmul = 0.349376011816; + extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + static double lut_dcb[512]; + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } + + size_t channel_size = 512 * sizeof(double); + ocu_channels xyb = ocu.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocu_channels xyb_dc = ocu.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); + + CUfunction kernel = ocu.kernel[KERNEL_DOMASK]; + const void *args[] = 
{ &mask.r, &mask.g, &mask.b, + &xsize, &ysize, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(xyb); + ocu.releaseMemChannels(xyb_dc); +} + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize) +{ + cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); + for (int i = 0; i < 3; i++) + { + cuAverage5x5Ex(mask.ch[i], xsize, ysize); + cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } + + cuDoMask(mask, mask_dc, xsize, ysize); + + for (int i = 0; i < 3; i++) + { + cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + } +} + +void cuCombineChannelsEx( + cu_mem result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + const cu_mem block_diff_dc, + const cu_mem block_diff_ac, + const cu_mem edge_detector_map, + const size_t res_xsize, + const size_t step) +{ + ocu_args_d_t &ocu = getOcu(); + + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_COMBINECHANNELS]; + const void *args[] = { &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, + &edge_detector_map, + &res_xsize, + &step }; + + CUresult err = cuLaunchKernel(kernel, + work_xsize, 
work_ysize, 1, + 1, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocu = getOcu(); + + cu_mem diffmap_out = ocu.allocMem(xsize * ysize * sizeof(float)); + + CUfunction kernel = ocu.kernel[KERNEL_UPSAMPLESQUAREROOT]; + const void *args[] = { &diffmap_out, &diffmap, &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(kernel, + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); + + ocu.releaseMem(diffmap_out); +} + +void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocu = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + + int out_xsize = xsize - cls; + int out_ysize = ysize - cls; + + CUfunction kernel = ocu.kernel[KERNEL_REMOVEBORDER]; + const void *args[] = { &out, &out_xsize, &out_ysize, &in, &cls, &cls2 }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(out_xsize), BLOCK_COUNT_Y(out_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in) +{ + ocu_args_d_t &ocu = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + CUfunction kernel = ocu.kernel[KERNEL_ADDBORDER]; + const void *args[] = { &out, &xsize, &ysize, &cls, &cls2, &in }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + 
ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +{ + cuUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocu_args_d_t &ocu = getOcu(); + cu_mem blurred = ocu.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step); + + static const double border_ratio = 0.03027655136; + cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + + cuAddBorderEx(diffmap, xsize, ysize, step, blurred); + cuScaleImageEx(diffmap, xsize * ysize, scale); + + ocu.releaseMem(blurred); +} + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif \ No newline at end of file diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h new file mode 100644 index 00000000..8c3e3444 --- /dev/null +++ b/clguetzli/cuguetzli.h @@ -0,0 +1,142 @@ +/* +* CUDA edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include "guetzli/processor.h" +#include "clguetzli.cl.h" +#include "ocu.h" + +#ifdef __USE_CUDA__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +void cuOpsinDynamicsImage( + float *r, float *g, float *b, + const size_t xsize, const size_t ysize); + +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step); + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit); + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); + +void cuDiffmapOpsinDynamicsImageEx( + cu_mem result, + ocu_channels xyb0, + ocu_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step); + +void cuConvolutionXEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void cuConvolutionYEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void cuSquareSampleEx( + cu_mem result/*out*/, + const cu_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep); + +void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cu_mem 
result = NULL/*out, opt*/); + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize); + +void cuEdgeDetectorMapEx( + cu_mem result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuBlockDiffMapEx( + cu_mem block_diff_dc/*out*/, + cu_mem block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuEdgeDetectorLowFreqEx( + cu_mem block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize); + +void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w); + +void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize); + +void cuMinSquareValEx( + cu_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset); + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize); + +void cuCombineChannelsEx( + cu_mem result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + const cu_mem block_diff_dc, + const cu_mem block_diff_ac, + const cu_mem edge_detector_map, + const size_t res_xsize, + const size_t step); + +void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step); + +void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step); + +void cuAddBorderEx(cu_mem out, const size_t 
xsize, const size_t ysize, const int step, const cu_mem in); + +void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif \ No newline at end of file diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp new file mode 100644 index 00000000..8252d3e7 --- /dev/null +++ b/clguetzli/cumem_pool.cpp @@ -0,0 +1,111 @@ +/* + * Memory Pool for CUDA + * + * Author: ianhuang@tencent.com + */ + +#include "cumem_pool.h" + +#ifdef __USE_CUDA__ + +bool compare_size(const cu_mem_block_t& first, const cu_mem_block_t& second) +{ + return (first.size < second.size); +} + +cu_mem_pool_t::cu_mem_pool_t() + : alloc_count(0) + , total_mem_request(0) +{ + +} + +cu_mem_pool_t::~cu_mem_pool_t() +{ + +} + +cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init) +{ + alloc_count++; + total_mem_request += s; + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + cu_mem_block_t *block = &(*iter); + if (block->status == MBS_IDLE && block->size >= s) { + block_candidate = block; + break; + } + } + cu_mem mem = NULL; + if (block_candidate != NULL) { + block_candidate->status = MBS_BUSY; + block_candidate->used = s; + + mem = block_candidate->mem; + } + else { + cu_mem new_mem; + cuMemAlloc(&new_mem, s); + cu_mem_block_t mem_block; + mem_block.size = s; + mem_block.used = s; + mem_block.mem = new_mem; + mem_block.status = MBS_BUSY; + mem_pool.push_back(mem_block); + mem_pool.sort(compare_size); + + mem = new_mem; + } + if (init) + { + cuMemcpyHtoDAsync(mem, init, s, commandQueue); + } + else + { + cuMemsetD8Async(mem, 0, s, commandQueue); + } + + return mem; +} + +void cu_mem_pool_t::releaseMem(cu_mem mem) +{ + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + cu_mem_block_t *block = &(*iter); + if (block->mem == 
mem) { + block_candidate = block; + break; + } + } + if (block_candidate != NULL) { + block_candidate->status = MBS_IDLE; + block_candidate->used = 0; + } + else { + cuMemFree(mem); + LogError("mem_pool release mem:%llu can not be found.\r\n", mem); + } +} + +void cu_mem_pool_t::drain() +{ + size_t total_mem = 0; + size_t total_block = mem_pool.size(); + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); ) + { + if (iter->status == MBS_IDLE) { + total_mem += iter->size; + cuMemFree(iter->mem); + iter = mem_pool.erase(iter); + } else ++iter; + } + + LogError("mem_pool has %zu blocks, and total pool memory is:%f kb, total memory request:%f kb, total alloc count:%zu.\r\n", total_block, (float)(total_mem) / 1024, (float)(total_mem_request) / 1024, alloc_count); +} + +#endif \ No newline at end of file diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h new file mode 100644 index 00000000..b878d92f --- /dev/null +++ b/clguetzli/cumem_pool.h @@ -0,0 +1,51 @@ +/* +* Memory Pool for CUDA +* +* Author: ianhuang@tencent.com +*/ +#pragma once + +#ifdef __USE_CUDA__ + +#include +#include +#include "ocl.h" + +/*Simple memory pool for CUDA, aiming to reduce the memory allocation count, because it's time consuming.*/ + +enum mem_block_status +{ + MBS_IDLE, + MBS_BUSY, +}; + +struct cu_mem_block_t +{ + cu_mem_block_t() + :status(MBS_IDLE) + , used(0) + {} + ~cu_mem_block_t() + {} + + mem_block_status status; + size_t size; + size_t used; + cu_mem mem; +}; + +struct cu_mem_pool_t +{ + cu_mem_pool_t(); + ~cu_mem_pool_t(); + cu_mem allocMem(size_t s, const void *init = NULL); + void releaseMem(cu_mem mem); + void drain(); + + std::list mem_pool; + CUstream commandQueue; + size_t alloc_count; + size_t total_mem_request; +}; + +#endif \ No newline at end of file diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp new file mode 100644 index 00000000..851ab943 --- /dev/null +++ b/clguetzli/ocl.cpp @@ -0,0 +1,556 @@ +/* +* 
OpenCL Manager +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +*/ +#include "ocl.h" +#include +#include + +#ifdef __USE_OPENCL__ + +ocl_args_d_t& getOcl(void) +{ + static bool bInit = false; + static ocl_args_d_t ocl; + + if (bInit == true) return ocl; + + bInit = true; + cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); + LOG_CL_RESULT(err); + + char* source = nullptr; + size_t src_size = 0; + ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); + + ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); + + delete[] source; + + err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL); + LOG_CL_RESULT(err); + if (CL_BUILD_PROGRAM_FAILURE == err) + { + size_t log_size = 0; + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + + std::vector build_log(log_size); + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); + + LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); + } + + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err); + ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err); + ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err); + ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err); + ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err); + ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err); + ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err); + ocl.kernel[KERNEL_DIFFPRECOMPUTE] = 
clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err); + ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err); + ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err); + ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err); + ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err); + ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err); + ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err); + ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err); + ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err); + + return ocl; +} + +ocl_args_d_t::ocl_args_d_t() : + context(NULL), + device(NULL), + commandQueue(NULL), + program(NULL), + platformVersion(OPENCL_VERSION_1_2), + deviceVersion(OPENCL_VERSION_1_2), + compilerVersion(OPENCL_VERSION_1_2) +{ + for (int i = 0; i < KERNEL_COUNT; i++) + { + kernel[i] = NULL; + } +} + +ocl_args_d_t::~ocl_args_d_t() +{ + cl_int err = CL_SUCCESS; + for (int i = 0; i < KERNEL_COUNT; i++) + { + err = clReleaseKernel(kernel[i]); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); + } + } + + if (program) + { + err = clReleaseProgram(program); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (commandQueue) + { + err = clReleaseCommandQueue(commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (device) + { + err = clReleaseDevice(device); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseDevice returned '%s'.\n", 
TranslateOpenCLError(err)); + } + } + if (context) + { + err = clReleaseContext(context); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err)); + } + } +} + +cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) +{ + cl_int err = 0; + cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + LOG_CL_RESULT(err); + if (!mem) return NULL; + + // init memory + if (init) + { + err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(this->commandQueue); + LOG_CL_RESULT(err); + } + else + { + cl_char cc = 0; + err = clEnqueueFillBuffer(this->commandQueue, mem, &cc, sizeof(cc), 0, s / sizeof(cc), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(this->commandQueue); + LOG_CL_RESULT(err); + } + + return mem; +} + +ocl_channels ocl_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) +{ + const void *c[3] = { c0, c1, c2 }; + + ocl_channels img; + for (int i = 0; i < 3; i++) + { + img.ch[i] = allocMem(s, c[i]); + } + + return img; +} + +void ocl_args_d_t::releaseMemChannels(ocl_channels &rgb) +{ + for (int i = 0; i < 3; i++) + { + clReleaseMemObject(rgb.ch[i]); + rgb.ch[i] = NULL; + } +} + +const char* TranslateOpenCLError(cl_int errorCode) +{ + switch (errorCode) + { + case CL_SUCCESS: return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: return 
"CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; //-13 + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; //-14 + case CL_COMPILE_PROGRAM_FAILURE: return "CL_COMPILE_PROGRAM_FAILURE"; //-15 + case CL_LINKER_NOT_AVAILABLE: return "CL_LINKER_NOT_AVAILABLE"; //-16 + case CL_LINK_PROGRAM_FAILURE: return "CL_LINK_PROGRAM_FAILURE"; //-17 + case CL_DEVICE_PARTITION_FAILED: return "CL_DEVICE_PARTITION_FAILED"; //-18 + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; //-19 + case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: return 
"CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; //-63 + case CL_INVALID_PROPERTY: return "CL_INVALID_PROPERTY"; //-64 + case CL_INVALID_IMAGE_DESCRIPTOR: return "CL_INVALID_IMAGE_DESCRIPTOR"; //-65 + case CL_INVALID_COMPILER_OPTIONS: return "CL_INVALID_COMPILER_OPTIONS"; //-66 + case CL_INVALID_LINKER_OPTIONS: return "CL_INVALID_LINKER_OPTIONS"; //-67 + case CL_INVALID_DEVICE_PARTITION_COUNT: return "CL_INVALID_DEVICE_PARTITION_COUNT"; //-68 + // case CL_INVALID_PIPE_SIZE: return "CL_INVALID_PIPE_SIZE"; //-69 + // case CL_INVALID_DEVICE_QUEUE: return "CL_INVALID_DEVICE_QUEUE"; //-70 + + default: + return "UNKNOWN ERROR CODE"; + } +} + + +/* +* Check whether an OpenCL platform is the required platform +* (based on the platform's name) +*/ +bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform) +{ + size_t stringLength = 0; + cl_int err = CL_SUCCESS; + bool match = false; + + // In order to read the platform's name, we first read the platform's name string length (param_value is NULL). 
+ // The value returned in stringLength + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err)); + return false; + } + + // Now, that we know the platform's name string length, we can allocate enough space before read it + std::vector platformName(stringLength); + + // Read the platform's name string + // The read value returned in platformName + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err)); + return false; + } + + // Now check if the platform's name is the required one + if (strstr(&platformName[0], preferredPlatform) != 0) + { + // The checked platform is the one we're looking for + match = true; + } + + return match; +} + +/* +* Find and return the preferred OpenCL platform +* In case that preferredPlatform is NULL, the ID of the first discovered platform will be returned +*/ +cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType) +{ + cl_uint numPlatforms = 0; + cl_int err = CL_SUCCESS; + + // Get (in numPlatforms) the number of OpenCL platforms available + // No platform ID will be return, since platforms is NULL + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err)); + return NULL; + } + LogInfo("Number of available platforms: %u\n", numPlatforms); + + if (0 == numPlatforms) + { + LogError("Error: No platforms found!\n"); + return NULL; + } + + std::vector platforms(numPlatforms); + + // Now, obtains a list of numPlatforms OpenCL platforms available + // The list of platforms available will be returned in platforms + err = clGetPlatformIDs(numPlatforms, 
&platforms[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err)); + return NULL; + } + + // Check if one of the available platform matches the preferred requirements + for (cl_uint i = 0; i < numPlatforms; i++) + { + bool match = true; + cl_uint numDevices = 0; + + size_t nameLen = 0; + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &nameLen); + + std::vector platformName(nameLen + 1); + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, nameLen, &platformName[0], NULL); + platformName[nameLen] = 0; + + LogError("DeviceName: %s\n", platformName.data()); + + if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0)) + { + match = (strstr(&platformName[0], preferredPlatform) != 0); + } + + // match is true if the platform's name is the required one or don't care (NULL) + if (match) + { + // Obtains the number of deviceType devices available on platform + // When the function failed we expect numDevices to be zero. + // We ignore the function return value since a non-zero error code + // could happen if this platform doesn't support the specified device type. + err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices); + if (CL_SUCCESS != err) + { + if (CL_DEVICE_TYPE_GPU == deviceType) + { + LogError("%s try GPU returned %s.\n", platformName.data(), TranslateOpenCLError(err)); + } + if (CL_DEVICE_TYPE_CPU == deviceType) + { + LogError("%s try CPU returned %s.\n", platformName.data(), TranslateOpenCLError(err)); + } + } + + if (0 != numDevices) + { + // There is at list one device that answer the requirements + LogError("SelectDevice: %s GPU=%d\n", platformName.data(), deviceType == CL_DEVICE_TYPE_GPU ? 1 : 0); + return platforms[i]; + } + } + } + + return NULL; +} + + +/* +* This function read the OpenCL platdorm and device versions +* (using clGetxxxInfo API) and stores it in the ocl structure. 
+* Later it will enable us to support both OpenCL 1.2 and 2.0 platforms and devices +* in the same program. +*/ +int GetPlatformAndDeviceVersion(cl_platform_id platformId, ocl_args_d_t *ocl) +{ + cl_int err = CL_SUCCESS; + + // Read the platform's version string length (param_value is NULL). + // The value returned in stringLength + size_t stringLength = 0; + err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the platform's version string length, we can allocate enough space before read it + std::vector platformVersion(stringLength); + + // Read the platform's version string + // The read value returned in platformVersion + err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL) + { + ocl->platformVersion = OPENCL_VERSION_2_0; + } + + // Read the device's version string length (param_value is NULL). 
+ err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the device's version string length, we can allocate enough space before read it + std::vector deviceVersion(stringLength); + + // Read the device's version string + // The read value returned in deviceVersion + err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL) + { + ocl->deviceVersion = OPENCL_VERSION_2_0; + } + + // Read the device's OpenCL C version string length (param_value is NULL). + err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it + std::vector compilerVersion(stringLength); + + // Read the device's OpenCL C version string + // The read value returned in compilerVersion + err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + else if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL) + { + ocl->compilerVersion = OPENCL_VERSION_2_0; + } + + return err; +} + + +/* +* This function picks/creates necessary OpenCL objects which are needed. +* The objects are: +* OpenCL platform, device, context, and command queue. 
+* +* All these steps are needed to be performed once in a regular OpenCL application. +* This happens before actual compute kernels calls are performed. +* +* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, +* so this function populates fields of this structure, which is passed as parameter ocl. +* Please, consider reviewing the fields before going further. +* The structure definition is right in the beginning of this file. +*/ +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) +{ + // The following variable stores return codes for all OpenCL calls. + cl_int err = CL_SUCCESS; + + // Query for all available OpenCL platforms on the system + // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string + cl_platform_id platformId = FindOpenCLPlatform(nullptr, deviceType); + if (NULL == platformId) + { + deviceType = CL_DEVICE_TYPE_CPU; + platformId = FindOpenCLPlatform(nullptr, deviceType); + } + + if (NULL == platformId) + { + LogError("Error: Failed to find OpenCL platform.\n"); + return CL_INVALID_VALUE; + } + + // Create context with device of specified type. + // Required device type is passed as function argument deviceType. + // So you may use this function to create context for any CPU or GPU OpenCL device. 
+ // The creation is synchronized (pfn_notify is NULL) and NULL user_data + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 }; + ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err); + if ((CL_SUCCESS != err) || (NULL == ocl->context)) + { + LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Query for OpenCL device which was used for context creation + err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions + GetPlatformAndDeviceVersion(platformId, ocl); + + // Create command queue. + // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues. + // Command queue guarantees some ordering between calls and other OpenCL commands. + // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device. 
+#ifdef CL_VERSION_2_0 + if (OPENCL_VERSION_2_0 == ocl->deviceVersion) + { + const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; + ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err); + } + else { + // default behavior: OpenCL 1.2 + cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); + } +#else + // default behavior: OpenCL 1.2 + cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); +#endif + if (CL_SUCCESS != err) + { + LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + return CL_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h new file mode 100644 index 00000000..7ccee2d8 --- /dev/null +++ b/clguetzli/ocl.h @@ -0,0 +1,72 @@ +/* +* OpenCL Manager +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +*/ +#pragma once + +#ifdef __USE_OPENCL__ + +#include "CL/cl.h" +#include "utils.h" +#include "clguetzli.cl.h" + +// Macros for OpenCL versions +#define OPENCL_VERSION_1_2 1.2f +#define OPENCL_VERSION_2_0 2.0f + +enum KernelName { + KERNEL_CONVOLUTION = 0, + KERNEL_CONVOLUTIONX, + KERNEL_CONVOLUTIONY, + KERNEL_SQUARESAMPLE, + KERNEL_OPSINDYNAMICSIMAGE, + KERNEL_MASKHIGHINTENSITYCHANGE, + KERNEL_EDGEDETECTOR, + KERNEL_BLOCKDIFFMAP, + KERNEL_EDGEDETECTORLOWFREQ, + KERNEL_DIFFPRECOMPUTE, + KERNEL_SCALEIMAGE, + KERNEL_AVERAGE5X5, + KERNEL_MINSQUAREVAL, + KERNEL_DOMASK, + KERNEL_COMBINECHANNELS, + KERNEL_UPSAMPLESQUAREROOT, + KERNEL_REMOVEBORDER, + KERNEL_ADDBORDER, + KERNEL_COMPUTEBLOCKZEROINGORDER, + KERNEL_COUNT, +}; + +#define LOG_CL_RESULT(e) if (CL_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateOpenCLError((e)));} 
+ +struct ocl_args_d_t; + +const char* TranslateOpenCLError(cl_int errorCode); + +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); + +ocl_args_d_t& getOcl(void); + +struct ocl_args_d_t +{ + ocl_args_d_t(); + ~ocl_args_d_t(); + + cl_mem allocMem(size_t s, const void *init = NULL); + ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); + void releaseMemChannels(ocl_channels &rgb); + + // Regular OpenCL objects: + cl_context context; // hold the context handler + cl_device_id device; // hold the selected device handler + cl_command_queue commandQueue; // hold the commands-queue handler + cl_program program; // hold the program handler + cl_kernel kernel[KERNEL_COUNT]; // hold the kernel handler + float platformVersion; // hold the OpenCL platform version (default 1.2) + float deviceVersion; // hold the OpenCL device version (default. 1.2) + float compilerVersion; // hold the device OpenCL C version (default. 1.2) +}; + +#endif diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp new file mode 100644 index 00000000..b7395ed1 --- /dev/null +++ b/clguetzli/ocu.cpp @@ -0,0 +1,206 @@ +/* +* CUDA Manager +* +* Author: strongtu@tencent.com +*/ +#include "ocu.h" + +#ifdef __USE_CUDA__ +#include +#include + +ocu_args_d_t& getOcu(void) +{ + static bool bInit = false; + static ocu_args_d_t ocu; + + if (bInit == true) return ocu; + + bInit = true; + + CUresult err = cuInit(0); + LOG_CU_RESULT(err); + CUdevice dev = 0; + CUcontext ctxt; + CUstream stream; + + err = cuCtxCreate(&ctxt, CU_CTX_SCHED_AUTO, dev); + LOG_CU_RESULT(err); + + char name[1024]; + int proc_count = 0; + int thread_count = 0; + int cap_major = 0, cap_minor = 0; + cuDeviceGetName(name, sizeof(name), dev); + cuDeviceGetAttribute(&cap_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); + cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); + cuDeviceGetAttribute(&proc_count, 
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); + cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); + LogError("CUDA Adapter:%s Ver%d.%d MP %d MaxThread Per MP %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); + + char* ptx = nullptr; + size_t src_size = 0; +if (sizeof(void*) == 8) + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx64", &ptx, &src_size); +else + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx32", &ptx, &src_size); + + CUmodule mod; + CUjit_option jit_options[2]; + void *jit_optvals[2]; + jit_options[0] = CU_JIT_CACHE_MODE; + jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA; + err = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals); + LOG_CU_RESULT(err); + + delete[] ptx; + + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex"); + cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx"); + 
cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx"); + + cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); + cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); + + cuStreamCreate(&stream, 0); + + ocu.dev = dev; + ocu.commandQueue = stream; + ocu.mod = mod; + ocu.ctxt = ctxt; + ocu.mem_pool.commandQueue = ocu.commandQueue; + + return ocu; +} + +ocu_args_d_t::ocu_args_d_t() + : dev(0) + , commandQueue(NULL) + , mod(NULL) + , ctxt(NULL) +{ + +} + +ocu_args_d_t::~ocu_args_d_t() +{ + cuModuleUnload(mod); + cuCtxDestroy(ctxt); + mem_pool.drain(); +} + +cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) +{ + return mem_pool.allocMem(s, init); +} + +void ocu_args_d_t::releaseMem(cu_mem mem) +{ + mem_pool.releaseMem(mem); +} + +ocu_channels ocu_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) +{ + const void *c[3] = { c0, c1, c2 }; + + ocu_channels img; + for (int i = 0; i < 3; i++) + { + img.ch[i] = allocMem(s, c[i]); + } + + return img; +} + +void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) +{ + for (int i = 0; i < 3; i++) + { + releaseMem(rgb.ch[i]); + rgb.ch[i] = NULL; + } +} + +const char* TranslateCUDAError(CUresult errorCode) +{ + switch (errorCode) + { + case CUDA_SUCCESS: return "CUDA_SUCCESS"; + case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE"; + case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY"; + case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED"; + case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED"; + case CUDA_ERROR_PROFILER_DISABLED: return 
"CUDA_ERROR_PROFILER_DISABLED"; + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE"; + case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE"; + case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE"; + case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT"; + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED"; + case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED"; + case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED"; + case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED"; + case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED"; + case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED"; + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE"; + case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; + case CUDA_ERROR_INVALID_PTX: return "CUDA_ERROR_INVALID_PTX"; + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; + // case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; + case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE"; + case CUDA_ERROR_FILE_NOT_FOUND: return 
"CUDA_ERROR_FILE_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM"; + case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE"; + case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND"; + case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY"; + case CUDA_ERROR_ILLEGAL_ADDRESS: return "CUDA_ERROR_ILLEGAL_ADDRESS"; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT"; + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + case CUDA_ERROR_ASSERT: return "CUDA_ERROR_ASSERT"; + case CUDA_ERROR_TOO_MANY_PEERS: return "CUDA_ERROR_TOO_MANY_PEERS"; + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + case CUDA_ERROR_HARDWARE_STACK_ERROR: return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + case CUDA_ERROR_ILLEGAL_INSTRUCTION: return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + case CUDA_ERROR_MISALIGNED_ADDRESS: return "CUDA_ERROR_MISALIGNED_ADDRESS"; + case CUDA_ERROR_INVALID_ADDRESS_SPACE: return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + case CUDA_ERROR_INVALID_PC: return "CUDA_ERROR_INVALID_PC"; + case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED"; + case CUDA_ERROR_NOT_PERMITTED: return 
"CUDA_ERROR_NOT_PERMITTED"; + case CUDA_ERROR_NOT_SUPPORTED: return "CUDA_ERROR_NOT_SUPPORTED"; + case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN"; + default: return "CUDA_ERROR_UNKNOWN"; + } +} +#endif \ No newline at end of file diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h new file mode 100644 index 00000000..93f675a3 --- /dev/null +++ b/clguetzli/ocu.h @@ -0,0 +1,42 @@ +/* +* CUDA Manager +* +* Author: strongtu@tencent.com +*/ +#pragma once + +#ifdef __USE_CUDA__ + +#include +#include "ocl.h" +#include "cumem_pool.h" + +#define LOG_CU_RESULT(e) if (CUDA_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateCUDAError((e)));} + +struct ocu_args_d_t; + +const char* TranslateCUDAError(CUresult errorCode); + +ocu_args_d_t& getOcu(void); + +struct ocu_args_d_t +{ + ocu_args_d_t(); + ~ocu_args_d_t(); + + cu_mem allocMem(size_t s, const void *init = NULL); + void releaseMem(cu_mem mem); + ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); + void releaseMemChannels(ocu_channels &rgb); + + CUfunction kernel[KERNEL_COUNT]; + CUstream commandQueue; + CUmodule mod; + CUcontext ctxt; + CUdevice dev; + cu_mem_pool_t mem_pool; +}; + + + +#endif \ No newline at end of file diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp new file mode 100644 index 00000000..da699406 --- /dev/null +++ b/clguetzli/utils.cpp @@ -0,0 +1,102 @@ +/***************************************************************************** + * Copyright (c) 2013-2016 Intel Corporation + * All rights reserved. + * + * WARRANTY DISCLAIMER + * + * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE + * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel Corporation is the author of the Materials, and requests that all + * problem reports or change requests be submitted to it directly + *****************************************************************************/ +#ifdef __USE_OPENCL__ + +#include +#include +#include +#include +#include "CL/cl.h" +#include "CL/cl_ext.h" +#include "utils.h" +#include + + +//we want to use POSIX functions +#pragma warning( push ) +#pragma warning( disable : 4996 ) + + +void LogInfo(const char* str, ...) +{ + if (str) + { + va_list args; + va_start(args, str); + + vfprintf(stdout, str, args); + + va_end(args); + } +} + +void LogError(const char* str, ...) 
+{ + if (str) + { + va_list args; + va_start(args, str); + + vfprintf(stderr, str, args); + + va_end(args); + } +} + +// Upload the OpenCL C source code to output argument source +// The memory resource is implicitly allocated in the function +// and should be deallocated by the caller +int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize) +{ + int errorCode = CL_SUCCESS; + + FILE* fp = NULL; +#ifdef __linux__ + fp = fopen(fileName, "rb"); +#else + fopen_s(&fp, fileName, "rb"); +#endif + if (fp == NULL) + { + LogError("Error: Couldn't find program source file '%s'.\n", fileName); + errorCode = CL_INVALID_VALUE; + } + else { + fseek(fp, 0, SEEK_END); + *sourceSize = ftell(fp); + fseek(fp, 0, SEEK_SET); + + *source = new char[*sourceSize]; + if (*source == NULL) + { + LogError("Error: Couldn't allocate %d bytes for program source from file '%s'.\n", *sourceSize, fileName); + errorCode = CL_OUT_OF_HOST_MEMORY; + } + else { + fread(*source, 1, *sourceSize, fp); + } + } + return errorCode; +} +#pragma warning( pop ) + +#endif \ No newline at end of file diff --git a/clguetzli/utils.h b/clguetzli/utils.h new file mode 100644 index 00000000..71d8d7a1 --- /dev/null +++ b/clguetzli/utils.h @@ -0,0 +1,32 @@ +/***************************************************************************** + * Copyright (c) 2013-2016 Intel Corporation + * All rights reserved. + * + * WARRANTY DISCLAIMER + * + * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE + * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel Corporation is the author of the Materials, and requests that all + * problem reports or change requests be submitted to it directly + *****************************************************************************/ +#pragma once + +// Print useful information to the default output. Same usage as with printf +void LogInfo(const char* str, ...); + +// Print error notification to the default output. Same usage as with printf +void LogError(const char* str, ...); + +// Read OpenCL source code from fileName and store it in source. 
The number of read bytes returns in sourceSize +int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize); + diff --git a/compile.bat b/compile.bat new file mode 100644 index 00000000..1b98c758 --- /dev/null +++ b/compile.bat @@ -0,0 +1,12 @@ +@rem setupt windows var +call vcvars64.bat + +@echo %1 --machine 64 or 32 +@echo %2 -G + +set machine_num=%1% +set debug_opt=%2% + +if "%machine_num%" == "" set machine_num=64 + +nvcc -Xcompiler "/wd 4819" -I"./" -use_fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file diff --git a/compile.sh b/compile.sh new file mode 100644 index 00000000..eabb6473 --- /dev/null +++ b/compile.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +#Compile .cu file +echo $1 --machine 64 or 32 +echo $2 -G + +nvcc -D__USE_OPENCL__ -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu + +#copy to ./bin/Release +cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1 +cp clguetzli/clguetzli.cl bin/Release/clguetzli/clguetzli.cl +cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h diff --git a/guetzli.make b/guetzli.make index 7edeea3f..e16aa99b 100644 --- a/guetzli.make +++ b/guetzli.make @@ -16,7 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` @@ -43,7 +43,7 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` @@ -65,6 +65,15 @@ all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) endif OBJECTS := \ + $(OBJDIR)/clbutter_comparator.o \ + $(OBJDIR)/clguetzli.cl.o \ + $(OBJDIR)/clguetzli.o \ + $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/cuguetzli.o \ + $(OBJDIR)/cumem_pool.o \ + $(OBJDIR)/ocl.o \ + $(OBJDIR)/ocu.o \ + $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ $(OBJDIR)/debug_print.o \ @@ -143,6 +152,33 @@ $(GCH): $(PCH) $(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<" endif +$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocl.o: clguetzli/ocl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocu.o: clguetzli/ocu.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF 
"$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/utils.o: clguetzli/utils.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 5b7ffeb9..3a0eb72c 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -51,6 +51,7 @@ + @@ -78,6 +79,8 @@ obj\x86\Release\guetzli\ guetzli .exe + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) true @@ -92,6 +95,8 @@ obj\x86\Debug\guetzli\ guetzli .exe + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) @@ -103,6 +108,7 @@ true false true + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console @@ -110,18 +116,35 @@ true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 + + "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current -bo=" " + + + OpenCL Code Builder + + + false + + + + + + + NotUsing Level3 .;third_party\butteraugli;%(AdditionalIncludeDirectories) - Full + MaxSpeed true - true + false false true + %(PreprocessorDefinitions) Console @@ -129,7 +152,20 @@ true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + + + $(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86 + + + + + + CUDA CU + + + 3 + @@ -138,13 +174,16 @@ .;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 + @@ -153,15 +192,32 @@ 
.;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled + %(PreprocessorDefinitions) Console true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + + + $(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86 + + + 3 + + + + + + + + + + @@ -190,6 +246,15 @@ + + + + + + + + + @@ -212,7 +277,36 @@ + + + Document + + + false + + + Document + CUDA Code Builder + $(ProjectDir)compile.bat 64 + $(ProjectDir)compile.bat 32 + false + clguetzli\clguetzli.cu.ptx64 + $(ProjectDir)compile.bat 64 -G + CUDA Code Builder + clguetzli\clguetzli.cu.ptx64 + false + false + $(ProjectDir)compile.bat 32 -G + CUDA Code Builder + clguetzli\clguetzli.cu.ptx32 + CUDA Code Builder + clguetzli\clguetzli.cu.ptx32 + false + false + + + \ No newline at end of file diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index da2297c5..7e005105 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -13,6 +13,9 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} + + {64847a89-ca39-4556-ba0e-d6875c4d39ca} + @@ -93,6 +96,33 @@ third_party\butteraugli\butteraugli + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + @@ -158,5 +188,42 @@ third_party\butteraugli\butteraugli + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + + + clguetzli + + + + + clguetzli + \ No newline at end of file diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index ec964334..f0ce5eb4 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -22,6 +22,10 @@ #include "guetzli/gamma_correct.h" #include "guetzli/score.h" +#include "clguetzli/ocu.h" +#include "clguetzli/clguetzli.h" +#include "clguetzli/cuguetzli.h" + namespace guetzli { std::vector > ComputeOpsinDynamicsImage( @@ -107,7 +111,9 @@ void 
ButteraugliComparator::SwitchBlock(int block_x, int block_y, } double ButteraugliComparator::CompareBlock(const OutputImage& img, - int off_x, int off_y) const { + int off_x, int off_y, + const coeff_t* candidate_block, + const int comp_mask) const { int block_x = block_x_ * factor_x_ + off_x; int block_y = block_y_ * factor_y_ + off_y; int xmin = 8 * block_x; diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index f96d160f..08530a7e 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -20,6 +20,7 @@ #include #include "butteraugli/butteraugli.h" +#include "clguetzli/clbutter_comparator.h" #include "guetzli/comparator.h" #include "guetzli/jpeg_data.h" #include "guetzli/output_image.h" @@ -44,7 +45,7 @@ class ButteraugliComparator : public Comparator { int factor_x, int factor_y) override; double CompareBlock(const OutputImage& img, - int off_x, int off_y) const override; + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; double ScoreOutputSize(int size) const override; @@ -62,7 +63,7 @@ class ButteraugliComparator : public Comparator { int factor_y, const std::vector& distmap, std::vector* block_weight) override; - private: + protected: const int width_; const int height_; const float target_distance_; @@ -73,7 +74,7 @@ class ButteraugliComparator : public Comparator { int factor_y_; std::vector> mask_xyz_; std::vector>> per_block_pregamma_; - ::butteraugli::ButteraugliComparator comparator_; + ::butteraugli::clButteraugliComparator comparator_; float distance_; std::vector distmap_; ProcessStats* stats_; diff --git a/guetzli/comparator.h b/guetzli/comparator.h index 00c56977..061f9603 100644 --- a/guetzli/comparator.h +++ b/guetzli/comparator.h @@ -51,7 +51,7 @@ class Comparator { // the resulting per-block distance. The interpretation of the returned // distance depends on the comparator used. 
virtual double CompareBlock(const OutputImage& img, - int off_x, int off_y) const = 0; + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const = 0; // Returns the combined score of the output image in the last Compare() call // (or the baseline image, if Compare() was not called yet), based on output diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index fb6cd0a9..c972d391 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -28,6 +28,10 @@ #include "guetzli/processor.h" #include "guetzli/quality.h" #include "guetzli/stats.h" +#include "clguetzli/clguetzli.h" +#ifdef __USE_GPERFTOOLS__ +#include +#endif namespace { @@ -164,7 +168,9 @@ std::string ReadFileOrDie(const char* filename) { off_t buffer_size = 8192; if (fseek(f, 0, SEEK_END) == 0) { - buffer_size = std::max(ftell(f), 1); +// buffer_size = std::max(ftell(f), 1); + long size = ftell(f); + buffer_size = size > 0 ? size : 1; if (fseek(f, 0, SEEK_SET) != 0) { perror("fseek"); exit(1); @@ -223,6 +229,15 @@ void Usage() { " Default value is %d.\n" " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. 
Default limit is %d MB.\n" +#ifdef __USE_OPENCL__ + " --opencl - Use OpenCL\n" + " --checkcl - Check OpenCL result\n" +#endif + " --c - Use c opt version\n" +#ifdef __USE_CUDA__ + " --cuda - Use CUDA\n" + " --checkcuda - Check CUDA result\n" +#endif " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -230,6 +245,9 @@ void Usage() { } // namespace int main(int argc, char** argv) { +#ifdef __USE_GPERFTOOLS__ + ProfilerStart("guetzli.prof"); +#endif std::set_terminate(TerminateHandler); int verbose = 0; @@ -254,7 +272,28 @@ int main(int argc, char** argv) { memlimit_mb = atoi(argv[opt_idx]); } else if (!strcmp(argv[opt_idx], "--nomemlimit")) { memlimit_mb = -1; - } else if (!strcmp(argv[opt_idx], "--")) { + } +#ifdef __USE_OPENCL__ + else if (!strcmp(argv[opt_idx], "--opencl")) { + g_mathMode = MODE_OPENCL; + } + else if (!strcmp(argv[opt_idx], "--checkcl")) { + g_mathMode = MODE_CHECKCL; + } +#endif + else if (!strcmp(argv[opt_idx], "--c")) + { + g_mathMode = MODE_CPU_OPT; + } +#ifdef __USE_CUDA__ + else if (!strcmp(argv[opt_idx], "--cuda")) { + g_mathMode = MODE_CUDA; + } + else if (!strcmp(argv[opt_idx], "--checkcuda")) { + g_mathMode = MODE_CHECKCUDA; + } +#endif + else if (!strcmp(argv[opt_idx], "--")) { opt_idx++; break; } else { @@ -322,5 +361,8 @@ int main(int argc, char** argv) { } WriteFileOrDie(argv[opt_idx + 1], out_data); +#ifdef __USE_GPERFTOOLS__ + ProfilerStop(); +#endif return 0; } diff --git a/guetzli/jpeg_data_decoder.cc b/guetzli/jpeg_data_decoder.cc index 98f9f4cc..722d6663 100644 --- a/guetzli/jpeg_data_decoder.cc +++ b/guetzli/jpeg_data_decoder.cc @@ -43,9 +43,8 @@ bool HasYCbCrColorSpace(const JPEGData& jpg) { } std::vector DecodeJpegToRGB(const JPEGData& jpg) { - if (jpg.components.size() == 1 || - (jpg.components.size() == 3 && - HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444()))) { + if (jpg.components.size() == 3 && + HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444())) { 
OutputImage img(jpg.width, jpg.height); img.CopyFromJpegData(jpg); return img.ToSRGB(); diff --git a/guetzli/output_image.h b/guetzli/output_image.h index 1018eeac..9c9f935a 100644 --- a/guetzli/output_image.h +++ b/guetzli/output_image.h @@ -37,6 +37,8 @@ class OutputImageComponent { int width_in_blocks() const { return width_in_blocks_; } int height_in_blocks() const { return height_in_blocks_; } const coeff_t* coeffs() const { return &coeffs_[0]; } + const uint16_t* pixels() const { return &pixels_[0]; } + size_t pixels_size() const { return pixels_.size(); } const int* quant() const { return &quant_[0]; } bool IsAllZero() const; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 662653eb..2e8837dc 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -31,6 +31,11 @@ #include "guetzli/jpeg_data_writer.h" #include "guetzli/output_image.h" #include "guetzli/quantize.h" +#include "clguetzli/clguetzli.h" + +#ifdef __SUPPORT_FULL_JPEG__ +#include "jpeglib.h" +#endif namespace guetzli { @@ -38,10 +43,6 @@ namespace { static const size_t kBlockSize = 3 * kDCTBlockSize; -struct CoeffData { - int idx; - float block_err; -}; struct QuantData { int q[3][kDCTBlockSize]; size_t jpg_size; @@ -57,11 +58,21 @@ class Processor { void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, bool stop_early); + + void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, + const double target_mul, + bool stop_early, + std::vector &candidate_coeff_offsets, + std::vector& candidate_coeffs, + std::vector &candidate_coeff_errors); + void ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, const int factor_y, const uint8_t comp_mask, OutputImage* img, std::vector* output_order); + bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, int 
best_q[3][kDCTBlockSize], OutputImage* img); @@ -402,47 +413,55 @@ void Processor::ComputeBlockZeroingOrder( memcpy(processed_block, block, sizeof(processed_block)); comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); while (!input_order.empty()) { - float best_err = 1e17f; - int best_i = 0; - for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, - input_order.size()); - ++i) { - coeff_t candidate_block[kBlockSize]; - memcpy(candidate_block, processed_block, sizeof(candidate_block)); - const int idx = input_order[i].first; - candidate_block[idx] = 0; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &candidate_block[c * kDCTBlockSize]); - } - } - float max_err = 0; - for (int iy = 0; iy < factor_y; ++iy) { - for (int ix = 0; ix < factor_x; ++ix) { - int block_xx = block_x * factor_x + ix; - int block_yy = block_y * factor_y + iy; - if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { - float err = static_cast(comparator_->CompareBlock(*img, ix, iy)); - max_err = std::max(max_err, err); - } - } - } - if (max_err < best_err) { - best_err = max_err; - best_i = i; - } - } - int idx = input_order[best_i].first; - processed_block[idx] = 0; - input_order.erase(input_order.begin() + best_i); - output_order->push_back({idx, best_err}); - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &processed_block[c * kDCTBlockSize]); - } - } + float best_err = 1e17f; + int best_i = 0; + for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, + input_order.size()); + ++i) { + coeff_t candidate_block[kBlockSize]; + memcpy(candidate_block, processed_block, sizeof(candidate_block)); + const int idx = input_order[i].first; + candidate_block[idx] = 0; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + img->component(c).SetCoeffBlock( + block_x, block_y, &candidate_block[c * kDCTBlockSize]); + } + } + 
float max_err = 0; + for (int iy = 0; iy < factor_y; ++iy) { + for (int ix = 0; ix < factor_x; ++ix) { + int block_xx = block_x * factor_x + ix; + int block_yy = block_y * factor_y + iy; + if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { + float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask)); + max_err = std::max(max_err, err); + } + } + } + if (max_err < best_err) { + best_err = max_err; + best_i = i; + } + } + int idx = input_order[best_i].first; + processed_block[idx] = 0; + input_order.erase(input_order.begin() + best_i); + output_order->push_back({ idx, best_err }); + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + img->component(c).SetCoeffBlock( + block_x, block_y, &processed_block[c * kDCTBlockSize]); + } + } + if (MODE_CPU_OPT == g_mathMode) + { + if (best_err >= comparator_->BlockErrorLimit()) + { + // The input_order is an ascent vector, break when best_err exceed the error limit. + break; + } + } } // Make the block error values monotonic. 
float min_err = 1e10; @@ -536,58 +555,188 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace -void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, - const uint8_t comp_mask, - const double target_mul, - bool stop_early) { - const int width = img->width(); - const int height = img->height(); - const int ncomp = jpg.components.size(); - const int last_c = Log2FloorNonZero(comp_mask); - if (static_cast(last_c) >= jpg.components.size()) return; - const int factor_x = img->component(last_c).factor_x(); - const int factor_y = img->component(last_c).factor_y(); - const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); - const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); - const int num_blocks = block_width * block_height; - - std::vector candidate_coeff_offsets(num_blocks + 1); - std::vector candidate_coeffs; - std::vector candidate_coeff_errors; - candidate_coeffs.reserve(60 * num_blocks); - candidate_coeff_errors.reserve(60 * num_blocks); - std::vector block_order; - block_order.reserve(3 * kDCTBlockSize); - comparator_->StartBlockComparisons(); - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t block[kBlockSize] = { 0 }; - coeff_t orig_block[kBlockSize] = { 0 }; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - assert(img->component(c).factor_x() == factor_x); - assert(img->component(c).factor_y() == factor_y); - img->component(c).GetCoeffBlock(block_x, block_y, - &block[c * kDCTBlockSize]); - const JPEGComponent& comp = jpg.components[c]; - int jpg_block_ix = block_y * comp.width_in_blocks + block_x; - memcpy(&orig_block[c * kDCTBlockSize], - &comp.coeffs[jpg_block_ix * kDCTBlockSize], - kDCTBlockSize * sizeof(orig_block[0])); +void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, + const double target_mul, bool stop_early) +{ + 
const int width = img->width(); + const int height = img->height(); + const int ncomp = jpg.components.size(); + const int last_c = Log2FloorNonZero(comp_mask); + if (static_cast(last_c) >= jpg.components.size()) return; + const int factor_x = img->component(last_c).factor_x(); + const int factor_y = img->component(last_c).factor_y(); + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; + + + comparator_->StartBlockComparisons(); + + std::vector output_order_gpu; + std::vector output_order_cpu; + + CoeffData * output_order = NULL; + + if (MODE_OPENCL == g_mathMode || MODE_CUDA == g_mathMode) + { +#ifdef __USE_OPENCL__ + ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; + + channel_info orig_channel[3]; + channel_info mayout_channel[3]; + + for (int c = 0; c < 3; c++) + { + mayout_channel[c].factor = img->component(c).factor_x(); + mayout_channel[c].block_width = img->component(c).width_in_blocks(); + mayout_channel[c].block_height = img->component(c).height_in_blocks(); + mayout_channel[c].coeff = img->component(c).coeffs(); + mayout_channel[c].pixel = img->component(c).pixels(); + + orig_channel[c].factor = jpg.components[c].v_samp_factor; + orig_channel[c].block_width = jpg.components[c].width_in_blocks; + orig_channel[c].block_height = jpg.components[c].height_in_blocks; + orig_channel[c].coeff = jpg.components[c].coeffs.data(); } - } - block_order.clear(); - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, - factor_y, comp_mask, img, &block_order); - candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < block_order.size(); ++i) { - candidate_coeffs.push_back(block_order[i].idx); - candidate_coeff_errors.push_back(block_order[i].block_err); - } + output_order_gpu.resize(num_blocks * kBlockSize); + output_order = output_order_gpu.data(); + + if 
(MODE_OPENCL == g_mathMode) + { + clComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); + } +#endif +#ifdef __USE_CUDA__ + else + { + cuComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); + } +#endif } - } - comparator_->FinishBlockComparisons(); - candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); +#ifdef __USE_OPENCL__ + if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode) +#else + if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode) +#endif + { + output_order_cpu.resize(num_blocks * kBlockSize); + output_order = output_order_cpu.data(); + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + coeff_t block[kBlockSize] = { 0 }; + coeff_t orig_block[kBlockSize] = { 0 }; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + assert(img->component(c).factor_x() == factor_x); + assert(img->component(c).factor_y() == factor_y); + img->component(c).GetCoeffBlock(block_x, block_y, + &block[c * kDCTBlockSize]); + const JPEGComponent& comp = jpg.components[c]; + int jpg_block_ix = block_y * comp.width_in_blocks + block_x; + memcpy(&orig_block[c * kDCTBlockSize], + &comp.coeffs[jpg_block_ix * kDCTBlockSize], + kDCTBlockSize * sizeof(orig_block[0])); + } + } + + std::vector block_order; + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); + + CoeffData * p = &output_order_cpu[block_ix * kBlockSize]; + for (int i = 0; i < block_order.size(); i++) + { + p[i].idx = block_order[i].idx; + p[i].block_err = 
block_order[i].block_err; + } + } + } + } + +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode) + { + int count = 0; + int check_size = output_order_gpu.size(); + for (int i = 0; i < check_size; i++) + { + if (output_order_cpu[i].idx != output_order_gpu[i].idx || + fabs(output_order_cpu[i].block_err - output_order_gpu[i].block_err) > 0.001) + { + count++; + } + } + if (count > 0) + { + LogError("CHK %s(%d) %d:%d\r\n", "SelectFrequencyMasking", __LINE__, count, check_size); + } + } +#endif + + std::vector candidate_coeff_offsets(num_blocks + 1); + std::vector candidate_coeffs; + std::vector candidate_coeff_errors; + + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + CoeffData * p = &output_order[block_ix * kBlockSize]; + + candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); + for (int i = 0; i < kBlockSize; i++) + { + if (p[i].block_err > 0 && p[i].block_err <= comparator_->BlockErrorLimit()) + { + candidate_coeffs.push_back(p[i].idx); + candidate_coeff_errors.push_back(p[i].block_err); + } + } + } + } + + // + comparator_->FinishBlockComparisons(); + candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); + + SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early, + candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); + +} + +void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, + const double target_mul, + bool stop_early, + std::vector &candidate_coeff_offsets, + std::vector& candidate_coeffs, + std::vector &candidate_coeff_errors) +{ + const int ncomp = jpg.components.size(); + const int width = img->width(); + const int height = img->height(); + const int last_c = Log2FloorNonZero(comp_mask); + if (static_cast(last_c) >= jpg.components.size()) return; + const int factor_x = img->component(last_c).factor_x(); + const int factor_y = img->component(last_c).factor_y(); + 
const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; std::vector ac_histograms(ncomp); int jpg_header_size, dc_size; @@ -891,10 +1040,7 @@ bool Process(const Params& params, ProcessStats* stats, } std::vector rgb = DecodeJpegToRGB(jpg); if (rgb.empty()) { - fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported " - "downsampling mode).\nPlease provide the input image as " - "a PNG file.\n"); - return false; + return ProcessUnsupportedJpegData(params,stats,data,jpg_out); } GuetzliOutput out; ProcessStats dummy_stats; @@ -903,15 +1049,62 @@ bool Process(const Params& params, ProcessStats* stats, } std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { +#ifdef __USE_OPENCL__ comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, + new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); +#else + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); +#endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); *jpg_out = out.jpeg_data; return ok; } +bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats, + const std::string& data, + std::string* jpg_out) { +#ifdef __SUPPORT_FULL_JPEG__ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + jpeg_mem_src(&cinfo, (unsigned char*)data.c_str(), data.length()); + + int rc = jpeg_read_header(&cinfo, TRUE); + if (rc != 1) { + fprintf(stderr, "File does not seem to be a normal JPEG\n"); + exit(EXIT_FAILURE); + } + + cinfo.out_color_space = JCS_RGB; //force RGB output + jpeg_start_decompress(&cinfo); + int xsize = cinfo.output_width; + int ysize = cinfo.output_height; + int pixel_size = cinfo.output_components; + unsigned long 
bmp_size = xsize * ysize * pixel_size; + unsigned char *bmp_buffer = (unsigned char*)malloc(bmp_size); + int row_stride = cinfo.output_width * cinfo.output_components; + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray) + ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, 1); + while (cinfo.output_scanline < cinfo.output_height) { + unsigned char *buffer_array[1]; + buffer_array[0] = bmp_buffer + (cinfo.output_scanline) * row_stride; + jpeg_read_scanlines(&cinfo, buffer_array, 1); + } + std::vector temp_rgb(bmp_buffer, bmp_buffer + bmp_size); + return Process(params, stats, temp_rgb, xsize, ysize, jpg_out); +#else + fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported " + "downsampling mode).\nPlease provide the input image as " + "a PNG file.\n"); + return false; +#endif +} + bool Process(const Params& params, ProcessStats* stats, const std::vector& rgb, int w, int h, std::string* jpg_out) { @@ -927,9 +1120,15 @@ bool Process(const Params& params, ProcessStats* stats, } std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { +#ifdef __USE_OPENCL__ comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, + new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); +#else + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); +#endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); *jpg_out = out.jpeg_data; diff --git a/guetzli/processor.h b/guetzli/processor.h index 2c543a25..e6cf4ba8 100644 --- a/guetzli/processor.h +++ b/guetzli/processor.h @@ -26,6 +26,11 @@ namespace guetzli { +struct CoeffData { + int idx; + float block_err; +}; + struct Params { float butteraugli_target = 1.0; bool clear_metadata = true; @@ -48,6 +53,9 @@ struct GuetzliOutput { bool ProcessJpegData(const Params& params, const JPEGData& jpg_in, Comparator* comparator, GuetzliOutput* out, ProcessStats* stats); +bool 
ProcessUnsupportedJpegData(const Params& params, + ProcessStats* stats, const std::string& data, + std::string* jpg_out); // Sets *out to a jpeg encoded string that will decode to an image that is // visually indistinguishable from the input rgb image. diff --git a/guetzli_static.make b/guetzli_static.make index d20fb77d..9fe7bf05 100644 --- a/guetzli_static.make +++ b/guetzli_static.make @@ -16,7 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Release/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -43,7 +43,7 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Debug/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -65,6 +65,15 @@ all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) endif OBJECTS := \ + $(OBJDIR)/clbutter_comparator.o \ + $(OBJDIR)/clguetzli.cl.o \ + $(OBJDIR)/clguetzli.o \ + $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/cuguetzli.o \ + $(OBJDIR)/cumem_pool.o \ + $(OBJDIR)/ocl.o \ + $(OBJDIR)/ocu.o \ + $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ $(OBJDIR)/debug_print.o \ @@ -142,6 +151,33 @@ $(GCH): $(PCH) $(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<" endif +$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocl.o: clguetzli/ocl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocu.o: clguetzli/ocu.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) 
$(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/utils.o: clguetzli/utils.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 02e6b436..3c3bd850 100644 --- a/guetzli_static.vcxproj +++ b/guetzli_static.vcxproj @@ -93,7 +93,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) Full true true @@ -110,7 +110,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) Full true true @@ -127,7 +127,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -140,7 +140,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters index ec134ccc..94654c91 100644 --- a/guetzli_static.vcxproj.filters +++ b/guetzli_static.vcxproj.filters @@ -1,4 +1,4 @@ - + diff --git a/premake5.lua b/premake5.lua index 1a109d7a..cc41301b 100644 --- a/premake5.lua +++ b/premake5.lua @@ -2,7 +2,8 @@ 
workspace "guetzli" configurations { "Release", "Debug" } language "C++" flags { "C++11" } - includedirs { ".", "third_party/butteraugli" } + includedirs { ".", "third_party/butteraugli", "clguetzli" } + libdirs {} filter "action:vs*" platforms { "x86_64", "x86" } @@ -29,7 +30,9 @@ workspace "guetzli" "guetzli/*.cc", "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", - "third_party/butteraugli/butteraugli/butteraugli.h" + "third_party/butteraugli/butteraugli/butteraugli.h", + "clguetzli/*.cpp", + "clguetzli/*.h" } removefiles "guetzli/guetzli.cc" filter "action:gmake" @@ -39,8 +42,10 @@ workspace "guetzli" project "guetzli" kind "ConsoleApp" filter "action:gmake" + --defines { "__USE_OPENCL__", "__USE_CUDA__", "__SUPPORT_FULL_JPEG__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } + --links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" } filter "action:vs*" links { "shlwapi" } filter {} @@ -49,5 +54,7 @@ workspace "guetzli" "guetzli/*.cc", "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", - "third_party/butteraugli/butteraugli/butteraugli.h" + "third_party/butteraugli/butteraugli/butteraugli.h", + "clguetzli/*.cpp", + "clguetzli/*.h" } diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 4cdc29bb..c32f226c 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -40,6 +40,12 @@ #include #include +#ifdef __USE_OPENCL__ +#include "clguetzli/clbutter_comparator.h" +#include "clguetzli/clguetzli.h" +#include "clguetzli/clguetzli_test.h" +#endif + // Restricted pointers speed up Convolution(); MSVC uses a different keyword. 
#ifdef _MSC_VER #define __restrict__ __restrict @@ -59,7 +65,7 @@ inline double DotProduct(const float u[3], const double v[3]) { } // Computes a horizontal convolution and transposes the result. -static void Convolution(size_t xsize, size_t ysize, +void _Convolution(size_t xsize, size_t ysize, size_t xstep, size_t len, size_t offset, const float* __restrict__ multipliers, @@ -91,7 +97,7 @@ static void Convolution(size_t xsize, size_t ysize, } } -void Blur(size_t xsize, size_t ysize, float* channel, double sigma, +void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. @@ -108,17 +114,28 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, int dxsize = (xsize + xstep - 1) / xstep; int dysize = (ysize + ystep - 1) / ystep; std::vector tmp(dxsize * ysize); +#ifdef __USE_OPENCL__ Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, border_ratio, tmp.data()); +#else + _Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); +#endif float* output = channel; std::vector downsampled_output; if (xstep > 1) { downsampled_output.resize(dxsize * dysize); output = downsampled_output.data(); } +#ifdef __USE_OPENCL__ Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), border_ratio, output); +#else + _Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); +#endif if (xstep > 1) { for (size_t y = 0; y < ysize; y++) { for (size_t x = 0; x < xsize; x++) { @@ -771,7 +788,7 @@ ButteraugliComparator::ButteraugliComparator( assert(step <= 4); } -void MaskHighIntensityChange( +void _MaskHighIntensityChange( size_t xsize, size_t ysize, const std::vector > &c0, const std::vector > &c1, @@ -923,7 +940,7 @@ static inline double Gamma(double v) { return GammaPolynomial(static_cast(v)); } -void OpsinDynamicsImage(size_t xsize, size_t 
ysize, +void _OpsinDynamicsImage(size_t xsize, size_t ysize, std::vector > &rgb) { PROFILER_FUNC; std::vector > blurred = rgb; @@ -956,7 +973,7 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, } } -static void ScaleImage(double scale, std::vector *result) { +void _ScaleImage(double scale, std::vector *result) { PROFILER_FUNC; for (size_t i = 0; i < result->size(); ++i) { (*result)[i] *= static_cast(scale); @@ -965,7 +982,7 @@ static void ScaleImage(double scale, std::vector *result) { // Making a cluster of local errors to be more impactful than // just a single error. -void CalculateDiffmap(const size_t xsize, const size_t ysize, +void _CalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, std::vector* diffmap) { PROFILER_FUNC; @@ -1018,7 +1035,11 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize, += static_cast(mul1) * blurred[y * (xsize - s) + x]; } } +#ifdef __USE_OPENCL__ ScaleImage(scale, diffmap); +#else + _ScaleImage(scale, diffmap); +#endif } } @@ -1050,7 +1071,11 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, &result); } +#ifdef __USE_OPENCL__ CalculateDiffmap(xsize_, ysize_, step_, &result); +#else + _CalculateDiffmap(xsize_, ysize_, step_, &result); +#endif } void ButteraugliComparator::BlockDiffMap( @@ -1304,8 +1329,8 @@ double MaskDcB(double delta) { // square_size square with coordinates // x - offset .. x + square_size - offset - 1, // y - offset .. y + square_size - offset - 1. -void MinSquareVal(size_t square_size, size_t offset, - size_t xsize, size_t ysize, +void _MinSquareVal(size_t square_size, size_t offset, + size_t xsize, size_t ysize, float *values) { PROFILER_FUNC; // offset is not negative and smaller than square_size. @@ -1315,9 +1340,19 @@ void MinSquareVal(size_t square_size, size_t offset, const size_t minh = offset > y ? 
0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); for (size_t x = 0; x < xsize; ++x) { +#ifdef __USE_C__ + float min = values[x + minh * xsize]; +#else double min = values[x + minh * xsize]; +#endif for (size_t j = minh + 1; j < maxh; ++j) { +#ifdef __USE_C__ + float tmpf = values[x + j * xsize]; + if (tmpf < min) min = tmpf; +#else min = fmin(min, values[x + j * xsize]); +#endif + } tmp[x + y * xsize] = static_cast(min); } @@ -1328,7 +1363,12 @@ void MinSquareVal(size_t square_size, size_t offset, for (size_t y = 0; y < ysize; ++y) { double min = tmp[minw + y * xsize]; for (size_t j = minw + 1; j < maxw; ++j) { +#ifdef __USE_C__ + float tmpf = tmp[j + y * xsize]; + if (tmpf < min) min = tmpf; +#else min = fmin(min, tmp[j + y * xsize]); +#endif } values[x + y * xsize] = static_cast(min); } @@ -1336,7 +1376,7 @@ void MinSquareVal(size_t square_size, size_t offset, } // ===== Functions used by Mask only ===== -void Average5x5(int xsize, int ysize, std::vector* diffs) { +void _Average5x5(int xsize, int ysize, std::vector* diffs) { PROFILER_FUNC; if (xsize < 4 || ysize < 4) { // TODO: Make this work for small dimensions as well. 
@@ -1347,7 +1387,11 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { std::vector result = *diffs; std::vector tmp0 = *diffs; std::vector tmp1 = *diffs; +#ifdef __USE_OPENCL__ ScaleImage(w, &tmp1); +#else + _ScaleImage(w, &tmp1); +#endif for (int y = 0; y < ysize; y++) { const int row0 = y * xsize; result[row0 + 1] += tmp0[row0]; @@ -1386,10 +1430,14 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { } } *diffs = result; +#ifdef __USE_OPENCL__ ScaleImage(scale, diffs); +#else + _ScaleImage(scale, diffs); +#endif } -void DiffPrecompute( +void _DiffPrecompute( const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, @@ -1444,7 +1492,7 @@ void DiffPrecompute( } } -void Mask(const std::vector > &xyb0, +void _Mask(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask, @@ -1454,6 +1502,7 @@ void Mask(const std::vector > &xyb0, for (int i = 0; i < 3; ++i) { (*mask)[i].resize(xsize * ysize); } +#ifdef __USE_OPENCL__ DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); for (int i = 0; i < 3; ++i) { Average5x5(xsize, ysize, &(*mask)[i]); @@ -1465,6 +1514,19 @@ void Mask(const std::vector > &xyb0, }; Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); } +#else + _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + for (int i = 0; i < 3; ++i) { + _Average5x5(xsize, ysize, &(*mask)[i]); + _MinSquareVal(4, 0, xsize, ysize, (*mask)[i].data()); + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + _Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); + } +#endif static const double w00 = 232.206464018; static const double w11 = 22.9455222245; static const double w22 = 503.962310606; @@ -1491,10 +1553,17 @@ void Mask(const std::vector > &xyb0, (*mask_dc)[2][idx] = static_cast(MaskDcB(p2)); } } +#ifdef __USE_OPENCL__ for (int i = 0; i < 3; ++i) { ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); ScaleImage(kGlobalScale * kGlobalScale, 
&(*mask_dc)[i]); } +#else + for (int i = 0; i < 3; ++i) { + _ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); + _ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); + } +#endif } } // namespace butteraugli diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h index a79cefb2..547fdc58 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.h +++ b/third_party/butteraugli/butteraugli/butteraugli.h @@ -45,33 +45,34 @@ class ButteraugliComparator { // Computes the butteraugli map between xyb0 and xyb1 and updates result. // Both xyb0 and xyb1 are in opsin-dynamics space. - // NOTE: The xyb0 and xyb1 images are mutated by this function in-place. - void DiffmapOpsinDynamicsImage(std::vector> &xyb0, +// NOTE: The xyb0 and xyb1 images are mutated by this function in-place. + virtual void DiffmapOpsinDynamicsImage(std::vector> &xyb0, std::vector> &xyb1, std::vector &result); - - private: - void BlockDiffMap(const std::vector > &rgb0, + int step() { return step_;} + protected: + virtual void BlockDiffMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_dc, std::vector* block_diff_ac); - void EdgeDetectorMap(const std::vector > &rgb0, + virtual void EdgeDetectorMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* edge_detector_map); - void EdgeDetectorLowFreq(const std::vector > &rgb0, + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_ac); - void CombineChannels(const std::vector >& scale_xyb, + virtual void CombineChannels(const std::vector >& scale_xyb, const std::vector >& scale_xyb_dc, const std::vector& block_diff_dc, const std::vector& block_diff_ac, const std::vector& edge_detector_map, std::vector* result); +protected: const size_t xsize_; const size_t ysize_; const size_t num_pixels_;