diff --git a/.gitignore b/.gitignore index dd10da52..0cc93f06 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ ipch/ *.cachefile *.VC.db *.VC.VC.opendb +guetzli.vcxproj.user +clguetzli/clguetzli.cu.ptx* diff --git a/.travis.sh b/.travis.sh index a30f38e5..905889ff 100755 --- a/.travis.sh +++ b/.travis.sh @@ -14,6 +14,7 @@ case "$1" in "bazel") case "${TRAVIS_OS_NAME}" in "linux") + sudo apt-get remove oracle-java9-installer wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel_0.4.5-linux-x86_64.deb echo 'b494d0a413e4703b6cd5312403bea4d92246d6425b3be68c9bfbeb8cc4db8a55 bazel_0.4.5-linux-x86_64.deb' | sha256sum -c --strict || exit 1 sudo dpkg -i bazel_0.4.5-linux-x86_64.deb diff --git a/.travis.yml b/.travis.yml index 39e1caaa..85db2b53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,8 @@ matrix: packages: - wget - libjpeg-progs + - netpbm + - oracle-java8-installer - os: osx env: BUILD_SYSTEM=bazel @@ -29,6 +31,7 @@ matrix: - libpng-dev - pkg-config - libjpeg-progs + - netpbm - os: osx env: BUILD_SYSTEM=make diff --git a/BUILD b/BUILD index 05bfc0da..c88d3890 100644 --- a/BUILD +++ b/BUILD @@ -8,6 +8,9 @@ cc_library( "guetzli/*.h", "guetzli/*.cc", "guetzli/*.inc", + "clguetzli/*.cpp", + "clguetzli/*.h", + "clguetzli/*.hpp" ], exclude = ["guetzli/guetzli.cc"], ), diff --git a/README.md b/README.md index 2ecd1072..37fa4267 100644 --- a/README.md +++ b/README.md @@ -99,3 +99,59 @@ attempts made. Please note that JPEG images do not support alpha channel (transparency). If the input is a PNG with an alpha channel, it will be overlaid on black background before encoding. + +# Extra features + +**Note:** Please make sure that you can build guetzli successfully before adding the following features. + +## Enable CUDA/OpenCL support + +**Note:** Before adding [CUDA](https://developer.nvidia.com/cuda-zone) support, please [check](http://developer.nvidia.com/cuda-gpus) whether your GPU support CUDA or not. 
+ +**Note:** If you don't have an NVIDIA card that support CUDA, you can try [OpenCL](https://www.khronos.org/opencl/) instead. You can install any of the OpenCL SDKs, such as [Intel OpenCL SDK](https://software.intel.com/en-us/intel-opencl), [AMD OpenCL SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/), etc. + +**Note:** The steps for adding OpenCL support is very similar with adding CUDA support, so the following introduction will be only for CUDA. + +### On POSIX systems +1. Follow the [Installation Guide for Linux ](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Linux-pdf) to setup [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). +2. Edit `premake5.lua`, add `defines { "__USE_OPENCL__", "__USE_CUDA__" }` and `links { "OpenCL", "cuda" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. +3. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` at first line. +4. Run `make` and expect the binary to be created in `bin/Release/guetzli`. +5. Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`. + +### On Windows +1. Follow the [Installation Guide for Microsoft Windows](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Windows-pdf) to setup `CUDA Toolkit`. +2. Copy `\VC\bin\amd64\vcvars64.bat` as `\vcvars64.bat` +3. Open the Visual Studio project and edit the project `Property Pages` as follows: + * Add `__USE_OPENCL__` and `__USE_CUDA__` to preprocessor definitions. + * Add `OpenCL.lib` and `cuda.lib` to additional dependencies. + * Add `$(CUDA_PATH)\include` to include directories. + * Add `$(CUDA_PATH)\lib\Win32` or `$(CUDA_PATH)\lib\x64` to library directories. +4. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` at first line. +5. Build it. 
+ +### Usage +```bash +guetzli [--c|--cuda|--opencl] [other options] original.png output.jpg +guetzli [--c|--cuda|--opencl] [other options] original.jpg output.jpg +``` +You can pass a `--c` parameter to enable the procedure optimization or `--cuda` parameter to use the CUDA acceleration or `--opencl` to use the OpenCL acceleration. + +If you have any question about CUDA/OpenCL support, please contact strongtu@tencent.com, ianhuang@tencent.com or chriskzhou@tencent.com. + +## Enable full JPEG format support +### On POSIX systems +1. Install [libjpeg](http://libjpeg.sourceforge.net/). + If using your operating system + package manager, install development versions of the packages if the + distinction exists. + * On Ubuntu, do `apt-get install libjpeg8-dev`. + * On Fedora, do `dnf install libjpeg-devel`. + * On Arch Linux, do `pacman -S libjpeg`. + * On Alpine Linux, do `apk add libjpeg`. +2. Edit `premake5.lua`, add `defines {"__SUPPORT_FULL_JPEG__"}` and `links { "jpeg" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile. +3. Run `make` and expect the binary to be created in `bin/Release/guetzli` +### On Windows +1. Install `libjpeg-turbo` using vcpkg: `.\vcpkg install libjpeg-turbo` +2. Open the Visual Studio project and add `__SUPPORT_FULL_JPEG__` to preprocessor definitions in the project `Property Pages`. +3. Build it. 
\ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml index 061ab6d0..97acb3ac 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,7 +15,7 @@ install: - premake5.exe %TOOLSET% - git clone https://github.com/Microsoft/vcpkg - md vcpkg\downloads\nuget-3.5.0 - - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe + - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/v3.5.0/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe - appveyor DownloadFile https://cmake.org/files/v3.8/cmake-3.8.0-rc1-win32-x86.zip -FileName %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - 7z x %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip - cd vcpkg diff --git a/clguetzli/cl.hpp b/clguetzli/cl.hpp new file mode 100644 index 00000000..a7043b50 --- /dev/null +++ b/clguetzli/cl.hpp @@ -0,0 +1,322 @@ +#pragma once + +#ifdef __USE_OPENCL__ + +template +inline void clSetKernelArgK(cl_kernel k, int idx, T* t) +{ + clSetKernelArg(k, idx, sizeof(T), t); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, int* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, const int* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, size_t* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template<> +inline void clSetKernelArgK(cl_kernel k, int idx, const size_t* t) +{ + cl_int c = *t; + clSetKernelArg(k, idx, sizeof(cl_int), &c); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0) +{ + clSetKernelArgK(k, 0, t0); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1) +{ + clSetKernelArgK(k, 1, t1); + clSetKernelArgEx(k, t0); +} + +template +inline void 
clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2) +{ + clSetKernelArgK(k, 2, t2); + clSetKernelArgEx(k, t0, t1); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3) +{ + clSetKernelArgK(k, 3, t3); + clSetKernelArgEx(k, t0, t1, t2); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) +{ + clSetKernelArgK(k, 4, t4); + clSetKernelArgEx(k, t0, t1, t2, t3); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) +{ + clSetKernelArgK(k, 5, t5); + clSetKernelArgEx(k, t0, t1, t2, t3, t4); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) +{ + clSetKernelArgK(k, 6, t6); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) +{ + clSetKernelArgK(k, 7, t7); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) +{ + clSetKernelArgK(k, 8, t8); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7); +} + +template +inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) +{ + clSetKernelArgK(k, 9, t9); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10) +{ + clSetKernelArgK(k, 10, t10); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10, T11* t11) +{ + clSetKernelArgK(k, 11, t11); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10); +} + +template + inline 
void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12) +{ + clSetKernelArgK(k, 12, t12); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13) +{ + clSetKernelArgK(k, 13, t13); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12); +} + +template + inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, + T14* t14) +{ + clSetKernelArgK(k, 14, t14); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15) +{ + clSetKernelArgK(k, 15, t15); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16) +{ + clSetKernelArgK(k, 16, t16); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15); +} 
+ +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17) +{ + clSetKernelArgK(k, 17, t17); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18) +{ + clSetKernelArgK(k, 18, t18); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19) +{ + clSetKernelArgK(k, 19, t19); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + 
typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20) +{ + clSetKernelArgK(k, 20, t20); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21) +{ + clSetKernelArgK(k, 21, t21); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22) +{ + clSetKernelArgK(k, 22, t22); + 
clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23> + inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22, T23* t23) +{ + clSetKernelArgK(k, 23, t23); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, + typename T5, typename T6, typename T7, typename T8, typename T9, + typename T10, typename T11, typename T12, typename T13, typename T14, + typename T15, typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, typename T24> +inline void clSetKernelArgEx(cl_kernel k, + T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, + T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, + T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, + T15* t15, T16* t16, T17* t17, T18* t18, T19* t19, + T20* t20, T21* t21, T22* t22, T23* t23, T24* t24) +{ + clSetKernelArgK(k, 24, t24); + clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23); +} + +#endif // __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp new file mode 100644 index 00000000..d91055d5 --- /dev/null +++ b/clguetzli/clbutter_comparator.cpp @@ -0,0 +1,1813 @@ +/* +* 
OpenCL/CUDA edition implementation of butter_comparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "clbutter_comparator.h" +#include "clguetzli.h" +#include "clguetzli_test.h" + +#include +#include + +namespace butteraugli { + +static const float kInternalGoodQualityThreshold = 14.921561160295326; +static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +inline float DotProductOpt(const float u[3], const float v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} + +// Computes a horizontal convolution and transposes the result. +void ConvolutionOpt(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) { + PROFILER_FUNC; + float weight_no_border = 0; + for (size_t j = 0; j <= 2 * offset; ++j) { + weight_no_border += multipliers[j]; + } + for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { + int minx = x < offset ? 0 : x - offset; + int maxx = std::min(xsize, x + len - offset) - 1; + float weight = 0.0; + for (int j = minx; j <= maxx; ++j) { + weight += multipliers[j - x + offset]; + } + // Interpolate linearly between the no-border scaling and border scaling. + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + for (size_t y = 0; y < ysize; ++y) { + float sum = 0.0; + for (int j = minx; j <= maxx; ++j) { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + result[ox * ysize + y] = static_cast(sum * scale); + } + } +} + +void BlurOpt(size_t xsize, size_t ysize, float* channel, float sigma, + float border_ratio) { + PROFILER_FUNC; + float m = 2.25; // Accuracy increases when m is increased. 
+ const float scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + const int xstep = std::max(1, int(sigma / 3)); + const int ystep = xstep; + int dxsize = (xsize + xstep - 1) / xstep; + int dysize = (ysize + ystep - 1) / ystep; + std::vector tmp(dxsize * ysize); + ConvolutionOpt(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); + float* output = channel; + std::vector downsampled_output; + if (xstep > 1) { + downsampled_output.resize(dxsize * dysize); + output = downsampled_output.data(); + } + ConvolutionOpt(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); + if (xstep > 1) { + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + // TODO: Use correct rounding. + channel[y * xsize + x] = + downsampled_output[(y / ystep) * dxsize + (x / xstep)]; + } + } + } +} + +// To change this to n, add the relevant FFTn function and kFFTnMapIndexTable. +constexpr size_t kBlockEdge = 8; +constexpr size_t kBlockSize = kBlockEdge * kBlockEdge; +constexpr size_t kBlockEdgeHalf = kBlockEdge / 2; +constexpr size_t kBlockHalf = kBlockEdge * kBlockEdgeHalf; + +// Contrast sensitivity related weights. 
+static const float *GetContrastSensitivityMatrixOpt() { + static float csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, + }; + return &csf8x8[0]; +} + +std::array MakeHighFreqColorDiffDxOpt() { + std::array lut; + static const float off = 11.38708334481672; + static const float inc = 14.550189611520716; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetHighFreqColorDiffDxOpt() { + static const std::array kLut = MakeHighFreqColorDiffDxOpt(); + return kLut.data(); +} + +std::array MakeHighFreqColorDiffDyOpt() { + std::array lut; + static const float off = 1.4103373714040413; + static const float inc = 0.7084088867024; + lut[0] = 0.0; + lut[1] = off; + for (int i = 2; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetHighFreqColorDiffDyOpt() { + static const std::array kLut = MakeHighFreqColorDiffDyOpt(); + return kLut.data(); +} + +std::array MakeLowFreqColorDiffDyOpt() { + std::array lut; + static const float inc = 5.2511644570349185; + lut[0] = 0.0; + for (int i = 1; i < 21; ++i) { + lut[i] = lut[i - 1] + inc; + } + return lut; +} + +const float *GetLowFreqColorDiffDyOpt() { + static const std::array kLut = MakeLowFreqColorDiffDyOpt(); + return kLut.data(); +} + +inline float InterpolateOpt(const float *array, int size, float sx) { + float ix = fabs(sx); + assert(ix < 10000); + int baseix = 
static_cast(ix); + float res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + float mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} + +inline float InterpolateClampNegativeOpt(const float *array, + int size, float sx) { + if (sx < 0) { + sx = 0; + } + float ix = fabs(sx); + int baseix = static_cast(ix); + float res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + float mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +void RgbToXybOpt(float r, float g, float b, + float *valx, float *valy, float *valz) { + static const float a0 = 1.01611726948; + static const float a1 = 0.982482243696; + static const float a2 = 1.43571362627; + static const float a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} + +static inline void XybToValsOpt(float x, float y, float z, + float *valx, float *valy, float *valz) { + static const float xmul = 0.758304045695; + static const float ymul = 2.28148649801; + static const float zmul = 1.87816926918; + *valx = InterpolateOpt(GetHighFreqColorDiffDxOpt(), 21, x * xmul); + *valy = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y * ymul); + *valz = zmul * z; +} + +// Rough psychovisual distance to gray for low frequency colors. 
+static void XybLowFreqToValsOpt(float x, float y, float z, + float *valx, float *valy, float *valz) { + static const float xmul = 6.64482198135; + static const float ymul = 0.837846224276; + static const float zmul = 7.34905756986; + static const float y_to_z_mul = 0.0812519812628; + z += y_to_z_mul * y; + *valz = z * zmul; + *valx = x * xmul; + *valy = InterpolateOpt(GetLowFreqColorDiffDyOpt(), 21, y * ymul); +} + +float RemoveRangeAroundZeroOpt(float v, float range) { + if (v >= -range && v < range) { + return 0; + } + if (v < 0) { + return v + range; + } + else { + return v - range; + } +} + +void XybDiffLowFreqSquaredAccumulateOpt(float r0, float g0, float b0, + float r1, float g1, float b1, + float factor, float res[3]) { + float valx0, valy0, valz0; + float valx1, valy1, valz1; + XybLowFreqToValsOpt(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToValsOpt(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. + float valx = valx0 - valx1; + float valy = valy0 - valy1; + float valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} + +struct ComplexOpt { +public: + float real; + float imag; +}; + +inline float abssq(const ComplexOpt& c) { + return c.real * c.real + c.imag * c.imag; +} + +static void TransposeBlock(ComplexOpt data[kBlockSize]) { + for (int i = 0; i < kBlockEdge; i++) { + for (int j = 0; j < i; j++) { + std::swap(data[kBlockEdge * i + j], data[kBlockEdge * j + i]); + } + } +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. 
+inline void FFT4Opt(ComplexOpt* a) { + float t1, t2, t3, t4, t5, t6, t7, t8; + t5 = a[2].real; + t1 = a[0].real - t5; + t7 = a[3].real; + t5 += a[0].real; + t3 = a[1].real - t7; + t7 += a[1].real; + t8 = t5 + t7; + a[0].real = t8; + t5 -= t7; + a[1].real = t5; + t6 = a[2].imag; + t2 = a[0].imag - t6; + t6 += a[0].imag; + t5 = a[3].imag; + a[2].imag = t2 + t3; + t2 -= t3; + a[3].imag = t2; + t4 = a[1].imag - t5; + a[3].real = t1 + t4; + t1 -= t4; + a[2].real = t1; + t5 += a[1].imag; + a[0].imag = t6 + t5; + t6 -= t5; + a[1].imag = t6; +} + +static const float kSqrtHalf = 0.70710678118654752440084436210484903; + +// D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. +void FFT8OptOpt(ComplexOpt* a) { + float t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = a[4].imag; + t4 = a[0].imag - t7; + t7 += a[0].imag; + a[0].imag = t7; + + t8 = a[6].real; + t5 = a[2].real - t8; + t8 += a[2].real; + a[2].real = t8; + + t7 = a[6].imag; + a[6].imag = t4 - t5; + t4 += t5; + a[4].imag = t4; + + t6 = a[2].imag - t7; + t7 += a[2].imag; + a[2].imag = t7; + + t8 = a[4].real; + t3 = a[0].real - t8; + t8 += a[0].real; + a[0].real = t8; + + a[4].real = t3 - t6; + t3 += t6; + a[6].real = t3; + + t7 = a[5].real; + t3 = a[1].real - t7; + t7 += a[1].real; + a[1].real = t7; + + t8 = a[7].imag; + t6 = a[3].imag - t8; + t8 += a[3].imag; + a[3].imag = t8; + t1 = t3 - t6; + t3 += t6; + + t7 = a[5].imag; + t4 = a[1].imag - t7; + t7 += a[1].imag; + a[1].imag = t7; + + t8 = a[7].real; + t5 = a[3].real - t8; + t8 += a[3].real; + a[3].real = t8; + + t2 = t4 - t5; + t4 += t5; + + t6 = t1 - t4; + t8 = kSqrtHalf; + t6 *= t8; + a[5].real = a[4].real - t6; + t1 += t4; + t1 *= t8; + a[5].imag = a[4].imag - t1; + t6 += a[4].real; + a[4].real = t6; + t1 += a[4].imag; + a[4].imag = t1; + + t5 = t2 - t3; + t5 *= t8; + a[7].imag = a[6].imag - t5; + t2 += t3; + t2 *= t8; + a[7].real = a[6].real - t2; + t2 += a[6].real; + a[6].real = t2; + t5 += a[6].imag; + a[6].imag = t5; + + FFT4Opt(a); + + // Reorder to 
the correct output order. + // TODO: Modify the above computation so that this is not needed. + ComplexOpt tmp = a[2]; + a[2] = a[3]; + a[3] = a[5]; + a[5] = a[7]; + a[7] = a[4]; + a[4] = a[1]; + a[1] = a[6]; + a[6] = tmp; +} + +// Same as FFT8, but all inputs are real. +// TODO: Since this does not need to be in-place, maybe there is a +// faster FFT than this one, which is derived from DJB's in-place complex FFT. +void RealFFT8Opt(const float* in, ComplexOpt* out) { + float t1, t2, t3, t5, t6, t7, t8; + t8 = in[6]; + t5 = in[2] - t8; + t8 += in[2]; + out[2].real = t8; + out[6].imag = -t5; + out[4].imag = t5; + t8 = in[4]; + t3 = in[0] - t8; + t8 += in[0]; + out[0].real = t8; + out[4].real = t3; + out[6].real = t3; + t7 = in[5]; + t3 = in[1] - t7; + t7 += in[1]; + out[1].real = t7; + t8 = in[7]; + t5 = in[3] - t8; + t8 += in[3]; + out[3].real = t8; + t2 = -t5; + t6 = t3 - t5; + t8 = kSqrtHalf; + t6 *= t8; + out[5].real = out[4].real - t6; + t1 = t3 + t5; + t1 *= t8; + out[5].imag = out[4].imag - t1; + t6 += out[4].real; + out[4].real = t6; + t1 += out[4].imag; + out[4].imag = t1; + t5 = t2 - t3; + t5 *= t8; + out[7].imag = out[6].imag - t5; + t2 += t3; + t2 *= t8; + out[7].real = out[6].real - t2; + t2 += out[6].real; + out[6].real = t2; + t5 += out[6].imag; + out[6].imag = t5; + t5 = out[2].real; + t1 = out[0].real - t5; + t7 = out[3].real; + t5 += out[0].real; + t3 = out[1].real - t7; + t7 += out[1].real; + t8 = t5 + t7; + out[0].real = t8; + t5 -= t7; + out[1].real = t5; + out[2].imag = t3; + out[3].imag = -t3; + out[3].real = t1; + out[2].real = t1; + out[0].imag = 0; + out[1].imag = 0; + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. + ComplexOpt tmp = out[2]; + out[2] = out[3]; + out[3] = out[5]; + out[5] = out[7]; + out[7] = out[4]; + out[4] = out[1]; + out[1] = out[6]; + out[6] = tmp; +} + +// Fills in block[kBlockEdgeHalf..(kBlockHalf+kBlockEdgeHalf)], and leaves the +// rest unmodified. 
+void ButteraugliFFTSquaredOpt(float block[kBlockSize]) { + float global_mul = 0.000064; + ComplexOpt block_c[kBlockSize]; + assert(kBlockEdge == 8); + for (int y = 0; y < kBlockEdge; ++y) { + RealFFT8Opt(block + y * kBlockEdge, block_c + y * kBlockEdge); + } + TransposeBlock(block_c); + float r0[kBlockEdge]; + float r1[kBlockEdge]; + for (int x = 0; x < kBlockEdge; ++x) { + r0[x] = block_c[x].real; + r1[x] = block_c[kBlockHalf + x].real; + } + RealFFT8Opt(r0, block_c); + RealFFT8Opt(r1, block_c + kBlockHalf); + for (int y = 1; y < kBlockEdgeHalf; ++y) { + FFT8OptOpt(block_c + y * kBlockEdge); + } + for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + block[i] = abssq(block_c[i]); + block[i] *= global_mul; + } +} + +// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared +// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average +// diff on the edges to diff_xyb_edge_dc. +void ButteraugliBlockDiffOpt(float xyb0[3 * kBlockSize], + float xyb1[3 * kBlockSize], + float diff_xyb_dc[3], + float diff_xyb_ac[3], + float diff_xyb_edge_dc[3]) { + PROFILER_FUNC; + const float *csf8x8 = GetContrastSensitivityMatrixOpt(); + + float avgdiff_xyb[3] = { 0.0 }; + float avgdiff_edge[3][4] = { { 0.0 } }; + for (int i = 0; i < 3 * kBlockSize; ++i) { + const float diff_xyb = xyb0[i] - xyb1[i]; + const int c = i / kBlockSize; + avgdiff_xyb[c] += diff_xyb / kBlockSize; + const int k = i % kBlockSize; + const int kx = k % kBlockEdge; + const int ky = k / kBlockEdge; + const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1; + const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 
2 : -1; + if (h_edge_idx >= 0) { + avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge; + } + if (v_edge_idx >= 0) { + avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge; + } + } + XybDiffLowFreqSquaredAccumulateOpt(avgdiff_xyb[0], + avgdiff_xyb[1], + avgdiff_xyb[2], + 0, 0, 0, csf8x8[0], + diff_xyb_dc); + for (int i = 0; i < 4; ++i) { + XybDiffLowFreqSquaredAccumulateOpt(avgdiff_edge[0][i], + avgdiff_edge[1][i], + avgdiff_edge[2][i], + 0, 0, 0, csf8x8[0], + diff_xyb_edge_dc); + } + + float* xyb_avg = xyb0; + float* xyb_halfdiff = xyb1; + for (int i = 0; i < 3 * kBlockSize; ++i) { + float avg = (xyb0[i] + xyb1[i]) / 2; + float halfdiff = (xyb0[i] - xyb1[i]) / 2; + xyb_avg[i] = avg; + xyb_halfdiff[i] = halfdiff; + } + float *y_avg = &xyb_avg[kBlockSize]; + float *x_halfdiff_squared = &xyb_halfdiff[0]; + float *y_halfdiff = &xyb_halfdiff[kBlockSize]; + float *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize]; + ButteraugliFFTSquaredOpt(y_avg); + ButteraugliFFTSquaredOpt(x_halfdiff_squared); + ButteraugliFFTSquaredOpt(y_halfdiff); + ButteraugliFFTSquaredOpt(z_halfdiff_squared); + + static const float xmul = 64.8; + static const float ymul = 1.753123908348329; + static const float ymul2 = 1.51983458269; + static const float zmul = 2.4; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + float d = csf8x8[i]; + diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i]; + diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i]; + + y_avg[i] = sqrt(y_avg[i]); + y_halfdiff[i] = sqrt(y_halfdiff[i]); + float y0 = y_avg[i] - y_halfdiff[i]; + float y1 = y_avg[i] + y_halfdiff[i]; + // Remove the impact of small absolute values. + // This improves the behavior with flat noise. 
+ static const float ylimit = 0.04; + y0 = RemoveRangeAroundZeroOpt(y0, ylimit); + y1 = RemoveRangeAroundZeroOpt(y1, ylimit); + if (y0 != y1) { + float valy0 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y0 * ymul2); + float valy1 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y1 * ymul2); + float valy = ymul * (valy0 - valy1); + diff_xyb_ac[1] += d * valy * valy; + } + } +} + +// Low frequency edge detectors. +// Two edge detectors are applied in each corner of the 8x8 square. +// The squared 3-dimensional error vector is added to diff_xyb. +void Butteraugli8x8CornerEdgeDetectorDiffOpt( + const size_t pos_x, + const size_t pos_y, + const size_t xsize, + const size_t ysize, + const std::vector > &blurred0, + const std::vector > &blurred1, + float diff_xyb[3]) { + PROFILER_FUNC; + int local_count = 0; + float local_xyb[3] = { 0 }; + static const float w = 0.711100840192; + for (int k = 0; k < 4; ++k) { + size_t step = 3; + size_t offset[4][2] = { { 0, 0 },{ 0, 7 },{ 7, 0 },{ 7, 7 } }; + size_t x = pos_x + offset[k][0]; + size_t y = pos_y + offset[k][1]; + if (x >= step && x + step < xsize) { + size_t ix = y * xsize + (x - step); + size_t ix2 = ix + 2 * step; + XybDiffLowFreqSquaredAccumulateOpt( + w * (blurred0[0][ix] - blurred0[0][ix2]), + w * (blurred0[1][ix] - blurred0[1][ix2]), + w * (blurred0[2][ix] - blurred0[2][ix2]), + w * (blurred1[0][ix] - blurred1[0][ix2]), + w * (blurred1[1][ix] - blurred1[1][ix2]), + w * (blurred1[2][ix] - blurred1[2][ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= step && y + step < ysize) { + size_t ix = (y - step) * xsize + x; + size_t ix2 = ix + 2 * step * xsize; + XybDiffLowFreqSquaredAccumulateOpt( + w * (blurred0[0][ix] - blurred0[0][ix2]), + w * (blurred0[1][ix] - blurred0[1][ix2]), + w * (blurred0[2][ix] - blurred0[2][ix2]), + w * (blurred1[0][ix] - blurred1[0][ix2]), + w * (blurred1[1][ix] - blurred1[1][ix2]), + w * (blurred1[2][ix] - blurred1[2][ix2]), + 1.0, local_xyb); + ++local_count; + } + } + static 
const float weight = 0.01617112696; + const float mul = weight * 8.0 / local_count; + for (int i = 0; i < 3; ++i) { + diff_xyb[i] += mul * local_xyb[i]; + } +} + +// https://en.wikipedia.org/wiki/Photopsin absordance modeling. +const float *GetOpsinAbsorbanceOpt() { + static const float kMix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, + }; + return &kMix[0]; +} + +void OpsinAbsorbanceOpt(const float in[3], float out[3]) { + const float *mix = GetOpsinAbsorbanceOpt(); + out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3]; + out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7]; + out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11]; +} + +float GammaMinArgOpt() { + float in[3] = { 0.0, 0.0, 0.0 }; + float out[3]; + OpsinAbsorbanceOpt(in, out); + return std::min(out[0], std::min(out[1], out[2])); +} + +float GammaMaxArgOpt() { + float in[3] = { 255.0, 255.0, 255.0 }; + float out[3]; + OpsinAbsorbanceOpt(in, out); + return std::max(out[0], std::max(out[1], out[2])); +} + +void MaskHighIntensityChangeOpt( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1) { + PROFILER_FUNC; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t ix = y * xsize + x; + const float ave[3] = { + static_cast((c0[0][ix] + c1[0][ix]) * 0.5), + static_cast((c0[1][ix] + c1[1][ix]) * 0.5), + static_cast((c0[2][ix] + c1[2][ix]) * 0.5), + }; + float sqr_max_diff = -1; + { + int offset[4] = + { -1, 1, -static_cast(xsize), static_cast(xsize) }; + int border[4] = + { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + float diff = 0.5 * (c0[1][ix2] + c1[1][ix2]) 
- ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + static const float kReductionX = 275.19165240059317; + static const float kReductionY = 18599.41286306991; + static const float kReductionZ = 410.8995306951065; + static const float kChromaBalance = 106.95800948271017; + float chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const float mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + for (int i = 0; i < 3; ++i) { + xyb0[i][ix] = static_cast(mix[i] * c0[i][ix] + (1 - mix[i]) * ave[i]); + xyb1[i][ix] = static_cast(mix[i] * c1[i][ix] + (1 - mix[i]) * ave[i]); + } + } + } +} + +float SimpleGammaOpt(float v) { + static const float kGamma = 0.387494322593; + static const float limit = 43.01745241042018; + float bright = v - limit; + if (bright >= 0) { + static const float mul = 0.0383723643799; + v -= bright * mul; + } + static const float limit2 = 94.68634353321337; + float bright2 = v - limit2; + if (bright2 >= 0) { + static const float mul = 0.22885405968; + v -= bright2 * mul; + } + static const float offset = 0.156775786057; + static const float scale = 8.898059160493739; + float retval = scale * (offset + pow(v, kGamma)); + return retval; +} + +// Polynomial evaluation via Clenshaw's scheme (similar to Horner's). +// Template enables compile-time unrolling of the recursion, but must reside +// outside of a class due to the specialization. 
+template +static inline void ClenshawRecursionOpt(const float x, const float *coefficients, + float *b1, float *b2) { + const float x_b1 = x * (*b1); + const float t = (x_b1 + x_b1) - (*b2) + coefficients[INDEX]; + *b2 = *b1; + *b1 = t; + + ClenshawRecursionOpt(x, coefficients, b1, b2); +} + +// Base case +template <> +inline void ClenshawRecursionOpt<0>(const float x, const float *coefficients, + float *b1, float *b2) { + const float x_b1 = x * (*b1); + // The final iteration differs - no 2 * x_b1 here. + *b1 = x_b1 - (*b2) + coefficients[0]; +} + +// Rational polynomial := dividing two polynomial evaluations. These are easier +// to find than minimax polynomials. +struct RationalPolynomialOpt { + template + static float EvaluatePolynomial(const float x, + const float(&coefficients)[N]) { + float b1 = 0.0; + float b2 = 0.0; + ClenshawRecursionOpt(x, coefficients, &b1, &b2); + return b1; + } + + // Evaluates the polynomial at x (in [min_value, max_value]). + inline float operator()(const float x) const { + // First normalize to [0, 1]. + const float x01 = (x - min_value) / (max_value - min_value); + // And then to [-1, 1] domain of Chebyshev polynomials. + const float xc = 2.0 * x01 - 1.0; + + const float yp = EvaluatePolynomial(xc, p); + const float yq = EvaluatePolynomial(xc, q); + if (yq == 0.0) return 0.0; + return static_cast(yp / yq); + } + + // Domain of the polynomials; they are undefined elsewhere. + float min_value; + float max_value; + + // Coefficients of T_n (Chebyshev polynomials of the first kind). + // Degree 5/5 is a compromise between accuracy (0.1%) and numerical stability. + float p[5 + 1]; + float q[5 + 1]; +}; + +static inline float GammaPolynomialOpt(float value) { + // Generated by gamma_polynomial.m from equispaced x/gamma(x) samples. 
+ static const RationalPolynomialOpt r = { + 0.770000000000000, 274.579999999999984, + { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, + }, + { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 4.711532733641639, 0.899112889751053, 0.035662329617191, + } }; + return static_cast(r(value)); +} + +static inline float GammaOpt(float v) { + // return SimpleGamma(v); + return GammaPolynomialOpt(static_cast(v)); +} + +void OpsinDynamicsImageOpt(size_t xsize, size_t ysize, + std::vector > &rgb) { + PROFILER_FUNC; + std::vector > blurred = rgb; + static const float kSigma = 1.1; + for (int i = 0; i < 3; ++i) { + BlurOpt(xsize, ysize, blurred[i].data(), kSigma, 0.0); + } + for (size_t i = 0; i < rgb[0].size(); ++i) { + float sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. + float pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] }; + float pre_mixed[3]; + OpsinAbsorbanceOpt(pre_rgb, pre_mixed); + sensitivity[0] = GammaOpt(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = GammaOpt(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = GammaOpt(pre_mixed[2]) / pre_mixed[2]; + } + float cur_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] }; + float cur_mixed[3]; + OpsinAbsorbanceOpt(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + float x, y, z; + RgbToXybOpt(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + rgb[0][i] = static_cast(x); + rgb[1][i] = static_cast(y); + rgb[2][i] = static_cast(z); + } +} + +void ScaleImageOpt(float scale, std::vector *result) { + PROFILER_FUNC; + for (size_t i = 0; i < result->size(); ++i) { + (*result)[i] *= static_cast(scale); + } +} + +// Making a cluster of local errors to be more impactful than +// just a single error. 
+void CalculateDiffmapOpt(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap) { + PROFILER_FUNC; + // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5 + // pixels distance over the original image. The border of 2 pixels on top and + // left side and 3 pixels on right and bottom side are zeroed, but these + // values have no meaning, they only exist to keep the result map the same + // size as the input images. + int s2 = (8 - step) / 2; + { + // Upsample and take square root. + std::vector diffmap_out(xsize * ysize); + const size_t res_xsize = (xsize + step - 1) / step; + for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) { + for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) { + size_t res_ix = (res_y * res_xsize + res_x) / step; + float orig_val = (*diffmap)[res_ix]; + constexpr float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + float val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : std::sqrt(orig_val); + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(res_y + off_y + s2) * xsize + + res_x + off_x + s2] = val; + } + } + } + } + *diffmap = diffmap_out; + } + { + static const float kSigma = 8.8510880283; + static const float mul1 = 24.8235314874; + static const float scale = 1.0 / (1.0 + mul1); + const int s = 8 - step; + std::vector blurred((xsize - s) * (ysize - s)); + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2]; + } + } + static const float border_ratio = 0.03027655136; + BlurOpt(xsize - s, ysize - s, blurred.data(), kSigma, border_ratio); + for (size_t y = 0; y < ysize - s; ++y) { + for (size_t x = 0; x < xsize - s; ++x) { + (*diffmap)[(y + s2) * xsize + x + s2] + += static_cast(mul1) * blurred[y * (xsize - s) + x]; + } + } + ScaleImageOpt(scale, diffmap); + } +} + +static std::array MakeMaskOpt( + float extmul, float extoff, + float mul, float offset, + float scaler) { + std::array lut; + for (size_t i = 0; i < lut.size(); ++i) { + const float c = mul / ((0.01 * scaler * i) + offset); + lut[i] = 1.0 + extmul * (c + extoff); + assert(lut[i] >= 0.0); + lut[i] *= lut[i]; + } + return lut; +} + +float MaskXOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.975741017749; + static const float extoff = -4.25328244168; + static const float offset = 0.454909521427; + static const float scaler = 0.0738288224836; + static const float mul = 20.8029176447; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskYOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.373995618954; + static const float extoff = 1.5307267433; + static const float offset = 0.911952641929; + static const float scaler = 1.1731667845; 
+ static const float mul = 16.2447033988; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskBOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.61582234137; + static const float extoff = -4.25376118646; + static const float offset = 1.05105070921; + static const float scaler = 0.47434643535; + static const float mul = 31.1444967089; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcXOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 1.79116943438; + static const float extoff = -3.86797479189; + static const float offset = 0.670960225853; + static const float scaler = 0.486575865525; + static const float mul = 20.4563479139; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcYOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.212223514236; + static const float extoff = -3.65647120524; + static const float offset = 1.73396799447; + static const float scaler = 0.170392660501; + static const float mul = 21.6566724788; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +float MaskDcBOpt(float delta) { + PROFILER_FUNC; + static const float extmul = 0.349376011816; + static const float extoff = -0.894711072781; + static const float offset = 0.901647926679; + static const float scaler = 0.380086095024; + static const float mul = 18.0373825149; + static const std::array lut = + MakeMaskOpt(extmul, extoff, mul, offset, scaler); + return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta); +} + +// Replaces values[x + y * xsize] with the minimum of the values in 
the +// square_size square with coordinates +// x - offset .. x + square_size - offset - 1, +// y - offset .. y + square_size - offset - 1. +void MinSquareValOpt(size_t square_size, size_t offset, + size_t xsize, size_t ysize, + float *values) { + PROFILER_FUNC; + // offset is not negative and smaller than square_size. + assert(offset < square_size); + std::vector tmp(xsize * ysize); + for (size_t y = 0; y < ysize; ++y) { + const size_t minh = offset > y ? 0 : y - offset; + const size_t maxh = std::min(ysize, y + square_size - offset); + for (size_t x = 0; x < xsize; ++x) { + float min = values[x + minh * xsize]; + for (size_t j = minh + 1; j < maxh; ++j) { + float tmpf = values[x + j * xsize]; + if (tmpf < min) min = tmpf; + } + tmp[x + y * xsize] = static_cast(min); + } + } + for (size_t x = 0; x < xsize; ++x) { + const size_t minw = offset > x ? 0 : x - offset; + const size_t maxw = std::min(xsize, x + square_size - offset); + for (size_t y = 0; y < ysize; ++y) { + float min = tmp[minw + y * xsize]; + for (size_t j = minw + 1; j < maxw; ++j) { + float tmpf = tmp[j + y * xsize]; + if (tmpf < min) min = tmpf; + } + values[x + y * xsize] = static_cast(min); + } + } +} + +void Average5x5Opt(int xsize, int ysize, std::vector* diffs) { + PROFILER_FUNC; + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + static const float w = 0.679144890667f; + static const float scale = 1.0f / (5.0f + 4 * w); + std::vector result = *diffs; + std::vector tmp0 = *diffs; + std::vector tmp1 = *diffs; + ScaleImage(w, &tmp1); + for (int y = 0; y < ysize; y++) { + const int row0 = y * xsize; + result[row0 + 1] += tmp0[row0]; + result[row0 + 0] += tmp0[row0 + 1]; + result[row0 + 2] += tmp0[row0 + 1]; + for (int x = 2; x < xsize - 2; ++x) { + result[row0 + x - 1] += tmp0[row0 + x]; + result[row0 + x + 1] += tmp0[row0 + x]; + } + result[row0 + xsize - 3] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 1] += tmp0[row0 + xsize - 2]; + result[row0 + xsize - 2] += tmp0[row0 + xsize - 1]; + if (y > 0) { + const int rowd1 = row0 - xsize; + result[rowd1 + 1] += tmp1[row0]; + result[rowd1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowd1 + x + 1] += tmp1[row0 + x]; + result[rowd1 + x + 0] += tmp0[row0 + x]; + result[rowd1 + x - 1] += tmp1[row0 + x]; + } + result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + result[rowu1 + 1] += tmp1[row0]; + result[rowu1 + 0] += tmp0[row0]; + for (int x = 1; x < xsize - 1; ++x) { + result[rowu1 + x + 1] += tmp1[row0 + x]; + result[rowu1 + x + 0] += tmp0[row0 + x]; + result[rowu1 + x - 1] += tmp1[row0 + x]; + } + result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1]; + result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1]; + } + } + *diffs = result; + ScaleImageOpt(scale, diffs); +} + +void DiffPrecomputeOpt( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask) { + PROFILER_FUNC; + mask->resize(3, std::vector(xyb0[0].size())); + float valsh0[3] = { 0.0 }; + float valsv0[3] = { 0.0 }; + float valsh1[3] = { 0.0 }; + float valsv1[3] = { 0.0 }; + int ix2; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t ix = x + xsize * 
y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + float x0 = (xyb0[0][ix] - xyb0[0][ix2]); + float y0 = (xyb0[1][ix] - xyb0[1][ix2]); + float z0 = (xyb0[2][ix] - xyb0[2][ix2]); + XybToValsOpt(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + float x1 = (xyb1[0][ix] - xyb1[0][ix2]); + float y1 = (xyb1[1][ix] - xyb1[1][ix2]); + float z1 = (xyb1[2][ix] - xyb1[2][ix2]); + XybToValsOpt(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + float x0 = (xyb0[0][ix] - xyb0[0][ix2]); + float y0 = (xyb0[1][ix] - xyb0[1][ix2]); + float z0 = (xyb0[2][ix] - xyb0[2][ix2]); + XybToValsOpt(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + float x1 = (xyb1[0][ix] - xyb1[0][ix2]); + float y1 = (xyb1[1][ix] - xyb1[1][ix2]); + float z1 = (xyb1[2][ix] - xyb1[2][ix2]); + XybToValsOpt(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } + for (int i = 0; i < 3; ++i) { + float sup0 = fabs(valsh0[i]) + fabs(valsv0[i]); + float sup1 = fabs(valsh1[i]) + fabs(valsv1[i]); + float m = std::min(sup0, sup1); + (*mask)[i][ix] = static_cast(m); + } + } + } +} + +void MaskOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc) { + PROFILER_FUNC; + mask->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask)[i].resize(xsize * ysize); + } + DiffPrecomputeOpt(xyb0, xyb1, xsize, ysize, mask); + for (int i = 0; i < 3; ++i) { + _Average5x5(xsize, ysize, &(*mask)[i]); + MinSquareValOpt(4, 0, xsize, ysize, (*mask)[i].data()); + static const float sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + BlurOpt(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); + } + static const float w00 = 232.206464018; + static const float w11 = 22.9455222245; + static const float w22 = 503.962310606; + + mask_dc->resize(3); + for (int i = 0; i < 3; ++i) { + (*mask_dc)[i].resize(xsize * ysize); + } + for (size_t y 
= 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + const size_t idx = y * xsize + x; + const float s0 = (*mask)[0][idx]; + const float s1 = (*mask)[1][idx]; + const float s2 = (*mask)[2][idx]; + const float p0 = w00 * s0; + const float p1 = w11 * s1; + const float p2 = w22 * s2; + + (*mask)[0][idx] = static_cast(MaskXOpt(p0)); + (*mask)[1][idx] = static_cast(MaskYOpt(p1)); + (*mask)[2][idx] = static_cast(MaskBOpt(p2)); + (*mask_dc)[0][idx] = static_cast(MaskDcXOpt(p0)); + (*mask_dc)[1][idx] = static_cast(MaskDcYOpt(p1)); + (*mask_dc)[2][idx] = static_cast(MaskDcBOpt(p2)); + } + } + for (int i = 0; i < 3; ++i) { + ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask)[i]); + ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); + } +} + +} + +namespace butteraugli +{ + clButteraugliComparator::clButteraugliComparator(size_t xsize, size_t ysize, int step) + : ButteraugliComparator(xsize, ysize, step) + { + + } + + void clButteraugliComparator::DiffmapOpsinDynamicsImage( + std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result) + { + if (MODE_CPU_OPT == g_mathMode) + { + DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize_ > 100 && ysize_ > 100) + { + result.resize(xsize_ * ysize_); + cuDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_); + } +#endif + else + { + ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result); + } + } + + + void clButteraugliComparator::BlockDiffMap(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* 
block_diff_dc, + std::vector* block_diff_ac) + { + ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac); +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*block_diff_dc).data(), (*block_diff_ac).data()); + } +#endif + } + + void clButteraugliComparator::EdgeDetectorMap(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* edge_detector_map) + { + ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map); +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + (*edge_detector_map).data()); + } +#endif + } + + void clButteraugliComparator::EdgeDetectorLowFreq(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_ac) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + std::vector orign_ac = *block_diff_ac; + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); + tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize_, ysize_, step_, + orign_ac.data(), (*block_diff_ac).data()); + } + else +#endif + { + ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac); + } + } + + void clButteraugliComparator::CombineChannels(const std::vector >& mask_xyb, + const std::vector >& mask_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8) + { + std::vector temp = *result; + temp.resize(res_xsize_ * res_ysize_); + 
ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); + tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(), + mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(), + block_diff_dc.data(), + block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]); + } + else +#endif + { + ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result); + } + } + + void clButteraugliComparator::DiffmapOpsinDynamicsImageOpt( + std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result) + { + if (xsize_ < 8 || ysize_ < 8) return; + { + auto xyb0_c = xyb0; + auto xyb1_c = xyb1; + MaskHighIntensityChangeOpt(xsize_, ysize_, xyb0_c, xyb1_c, xyb0, xyb1); + } + assert(8 <= xsize_); + for (int i = 0; i < 3; i++) { + assert(xyb0[i].size() == num_pixels_); + assert(xyb1[i].size() == num_pixels_); + } + std::vector edge_detector_map(3 * res_xsize_ * res_ysize_); + EdgeDetectorMapOpt(xyb0, xyb1, &edge_detector_map); + std::vector block_diff_dc(3 * res_xsize_ * res_ysize_); + std::vector block_diff_ac(3 * res_xsize_ * res_ysize_); + BlockDiffMapOpt(xyb0, xyb1, &block_diff_dc, &block_diff_ac); + EdgeDetectorLowFreqOpt(xyb0, xyb1, &block_diff_ac); + { + std::vector > mask_xyb(3); + std::vector > mask_xyb_dc(3); + MaskOpt(xyb0, xyb1, xsize_, ysize_, &mask_xyb, &mask_xyb_dc); + CombineChannelsOpt(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, + edge_detector_map, &result); + } + CalculateDiffmapOpt(xsize_, ysize_, step_, &result); + } + + void clButteraugliComparator::BlockDiffMapOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac) + { + for (size_t res_y = 0; res_y + (kBlockEdge - step_ - 1) < ysize_; + res_y += step_) { + for (size_t res_x = 0; res_x + (kBlockEdge - step_ - 1) < xsize_; + res_x += 
step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + size_t offset = (std::min(res_y, ysize_ - 8) * xsize_ + + std::min(res_x, xsize_ - 8)); + float block0[3 * kBlockEdge * kBlockEdge]; + float block1[3 * kBlockEdge * kBlockEdge]; + for (int i = 0; i < 3; ++i) { + float *m0 = &block0[i * kBlockEdge * kBlockEdge]; + float *m1 = &block1[i * kBlockEdge * kBlockEdge]; + for (size_t y = 0; y < kBlockEdge; y++) { + for (size_t x = 0; x < kBlockEdge; x++) { + m0[kBlockEdge * y + x] = xyb0[i][offset + y * xsize_ + x]; + m1[kBlockEdge * y + x] = xyb1[i][offset + y * xsize_ + x]; + } + } + } + float diff_xyb_dc[3] = { 0.0 }; + float diff_xyb_ac[3] = { 0.0 }; + float diff_xyb_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiffOpt(block0, block1, + diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + for (int i = 0; i < 3; ++i) { + (*block_diff_dc)[3 * res_ix + i] = static_cast(diff_xyb_dc[i]); + (*block_diff_ac)[3 * res_ix + i] = static_cast(diff_xyb_ac[i]); + } + } + } + } + + void clButteraugliComparator::EdgeDetectorMapOpt(const std::vector > &xyb0, + const std::vector > &xyb1, + std::vector* edge_detector_map) + { + static const float kSigma[3] = { + 1.5, + 0.586, + 0.4, + }; + std::vector > blurred0(xyb0); + std::vector > blurred1(xyb1); + for (int i = 0; i < 3; i++) { + BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma[i], 0.0); + BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma[i], 0.0); + } + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + float diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiffOpt(std::min(res_x, xsize_ - 8), + std::min(res_y, ysize_ - 8), + xsize_, ysize_, + blurred0, blurred1, + diff_xyb); + for (int i = 0; i < 3; ++i) { + (*edge_detector_map)[3 * res_ix + i] = static_cast(diff_xyb[i]); + } + } + } + } + + void clButteraugliComparator::EdgeDetectorLowFreqOpt(const std::vector > 
&xyb0, + const std::vector > &xyb1, + std::vector* block_diff_ac) + { + static const float kSigma = 14; + static const float kMul = 10; + std::vector > blurred0(xyb0); + std::vector > blurred1(xyb1); + for (int i = 0; i < 3; i++) { + BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma, 0.0); + BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma, 0.0); + } + const int step = 8; + for (size_t y = 0; y + step < ysize_; y += step_) { + int resy = y / step_; + int resx = step / step_; + for (size_t x = 0; x + step < xsize_; x += step_, resx++) { + const int ix = y * xsize_ + x; + const int res_ix = resy * res_xsize_ + resx; + float diff[4][3]; + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize_; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize_ + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize_ - 6; + diff[3][i] = x < step ? 
0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + float max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + float diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulateOpt(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = std::max(max_diff_xyb[i], diff_xyb[i]); + } + } + for (int i = 0; i < 3; ++i) { + (*block_diff_ac)[3 * res_ix + i] += static_cast(kMul * max_diff_xyb[i]); + } + } + } + } + + void clButteraugliComparator::CombineChannelsOpt(const std::vector >& mask_xyb, + const std::vector >& mask_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result) + { + result->resize(res_xsize_ * res_ysize_); + for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) { + for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) { + size_t res_ix = (res_y * res_xsize_ + res_x) / step_; + float mask[3]; + float dc_mask[3]; + for (int i = 0; i < 3; ++i) { + mask[i] = mask_xyb[i][(res_y + 3) * xsize_ + (res_x + 3)]; + dc_mask[i] = mask_xyb_dc[i][(res_y + 3) * xsize_ + (res_x + 3)]; + } + (*result)[res_ix] = static_cast( + DotProductOpt(&block_diff_dc[3 * res_ix], dc_mask) + + DotProductOpt(&block_diff_ac[3 * res_ix], mask) + + DotProductOpt(&edge_detector_map[3 * res_ix], mask)); + } + } + } + + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector img; + img.resize(xsize * ysize); + memcpy(img.data(), values, xsize * ysize * sizeof(float)); + _MinSquareVal(square_size, offset, xsize, ysize, values); + tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values); + } + else +#endif + { + _MinSquareVal(square_size, offset, xsize, ysize, values); + } + } + + void Average5x5(int xsize, int ysize, 
std::vector* diffs) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector diffs_org = *diffs; + _Average5x5(xsize, ysize, diffs); + tclAverage5x5(xsize, ysize, diffs_org, *diffs); + } + else +#endif + { + _Average5x5(xsize, ysize, diffs); + } + } + + void DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask) + { + _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + } +#endif + } + + void Mask(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc) + { + if (MODE_CPU_OPT == g_mathMode) + { + MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) + { + mask->resize(3); + mask_dc->resize(3); + for (int i = 0; i < 3; i++) + { + (*mask)[i].resize(xsize * ysize); + (*mask_dc)[i].resize(xsize * ysize); + } + clMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data() + ); + } + else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); + tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), + xsize, ysize, + (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data()); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) + { + mask->resize(3); + mask_dc->resize(3); + for (int i = 0; i < 3; i++) + { + (*mask)[i].resize(xsize * ysize); + 
(*mask_dc)[i].resize(xsize * ysize); + } + cuMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(), + (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data() + ); + } +#endif + else + { + _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc); + } + } + + void CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector diffmap_org = *diffmap; + _CalculateDiffmap(xsize, ysize, step, diffmap); + tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data()); + } + else +#endif + { + _CalculateDiffmap(xsize, ysize, step, diffmap); + } + } + + void MaskHighIntensityChange( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); + tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(), + c1[0].data(), c1[1].data(), c1[2].data(), + xsize, ysize, + xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), + xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + } + else +#endif + if (MODE_CPU_OPT == g_mathMode) + { + MaskHighIntensityChangeOpt(xsize, ysize, c0, c1, xyb0, xyb1); + } + else + { + _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1); + } + } + + void ScaleImage(double scale, std::vector *result) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && result->size() > 64) + { + std::vector result_org = *result; + _ScaleImage(scale, result); + tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size()); + } + else +#endif + { + _ScaleImage(scale, result); + } + } + + void Convolution(size_t xsize, size_t 
ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result) + { +#ifdef __USE_OPENCL__ + _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result); + } +#endif + } + + void Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio) + { +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) + { + std::vector orignChannel; + orignChannel.resize(xsize * ysize); + memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float)); + _Blur(xsize, ysize, channel, sigma, border_ratio); + tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel); + } + else +#endif + { + _Blur(xsize, ysize, channel, sigma, border_ratio); + } + } + + void OpsinDynamicsImage(size_t xsize, size_t ysize, + std::vector > &rgb) + { + if (MODE_CPU_OPT == g_mathMode) + { + OpsinDynamicsImageOpt(xsize, ysize, rgb); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + clOpsinDynamicsImage(r, g, b, xsize, ysize); + } + else if (MODE_CHECKCL == g_mathMode && xsize > 8 & ysize > 8) + { + std::vector< std::vector> orig_rgb = rgb; + _OpsinDynamicsImage(xsize, ysize, rgb); + tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), + xsize, ysize, + rgb[0].data(), rgb[1].data(), rgb[2].data()); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100) + { + float * r = rgb[0].data(); + float * g = rgb[1].data(); + float * b = rgb[2].data(); + + cuOpsinDynamicsImage(r, g, b, xsize, ysize); + } +#endif + else + { + _OpsinDynamicsImage(xsize, 
ysize, rgb); + } + } +} \ No newline at end of file diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h new file mode 100644 index 00000000..76380785 --- /dev/null +++ b/clguetzli/clbutter_comparator.h @@ -0,0 +1,116 @@ +/* +* OpenCL/CUDA edition implementation of butter_comparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include +#include "butteraugli/butteraugli.h" + +#define __restrict__ + +namespace butteraugli { + + class clButteraugliComparator : public ButteraugliComparator + { + public: + clButteraugliComparator(size_t xsize, size_t ysize, int step); + + virtual void DiffmapOpsinDynamicsImage(std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result); + + virtual void DiffmapOpsinDynamicsImageOpt(std::vector> &xyb0, + std::vector> &xyb1, + std::vector &result); + + virtual void BlockDiffMap(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac); + + virtual void BlockDiffMapOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_dc, + std::vector* block_diff_ac); + + virtual void EdgeDetectorMap(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* edge_detector_map); + + virtual void EdgeDetectorMapOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* edge_detector_map); + + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_ac); + + virtual void EdgeDetectorLowFreqOpt(const std::vector > &rgb0, + const std::vector > &rgb1, + std::vector* block_diff_ac); + + virtual void CombineChannels(const std::vector >& scale_xyb, + const std::vector >& scale_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result); + + virtual void CombineChannelsOpt(const std::vector >& scale_xyb, + const 
std::vector >& scale_xyb_dc, + const std::vector& block_diff_dc, + const std::vector& block_diff_ac, + const std::vector& edge_detector_map, + std::vector* result); + }; + + void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + void _Average5x5(int xsize, int ysize, std::vector* diffs); + void _DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask); + void _Mask(const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + std::vector > *mask, + std::vector > *mask_dc); + void _CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap); + void _OpsinDynamicsImage(size_t xsize, size_t ysize, + std::vector > &rgb); + void _MaskHighIntensityChange( + size_t xsize, size_t ysize, + const std::vector > &c0, + const std::vector > &c1, + std::vector > &xyb0, + std::vector > &xyb1); + void _ScaleImage(double scale, std::vector *result); + void _Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + double border_ratio, + float* __restrict__ result); + void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + + void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values); + void Average5x5(int xsize, int ysize, std::vector* diffs); + void DiffPrecompute(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask); + void ScaleImage(double scale, std::vector *result); + void Convolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* __restrict__ multipliers, + const float* __restrict__ inp, + float border_ratio, + float* __restrict__ result); + void Blur(size_t xsize, size_t ysize, float* channel, double sigma, + double border_ratio); + void 
CalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + std::vector* diffmap); +} \ No newline at end of file diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl new file mode 100644 index 00000000..2a8eb527 --- /dev/null +++ b/clguetzli/clguetzli.cl @@ -0,0 +1,3420 @@ +/* +* OpenCL Kernels +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#ifdef __USE_OPENCL__ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#include "clguetzli/clguetzli.cl.h" + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +#define kBlockEdge 8 +#define kBlockSize (kBlockEdge * kBlockEdge) +#define kDCTBlockSize (kBlockEdge * kBlockEdge) +#define kBlockEdgeHalf (kBlockEdge / 2) +#define kBlockHalf (kBlockEdge * kBlockEdgeHalf) +#define kComputeBlockSize (kBlockSize * 3) + +// IntFloatPair: opencl version of output_order/input_order +typedef struct __IntFloatPair +{ + int idx; + float err; +}IntFloatPair, DCTScoreData, CoeffData; + +typedef struct __IntFloatPairList +{ + int size; + IntFloatPair *pData; +}IntFloatPairList; + +__device__ void XybToVals(double x, double y, double z, double *valx, double *valy, double *valz); +__device__ double InterpolateClampNegative(__global const double *array, int size, double sx); +__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]); +__device__ double DotProduct(__global const float u[3], const double v[3]); +__device__ void OpsinAbsorbance(const double in[3], double out[3]); +__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz); +__device__ double Gamma(double v); +__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], + __private double xyb1[3 * kBlockSize], + double diff_xyb_dc[3], + double diff_xyb_ac[3], + double diff_xyb_edge_dc[3]); +__device__ void Butteraugli8x8CornerEdgeDetectorDiff( + int pos_x, + int 
pos_y, + int xsize, + int ysize, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + double* diff_xyb); + +__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order); + +__device__ double Factor2(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height); + +__device__ double CompareBlockFactor1(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height); + +__device__ double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor); + +__device__ void floatcopy(float *dst, const float *src, int size); +__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size); +__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size); +__device__ int list_erase(IntFloatPairList* list, int idx); +__device__ int list_push_back(IntFloatPairList* list, int i, float f); + +__kernel void clConvolutionEx( + __global float* result, + __global const float* inp, const int xsize, + __global const float* multipliers, const int len, + const int xstep, const int offset, const float border_ratio) +{ + const int ox = get_global_id(0); + const int y = get_global_id(1); + + const int oxsize = get_global_size(0); + const int ysize = get_global_size(1); + + const int x = ox * xstep; + + float weight_no_border = 0; + 
for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset); + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[ox * ysize + y] = sum * scale; +} + +__kernel void clConvolutionXEx( + __global float* result, + const int xsize, const int ysize, + __global const float* inp, + __global const float* multipliers, const int len, + const int step, const int offset, const float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + + if (x % step != 0) return; + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int minx = x < offset ? 
0 : x - offset; + int maxx = min(xsize, x + len - offset); + + float weight = 0.0; + for (int j = minx; j < maxx; j++) + { + weight += multipliers[j - x + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = minx; j < maxx; j++) + { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + + result[y * xsize + x] = sum * scale; +} + +__kernel void clConvolutionYEx( + __global float* result, + const int xsize, const int ysize, + __global const float* inp, + __global const float* multipliers, const int len, + const int step, const int offset, const float border_ratio) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + if (x % step != 0) return; + if (y % step != 0) return; + + float weight_no_border = 0; + for (int j = 0; j <= 2 * offset; j++) + { + weight_no_border += multipliers[j]; + } + + int miny = y < offset ? 0 : y - offset; + int maxy = min(ysize, y + len - offset); + + float weight = 0.0; + for (int j = miny; j < maxy; j++) + { + weight += multipliers[j - y + offset]; + } + + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + + float sum = 0.0; + for (int j = miny; j < maxy; j++) + { + sum += inp[j * xsize + x] * multipliers[j - y + offset]; + } + + result[y * xsize + x] = sum * scale; +} + +__kernel void clSquareSampleEx( + __global float* result, + const int xsize, const int ysize, + __global const float* image, + const int xstep, const int ystep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + int x_sample = x - x % xstep; + int y_sample = y - y % ystep; + + if (x_sample == x && y_sample == y) return; + + result[y * xsize + x] = image[y_sample * xsize + x_sample]; +} + +__kernel void clOpsinDynamicsImageEx( + __global float *r, __global float *g, __global float 
*b, + const int size, + __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred) +{ + const int i = get_global_id(0); + if (i >= size) return; + + double pre[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre, pre_mixed); + + double sensitivity[3]; + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = x; + g[i] = y; + b[i] = z; +} + +__kernel void clMaskHighIntensityChangeEx( + __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b, + const int xsize, const int ysize, + __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b, + __global const float *c0_x, __global const float *c0_y, __global const float *c0_b, + __global const float *c1_x, __global const float *c1_y, __global const float *c1_b +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5f, + (c0_y[ix] + c1_y[ix]) * 0.5f, + (c0_b[ix] + c1_b[ix]) * 0.5f, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; dir < 4; ++dir) { + if (border[dir]) { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 
18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); +} + +__kernel void clEdgeDetectorMapEx( + __global float *result, + const int res_xsize, const int res_ysize, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if (pos_x >= xsize - (8 - step)) return; + if (pos_y >= ysize - (8 - step)) return; + + pos_x = min(pos_x, xsize - 8); + pos_y = min(pos_y, ysize - 8); + + double diff_xyb[3] = { 0.0 }; + Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize, + r, g, b, + r2, g2, b2, + &diff_xyb[0]); + + int idx = (res_y * res_xsize + res_x) * 3; + result[idx] = diff_xyb[0]; + result[idx + 1] = diff_xyb[1]; + result[idx + 2] = diff_xyb[2]; +} + + +__kernel void clBlockDiffMapEx( + __global float* block_diff_dc, __global float* block_diff_ac, + const int res_xsize, const int 
res_ysize, + __global const float* r, __global const float* g, __global const float* b, + __global const float* r2, __global const float* g2, __global const float* b2, + int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; + + int pos_x = res_x * step; + int pos_y = res_y * step; + + if ((pos_x + kBlockEdge - step - 1) >= xsize) return; + if ((pos_y + kBlockEdge - step - 1) >= ysize) return; + + size_t res_ix = res_y * res_xsize + res_x; + size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8); + + double block0[3 * kBlockEdge * kBlockEdge]; + double block1[3 * kBlockEdge * kBlockEdge]; + + double *block0_r = &block0[0]; + double *block0_g = &block0[kBlockEdge * kBlockEdge]; + double *block0_b = &block0[2 * kBlockEdge * kBlockEdge]; + + double *block1_r = &block1[0]; + double *block1_g = &block1[kBlockEdge * kBlockEdge]; + double *block1_b = &block1[2 * kBlockEdge * kBlockEdge]; + + for (int y = 0; y < kBlockEdge; y++) + { + for (int x = 0; x < kBlockEdge; x++) + { + block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x]; + block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x]; + block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x]; + block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x]; + block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x]; + block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x]; + } + } + + double diff_xyb_dc[3] = { 0.0 }; + double diff_xyb_ac[3] = { 0.0 }; + double diff_xyb_edge_dc[3] = { 0.0 }; + + ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc); + + for (int i = 0; i < 3; i++) + { + block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i]; + block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i]; + } +} + +__kernel void clEdgeDetectorLowFreqEx( + __global float *block_diff_ac, + const int res_xsize, const int res_ysize, + __global const float *r, __global const float *g, 
__global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + int xsize, int ysize, int step_) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + if (res_x >= res_xsize || res_y >= res_ysize) return; + + const int step = 8; + if (res_x < step / step_) return; + + int x = (res_x - (step / step_)) * step_; + int y = res_y * step_; + + if (x + step >= xsize) return; + if (y + step >= ysize) return; + + int ix = y * xsize + x; + + double diff[4][3]; + __global const float* blurred0[3] = { r, g, b }; + __global const float* blurred1[3] = { r2, g2, b2 }; + + for (int i = 0; i < 3; ++i) { + int ix2 = ix + 8; + diff[0][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 8 * xsize; + diff[1][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize + 6; + diff[2][i] = + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + ix2 = ix + 6 * xsize - 6; + diff[3][i] = x < step ? 
0 : + ((blurred1[i][ix] - blurred0[i][ix]) + + (blurred0[i][ix2] - blurred1[i][ix2])); + } + double max_diff_xyb[3] = { 0 }; + for (int k = 0; k < 4; ++k) { + double diff_xyb[3] = { 0 }; + XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2], + 0, 0, 0, 1.0, + diff_xyb); + for (int i = 0; i < 3; ++i) { + max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]); + } + } + + int res_ix = res_y * res_xsize + res_x; + + const double kMul = 10; + + block_diff_ac[res_ix * 3] += max_diff_xyb[0] * kMul; + block_diff_ac[res_ix * 3 + 1] += max_diff_xyb[1] * kMul; + block_diff_ac[res_ix * 3 + 2] += max_diff_xyb[2] * kMul; +} + +__kernel void clDiffPrecomputeEx( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + const int xsize, const int ysize, + __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b, + __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + double valsh0[3] = { 0.0 }; + double valsv0[3] = { 0.0 }; + double valsh1[3] = { 0.0 }; + double valsv1[3] = { 0.0 }; + int ix2; + + int ix = x + xsize * y; + if (x + 1 < xsize) { + ix2 = ix + 1; + } + else { + ix2 = ix - 1; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]); + double x1 = (xyb1_x[ix] - xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]); + } + if (y + 1 < ysize) { + ix2 = ix + xsize; + } + else { + ix2 = ix - xsize; + } + { + double x0 = (xyb0_x[ix] - xyb0_x[ix2]); + double y0 = (xyb0_y[ix] - xyb0_y[ix2]); + double z0 = (xyb0_b[ix] - xyb0_b[ix2]); + XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]); + double x1 = (xyb1_x[ix] - 
xyb1_x[ix2]); + double y1 = (xyb1_y[ix] - xyb1_y[ix2]); + double z1 = (xyb1_b[ix] - xyb1_b[ix2]); + XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]); + } + + double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]); + double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]); + double m = min(sup0, sup1); + mask_x[ix] = (float)(m); + + sup0 = fabs(valsh0[1]) + fabs(valsv0[1]); + sup1 = fabs(valsh1[1]) + fabs(valsv1[1]); + m = min(sup0, sup1); + mask_y[ix] = (float)(m); + + sup0 = fabs(valsh0[2]) + fabs(valsv0[2]); + sup1 = fabs(valsh1[2]) + fabs(valsv1[2]); + m = min(sup0, sup1); + mask_b[ix] = (float)(m); +} + +__kernel void clScaleImageEx(__global float *img, const int size, float scale) +{ + const int i = get_global_id(0); + if (i >= size) return; + + img[i] *= scale; +} + +#define Average5x5_w 0.679144890667f +__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w); +__kernel void clAverage5x5Ex(__global float *img, const int xsize, const int ysize, __global const float *img_org) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if (x >= xsize || y >= ysize) return; + + const int row0 = y * xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[row0 + x - 1]; + } + if (x + 1 < xsize) { + img[row0 + x] += img_org[row0 + x + 1]; + } + + if (y > 0) { + const int rowd1 = row0 - xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w; + } + img[row0 + x] += img_org[rowd1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w; + } + } + + if (y + 1 < ysize) { + const int rowu1 = row0 + xsize; + if (x - 1 >= 0) { + img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w; + } + img[row0 + x] += img_org[rowu1 + x]; + if (x + 1 < xsize) { + img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w; + } + } + + img[row0 + x] *= Average5x5_scale; +} + +__kernel void clMinSquareValEx(__global float* result, const int xsize, const int ysize, __global const float* img, int square_size, int offset) 
+{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + + int minH = offset > y ? 0 : y - offset; + int maxH = min(y + square_size - offset, ysize); + + int minW = offset > x ? 0 : x - offset; + int maxW = min(x + square_size - offset, xsize); + + float minValue = img[minH * xsize + minW]; + + for (int j = minH; j < maxH; j++) + { + for (int i = minW; i < maxW; i++) + { + float tmp = img[j * xsize + i]; + if (tmp < minValue) minValue = tmp; + } + } + + result[y * xsize + x] = minValue; +} + +__kernel void clDoMaskEx( + __global float *mask_x, __global float *mask_y, __global float *mask_b, + const int xsize, const int ysize, + __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, + __global const double *lut_x, __global const double *lut_y, __global const double *lut_b, + __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const double w00 = 232.206464018; + const double w11 = 22.9455222245; + const double w22 = 503.962310606; + + const size_t idx = y * xsize + x; + const double s0 = mask_x[idx]; + const double s1 = mask_y[idx]; + const double s2 = mask_b[idx]; + const double p0 = w00 * s0; + const double p1 = w11 * s1; + const double p2 = w22 * s2; + + mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0)); + mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1)); + mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2)); + mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0)); + mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1)); + mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2)); +} + +__kernel void clCombineChannelsEx( + __global float *result, + __global const float *mask_x, __global const float *mask_y, __global const float *mask_b, + __global const float *mask_dc_x, 
__global const float *mask_dc_y, __global const float *mask_dc_b, + const int xsize, const int ysize, + __global const float *block_diff_dc, + __global const float *block_diff_ac, + __global float *edge_detector_map, + const int res_xsize, + const int step) +{ + const int res_x = get_global_id(0) * step; + const int res_y = get_global_id(1) * step; + + double mask[3]; + double dc_mask[3]; + mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)]; + + mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)]; + + mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)]; + dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)]; + + size_t res_ix = (res_y * res_xsize + res_x) / step; + result[res_ix] = (float)( + DotProduct(&block_diff_dc[3 * res_ix], dc_mask) + + DotProduct(&block_diff_ac[3 * res_ix], mask) + + DotProduct(&edge_detector_map[3 * res_ix], mask)); +} + +__kernel void clUpsampleSquareRootEx(__global float *diffmap_out, __global const float *diffmap, int xsize, int ysize, int step) +{ + const int res_x = get_global_id(0); + const int res_y = get_global_id(1); + + const int res_xsize = get_global_size(0); + const int res_ysize = get_global_size(1); + + const int pos_x = res_x * step; + const int pos_y = res_y * step; + + if (pos_y + 8 - step >= ysize) return; + if (pos_x + 8 - step >= xsize) return; + + int s2 = (8 - step) / 2; + + // Upsample and take square root. + float orig_val = diffmap[res_y * res_xsize + res_x]; + + const float kInitialSlope = 100; + // TODO(b/29974893): Until that is fixed do not call sqrt on very small + // numbers. + double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope)) + ? 
kInitialSlope * orig_val + : sqrt(orig_val); + + for (size_t off_y = 0; off_y < step; ++off_y) { + for (size_t off_x = 0; off_x < step; ++off_x) { + diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val; + } + } +} + +__kernel void clRemoveBorderEx(__global float *out, const int xsize, const int ysize, __global const float *in, int s, int s2) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize || y >= ysize) return; + + out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2]; +} + +__kernel void clAddBorderEx(__global float *out, const int xsize, const int ysize, int s, int s2, __global const float *in) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= xsize - s || + y >= ysize - s) + { + return; + } + + const double mul1 = 24.8235314874; + out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x]; + +} + +__kernel void clComputeBlockZeroingOrderEx( + __global const coeff_t *orig_batch_0, // Coeffs of Original image. + __global const coeff_t *orig_batch_1, // Coeffs of Original image. + __global const coeff_t *orig_batch_2, // Coeffs of Original image. + __global const float *orig_image_batch, // pregamma of Original image.. + __global const float *mask_scale, // mask_scale of Original image.. + const int block_xsize, + const int block_ysize, + const int image_width, + const int image_height, + + __global const coeff_t *mayout_batch_0, // Coeffs of output image. + __global const coeff_t *mayout_batch_1, // Coeffs of output image. + __global const coeff_t *mayout_batch_2, // Coeffs of output image. + __global const ushort *mayout_pixel_0, + __global const ushort *mayout_pixel_1, + __global const ushort *mayout_pixel_2, + + const channel_info mayout_channel_0, + const channel_info mayout_channel_1, + const channel_info mayout_channel_2, + const int factor, // Current factor in computing. + const int comp_mask, // Current channel in computing. 
+ const float BlockErrorLimit, + __global CoeffData *output_order_list/*out*/) +{ + const int block_x = get_global_id(0); + const int block_y = get_global_id(1); + + if (block_x >= block_xsize || block_y >= block_ysize) return; + + channel_info orig_channel[3]; + orig_channel[0].coeff = orig_batch_0; + orig_channel[1].coeff = orig_batch_1; + orig_channel[2].coeff = orig_batch_2; + + channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 }; + mayout_channel[0].coeff = mayout_batch_0; + mayout_channel[1].coeff = mayout_batch_1; + mayout_channel[2].coeff = mayout_batch_2; + mayout_channel[0].pixel = mayout_pixel_0; + mayout_channel[1].pixel = mayout_pixel_1; + mayout_channel[2].pixel = mayout_pixel_2; + + int block_idx = 0; + + coeff_t mayout_block[kComputeBlockSize] = { 0 }; + coeff_t orig_block[kComputeBlockSize] = { 0 }; + + for (int c = 0; c < 3; c++) { + if (comp_mask & (1< 0) + { + float best_err = 1e17f; + int best_i = 0; + for (int i = 0; i < min(3, input_order.size); i++) + { + const int idx = input_order.pData[i].idx; + coeff_t old_coeff = mayout_block[idx]; + mayout_block[idx] = 0; + + + float max_err = CompareBlockFactor(mayout_channel, + mayout_block, + block_x, + block_y, + orig_image_batch, + mask_scale, + image_width, + image_height, + factor); + if (max_err < best_err) + { + best_err = max_err; + best_i = i; + } + mayout_block[idx] = old_coeff; + } + + if (best_err >= BlockErrorLimit) + { // The input_order is an ascent vector, break when best_err exceed the error limit. 
+ break; + } + int idx = input_order.pData[best_i].idx; + mayout_block[idx] = 0; + list_erase(&input_order, best_i); + + list_push_back(&output_order, idx, best_err); + } + + float min_err = 1e10; + for (int i = output_order.size - 1; i >= 0; --i) { + min_err = min(min_err, output_order.pData[i].err); + output_order.pData[i].err = min_err; + } + + __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize; + + int out_count = 0; + for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++) + { + // err exceeding the limit is no need to continue. + if (output_order.pData[i].err <= BlockErrorLimit) + { + output_block[out_count].idx = output_order.pData[i].idx; + output_block[out_count].err = output_order.pData[i].err; + out_count++; + } + } +} + +__device__ void Butteraugli8x8CornerEdgeDetectorDiff( + int pos_x, + int pos_y, + int xsize, + int ysize, + __global const float *r, __global const float *g, __global const float* b, + __global const float *r2, __global const float* g2, __global const float *b2, + double* diff_xyb) +{ + int local_count = 0; + double local_xyb[3] = { 0 }; + const double w = 0.711100840192; + + int offset[4][2] = { { 0,0 },{ 0,7 },{ 7,0 },{ 7,7 } }; + int edgeSize = 3; + + for (int k = 0; k < 4; k++) + { + int x = pos_x + offset[k][0]; + int y = pos_y + offset[k][1]; + + if (x >= edgeSize && x + edgeSize < xsize) { + size_t ix = y * xsize + (x - edgeSize); + size_t ix2 = ix + 2 * edgeSize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] - b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + if (y >= edgeSize && y + edgeSize < ysize) { + size_t ix = (y - edgeSize) * xsize + x; + size_t ix2 = ix + 2 * edgeSize * xsize; + XybDiffLowFreqSquaredAccumulate( + w * (r[ix] - r[ix2]), + w * (g[ix] - g[ix2]), + w * (b[ix] - b[ix2]), + w * (r2[ix] - r2[ix2]), + w * (g2[ix] - g2[ix2]), + w * (b2[ix] 
- b2[ix2]), + 1.0, local_xyb); + ++local_count; + } + } + + const double weight = 0.01617112696; + const double mul = weight * 8.0 / local_count; + for (int i = 0; i < 3; ++i) { + diff_xyb[i] += mul * local_xyb[i]; + } +} + +__device__ double DotProduct(__global const float u[3], const double v[3]) { + return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; +} + +__device__ double Interpolate(__constant_ex const double *array, const int size, const double sx) { + double ix = fabs(sx); + + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + if (sx < 0) res = -res; + return res; +} + +#define XybToVals_off_x 11.38708334481672 +#define XybToVals_inc_x 14.550189611520716 +__constant double XybToVals_lut_x[21] = { + 0, + XybToVals_off_x, + XybToVals_off_x + 1 * XybToVals_inc_x, + XybToVals_off_x + 2 * XybToVals_inc_x, + XybToVals_off_x + 3 * XybToVals_inc_x, + XybToVals_off_x + 4 * XybToVals_inc_x, + XybToVals_off_x + 5 * XybToVals_inc_x, + XybToVals_off_x + 6 * XybToVals_inc_x, + XybToVals_off_x + 7 * XybToVals_inc_x, + XybToVals_off_x + 8 * XybToVals_inc_x, + XybToVals_off_x + 9 * XybToVals_inc_x, + XybToVals_off_x + 10 * XybToVals_inc_x, + XybToVals_off_x + 11 * XybToVals_inc_x, + XybToVals_off_x + 12 * XybToVals_inc_x, + XybToVals_off_x + 13 * XybToVals_inc_x, + XybToVals_off_x + 14 * XybToVals_inc_x, + XybToVals_off_x + 15 * XybToVals_inc_x, + XybToVals_off_x + 16 * XybToVals_inc_x, + XybToVals_off_x + 17 * XybToVals_inc_x, + XybToVals_off_x + 18 * XybToVals_inc_x, + XybToVals_off_x + 19 * XybToVals_inc_x, +}; + +#define XybToVals_off_y 1.4103373714040413 +#define XybToVals_inc_y 0.7084088867024 +__constant double XybToVals_lut_y[21] = { + 0, + XybToVals_off_y, + XybToVals_off_y + 1 * XybToVals_inc_y, + XybToVals_off_y + 2 * XybToVals_inc_y, + XybToVals_off_y + 3 * XybToVals_inc_y, + XybToVals_off_y 
+ 4 * XybToVals_inc_y, + XybToVals_off_y + 5 * XybToVals_inc_y, + XybToVals_off_y + 6 * XybToVals_inc_y, + XybToVals_off_y + 7 * XybToVals_inc_y, + XybToVals_off_y + 8 * XybToVals_inc_y, + XybToVals_off_y + 9 * XybToVals_inc_y, + XybToVals_off_y + 10 * XybToVals_inc_y, + XybToVals_off_y + 11 * XybToVals_inc_y, + XybToVals_off_y + 12 * XybToVals_inc_y, + XybToVals_off_y + 13 * XybToVals_inc_y, + XybToVals_off_y + 14 * XybToVals_inc_y, + XybToVals_off_y + 15 * XybToVals_inc_y, + XybToVals_off_y + 16 * XybToVals_inc_y, + XybToVals_off_y + 17 * XybToVals_inc_y, + XybToVals_off_y + 18 * XybToVals_inc_y, + XybToVals_off_y + 19 * XybToVals_inc_y, +}; + +__device__ void XybToVals( + double x, double y, double z, + double *valx, double *valy, double *valz) +{ + const double xmul = 0.758304045695; + const double ymul = 2.28148649801; + const double zmul = 1.87816926918; + + *valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul); + *valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul); + *valz = zmul * z; +} + +#define XybLowFreqToVals_inc 5.2511644570349185 +__constant double XybLowFreqToVals_lut[21] = { + 0, + 1 * XybLowFreqToVals_inc, + 2 * XybLowFreqToVals_inc, + 3 * XybLowFreqToVals_inc, + 4 * XybLowFreqToVals_inc, + 5 * XybLowFreqToVals_inc, + 6 * XybLowFreqToVals_inc, + 7 * XybLowFreqToVals_inc, + 8 * XybLowFreqToVals_inc, + 9 * XybLowFreqToVals_inc, + 10 * XybLowFreqToVals_inc, + 11 * XybLowFreqToVals_inc, + 12 * XybLowFreqToVals_inc, + 13 * XybLowFreqToVals_inc, + 14 * XybLowFreqToVals_inc, + 15 * XybLowFreqToVals_inc, + 16 * XybLowFreqToVals_inc, + 17 * XybLowFreqToVals_inc, + 18 * XybLowFreqToVals_inc, + 19 * XybLowFreqToVals_inc, + 20 * XybLowFreqToVals_inc, +}; + +__device__ void XybLowFreqToVals(double x, double y, double z, + double *valx, double *valy, double *valz) { + const double xmul = 6.64482198135; + const double ymul = 0.837846224276; + const double zmul = 7.34905756986; + const double y_to_z_mul = 0.0812519812628; + + z += y_to_z_mul * y; + *valz = z 
* zmul; + *valx = x * xmul; + *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul); +} + +__device__ double InterpolateClampNegative(__global const double *array, + int size, double sx) { + if (sx < 0) { + sx = 0; + } + double ix = fabs(sx); + int baseix = (int)(ix); + double res; + if (baseix >= size - 1) { + res = array[size - 1]; + } + else { + double mix = ix - baseix; + int nextix = baseix + 1; + res = array[baseix] + mix * (array[nextix] - array[baseix]); + } + return res; +} + +__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0, + double r1, double g1, double b1, + double factor, double res[3]) { + double valx0, valy0, valz0; + double valx1, valy1, valz1; + XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0); + if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) { + //PROFILER_ZONE("XybDiff r1=g1=b1=0"); + res[0] += factor * valx0 * valx0; + res[1] += factor * valy0 * valy0; + res[2] += factor * valz0 * valz0; + return; + } + XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1); + // Approximate the distance of the colors by their respective distances + // to gray. 
+ double valx = valx0 - valx1; + double valy = valy0 - valy1; + double valz = valz0 - valz1; + res[0] += factor * valx * valx; + res[1] += factor * valy * valy; + res[2] += factor * valz * valz; +} + +typedef struct __Complex +{ + double real; + double imag; +}Complex; + +__constant double kSqrtHalf = 0.70710678118654752440084436210484903; +__device__ void RealFFT8(const double* in, Complex* out) { + double t1, t2, t3, t5, t6, t7, t8; + t8 = in[6]; + t5 = in[2] - t8; + t8 += in[2]; + out[2].real = t8; + out[6].imag = -t5; + out[4].imag = t5; + t8 = in[4]; + t3 = in[0] - t8; + t8 += in[0]; + out[0].real = t8; + out[4].real = t3; + out[6].real = t3; + t7 = in[5]; + t3 = in[1] - t7; + t7 += in[1]; + out[1].real = t7; + t8 = in[7]; + t5 = in[3] - t8; + t8 += in[3]; + out[3].real = t8; + t2 = -t5; + t6 = t3 - t5; + t8 = kSqrtHalf; + t6 *= t8; + out[5].real = out[4].real - t6; + t1 = t3 + t5; + t1 *= t8; + out[5].imag = out[4].imag - t1; + t6 += out[4].real; + out[4].real = t6; + t1 += out[4].imag; + out[4].imag = t1; + t5 = t2 - t3; + t5 *= t8; + out[7].imag = out[6].imag - t5; + t2 += t3; + t2 *= t8; + out[7].real = out[6].real - t2; + t2 += out[6].real; + out[6].real = t2; + t5 += out[6].imag; + out[6].imag = t5; + t5 = out[2].real; + t1 = out[0].real - t5; + t7 = out[3].real; + t5 += out[0].real; + t3 = out[1].real - t7; + t7 += out[1].real; + t8 = t5 + t7; + out[0].real = t8; + t5 -= t7; + out[1].real = t5; + out[2].imag = t3; + out[3].imag = -t3; + out[3].real = t1; + out[2].real = t1; + out[0].imag = 0; + out[1].imag = 0; + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. 
+ Complex tmp = out[2]; + out[2] = out[3]; + out[3] = out[5]; + out[5] = out[7]; + out[7] = out[4]; + out[4] = out[1]; + out[1] = out[6]; + out[6] = tmp; +} + +__device__ void TransposeBlock(Complex data[kBlockSize]) { + for (int i = 0; i < kBlockEdge; i++) { + for (int j = 0; j < i; j++) { + Complex tmp = data[kBlockEdge * i + j]; + data[kBlockEdge * i + j] = data[kBlockEdge * j + i]; + data[kBlockEdge * j + i] = tmp; + } + } +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements. +__device__ inline void FFT4(Complex* a) { + double t1, t2, t3, t4, t5, t6, t7, t8; + t5 = a[2].real; + t1 = a[0].real - t5; + t7 = a[3].real; + t5 += a[0].real; + t3 = a[1].real - t7; + t7 += a[1].real; + t8 = t5 + t7; + a[0].real = t8; + t5 -= t7; + a[1].real = t5; + t6 = a[2].imag; + t2 = a[0].imag - t6; + t6 += a[0].imag; + t5 = a[3].imag; + a[2].imag = t2 + t3; + t2 -= t3; + a[3].imag = t2; + t4 = a[1].imag - t5; + a[3].real = t1 + t4; + t1 -= t4; + a[2].real = t1; + t5 += a[1].imag; + a[0].imag = t6 + t5; + t6 -= t5; + a[1].imag = t6; +} + +// D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements. 
+__device__ void FFT8(Complex* a) { + const double kSqrtHalf = 0.70710678118654752440084436210484903; + double t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = a[4].imag; + t4 = a[0].imag - t7; + t7 += a[0].imag; + a[0].imag = t7; + + t8 = a[6].real; + t5 = a[2].real - t8; + t8 += a[2].real; + a[2].real = t8; + + t7 = a[6].imag; + a[6].imag = t4 - t5; + t4 += t5; + a[4].imag = t4; + + t6 = a[2].imag - t7; + t7 += a[2].imag; + a[2].imag = t7; + + t8 = a[4].real; + t3 = a[0].real - t8; + t8 += a[0].real; + a[0].real = t8; + + a[4].real = t3 - t6; + t3 += t6; + a[6].real = t3; + + t7 = a[5].real; + t3 = a[1].real - t7; + t7 += a[1].real; + a[1].real = t7; + + t8 = a[7].imag; + t6 = a[3].imag - t8; + t8 += a[3].imag; + a[3].imag = t8; + t1 = t3 - t6; + t3 += t6; + + t7 = a[5].imag; + t4 = a[1].imag - t7; + t7 += a[1].imag; + a[1].imag = t7; + + t8 = a[7].real; + t5 = a[3].real - t8; + t8 += a[3].real; + a[3].real = t8; + + t2 = t4 - t5; + t4 += t5; + + t6 = t1 - t4; + t8 = kSqrtHalf; + t6 *= t8; + a[5].real = a[4].real - t6; + t1 += t4; + t1 *= t8; + a[5].imag = a[4].imag - t1; + t6 += a[4].real; + a[4].real = t6; + t1 += a[4].imag; + a[4].imag = t1; + + t5 = t2 - t3; + t5 *= t8; + a[7].imag = a[6].imag - t5; + t2 += t3; + t2 *= t8; + a[7].real = a[6].real - t2; + t2 += a[6].real; + a[6].real = t2; + t5 += a[6].imag; + a[6].imag = t5; + + FFT4(a); + + // Reorder to the correct output order. + // TODO: Modify the above computation so that this is not needed. 
+ Complex tmp = a[2]; + a[2] = a[3]; + a[3] = a[5]; + a[5] = a[7]; + a[7] = a[4]; + a[4] = a[1]; + a[1] = a[6]; + a[6] = tmp; +} + +__device__ double abssq(const Complex c) { + return c.real * c.real + c.imag * c.imag; +} + +__device__ void ButteraugliFFTSquared(__private double block[kBlockSize]) { + double global_mul = 0.000064; + Complex block_c[kBlockSize]; + + for (int y = 0; y < kBlockEdge; ++y) { + RealFFT8(block + y * kBlockEdge, block_c + y * kBlockEdge); + } + TransposeBlock(block_c); + double r0[kBlockEdge]; + double r1[kBlockEdge]; + for (int x = 0; x < kBlockEdge; ++x) { + r0[x] = block_c[x].real; + r1[x] = block_c[kBlockHalf + x].real; + } + RealFFT8(r0, block_c); + RealFFT8(r1, block_c + kBlockHalf); + for (int y = 1; y < kBlockEdgeHalf; ++y) { + FFT8(block_c + y * kBlockEdge); + } + for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + block[i] = abssq(block_c[i]); + block[i] *= global_mul; + } +} + +__device__ double RemoveRangeAroundZero(double v, double range) { + if (v >= -range && v < range) { + return 0; + } + if (v < 0) { + return v + range; + } + else { + return v - range; + } +} + +#define MakeHighFreqColorDiffDy_off 1.4103373714040413 +#define MakeHighFreqColorDiffDy_inc 0.7084088867024 +__constant double MakeHighFreqColorDiffDy_lut[21] = { + 0.0, + MakeHighFreqColorDiffDy_off, + MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc, + 
MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc, + MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc, +}; + +__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = { + 5.28270670524, + 0.0, + 0.0, + 0.0, + 0.3831134973, + 0.676303603859, + 3.58927792424, + 18.6104367002, + 18.6104367002, + 3.09093131948, + 1.0, + 0.498250875965, + 0.36198671102, + 0.308982169883, + 0.1312701920435, + 2.37370549629, + 3.58927792424, + 1.0, + 2.37370549629, + 0.991205724152, + 1.05178802919, + 0.627264168628, + 0.4, + 0.1312701920435, + 0.676303603859, + 0.498250875965, + 0.991205724152, + 0.5, + 0.3831134973, + 0.349686450518, + 0.627264168628, + 0.308982169883, + 0.3831134973, + 0.36198671102, + 1.05178802919, + 0.3831134973, + 0.12, +}; + +// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared +// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average +// diff on the edges to diff_xyb_edge_dc. 
+__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize], + __private double xyb1[3 * kBlockSize], + double diff_xyb_dc[3], + double diff_xyb_ac[3], + double diff_xyb_edge_dc[3]) { + + double avgdiff_xyb[3] = { 0.0 }; + double avgdiff_edge[3][4] = { { 0.0 } }; + + for (int i = 0; i < 3 * kBlockSize; ++i) { + const double diff_xyb = xyb0[i] - xyb1[i]; + const int c = i / kBlockSize; + avgdiff_xyb[c] += diff_xyb / kBlockSize; + const int k = i % kBlockSize; + const int kx = k % kBlockEdge; + const int ky = k / kBlockEdge; + const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1; + const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 2 : -1; + if (h_edge_idx >= 0) { + avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge; + } + if (v_edge_idx >= 0) { + avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge; + } + } + XybDiffLowFreqSquaredAccumulate(avgdiff_xyb[0], + avgdiff_xyb[1], + avgdiff_xyb[2], + 0, 0, 0, csf8x8[0], + diff_xyb_dc); + for (int i = 0; i < 4; ++i) { + XybDiffLowFreqSquaredAccumulate(avgdiff_edge[0][i], + avgdiff_edge[1][i], + avgdiff_edge[2][i], + 0, 0, 0, csf8x8[0], + diff_xyb_edge_dc); + } + + double* xyb_avg = xyb0; + double* xyb_halfdiff = xyb1; + for (int i = 0; i < 3 * kBlockSize; ++i) { + double avg = (xyb0[i] + xyb1[i]) / 2; + double halfdiff = (xyb0[i] - xyb1[i]) / 2; + xyb_avg[i] = avg; + xyb_halfdiff[i] = halfdiff; + } + double *y_avg = &xyb_avg[kBlockSize]; + double *x_halfdiff_squared = &xyb_halfdiff[0]; + double *y_halfdiff = &xyb_halfdiff[kBlockSize]; + double *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize]; + ButteraugliFFTSquared(y_avg); + ButteraugliFFTSquared(x_halfdiff_squared); + ButteraugliFFTSquared(y_halfdiff); + ButteraugliFFTSquared(z_halfdiff_squared); + + const double xmul = 64.8; + const double ymul = 1.753123908348329; + const double ymul2 = 1.51983458269; + const double zmul = 2.4; + + for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) { + double d = csf8x8[i]; + diff_xyb_ac[0] += d 
* xmul * x_halfdiff_squared[i]; + diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i]; + + y_avg[i] = sqrt(y_avg[i]); + y_halfdiff[i] = sqrt(y_halfdiff[i]); + double y0 = y_avg[i] - y_halfdiff[i]; + double y1 = y_avg[i] + y_halfdiff[i]; + // Remove the impact of small absolute values. + // This improves the behavior with flat noise. + const double ylimit = 0.04; + y0 = RemoveRangeAroundZero(y0, ylimit); + y1 = RemoveRangeAroundZero(y1, ylimit); + if (y0 != y1) { + double valy0 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y0 * ymul2); + double valy1 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y1 * ymul2); + double valy = ymul * (valy0 - valy1); + diff_xyb_ac[1] += d * valy * valy; + } + } +} + +__constant static float g_mix[12] = { + 0.348036746003, + 0.577814843137, + 0.0544556093735, + 0.774145581713, + 0.26922717275, + 0.767247733938, + 0.0366922708552, + 0.920130265014, + 0.0882062883536, + 0.158581714673, + 0.712857943858, + 10.6524069248, +}; + +__device__ void OpsinAbsorbance(const double in[3], double out[3]) +{ + out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3]; + out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7]; + out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11]; +} + +__device__ double EvaluatePolynomial(const double x, __constant_ex const double *coefficients, int n) +{ + double b1 = 0.0; + double b2 = 0.0; + + for (int i = n - 1; i >= 0; i--) + { + if (i == 0) { + const double x_b1 = x * b1; + b1 = x_b1 - b2 + coefficients[0]; + break; + } + const double x_b1 = x * b1; + const double t = (x_b1 + x_b1) - b2 + coefficients[i]; + b2 = b1; + b1 = t; + } + + return b1; +} + +static __constant double g_gamma_p[5 + 1] = { + 881.979476556478289, 1496.058452015812463, 908.662212739659481, + 373.566100223287378, 85.840860336314364, 6.683258861509244, +}; + +static __constant double g_gamma_q[5 + 1] = { + 12.262350348616792, 20.557285797683576, 12.161463238367844, + 
4.711532733641639, 0.899112889751053, 0.035662329617191, +}; + +__device__ double Gamma(double v) +{ + const double min_value = 0.770000000000000; + const double max_value = 274.579999999999984; + const double x01 = (v - min_value) / (max_value - min_value); + const double xc = 2.0 * x01 - 1.0; + + const double yp = EvaluatePolynomial(xc, g_gamma_p, 6); + const double yq = EvaluatePolynomial(xc, g_gamma_q, 6); + if (yq == 0.0) return 0.0; + return (float)(yp / yq); +} + +__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz) +{ + const double a0 = 1.01611726948; + const double a1 = 0.982482243696; + const double a2 = 1.43571362627; + const double a3 = 0.896039849412; + *valx = a0 * r - a1 * g; + *valy = a2 * r + a3 * g; + *valz = b; +} + +__device__ int list_push_back(IntFloatPairList* list, int i, float f) +{ + list->pData[list->size].idx = i; + list->pData[list->size].err = f; + return ++list->size; +} + +__device__ int list_erase(IntFloatPairList* list, int idx) +{ + for (int i = idx; i < list->size - 1; i++) + { + list->pData[i].idx = list->pData[i + 1].idx; + list->pData[i].err = list->pData[i + 1].err; + } + return --list->size; +} + +__device__ int SortInputOrder(DCTScoreData* input_order, int size) +{ + int i, j; + DCTScoreData tmp; + for (j = 1; j < size; j++) { + tmp.idx = input_order[j].idx; + tmp.err = input_order[j].err; + + i = j - 1; + while (i >= 0 && input_order[i].err > tmp.err) { + input_order[i + 1].idx = input_order[i].idx; + input_order[i + 1].err = input_order[i].err; + i--; + } + input_order[i + 1].idx = tmp.idx; + input_order[i + 1].err = tmp.err; + } + return size; +} + +__constant static float csf[192] = { + 0.0f, + 1.71014f, + 0.298711f, + 0.233709f, + 0.223126f, + 0.207072f, + 0.192775f, + 0.161201f, + 2.05807f, + 0.222927f, + 0.203406f, + 0.188465f, + 0.184668f, + 0.169993f, + 0.159142f, + 0.130155f, + 0.430518f, + 0.204939f, + 0.206655f, + 0.192231f, + 0.182941f, + 0.169455f, + 0.157599f, + 
0.127153f, + 0.234757f, + 0.191098f, + 0.192698f, + 0.17425f, + 0.166503f, + 0.142154f, + 0.126182f, + 0.104196f, + 0.226117f, + 0.185373f, + 0.183825f, + 0.166643f, + 0.159414f, + 0.12636f, + 0.108696f, + 0.0911974f, + 0.207463f, + 0.171517f, + 0.170124f, + 0.141582f, + 0.126213f, + 0.103627f, + 0.0882436f, + 0.0751848f, + 0.196436f, + 0.161947f, + 0.159271f, + 0.126938f, + 0.109125f, + 0.0878027f, + 0.0749842f, + 0.0633859f, + 0.165232f, + 0.132905f, + 0.128679f, + 0.105766f, + 0.0906087f, + 0.0751544f, + 0.0641187f, + 0.0529921f, + 0.0f, + 0.147235f, + 0.11264f, + 0.0757892f, + 0.0493929f, + 0.0280663f, + 0.0075012f, + -0.000945567f, + 0.149251f, + 0.0964806f, + 0.0786224f, + 0.05206f, + 0.0292758f, + 0.00353094f, + -0.00277912f, + -0.00404481f, + 0.115551f, + 0.0793142f, + 0.0623735f, + 0.0405019f, + 0.0152656f, + -0.00145742f, + -0.00370369f, + -0.00375106f, + 0.0791547f, + 0.0537506f, + 0.0413634f, + 0.0193486f, + 0.000609066f, + -0.00510923f, + -0.0046452f, + -0.00385187f, + 0.0544534f, + 0.0334066f, + 0.0153899f, + 0.000539088f, + -0.00356085f, + -0.00535661f, + -0.00429145f, + -0.00343131f, + 0.0356439f, + 0.00865645f, + 0.00165229f, + -0.00425931f, + -0.00507324f, + -0.00459083f, + -0.003703f, + -0.00310327f, + 0.0121926f, + -0.0009259f, + -0.00330991f, + -0.00499378f, + -0.00437381f, + -0.00377427f, + -0.00311731f, + -0.00255125f, + -0.000320593f, + -0.00426043f, + -0.00416549f, + -0.00419364f, + -0.00365418f, + -0.00317499f, + -0.00255932f, + -0.00217917f, + 0.0f, + 0.143471f, + 0.124336f, + 0.0947465f, + 0.0814066f, + 0.0686776f, + 0.0588122f, + 0.0374415f, + 0.146315f, + 0.105334f, + 0.0949415f, + 0.0784241f, + 0.0689064f, + 0.0588304f, + 0.0495961f, + 0.0202342f, + 0.123818f, + 0.0952654f, + 0.0860556f, + 0.0724158f, + 0.0628307f, + 0.0529965f, + 0.0353941f, + 0.00815821f, + 0.097054f, + 0.080422f, + 0.0731085f, + 0.0636154f, + 0.055606f, + 0.0384127f, + 0.0142879f, + 0.00105195f, + 0.0849312f, + 0.071115f, + 0.0631183f, + 0.0552972f, + 0.0369221f, + 
0.00798314f, + 0.000716374f, + -0.00200948f, + 0.0722298f, + 0.0599559f, + 0.054841f, + 0.0387529f, + 0.0107262f, + 0.000355315f, + -0.00244803f, + -0.00335222f, + 0.0635335f, + 0.0514196f, + 0.0406309f, + 0.0125833f, + 0.00151305f, + -0.00140269f, + -0.00362547f, + -0.00337649f, + 0.0472024f, + 0.0198725f, + 0.0113437f, + 0.00266305f, + -0.00137183f, + -0.00354158f, + -0.00341292f, + -0.00290074f +}; + +__constant static float bias[192] = { + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0 +}; + +__device__ 
coeff_t _abs(coeff_t val) +{ + return val >= 0 ? val : -val; +} + +__device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size) +{ + int size = 0; + for (int c = 0; c < 3; ++c) { + for (int k = 1; k < block_size; ++k) { + int idx = c * block_size + k; + if (block[idx] != 0) { + float score = _abs(orig_block[idx]) * csf[idx] + bias[idx]; + size = list_push_back(input_order, idx, score); + } + } + } + return SortInputOrder(input_order->pData, size); +} + +__constant static int kIDCTMatrix[kDCTBlockSize] = { + 8192, 11363, 10703, 9633, 8192, 6437, 4433, 2260, + 8192, 9633, 4433, -2259, -8192, -11362, -10704, -6436, + 8192, 6437, -4433, -11362, -8192, 2261, 10704, 9633, + 8192, 2260, -10703, -6436, 8192, 9633, -4433, -11363, + 8192, -2260, -10703, 6436, 8192, -9633, -4433, 11363, + 8192, -6437, -4433, 11362, -8192, -2261, 10704, -9633, + 8192, -9633, 4433, 2259, -8192, 11362, -10704, 6436, + 8192, -11363, 10703, -9633, 8192, -6437, 4433, -2260, +}; + +// Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]} +__device__ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { + int tmp0, tmp1, tmp2, tmp3, tmp4; + + tmp1 = kIDCTMatrix[0] * in[0]; + out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = tmp1; + + tmp0 = in[stride]; + tmp1 = kIDCTMatrix[1] * tmp0; + tmp2 = kIDCTMatrix[9] * tmp0; + tmp3 = kIDCTMatrix[17] * tmp0; + tmp4 = kIDCTMatrix[25] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[2 * stride]; + tmp1 = kIDCTMatrix[2] * tmp0; + tmp2 = kIDCTMatrix[10] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] -= tmp2; + out[3] -= tmp1; + out[4] -= tmp1; + out[5] -= tmp2; + out[6] += tmp2; + out[7] += tmp1; + + tmp0 = in[3 * stride]; + tmp1 = kIDCTMatrix[3] * tmp0; + tmp2 = kIDCTMatrix[11] * tmp0; + tmp3 = 
kIDCTMatrix[19] * tmp0; + tmp4 = kIDCTMatrix[27] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[4 * stride]; + tmp1 = kIDCTMatrix[4] * tmp0; + out[0] += tmp1; + out[1] -= tmp1; + out[2] -= tmp1; + out[3] += tmp1; + out[4] += tmp1; + out[5] -= tmp1; + out[6] -= tmp1; + out[7] += tmp1; + + tmp0 = in[5 * stride]; + tmp1 = kIDCTMatrix[5] * tmp0; + tmp2 = kIDCTMatrix[13] * tmp0; + tmp3 = kIDCTMatrix[21] * tmp0; + tmp4 = kIDCTMatrix[29] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; + + tmp0 = in[6 * stride]; + tmp1 = kIDCTMatrix[6] * tmp0; + tmp2 = kIDCTMatrix[14] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] -= tmp2; + out[3] -= tmp1; + out[4] -= tmp1; + out[5] -= tmp2; + out[6] += tmp2; + out[7] += tmp1; + + tmp0 = in[7 * stride]; + tmp1 = kIDCTMatrix[7] * tmp0; + tmp2 = kIDCTMatrix[15] * tmp0; + tmp3 = kIDCTMatrix[23] * tmp0; + tmp4 = kIDCTMatrix[31] * tmp0; + out[0] += tmp1; + out[1] += tmp2; + out[2] += tmp3; + out[3] += tmp4; + out[4] -= tmp4; + out[5] -= tmp3; + out[6] -= tmp2; + out[7] -= tmp1; +} + +__device__ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8]) +{ + coeff_t colidcts[kDCTBlockSize]; + const int kColScale = 11; + const int kColRound = 1 << (kColScale - 1); + for (int x = 0; x < 8; ++x) + { + int colbuf[8] = { 0 }; + Compute1dIDCT(&block[x], 8, colbuf); + for (int y = 0; y < 8; ++y) + { + colidcts[8 * y + x] = (colbuf[y] + kColRound) >> kColScale; + } + } + const int kRowScale = 18; + const int kRowRound = 257 << (kRowScale - 1); // includes offset by 128 + for (int y = 0; y < 8; ++y) + { + const int rowidx = 8 * y; + int rowbuf[8] = { 0 }; + Compute1dIDCT(&colidcts[rowidx], 1, rowbuf); + for (int x = 0; x < 8; ++x) { + out[rowidx + x] = max(0, min(255, (rowbuf[x] + kRowRound) >> kRowScale)); + } + } +} 
+ +__device__ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8]) +{ + const int block_x = 0; + const int block_y = 0; + const int width_ = 8; + const int height_ = 8; + + for (int iy = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix) { + int x = 8 * block_x + ix; + int y = 8 * block_y + iy; + if (x >= width_ || y >= height_) continue; + int p = y * width_ + x; + pixels_[p] = idct[8 * iy + ix] << 4; + } + } +} + +__device__ void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + // Fill in the 10x10 pixel area in the subsampled image that will be the + // basis of the upsampling. This area is enough to hold the 3x3 kernel of + // the fancy upsampler around each pixel. +#define kSubsampledEdgeSize 10 + ushort subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize]; + for (int j = 0; j < kSubsampledEdgeSize; ++j) { + // The order we fill in the rows is: + // 8 rows intersecting the block, row below, row above + const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2); + for (int i = 0; i < kSubsampledEdgeSize; ++i) { + // The order we fill in each row is: + // 8 pixels within the block, left edge, right edge + const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) + + (i < 9 ? i + 1 : 0)); + const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2); + if (x0 < 0) { + subsampled[ix] = subsampled[ix + 1]; + } + else if (y0 < 0) { + subsampled[ix] = subsampled[ix + kSubsampledEdgeSize]; + } + else if (x0 >= width_) { + subsampled[ix] = subsampled[ix - 1]; + } + else if (y0 >= height_) { + subsampled[ix] = subsampled[ix - kSubsampledEdgeSize]; + } + else if (i < 8 && j < 8) { + subsampled[ix] = idct[j * 8 + i] << 4; + } + else { + // Reconstruct the subsampled pixels around the edge of the current + // block by computing the inverse of the fancy upsampler. 
+ const int y1 = max(y0 - 1, 0); + const int x1 = max(x0 - 1, 0); + subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 + + pixel_orig[y1 * width_ + x1] + + pixel_orig[y0 * width_ + x1] * -3 + + pixel_orig[y1 * width_ + x0] * -3) >> 2; + } + } + } + // Determine area to update. + int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0); + int xmax = min(block_x * 16 + 15, width_ - 1); + int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0); + int ymax = min(block_y * 16 + 15, height_ - 1); + + // Apply the fancy upsampler on the subsampled block. + for (int y = ymin; y <= ymax; ++y) { + const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize; + const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize; + for (int x = xmin; x <= xmax; ++x) { + const int x0 = (x & ~1) / 2 - block_x * 8 + 1; + const int dx = (x & 1) * 2 - 1; + const int ix = x0 + y0; + + int out_x = x - xmin; + int out_y = y - ymin; + + pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 + + subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4; + } + } +} + +// out = [YUVYUV....YUVYUV] +__device__ void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/) +{ + const int stride = 3; + + for (int y = 0; y < xsize; ++y) { + for (int x = 0; x < ysize; ++x) { + int px = y * xsize + x; + *out = (uchar)((pixels_[px] + 8 - (x & 1)) >> 4); + out += stride; + } + } +} + +__constant static int kCrToRedTable[256] = { + -179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164, + -163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147, + -146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130, + -129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114, + -112, -111, -109, -108, -107, -105, -104, -102, -101, -100, -98, -97, + -95, -94, -93, -91, -90, -88, -87, -86, -84, -83, -81, -80, + -79, -77, -76, -74, -73, -72, -70, -69, -67, -66, -64, -63, + -62, -60, -59, -57, -56, -55, 
-53, -52, -50, -49, -48, -46, + -45, -43, -42, -41, -39, -38, -36, -35, -34, -32, -31, -29, + -28, -27, -25, -24, -22, -21, -20, -18, -17, -15, -14, -13, + -11, -10, -8, -7, -6, -4, -3, -1, 0, 1, 3, 4, + 6, 7, 8, 10, 11, 13, 14, 15, 17, 18, 20, 21, + 22, 24, 25, 27, 28, 29, 31, 32, 34, 35, 36, 38, + 39, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 55, + 56, 57, 59, 60, 62, 63, 64, 66, 67, 69, 70, 72, + 73, 74, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, + 90, 91, 93, 94, 95, 97, 98, 100, 101, 102, 104, 105, + 107, 108, 109, 111, 112, 114, 115, 116, 118, 119, 121, 122, + 123, 125, 126, 128, 129, 130, 132, 133, 135, 136, 137, 139, + 140, 142, 143, 144, 146, 147, 149, 150, 151, 153, 154, 156, + 157, 158, 160, 161, 163, 164, 165, 167, 168, 170, 171, 172, + 174, 175, 177, 178 +}; + +__constant static int kCbToBlueTable[256] = { + -227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207, + -206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186, + -184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165, + -163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144, + -142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122, + -120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101, + -99, -97, -96, -94, -92, -90, -89, -87, -85, -83, -82, -80, + -78, -76, -74, -73, -71, -69, -67, -66, -64, -62, -60, -58, + -57, -55, -53, -51, -50, -48, -46, -44, -43, -41, -39, -37, + -35, -34, -32, -30, -28, -27, -25, -23, -21, -19, -18, -16, + -14, -12, -11, -9, -7, -5, -4, -2, 0, 2, 4, 5, + 7, 9, 11, 12, 14, 16, 18, 19, 21, 23, 25, 27, + 28, 30, 32, 34, 35, 37, 39, 41, 43, 44, 46, 48, + 50, 51, 53, 55, 57, 58, 60, 62, 64, 66, 67, 69, + 71, 73, 74, 76, 78, 80, 82, 83, 85, 87, 89, 90, + 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, + 113, 115, 117, 119, 120, 122, 124, 126, 128, 129, 131, 133, + 135, 136, 138, 140, 142, 144, 145, 147, 149, 151, 152, 154, + 156, 158, 159, 161, 163, 165, 167, 168, 170, 172, 
174, 175, + 177, 179, 181, 183, 184, 186, 188, 190, 191, 193, 195, 197, + 198, 200, 202, 204, 206, 207, 209, 211, 213, 214, 216, 218, + 220, 222, 223, 225, +}; + +__constant static int kCrToGreenTable[256] = { + 5990656, 5943854, 5897052, 5850250, 5803448, 5756646, 5709844, 5663042, + 5616240, 5569438, 5522636, 5475834, 5429032, 5382230, 5335428, 5288626, + 5241824, 5195022, 5148220, 5101418, 5054616, 5007814, 4961012, 4914210, + 4867408, 4820606, 4773804, 4727002, 4680200, 4633398, 4586596, 4539794, + 4492992, 4446190, 4399388, 4352586, 4305784, 4258982, 4212180, 4165378, + 4118576, 4071774, 4024972, 3978170, 3931368, 3884566, 3837764, 3790962, + 3744160, 3697358, 3650556, 3603754, 3556952, 3510150, 3463348, 3416546, + 3369744, 3322942, 3276140, 3229338, 3182536, 3135734, 3088932, 3042130, + 2995328, 2948526, 2901724, 2854922, 2808120, 2761318, 2714516, 2667714, + 2620912, 2574110, 2527308, 2480506, 2433704, 2386902, 2340100, 2293298, + 2246496, 2199694, 2152892, 2106090, 2059288, 2012486, 1965684, 1918882, + 1872080, 1825278, 1778476, 1731674, 1684872, 1638070, 1591268, 1544466, + 1497664, 1450862, 1404060, 1357258, 1310456, 1263654, 1216852, 1170050, + 1123248, 1076446, 1029644, 982842, 936040, 889238, 842436, 795634, + 748832, 702030, 655228, 608426, 561624, 514822, 468020, 421218, + 374416, 327614, 280812, 234010, 187208, 140406, 93604, 46802, + 0, -46802, -93604, -140406, -187208, -234010, -280812, -327614, + -374416, -421218, -468020, -514822, -561624, -608426, -655228, -702030, + -748832, -795634, -842436, -889238, -936040, -982842, -1029644, -1076446, + -1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862, + -1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278, + -1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694, + -2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110, + -2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526, + 
-2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942, + -3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358, + -3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774, + -4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190, + -4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606, + -4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022, + -5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438, + -5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854, +}; + +__constant static int kCbToGreenTable[256] = { + 2919680, 2897126, 2874572, 2852018, 2829464, 2806910, 2784356, 2761802, + 2739248, 2716694, 2694140, 2671586, 2649032, 2626478, 2603924, 2581370, + 2558816, 2536262, 2513708, 2491154, 2468600, 2446046, 2423492, 2400938, + 2378384, 2355830, 2333276, 2310722, 2288168, 2265614, 2243060, 2220506, + 2197952, 2175398, 2152844, 2130290, 2107736, 2085182, 2062628, 2040074, + 2017520, 1994966, 1972412, 1949858, 1927304, 1904750, 1882196, 1859642, + 1837088, 1814534, 1791980, 1769426, 1746872, 1724318, 1701764, 1679210, + 1656656, 1634102, 1611548, 1588994, 1566440, 1543886, 1521332, 1498778, + 1476224, 1453670, 1431116, 1408562, 1386008, 1363454, 1340900, 1318346, + 1295792, 1273238, 1250684, 1228130, 1205576, 1183022, 1160468, 1137914, + 1115360, 1092806, 1070252, 1047698, 1025144, 1002590, 980036, 957482, + 934928, 912374, 889820, 867266, 844712, 822158, 799604, 777050, + 754496, 731942, 709388, 686834, 664280, 641726, 619172, 596618, + 574064, 551510, 528956, 506402, 483848, 461294, 438740, 416186, + 393632, 371078, 348524, 325970, 303416, 280862, 258308, 235754, + 213200, 190646, 168092, 145538, 122984, 100430, 77876, 55322, + 32768, 10214, -12340, -34894, -57448, -80002, -102556, -125110, + -147664, -170218, -192772, -215326, -237880, -260434, -282988, -305542, + -328096, 
-350650, -373204, -395758, -418312, -440866, -463420, -485974, + -508528, -531082, -553636, -576190, -598744, -621298, -643852, -666406, + -688960, -711514, -734068, -756622, -779176, -801730, -824284, -846838, + -869392, -891946, -914500, -937054, -959608, -982162, -1004716, -1027270, + -1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702, + -1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134, + -1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566, + -1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998, + -1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430, + -1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862, + -2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294, + -2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726, + -2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158, + -2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590, +}; + +__constant static uchar kRangeLimitLut[4 * 256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +__device__ void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/) +{ + __constant_ex uchar* kRangeLimit = kRangeLimitLut + 384; + + for (int i = 0; i < size; i++) + { + uchar *pixel = &pixelBlock[i * 3]; + + int y = pixel[0]; + int cb = pixel[1]; + int cr = pixel[2]; + pixel[0] = kRangeLimit[y + kCrToRedTable[cr]]; + pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)]; + pixel[2] = kRangeLimit[y + 
kCbToBlueTable[cb]]; + } +} + +__constant static double kSrgb8ToLinearTable[256] = { + 0.000000, + 0.077399, + 0.154799, + 0.232198, + 0.309598, + 0.386997, + 0.464396, + 0.541796, + 0.619195, + 0.696594, + 0.773994, + 0.853367, + 0.937509, + 1.026303, + 1.119818, + 1.218123, + 1.321287, + 1.429375, + 1.542452, + 1.660583, + 1.783830, + 1.912253, + 2.045914, + 2.184872, + 2.329185, + 2.478910, + 2.634105, + 2.794824, + 2.961123, + 3.133055, + 3.310673, + 3.494031, + 3.683180, + 3.878171, + 4.079055, + 4.285881, + 4.498698, + 4.717556, + 4.942502, + 5.173584, + 5.410848, + 5.654341, + 5.904108, + 6.160196, + 6.422649, + 6.691512, + 6.966827, + 7.248640, + 7.536993, + 7.831928, + 8.133488, + 8.441715, + 8.756651, + 9.078335, + 9.406810, + 9.742115, + 10.084290, + 10.433375, + 10.789410, + 11.152432, + 11.522482, + 11.899597, + 12.283815, + 12.675174, + 13.073712, + 13.479465, + 13.892470, + 14.312765, + 14.740385, + 15.175366, + 15.617744, + 16.067555, + 16.524833, + 16.989614, + 17.461933, + 17.941824, + 18.429322, + 18.924460, + 19.427272, + 19.937793, + 20.456054, + 20.982090, + 21.515934, + 22.057618, + 22.607175, + 23.164636, + 23.730036, + 24.303404, + 24.884774, + 25.474176, + 26.071642, + 26.677203, + 27.290891, + 27.912736, + 28.542769, + 29.181020, + 29.827520, + 30.482299, + 31.145387, + 31.816813, + 32.496609, + 33.184802, + 33.881422, + 34.586499, + 35.300062, + 36.022139, + 36.752760, + 37.491953, + 38.239746, + 38.996169, + 39.761248, + 40.535013, + 41.317491, + 42.108710, + 42.908697, + 43.717481, + 44.535088, + 45.361546, + 46.196882, + 47.041124, + 47.894297, + 48.756429, + 49.627547, + 50.507676, + 51.396845, + 52.295078, + 53.202402, + 54.118843, + 55.044428, + 55.979181, + 56.923129, + 57.876298, + 58.838712, + 59.810398, + 60.791381, + 61.781686, + 62.781338, + 63.790363, + 64.808784, + 65.836627, + 66.873918, + 67.920679, + 68.976937, + 70.042715, + 71.118037, + 72.202929, + 73.297414, + 74.401516, + 75.515259, + 76.638668, + 77.771765, + 
78.914575, + 80.067122, + 81.229428, + 82.401518, + 83.583415, + 84.775142, + 85.976722, + 87.188178, + 88.409534, + 89.640813, + 90.882037, + 92.133229, + 93.394412, + 94.665609, + 95.946841, + 97.238133, + 98.539506, + 99.850982, + 101.172584, + 102.504334, + 103.846254, + 105.198366, + 106.560693, + 107.933256, + 109.316077, + 110.709177, + 112.112579, + 113.526305, + 114.950375, + 116.384811, + 117.829635, + 119.284868, + 120.750532, + 122.226647, + 123.713235, + 125.210317, + 126.717914, + 128.236047, + 129.764737, + 131.304005, + 132.853871, + 134.414357, + 135.985483, + 137.567270, + 139.159738, + 140.762907, + 142.376799, + 144.001434, + 145.636832, + 147.283012, + 148.939997, + 150.607804, + 152.286456, + 153.975971, + 155.676371, + 157.387673, + 159.109900, + 160.843070, + 162.587203, + 164.342319, + 166.108438, + 167.885578, + 169.673761, + 171.473005, + 173.283330, + 175.104755, + 176.937299, + 178.780982, + 180.635824, + 182.501843, + 184.379058, + 186.267489, + 188.167154, + 190.078073, + 192.000265, + 193.933749, + 195.878543, + 197.834666, + 199.802137, + 201.780975, + 203.771198, + 205.772826, + 207.785876, + 209.810367, + 211.846319, + 213.893748, + 215.952674, + 218.023115, + 220.105089, + 222.198615, + 224.303711, + 226.420395, + 228.548685, + 230.688599, + 232.840156, + 235.003373, + 237.178269, + 239.364861, + 241.563167, + 243.773205, + 245.994993, + 248.228549, + 250.473890, + 252.731035, + 255.000000, +}; + +__device__ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/) +{ + YUVToRGB(yuv, xsize * ysize); + +#define lut kSrgb8ToLinearTable +// const __constant double* lut = kSrgb8ToLinearTable; + + for (int i = 0; i < xsize * ysize; i++) + { + r[i] = lut[yuv[3 * i]]; + g[i] = lut[yuv[3 * i + 1]]; + b[i] = lut[yuv[3 * i + 2]]; + } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < xsize; x++) + { + int idx = y * xsize 
+ (inside_x - 1); + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; + } + } + for (int y = inside_y; y < ysize; y++) + { + for (int x = 0; x < xsize; x++) + { + int idx = (inside_y - 1) * xsize + x; + r[y * xsize + x] = r[idx]; + g[y * xsize + x] = g[idx]; + b[y * xsize + x] = b[idx]; + } + } +#undef lut +} + +__device__ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y) +{ + uchar idct[3][8 * 8]; + CoeffToIDCT(&block[0], idct[0]); + CoeffToIDCT(&block[8 * 8], idct[1]); + CoeffToIDCT(&block[8 * 8 * 2], idct[2]); + + ushort pixels[3][8 * 8]; + IDCTToPixel8x8(idct[0], pixels[0]); + IDCTToPixel8x8(idct[1], pixels[1]); + IDCTToPixel8x8(idct[2], pixels[2]); + + uchar yuv[8 * 8 * 3]; + PixelToYUV(pixels[0], &yuv[0], 8, 8); + PixelToYUV(pixels[1], &yuv[1], 8, 8); + PixelToYUV(pixels[2], &yuv[2], 8, 8); + + YUVToRGB(yuv, 8 * 8); + + for (int i = 0; i < 8 * 8; i++) + { + r[i] = kSrgb8ToLinearTable[yuv[3 * i]]; + g[i] = kSrgb8ToLinearTable[yuv[3 * i + 1]]; + b[i] = kSrgb8ToLinearTable[yuv[3 * i + 2]]; + } + for (int y = 0; y < inside_y; y++) + { + for (int x = inside_x; x < 8; x++) + { + int idx = y * 8 + (inside_x - 1); + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } + for (int y = inside_y; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = (inside_y - 1) * 8 + x; + r[y * 8 + x] = r[idx]; + g[y * 8 + x] = g[idx]; + b[y * 8 + x] = b[idx]; + } + } +} + +__device__ void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + uchar idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + ushort pixels[16 * 16]; + IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_); + + PixelToYUV(pixels, yuv, 16, 16); +} + +__device__ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global 
const ushort *pixel_orig, int block_x, int block_y, int width_, int height_) +{ + coeff_t b[8 * 8]; + for (int i = 0; i < 8 * 8; i++) + { + b[i] = block[i]; + } + CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_); +} + +__device__ void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv) +{ + uchar idct[8 * 8]; + CoeffToIDCT(&block[0], &idct[0]); + + ushort pixels[8 * 8]; + IDCTToPixel8x8(idct, pixels); + + PixelToYUV(pixels, yuv, 8, 8); +} + +__device__ void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv) +{ + coeff_t b[8 * 8]; + for (int i = 0; i < 8 * 8; i++) + { + b[i] = block[i]; + } + + CoeffToYUV8x8(b, yuv); +} + +__device__ void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv16x16[idx16 * 3] = yuv8x8[idx * 3]; + } + } +} + +__device__ void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + yuv8x8[idx * 3] = yuv16x16[idx16 * 3]; + } + } +} + +__device__ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y) +{ + for (int y = 0; y < 8; y++) + { + for (int x = 0; x < 8; x++) + { + int idx = y * 8 + x; + int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8); + r[idx] = rgb16x16[0][idx16]; + g[idx] = rgb16x16[1][idx16]; + b[idx] = rgb16x16[2][idx16]; + } + } +} + +__device__ void Convolution(size_t xsize, size_t ysize, + int xstep, int len, int offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result) +{ + float weight_no_border = 0; + + for (size_t j = 0; j <= 2 * offset; ++j) { + weight_no_border += multipliers[j]; 
+ } + for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) { + int minx = x < offset ? 0 : x - offset; + int maxx = min(xsize, x + len - offset) - 1; + float weight = 0.0; + for (int j = minx; j <= maxx; ++j) { + weight += multipliers[j - x + offset]; + } + // Interpolate linearly between the no-border scaling and border scaling. + weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border; + float scale = 1.0 / weight; + for (size_t y = 0; y < ysize; ++y) { + float sum = 0.0; + for (int j = minx; j <= maxx; ++j) { + sum += inp[y * xsize + j] * multipliers[j - x + offset]; + } + result[ox * ysize + y] = (float)(sum * scale); + } + } +} + +__device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output) +{ + const double sigma = 1.1; + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772 + const int diff = 2; // when sigma=1.1, diff's value is 2. + const int expn_size = 5; // when sigma=1.1, scaler is 5 + float expn[5] = { exp(scaler * (-diff) * (-diff)), + exp(scaler * (-diff + 1) * (-diff + 1)), + exp(scaler * (-diff + 2) * (-diff + 2)), + exp(scaler * (-diff + 3) * (-diff + 3)), + exp(scaler * (-diff + 4) * (-diff + 4))}; + const int xstep = 1; // when sigma=1.1, xstep is 1. 
+ const int ystep = xstep; + + int dxsize = (xsize + xstep - 1) / xstep; + + float tmp[8*8] = { 0 }; + Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp); + Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp, + border_ratio, output); +} + +__device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b, + __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred, + int size) +{ + for (size_t i = 0; i < size; ++i) { + double sensitivity[3]; + { + // Calculate sensitivity[3] based on the smoothed image gamma derivative. + double pre_rgb[3] = { r_blurred[i], g_blurred[i], b_blurred[i] }; + double pre_mixed[3]; + OpsinAbsorbance(pre_rgb, pre_mixed); + sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; + sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1]; + sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2]; + } + double cur_rgb[3] = { r[i], g[i], b[i] }; + double cur_mixed[3]; + OpsinAbsorbance(cur_rgb, cur_mixed); + cur_mixed[0] *= sensitivity[0]; + cur_mixed[1] *= sensitivity[1]; + cur_mixed[2] *= sensitivity[2]; + double x, y, z; + RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z); + r[i] = (float)(x); + g[i] = (float)(y); + b[i] = (float)(z); + } +} + +__device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b, + float *xyb1_x, float *xyb1_y, float *xyb1_b, + const float *c0_x, const float *c0_y, const float *c0_b, + const float *c1_x, const float *c1_y, const float *c1_b, + int xsize, int ysize) +{ + for (int x = 0; x < xsize; ++x) + { + for (int y = 0; y < ysize; ++y) + { + size_t ix = y * xsize + x; + const double ave[3] = { + (c0_x[ix] + c1_x[ix]) * 0.5f, + (c0_y[ix] + c1_y[ix]) * 0.5f, + (c0_b[ix] + c1_b[ix]) * 0.5f, + }; + double sqr_max_diff = -1; + { + int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; + int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; + for (int dir = 0; 
dir < 4; ++dir) { + if (border[dir]) + { + continue; + } + const int ix2 = ix + offset[dir]; + double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; + diff *= diff; + if (sqr_max_diff < diff) + { + sqr_max_diff = diff; + } + } + } + const double kReductionX = 275.19165240059317; + const double kReductionY = 18599.41286306991; + const double kReductionZ = 410.8995306951065; + const double kChromaBalance = 106.95800948271017; + double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); + + const double mix[3] = { + chroma_scale * kReductionX / (sqr_max_diff + kReductionX), + kReductionY / (sqr_max_diff + kReductionY), + chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ), + }; + // Interpolate lineraly between the average color and the actual + // color -- to reduce the importance of this pixel. + xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]); + xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]); + + xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]); + xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]); + + xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]); + xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]); + } + } +} + +__device__ void floatcopy(float *dst, const float *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size) +{ + for (int i = 0; i < size; i++) + { + dst[i] = src[i]; + } +} + +__device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize]) +{ + float rgb_blurred[3][kDCTBlockSize]; + for (int i = 0; i < 3; i++) + { + BlurEx(rgb[i], 8, 8, 1.1, 0, rgb_blurred[i]); + } + OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize); +} + +__device__ 
double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block) +{ + CalcOpsinDynamicsImage(rgb1_c); + + float rgb0[3][kDCTBlockSize]; + float rgb1[3][kDCTBlockSize]; + + floatcopy(&rgb0[0][0], &rgb0_c[0][0], 3 * kDCTBlockSize); + floatcopy(&rgb1[0][0], &rgb1_c[0][0], 3 * kDCTBlockSize); + + MaskHighIntensityChangeBlock(rgb0[0], rgb0[1], rgb0[2], + rgb1[0], rgb1[1], rgb1[2], + rgb0_c[0], rgb0_c[1], rgb0_c[2], + rgb1_c[0], rgb1_c[1], rgb1_c[2], + 8, 8); + + double b0[3 * kDCTBlockSize]; + double b1[3 * kDCTBlockSize]; + for (int c = 0; c < 3; ++c) { + for (int ix = 0; ix < kDCTBlockSize; ++ix) { + b0[c * kDCTBlockSize + ix] = rgb0[c][ix]; + b1[c * kDCTBlockSize + ix] = rgb1[c][ix]; + } + } + + double diff_xyz_dc[3] = { 0.0 }; + double diff_xyz_ac[3] = { 0.0 }; + double diff_xyz_edge_dc[3] = { 0.0 }; + ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc); + + double diff = 0.0; + double diff_edge = 0.0; + + for (int c = 0; c < 3; ++c) { + diff += diff_xyz_dc[c] * mask_scale_block[c]; + diff += diff_xyz_ac[c] * mask_scale_block[c]; + diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c]; + } + const double kEdgeWeight = 0.05; + return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge); + +} + +// return the count of Non-zero item +__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order) +{ + const int block_size = 64; + int size = 0; + for (int c = 0; c < 3; ++c) { + for (int k = 1; k < block_size; ++k) { + int idx = c * block_size + k; + if (block[idx] != 0) { + float score = _abs(orig_block[idx]) * csf[idx] + bias[idx]; + size = list_push_back(input_order, idx, score); + } + } + } + + return SortInputOrder(input_order->pData, size); +} + +__device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize], + const __global float *orig_image_batch, + int width_, int height_, + int block_x, int 
block_y, + int factor, + int off_x, int off_y) +{ + int block_xx = block_x * factor + off_x; + int block_yy = block_y * factor + off_y; + if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1; + + const int block8_width = (width_ + 8 - 1) / 8; + + int block_ix = block_yy * block8_width + block_xx; + + __global const float* block_opsin = &orig_image_batch[block_ix * 3 * kDCTBlockSize]; + for (int i = 0; i < 3; i++) { + for (int k = 0; k < kDCTBlockSize; k++) { + rgb0_c[i][k] = block_opsin[i * kDCTBlockSize + k]; + } + } + + return block_ix; +} + +__device__ double CompareBlockFactor1(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height) +{ + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + const coeff_t *coeff_block = candidate_channel[c]; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, + image_width, + image_height); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + } + + { + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, 
image_width, image_height, block_x, block_y, 1, 0, 0); + + int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8; + int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8; + float rgb1_c[3][kDCTBlockSize]; + + YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + } +} + +__device__ double Factor2(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height) +{ + const int factor = 2; + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + ///if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= mayout_channel[c].block_height) + { + continue; + } + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; + CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + else { + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, + image_height); + } + } + + int inside_x = block_x * 16 + 16 > image_width ? 
image_width - block_x * 16 : 16; + int inside_y = block_y * 16 + 16 > image_height ? image_height - block_y * 16 : 16; + + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); + + double max_err = 0; + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + if (block_xx * 8 >= image_width || + block_yy * 8 >= image_height) + { + continue; + } + + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); + + float rgb1_c[3][kDCTBlockSize]; + Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + max_err = max(max_err, err); + } + } + return max_err; +} + +__device__ double CompareBlockFactor(const channel_info mayout_channel[3], + const coeff_t* candidate_block, + const int block_x, + const int block_y, + __global const float *orig_image_batch, + __global const float *mask_scale, + const int image_width, + const int image_height, + const int factor) +{ + const coeff_t *candidate_channel[3]; + for (int c = 0; c < 3; c++) { + candidate_channel[c] = &candidate_block[c * 8 * 8]; + } + + uchar yuv16x16[3 * 16 * 16] = { 0 }; // factor 2 mode output image + uchar yuv8x8[3 * 8 * 8] = { 0 }; // factor 1 mode output image + + for (int c = 0; c < 3; c++) + { + if (mayout_channel[c].factor == 1) { + if (factor == 1) { + const coeff_t *coeff_block = candidate_channel[c]; + CoeffToYUV8x8(coeff_block, &yuv8x8[c]); + } + else { + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + ///if (ix != off_x || iy != off_y) continue; + if (block_xx >= mayout_channel[c].block_width || + block_yy >= 
mayout_channel[c].block_height) + { + continue; + } + int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8; + CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]); + + // copy YUV8x8 to YUV1616 corner + Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy); + } + } + } + } + else { + if (factor == 1) { + int block_xx = block_x / mayout_channel[c].factor; + int block_yy = block_y / mayout_channel[c].factor; + int ix = block_x % mayout_channel[c].factor;; + int iy = block_y % mayout_channel[c].factor; + + int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx; + __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8; + + CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_xx, block_yy, + image_width, + image_height); + + // copy YUV16x16 corner to YUV8x8 + Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy); + } + else { + const coeff_t * coeff_block = candidate_channel[c]; + CoeffToYUV16x16(coeff_block, &yuv16x16[c], + mayout_channel[c].pixel, block_x, block_y, + image_width, + image_height); + } + } + } + + if (factor == 1) + { + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0); + + int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8; + int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8; + float rgb1_c[3][kDCTBlockSize]; + + YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y); + + return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + } + else + { + int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16; + int inside_y = block_y * 16 + 16 > image_height ? 
image_height - block_y * 16 : 16; + + float rgb16x16[3][16 * 16]; + YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y); + + double max_err = 0; + for (int iy = 0; iy < factor; ++iy) { + for (int ix = 0; ix < factor; ++ix) { + int block_xx = block_x * factor + ix; + int block_yy = block_y * factor + iy; + + if (block_xx * 8 >= image_width || + block_yy * 8 >= image_height) + { + continue; + } + + float rgb0_c[3][kDCTBlockSize]; + int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy); + + float rgb1_c[3][kDCTBlockSize]; + Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy); + double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3); + max_err = max(max_err, err); + } + } + return max_err; + } +} + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif //__USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp new file mode 100644 index 00000000..619c0cfd --- /dev/null +++ b/clguetzli/clguetzli.cl.cpp @@ -0,0 +1,226 @@ +/* +* OpenCL/CUDA edition implementation of ButteraugliComparator. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include +#include +#include +#include "utils.h" + +#ifdef __USE_OPENCL__ + +using namespace std; + +int g_idvec[10] = { 0 }; +int g_sizevec[10] = { 0 }; + +int get_global_id(int dim) { + return g_idvec[dim]; +} +int get_global_size(int dim) { + return g_sizevec[dim]; +} + +void set_global_id(int dim, int id){ + g_idvec[dim] = id; +} +void set_global_size(int dim, int size){ + g_sizevec[dim] = size; +} + +#define __checkcl +#define abs(exper) fabs((exper)) +#include "clguetzli.h" +#include "clguetzli.cl" +#include "cuguetzli.h" +#include "ocu.h" + +namespace guetzli +{ + ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats) + : ButteraugliComparator(width, height, rgb, target_distance, stats) + { + if (MODE_CPU != g_mathMode) + { + rgb_orig_opsin.resize(3); + rgb_orig_opsin[0].resize(width * height); + rgb_orig_opsin[1].resize(width * height); + rgb_orig_opsin[2].resize(width * height); + +#ifdef __USE_DOUBLE_AS_FLOAT__ + const float* lut = kSrgb8ToLinearTable; +#else + const double* lut = kSrgb8ToLinearTable; +#endif + for (int c = 0; c < 3; ++c) { + for (int y = 0, ix = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x, ++ix) { + rgb_orig_opsin[c][ix] = lut[rgb_orig_[3 * ix + c]]; + } + } + } + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb_orig_opsin); + } + } + + void ButteraugliComparatorEx::Compare(const OutputImage& img) + { + if (MODE_CPU_OPT == g_mathMode) + { + std::vector > rgb0 = rgb_orig_opsin; + + std::vector > rgb(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb); + ::butteraugli::OpsinDynamicsImage(width_, height_, rgb); + std::vector().swap(distmap_); + comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_); + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#ifdef __USE_OPENCL__ + else if (MODE_OPENCL == 
g_mathMode) + { + std::vector > rgb1(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb1); + + const int xsize = width_; + const int ysize = height_; + std::vector().swap(distmap_); + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + + cl_mem mem_result = ocl.allocMem(channel_size); + + clOpsinDynamicsImageEx(xyb1, xsize, ysize); + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step()); + + cl_int err = clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(mem_result); + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#endif +#ifdef __USE_CUDA__ + else if (MODE_CUDA == g_mathMode) + { + std::vector > rgb1(3, std::vector(width_ * height_)); + img.ToLinearRGB(&rgb1); + + const int xsize = width_; + const int ysize = height_; + std::vector().swap(distmap_); + distmap_.resize(xsize * ysize); + + size_t channel_size = xsize * ysize * sizeof(float); + ocu_args_d_t &ocu = getOcu(); + ocu_channels xyb0 = ocu.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data()); + ocu_channels xyb1 = ocu.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data()); + + cu_mem mem_result = ocu.allocMem(channel_size); + + cuOpsinDynamicsImageEx(xyb1, xsize, ysize); + + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step()); + + cuMemcpyDtoH(distmap_.data(), mem_result, channel_size); + + 
ocu.releaseMem(mem_result); + ocu.releaseMemChannels(xyb0); + ocu.releaseMemChannels(xyb1); + + distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_); + } +#endif + else + { + ButteraugliComparator::Compare(img); + } + } + + void ButteraugliComparatorEx::StartBlockComparisons() + { + if (MODE_CPU == g_mathMode) + { + ButteraugliComparator::StartBlockComparisons(); + return; + } + + std::vector > dummy(3); + ::butteraugli::Mask(rgb_orig_opsin, rgb_orig_opsin, width_, height_, &mask_xyz_, &dummy); + + const int width = width_; + const int height = height_; + const int factor_x = 1; + const int factor_y = 1; + + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; +#ifdef __USE_DOUBLE_AS_FLOAT__ + const float* lut = kSrgb8ToLinearTable; +#else + const double* lut = kSrgb8ToLinearTable; +#endif + imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize); + imgMaskXyzScaleBlockList.resize(num_blocks * 3); + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) + { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) + { + float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize]; + float* curG = curR + kDCTBlockSize; + float* curB = curG + kDCTBlockSize; + + for (int iy = 0, i = 0; iy < 8; ++iy) { + for (int ix = 0; ix < 8; ++ix, ++i) { + int x = std::min(8 * block_x + ix, width - 1); + int y = std::min(8 * block_y + iy, height - 1); + int px = y * width + x; + + curR[i] = lut[rgb_orig_[3 * px]]; + curG[i] = lut[rgb_orig_[3 * px + 1]]; + curB[i] = lut[rgb_orig_[3 * px + 2]]; + } + } + + CalcOpsinDynamicsImage((float(*)[64])curR); + + int xmin = block_x * 8; + int ymin = block_y * 8; + + imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin]; + imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin]; + 
imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin]; + } + } + } + + void ButteraugliComparatorEx::FinishBlockComparisons() { + ButteraugliComparator::FinishBlockComparisons(); + + imgOpsinDynamicsBlockList.clear(); + imgMaskXyzScaleBlockList.clear(); + } + + double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const + { + double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask); + return err; + } +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h new file mode 100644 index 00000000..12543e42 --- /dev/null +++ b/clguetzli/clguetzli.cl.h @@ -0,0 +1,162 @@ +/* +* OpenCL/CUDA edition implementation of ButteraugliComparator. +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#ifndef __CLGUETZLI_CL_H__ +#define __CLGUETZLI_CL_H__ + +#ifdef __USE_OPENCL__ + +#ifdef __cplusplus +#ifndef __CUDACC__ +#include "CL/cl.h" +#include "cuda.h" +#endif +#endif + +#define __USE_DOUBLE_AS_FLOAT__ + +#ifdef __cplusplus +#ifndef __CUDACC__ + #define __kernel + #define __private + #define __global + #define __constant + #define __constant_ex + #define __device__ + + typedef unsigned char uchar; + typedef unsigned short ushort; + typedef CUdeviceptr cu_mem; + + int get_global_id(int dim); + int get_global_size(int dim); + void set_global_id(int dim, int id); + void set_global_size(int dim, int size); + + #ifdef __checkcl + typedef union ocl_channels_t + { + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; + }ocl_channels; + + typedef union ocu_channels_t + { + struct + { + float * r; + float * g; + float * b; + }; + union + { + float *ch[3]; + }; + }ocu_channels; + #else + typedef union ocl_channels_t + { + struct + { + cl_mem r; + cl_mem g; + cl_mem b; + }; + struct + { + cl_mem x; + cl_mem y; + 
cl_mem b_; + }; + union + { + cl_mem ch[3]; + }; + }ocl_channels; + + typedef union ocu_channels_t + { + struct + { + cu_mem r; + cu_mem g; + cu_mem b; + }; + struct + { + cu_mem x; + cu_mem y; + cu_mem b_; + }; + union + { + cu_mem ch[3]; + }; + }ocu_channels; + #endif +#endif /*__CUDACC__*/ +#endif /*__cplusplus*/ + +#ifdef __OPENCL_VERSION__ + #define __constant_ex __constant + #define __device__ + +#endif /*__OPENCL_VERSION__*/ + +#ifdef __CUDACC__ + #define __kernel extern "C" __global__ + #define __private + #define __global + #define __constant __constant__ + #define __constant_ex + typedef unsigned char uchar; + typedef unsigned short ushort; + + __device__ int get_global_id(int dim) + { + switch (dim) + { + case 0: return blockIdx.x * blockDim.x + threadIdx.x; + case 1: return blockIdx.y * blockDim.y + threadIdx.y; + default: return blockIdx.z * blockDim.z + threadIdx.z; + } + } + + __device__ int get_global_size(int dim) + { + switch(dim) + { + case 0: return gridDim.x * blockDim.x; + case 1: return gridDim.y * blockDim.y; + default: return gridDim.z * blockDim.z; + } + } + +#endif /*__CUDACC__*/ + + typedef short coeff_t; + + typedef struct __channel_info_t + { + int factor; + int block_width; + int block_height; + __global const coeff_t *coeff; + __global const ushort *pixel; + }channel_info; + +#endif /*__CLGUETZLI_CL_H__*/ + +#endif // __USE_OPENCL__ \ No newline at end of file diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp new file mode 100644 index 00000000..52129927 --- /dev/null +++ b/clguetzli/clguetzli.cpp @@ -0,0 +1,841 @@ +/* +* OpenCL edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "clguetzli.h" +#include +#include +#include +#include "cl.hpp" + +extern MATH_MODE g_mathMode = MODE_CPU; + +#ifdef __USE_OPENCL__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + + clOpsinDynamicsImageEx(rgb, xsize, ysize); + + clEnqueueReadBuffer(ocl.commandQueue, rgb.r, false, 0, channel_size, r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, rgb.g, false, 0, channel_size, g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, rgb.b, false, 0, channel_size, b, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); +} + +void clDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem mem_result = ocl.allocMem(channel_size, result); + + clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step); + + clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL); + cl_int err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb1); + ocl.releaseMemChannels(xyb0); + + clReleaseMemObject(mem_result); +} + +void clComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int 
image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + ocl_args_d_t &ocl = getOcl(); + + cl_mem mem_orig_coeff[3]; + cl_mem mem_mayout_coeff[3]; + cl_mem mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch); + + cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + clSetKernelArgEx(kernel, &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_mask_scale, + &blockf_width, &blockf_height, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + 
&mem_output_order_batch); + + size_t globalWorkSize[2] = { blockf_width, blockf_height }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clEnqueueReadBuffer(ocl.commandQueue, mem_output_order_batch, false, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL); + clFinish(ocl.commandQueue); + + for (int c = 0; c < 3; c++) + { + clReleaseMemObject(mem_orig_coeff[c]); + clReleaseMemObject(mem_mayout_coeff[c]); + clReleaseMemObject(mem_mayout_pixel[c]); + } + + clReleaseMemObject(mem_orig_image); + clReleaseMemObject(mem_mask_scale); + clReleaseMemObject(mem_output_order_batch); +} + +void clMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + ocl_args_d_t &ocl = getOcl(); + + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + + clMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); + + clEnqueueReadBuffer(ocl.commandQueue, mask.r, false, 0, channel_size, mask_r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask.g, false, 0, channel_size, mask_g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask.b, false, 0, channel_size, mask_b, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL); + clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL); 
+ clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + +void clDiffmapOpsinDynamicsImageEx( + cl_mem result, + ocl_channels xyb0, + ocl_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size); + cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + clCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + } + + clCalculateDiffmapEx(result, xsize, ysize, step); + + clReleaseMemObject(edge_detector_map); + clReleaseMemObject(block_diff_dc); + clReleaseMemObject(block_diff_ac); +} +void clConvolutionEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocl_args_d_t &ocl = getOcl(); + + size_t oxsize = (xsize + xstep - 1) / xstep; + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION]; + clSetKernelArgEx(kernel, &result, &inp, &xsize, 
&multipliers, &len, &xstep, &offset, &border_ratio); + + size_t globalWorkSize[2] = { oxsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clConvolutionXEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clConvolutionYEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clSquareSampleEx( + cl_mem result/*out*/, + const cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &image, &xstep, &ystep); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, 
NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cl_mem result/*out, opt*/) +{ + double m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + clReleaseMemObject(m); + } + else + { + cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clConvolutionYEx(result ? 
result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + clReleaseMemObject(m); + } + + clReleaseMemObject(mem_expn); +} + +void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size); + + const int size = xsize * ysize; + + clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE]; + clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b); + + size_t globalWorkSize[1] = { xsize * ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(rgb_blurred); +} + +void clMaskHighIntensityChangeEx( + ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + ocl_channels c0 = ocl.allocMemChannels(channel_size); + ocl_channels c1 = ocl.allocMemChannels(channel_size); + + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.r, c0.r, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.g, c0.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb0.b, c0.b, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL); + clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL); 
+ clFinish(ocl.commandQueue); + + cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; + clSetKernelArgEx(kernel, + &xyb0.r, &xyb0.g, &xyb0.b, + &xsize, &ysize, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(c0); + ocl.releaseMemChannels(c1); +} + +void clEdgeDetectorMapEx( + cl_mem result/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocl_args_d_t &ocl = getOcl(); + + ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + clBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR]; + clSetKernelArgEx(kernel, &result, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step); + + size_t globalWorkSize[2] = { res_xsize, res_ysize}; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} + +void clBlockDiffMapEx( + cl_mem block_diff_dc/*out*/, + cl_mem block_diff_ac/*out*/, + const ocl_channels &rgb, const ocl_channels 
&rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + ocl_args_d_t &ocl = getOcl(); + + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP]; + clSetKernelArgEx(kernel, &block_diff_dc, &block_diff_ac, + &res_xsize, &res_ysize, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step); + + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clEdgeDetectorLowFreqEx( + cl_mem block_diff_ac/*in,out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb_blured = ocl.allocMemChannels(channel_size); + ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + clBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ]; + clSetKernelArgEx(kernel, &block_diff_ac, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step); + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(rgb_blured); + ocl.releaseMemChannels(rgb2_blured); +} 
+ +void clDiffPrecomputeEx( + ocl_channels &mask/*out*/, + const ocl_channels &xyb0, const ocl_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE]; + clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, + &xsize, &ysize, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w) +{ + ocl_args_d_t &ocl = getOcl(); + float fw = w; + + cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE]; + clSetKernelArgEx(kernel, &img, &size, &fw); + + size_t globalWorkSize[1] = { size }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize) +{ + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + + ocl_args_d_t &ocl = getOcl(); + + size_t len = xsize * ysize * sizeof(float); + cl_mem img_org = ocl.allocMem(len); + + clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL); + + cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5]; + clSetKernelArgEx(kernel, &img, &xsize, &ysize, &img_org); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(img_org); +} + +void clMinSquareValEx( + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize); + + cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL]; + clSetKernelArgEx(kernel, &result, &xsize, &ysize, &img, &square_size, &offset); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + clReleaseMemObject(result); +} + +static void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + extoff); + result[i] *= result[i]; + } +} + +static const double kInternalGoodQualityThreshold = 14.921561160295326; +static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + ocl_args_d_t &ocl = getOcl(); + + double extmul = 
0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 0.0738288224836; + double mul = 20.8029176447; + static double lut_x[512]; + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + static double lut_y[512]; + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + static double lut_b[512]; + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + static double lut_dcx[512]; + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + static double lut_dcy[512]; + static bool lutdcy_init = false; + if (!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } + + extmul = 0.349376011816; + extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + static double lut_dcb[512]; + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } + + size_t channel_size = 512 * sizeof(double); + ocl_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, 
lut_dcx, lut_dcy, lut_dcb); + + cl_kernel kernel = ocl.kernel[KERNEL_DOMASK]; + clSetKernelArgEx(kernel, &mask.r, &mask.g, &mask.b, + &xsize, &ysize, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b); + + size_t globalWorkSize[2] = { xsize, ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + ocl.releaseMemChannels(xyb); + ocl.releaseMemChannels(xyb_dc); +} + +void clMaskEx( + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize) +{ + clDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); + for (int i = 0; i < 3; i++) + { + clAverage5x5Ex(mask.ch[i], xsize, ysize); + clMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + clBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } + + clDoMask(mask, mask_dc, xsize, ysize); + + for (int i = 0; i < 3; i++) + { + clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + } +} + +void clCombineChannelsEx( + cl_mem result/*out*/, + const ocl_channels &mask, + const ocl_channels &mask_dc, + const size_t xsize, const size_t ysize, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, + const size_t res_xsize, + const size_t step) +{ + ocl_args_d_t &ocl = getOcl(); + + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + + cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS]; + clSetKernelArgEx(kernel, &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, + 
&edge_detector_map, + &res_xsize, + &step); + + size_t globalWorkSize[2] = { work_xsize, work_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float)); + + cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT]; + clSetKernelArgEx(kernel, &diffmap_out, &diffmap, &xsize, &ysize, &step); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t globalWorkSize[2] = { res_xsize, res_ysize }; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clEnqueueCopyBuffer(ocl.commandQueue, diffmap_out, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); + + clReleaseMemObject(diffmap_out); +} + +void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_int cls = 8 - step; + cl_int cls2 = (8 - step) / 2; + + int out_xsize = xsize - cls; + int out_ysize = ysize - cls; + + cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER]; + clSetKernelArgEx(kernel, &out, &out_xsize, &out_ysize, &in, &cls, &cls2); + + size_t globalWorkSize[2] = { out_xsize, out_ysize}; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in) +{ + ocl_args_d_t &ocl = getOcl(); + + cl_int cls = 8 - step; + cl_int cls2 = (8 - 
step) / 2; + cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER]; + clSetKernelArgEx(kernel, &out, &xsize, &ysize, &cls, &cls2, &in); + + size_t globalWorkSize[2] = { xsize, ysize}; + cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(ocl.commandQueue); + LOG_CL_RESULT(err); +} + +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +{ + clUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocl_args_d_t &ocl = getOcl(); + cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + clRemoveBorderEx(blurred, diffmap, xsize, ysize, step); + + static const double border_ratio = 0.03027655136; + clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + + clAddBorderEx(diffmap, xsize, ysize, step, blurred); + clScaleImageEx(diffmap, xsize * ysize, scale); + + clReleaseMemObject(blurred); +} + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu new file mode 100644 index 00000000..2b7a71c4 --- /dev/null +++ b/clguetzli/clguetzli.cu @@ -0,0 +1,8 @@ +/* +* CUDA Kernels +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "clguetzli/clguetzli.cl" diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h new file mode 100644 index 00000000..c4f3961c --- /dev/null +++ b/clguetzli/clguetzli.h @@ -0,0 +1,188 @@ +/* +* OpenCL edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include +#include "guetzli/processor.h" +#include "guetzli/butteraugli_comparator.h" +#include "ocl.h" +#include "clguetzli.cl.h" + +#include "cuguetzli.h" + +enum MATH_MODE +{ + MODE_CPU = 0, + MODE_CPU_OPT, + MODE_OPENCL, + MODE_CUDA, + MODE_CHECKCL, + MODE_CHECKCUDA +}; + +extern MATH_MODE g_mathMode; + +#ifdef __USE_OPENCL__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +void clOpsinDynamicsImage( + float *r, float *g, float *b, + const size_t xsize, const size_t ysize); + +void clDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step); + +void clComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit); + +void clMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); + +void clDiffmapOpsinDynamicsImageEx( + cl_mem result, + ocl_channels xyb0, + ocl_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step); + + +void clConvolutionEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void clConvolutionXEx( + cl_mem result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void clConvolutionYEx( + cl_mem 
result/*out*/, + const cl_mem inp, size_t xsize, size_t ysize, + const cl_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void clSquareSampleEx( + cl_mem result/*out*/, + const cl_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep); + +void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cl_mem result = nullptr/*out, opt*/); + +void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize); + +void clMaskHighIntensityChangeEx( + ocl_channels &xyb0/*in,out*/, + ocl_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize); + +void clEdgeDetectorMapEx( + cl_mem result/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clBlockDiffMapEx( + cl_mem block_diff_dc/*out*/, + cl_mem block_diff_ac/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clEdgeDetectorLowFreqEx( + cl_mem block_diff_ac/*in,out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void clDiffPrecomputeEx( + ocl_channels &mask/*out*/, + const ocl_channels &xyb0, const ocl_channels &xyb1, + const size_t xsize, const size_t ysize); + +void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w); + +void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize); + +void clMinSquareValEx( + cl_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset); + +void clMaskEx( + ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/, + const ocl_channels &rgb, const ocl_channels &rgb2, + const size_t xsize, const size_t ysize); + +void clCombineChannelsEx( + cl_mem result/*out*/, + const ocl_channels &mask, + const ocl_channels &mask_dc, + const size_t xsize, 
const size_t ysize, + const cl_mem block_diff_dc, + const cl_mem block_diff_ac, + const cl_mem edge_detector_map, + const size_t res_xsize, + const size_t step); + +void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step); + +void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step); + +void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int step, const cl_mem in); + +void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); + +class guetzli::OutputImage; + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +namespace guetzli { + + class ButteraugliComparatorEx : public ButteraugliComparator + { + public: + ButteraugliComparatorEx(const int width, const int height, + const std::vector* rgb, + const float target_distance, ProcessStats* stats); + + void Compare(const OutputImage& img) override; + void StartBlockComparisons() override; + void FinishBlockComparisons() override; + + double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; + public: + std::vector imgOpsinDynamicsBlockList; // [RR..RRGG..GGBB..BB]:blockCount + std::vector imgMaskXyzScaleBlockList; // [RGBRGB..RGBRGB]:blockCount + std::vector> rgb_orig_opsin; + }; +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp new file mode 100644 index 00000000..2e5af412 --- /dev/null +++ b/clguetzli/clguetzli_test.cpp @@ -0,0 +1,450 @@ +/* +* OpenCL test cases +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#ifdef __USE_OPENCL__ + +#include +#include +#include +#include +#include "clguetzli_test.h" +#include "clguetzli.h" +#include "ocl.h" +#include "ocu.h" + +#define FLOAT_COMPARE(a, b, c) floatCompare((a), (b), (c), __FUNCTION__, __LINE__ ) + +int 
floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line) +{ + int count = 0; + for (int i = 0; i < size; i++) + { + if (fabs(a[i] - b[i]) > 0.001) + { + count++; + } + } + if (count > 0) + { + LogError("CHK %s(%d) %d:%d\r\n", szFunc, line, count, size); + } + return count; +} + +void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b, + const float* result_r2, const float* result_g2, const float* result_b2) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result_r, r0_r, xsize * ysize); + FLOAT_COMPARE(result_g, r0_g, xsize * ysize); + FLOAT_COMPARE(result_b, r0_b, xsize * ysize); + FLOAT_COMPARE(result_r2, r1_r, xsize * ysize); + FLOAT_COMPARE(result_g2, r1_g, xsize 
* ysize); + FLOAT_COMPARE(result_b2, r1_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); +} + +void tclEdgeDetectorMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result) +{ + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t edgemap_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + cl_mem edge = ocl.allocMem(edgemap_size); + + clEdgeDetectorMapEx(edge, xyb0, xyb1, xsize, ysize, step); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + clReleaseMemObject(edge); +} + +void tclBlockDiffMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc, const float* 
result_diff_ac) +{ + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem block_diff_dc = ocl.allocMem(reschannel_size); + cl_mem block_diff_ac = ocl.allocMem(reschannel_size); + + clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + + cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3); + FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + clReleaseMemObject(block_diff_ac); + clReleaseMemObject(block_diff_dc); +} + +void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* orign_ac, + const float* result_diff_ac) +{ + size_t channel_size = xsize * ysize * sizeof(float); + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float); + + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels xyb0 = 
ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + cl_mem block_diff_ac = ocl.allocMem(reschannel_size, orign_ac); + + clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + + cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3); + + clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(xyb0); + ocl.releaseMemChannels(xyb1); + + clReleaseMemObject(block_diff_ac); +} + +void tclMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* mask_r, const float* mask_g, const float* mask_b, + const float* maskdc_r, const float* maskdc_g, const float* maskdc_b) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2); + + ocl_channels mask = ocl.allocMemChannels(channel_size); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size); + + clMaskEx(mask/*out*/, mask_dc/*out*/, rgb, rgb2, xsize, ysize); + + cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + 
cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(mask_r, r0_r, xsize * ysize); + FLOAT_COMPARE(mask_g, r0_g, xsize * ysize); + FLOAT_COMPARE(mask_b, r0_b, xsize * ysize); + FLOAT_COMPARE(maskdc_r, r1_r, xsize * ysize); + FLOAT_COMPARE(maskdc_g, r1_g, xsize * ysize); + FLOAT_COMPARE(maskdc_b, r1_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); + ocl.releaseMemChannels(rgb2); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); +} + +void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, + const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, + const float *block_diff_dc, const float *block_diff_ac, + const float *edge_detector_map, + size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, + size_t step, + const float *init_result, + const float *result) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + size_t channel_size = xsize * ysize * sizeof(float); + size_t res_channel_size = res_xsize * res_ysize * sizeof(float); + ocl_channels mask = ocl.allocMemChannels(channel_size, mask_xyb_x, mask_xyb_y, mask_xyb_b); + ocl_channels mask_dc = ocl.allocMemChannels(channel_size, mask_xyb_dc_x, 
mask_xyb_dc_y, mask_xyb_dc_b); + cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_channel_size, block_diff_dc); + cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_channel_size, block_diff_ac); + cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_channel_size, edge_detector_map); + cl_mem cl_result = ocl.allocMem(res_channel_size, init_result); + + clCombineChannelsEx(cl_result, mask, mask_dc, xsize, ysize, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, res_xsize, step); + + cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err); + + FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL); + ocl.releaseMemChannels(mask); + ocl.releaseMemChannels(mask_dc); + clReleaseMemObject(cl_block_diff_dc); + clReleaseMemObject(cl_block_diff_ac); + clReleaseMemObject(cl_edge_detector_map); + clReleaseMemObject(cl_result); +} + +void tclCalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + const float *diffmap, size_t org_len, + const float *diffmap_cmp) +{ + cl_int err = CL_SUCCESS; + ocl_args_d_t &ocl = getOcl(); + + size_t length = xsize * ysize * sizeof(float); + cl_mem mem_diffmap = ocl.allocMem(length); + clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL); + clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step); + cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize); + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL); + clReleaseMemObject(mem_diffmap); +} + +void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result) +{ + 
size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(channel_size, channel); + + clBlurEx(r, xsize, ysize, sigma, border_ratio, r); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); +} + +void tclConvolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result) +{ + int dxsize = (xsize + xstep - 1) / xstep; + size_t result_size = dxsize * ysize * sizeof(float); + size_t inp_size = xsize * ysize * sizeof(float); + size_t multipliers_size = len * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(result_size); + cl_mem i = ocl.allocMem(inp_size, inp); + cl_mem m = ocl.allocMem(multipliers_size, multipliers); + + clConvolutionEx(r, i, xsize, ysize, m, len, xstep, offset, border_ratio); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, dxsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); + clReleaseMemObject(i); + clReleaseMemObject(m); +} + +void tclDiffPrecompute( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + const std::vector > *mask_cmp) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + size_t channel_size = xsize * ysize * sizeof(float); + ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size, xyb0[0].data(), xyb0[1].data(), xyb0[2].data()); + ocl_channels cl_xyb1 = 
ocl.allocMemChannels(channel_size, xyb1[0].data(), xyb1[1].data(), xyb1[2].data()); + ocl_channels cl_mask = ocl.allocMemChannels(channel_size); + + clDiffPrecomputeEx(cl_mask, cl_xyb0, cl_xyb1, xsize, ysize); + + cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_y = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.y, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_x, (*mask_cmp)[0].data(), xsize * ysize); + FLOAT_COMPARE(r_y, (*mask_cmp)[1].data(), xsize * ysize); + FLOAT_COMPARE(r_b, (*mask_cmp)[2].data(), xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 0, NULL, NULL); + ocl.releaseMemChannels(cl_xyb0); + ocl.releaseMemChannels(cl_xyb1); + ocl.releaseMemChannels(cl_mask); +} + +void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float), diffs_org.data()); + + clAverage5x5Ex(mem_diff, xsize, ysize); + cl_float *r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diff, true, CL_MAP_READ, 0, xsize * ysize * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + FLOAT_COMPARE(r, diffs_cmp.data(), xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, 0, NULL, NULL); + clReleaseMemObject(mem_diff); +} + +void tclMinSquareVal(const float *img, size_t square_size, size_t offset, + size_t xsize, size_t ysize, + const float *result) +{ + size_t img_size = xsize * ysize * sizeof(float); + 
cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem r = ocl.allocMem(img_size, img); + + clMinSquareValEx(r, xsize, ysize, square_size, offset); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, img_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result, r_r, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + clReleaseMemObject(r); +} + +void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length) +{ + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org); + + clScaleImageEx(mem_result_org, length, scale); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(r_r, result_cmp, length); + + clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL); + clReleaseMemObject(mem_result_org); +} + +void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b) +{ + size_t channel_size = xsize * ysize * sizeof(float); + cl_int err = 0; + ocl_args_d_t &ocl = getOcl(); + ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b); + + clOpsinDynamicsImageEx(rgb, xsize, ysize); + + cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err); + err = clFinish(ocl.commandQueue); + + FLOAT_COMPARE(result_r, r_r, 
xsize * ysize); + FLOAT_COMPARE(result_g, r_g, xsize * ysize); + FLOAT_COMPARE(result_b, r_b, xsize * ysize); + + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, 0, NULL, NULL); + clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, 0, NULL, NULL); + err = clFinish(ocl.commandQueue); + + ocl.releaseMemChannels(rgb); +} + +#endif \ No newline at end of file diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h new file mode 100644 index 00000000..dbc3c47a --- /dev/null +++ b/clguetzli/clguetzli_test.h @@ -0,0 +1,79 @@ +/* +* OpenCL test cases +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include "ocl.h" + +void tclMaskHighIntensityChange(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b, + const float* result_r2, const float* result_g2, const float* result_b2); + +void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result); + +void tclEdgeDetectorMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result); + +void tclBlockDiffMap(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* result_diff_dc, const float* result_diff_ac); + +void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t ysize, size_t step, + const float* orign_ac, + const float* result_diff_dc); + +void tclMask(const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + size_t xsize, size_t 
ysize, + const float* mask_r, const float* mask_g, const float* mask_b, + const float* maskdc_r, const float* maskdc_g, const float* maskdc_b); + +void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b, + const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b, + const float *block_diff_dc, const float *block_diff_ac, + const float *edge_detector_map, + size_t xsize, size_t ysize, + size_t res_xsize, size_t res_ysize, + size_t step, + const float *init_result, + const float *result); + +void tclCalculateDiffmap(const size_t xsize, const size_t ysize, + const size_t step, + const float *diffmap, size_t org_len, + const float *diffmap_cmp); + +void tclConvolution(size_t xsize, size_t ysize, + size_t xstep, + size_t len, size_t offset, + const float* multipliers, + const float* inp, + float border_ratio, + float* result); + +void tclDiffPrecompute( + const std::vector > &xyb0, + const std::vector > &xyb1, + size_t xsize, size_t ysize, + const std::vector > *mask_cmp); + +void tclAverage5x5(int xsize, int ysize, const std::vector &diffs_org, const std::vector &diffs_cmp); + +void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length); + +void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize, + const float* result_r, const float* result_g, const float* result_b); + +void tclMinSquareVal(const float *img, size_t square_size, size_t offset, + size_t xsize, size_t ysize, + const float *result); diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp new file mode 100644 index 00000000..f348edb7 --- /dev/null +++ b/clguetzli/cuguetzli.cpp @@ -0,0 +1,903 @@ +/* +* CUDA edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#include "cuguetzli.h" +#include +#include "ocu.h" + +#ifdef __USE_CUDA__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +#define cuFinish cuStreamSynchronize +#define BLOCK_SIZE_X 16 +#define BLOCK_SIZE_Y 16 +#define BLOCK_COUNT_X(size) ((size + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X) +#define BLOCK_COUNT_Y(size) ((size + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y) + +void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + + cuOpsinDynamicsImageEx(rgb, xsize, ysize); + + cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + ocu.releaseMemChannels(rgb); +} + +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels xyb0 = ocu.allocMemChannels(channel_size, r, g, b); + ocu_channels xyb1 = ocu.allocMemChannels(channel_size, r2, g2, b2); + + cu_mem mem_result = ocu.allocMem(channel_size, result); + + cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step); + + cuMemcpyDtoH(result, mem_result, channel_size); + + ocu.releaseMemChannels(xyb1); + ocu.releaseMemChannels(xyb0); + + ocu.releaseMem(mem_result); +} + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const 
channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit) +{ + const int block8_width = (image_width + 8 - 1) / 8; + const int block8_height = (image_height + 8 - 1) / 8; + const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor); + const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor); + + using namespace guetzli; + + ocu_args_d_t &ocu = getOcu(); + + cu_mem mem_orig_coeff[3]; + cu_mem mem_mayout_coeff[3]; + cu_mem mem_mayout_pixel[3]; + for (int c = 0; c < 3; c++) + { + int block_count = orig_channel[c].block_width * orig_channel[c].block_height; + mem_orig_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff); + + block_count = mayout_channel[c].block_width * mayout_channel[c].block_height; + mem_mayout_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff); + + mem_mayout_pixel[c] = ocu.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel); + } + cu_mem mem_orig_image = ocu.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch); + cu_mem mem_mask_scale = ocu.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale); + + int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height; + cu_mem mem_output_order_batch = ocu.allocMem(output_order_batch_size, output_order_batch); + + CUfunction kernel = ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER]; + const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], + &mem_orig_image, &mem_mask_scale, + &blockf_width, &blockf_height, + &image_width, &image_height, + &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2], + &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2], + &mayout_channel[0], &mayout_channel[1], &mayout_channel[2], + &factor, + &comp_mask, + &BlockErrorLimit, + &mem_output_order_batch 
}; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(blockf_width), BLOCK_COUNT_Y(blockf_height), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size); + + for (int c = 0; c < 3; c++) + { + ocu.releaseMem(mem_orig_coeff[c]); + ocu.releaseMem(mem_mayout_coeff[c]); + ocu.releaseMem(mem_mayout_pixel[c]); + } + + ocu.releaseMem(mem_orig_image); + ocu.releaseMem(mem_mask_scale); + ocu.releaseMem(mem_output_order_batch); +} + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2) +{ + ocu_args_d_t &ocu = getOcu(); + + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b); + ocu_channels rgb2 = ocu.allocMemChannels(channel_size, r2, g2, b2); + ocu_channels mask = ocu.allocMemChannels(channel_size); + ocu_channels mask_dc = ocu.allocMemChannels(channel_size); + + cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize); + + cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocu.commandQueue); + cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + ocu.releaseMemChannels(rgb); + ocu.releaseMemChannels(rgb2); + ocu.releaseMemChannels(mask); + ocu.releaseMemChannels(mask_dc); +} + +void cuDiffmapOpsinDynamicsImageEx( + cu_mem result, + ocu_channels xyb0, + ocu_channels xyb1, + const size_t 
xsize, const size_t ysize, + const size_t step) +{ + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + size_t channel_size = xsize * ysize * sizeof(float); + size_t channel_step_size = res_xsize * res_ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + + cu_mem edge_detector_map = ocu.allocMem(3 * channel_step_size); + cu_mem block_diff_dc = ocu.allocMem(3 * channel_step_size); + cu_mem block_diff_ac = ocu.allocMem(3 * channel_step_size); + + cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize); + + cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step); + cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step); + cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step); + { + ocu_channels mask = ocu.allocMemChannels(channel_size); + ocu_channels mask_dc = ocu.allocMemChannels(channel_size); + cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize); + cuCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step); + + ocu.releaseMemChannels(mask); + ocu.releaseMemChannels(mask_dc); + } + + cuCalculateDiffmapEx(result, xsize, ysize, step); + + ocu.releaseMem(edge_detector_map); + ocu.releaseMem(block_diff_dc); + ocu.releaseMem(block_diff_ac); +} + +void cuConvolutionEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + size_t oxsize = (xsize + xstep - 1) / xstep; + + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTION]; + const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(kernel, + oxsize, ysize, 1, + 1, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + + +void cuConvolutionXEx( + 
cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONX]; + const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuConvolutionYEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONY]; + const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuSquareSampleEx( + cu_mem result/*out*/, + const cu_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_SQUARESAMPLE]; + const void *args[] = { &result, &xsize, &ysize, &image, &xstep, &ystep }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cu_mem result/*out, opt*/) +{ + double m = 2.25; // Accuracy 
increases when m is increased. + const double scaler = -1.0 / (2 * sigma * sigma); + // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52} + const int diff = std::max(1, m * fabs(sigma)); + const int expn_size = 2 * diff + 1; + std::vector expn(expn_size); + for (int i = -diff; i <= diff; ++i) { + expn[i + diff] = static_cast(exp(scaler * i * i)); + } + + const int xstep = std::max(1, int(sigma / 3)); + + ocu_args_d_t &ocu = getOcu(); + cu_mem mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data()); + + if (xstep > 1) + { + cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep); + ocu.releaseMem(m); + } + else + { + cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize); + cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + cuConvolutionYEx(result ? 
result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio); + ocu.releaseMem(m); + } + + ocu.releaseMem(mem_expn); +} + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize) +{ + static const double kSigma = 1.1; + + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size); + + const int size = xsize * ysize; + + cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r); + cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g); + cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b); + + CUfunction kernel = ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE]; + const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b }; + + CUresult err = cuLaunchKernel(kernel, +// (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, +// BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + (size + 511) / 512, 1, 1, + 512, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(rgb_blurred); +} + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + + ocu_channels c0 = ocu.allocMemChannels(channel_size); + ocu_channels c1 = ocu.allocMemChannels(channel_size); + + cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocu.commandQueue); + cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocu.commandQueue); + cuFinish(ocu.commandQueue); + + CUfunction kernel = 
ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE]; + const void *args[] = { + &xyb0.r, &xyb0.g, &xyb0.b, + &xsize, &ysize, + &xyb1.r, &xyb1.g, &xyb1.b, + &c0.r, &c0.g, &c0.b, + &c1.r, &c1.g, &c1.b }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(c0); + ocu.releaseMemChannels(c1); +} + +void cuEdgeDetectorMapEx( + cu_mem result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + ocu_args_d_t &ocu = getOcu(); + + ocu_channels rgb_blured = ocu.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size); + + static const double kSigma[3] = { 1.5, 0.586, 0.4 }; + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTOR]; + const void *args[] = { &result, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(rgb_blured); + ocu.releaseMemChannels(rgb2_blured); +} + +void cuBlockDiffMapEx( + cu_mem block_diff_dc/*out*/, + cu_mem block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const 
size_t ysize, const size_t step) +{ + ocu_args_d_t &ocu = getOcu(); + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_BLOCKDIFFMAP]; + const void *args[] = { &block_diff_dc, &block_diff_ac, + &res_xsize, &res_ysize, + &rgb.r, &rgb.g, &rgb.b, + &rgb2.r, &rgb2.g, &rgb2.b, + &xsize, &ysize, &step }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuEdgeDetectorLowFreqEx( + cu_mem block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step) +{ + size_t channel_size = xsize * ysize * sizeof(float); + + static const double kSigma = 14; + + ocu_args_d_t &ocu = getOcu(); + ocu_channels rgb_blured = ocu.allocMemChannels(channel_size); + ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size); + + for (int i = 0; i < 3; i++) + { + cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]); + cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]); + } + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ]; + const void *args[] = { &block_diff_ac, + &res_xsize, &res_ysize, + &rgb_blured.r, &rgb_blured.g, &rgb_blured.b, + &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b, + &xsize, &ysize, &step }; + + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(rgb_blured); + ocu.releaseMemChannels(rgb2_blured); +} + +void 
cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize) +{ + ocu_args_d_t &ocu = getOcu(); + + CUfunction kernel = ocu.kernel[KERNEL_DIFFPRECOMPUTE]; + const void *args[] = { &mask.x, &mask.y, &mask.b, + &xsize, &ysize, + &xyb0.x, &xyb0.y, &xyb0.b, + &xyb1.x, &xyb1.y, &xyb1.b }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w) +{ + ocu_args_d_t &ocu = getOcu(); + float fw = w; + + CUfunction kernel = ocu.kernel[KERNEL_SCALEIMAGE]; + const void *args[] = { &img, &size, &fw }; + + CUresult err = cuLaunchKernel(kernel, +// (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + (size + 511) / 512, 1, 1, +// BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1, + 512, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize) +{ + if (xsize < 4 || ysize < 4) { + // TODO: Make this work for small dimensions as well. 
+ return; + } + + ocu_args_d_t &ocu = getOcu(); + + size_t len = xsize * ysize * sizeof(float); + cu_mem img_org = ocu.allocMem(len); + + cuMemcpyDtoD(img_org, img, len); + + CUfunction kernel = ocu.kernel[KERNEL_AVERAGE5X5]; + const void *args[] = { &img, &xsize, &ysize, &img_org }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMem(img_org); +} + +void cuMinSquareValEx( + cu_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset) +{ + ocu_args_d_t &ocu = getOcu(); + + cu_mem result = ocu.allocMem(sizeof(float) * xsize * ysize); + + CUfunction kernel = ocu.kernel[KERNEL_MINSQUAREVAL]; + const void *args[] = { &result, &xsize, &ysize, &img, &square_size, &offset }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + cuMemcpyDtoD(img, result, sizeof(float) * xsize * ysize); + ocu.releaseMem(result); +} + +static void MakeMask(double extmul, double extoff, + double mul, double offset, + double scaler, double *result) +{ + for (size_t i = 0; i < 512; ++i) { + const double c = mul / ((0.01 * scaler * i) + offset); + result[i] = 1.0 + extmul * (c + extoff); + result[i] *= result[i]; + } +} + +static const double kInternalGoodQualityThreshold = 14.921561160295326; +static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize) +{ + ocu_args_d_t &ocu = getOcu(); + + double extmul = 0.975741017749; + double extoff = -4.25328244168; + double offset = 0.454909521427; + double scaler = 
0.0738288224836; + double mul = 20.8029176447; + static double lut_x[512]; + static bool lutx_init = false; + if (!lutx_init) + { + lutx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_x); + } + + extmul = 0.373995618954; + extoff = 1.5307267433; + offset = 0.911952641929; + scaler = 1.1731667845; + mul = 16.2447033988; + static double lut_y[512]; + static bool luty_init = false; + if (!luty_init) + { + luty_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_y); + } + + extmul = 0.61582234137; + extoff = -4.25376118646; + offset = 1.05105070921; + scaler = 0.47434643535; + mul = 31.1444967089; + static double lut_b[512]; + static bool lutb_init = false; + if (!lutb_init) + { + lutb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_b); + } + + extmul = 1.79116943438; + extoff = -3.86797479189; + offset = 0.670960225853; + scaler = 0.486575865525; + mul = 20.4563479139; + static double lut_dcx[512]; + static bool lutdcx_init = false; + if (!lutdcx_init) + { + lutdcx_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx); + } + + extmul = 0.212223514236; + extoff = -3.65647120524; + offset = 1.73396799447; + scaler = 0.170392660501; + mul = 21.6566724788; + static double lut_dcy[512]; + static bool lutdcy_init = false; + if (!lutdcy_init) + { + lutdcy_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy); + } + + extmul = 0.349376011816; + extoff = -0.894711072781; + offset = 0.901647926679; + scaler = 0.380086095024; + mul = 18.0373825149; + static double lut_dcb[512]; + static bool lutdcb_init = false; + if (!lutdcb_init) + { + lutdcb_init = true; + MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb); + } + + size_t channel_size = 512 * sizeof(double); + ocu_channels xyb = ocu.allocMemChannels(channel_size, lut_x, lut_y, lut_b); + ocu_channels xyb_dc = ocu.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb); + + CUfunction kernel = ocu.kernel[KERNEL_DOMASK]; + const void *args[] = 
{ &mask.r, &mask.g, &mask.b, + &xsize, &ysize, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xyb.x, &xyb.y, &xyb.b, + &xyb_dc.x, &xyb_dc.y, &xyb_dc.b }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + + ocu.releaseMemChannels(xyb); + ocu.releaseMemChannels(xyb_dc); +} + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize) +{ + cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize); + for (int i = 0; i < 3; i++) + { + cuAverage5x5Ex(mask.ch[i], xsize, ysize); + cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0); + + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + + cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0); + } + + cuDoMask(mask, mask_dc, xsize, ysize); + + for (int i = 0; i < 3; i++) + { + cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale); + } +} + +void cuCombineChannelsEx( + cu_mem result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + const cu_mem block_diff_dc, + const cu_mem block_diff_ac, + const cu_mem edge_detector_map, + const size_t res_xsize, + const size_t step) +{ + ocu_args_d_t &ocu = getOcu(); + + const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step; + const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step; + + CUfunction kernel = ocu.kernel[KERNEL_COMBINECHANNELS]; + const void *args[] = { &result, + &mask.r, &mask.g, &mask.b, + &mask_dc.r, &mask_dc.g, &mask_dc.b, + &xsize, &ysize, + &block_diff_dc, &block_diff_ac, + &edge_detector_map, + &res_xsize, + &step }; + + CUresult err = cuLaunchKernel(kernel, + work_xsize, 
work_ysize, 1, + 1, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocu = getOcu(); + + cu_mem diffmap_out = ocu.allocMem(xsize * ysize * sizeof(float)); + + CUfunction kernel = ocu.kernel[KERNEL_UPSAMPLESQUAREROOT]; + const void *args[] = { &diffmap_out, &diffmap, &xsize, &ysize, &step }; + + const size_t res_xsize = (xsize + step - 1) / step; + const size_t res_ysize = (ysize + step - 1) / step; + + CUresult err = cuLaunchKernel(kernel, + res_xsize, res_ysize, 1, + 1, 1, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); + cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float)); + + ocu.releaseMem(diffmap_out); +} + +void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step) +{ + ocu_args_d_t &ocu = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + + int out_xsize = xsize - cls; + int out_ysize = ysize - cls; + + CUfunction kernel = ocu.kernel[KERNEL_REMOVEBORDER]; + const void *args[] = { &out, &out_xsize, &out_ysize, &in, &cls, &cls2 }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(out_xsize), BLOCK_COUNT_Y(out_ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in) +{ + ocu_args_d_t &ocu = getOcu(); + + int cls = 8 - step; + int cls2 = (8 - step) / 2; + CUfunction kernel = ocu.kernel[KERNEL_ADDBORDER]; + const void *args[] = { &out, &xsize, &ysize, &cls, &cls2, &in }; + + CUresult err = cuLaunchKernel(kernel, + BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1, + BLOCK_SIZE_X, BLOCK_SIZE_Y, 1, + 0, + 
ocu.commandQueue, (void**)args, NULL); + LOG_CU_RESULT(err); + err = cuFinish(ocu.commandQueue); + LOG_CU_RESULT(err); +} + +void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step) +{ + cuUpsampleSquareRootEx(diffmap, xsize, ysize, step); + + static const double kSigma = 8.8510880283; + static const double mul1 = 24.8235314874; + static const double scale = 1.0 / (1.0 + mul1); + + const int s = 8 - step; + int s2 = (8 - step) / 2; + + ocu_args_d_t &ocu = getOcu(); + cu_mem blurred = ocu.allocMem((xsize - s) * (ysize - s) * sizeof(float)); + cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step); + + static const double border_ratio = 0.03027655136; + cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio); + + cuAddBorderEx(diffmap, xsize, ysize, step, blurred); + cuScaleImageEx(diffmap, xsize * ysize, scale); + + ocu.releaseMem(blurred); +} + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif \ No newline at end of file diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h new file mode 100644 index 00000000..8c3e3444 --- /dev/null +++ b/clguetzli/cuguetzli.h @@ -0,0 +1,142 @@ +/* +* CUDA edition implementation of guetzli. 
+* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +* chriskzhou@tencent.com +*/ +#pragma once +#include "guetzli/processor.h" +#include "clguetzli.cl.h" +#include "ocu.h" + +#ifdef __USE_CUDA__ + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#define double float +#endif + +void cuOpsinDynamicsImage( + float *r, float *g, float *b, + const size_t xsize, const size_t ysize); + +void cuDiffmapOpsinDynamicsImage( + float* result, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2, + const size_t xsize, const size_t ysize, + const size_t step); + +void cuComputeBlockZeroingOrder( + guetzli::CoeffData *output_order_batch, + const channel_info orig_channel[3], + const float *orig_image_batch, + const float *mask_scale, + const int image_width, + const int image_height, + const channel_info mayout_channel[3], + const int factor, + const int comp_mask, + const float BlockErrorLimit); + +void cuMask( + float* mask_r, float* mask_g, float* mask_b, + float* maskdc_r, float* maskdc_g, float* maskdc_b, + const size_t xsize, const size_t ysize, + const float* r, const float* g, const float* b, + const float* r2, const float* g2, const float* b2); + +void cuDiffmapOpsinDynamicsImageEx( + cu_mem result, + ocu_channels xyb0, + ocu_channels xyb1, + const size_t xsize, const size_t ysize, + const size_t step); + +void cuConvolutionXEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void cuConvolutionYEx( + cu_mem result/*out*/, + const cu_mem inp, size_t xsize, size_t ysize, + const cu_mem multipliers, size_t len, + int xstep, int offset, float border_ratio); + +void cuSquareSampleEx( + cu_mem result/*out*/, + const cu_mem image, size_t xsize, size_t ysize, + size_t xstep, size_t ystep); + +void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize, + const double sigma, const double border_ratio, + cu_mem 
result = NULL/*out, opt*/); + +void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize); + +void cuMaskHighIntensityChangeEx( + ocu_channels &xyb0/*in,out*/, + ocu_channels &xyb1/*in,out*/, + const size_t xsize, const size_t ysize); + +void cuEdgeDetectorMapEx( + cu_mem result/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuBlockDiffMapEx( + cu_mem block_diff_dc/*out*/, + cu_mem block_diff_ac/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuEdgeDetectorLowFreqEx( + cu_mem block_diff_ac/*in,out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize, const size_t step); + +void cuDiffPrecomputeEx( + ocu_channels &mask/*out*/, + const ocu_channels &xyb0, const ocu_channels &xyb1, + const size_t xsize, const size_t ysize); + +void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w); + +void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize); + +void cuMinSquareValEx( + cu_mem img/*in,out*/, + const size_t xsize, const size_t ysize, + const size_t square_size, const size_t offset); + +void cuMaskEx( + ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/, + const ocu_channels &rgb, const ocu_channels &rgb2, + const size_t xsize, const size_t ysize); + +void cuCombineChannelsEx( + cu_mem result/*out*/, + const ocu_channels &mask, + const ocu_channels &mask_dc, + const size_t xsize, const size_t ysize, + const cu_mem block_diff_dc, + const cu_mem block_diff_ac, + const cu_mem edge_detector_map, + const size_t res_xsize, + const size_t step); + +void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step); + +void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step); + +void cuAddBorderEx(cu_mem out, const size_t 
xsize, const size_t ysize, const int step, const cu_mem in); + +void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step); + +#ifdef __USE_DOUBLE_AS_FLOAT__ +#undef double +#endif + +#endif \ No newline at end of file diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp new file mode 100644 index 00000000..8252d3e7 --- /dev/null +++ b/clguetzli/cumem_pool.cpp @@ -0,0 +1,111 @@ +/* + * Memory Pool for CUDA + * + * Author: ianhuang@tencent.com + */ + +#include "cumem_pool.h" + +#ifdef __USE_CUDA__ + +bool compare_size(const cu_mem_block_t& first, const cu_mem_block_t& second) +{ + return (first.size < second.size); +} + +cu_mem_pool_t::cu_mem_pool_t() + : alloc_count(0) + , total_mem_request(0) +{ + +} + +cu_mem_pool_t::~cu_mem_pool_t() +{ + +} + +cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init) +{ + alloc_count++; + total_mem_request += s; + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + cu_mem_block_t *block = &(*iter); + if (block->status == MBS_IDLE && block->size >= s) { + block_candidate = block; + break; + } + } + cu_mem mem = NULL; + if (block_candidate != NULL) { + block_candidate->status = MBS_BUSY; + block_candidate->used = s; + + mem = block_candidate->mem; + } + else { + cu_mem new_mem; + cuMemAlloc(&new_mem, s); + cu_mem_block_t mem_block; + mem_block.size = s; + mem_block.used = s; + mem_block.mem = new_mem; + mem_block.status = MBS_BUSY; + mem_pool.push_back(mem_block); + mem_pool.sort(compare_size); + + mem = new_mem; + } + if (init) + { + cuMemcpyHtoDAsync(mem, init, s, commandQueue); + } + else + { + cuMemsetD8Async(mem, 0, s, commandQueue); + } + + return mem; +} + +void cu_mem_pool_t::releaseMem(cu_mem mem) +{ + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++) + { + cu_mem_block_t *block = &(*iter); + if (block->mem == 
mem) { + block_candidate = block; + break; + } + } + if (block_candidate != NULL) { + block_candidate->status = MBS_IDLE; + block_candidate->used = 0; + } + else { + cuMemFree(mem); + LogError("mem_pool release mem:%llu can not be found.\r\n", mem); + } +} + +void cu_mem_pool_t::drain() +{ + size_t total_mem = 0; + size_t total_block = mem_pool.size(); + cu_mem_block_t *block_candidate = NULL; + for (std::list::iterator iter = mem_pool.begin(); iter != mem_pool.end(); ) + { + if (iter->status == MBS_IDLE) { + total_mem += iter->size; + cuMemFree(iter->mem); + iter = mem_pool.erase(iter); + } else ++iter; + } + + LogError("mem_pool has %zu blocks, and total pool memory is:%f kb, total memory request:%f kb, total alloc count:%zu.\r\n", total_block, (float)(total_mem) / 1024, (float)(total_mem_request) / 1024, alloc_count); +} + +#endif \ No newline at end of file diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h new file mode 100644 index 00000000..b878d92f --- /dev/null +++ b/clguetzli/cumem_pool.h @@ -0,0 +1,51 @@ +/* +* Memory Pool for CUDA +* +* Author: ianhuang@tencent.com +*/ +#pragma once + +#ifdef __USE_CUDA__ + +#include +#include +#include "ocl.h" + +/*Simple memory pool for CUDA, aiming to reduce the memory allocation count, because it's time consuming.*/ + +enum mem_block_status +{ + MBS_IDLE, + MBS_BUSY, +}; + +struct cu_mem_block_t +{ + cu_mem_block_t() + :status(MBS_IDLE) + , used(0) + {} + ~cu_mem_block_t() + {} + + mem_block_status status; + size_t size; + size_t used; + cu_mem mem; +}; + +struct cu_mem_pool_t +{ + cu_mem_pool_t(); + ~cu_mem_pool_t(); + cu_mem allocMem(size_t s, const void *init = NULL); + void releaseMem(cu_mem mem); + void drain(); + + std::list mem_pool; + CUstream commandQueue; + size_t alloc_count; + size_t total_mem_request; +}; + +#endif \ No newline at end of file diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp new file mode 100644 index 00000000..851ab943 --- /dev/null +++ b/clguetzli/ocl.cpp @@ -0,0 +1,556 @@ +/* +* 
OpenCL Manager +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +*/ +#include "ocl.h" +#include +#include + +#ifdef __USE_OPENCL__ + +ocl_args_d_t& getOcl(void) +{ + static bool bInit = false; + static ocl_args_d_t ocl; + + if (bInit == true) return ocl; + + bInit = true; + cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU); + LOG_CL_RESULT(err); + + char* source = nullptr; + size_t src_size = 0; + ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size); + + ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); + + delete[] source; + + err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL); + LOG_CL_RESULT(err); + if (CL_BUILD_PROGRAM_FAILURE == err) + { + size_t log_size = 0; + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + + std::vector build_log(log_size); + clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); + + LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); + } + + ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err); + ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err); + ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err); + ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err); + ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err); + ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err); + ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err); + ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err); + ocl.kernel[KERNEL_DIFFPRECOMPUTE] = 
clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err); + ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err); + ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err); + ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err); + ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err); + ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err); + ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err); + ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err); + ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err); + ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err); + + return ocl; +} + +ocl_args_d_t::ocl_args_d_t() : + context(NULL), + device(NULL), + commandQueue(NULL), + program(NULL), + platformVersion(OPENCL_VERSION_1_2), + deviceVersion(OPENCL_VERSION_1_2), + compilerVersion(OPENCL_VERSION_1_2) +{ + for (int i = 0; i < KERNEL_COUNT; i++) + { + kernel[i] = NULL; + } +} + +ocl_args_d_t::~ocl_args_d_t() +{ + cl_int err = CL_SUCCESS; + for (int i = 0; i < KERNEL_COUNT; i++) + { + err = clReleaseKernel(kernel[i]); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); + } + } + + if (program) + { + err = clReleaseProgram(program); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (commandQueue) + { + err = clReleaseCommandQueue(commandQueue); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err)); + } + } + if (device) + { + err = clReleaseDevice(device); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseDevice returned '%s'.\n", 
TranslateOpenCLError(err)); + } + } + if (context) + { + err = clReleaseContext(context); + if (CL_SUCCESS != err) + { + LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err)); + } + } +} + +cl_mem ocl_args_d_t::allocMem(size_t s, const void *init) +{ + cl_int err = 0; + cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err); + LOG_CL_RESULT(err); + if (!mem) return NULL; + + // init memory + if (init) + { + err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(this->commandQueue); + LOG_CL_RESULT(err); + } + else + { + cl_char cc = 0; + err = clEnqueueFillBuffer(this->commandQueue, mem, &cc, sizeof(cc), 0, s / sizeof(cc), 0, NULL, NULL); + LOG_CL_RESULT(err); + err = clFinish(this->commandQueue); + LOG_CL_RESULT(err); + } + + return mem; +} + +ocl_channels ocl_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) +{ + const void *c[3] = { c0, c1, c2 }; + + ocl_channels img; + for (int i = 0; i < 3; i++) + { + img.ch[i] = allocMem(s, c[i]); + } + + return img; +} + +void ocl_args_d_t::releaseMemChannels(ocl_channels &rgb) +{ + for (int i = 0; i < 3; i++) + { + clReleaseMemObject(rgb.ch[i]); + rgb.ch[i] = NULL; + } +} + +const char* TranslateOpenCLError(cl_int errorCode) +{ + switch (errorCode) + { + case CL_SUCCESS: return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: return 
"CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; //-13 + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; //-14 + case CL_COMPILE_PROGRAM_FAILURE: return "CL_COMPILE_PROGRAM_FAILURE"; //-15 + case CL_LINKER_NOT_AVAILABLE: return "CL_LINKER_NOT_AVAILABLE"; //-16 + case CL_LINK_PROGRAM_FAILURE: return "CL_LINK_PROGRAM_FAILURE"; //-17 + case CL_DEVICE_PARTITION_FAILED: return "CL_DEVICE_PARTITION_FAILED"; //-18 + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; //-19 + case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: return 
"CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; //-63 + case CL_INVALID_PROPERTY: return "CL_INVALID_PROPERTY"; //-64 + case CL_INVALID_IMAGE_DESCRIPTOR: return "CL_INVALID_IMAGE_DESCRIPTOR"; //-65 + case CL_INVALID_COMPILER_OPTIONS: return "CL_INVALID_COMPILER_OPTIONS"; //-66 + case CL_INVALID_LINKER_OPTIONS: return "CL_INVALID_LINKER_OPTIONS"; //-67 + case CL_INVALID_DEVICE_PARTITION_COUNT: return "CL_INVALID_DEVICE_PARTITION_COUNT"; //-68 + // case CL_INVALID_PIPE_SIZE: return "CL_INVALID_PIPE_SIZE"; //-69 + // case CL_INVALID_DEVICE_QUEUE: return "CL_INVALID_DEVICE_QUEUE"; //-70 + + default: + return "UNKNOWN ERROR CODE"; + } +} + + +/* +* Check whether an OpenCL platform is the required platform +* (based on the platform's name) +*/ +bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform) +{ + size_t stringLength = 0; + cl_int err = CL_SUCCESS; + bool match = false; + + // In order to read the platform's name, we first read the platform's name string length (param_value is NULL). 
+ // The value returned in stringLength + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err)); + return false; + } + + // Now, that we know the platform's name string length, we can allocate enough space before read it + std::vector platformName(stringLength); + + // Read the platform's name string + // The read value returned in platformName + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err)); + return false; + } + + // Now check if the platform's name is the required one + if (strstr(&platformName[0], preferredPlatform) != 0) + { + // The checked platform is the one we're looking for + match = true; + } + + return match; +} + +/* +* Find and return the preferred OpenCL platform +* In case that preferredPlatform is NULL, the ID of the first discovered platform will be returned +*/ +cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType) +{ + cl_uint numPlatforms = 0; + cl_int err = CL_SUCCESS; + + // Get (in numPlatforms) the number of OpenCL platforms available + // No platform ID will be return, since platforms is NULL + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err)); + return NULL; + } + LogInfo("Number of available platforms: %u\n", numPlatforms); + + if (0 == numPlatforms) + { + LogError("Error: No platforms found!\n"); + return NULL; + } + + std::vector platforms(numPlatforms); + + // Now, obtains a list of numPlatforms OpenCL platforms available + // The list of platforms available will be returned in platforms + err = clGetPlatformIDs(numPlatforms, 
&platforms[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err)); + return NULL; + } + + // Check if one of the available platform matches the preferred requirements + for (cl_uint i = 0; i < numPlatforms; i++) + { + bool match = true; + cl_uint numDevices = 0; + + size_t nameLen = 0; + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &nameLen); + + std::vector platformName(nameLen + 1); + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, nameLen, &platformName[0], NULL); + platformName[nameLen] = 0; + + LogError("DeviceName: %s\n", platformName.data()); + + if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0)) + { + match = (strstr(&platformName[0], preferredPlatform) != 0); + } + + // match is true if the platform's name is the required one or don't care (NULL) + if (match) + { + // Obtains the number of deviceType devices available on platform + // When the function failed we expect numDevices to be zero. + // We ignore the function return value since a non-zero error code + // could happen if this platform doesn't support the specified device type. + err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices); + if (CL_SUCCESS != err) + { + if (CL_DEVICE_TYPE_GPU == deviceType) + { + LogError("%s try GPU returned %s.\n", platformName.data(), TranslateOpenCLError(err)); + } + if (CL_DEVICE_TYPE_CPU == deviceType) + { + LogError("%s try CPU returned %s.\n", platformName.data(), TranslateOpenCLError(err)); + } + } + + if (0 != numDevices) + { + // There is at list one device that answer the requirements + LogError("SelectDevice: %s GPU=%d\n", platformName.data(), deviceType == CL_DEVICE_TYPE_GPU ? 1 : 0); + return platforms[i]; + } + } + } + + return NULL; +} + + +/* +* This function read the OpenCL platdorm and device versions +* (using clGetxxxInfo API) and stores it in the ocl structure. 
+* Later it will enable us to support both OpenCL 1.2 and 2.0 platforms and devices +* in the same program. +*/ +int GetPlatformAndDeviceVersion(cl_platform_id platformId, ocl_args_d_t *ocl) +{ + cl_int err = CL_SUCCESS; + + // Read the platform's version string length (param_value is NULL). + // The value returned in stringLength + size_t stringLength = 0; + err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the platform's version string length, we can allocate enough space before read it + std::vector platformVersion(stringLength); + + // Read the platform's version string + // The read value returned in platformVersion + err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetplatform_ids() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL) + { + ocl->platformVersion = OPENCL_VERSION_2_0; + } + + // Read the device's version string length (param_value is NULL). 
+ err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the device's version string length, we can allocate enough space before read it + std::vector deviceVersion(stringLength); + + // Read the device's version string + // The read value returned in deviceVersion + err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL) + { + ocl->deviceVersion = OPENCL_VERSION_2_0; + } + + // Read the device's OpenCL C version string length (param_value is NULL). + err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it + std::vector compilerVersion(stringLength); + + // Read the device's OpenCL C version string + // The read value returned in compilerVersion + err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + else if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL) + { + ocl->compilerVersion = OPENCL_VERSION_2_0; + } + + return err; +} + + +/* +* This function picks/creates necessary OpenCL objects which are needed. +* The objects are: +* OpenCL platform, device, context, and command queue. 
+* +* All these steps are needed to be performed once in a regular OpenCL application. +* This happens before actual compute kernels calls are performed. +* +* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, +* so this function populates fields of this structure, which is passed as parameter ocl. +* Please, consider reviewing the fields before going further. +* The structure definition is right in the beginning of this file. +*/ +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) +{ + // The following variable stores return codes for all OpenCL calls. + cl_int err = CL_SUCCESS; + + // Query for all available OpenCL platforms on the system + // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string + cl_platform_id platformId = FindOpenCLPlatform(nullptr, deviceType); + if (NULL == platformId) + { + deviceType = CL_DEVICE_TYPE_CPU; + platformId = FindOpenCLPlatform(nullptr, deviceType); + } + + if (NULL == platformId) + { + LogError("Error: Failed to find OpenCL platform.\n"); + return CL_INVALID_VALUE; + } + + // Create context with device of specified type. + // Required device type is passed as function argument deviceType. + // So you may use this function to create context for any CPU or GPU OpenCL device. 
+ // The creation is synchronized (pfn_notify is NULL) and NULL user_data + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 }; + ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err); + if ((CL_SUCCESS != err) || (NULL == ocl->context)) + { + LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err)); + return err; + } + + // Query for OpenCL device which was used for context creation + err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL); + if (CL_SUCCESS != err) + { + LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions + GetPlatformAndDeviceVersion(platformId, ocl); + + // Create command queue. + // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues. + // Command queue guarantees some ordering between calls and other OpenCL commands. + // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device. 
+#ifdef CL_VERSION_2_0 + if (OPENCL_VERSION_2_0 == ocl->deviceVersion) + { + const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; + ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err); + } + else { + // default behavior: OpenCL 1.2 + cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); + } +#else + // default behavior: OpenCL 1.2 + cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; + ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); +#endif + if (CL_SUCCESS != err) + { + LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err)); + return err; + } + + return CL_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h new file mode 100644 index 00000000..7ccee2d8 --- /dev/null +++ b/clguetzli/ocl.h @@ -0,0 +1,72 @@ +/* +* OpenCL Manager +* +* Author: strongtu@tencent.com +* ianhuang@tencent.com +*/ +#pragma once + +#ifdef __USE_OPENCL__ + +#include "CL/cl.h" +#include "utils.h" +#include "clguetzli.cl.h" + +// Macros for OpenCL versions +#define OPENCL_VERSION_1_2 1.2f +#define OPENCL_VERSION_2_0 2.0f + +enum KernelName { + KERNEL_CONVOLUTION = 0, + KERNEL_CONVOLUTIONX, + KERNEL_CONVOLUTIONY, + KERNEL_SQUARESAMPLE, + KERNEL_OPSINDYNAMICSIMAGE, + KERNEL_MASKHIGHINTENSITYCHANGE, + KERNEL_EDGEDETECTOR, + KERNEL_BLOCKDIFFMAP, + KERNEL_EDGEDETECTORLOWFREQ, + KERNEL_DIFFPRECOMPUTE, + KERNEL_SCALEIMAGE, + KERNEL_AVERAGE5X5, + KERNEL_MINSQUAREVAL, + KERNEL_DOMASK, + KERNEL_COMBINECHANNELS, + KERNEL_UPSAMPLESQUAREROOT, + KERNEL_REMOVEBORDER, + KERNEL_ADDBORDER, + KERNEL_COMPUTEBLOCKZEROINGORDER, + KERNEL_COUNT, +}; + +#define LOG_CL_RESULT(e) if (CL_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateOpenCLError((e)));} 
+ +struct ocl_args_d_t; + +const char* TranslateOpenCLError(cl_int errorCode); + +int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType); + +ocl_args_d_t& getOcl(void); + +struct ocl_args_d_t +{ + ocl_args_d_t(); + ~ocl_args_d_t(); + + cl_mem allocMem(size_t s, const void *init = NULL); + ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); + void releaseMemChannels(ocl_channels &rgb); + + // Regular OpenCL objects: + cl_context context; // hold the context handler + cl_device_id device; // hold the selected device handler + cl_command_queue commandQueue; // hold the commands-queue handler + cl_program program; // hold the program handler + cl_kernel kernel[KERNEL_COUNT]; // hold the kernel handler + float platformVersion; // hold the OpenCL platform version (default 1.2) + float deviceVersion; // hold the OpenCL device version (default. 1.2) + float compilerVersion; // hold the device OpenCL C version (default. 1.2) +}; + +#endif diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp new file mode 100644 index 00000000..b7395ed1 --- /dev/null +++ b/clguetzli/ocu.cpp @@ -0,0 +1,206 @@ +/* +* CUDA Manager +* +* Author: strongtu@tencent.com +*/ +#include "ocu.h" + +#ifdef __USE_CUDA__ +#include +#include + +ocu_args_d_t& getOcu(void) +{ + static bool bInit = false; + static ocu_args_d_t ocu; + + if (bInit == true) return ocu; + + bInit = true; + + CUresult err = cuInit(0); + LOG_CU_RESULT(err); + CUdevice dev = 0; + CUcontext ctxt; + CUstream stream; + + err = cuCtxCreate(&ctxt, CU_CTX_SCHED_AUTO, dev); + LOG_CU_RESULT(err); + + char name[1024]; + int proc_count = 0; + int thread_count = 0; + int cap_major = 0, cap_minor = 0; + cuDeviceGetName(name, sizeof(name), dev); + cuDeviceGetAttribute(&cap_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); + cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); + cuDeviceGetAttribute(&proc_count, 
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); + cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); + LogError("CUDA Adapter:%s Ver%d.%d MP %d MaxThread Per MP %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count); + + char* ptx = nullptr; + size_t src_size = 0; +if (sizeof(void*) == 8) + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx64", &ptx, &src_size); +else + ReadSourceFromFile("clguetzli/clguetzli.cu.ptx32", &ptx, &src_size); + + CUmodule mod; + CUjit_option jit_options[2]; + void *jit_optvals[2]; + jit_options[0] = CU_JIT_CACHE_MODE; + jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA; + err = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals); + LOG_CU_RESULT(err); + + delete[] ptx; + + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex"); + cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx"); + 
cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx"); + cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx"); + + cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED); + cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE); + + cuStreamCreate(&stream, 0); + + ocu.dev = dev; + ocu.commandQueue = stream; + ocu.mod = mod; + ocu.ctxt = ctxt; + ocu.mem_pool.commandQueue = ocu.commandQueue; + + return ocu; +} + +ocu_args_d_t::ocu_args_d_t() + : dev(0) + , commandQueue(NULL) + , mod(NULL) + , ctxt(NULL) +{ + +} + +ocu_args_d_t::~ocu_args_d_t() +{ + cuModuleUnload(mod); + cuCtxDestroy(ctxt); + mem_pool.drain(); +} + +cu_mem ocu_args_d_t::allocMem(size_t s, const void *init) +{ + return mem_pool.allocMem(s, init); +} + +void ocu_args_d_t::releaseMem(cu_mem mem) +{ + mem_pool.releaseMem(mem); +} + +ocu_channels ocu_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2) +{ + const void *c[3] = { c0, c1, c2 }; + + ocu_channels img; + for (int i = 0; i < 3; i++) + { + img.ch[i] = allocMem(s, c[i]); + } + + return img; +} + +void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb) +{ + for (int i = 0; i < 3; i++) + { + releaseMem(rgb.ch[i]); + rgb.ch[i] = NULL; + } +} + +const char* TranslateCUDAError(CUresult errorCode) +{ + switch (errorCode) + { + case CUDA_SUCCESS: return "CUDA_SUCCESS"; + case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE"; + case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY"; + case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED"; + case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED"; + case CUDA_ERROR_PROFILER_DISABLED: return 
"CUDA_ERROR_PROFILER_DISABLED"; + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE"; + case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE"; + case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE"; + case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT"; + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED"; + case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED"; + case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED"; + case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED"; + case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED"; + case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED"; + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE"; + case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; + case CUDA_ERROR_INVALID_PTX: return "CUDA_ERROR_INVALID_PTX"; + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; + // case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; + case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE"; + case CUDA_ERROR_FILE_NOT_FOUND: return 
"CUDA_ERROR_FILE_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM"; + case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE"; + case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND"; + case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY"; + case CUDA_ERROR_ILLEGAL_ADDRESS: return "CUDA_ERROR_ILLEGAL_ADDRESS"; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT"; + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + case CUDA_ERROR_ASSERT: return "CUDA_ERROR_ASSERT"; + case CUDA_ERROR_TOO_MANY_PEERS: return "CUDA_ERROR_TOO_MANY_PEERS"; + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + case CUDA_ERROR_HARDWARE_STACK_ERROR: return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + case CUDA_ERROR_ILLEGAL_INSTRUCTION: return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + case CUDA_ERROR_MISALIGNED_ADDRESS: return "CUDA_ERROR_MISALIGNED_ADDRESS"; + case CUDA_ERROR_INVALID_ADDRESS_SPACE: return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + case CUDA_ERROR_INVALID_PC: return "CUDA_ERROR_INVALID_PC"; + case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED"; + case CUDA_ERROR_NOT_PERMITTED: return 
"CUDA_ERROR_NOT_PERMITTED"; + case CUDA_ERROR_NOT_SUPPORTED: return "CUDA_ERROR_NOT_SUPPORTED"; + case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN"; + default: return "CUDA_ERROR_UNKNOWN"; + } +} +#endif \ No newline at end of file diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h new file mode 100644 index 00000000..93f675a3 --- /dev/null +++ b/clguetzli/ocu.h @@ -0,0 +1,42 @@ +/* +* CUDA Manager +* +* Author: strongtu@tencent.com +*/ +#pragma once + +#ifdef __USE_CUDA__ + +#include +#include "ocl.h" +#include "cumem_pool.h" + +#define LOG_CU_RESULT(e) if (CUDA_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateCUDAError((e)));} + +struct ocu_args_d_t; + +const char* TranslateCUDAError(CUresult errorCode); + +ocu_args_d_t& getOcu(void); + +struct ocu_args_d_t +{ + ocu_args_d_t(); + ~ocu_args_d_t(); + + cu_mem allocMem(size_t s, const void *init = NULL); + void releaseMem(cu_mem mem); + ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL); + void releaseMemChannels(ocu_channels &rgb); + + CUfunction kernel[KERNEL_COUNT]; + CUstream commandQueue; + CUmodule mod; + CUcontext ctxt; + CUdevice dev; + cu_mem_pool_t mem_pool; +}; + + + +#endif \ No newline at end of file diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp new file mode 100644 index 00000000..da699406 --- /dev/null +++ b/clguetzli/utils.cpp @@ -0,0 +1,102 @@ +/***************************************************************************** + * Copyright (c) 2013-2016 Intel Corporation + * All rights reserved. + * + * WARRANTY DISCLAIMER + * + * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE + * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel Corporation is the author of the Materials, and requests that all + * problem reports or change requests be submitted to it directly + *****************************************************************************/ +#ifdef __USE_OPENCL__ + +#include +#include +#include +#include +#include "CL/cl.h" +#include "CL/cl_ext.h" +#include "utils.h" +#include + + +//we want to use POSIX functions +#pragma warning( push ) +#pragma warning( disable : 4996 ) + + +void LogInfo(const char* str, ...) +{ + if (str) + { + va_list args; + va_start(args, str); + + vfprintf(stdout, str, args); + + va_end(args); + } +} + +void LogError(const char* str, ...) 
+{ + if (str) + { + va_list args; + va_start(args, str); + + vfprintf(stderr, str, args); + + va_end(args); + } +} + +// Upload the OpenCL C source code to output argument source +// The memory resource is implicitly allocated in the function +// and should be deallocated by the caller +int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize) +{ + int errorCode = CL_SUCCESS; + + FILE* fp = NULL; +#ifdef __linux__ + fp = fopen(fileName, "rb"); +#else + fopen_s(&fp, fileName, "rb"); +#endif + if (fp == NULL) + { + LogError("Error: Couldn't find program source file '%s'.\n", fileName); + errorCode = CL_INVALID_VALUE; + } + else { + fseek(fp, 0, SEEK_END); + *sourceSize = ftell(fp); + fseek(fp, 0, SEEK_SET); + + *source = new char[*sourceSize]; + if (*source == NULL) + { + LogError("Error: Couldn't allocate %d bytes for program source from file '%s'.\n", *sourceSize, fileName); + errorCode = CL_OUT_OF_HOST_MEMORY; + } + else { + fread(*source, 1, *sourceSize, fp); + } + } + return errorCode; +} +#pragma warning( pop ) + +#endif \ No newline at end of file diff --git a/clguetzli/utils.h b/clguetzli/utils.h new file mode 100644 index 00000000..71d8d7a1 --- /dev/null +++ b/clguetzli/utils.h @@ -0,0 +1,32 @@ +/***************************************************************************** + * Copyright (c) 2013-2016 Intel Corporation + * All rights reserved. + * + * WARRANTY DISCLAIMER + * + * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE + * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel Corporation is the author of the Materials, and requests that all + * problem reports or change requests be submitted to it directly + *****************************************************************************/ +#pragma once + +// Print useful information to the default output. Same usage as with printf +void LogInfo(const char* str, ...); + +// Print error notification to the default output. Same usage as with printf +void LogError(const char* str, ...); + +// Read OpenCL source code from fileName and store it in source. 
The number of read bytes returns in sourceSize +int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize); + diff --git a/compile.bat b/compile.bat new file mode 100644 index 00000000..1b98c758 --- /dev/null +++ b/compile.bat @@ -0,0 +1,12 @@ +@rem setupt windows var +call vcvars64.bat + +@echo %1 --machine 64 or 32 +@echo %2 -G + +set machine_num=%1% +set debug_opt=%2% + +if "%machine_num%" == "" set machine_num=64 + +nvcc -Xcompiler "/wd 4819" -I"./" -use_fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num% clguetzli\clguetzli.cu \ No newline at end of file diff --git a/compile.sh b/compile.sh new file mode 100644 index 00000000..eabb6473 --- /dev/null +++ b/compile.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +#Compile .cu file +echo $1 --machine 64 or 32 +echo $2 -G + +nvcc -D__USE_OPENCL__ -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1 clguetzli/clguetzli.cu + +#copy to ./bin/Release +cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1 +cp clguetzli/clguetzli.cl bin/Release/clguetzli/clguetzli.cl +cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h diff --git a/guetzli.make b/guetzli.make index 7edeea3f..e16aa99b 100644 --- a/guetzli.make +++ b/guetzli.make @@ -16,7 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Release/guetzli DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags` @@ -43,7 +43,7 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/guetzli OBJDIR = obj/Debug/guetzli DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags` @@ -65,6 +65,15 @@ all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) endif OBJECTS := \ + $(OBJDIR)/clbutter_comparator.o \ + $(OBJDIR)/clguetzli.cl.o \ + $(OBJDIR)/clguetzli.o \ + $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/cuguetzli.o \ + $(OBJDIR)/cumem_pool.o \ + $(OBJDIR)/ocl.o \ + $(OBJDIR)/ocu.o \ + $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ $(OBJDIR)/debug_print.o \ @@ -143,6 +152,33 @@ $(GCH): $(PCH) $(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<" endif +$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocl.o: clguetzli/ocl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocu.o: clguetzli/ocu.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF 
"$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/utils.o: clguetzli/utils.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" diff --git a/guetzli.vcxproj b/guetzli.vcxproj index 5b7ffeb9..3a0eb72c 100644 --- a/guetzli.vcxproj +++ b/guetzli.vcxproj @@ -51,6 +51,7 @@ + @@ -78,6 +79,8 @@ obj\x86\Release\guetzli\ guetzli .exe + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) true @@ -92,6 +95,8 @@ obj\x86\Debug\guetzli\ guetzli .exe + $(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty) + $(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86) @@ -103,6 +108,7 @@ true false true + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console @@ -110,18 +116,35 @@ true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 + + "$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current -bo=" " + + + OpenCL Code Builder + + + false + + + + + + + NotUsing Level3 .;third_party\butteraugli;%(AdditionalIncludeDirectories) - Full + MaxSpeed true - true + false false true + %(PreprocessorDefinitions) Console @@ -129,7 +152,20 @@ true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + + + $(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86 + + + + + + CUDA CU + + + 3 + @@ -138,13 +174,16 @@ .;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled + _UNICODE;UNICODE;%(PreprocessorDefinitions) Console true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + $(CUDA_PATH)\lib\x64;third_party\libjpeg\x64 + @@ -153,15 +192,32 @@ 
.;third_party\butteraugli;%(AdditionalIncludeDirectories) EditAndContinue Disabled + %(PreprocessorDefinitions) Console true shlwapi.lib;%(AdditionalDependencies) mainCRTStartup + + + $(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86 + + + 3 + + + + + + + + + + @@ -190,6 +246,15 @@ + + + + + + + + + @@ -212,7 +277,36 @@ + + + Document + + + false + + + Document + CUDA Code Builder + $(ProjectDir)compile.bat 64 + $(ProjectDir)compile.bat 32 + false + clguetzli\clguetzli.cu.ptx64 + $(ProjectDir)compile.bat 64 -G + CUDA Code Builder + clguetzli\clguetzli.cu.ptx64 + false + false + $(ProjectDir)compile.bat 32 -G + CUDA Code Builder + clguetzli\clguetzli.cu.ptx32 + CUDA Code Builder + clguetzli\clguetzli.cu.ptx32 + false + false + + + \ No newline at end of file diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters index da2297c5..7e005105 100644 --- a/guetzli.vcxproj.filters +++ b/guetzli.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -13,6 +13,9 @@ {FD6FCB41-6929-36EC-F288-50C65E41EC5B} + + {64847a89-ca39-4556-ba0e-d6875c4d39ca} + @@ -93,6 +96,33 @@ third_party\butteraugli\butteraugli + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + @@ -158,5 +188,42 @@ third_party\butteraugli\butteraugli + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + clguetzli + + + + + clguetzli + + + + + clguetzli + \ No newline at end of file diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc index ec964334..f0ce5eb4 100644 --- a/guetzli/butteraugli_comparator.cc +++ b/guetzli/butteraugli_comparator.cc @@ -22,6 +22,10 @@ #include "guetzli/gamma_correct.h" #include "guetzli/score.h" +#include "clguetzli/ocu.h" +#include "clguetzli/clguetzli.h" +#include "clguetzli/cuguetzli.h" + namespace guetzli { std::vector > ComputeOpsinDynamicsImage( @@ -107,7 +111,9 @@ void 
ButteraugliComparator::SwitchBlock(int block_x, int block_y, } double ButteraugliComparator::CompareBlock(const OutputImage& img, - int off_x, int off_y) const { + int off_x, int off_y, + const coeff_t* candidate_block, + const int comp_mask) const { int block_x = block_x_ * factor_x_ + off_x; int block_y = block_y_ * factor_y_ + off_y; int xmin = 8 * block_x; diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h index f96d160f..08530a7e 100644 --- a/guetzli/butteraugli_comparator.h +++ b/guetzli/butteraugli_comparator.h @@ -20,6 +20,7 @@ #include #include "butteraugli/butteraugli.h" +#include "clguetzli/clbutter_comparator.h" #include "guetzli/comparator.h" #include "guetzli/jpeg_data.h" #include "guetzli/output_image.h" @@ -44,7 +45,7 @@ class ButteraugliComparator : public Comparator { int factor_x, int factor_y) override; double CompareBlock(const OutputImage& img, - int off_x, int off_y) const override; + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override; double ScoreOutputSize(int size) const override; @@ -62,7 +63,7 @@ class ButteraugliComparator : public Comparator { int factor_y, const std::vector& distmap, std::vector* block_weight) override; - private: + protected: const int width_; const int height_; const float target_distance_; @@ -73,7 +74,7 @@ class ButteraugliComparator : public Comparator { int factor_y_; std::vector> mask_xyz_; std::vector>> per_block_pregamma_; - ::butteraugli::ButteraugliComparator comparator_; + ::butteraugli::clButteraugliComparator comparator_; float distance_; std::vector distmap_; ProcessStats* stats_; diff --git a/guetzli/comparator.h b/guetzli/comparator.h index 00c56977..061f9603 100644 --- a/guetzli/comparator.h +++ b/guetzli/comparator.h @@ -51,7 +51,7 @@ class Comparator { // the resulting per-block distance. The interpretation of the returned // distance depends on the comparator used. 
virtual double CompareBlock(const OutputImage& img, - int off_x, int off_y) const = 0; + int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const = 0; // Returns the combined score of the output image in the last Compare() call // (or the baseline image, if Compare() was not called yet), based on output diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc index fb6cd0a9..c972d391 100644 --- a/guetzli/guetzli.cc +++ b/guetzli/guetzli.cc @@ -28,6 +28,10 @@ #include "guetzli/processor.h" #include "guetzli/quality.h" #include "guetzli/stats.h" +#include "clguetzli/clguetzli.h" +#ifdef __USE_GPERFTOOLS__ +#include +#endif namespace { @@ -164,7 +168,9 @@ std::string ReadFileOrDie(const char* filename) { off_t buffer_size = 8192; if (fseek(f, 0, SEEK_END) == 0) { - buffer_size = std::max(ftell(f), 1); +// buffer_size = std::max(ftell(f), 1); + long size = ftell(f); + buffer_size = size > 0 ? size : 1; if (fseek(f, 0, SEEK_SET) != 0) { perror("fseek"); exit(1); @@ -223,6 +229,15 @@ void Usage() { " Default value is %d.\n" " --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n" " the limit. 
Default limit is %d MB.\n" +#ifdef __USE_OPENCL__ + " --opencl - Use OpenCL\n" + " --checkcl - Check OpenCL result\n" +#endif + " --c - Use c opt version\n" +#ifdef __USE_CUDA__ + " --cuda - Use CUDA\n" + " --checkcuda - Check CUDA result\n" +#endif " --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB); exit(1); } @@ -230,6 +245,9 @@ void Usage() { } // namespace int main(int argc, char** argv) { +#ifdef __USE_GPERFTOOLS__ + ProfilerStart("guetzli.prof"); +#endif std::set_terminate(TerminateHandler); int verbose = 0; @@ -254,7 +272,28 @@ int main(int argc, char** argv) { memlimit_mb = atoi(argv[opt_idx]); } else if (!strcmp(argv[opt_idx], "--nomemlimit")) { memlimit_mb = -1; - } else if (!strcmp(argv[opt_idx], "--")) { + } +#ifdef __USE_OPENCL__ + else if (!strcmp(argv[opt_idx], "--opencl")) { + g_mathMode = MODE_OPENCL; + } + else if (!strcmp(argv[opt_idx], "--checkcl")) { + g_mathMode = MODE_CHECKCL; + } +#endif + else if (!strcmp(argv[opt_idx], "--c")) + { + g_mathMode = MODE_CPU_OPT; + } +#ifdef __USE_CUDA__ + else if (!strcmp(argv[opt_idx], "--cuda")) { + g_mathMode = MODE_CUDA; + } + else if (!strcmp(argv[opt_idx], "--checkcuda")) { + g_mathMode = MODE_CHECKCUDA; + } +#endif + else if (!strcmp(argv[opt_idx], "--")) { opt_idx++; break; } else { @@ -322,5 +361,8 @@ int main(int argc, char** argv) { } WriteFileOrDie(argv[opt_idx + 1], out_data); +#ifdef __USE_GPERFTOOLS__ + ProfilerStop(); +#endif return 0; } diff --git a/guetzli/jpeg_data_decoder.cc b/guetzli/jpeg_data_decoder.cc index 98f9f4cc..722d6663 100644 --- a/guetzli/jpeg_data_decoder.cc +++ b/guetzli/jpeg_data_decoder.cc @@ -43,9 +43,8 @@ bool HasYCbCrColorSpace(const JPEGData& jpg) { } std::vector DecodeJpegToRGB(const JPEGData& jpg) { - if (jpg.components.size() == 1 || - (jpg.components.size() == 3 && - HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444()))) { + if (jpg.components.size() == 3 && + HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444())) { 
OutputImage img(jpg.width, jpg.height); img.CopyFromJpegData(jpg); return img.ToSRGB(); diff --git a/guetzli/output_image.h b/guetzli/output_image.h index 1018eeac..9c9f935a 100644 --- a/guetzli/output_image.h +++ b/guetzli/output_image.h @@ -37,6 +37,8 @@ class OutputImageComponent { int width_in_blocks() const { return width_in_blocks_; } int height_in_blocks() const { return height_in_blocks_; } const coeff_t* coeffs() const { return &coeffs_[0]; } + const uint16_t* pixels() const { return &pixels_[0]; } + size_t pixels_size() const { return pixels_.size(); } const int* quant() const { return &quant_[0]; } bool IsAllZero() const; diff --git a/guetzli/processor.cc b/guetzli/processor.cc index 662653eb..2e8837dc 100644 --- a/guetzli/processor.cc +++ b/guetzli/processor.cc @@ -31,6 +31,11 @@ #include "guetzli/jpeg_data_writer.h" #include "guetzli/output_image.h" #include "guetzli/quantize.h" +#include "clguetzli/clguetzli.h" + +#ifdef __SUPPORT_FULL_JPEG__ +#include "jpeglib.h" +#endif namespace guetzli { @@ -38,10 +43,6 @@ namespace { static const size_t kBlockSize = 3 * kDCTBlockSize; -struct CoeffData { - int idx; - float block_err; -}; struct QuantData { int q[3][kDCTBlockSize]; size_t jpg_size; @@ -57,11 +58,21 @@ class Processor { void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, const double target_mul, bool stop_early); + + void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, + const double target_mul, + bool stop_early, + std::vector &candidate_coeff_offsets, + std::vector& candidate_coeffs, + std::vector &candidate_coeff_errors); + void ComputeBlockZeroingOrder( const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize], const int block_x, const int block_y, const int factor_x, const int factor_y, const uint8_t comp_mask, OutputImage* img, std::vector* output_order); + bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample, int 
best_q[3][kDCTBlockSize], OutputImage* img); @@ -402,47 +413,55 @@ void Processor::ComputeBlockZeroingOrder( memcpy(processed_block, block, sizeof(processed_block)); comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y); while (!input_order.empty()) { - float best_err = 1e17f; - int best_i = 0; - for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, - input_order.size()); - ++i) { - coeff_t candidate_block[kBlockSize]; - memcpy(candidate_block, processed_block, sizeof(candidate_block)); - const int idx = input_order[i].first; - candidate_block[idx] = 0; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &candidate_block[c * kDCTBlockSize]); - } - } - float max_err = 0; - for (int iy = 0; iy < factor_y; ++iy) { - for (int ix = 0; ix < factor_x; ++ix) { - int block_xx = block_x * factor_x + ix; - int block_yy = block_y * factor_y + iy; - if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { - float err = static_cast(comparator_->CompareBlock(*img, ix, iy)); - max_err = std::max(max_err, err); - } - } - } - if (max_err < best_err) { - best_err = max_err; - best_i = i; - } - } - int idx = input_order[best_i].first; - processed_block[idx] = 0; - input_order.erase(input_order.begin() + best_i); - output_order->push_back({idx, best_err}); - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - img->component(c).SetCoeffBlock( - block_x, block_y, &processed_block[c * kDCTBlockSize]); - } - } + float best_err = 1e17f; + int best_i = 0; + for (size_t i = 0; i < std::min(params_.zeroing_greedy_lookahead, + input_order.size()); + ++i) { + coeff_t candidate_block[kBlockSize]; + memcpy(candidate_block, processed_block, sizeof(candidate_block)); + const int idx = input_order[i].first; + candidate_block[idx] = 0; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + img->component(c).SetCoeffBlock( + block_x, block_y, &candidate_block[c * kDCTBlockSize]); + } + } + 
float max_err = 0; + for (int iy = 0; iy < factor_y; ++iy) { + for (int ix = 0; ix < factor_x; ++ix) { + int block_xx = block_x * factor_x + ix; + int block_yy = block_y * factor_y + iy; + if (8 * block_xx < img->width() && 8 * block_yy < img->height()) { + float err = static_cast(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask)); + max_err = std::max(max_err, err); + } + } + } + if (max_err < best_err) { + best_err = max_err; + best_i = i; + } + } + int idx = input_order[best_i].first; + processed_block[idx] = 0; + input_order.erase(input_order.begin() + best_i); + output_order->push_back({ idx, best_err }); + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + img->component(c).SetCoeffBlock( + block_x, block_y, &processed_block[c * kDCTBlockSize]); + } + } + if (MODE_CPU_OPT == g_mathMode) + { + if (best_err >= comparator_->BlockErrorLimit()) + { + // The input_order is an ascent vector, break when best_err exceed the error limit. + break; + } + } } // Make the block error values monotonic. 
float min_err = 1e10; @@ -536,58 +555,188 @@ size_t EstimateDCSize(const JPEGData& jpg) { } // namespace -void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, - const uint8_t comp_mask, - const double target_mul, - bool stop_early) { - const int width = img->width(); - const int height = img->height(); - const int ncomp = jpg.components.size(); - const int last_c = Log2FloorNonZero(comp_mask); - if (static_cast(last_c) >= jpg.components.size()) return; - const int factor_x = img->component(last_c).factor_x(); - const int factor_y = img->component(last_c).factor_y(); - const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); - const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); - const int num_blocks = block_width * block_height; - - std::vector candidate_coeff_offsets(num_blocks + 1); - std::vector candidate_coeffs; - std::vector candidate_coeff_errors; - candidate_coeffs.reserve(60 * num_blocks); - candidate_coeff_errors.reserve(60 * num_blocks); - std::vector block_order; - block_order.reserve(3 * kDCTBlockSize); - comparator_->StartBlockComparisons(); - for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { - for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { - coeff_t block[kBlockSize] = { 0 }; - coeff_t orig_block[kBlockSize] = { 0 }; - for (int c = 0; c < 3; ++c) { - if (comp_mask & (1 << c)) { - assert(img->component(c).factor_x() == factor_x); - assert(img->component(c).factor_y() == factor_y); - img->component(c).GetCoeffBlock(block_x, block_y, - &block[c * kDCTBlockSize]); - const JPEGComponent& comp = jpg.components[c]; - int jpg_block_ix = block_y * comp.width_in_blocks + block_x; - memcpy(&orig_block[c * kDCTBlockSize], - &comp.coeffs[jpg_block_ix * kDCTBlockSize], - kDCTBlockSize * sizeof(orig_block[0])); +void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, + const double target_mul, bool stop_early) +{ + 
const int width = img->width(); + const int height = img->height(); + const int ncomp = jpg.components.size(); + const int last_c = Log2FloorNonZero(comp_mask); + if (static_cast(last_c) >= jpg.components.size()) return; + const int factor_x = img->component(last_c).factor_x(); + const int factor_y = img->component(last_c).factor_y(); + const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; + + + comparator_->StartBlockComparisons(); + + std::vector output_order_gpu; + std::vector output_order_cpu; + + CoeffData * output_order = NULL; + + if (MODE_OPENCL == g_mathMode || MODE_CUDA == g_mathMode) + { +#ifdef __USE_OPENCL__ + ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_; + + channel_info orig_channel[3]; + channel_info mayout_channel[3]; + + for (int c = 0; c < 3; c++) + { + mayout_channel[c].factor = img->component(c).factor_x(); + mayout_channel[c].block_width = img->component(c).width_in_blocks(); + mayout_channel[c].block_height = img->component(c).height_in_blocks(); + mayout_channel[c].coeff = img->component(c).coeffs(); + mayout_channel[c].pixel = img->component(c).pixels(); + + orig_channel[c].factor = jpg.components[c].v_samp_factor; + orig_channel[c].block_width = jpg.components[c].width_in_blocks; + orig_channel[c].block_height = jpg.components[c].height_in_blocks; + orig_channel[c].coeff = jpg.components[c].coeffs.data(); } - } - block_order.clear(); - ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, - factor_y, comp_mask, img, &block_order); - candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); - for (size_t i = 0; i < block_order.size(); ++i) { - candidate_coeffs.push_back(block_order[i].idx); - candidate_coeff_errors.push_back(block_order[i].block_err); - } + output_order_gpu.resize(num_blocks * kBlockSize); + output_order = output_order_gpu.data(); + + if 
(MODE_OPENCL == g_mathMode) + { + clComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); + } +#endif +#ifdef __USE_CUDA__ + else + { + cuComputeBlockZeroingOrder(output_order, + orig_channel, + comp->imgOpsinDynamicsBlockList.data(), + comp->imgMaskXyzScaleBlockList.data(), + width, + height, + mayout_channel, + factor_x, + comp_mask, + comp->BlockErrorLimit()); + } +#endif } - } - comparator_->FinishBlockComparisons(); - candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); +#ifdef __USE_OPENCL__ + if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode) +#else + if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode) +#endif + { + output_order_cpu.resize(num_blocks * kBlockSize); + output_order = output_order_cpu.data(); + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + coeff_t block[kBlockSize] = { 0 }; + coeff_t orig_block[kBlockSize] = { 0 }; + for (int c = 0; c < 3; ++c) { + if (comp_mask & (1 << c)) { + assert(img->component(c).factor_x() == factor_x); + assert(img->component(c).factor_y() == factor_y); + img->component(c).GetCoeffBlock(block_x, block_y, + &block[c * kDCTBlockSize]); + const JPEGComponent& comp = jpg.components[c]; + int jpg_block_ix = block_y * comp.width_in_blocks + block_x; + memcpy(&orig_block[c * kDCTBlockSize], + &comp.coeffs[jpg_block_ix * kDCTBlockSize], + kDCTBlockSize * sizeof(orig_block[0])); + } + } + + std::vector block_order; + ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); + + CoeffData * p = &output_order_cpu[block_ix * kBlockSize]; + for (int i = 0; i < block_order.size(); i++) + { + p[i].idx = block_order[i].idx; + p[i].block_err = 
block_order[i].block_err; + } + } + } + } + +#ifdef __USE_OPENCL__ + if (MODE_CHECKCL == g_mathMode) + { + int count = 0; + int check_size = output_order_gpu.size(); + for (int i = 0; i < check_size; i++) + { + if (output_order_cpu[i].idx != output_order_gpu[i].idx || + fabs(output_order_cpu[i].block_err - output_order_gpu[i].block_err) > 0.001) + { + count++; + } + } + if (count > 0) + { + LogError("CHK %s(%d) %d:%d\r\n", "SelectFrequencyMasking", __LINE__, count, check_size); + } + } +#endif + + std::vector candidate_coeff_offsets(num_blocks + 1); + std::vector candidate_coeffs; + std::vector candidate_coeff_errors; + + for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) { + for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) { + CoeffData * p = &output_order[block_ix * kBlockSize]; + + candidate_coeff_offsets[block_ix] = candidate_coeffs.size(); + for (int i = 0; i < kBlockSize; i++) + { + if (p[i].block_err > 0 && p[i].block_err <= comparator_->BlockErrorLimit()) + { + candidate_coeffs.push_back(p[i].idx); + candidate_coeff_errors.push_back(p[i].block_err); + } + } + } + } + + // + comparator_->FinishBlockComparisons(); + candidate_coeff_offsets[num_blocks] = candidate_coeffs.size(); + + SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early, + candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors); + +} + +void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, + const uint8_t comp_mask, + const double target_mul, + bool stop_early, + std::vector &candidate_coeff_offsets, + std::vector& candidate_coeffs, + std::vector &candidate_coeff_errors) +{ + const int ncomp = jpg.components.size(); + const int width = img->width(); + const int height = img->height(); + const int last_c = Log2FloorNonZero(comp_mask); + if (static_cast(last_c) >= jpg.components.size()) return; + const int factor_x = img->component(last_c).factor_x(); + const int factor_y = img->component(last_c).factor_y(); + 
const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x); + const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y); + const int num_blocks = block_width * block_height; std::vector ac_histograms(ncomp); int jpg_header_size, dc_size; @@ -891,10 +1040,7 @@ bool Process(const Params& params, ProcessStats* stats, } std::vector rgb = DecodeJpegToRGB(jpg); if (rgb.empty()) { - fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported " - "downsampling mode).\nPlease provide the input image as " - "a PNG file.\n"); - return false; + return ProcessUnsupportedJpegData(params,stats,data,jpg_out); } GuetzliOutput out; ProcessStats dummy_stats; @@ -903,15 +1049,62 @@ bool Process(const Params& params, ProcessStats* stats, } std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { +#ifdef __USE_OPENCL__ comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, + new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); +#else + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); +#endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); *jpg_out = out.jpeg_data; return ok; } +bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats, + const std::string& data, + std::string* jpg_out) { +#ifdef __SUPPORT_FULL_JPEG__ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + jpeg_mem_src(&cinfo, (unsigned char*)data.c_str(), data.length()); + + int rc = jpeg_read_header(&cinfo, TRUE); + if (rc != 1) { + fprintf(stderr, "File does not seem to be a normal JPEG\n"); + exit(EXIT_FAILURE); + } + + cinfo.out_color_space = JCS_RGB; //force RGB output + jpeg_start_decompress(&cinfo); + int xsize = cinfo.output_width; + int ysize = cinfo.output_height; + int pixel_size = cinfo.output_components; + unsigned long 
bmp_size = xsize * ysize * pixel_size; + unsigned char *bmp_buffer = (unsigned char*)malloc(bmp_size); + int row_stride = cinfo.output_width * cinfo.output_components; + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray) + ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, 1); + while (cinfo.output_scanline < cinfo.output_height) { + unsigned char *buffer_array[1]; + buffer_array[0] = bmp_buffer + (cinfo.output_scanline) * row_stride; + jpeg_read_scanlines(&cinfo, buffer_array, 1); + } + std::vector temp_rgb(bmp_buffer, bmp_buffer + bmp_size); + return Process(params, stats, temp_rgb, xsize, ysize, jpg_out); +#else + fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported " + "downsampling mode).\nPlease provide the input image as " + "a PNG file.\n"); + return false; +#endif +} + bool Process(const Params& params, ProcessStats* stats, const std::vector& rgb, int w, int h, std::string* jpg_out) { @@ -927,9 +1120,15 @@ bool Process(const Params& params, ProcessStats* stats, } std::unique_ptr comparator; if (jpg.width >= 32 && jpg.height >= 32) { +#ifdef __USE_OPENCL__ comparator.reset( - new ButteraugliComparator(jpg.width, jpg.height, &rgb, + new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb, params.butteraugli_target, stats)); +#else + comparator.reset( + new ButteraugliComparator(jpg.width, jpg.height, &rgb, + params.butteraugli_target, stats)); +#endif } bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats); *jpg_out = out.jpeg_data; diff --git a/guetzli/processor.h b/guetzli/processor.h index 2c543a25..e6cf4ba8 100644 --- a/guetzli/processor.h +++ b/guetzli/processor.h @@ -26,6 +26,11 @@ namespace guetzli { +struct CoeffData { + int idx; + float block_err; +}; + struct Params { float butteraugli_target = 1.0; bool clear_metadata = true; @@ -48,6 +53,9 @@ struct GuetzliOutput { bool ProcessJpegData(const Params& params, const JPEGData& jpg_in, Comparator* comparator, GuetzliOutput* out, ProcessStats* stats); +bool 
ProcessUnsupportedJpegData(const Params& params, + ProcessStats* stats, const std::string& data, + std::string* jpg_out); // Sets *out to a jpeg encoded string that will decode to an image that is // visually indistinguishable from the input rgb image. diff --git a/guetzli_static.make b/guetzli_static.make index d20fb77d..9fe7bf05 100644 --- a/guetzli_static.make +++ b/guetzli_static.make @@ -16,7 +16,7 @@ ifeq ($(config),release) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Release/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -43,7 +43,7 @@ ifeq ($(config),debug) TARGET = $(TARGETDIR)/libguetzli_static.a OBJDIR = obj/Debug/guetzli_static DEFINES += - INCLUDES += -I. -Ithird_party/butteraugli + INCLUDES += -I. 
-Ithird_party/butteraugli -Iclguetzli FORCE_INCLUDE += ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES) ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --static --cflags libpng || libpng-config --static --cflags` @@ -65,6 +65,15 @@ all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET) endif OBJECTS := \ + $(OBJDIR)/clbutter_comparator.o \ + $(OBJDIR)/clguetzli.cl.o \ + $(OBJDIR)/clguetzli.o \ + $(OBJDIR)/clguetzli_test.o \ + $(OBJDIR)/cuguetzli.o \ + $(OBJDIR)/cumem_pool.o \ + $(OBJDIR)/ocl.o \ + $(OBJDIR)/ocu.o \ + $(OBJDIR)/utils.o \ $(OBJDIR)/butteraugli_comparator.o \ $(OBJDIR)/dct_double.o \ $(OBJDIR)/debug_print.o \ @@ -142,6 +151,33 @@ $(GCH): $(PCH) $(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<" endif +$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocl.o: clguetzli/ocl.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/ocu.o: clguetzli/ocu.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) 
$(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" +$(OBJDIR)/utils.o: clguetzli/utils.cpp + @echo $(notdir $<) + $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc @echo $(notdir $<) $(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<" diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj index 02e6b436..3c3bd850 100644 --- a/guetzli_static.vcxproj +++ b/guetzli_static.vcxproj @@ -93,7 +93,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories) Full true true @@ -110,7 +110,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) Full true true @@ -127,7 +127,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories) EditAndContinue Disabled @@ -140,7 +140,7 @@ NotUsing Level3 - .;third_party\butteraugli;%(AdditionalIncludeDirectories) + .;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories) EditAndContinue Disabled diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters index ec134ccc..94654c91 100644 --- a/guetzli_static.vcxproj.filters +++ b/guetzli_static.vcxproj.filters @@ -1,4 +1,4 @@ - + diff --git a/premake5.lua b/premake5.lua index 1a109d7a..cc41301b 100644 --- a/premake5.lua +++ b/premake5.lua @@ -2,7 +2,8 @@ 
workspace "guetzli" configurations { "Release", "Debug" } language "C++" flags { "C++11" } - includedirs { ".", "third_party/butteraugli" } + includedirs { ".", "third_party/butteraugli", "clguetzli" } + libdirs {} filter "action:vs*" platforms { "x86_64", "x86" } @@ -29,7 +30,9 @@ workspace "guetzli" "guetzli/*.cc", "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", - "third_party/butteraugli/butteraugli/butteraugli.h" + "third_party/butteraugli/butteraugli/butteraugli.h", + "clguetzli/*.cpp", + "clguetzli/*.h" } removefiles "guetzli/guetzli.cc" filter "action:gmake" @@ -39,8 +42,10 @@ workspace "guetzli" project "guetzli" kind "ConsoleApp" filter "action:gmake" + --defines { "__USE_OPENCL__", "__USE_CUDA__", "__SUPPORT_FULL_JPEG__" } linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" } buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" } + --links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" } filter "action:vs*" links { "shlwapi" } filter {} @@ -49,5 +54,7 @@ workspace "guetzli" "guetzli/*.cc", "guetzli/*.h", "third_party/butteraugli/butteraugli/butteraugli.cc", - "third_party/butteraugli/butteraugli/butteraugli.h" + "third_party/butteraugli/butteraugli/butteraugli.h", + "clguetzli/*.cpp", + "clguetzli/*.h" } diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc index 4cdc29bb..c32f226c 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.cc +++ b/third_party/butteraugli/butteraugli/butteraugli.cc @@ -40,6 +40,12 @@ #include #include +#ifdef __USE_OPENCL__ +#include "clguetzli/clbutter_comparator.h" +#include "clguetzli/clguetzli.h" +#include "clguetzli/clguetzli_test.h" +#endif + // Restricted pointers speed up Convolution(); MSVC uses a different keyword. 
#ifdef _MSC_VER #define __restrict__ __restrict @@ -59,7 +65,7 @@ inline double DotProduct(const float u[3], const double v[3]) { } // Computes a horizontal convolution and transposes the result. -static void Convolution(size_t xsize, size_t ysize, +void _Convolution(size_t xsize, size_t ysize, size_t xstep, size_t len, size_t offset, const float* __restrict__ multipliers, @@ -91,7 +97,7 @@ static void Convolution(size_t xsize, size_t ysize, } } -void Blur(size_t xsize, size_t ysize, float* channel, double sigma, +void _Blur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio) { PROFILER_FUNC; double m = 2.25; // Accuracy increases when m is increased. @@ -108,17 +114,28 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma, int dxsize = (xsize + xstep - 1) / xstep; int dysize = (ysize + ystep - 1) / ystep; std::vector tmp(dxsize * ysize); +#ifdef __USE_OPENCL__ Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, border_ratio, tmp.data()); +#else + _Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel, + border_ratio, + tmp.data()); +#endif float* output = channel; std::vector downsampled_output; if (xstep > 1) { downsampled_output.resize(dxsize * dysize); output = downsampled_output.data(); } +#ifdef __USE_OPENCL__ Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), border_ratio, output); +#else + _Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(), + border_ratio, output); +#endif if (xstep > 1) { for (size_t y = 0; y < ysize; y++) { for (size_t x = 0; x < xsize; x++) { @@ -771,7 +788,7 @@ ButteraugliComparator::ButteraugliComparator( assert(step <= 4); } -void MaskHighIntensityChange( +void _MaskHighIntensityChange( size_t xsize, size_t ysize, const std::vector > &c0, const std::vector > &c1, @@ -923,7 +940,7 @@ static inline double Gamma(double v) { return GammaPolynomial(static_cast(v)); } -void OpsinDynamicsImage(size_t xsize, size_t 
ysize, +void _OpsinDynamicsImage(size_t xsize, size_t ysize, std::vector > &rgb) { PROFILER_FUNC; std::vector > blurred = rgb; @@ -956,7 +973,7 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize, } } -static void ScaleImage(double scale, std::vector *result) { +void _ScaleImage(double scale, std::vector *result) { PROFILER_FUNC; for (size_t i = 0; i < result->size(); ++i) { (*result)[i] *= static_cast(scale); @@ -965,7 +982,7 @@ static void ScaleImage(double scale, std::vector *result) { // Making a cluster of local errors to be more impactful than // just a single error. -void CalculateDiffmap(const size_t xsize, const size_t ysize, +void _CalculateDiffmap(const size_t xsize, const size_t ysize, const size_t step, std::vector* diffmap) { PROFILER_FUNC; @@ -1018,7 +1035,11 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize, += static_cast(mul1) * blurred[y * (xsize - s) + x]; } } +#ifdef __USE_OPENCL__ ScaleImage(scale, diffmap); +#else + _ScaleImage(scale, diffmap); +#endif } } @@ -1050,7 +1071,11 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage( CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, &result); } +#ifdef __USE_OPENCL__ CalculateDiffmap(xsize_, ysize_, step_, &result); +#else + _CalculateDiffmap(xsize_, ysize_, step_, &result); +#endif } void ButteraugliComparator::BlockDiffMap( @@ -1304,8 +1329,8 @@ double MaskDcB(double delta) { // square_size square with coordinates // x - offset .. x + square_size - offset - 1, // y - offset .. y + square_size - offset - 1. -void MinSquareVal(size_t square_size, size_t offset, - size_t xsize, size_t ysize, +void _MinSquareVal(size_t square_size, size_t offset, + size_t xsize, size_t ysize, float *values) { PROFILER_FUNC; // offset is not negative and smaller than square_size. @@ -1315,9 +1340,19 @@ void MinSquareVal(size_t square_size, size_t offset, const size_t minh = offset > y ? 
0 : y - offset; const size_t maxh = std::min(ysize, y + square_size - offset); for (size_t x = 0; x < xsize; ++x) { +#ifdef __USE_C__ + float min = values[x + minh * xsize]; +#else double min = values[x + minh * xsize]; +#endif for (size_t j = minh + 1; j < maxh; ++j) { +#ifdef __USE_C__ + float tmpf = values[x + j * xsize]; + if (tmpf < min) min = tmpf; +#else min = fmin(min, values[x + j * xsize]); +#endif + } tmp[x + y * xsize] = static_cast(min); } @@ -1328,7 +1363,12 @@ void MinSquareVal(size_t square_size, size_t offset, for (size_t y = 0; y < ysize; ++y) { double min = tmp[minw + y * xsize]; for (size_t j = minw + 1; j < maxw; ++j) { +#ifdef __USE_C__ + float tmpf = tmp[j + y * xsize]; + if (tmpf < min) min = tmpf; +#else min = fmin(min, tmp[j + y * xsize]); +#endif } values[x + y * xsize] = static_cast(min); } @@ -1336,7 +1376,7 @@ void MinSquareVal(size_t square_size, size_t offset, } // ===== Functions used by Mask only ===== -void Average5x5(int xsize, int ysize, std::vector* diffs) { +void _Average5x5(int xsize, int ysize, std::vector* diffs) { PROFILER_FUNC; if (xsize < 4 || ysize < 4) { // TODO: Make this work for small dimensions as well. 
@@ -1347,7 +1387,11 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { std::vector result = *diffs; std::vector tmp0 = *diffs; std::vector tmp1 = *diffs; +#ifdef __USE_OPENCL__ ScaleImage(w, &tmp1); +#else + _ScaleImage(w, &tmp1); +#endif for (int y = 0; y < ysize; y++) { const int row0 = y * xsize; result[row0 + 1] += tmp0[row0]; @@ -1386,10 +1430,14 @@ void Average5x5(int xsize, int ysize, std::vector* diffs) { } } *diffs = result; +#ifdef __USE_OPENCL__ ScaleImage(scale, diffs); +#else + _ScaleImage(scale, diffs); +#endif } -void DiffPrecompute( +void _DiffPrecompute( const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, @@ -1444,7 +1492,7 @@ void DiffPrecompute( } } -void Mask(const std::vector > &xyb0, +void _Mask(const std::vector > &xyb0, const std::vector > &xyb1, size_t xsize, size_t ysize, std::vector > *mask, @@ -1454,6 +1502,7 @@ void Mask(const std::vector > &xyb0, for (int i = 0; i < 3; ++i) { (*mask)[i].resize(xsize * ysize); } +#ifdef __USE_OPENCL__ DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); for (int i = 0; i < 3; ++i) { Average5x5(xsize, ysize, &(*mask)[i]); @@ -1465,6 +1514,19 @@ void Mask(const std::vector > &xyb0, }; Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); } +#else + _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask); + for (int i = 0; i < 3; ++i) { + _Average5x5(xsize, ysize, &(*mask)[i]); + _MinSquareVal(4, 0, xsize, ysize, (*mask)[i].data()); + static const double sigma[3] = { + 9.65781083553, + 14.2644604355, + 4.53358927369, + }; + _Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0); + } +#endif static const double w00 = 232.206464018; static const double w11 = 22.9455222245; static const double w22 = 503.962310606; @@ -1491,10 +1553,17 @@ void Mask(const std::vector > &xyb0, (*mask_dc)[2][idx] = static_cast(MaskDcB(p2)); } } +#ifdef __USE_OPENCL__ for (int i = 0; i < 3; ++i) { ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); ScaleImage(kGlobalScale * kGlobalScale, 
&(*mask_dc)[i]); } +#else + for (int i = 0; i < 3; ++i) { + _ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]); + _ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]); + } +#endif } } // namespace butteraugli diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h index a79cefb2..547fdc58 100644 --- a/third_party/butteraugli/butteraugli/butteraugli.h +++ b/third_party/butteraugli/butteraugli/butteraugli.h @@ -45,33 +45,34 @@ class ButteraugliComparator { // Computes the butteraugli map between xyb0 and xyb1 and updates result. // Both xyb0 and xyb1 are in opsin-dynamics space. - // NOTE: The xyb0 and xyb1 images are mutated by this function in-place. - void DiffmapOpsinDynamicsImage(std::vector> &xyb0, +// NOTE: The xyb0 and xyb1 images are mutated by this function in-place. + virtual void DiffmapOpsinDynamicsImage(std::vector> &xyb0, std::vector> &xyb1, std::vector &result); - - private: - void BlockDiffMap(const std::vector > &rgb0, + int step() { return step_;} + protected: + virtual void BlockDiffMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_dc, std::vector* block_diff_ac); - void EdgeDetectorMap(const std::vector > &rgb0, + virtual void EdgeDetectorMap(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* edge_detector_map); - void EdgeDetectorLowFreq(const std::vector > &rgb0, + virtual void EdgeDetectorLowFreq(const std::vector > &rgb0, const std::vector > &rgb1, std::vector* block_diff_ac); - void CombineChannels(const std::vector >& scale_xyb, + virtual void CombineChannels(const std::vector >& scale_xyb, const std::vector >& scale_xyb_dc, const std::vector& block_diff_dc, const std::vector& block_diff_ac, const std::vector& edge_detector_map, std::vector* result); +protected: const size_t xsize_; const size_t ysize_; const size_t num_pixels_;