From 4f3050560b9eb470ce117bdd635fd2fecb245e7b Mon Sep 17 00:00:00 2001
From: arch1t3cht <arch1t3cht@gmail.com>
Date: Thu, 30 Mar 2023 18:14:44 +0200
Subject: [PATCH 1/3] Add an ignore_mask argument to ignore certain source
 pixels

In practice, the resampling process is not always entirely linear: After
rescaling, pixel values might be clamped to their value range. For
example, when a white image is resampled with zero-padding and the
kernel has negative lobes, the second and third pixel lines from the
edge will be clamped.

When descaling, these clamped pixels give wrong information, so they
will negatively affect the descale result. In the above example, they
will cause dirty borders in the descaled image.

One way to fix this problem is to simply exclude from the linear system
of equations any pixels that are suspected of having been clamped. Since
the system of equations is overdetermined, this can still give an
accurate descale result, as long as no long contiguous stretches of
pixels are discarded.

This commit adds an ignore_mask argument to the descale functions.
Pixels selected by this mask will have their corresponding equations
dropped from the system of equations. This requires partially
recomputing the matrix and its LDLT decomposition whenever this mask
changes, but this is still reasonably efficient since the matrix is
banded.

The ignore_mask is supported only when descaling along a single axis:
When descaling along both axes, a second mask at an intermediate
resolution would be required for the second descale, so that case is
best left to the user.
---
 include/descale.h      |   4 +-
 src/avsplugin.c        |   8 +--
 src/descale.c          | 151 ++++++++++++++++++++++++++++++++++++++---
 src/vsplugin.c         |  79 +++++++++++++++++++--
 src/x86/descale_avx2.c |   2 +-
 src/x86/descale_avx2.h |   2 +-
 6 files changed, 222 insertions(+), 24 deletions(-)

diff --git a/include/descale.h b/include/descale.h
index 2677db3..1101f01 100644
--- a/include/descale.h
+++ b/include/descale.h
@@ -76,6 +76,7 @@ typedef struct DescaleParams
     double param2;      // required if mode is BICUBIC
     double shift;       // optional
     double active_dim;  // always required; usually equal to dst_dim
+    int has_ignore_mask;
     enum DescaleBorder border_handling;        // optional
     struct DescaleCustomKernel custom_kernel;  // required if mode is CUSTOM
 } DescaleParams;
@@ -90,6 +91,7 @@ typedef struct DescaleCore
     float **lower;
     float *diagonal;
     float *weights;
+    double *multiplied_weights;
     int *weights_left_idx;
     int *weights_right_idx;
     int weights_columns;
@@ -101,7 +103,7 @@ typedef struct DescaleAPI
     struct DescaleCore *(*create_core)(int src_dim, int dst_dim, struct DescaleParams *params);
     void (*free_core)(struct DescaleCore *core);
     void (*process_vectors)(struct DescaleCore *core, enum DescaleDir dir, int vector_count,
-                            int src_stride, int dst_stride, const float *srcp, float *dstp);
+                            int src_stride, int imask_stride, int dst_stride, const float *srcp, const unsigned char *imaskp, float *dstp);
 } DescaleAPI;
 
 
diff --git a/src/avsplugin.c b/src/avsplugin.c
index ac897c1..69da585 100644
--- a/src/avsplugin.c
+++ b/src/avsplugin.c
@@ -76,16 +76,16 @@ static AVS_VideoFrame * AVSC_CC avs_descale_get_frame(AVS_FilterInfo *fi, int n)
             int intermediate_stride = avs_get_pitch_p(dst, plane);
             float *intermediatep = avs_pool_allocate(fi->env, intermediate_stride * d->dd.src_height * sizeof (float), 32, AVS_ALLOCTYPE_POOLED_ALLOC);
 
-            d->dd.dsapi.process_vectors(d->dd.dscore_h[i && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (i ? d->dd.subsampling_v : 0), src_stride, intermediate_stride, srcp, intermediatep);
-            d->dd.dsapi.process_vectors(d->dd.dscore_v[i && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.dst_width >> (i ? d->dd.subsampling_h : 0), intermediate_stride, dst_stride, intermediatep, dstp);
+            d->dd.dsapi.process_vectors(d->dd.dscore_h[i && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (i ? d->dd.subsampling_v : 0), src_stride, 0, intermediate_stride, srcp, NULL, intermediatep);
+            d->dd.dsapi.process_vectors(d->dd.dscore_v[i && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.dst_width >> (i ? d->dd.subsampling_h : 0), intermediate_stride, 0, dst_stride, intermediatep, NULL, dstp);
 
             avs_pool_free(fi->env, intermediatep);
 
         } else if (d->dd.process_h) {
-            d->dd.dsapi.process_vectors(d->dd.dscore_h[i && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (i ? d->dd.subsampling_v : 0), src_stride, dst_stride, srcp, dstp);
+            d->dd.dsapi.process_vectors(d->dd.dscore_h[i && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (i ? d->dd.subsampling_v : 0), src_stride, 0, dst_stride, srcp, NULL, dstp);
 
         } else if (d->dd.process_v) {
-            d->dd.dsapi.process_vectors(d->dd.dscore_v[i && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.src_width >> (i ? d->dd.subsampling_h : 0), src_stride, dst_stride, srcp, dstp);
+            d->dd.dsapi.process_vectors(d->dd.dscore_v[i && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.src_width >> (i ? d->dd.subsampling_h : 0), src_stride, 0, dst_stride, srcp, NULL, dstp);
         }
     }
 
diff --git a/src/descale.c b/src/descale.c
index 8ba3ec7..694c7bc 100644
--- a/src/descale.c
+++ b/src/descale.c
@@ -25,6 +25,7 @@
 #include <math.h>
 #include <stdbool.h>
 #include <stdlib.h>
+#include <string.h>
 #include "common.h"
 #include "descale.h"
 
@@ -531,11 +532,127 @@ static void process_plane_v_c(int height, int current_height, int current_width,
     }
 }
 
+static inline int check_imask(unsigned char value) {
+    return value >= 128;
+}
+
+static void process_plane_masked(int dst_dim, int src_dim, int vector_count, enum DescaleDir dir, int bandwidth, int * restrict weights_left_idx, int * restrict weights_right_idx,
+                              int weights_columns, float * restrict weights, double * restrict multiplied_weights,
+                              int src_stride, int imask_stride, int dst_stride, const float * restrict srcp, const unsigned char * restrict imaskp, float * restrict dstp)
+{
+    double *modified_ldlt = calloc(dst_dim * bandwidth, sizeof (double));
+    int c = bandwidth / 2;
+
+    int imuls = dir == DESCALE_DIR_HORIZONTAL ? src_stride : 1;
+    int jmuls = dir == DESCALE_DIR_HORIZONTAL ? 1 : src_stride;
+
+    int imuli = dir == DESCALE_DIR_HORIZONTAL ? imask_stride : 1;
+    int jmuli = dir == DESCALE_DIR_HORIZONTAL ? 1 : imask_stride;
+
+    int imuld = dir == DESCALE_DIR_HORIZONTAL ? dst_stride : 1;
+    int jmuld = dir == DESCALE_DIR_HORIZONTAL ? 1 : dst_stride;
+
+    double eps = DBL_EPSILON;
+
+    for (int i = 0; i < vector_count; i++) {
+
+        int same_mask = i > 0;
+        if (i > 0) {
+            for (int j = 0; j < src_dim; j++) {
+                if (check_imask(imaskp[i * imuli + j * jmuli]) != check_imask(imaskp[(i - 1) * imuli + j * jmuli])) {
+                    same_mask = false;
+                    break;
+                }
+            }
+        }
+
+        if (!same_mask) {
+            int imask_start = 0;
+
+            for (int j = 0; j < src_dim; j++) {
+                imask_start = j;
+                if (check_imask(imaskp[i * imuli + j * jmuli]))
+                    break;
+            }
+
+            // Restore the ldlt to the original multiplied weights
+            memcpy(modified_ldlt, multiplied_weights, dst_dim * bandwidth * sizeof (double));
+
+            // Subtract the multiplied masked weights to obtain the new matrix:
+            // M' P M = M' M - M' (I - P) M
+            for (int j = imask_start; j < src_dim; j++) {
+                if (!check_imask(imaskp[i * imuli + j * jmuli]))
+                    continue;
+                for (int r = 0; r < dst_dim; r++) {
+                    if (j < weights_left_idx[r] || j >= weights_right_idx[r]) continue;
+                    for (int s = r; s < dst_dim; s++) {
+                        if (j < weights_left_idx[s] || j >= weights_right_idx[s]) continue;
+                        modified_ldlt[r * bandwidth + s - r] -= weights[r * weights_columns + j - weights_left_idx[r]] * weights[s * weights_columns + j - weights_left_idx[s]];
+                    }
+                }
+            }
+
+            // Now, redo the LDLT decomposition
+            for (int i = 0; i < dst_dim; i++) {
+                int end = DSMIN(c + 1, dst_dim - i);
+
+                for (int j = 1; j < end; j++) {
+                    double d = modified_ldlt[i * bandwidth + j] / (modified_ldlt[i * bandwidth] + eps);
+
+                    for (int k = 0; k < end - j; k++) {
+                        modified_ldlt[(i + j) * bandwidth + k] -= d * modified_ldlt[i * bandwidth + j + k];;
+                    }
+                }
+
+                double e = 1.0 / (modified_ldlt[i * bandwidth] + eps);
+                for (int j = 1; j < end; j++) {
+                    modified_ldlt[i * bandwidth + j] *= e;
+                }
+            }
+        }
+
+        // Now we can do the usual forward/backward substitution
+        for (int j = 0; j < dst_dim; j++) {
+            float sum = 0.0f;
+            int start = DSMAX(0, j - c);
+
+            // A' b
+            for (int k = weights_left_idx[j]; k < weights_right_idx[j]; ++k)
+                sum += weights[j * weights_columns + k - weights_left_idx[j]] * srcp[i * imuls + k * jmuls] * (1 - check_imask(imaskp[i * imuli + k * jmuli]));
+
+            // Solve LD y = A' b
+            for (int k = start; k < j; k++) {
+                sum -= modified_ldlt[k * bandwidth + j - k] * modified_ldlt[k * bandwidth] * dstp[i * imuld + k * jmuld];
+            }
+
+            dstp[i * imuld + j * jmuld] = sum / (eps + modified_ldlt[j * bandwidth]);
+        }
+
+        // Solve L' x = y
+        for (int j = dst_dim - 2; j >= 0; j--) {
+            float sum = 0.0f;
+            int start = DSMIN(dst_dim - 1, j + c);
+
+            for (int k = start; k > j; k--) {
+                sum += modified_ldlt[j * bandwidth + k - j] * dstp[i * imuld + k * jmuld];
+            }
+
+            dstp[i * imuld + j * jmuld] -= sum;
+        }
+    }
+
+    free(modified_ldlt);
+}
+
 
 static void descale_process_vectors_c(struct DescaleCore *core, enum DescaleDir dir, int vector_count,
-                                      int src_stride, int dst_stride, const float *srcp, float *dstp)
+                                      int src_stride, int imask_stride, int dst_stride, const float *srcp, const unsigned char *imaskp, float *dstp)
 {
-    if (dir == DESCALE_DIR_HORIZONTAL) {
+
+    if (imaskp) {
+        process_plane_masked(core->dst_dim, core->src_dim, vector_count, dir, core->bandwidth, core->weights_left_idx, core->weights_right_idx,
+                             core->weights_columns, core->weights, core->multiplied_weights, src_stride, imask_stride, dst_stride, srcp, imaskp, dstp);
+    } else if (dir == DESCALE_DIR_HORIZONTAL) {
         if (core->bandwidth == 3)
             process_plane_h_b3_c(core->dst_dim, core->src_dim, vector_count, core->bandwidth, core->weights_left_idx, core->weights_right_idx,
                                  core->weights_columns, core->weights, core->lower, core->upper, core->diagonal, src_stride, dst_stride, srcp, dstp);
@@ -592,7 +709,7 @@ static struct DescaleCore *create_core(int src_dim, int dst_dim, struct DescaleP
     double *weights;
     double *transposed_weights;
     double *multiplied_weights;
-    double *lower;
+    double *ldlt;
 
     scaling_weights(params->mode, support, dst_dim, src_dim, params->param1, params->param2, params->shift, params->active_dim, params->border_handling, &params->custom_kernel, &weights);
     transpose_matrix(src_dim, dst_dim, weights, &transposed_weights);
@@ -615,9 +732,10 @@ static struct DescaleCore *create_core(int src_dim, int dst_dim, struct DescaleP
     }
 
     multiply_sparse_matrices(dst_dim, src_dim, core.weights_left_idx, core.weights_right_idx, transposed_weights, weights, &multiplied_weights);
-    banded_ldlt_decomposition(dst_dim, core.bandwidth, multiplied_weights);
-    transpose_matrix(dst_dim, dst_dim, multiplied_weights, &lower);
-    multiply_banded_matrix_with_diagonal(dst_dim, core.bandwidth, lower);
+
+    ldlt = calloc(dst_dim * dst_dim, sizeof (double));
+    memcpy(ldlt, multiplied_weights, dst_dim * dst_dim * sizeof (double));
+    banded_ldlt_decomposition(dst_dim, core.bandwidth, ldlt);
 
     int max = 0;
     for (int i = 0; i < dst_dim; i++) {
@@ -633,12 +751,24 @@ static struct DescaleCore *create_core(int src_dim, int dst_dim, struct DescaleP
         }
     }
 
-    extract_compressed_lower_upper_diagonal(dst_dim, core.bandwidth, lower, multiplied_weights, &core.lower, &core.upper, &core.diagonal);
-
+    if (params->has_ignore_mask) {
+        core.multiplied_weights = calloc(dst_dim * core.bandwidth, sizeof (double));
+        for (int i = 0; i < dst_dim; i++) {
+            for (int j = 0; j < core.bandwidth; j++) {
+                core.multiplied_weights[i * core.bandwidth + j] = multiplied_weights[i * dst_dim + i + j];
+            }
+        }
+    } else {
+        double *lower;
+        transpose_matrix(dst_dim, dst_dim, ldlt, &lower);
+        multiply_banded_matrix_with_diagonal(dst_dim, core.bandwidth, lower);
+        extract_compressed_lower_upper_diagonal(dst_dim, core.bandwidth, lower, ldlt, &core.lower, &core.upper, &core.diagonal);
+        free(lower);
+    }
     free(weights);
     free(transposed_weights);
     free(multiplied_weights);
-    free(lower);
+    free(ldlt);
 
     struct DescaleCore *corep = malloc(sizeof core);
     *corep = core;
@@ -652,8 +782,9 @@ static void free_core(struct DescaleCore *core)
     free(core->weights);
     free(core->weights_left_idx);
     free(core->weights_right_idx);
+    free(core->multiplied_weights);
     free(core->diagonal);
-    for (int i = 0; i < core->bandwidth / 2; i++) {
+    for (int i = 0; core->upper && i < core->bandwidth / 2; i++) {
         free(core->lower[i]);
         free(core->upper[i]);
     }
diff --git a/src/vsplugin.c b/src/vsplugin.c
index d43e370..3d08cdd 100644
--- a/src/vsplugin.c
+++ b/src/vsplugin.c
@@ -38,6 +38,7 @@ struct VSDescaleData
     pthread_mutex_t lock;
 
     VSNode *node;
+    VSNode *ignore_mask_node;
     VSVideoInfo vi;
 
     struct DescaleData dd;
@@ -57,9 +58,10 @@ static const VSFrame *VS_CC descale_get_frame(int n, int activation_reason, void
 
     if (activation_reason == arInitial) {
         vsapi->requestFrameFilter(n, d->node, frame_ctx);
+        if (d->ignore_mask_node)
+            vsapi->requestFrameFilter(n, d->ignore_mask_node, frame_ctx);
 
     } else if (activation_reason == arAllFramesReady) {
-
         if (!d->initialized) {
             pthread_mutex_lock(&d->lock);
             if (!d->initialized) {
@@ -71,6 +73,9 @@ static const VSFrame *VS_CC descale_get_frame(int n, int activation_reason, void
 
         const VSVideoFormat fmt = d->vi.format;
         const VSFrame *src = vsapi->getFrameFilter(n, d->node, frame_ctx);
+        const VSFrame *ignore_mask = NULL;
+        if (d->ignore_mask_node)
+            ignore_mask = vsapi->getFrameFilter(n, d->ignore_mask_node, frame_ctx);
 
         VSFrame *intermediate = vsapi->newVideoFrame(&fmt, d->dd.dst_width, d->dd.src_height, NULL, core);
         VSFrame *dst = vsapi->newVideoFrame(&fmt, d->dd.dst_width, d->dd.dst_height, src, core);
@@ -81,23 +86,31 @@ static const VSFrame *VS_CC descale_get_frame(int n, int activation_reason, void
             const float *srcp = (const float *)vsapi->getReadPtr(src, plane);
             float *dstp = (float *)vsapi->getWritePtr(dst, plane);
 
+            int imask_stride = 0;
+            const unsigned char *imaskp = NULL;
+            if (ignore_mask) {
+                imask_stride = vsapi->getStride(ignore_mask, plane);
+                imaskp = vsapi->getReadPtr(ignore_mask, plane);
+            }
+
             if (d->dd.process_h && d->dd.process_v) {
                 int intermediate_stride = vsapi->getStride(intermediate, plane) / sizeof (float);
                 float *intermediatep = (float *)vsapi->getWritePtr(intermediate, plane);
 
-                d->dd.dsapi.process_vectors(d->dd.dscore_h[plane && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (plane ? d->dd.subsampling_v : 0), src_stride, intermediate_stride, srcp, intermediatep);
-                d->dd.dsapi.process_vectors(d->dd.dscore_v[plane && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.dst_width >> (plane ? d->dd.subsampling_h : 0), intermediate_stride, dst_stride, intermediatep, dstp);
+                d->dd.dsapi.process_vectors(d->dd.dscore_h[plane && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (plane ? d->dd.subsampling_v : 0), src_stride, 0, intermediate_stride, srcp, NULL, intermediatep);
+                d->dd.dsapi.process_vectors(d->dd.dscore_v[plane && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.dst_width >> (plane ? d->dd.subsampling_h : 0), intermediate_stride, 0, dst_stride, intermediatep, NULL, dstp);
 
             } else if (d->dd.process_h) {
-                d->dd.dsapi.process_vectors(d->dd.dscore_h[plane && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (plane ? d->dd.subsampling_v : 0), src_stride, dst_stride, srcp, dstp);
+                d->dd.dsapi.process_vectors(d->dd.dscore_h[plane && d->dd.subsampling_h], DESCALE_DIR_HORIZONTAL, d->dd.src_height >> (plane ? d->dd.subsampling_v : 0), src_stride, imask_stride, dst_stride, srcp, imaskp, dstp);
 
             } else if (d->dd.process_v) {
-                d->dd.dsapi.process_vectors(d->dd.dscore_v[plane && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.src_width >> (plane ? d->dd.subsampling_h : 0), src_stride, dst_stride, srcp, dstp);
+                d->dd.dsapi.process_vectors(d->dd.dscore_v[plane && d->dd.subsampling_v], DESCALE_DIR_VERTICAL, d->dd.src_width >> (plane ? d->dd.subsampling_h : 0), src_stride, imask_stride, dst_stride, srcp, imaskp, dstp);
             }
         }
 
         vsapi->freeFrame(intermediate);
         vsapi->freeFrame(src);
+        vsapi->freeFrame(ignore_mask);
 
         return dst;
     }
@@ -111,6 +124,7 @@ static void VS_CC descale_free(void *instance_data, VSCore *core, const VSAPI *v
     struct VSDescaleData *d = (struct VSDescaleData *)instance_data;
 
     vsapi->freeNode(d->node);
+    vsapi->freeNode(d->ignore_mask_node);
 
     if (d->initialized) {
         if (d->dd.process_h) {
@@ -247,6 +261,31 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
 
     int err;
 
+    d.ignore_mask_node = vsapi->mapGetNode(in, "ignore_mask", 0, &err);
+    if (err) {
+        d.ignore_mask_node = NULL;
+    } else {
+        params.has_ignore_mask = 1;
+        const VSVideoInfo *mvi = vsapi->getVideoInfo(d.ignore_mask_node);
+        if (mvi->format.sampleType != stInteger || mvi->format.bitsPerSample != 8) {
+            vsapi->mapSetError(out, "Descale: Ignore mask must use 8 bit integer samples.");    // TODO improve this?
+            vsapi->freeNode(d.node);
+            vsapi->freeNode(d.ignore_mask_node);
+            return;
+        }
+        if (mvi->format.numPlanes != d.vi.format.numPlanes
+                || mvi->format.subSamplingH != d.vi.format.subSamplingH
+                || mvi->format.subSamplingW != d.vi.format.subSamplingW
+                || mvi->width != d.dd.src_width
+                || mvi->height != d.dd.src_height
+                || mvi->numFrames != d.vi.numFrames) {
+            vsapi->mapSetError(out, "Descale: Ignore mask format must match clip format.");    // TODO improve this?
+            vsapi->freeNode(d.node);
+            vsapi->freeNode(d.ignore_mask_node);
+            return;
+        }
+    }
+
     d.dd.shift_h = vsapi->mapGetFloat(in, "src_left", 0, &err);
     if (err)
         d.dd.shift_h = 0.0;
@@ -284,21 +323,27 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
     else
         opt_enum = DESCALE_OPT_AUTO;
 
+    if (d.ignore_mask_node)
+        opt_enum = DESCALE_OPT_NONE;
+
     if (d.dd.dst_width < 1) {
         vsapi->mapSetError(out, "Descale: width must be greater than 0.");
         vsapi->freeNode(d.node);
+        vsapi->freeNode(d.ignore_mask_node);
         return;
     }
 
     if (d.dd.dst_height < 8) {
         vsapi->mapSetError(out, "Descale: Output height must be greater than or equal to 8.");
         vsapi->freeNode(d.node);
+        vsapi->freeNode(d.ignore_mask_node);
         return;
     }
 
     if (d.dd.dst_width > d.dd.src_width || d.dd.dst_height > d.dd.src_height) {
         vsapi->mapSetError(out, "Descale: Output dimension must be less than or equal to input dimension.");
         vsapi->freeNode(d.node);
+        vsapi->freeNode(d.ignore_mask_node);
         return;
     }
 
@@ -337,6 +382,7 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
             vsapi->freeFunction(custom_kernel);
             free(params.custom_kernel.user_data);
             vsapi->freeNode(d.node);
+            vsapi->freeNode(d.ignore_mask_node);
             return;
 
         } else if (err) {
@@ -346,6 +392,7 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
         if (params.taps < 1) {
             vsapi->mapSetError(out, "Descale: taps must be greater than 0.");
             vsapi->freeNode(d.node);
+            vsapi->freeNode(d.ignore_mask_node);
             return;
         }
 
@@ -378,6 +425,14 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
     if (!d.dd.process_h && !d.dd.process_v) {
         vsapi->mapSetNode(out, "clip", d.node, maReplace);
         vsapi->freeNode(d.node);
+        vsapi->freeNode(d.ignore_mask_node);
+        return;
+    }
+
+    if (d.dd.process_h && d.dd.process_v && d.ignore_mask_node) {
+        vsapi->mapSetError(out, "Descale: Ignore mask is not supported when descaling along both axes.");
+        vsapi->freeNode(d.node);
+        vsapi->freeNode(d.ignore_mask_node);
         return;
     }
 
@@ -434,8 +489,11 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
         vsapi->mapSetInt(map1, "force_h", force_h, maReplace);
         vsapi->mapSetInt(map1, "force_v", force_v, maReplace);
         vsapi->mapSetInt(map1, "opt", (int)opt_enum, maReplace);
+        if (d.ignore_mask_node)
+            vsapi->mapSetNode(map1, "ignore_mask", d.ignore_mask_node, maReplace);
         map2 = vsapi->invoke(descale_plugin, "Descale", map1);
         vsapi->freeNode(tmp_node);
+        vsapi->freeNode(d.ignore_mask_node);
         vsapi->freeMap(map1);
         if ((err_msg = vsapi->mapGetError(map2))) {
             vsapi->mapSetError(out, err_msg);
@@ -476,8 +534,8 @@ static void VS_CC descale_create(const VSMap *in, VSMap *out, void *user_data, V
     struct VSDescaleData *data = malloc(sizeof d);
     *data = d;
     data->dd.params = params;
-    VSFilterDependency deps[] = {{data->node, rpStrictSpatial}};
-    vsapi->createVideoFilter(out, funcname, &data->vi, descale_get_frame, descale_free, fmParallel, deps, 1, data, core);
+    VSFilterDependency deps[] = {{data->node, rpStrictSpatial}, {data->ignore_mask_node, rpStrictSpatial}};
+    vsapi->createVideoFilter(out, funcname, &data->vi, descale_get_frame, descale_free, fmParallel, deps, data->ignore_mask_node ? 2 : 1, data, core);
 }
 
 
@@ -494,6 +552,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
@@ -512,6 +571,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
@@ -529,6 +589,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
@@ -545,6 +606,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
@@ -561,6 +623,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
@@ -577,6 +640,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
@@ -598,6 +662,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI
             "src_width:float:opt;"
             "src_height:float:opt;"
             "border_handling:int:opt;"
+            "ignore_mask:vnode:opt;"
             "force:int:opt;"
             "force_h:int:opt;"
             "force_v:int:opt;"
diff --git a/src/x86/descale_avx2.c b/src/x86/descale_avx2.c
index d9a807b..937cb4f 100644
--- a/src/x86/descale_avx2.c
+++ b/src/x86/descale_avx2.c
@@ -745,7 +745,7 @@ static void process_plane_v_avx2(int height, int current_height, int current_wid
 
 
 void descale_process_vectors_avx2(struct DescaleCore *core, enum DescaleDir dir, int vector_count,
-                                  int src_stride, int dst_stride, const float *srcp, float *dstp)
+                                  int src_stride, int imask_stride, int dst_stride, const float *srcp, const unsigned char *imaskp, float *dstp)
 {
     if (dir == DESCALE_DIR_HORIZONTAL) {
         float *temp;
diff --git a/src/x86/descale_avx2.h b/src/x86/descale_avx2.h
index 44e245e..2f53d28 100644
--- a/src/x86/descale_avx2.h
+++ b/src/x86/descale_avx2.h
@@ -31,7 +31,7 @@
 
 
 void descale_process_vectors_avx2(struct DescaleCore *core, enum DescaleDir dir, int vector_count,
-                                  int src_stride, int dst_stride, const float *srcp, float *dstp);
+                                  int src_stride, int imask_stride, int dst_stride, const float *srcp, const unsigned char *imaskp, float *dstp);
 
 
 #endif  // DESCALE_AVX2_H

From 86477740ba48b43d9ea7a02515aaceb264cebc8e Mon Sep 17 00:00:00 2001
From: arch1t3cht <arch1t3cht@gmail.com>
Date: Sat, 1 Apr 2023 22:05:57 +0200
Subject: [PATCH 2/3] Store weights column positions to optimize matrix
 multiplication in masked descale

---
 include/descale.h |  2 ++
 src/descale.c     | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/include/descale.h b/include/descale.h
index 1101f01..989fc90 100644
--- a/include/descale.h
+++ b/include/descale.h
@@ -94,6 +94,8 @@ typedef struct DescaleCore
     double *multiplied_weights;
     int *weights_left_idx;
     int *weights_right_idx;
+    int *weights_top_idx;
+    int *weights_bot_idx;
     int weights_columns;
 } DescaleCore;
 
diff --git a/src/descale.c b/src/descale.c
index 694c7bc..7f55e79 100644
--- a/src/descale.c
+++ b/src/descale.c
@@ -536,7 +536,8 @@ static inline int check_imask(unsigned char value) {
     return value >= 128;
 }
 
-static void process_plane_masked(int dst_dim, int src_dim, int vector_count, enum DescaleDir dir, int bandwidth, int * restrict weights_left_idx, int * restrict weights_right_idx,
+static void process_plane_masked(int dst_dim, int src_dim, int vector_count, enum DescaleDir dir, int bandwidth,
+                              int * restrict weights_left_idx, int * restrict weights_right_idx, int * restrict weights_top_idx, int * restrict weights_bot_idx,
                               int weights_columns, float * restrict weights, double * restrict multiplied_weights,
                               int src_stride, int imask_stride, int dst_stride, const float * restrict srcp, const unsigned char * restrict imaskp, float * restrict dstp)
 {
@@ -583,11 +584,12 @@ static void process_plane_masked(int dst_dim, int src_dim, int vector_count, enu
             for (int j = imask_start; j < src_dim; j++) {
                 if (!check_imask(imaskp[i * imuli + j * jmuli]))
                     continue;
-                for (int r = 0; r < dst_dim; r++) {
-                    if (j < weights_left_idx[r] || j >= weights_right_idx[r]) continue;
-                    for (int s = r; s < dst_dim; s++) {
-                        if (j < weights_left_idx[s] || j >= weights_right_idx[s]) continue;
-                        modified_ldlt[r * bandwidth + s - r] -= weights[r * weights_columns + j - weights_left_idx[r]] * weights[s * weights_columns + j - weights_left_idx[s]];
+                int top = weights_top_idx[j];
+                int bot = weights_bot_idx[j];
+                for (int r = top; r < bot; r++) {
+                    double wr = weights[r * weights_columns + j - weights_left_idx[r]];
+                    for (int s = r; s < bot; s++) {
+                        modified_ldlt[r * bandwidth + s - r] -= wr * weights[s * weights_columns + j - weights_left_idx[s]];
                     }
                 }
             }
@@ -650,7 +652,8 @@ static void descale_process_vectors_c(struct DescaleCore *core, enum DescaleDir
 {
 
     if (imaskp) {
-        process_plane_masked(core->dst_dim, core->src_dim, vector_count, dir, core->bandwidth, core->weights_left_idx, core->weights_right_idx,
+        process_plane_masked(core->dst_dim, core->src_dim, vector_count, dir, core->bandwidth,
+                             core->weights_left_idx, core->weights_right_idx, core->weights_top_idx, core->weights_bot_idx,
                              core->weights_columns, core->weights, core->multiplied_weights, src_stride, imask_stride, dst_stride, srcp, imaskp, dstp);
     } else if (dir == DESCALE_DIR_HORIZONTAL) {
         if (core->bandwidth == 3)
@@ -716,6 +719,8 @@ static struct DescaleCore *create_core(int src_dim, int dst_dim, struct DescaleP
 
     core.weights_left_idx = calloc(ceil_n(dst_dim, 8), sizeof (int));
     core.weights_right_idx = calloc(ceil_n(dst_dim, 8), sizeof (int));
+    core.weights_top_idx = calloc(ceil_n(src_dim, 8), sizeof (int));
+    core.weights_bot_idx = calloc(ceil_n(src_dim, 8), sizeof (int));
     for (int i = 0; i < dst_dim; i++) {
         for (int j = 0; j < src_dim; j++) {
             if (transposed_weights[i * src_dim + j] != 0.0) {
@@ -730,6 +735,20 @@ static struct DescaleCore *create_core(int src_dim, int dst_dim, struct DescaleP
             }
         }
     }
+    for (int i = 0; i < src_dim; i++) {
+        for (int j = 0; j < dst_dim; j++) {
+            if (transposed_weights[j * src_dim + i] != 0.0) {
+                core.weights_top_idx[i] = j;
+                break;
+            }
+        }
+        for (int j = dst_dim - 1; j >= 0; j--) {
+            if (transposed_weights[j * src_dim + i] != 0.0) {
+                core.weights_bot_idx[i] = j + 1;
+                break;
+            }
+        }
+    }
 
     multiply_sparse_matrices(dst_dim, src_dim, core.weights_left_idx, core.weights_right_idx, transposed_weights, weights, &multiplied_weights);
 
@@ -782,6 +801,8 @@ static void free_core(struct DescaleCore *core)
     free(core->weights);
     free(core->weights_left_idx);
     free(core->weights_right_idx);
+    free(core->weights_top_idx);
+    free(core->weights_bot_idx);
     free(core->multiplied_weights);
     free(core->diagonal);
     for (int i = 0; core->upper && i < core->bandwidth / 2; i++) {

From 370d8222a2e3612194c84b82ed355f84efd40777 Mon Sep 17 00:00:00 2001
From: arch1t3cht <arch1t3cht@gmail.com>
Date: Tue, 9 May 2023 14:34:44 +0200
Subject: [PATCH 3/3] Add ignore_mask parameter to README

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 0e54efd..3502896 100644
--- a/README.md
+++ b/README.md
@@ -10,23 +10,23 @@ The VapourSynth plugin itself supports every constant input format. If the forma
 The included python wrapper, contrary to using the plugin directly, doesn't descale the chroma planes but scales them normally with `Spline36`.
 
 ```
-descale.Debilinear(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Debilinear(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 
-descale.Debicubic(clip src, int width, int height, float b=0.0, float c=0.5, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Debicubic(clip src, int width, int height, float b=0.0, float c=0.5, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 
-descale.Delanczos(clip src, int width, int height, int taps=3, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Delanczos(clip src, int width, int height, int taps=3, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 
-descale.Despline16(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Despline16(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 
-descale.Despline36(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Despline36(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 
-descale.Despline64(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Despline64(clip src, int width, int height, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 
-descale.Descale(clip src, int width, int height, str kernel, func custom_kernel, int taps=3, float b=0.0, float c=0.0, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
+descale.Descale(clip src, int width, int height, str kernel, func custom_kernel, int taps=3, float b=0.0, float c=0.0, float src_left=0.0, float src_top=0.0, float src_width=width, float src_height=height, int border_handling=0, clip ignore_mask=None, bool force=false, bool force_h=false, bool force_v=false, int opt=0)
 ```
 
 The AviSynth+ plugin is used similarly, but without the `descale` namespace.
-Custom kernels are only supported in the VapourSynth plugin.
+Custom kernels and ignore masks are only supported in the VapourSynth plugin.
 
 ### Custom kernels