]> git.sesse.net Git - ffmpeg/blobdiff - libavcodec/hevcdsp_template.c
nvenc: Generate AUD NAL units for better compatibility
[ffmpeg] / libavcodec / hevcdsp_template.c
index f02e1751969b32bd452f89b289441bff08fb1432..31a2e7ab6f6e5be8739aba9961d685e601f2e8dd 100644 (file)
@@ -40,16 +40,16 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
     }
 }
 
-static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride)
+static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
+                                                     ptrdiff_t stride, int size)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
 
     stride /= sizeof(pixel);
 
-    for (y = 0; y < 4; y++) {
-        for (x = 0; x < 4; x++) {
+    for (y = 0; y < size; y++) {
+        for (x = 0; x < size; x++) {
             dst[x] = av_clip_pixel(dst[x] + *coeffs);
             coeffs++;
         }
@@ -57,55 +57,28 @@ static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
     }
 }
 
-static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride)
 {
-    int x, y;
-    pixel *dst = (pixel *)_dst;
-
-    stride /= sizeof(pixel);
+    FUNC(transquant_bypass)(_dst, coeffs, stride, 4); /* 4x4 case: delegate to the size-parameterised helper */
+}
 
-    for (y = 0; y < 8; y++) {
-        for (x = 0; x < 8; x++) {
-            dst[x] = av_clip_pixel(dst[x] + *coeffs);
-            coeffs++;
-        }
-        dst += stride;
-    }
+static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride)
+{
+    FUNC(transquant_bypass)(_dst, coeffs, stride, 8); /* 8x8 case: delegate to the size-parameterised helper */
 }
 
 static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs,
                                          ptrdiff_t stride)
 {
-    int x, y;
-    pixel *dst = (pixel *)_dst;
-
-    stride /= sizeof(pixel);
-
-    for (y = 0; y < 16; y++) {
-        for (x = 0; x < 16; x++) {
-            dst[x] = av_clip_pixel(dst[x] + *coeffs);
-            coeffs++;
-        }
-        dst += stride;
-    }
+    FUNC(transquant_bypass)(_dst, coeffs, stride, 16); /* 16x16 case: delegate to the size-parameterised helper */
 }
 
 static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs,
                                          ptrdiff_t stride)
 {
-    int x, y;
-    pixel *dst = (pixel *)_dst;
-
-    stride /= sizeof(pixel);
-
-    for (y = 0; y < 32; y++) {
-        for (x = 0; x < 32; x++) {
-            dst[x] = av_clip_pixel(dst[x] + *coeffs);
-            coeffs++;
-        }
-        dst += stride;
-    }
+    FUNC(transquant_bypass)(_dst, coeffs, stride, 32); /* 32x32 case: delegate to the size-parameterised helper */
 }
 
 static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
@@ -802,14 +775,17 @@ static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
 #undef TR_16
 #undef TR_32
 
-static void FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                                       uint8_t *_src, ptrdiff_t _srcstride,
-                                       int width, int height, int16_t* mcbuffer)
+static av_always_inline void
+FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
+                           uint8_t *_src, ptrdiff_t _srcstride,
+                           int width, int height, int mx, int my,
+                           int16_t* mcbuffer)
 {
     int x, y;
     pixel *src          = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 
+    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = src[x] << (14 - BIT_DEPTH);
@@ -857,6 +833,7 @@ static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
     pixel *src = (pixel*)_src;                                                 \
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
                                                                                \
+    dststride /= sizeof(*dst);                                                 \
     for (y = 0; y < height; y++) {                                             \
         for (x = 0; x < width; x++)                                            \
             dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
@@ -875,6 +852,7 @@ static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
     pixel *src = (pixel*)_src;                                                 \
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
                                                                                \
+    dststride /= sizeof(*dst);                                                 \
     for (y = 0; y < height; y++)  {                                            \
         for (x = 0; x < width; x++)                                            \
             dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8);     \
@@ -898,6 +876,7 @@ static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
     int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
     int16_t *tmp = tmp_array;                                                  \
                                                                                \
+    dststride /= sizeof(*dst);                                                 \
     src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
                                                                                \
     for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
@@ -933,15 +912,90 @@ PUT_HEVC_QPEL_HV(3, 1)
 PUT_HEVC_QPEL_HV(3, 2)
 PUT_HEVC_QPEL_HV(3, 3)
 
-static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                                       uint8_t *_src, ptrdiff_t _srcstride,
-                                       int width, int height, int mx, int my,
-                                       int16_t* mcbuffer)
+#define QPEL(W) /* Emit the four fixed-width qpel wrappers for block width W. */            \
+static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride,             \
+                                             uint8_t *src, ptrdiff_t srcstride,             \
+                                             int height, int mx, int my,                    \
+                                             int16_t *mcbuffer)                             \
+{ /* Forward to put_hevc_qpel_pixels with the width fixed to W. */                          \
+    FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height,                   \
+                               mx, my, mcbuffer);                                           \
+}                                                                                           \
+                                                                                            \
+static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
+                                        uint8_t *src, ptrdiff_t srcstride,                  \
+                                        int height, int mx, int my,                         \
+                                        int16_t *mcbuffer)                                  \
+{ /* Pick the h1/h2/h3 variant from mx; any mx other than 1 or 2 uses h3. */                \
+    if (mx == 1)                                                                            \
+        FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
+    else if (mx == 2)                                                                       \
+        FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
+    else                                                                                    \
+        FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
+}                                                                                           \
+                                                                                            \
+static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
+                                             uint8_t *src, ptrdiff_t srcstride,             \
+                                             int height, int mx, int my,                    \
+                                             int16_t *mcbuffer)                             \
+{ /* Pick the v1/v2/v3 variant from my; any my other than 1 or 2 uses v3. */                \
+    if (my == 1)                                                                            \
+        FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
+    else if (my == 2)                                                                       \
+        FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
+    else                                                                                    \
+        FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
+}                                                                                           \
+                                                                                            \
+static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,                 \
+                                             uint8_t *src, ptrdiff_t srcstride,             \
+                                             int height, int mx, int my,                    \
+                                             int16_t *mcbuffer)                             \
+{ /* Outer test picks the vN group from my, inner test the hM variant from mx. */           \
+    if (my == 1) {                                                                          \
+        if (mx == 1)                                                                        \
+            FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+        else if (mx == 2)                                                                   \
+            FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+        else                                                                                \
+            FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+    } else if (my == 2) {                                                                   \
+        if (mx == 1)                                                                        \
+            FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+        else if (mx == 2)                                                                   \
+            FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+        else                                                                                \
+            FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+    } else {                                                                                \
+        if (mx == 1)                                                                        \
+            FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+        else if (mx == 2)                                                                   \
+            FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+        else                                                                                \
+            FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
+    }                                                                                       \
+}
+
+QPEL(64) /* instantiate the wrapper set once per width listed here */
+QPEL(48)
+QPEL(32)
+QPEL(24)
+QPEL(16)
+QPEL(12)
+QPEL(8)
+QPEL(4)
+
+static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
+                                              uint8_t *_src, ptrdiff_t _srcstride,
+                                              int width, int height, int mx, int my,
+                                              int16_t* mcbuffer)
 {
     int x, y;
     pixel *src          = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 
+    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = src[x] << (14 - BIT_DEPTH);
@@ -956,19 +1010,20 @@ static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
      filter_2 * src[x + stride] +               \
      filter_3 * src[x + 2 * stride])
 
-static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
-                                  uint8_t *_src, ptrdiff_t _srcstride,
-                                  int width, int height, int mx, int my,
-                                  int16_t* mcbuffer)
+static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
+                                         uint8_t *_src, ptrdiff_t _srcstride,
+                                         int width, int height, int mx, int my,
+                                         int16_t* mcbuffer)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
     int8_t filter_3 = filter[3];
+    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
@@ -977,20 +1032,21 @@ static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
     }
 }
 
-static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
-                                  uint8_t *_src, ptrdiff_t _srcstride,
-                                  int width, int height, int mx, int my,
-                                  int16_t* mcbuffer)
+static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
+                                         uint8_t *_src, ptrdiff_t _srcstride,
+                                         int width, int height, int mx, int my,
+                                         int16_t* mcbuffer)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
     int8_t filter_3 = filter[3];
 
+    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
@@ -999,16 +1055,16 @@ static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
     }
 }
 
-static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
-                                   uint8_t *_src, ptrdiff_t _srcstride,
-                                   int width, int height, int mx, int my,
-                                   int16_t* mcbuffer)
+static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
+                                          uint8_t *_src, ptrdiff_t _srcstride,
+                                          int width, int height, int mx, int my,
+                                          int16_t* mcbuffer)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
-    const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
+    const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter_h[0];
     int8_t filter_1 = filter_h[1];
     int8_t filter_2 = filter_h[2];
@@ -1016,6 +1072,7 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
     int16_t *tmp = tmp_array;
 
+    dststride /= sizeof(*dst);
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
@@ -1038,9 +1095,53 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     }
 }
 
-static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
-                                      int16_t *src, ptrdiff_t srcstride,
-                                      int width, int height)
+#define EPEL(W) /* Emit the four fixed-width epel wrappers for width W. */      \
+static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+                                             uint8_t *src, ptrdiff_t srcstride, \
+                                             int height, int mx, int my,        \
+                                             int16_t *mcbuffer)                 \
+{ /* Forward to put_hevc_epel_pixels with the width fixed to W. */              \
+    FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride,                  \
+                               W, height, mx, my, mcbuffer);                    \
+}                                                                               \
+static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
+                                        uint8_t *src, ptrdiff_t srcstride,      \
+                                        int height, int mx, int my,             \
+                                        int16_t *mcbuffer)                      \
+{ /* Forward to put_hevc_epel_h with the width fixed to W. */                   \
+    FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride,                       \
+                          W, height, mx, my, mcbuffer);                         \
+}                                                                               \
+static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
+                                        uint8_t *src, ptrdiff_t srcstride,      \
+                                        int height, int mx, int my,             \
+                                        int16_t *mcbuffer)                      \
+{ /* Forward to put_hevc_epel_v with the width fixed to W. */                   \
+    FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride,                       \
+                          W, height, mx, my, mcbuffer);                         \
+}                                                                               \
+static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,     \
+                                         uint8_t *src, ptrdiff_t srcstride,     \
+                                         int height, int mx, int my,            \
+                                         int16_t *mcbuffer)                     \
+{ /* Forward to put_hevc_epel_hv with the width fixed to W. */                  \
+    FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride,                      \
+                           W, height, mx, my, mcbuffer);                        \
+}
+
+EPEL(32) /* instantiate the wrapper set once per width listed here */
+EPEL(24)
+EPEL(16)
+EPEL(12)
+EPEL(8)
+EPEL(6)
+EPEL(4)
+EPEL(2)
+
+static av_always_inline void
+FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
+                          int16_t *src, ptrdiff_t srcstride,
+                          int width, int height)
 {
     int x, y;
     pixel *dst          = (pixel *)_dst;
@@ -1052,6 +1153,7 @@ static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
 #else
     int offset = 0;
 #endif
+    srcstride /= sizeof(*src);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((src[x] + offset) >> shift);
@@ -1060,10 +1162,11 @@ static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
     }
 }
 
-static void FUNC(put_weighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
-                                        int16_t *src1, int16_t *src2,
-                                        ptrdiff_t srcstride,
-                                        int width, int height)
+static av_always_inline void
+FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
+                              int16_t *src1, int16_t *src2,
+                              ptrdiff_t srcstride,
+                              int width, int height)
 {
     int x, y;
     pixel *dst          = (pixel *)_dst;
@@ -1076,6 +1179,7 @@ static void FUNC(put_weighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
     int offset = 0;
 #endif
 
+    srcstride /= sizeof(*src1);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
@@ -1085,10 +1189,11 @@ static void FUNC(put_weighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
     }
 }
 
-static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                                uint8_t *_dst, ptrdiff_t _dststride,
-                                int16_t *src, ptrdiff_t srcstride,
-                                int width, int height)
+static av_always_inline void
+FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
+                    uint8_t *_dst, ptrdiff_t _dststride,
+                    int16_t *src, ptrdiff_t srcstride,
+                    int width, int height)
 {
     int shift, log2Wd, wx, ox, x, y, offset;
     pixel *dst          = (pixel *)_dst;
@@ -1100,6 +1205,7 @@ static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
     wx     = wlxFlag;
     ox     = olxFlag * (1 << (BIT_DEPTH - 8));
 
+    srcstride /= sizeof(*src);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
             if (log2Wd >= 1) {
@@ -1113,13 +1219,14 @@ static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
     }
 }
 
-static void FUNC(weighted_pred_avg)(uint8_t denom,
-                                    int16_t wl0Flag, int16_t wl1Flag,
-                                    int16_t ol0Flag, int16_t ol1Flag,
-                                    uint8_t *_dst, ptrdiff_t _dststride,
-                                    int16_t *src1, int16_t *src2,
-                                    ptrdiff_t srcstride,
-                                    int width, int height)
+static av_always_inline void
+FUNC(weighted_pred_avg)(uint8_t denom,
+                        int16_t wl0Flag, int16_t wl1Flag,
+                        int16_t ol0Flag, int16_t ol1Flag,
+                        uint8_t *_dst, ptrdiff_t _dststride,
+                        int16_t *src1, int16_t *src2,
+                        ptrdiff_t srcstride,
+                        int width, int height)
 {
     int shift, log2Wd, w0, w1, o0, o1, x, y;
     pixel *dst = (pixel *)_dst;
@@ -1132,6 +1239,7 @@ static void FUNC(weighted_pred_avg)(uint8_t denom,
     o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
     o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
 
+    srcstride /= sizeof(*src1);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
@@ -1142,6 +1250,47 @@ static void FUNC(weighted_pred_avg)(uint8_t denom,
     }
 }
 
+#define PUT_PRED(w) /* Emit the four fixed-width prediction wrappers for width w. */           \
+static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride,                 \
+                                            int16_t *src, ptrdiff_t srcstride,                 \
+                                            int height)                                        \
+{ /* Forward to put_unweighted_pred with the width fixed to w. */                              \
+    FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height);                      \
+}                                                                                              \
+static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride,             \
+                                                int16_t *src1, int16_t *src2,                  \
+                                                ptrdiff_t srcstride, int height)               \
+{ /* Forward to put_unweighted_pred_avg with the width fixed to w. */                          \
+    FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height);           \
+}                                                                                              \
+static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset,       \
+                                          uint8_t *dst, ptrdiff_t dststride,                   \
+                                          int16_t *src, ptrdiff_t srcstride, int height)       \
+{ /* Forward to weighted_pred with the width fixed to w. */                                    \
+    FUNC(weighted_pred)(denom, weight, offset,                                                 \
+                        dst, dststride, src, srcstride, w, height);                            \
+}                                                                                              \
+static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \
+                                              int16_t offset0, int16_t offset1,                \
+                                              uint8_t *dst, ptrdiff_t dststride,               \
+                                              int16_t *src1, int16_t *src2,                    \
+                                              ptrdiff_t srcstride, int height)                 \
+{ /* Forward to weighted_pred_avg with the width fixed to w. */                                \
+    FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1,                         \
+                            dst, dststride, src1, src2, srcstride, w, height);                 \
+}
+
+PUT_PRED(64) /* instantiate the wrapper set once per width listed here */
+PUT_PRED(48)
+PUT_PRED(32)
+PUT_PRED(24)
+PUT_PRED(16)
+PUT_PRED(12)
+PUT_PRED(8)
+PUT_PRED(6)
+PUT_PRED(4)
+PUT_PRED(2)
+
 // line zero
 #define P3 pix[-4 * xstride]
 #define P2 pix[-3 * xstride]
@@ -1164,7 +1313,7 @@ static void FUNC(weighted_pred_avg)(uint8_t denom,
 
 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
-                                        int *_beta, int *_tc,
+                                        int beta, int *_tc,
                                         uint8_t *_no_p, uint8_t *_no_q)
 {
     int d, j;
@@ -1172,6 +1321,8 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
     ptrdiff_t xstride = _xstride / sizeof(pixel);
     ptrdiff_t ystride = _ystride / sizeof(pixel);
 
+    beta <<= BIT_DEPTH - 8;
+
     for (j = 0; j < 2; j++) {
         const int dp0  = abs(P2  - 2 * P1  + P0);
         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
@@ -1179,7 +1330,6 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
         const int d0   = dp0 + dq0;
         const int d3   = dp3 + dq3;
-        const int beta = _beta[j] << (BIT_DEPTH - 8);
         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
         const int no_p = _no_p[j];
         const int no_q = _no_q[j];
@@ -1306,7 +1456,7 @@ static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
 }
 
 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int *beta, int *tc, uint8_t *no_p,
+                                          int beta, int *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
@@ -1314,7 +1464,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 }
 
 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int *beta, int *tc, uint8_t *no_p,
+                                          int beta, int *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,