}
}
-static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, /* shared size x size body; the 4x4/8x8/16x16/32x32 entry points call this */
+ ptrdiff_t stride, int size) /* stride is divided by sizeof(pixel) below; size bounds both loops */
{
int x, y;
pixel *dst = (pixel *)_dst;
stride /= sizeof(pixel);
+ for (y = 0; y < size; y++) { /* NOTE(review): no "dst += stride" is visible between the loops in this hunk; presumably elided diff context - confirm */
+ for (x = 0; x < size; x++) { /* each step stores av_clip_pixel(dst[x] + *coeffs) and advances coeffs */
dst[x] = av_clip_pixel(dst[x] + *coeffs);
coeffs++;
}
}
}
-static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- int x, y;
- pixel *dst = (pixel *)_dst;
-
- stride /= sizeof(pixel);
+ FUNC(transquant_bypass)(_dst, coeffs, stride, 4); /* 4x4 entry point: forwards its arguments with size fixed to 4 */
+}
- for (y = 0; y < 8; y++) {
- for (x = 0; x < 8; x++) {
- dst[x] = av_clip_pixel(dst[x] + *coeffs);
- coeffs++;
- }
- dst += stride;
- }
+static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride)
+{
+ FUNC(transquant_bypass)(_dst, coeffs, stride, 8); /* 8x8 entry point: forwards its arguments with size fixed to 8 */
}
static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- int x, y;
- pixel *dst = (pixel *)_dst;
-
- stride /= sizeof(pixel);
-
- for (y = 0; y < 16; y++) {
- for (x = 0; x < 16; x++) {
- dst[x] = av_clip_pixel(dst[x] + *coeffs);
- coeffs++;
- }
- dst += stride;
- }
+ FUNC(transquant_bypass)(_dst, coeffs, stride, 16); /* 16x16 entry point: the removed inline loops are replaced by the shared helper with size 16 */
}
static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride)
{
- int x, y;
- pixel *dst = (pixel *)_dst;
-
- stride /= sizeof(pixel);
-
- for (y = 0; y < 32; y++) {
- for (x = 0; x < 32; x++) {
- dst[x] = av_clip_pixel(dst[x] + *coeffs);
- coeffs++;
- }
- dst += stride;
- }
+ FUNC(transquant_bypass)(_dst, coeffs, stride, 32); /* 32x32 entry point: the removed inline loops are replaced by the shared helper with size 32 */
}
static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
#undef TR_16
#undef TR_32
-static void FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
- uint8_t *_src, ptrdiff_t _srcstride,
- int width, int height, int16_t* mcbuffer)
+static av_always_inline void
+FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int mx, int my,
+ int16_t* mcbuffer)
{
int x, y;
pixel *src = (pixel *)_src;
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ dststride /= sizeof(*dst);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = src[x] << (14 - BIT_DEPTH);
pixel *src = (pixel*)_src; \
ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
\
+ dststride /= sizeof(*dst); \
for (y = 0; y < height; y++) { \
for (x = 0; x < width; x++) \
dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8); \
pixel *src = (pixel*)_src; \
ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
\
+ dststride /= sizeof(*dst); \
for (y = 0; y < height; y++) { \
for (x = 0; x < width; x++) \
dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8); \
int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]; \
int16_t *tmp = tmp_array; \
\
+ dststride /= sizeof(*dst); \
src -= ff_hevc_qpel_extra_before[V] * srcstride; \
\
for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) { \
PUT_HEVC_QPEL_HV(3, 2)
PUT_HEVC_QPEL_HV(3, 3)
-static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
- uint8_t *_src, ptrdiff_t _srcstride,
- int width, int height, int mx, int my,
- int16_t* mcbuffer)
+#define QPEL(W) /* emits four static qpel entry points (pixels, h, v, hv) whose width is fixed to W */ \
+static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, /* forwards every argument, inserting W as the width */ \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height, \
+ mx, my, mcbuffer); \
+} \
+ \
+static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ if (mx == 1) /* mx selects h1/h2/h3; any value other than 1 or 2 reaches h3; mx and my are not passed on */ \
+ FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else if (mx == 2) \
+ FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else \
+ FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+} \
+ \
+static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ if (my == 1) /* my selects v1/v2/v3; any value other than 1 or 2 reaches v3; mx and my are not passed on */ \
+ FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else if (my == 2) \
+ FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else \
+ FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+} \
+ \
+static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ if (my == 1) { /* outer branch picks the v1/v2/v3 suffix from my, inner branch picks h1/h2/h3 from mx */ \
+ if (mx == 1) \
+ FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else if (mx == 2) \
+ FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else \
+ FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ } else if (my == 2) { \
+ if (mx == 1) \
+ FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else if (mx == 2) \
+ FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else \
+ FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ } else { \
+ if (mx == 1) \
+ FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else if (mx == 2) \
+ FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ else \
+ FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
+ } \
+}
+
+QPEL(64) /* instantiated below for widths 64, 48, 32, 24, 16, 12, 8 and 4 */
+QPEL(48)
+QPEL(32)
+QPEL(24)
+QPEL(16)
+QPEL(12)
+QPEL(8)
+QPEL(4)
+
+static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int mx, int my,
+ int16_t* mcbuffer)
{
int x, y;
pixel *src = (pixel *)_src;
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+ dststride /= sizeof(*dst);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = src[x] << (14 - BIT_DEPTH);
filter_2 * src[x + stride] + \
filter_3 * src[x + 2 * stride])
-static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
- uint8_t *_src, ptrdiff_t _srcstride,
- int width, int height, int mx, int my,
- int16_t* mcbuffer)
+static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride, /* applies EPEL_FILTER along src with a tap step of 1, using coefficient row mx - 1 */
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int mx, int my,
+ int16_t* mcbuffer)
{
int x, y;
pixel *src = (pixel *)_src;
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+ const int16_t *filter = ff_hevc_epel_coeffs[mx - 1]; /* NOTE(review): the table is now int16_t but filter_0..filter_3 below are still int8_t - confirm every coefficient fits */
int8_t filter_0 = filter[0];
int8_t filter_1 = filter[1];
int8_t filter_2 = filter[2];
int8_t filter_3 = filter[3];
+ dststride /= sizeof(*dst); /* converts dststride from bytes to int16_t elements */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
}
}
-static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
- uint8_t *_src, ptrdiff_t _srcstride,
- int width, int height, int mx, int my,
- int16_t* mcbuffer)
+static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride, /* applies EPEL_FILTER along src with a tap step of srcstride, using coefficient row my - 1 */
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int mx, int my,
+ int16_t* mcbuffer)
{
int x, y;
pixel *src = (pixel *)_src;
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[my - 1];
+ const int16_t *filter = ff_hevc_epel_coeffs[my - 1]; /* NOTE(review): the table is now int16_t but filter_0..filter_3 below are still int8_t - confirm every coefficient fits */
int8_t filter_0 = filter[0];
int8_t filter_1 = filter[1];
int8_t filter_2 = filter[2];
int8_t filter_3 = filter[3];
+ dststride /= sizeof(*dst); /* converts dststride from bytes to int16_t elements */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
}
}
-static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
- uint8_t *_src, ptrdiff_t _srcstride,
- int width, int height, int mx, int my,
- int16_t* mcbuffer)
+static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride, /* uses coefficient rows mx - 1 and my - 1 and an on-stack tmp_array; most of the body is not visible in this hunk */
+ uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int mx, int my,
+ int16_t* mcbuffer)
{
int x, y;
pixel *src = (pixel *)_src;
ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
- const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
+ const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1]; /* NOTE(review): the table is now int16_t but the filter_N locals below are still int8_t - confirm every coefficient fits */
+ const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
int8_t filter_0 = filter_h[0];
int8_t filter_1 = filter_h[1];
int8_t filter_2 = filter_h[2];
int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
int16_t *tmp = tmp_array;
+ dststride /= sizeof(*dst); /* converts dststride from bytes to int16_t elements */
src -= EPEL_EXTRA_BEFORE * srcstride;
for (y = 0; y < height + EPEL_EXTRA; y++) {
}
}
-static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
- int16_t *src, ptrdiff_t srcstride,
- int width, int height)
+#define EPEL(W) /* emits four static epel entry points (pixels, h, v, hv) that forward all arguments, inserting W as the width */ \
+static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride, \
+ W, height, mx, my, mcbuffer); \
+} \
+static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride, \
+ W, height, mx, my, mcbuffer); \
+} \
+static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride, \
+ W, height, mx, my, mcbuffer); \
+} \
+static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int height, int mx, int my, \
+ int16_t *mcbuffer) \
+{ \
+ FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride, \
+ W, height, mx, my, mcbuffer); \
+}
+
+EPEL(32) /* instantiated below for widths 32, 24, 16, 12, 8, 6, 4 and 2 */
+EPEL(24)
+EPEL(16)
+EPEL(12)
+EPEL(8)
+EPEL(6)
+EPEL(4)
+EPEL(2)
+
+static av_always_inline void /* stores av_clip_pixel((src[x] + offset) >> shift) for each x in each row; PUT_PRED(w) wrappers pass w as width */
+FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
+ int16_t *src, ptrdiff_t srcstride,
+ int width, int height)
{
int x, y;
pixel *dst = (pixel *)_dst;
#else
int offset = 0;
#endif
+ srcstride /= sizeof(*src); /* converts srcstride from bytes to int16_t elements */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((src[x] + offset) >> shift);
}
}
-static void FUNC(put_weighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
- int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride,
- int width, int height)
+static av_always_inline void /* renamed from put_weighted_pred_avg; stores av_clip_pixel((src1[x] + src2[x] + offset) >> shift) */
+FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
+ int16_t *src1, int16_t *src2,
+ ptrdiff_t srcstride,
+ int width, int height)
{
int x, y;
pixel *dst = (pixel *)_dst;
int offset = 0;
#endif
+ srcstride /= sizeof(*src1); /* converts srcstride from bytes to int16_t elements */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
}
}
-static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
- uint8_t *_dst, ptrdiff_t _dststride,
- int16_t *src, ptrdiff_t srcstride,
- int width, int height)
+static av_always_inline void /* signature unchanged apart from av_always_inline; the per-pixel body is not visible in this hunk */
+FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
+ uint8_t *_dst, ptrdiff_t _dststride,
+ int16_t *src, ptrdiff_t srcstride,
+ int width, int height)
{
int shift, log2Wd, wx, ox, x, y, offset;
pixel *dst = (pixel *)_dst;
wx = wlxFlag;
ox = olxFlag * (1 << (BIT_DEPTH - 8));
+ srcstride /= sizeof(*src); /* converts srcstride from bytes to int16_t elements */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
if (log2Wd >= 1) {
}
}
-static void FUNC(weighted_pred_avg)(uint8_t denom,
- int16_t wl0Flag, int16_t wl1Flag,
- int16_t ol0Flag, int16_t ol1Flag,
- uint8_t *_dst, ptrdiff_t _dststride,
- int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride,
- int width, int height)
+static av_always_inline void /* signature unchanged apart from av_always_inline; the store expression is cut off in this hunk */
+FUNC(weighted_pred_avg)(uint8_t denom,
+ int16_t wl0Flag, int16_t wl1Flag,
+ int16_t ol0Flag, int16_t ol1Flag,
+ uint8_t *_dst, ptrdiff_t _dststride,
+ int16_t *src1, int16_t *src2,
+ ptrdiff_t srcstride,
+ int width, int height)
{
int shift, log2Wd, w0, w1, o0, o1, x, y;
pixel *dst = (pixel *)_dst;
o0 = ol0Flag * (1 << (BIT_DEPTH - 8));
o1 = ol1Flag * (1 << (BIT_DEPTH - 8));
+ srcstride /= sizeof(*src1); /* converts srcstride from bytes to int16_t elements */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
}
}
+#define PUT_PRED(w) /* emits four static prediction-store entry points that forward all arguments, inserting w as the width */ \
+static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride, \
+ int16_t *src, ptrdiff_t srcstride, \
+ int height) \
+{ \
+ FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height); \
+} \
+static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride, \
+ int16_t *src1, int16_t *src2, \
+ ptrdiff_t srcstride, int height) \
+{ \
+ FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height); \
+} \
+static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset, /* forwards to FUNC(weighted_pred), which carries no put_ prefix */ \
+ uint8_t *dst, ptrdiff_t dststride, \
+ int16_t *src, ptrdiff_t srcstride, int height) \
+{ \
+ FUNC(weighted_pred)(denom, weight, offset, \
+ dst, dststride, src, srcstride, w, height); \
+} \
+static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, /* forwards to FUNC(weighted_pred_avg), which carries no put_ prefix */ \
+ int16_t offset0, int16_t offset1, \
+ uint8_t *dst, ptrdiff_t dststride, \
+ int16_t *src1, int16_t *src2, \
+ ptrdiff_t srcstride, int height) \
+{ \
+ FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1, \
+ dst, dststride, src1, src2, srcstride, w, height); \
+}
+
+PUT_PRED(64) /* instantiated below for widths 64, 48, 32, 24, 16, 12, 8, 6, 4 and 2 */
+PUT_PRED(48)
+PUT_PRED(32)
+PUT_PRED(24)
+PUT_PRED(16)
+PUT_PRED(12)
+PUT_PRED(8)
+PUT_PRED(6)
+PUT_PRED(4)
+PUT_PRED(2)
+
// line zero
#define P3 pix[-4 * xstride]
#define P2 pix[-3 * xstride]
static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
ptrdiff_t _xstride, ptrdiff_t _ystride,
- int *_beta, int *_tc,
+ int beta, int *_tc, /* beta is now one value for both j iterations; _tc, _no_p and _no_q are still indexed by j */
uint8_t *_no_p, uint8_t *_no_q)
{
int d, j;
ptrdiff_t xstride = _xstride / sizeof(pixel);
ptrdiff_t ystride = _ystride / sizeof(pixel);
+ beta <<= BIT_DEPTH - 8; /* scaled once before the loop; previously recomputed per j from _beta[j] */
+
for (j = 0; j < 2; j++) {
const int dp0 = abs(P2 - 2 * P1 + P0);
const int dq0 = abs(Q2 - 2 * Q1 + Q0);
const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
const int d0 = dp0 + dq0;
const int d3 = dp3 + dq3;
- const int beta = _beta[j] << (BIT_DEPTH - 8);
const int tc = _tc[j] << (BIT_DEPTH - 8);
const int no_p = _no_p[j];
const int no_q = _no_q[j];
}
static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
- int *beta, int *tc, uint8_t *no_p,
+ int beta, int *tc, uint8_t *no_p,
uint8_t *no_q)
{
FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
}
static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
- int *beta, int *tc, uint8_t *no_p,
+ int beta, int *tc, uint8_t *no_p,
uint8_t *no_q)
{
FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,