git.sesse.net Git - ffmpeg/commitdiff
Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942'
author Clément Bœsch <cboesch@gopro.com>
Tue, 31 Jan 2017 15:50:21 +0000 (16:50 +0100)
committer Clément Bœsch <cboesch@gopro.com>
Tue, 31 Jan 2017 15:53:37 +0000 (16:53 +0100)
* commit 'fca3c3b61952aacc45e9ca54d86a762946c21942':
  hevc: Add AVX2 DC IDCT

Mostly a no-op, as we already have that code.

In the ASM, the code is merged, with the exception of SECTION, which is kept
uppercase for consistency with the rest of the codebase.

Still in the ASM, the prototype comment is fixed to honor the '_' added
by the original commit.

idct_dc_proto() is dropped as it's not used anymore here.

Merged-by: Clément Bœsch <cboesch@gopro.com>
1  2 
libavcodec/x86/hevc_idct.asm
libavcodec/x86/hevcdsp.h
libavcodec/x86/hevcdsp_init.c

index 2edaf9aef154ef0036a527073ce32271145bf6b8,d662aa90ba71fca4d91983ff7e6acc6237766520..33b437c2575cf34ba63f11fe59f3a4202e2dd3cb
@@@ -1,29 -1,30 +1,30 @@@
- ; /*
- ; * SIMD optimized idct functions for HEVC decoding
- ; * Copyright (c) 2014 Pierre-Edouard LEPERE
- ; * Copyright (c) 2014 James Almer
- ; *
- ; * This file is part of FFmpeg.
- ; *
- ; * FFmpeg is free software; you can redistribute it and/or
- ; * modify it under the terms of the GNU Lesser General Public
- ; * License as published by the Free Software Foundation; either
- ; * version 2.1 of the License, or (at your option) any later version.
- ; *
- ; * FFmpeg is distributed in the hope that it will be useful,
- ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
- ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- ; * Lesser General Public License for more details.
- ; *
- ; * You should have received a copy of the GNU Lesser General Public
- ; * License along with FFmpeg; if not, write to the Free Software
- ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ; */
+ ;*******************************************************************************
+ ;* SIMD-optimized IDCT functions for HEVC decoding
+ ;* Copyright (c) 2014 Pierre-Edouard LEPERE
+ ;* Copyright (c) 2014 James Almer
+ ;*
 -;* This file is part of Libav.
++;* This file is part of FFmpeg.
+ ;*
 -;* Libav is free software; you can redistribute it and/or
++;* FFmpeg is free software; you can redistribute it and/or
+ ;* modify it under the terms of the GNU Lesser General Public
+ ;* License as published by the Free Software Foundation; either
+ ;* version 2.1 of the License, or (at your option) any later version.
+ ;*
 -;* Libav is distributed in the hope that it will be useful,
++;* FFmpeg is distributed in the hope that it will be useful,
+ ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ ;* Lesser General Public License for more details.
+ ;*
+ ;* You should have received a copy of the GNU Lesser General Public
 -;* License along with Libav; if not, write to the Free Software
++;* License along with FFmpeg; if not, write to the Free Software
+ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ;******************************************************************************
  %include "libavutil/x86/x86util.asm"
  
 -section .text
 +SECTION .text
  
--; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
++; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
  ; %1 = HxW
  ; %2 = number of loops
  ; %3 = bitdepth
index 3cfdc272cf928603afb2eda87f2ffb17cdf3475e,0000000000000000000000000000000000000000..63a148e69a53b831291a2b8a32f3a795427277df
mode 100644,000000..100644
--- /dev/null
@@@ -1,261 -1,0 +1,258 @@@
- #define idct_dc_proto(size, bitd, opt) \
-                 void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 +/*
 + * HEVC video decoder
 + *
 + * Copyright (C) 2012 - 2013 Guillaume Martres
 + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
 + *
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_X86_HEVCDSP_H
 +#define AVCODEC_X86_HEVCDSP_H
 +
 +#include <stddef.h>
 +#include <stdint.h>
 +
 +
 +#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 +dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
 +
 +
 +#define PEL_PROTOTYPE(name, D, opt) \
 +void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
 +void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// MC functions
 +///////////////////////////////////////////////////////////////////////////////
 +
 +#define EPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##6,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define QPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
 +void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
 +void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
 +
 +#define WEIGHTING_PROTOTYPES(bitd, opt) \
 +        WEIGHTING_PROTOTYPE(2, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(4, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(6, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(8, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(12, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(16, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(24, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(32, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(48, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(64, bitd, opt)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL_PIXELS EPEL_PIXELS
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 12, sse4);
 +
 +void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +
 +void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +
 +
 +
 +void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
 +void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
 +
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// EPEL
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(epel_h ,  8, sse4);
 +EPEL_PROTOTYPES(epel_h , 10, sse4);
 +EPEL_PROTOTYPES(epel_h , 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_v ,  8, sse4);
 +EPEL_PROTOTYPES(epel_v , 10, sse4);
 +EPEL_PROTOTYPES(epel_v , 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_hv ,  8, sse4);
 +EPEL_PROTOTYPES(epel_hv , 10, sse4);
 +EPEL_PROTOTYPES(epel_hv , 12, sse4);
 +
 +PEL_PROTOTYPE(epel_h16, 8, avx2);
 +PEL_PROTOTYPE(epel_h24, 8, avx2);
 +PEL_PROTOTYPE(epel_h32, 8, avx2);
 +PEL_PROTOTYPE(epel_h48, 8, avx2);
 +PEL_PROTOTYPE(epel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_h16,10, avx2);
 +PEL_PROTOTYPE(epel_h24,10, avx2);
 +PEL_PROTOTYPE(epel_h32,10, avx2);
 +PEL_PROTOTYPE(epel_h48,10, avx2);
 +PEL_PROTOTYPE(epel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_v16, 8, avx2);
 +PEL_PROTOTYPE(epel_v24, 8, avx2);
 +PEL_PROTOTYPE(epel_v32, 8, avx2);
 +PEL_PROTOTYPE(epel_v48, 8, avx2);
 +PEL_PROTOTYPE(epel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_v16,10, avx2);
 +PEL_PROTOTYPE(epel_v24,10, avx2);
 +PEL_PROTOTYPE(epel_v32,10, avx2);
 +PEL_PROTOTYPE(epel_v48,10, avx2);
 +PEL_PROTOTYPE(epel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16, 8, avx2);
 +PEL_PROTOTYPE(epel_hv24, 8, avx2);
 +PEL_PROTOTYPE(epel_hv32, 8, avx2);
 +PEL_PROTOTYPE(epel_hv48, 8, avx2);
 +PEL_PROTOTYPE(epel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16,10, avx2);
 +PEL_PROTOTYPE(epel_hv24,10, avx2);
 +PEL_PROTOTYPE(epel_hv32,10, avx2);
 +PEL_PROTOTYPE(epel_hv48,10, avx2);
 +PEL_PROTOTYPE(epel_hv64,10, avx2);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL
 +///////////////////////////////////////////////////////////////////////////////
 +QPEL_PROTOTYPES(qpel_h ,  8, sse4);
 +QPEL_PROTOTYPES(qpel_h , 10, sse4);
 +QPEL_PROTOTYPES(qpel_h , 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_v,  8, sse4);
 +QPEL_PROTOTYPES(qpel_v, 10, sse4);
 +QPEL_PROTOTYPES(qpel_v, 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_hv,  8, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 12, sse4);
 +
 +PEL_PROTOTYPE(qpel_h16, 8, avx2);
 +PEL_PROTOTYPE(qpel_h24, 8, avx2);
 +PEL_PROTOTYPE(qpel_h32, 8, avx2);
 +PEL_PROTOTYPE(qpel_h48, 8, avx2);
 +PEL_PROTOTYPE(qpel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_h16,10, avx2);
 +PEL_PROTOTYPE(qpel_h24,10, avx2);
 +PEL_PROTOTYPE(qpel_h32,10, avx2);
 +PEL_PROTOTYPE(qpel_h48,10, avx2);
 +PEL_PROTOTYPE(qpel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16, 8, avx2);
 +PEL_PROTOTYPE(qpel_v24, 8, avx2);
 +PEL_PROTOTYPE(qpel_v32, 8, avx2);
 +PEL_PROTOTYPE(qpel_v48, 8, avx2);
 +PEL_PROTOTYPE(qpel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16,10, avx2);
 +PEL_PROTOTYPE(qpel_v24,10, avx2);
 +PEL_PROTOTYPE(qpel_v32,10, avx2);
 +PEL_PROTOTYPE(qpel_v48,10, avx2);
 +PEL_PROTOTYPE(qpel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv24, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv32, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv48, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16,10, avx2);
 +PEL_PROTOTYPE(qpel_hv24,10, avx2);
 +PEL_PROTOTYPE(qpel_hv32,10, avx2);
 +PEL_PROTOTYPE(qpel_hv48,10, avx2);
 +PEL_PROTOTYPE(qpel_hv64,10, avx2);
 +
 +WEIGHTING_PROTOTYPES(8, sse4);
 +WEIGHTING_PROTOTYPES(10, sse4);
 +WEIGHTING_PROTOTYPES(12, sse4);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// TRANSFORM_ADD
 +///////////////////////////////////////////////////////////////////////////////
 +void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +#endif // AVCODEC_X86_HEVCDSP_H
index da73d7663809337aec3666fcf001334308a02a5f,1a675ab64d93ae92663fd35b61d5a8739c9ee79b..d16e59d9e7cb34c37841a78c3f847c45cc87b527
@@@ -34,34 -32,43 +34,34 @@@ void ff_hevc_ ## DIR ## _loop_filter_ch
  #define LFL_FUNC(DIR, DEPTH, OPT) \
  void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
  
 -#define LFC_FUNCS(type, depth) \
 -    LFC_FUNC(h, depth, sse2)   \
 -    LFC_FUNC(v, depth, sse2)
 -
 -#define LFL_FUNCS(type, depth) \
 -    LFL_FUNC(h, depth, ssse3)  \
 -    LFL_FUNC(v, depth, ssse3)
 -
 -LFC_FUNCS(uint8_t, 8)
 -LFC_FUNCS(uint8_t, 10)
 -LFL_FUNCS(uint8_t, 8)
 -LFL_FUNCS(uint8_t, 10)
 -
 -#define idct_dc_proto(size, bitd, opt) \
 -                void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 +#define LFC_FUNCS(type, depth, opt) \
 +    LFC_FUNC(h, depth, opt)  \
 +    LFC_FUNC(v, depth, opt)
  
 -idct_dc_proto(4, 8,mmxext);
 -idct_dc_proto(8, 8,mmxext);
 -idct_dc_proto(16,8,  sse2);
 -idct_dc_proto(32,8,  sse2);
 +#define LFL_FUNCS(type, depth, opt) \
 +    LFL_FUNC(h, depth, opt)  \
 +    LFL_FUNC(v, depth, opt)
  
 -idct_dc_proto(32,8,  avx2);
 -
 -idct_dc_proto(4, 10,mmxext);
 -idct_dc_proto(8, 10,  sse2);
 -idct_dc_proto(16,10,  sse2);
 -idct_dc_proto(32,10,  sse2);
 -idct_dc_proto(8, 10,   avx);
 -idct_dc_proto(16,10,   avx);
 -idct_dc_proto(32,10,   avx);
 -
 -idct_dc_proto(16,10,  avx2);
 -idct_dc_proto(32,10,  avx2);
 +LFC_FUNCS(uint8_t,   8, sse2)
 +LFC_FUNCS(uint8_t,  10, sse2)
 +LFC_FUNCS(uint8_t,  12, sse2)
 +LFC_FUNCS(uint8_t,   8, avx)
 +LFC_FUNCS(uint8_t,  10, avx)
 +LFC_FUNCS(uint8_t,  12, avx)
 +LFL_FUNCS(uint8_t,   8, sse2)
 +LFL_FUNCS(uint8_t,  10, sse2)
 +LFL_FUNCS(uint8_t,  12, sse2)
 +LFL_FUNCS(uint8_t,   8, ssse3)
 +LFL_FUNCS(uint8_t,  10, ssse3)
 +LFL_FUNCS(uint8_t,  12, ssse3)
 +LFL_FUNCS(uint8_t,   8, avx)
 +LFL_FUNCS(uint8_t,  10, avx)
 +LFL_FUNCS(uint8_t,  12, avx)
  
  #define IDCT_FUNCS(W, opt) \
- void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
- void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
- void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)
+ void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
 -void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
++void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
++void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
  
  IDCT_FUNCS(4x4,   mmxext);
  IDCT_FUNCS(8x8,   mmxext);
@@@ -696,419 -240,126 +696,419 @@@ void ff_hevc_dsp_init_x86(HEVCDSPContex
  {
      int cpu_flags = av_get_cpu_flags();
  
 -#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
 -    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
 -    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
 -    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
 -    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
 -    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
 -    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
 -    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
 -    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
 -
 -#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
 -    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
 -    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
 -    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
 -    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
 -    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
 -    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
 -
 -#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
 -#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
 -
      if (bit_depth == 8) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+             c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
 +            c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
          }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 +
 +            }
 +            SAO_BAND_INIT(8, sse2);
  
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
+             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
+             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
 -            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
  
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
 +            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
 +            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
 +            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
          }
          if (EXTERNAL_SSSE3(cpu_flags)) {
 -            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
 -            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
 +            if(ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +            }
 +            SAO_EDGE_INIT(8, ssse3);
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
 +        }
 +        if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
 +            }
 +            SAO_BAND_INIT(8, avx);
 +
 +            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
 +            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
 +            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
 +        }
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
 +            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
  
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
 +            }
 +            SAO_BAND_INIT(8, avx2);
 +
 +            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
 +            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
 +            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 +
 +            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
          }
      } else if (bit_depth == 10) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
 +            c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
+             c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
          }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
 +            }
 +            SAO_BAND_INIT(10, sse2);
 +            SAO_EDGE_INIT(10, sse2);
  
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
+             c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
+             c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
  
 -            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
 +            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
 +            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
 +            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
          }
 -    }
 -
 -#if ARCH_X86_64
 -    if (bit_depth == 8) {
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
          }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
  
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
          }
 -
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
 +            }
 +            SAO_BAND_INIT(10, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
          }
 -    } else if (bit_depth == 10) {
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
 +                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
 +                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
 +                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
 +                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
 +                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
 +                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
 +            }
 +            SAO_BAND_INIT(10, avx2);
 +            SAO_EDGE_INIT(10, avx2);
 +
 +            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
 +            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
 +
          }
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
 +    } else if (bit_depth == 12) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
++            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
++            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
 +        }
 +        if (EXTERNAL_SSE2(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 +            }
 +            SAO_BAND_INIT(12, sse2);
 +            SAO_EDGE_INIT(12, sse2);
 +
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
++            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
 +        }
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
 -            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
 -            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
 +            }
 +            SAO_BAND_INIT(12, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
 +
 +            SAO_BAND_INIT(12, avx2);
 +            SAO_EDGE_INIT(12, avx2);
          }
      }
 -#endif /* ARCH_X86_64 */
  }