1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3 ;*****************************************************************************
4 ;* Copyright (C) 2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
31 pb_0: times 32 db 0 ; we do not use cextern here as old llvm-gcc fails to align it correctly
33 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 1023 = largest 10-bit pixel value (clipping bound)
35 pad10: times 8 dw 10*1023 ; bias constants (n * max_pixel) -- presumably added so that
36 pad20: times 8 dw 20*1023 ;   intermediate filter sums stay in unsigned range; removed
37 pad30: times 8 dw 30*1023 ;   again by the depad* constants below -- TODO(review): confirm
38 depad: times 4 dd 32*20*1023 + 512 ; bias removal plus rounding term (512) for the dword path
39 depad2: times 8 dw 20*1023 + 16*1022 + 16 ; bias removal plus rounding for the word path
40 unpad: times 8 dw 16*1022/32 ; needs to be mod 16
42 tap1: times 4 dw 1, -5 ; H.264 6-tap filter coeffs (1,-5,20,20,-5,1): leading pair
43 tap2: times 4 dw 20, 20 ; center pair of the 6-tap filter
44 tap3: times 4 dw -5, 1 ; trailing pair (mirror of tap1)
45 pd_0f: times 4 dd 0xffff ; per-dword mask selecting the low 16 bits
68 psubw %1, %2 ; (a-b)/4-b
69 paddw %1, %3 ; (a-b)/4-b+c
70 psraw %1, 2 ; ((a-b)/4-b+c)/4
71 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
93 FILT_H %1, %7, %8, [pw_16]
95 CLIPW %1, [pb_0], [pw_pixel_max]
105 %define OP_MOV AVG_MOV
114 cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
115 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
120 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
125 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
128 lea r0, [r0+r2*%3+%3*2]
129 lea r1, [r1+r2*%3+%3*2]
130 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
133 cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
137 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
139 lea r1, [r %+ p1+%3*2]
140 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
142 lea r1, [r %+ p1+r2*%3]
143 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
144 lea r0, [r%6+r2*%3+%3*2]
145 lea r1, [r %+ p1+r2*%3+%3*2]
146 %if UNIX64 == 0 ; fall through to function
147 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
153 ;cpu, put/avg, mc, 4/8, ...
156 %if ARCH_X86_32 || cpuflag(sse2)
157 MCAxA_OP %1, %2, %3, i, %4,%5,%6
160 cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
161 %if UNIX64 == 0 ; no prologue or epilogue for UNIX64
162 call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
166 stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
169 ;-----------------------------------------------------------------------------
170 ; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
171 ;-----------------------------------------------------------------------------
185 cglobal_mc %1, mc00, 4, 3,4,0
191 cglobal %1_h264_qpel8_mc00_10, 3,4
199 cglobal %1_h264_qpel16_mc00_10, 3,4
209 OP_MOV [r0+r2+16], m1
220 %define OP_MOV AVG_MOV
223 ;-----------------------------------------------------------------------------
224 ; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
225 ;-----------------------------------------------------------------------------
230 INIT_XMM sse2, cache64
232 INIT_XMM ssse3, cache64
237 %define OP_MOV AVG_MOV
240 INIT_XMM sse2, cache64
242 INIT_XMM ssse3, cache64
249 cglobal_mc %1, mc20, %2, 3,4,9
251 mova m1, [pw_pixel_max]
266 %else ; movu is slow on these processors
274 PALIGNR m3, m0, m2, 2, m5
275 PALIGNR m7, m0, m2, 8, m5
277 PALIGNR m4, m0, m2, 4, m5
278 PALIGNR m7, m0, m2, 6, m5
284 PALIGNR m3, m6, m2, 2, m5
286 PALIGNR m4, m6, m2, 4, m5
287 PALIGNR m7, m6, m2, 6, m5
293 FILT_H m2, m3, m4, p16
307 ;-----------------------------------------------------------------------------
308 ; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
309 ;-----------------------------------------------------------------------------
311 cglobal_mc %1, mc30, %2, 3,5,9
313 jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
318 ;-----------------------------------------------------------------------------
319 ; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
320 ;-----------------------------------------------------------------------------
322 cglobal_mc %1, mc10, %2, 3,5,9
326 mova m1, [pw_pixel_max]
341 %else ; movu is slow on these processors
349 PALIGNR m3, m0, m2, 2, m5
350 PALIGNR m7, m0, m2, 8, m5
352 PALIGNR m4, m0, m2, 4, m5
353 PALIGNR m7, m0, m2, 6, m5
359 PALIGNR m3, m6, m2, 2, m5
361 PALIGNR m4, m6, m2, 4, m5
362 PALIGNR m7, m6, m2, 6, m5
368 FILT_H m2, m3, m4, p16
385 ;-----------------------------------------------------------------------------
386 ; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
387 ;-----------------------------------------------------------------------------
392 FILT_V m0, m1, m2, m3, m4, m5, m6, m7
402 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
411 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
417 cglobal_mc %1, mc02, %2, 3,4,8
424 call v_filt%2_ %+ i %+ _10.no_addr4
434 ;-----------------------------------------------------------------------------
435 ; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
436 ;-----------------------------------------------------------------------------
438 cglobal_mc %1, mc01, %2, 3,5,8
448 call v_filt%2_ %+ i %+ _10
460 ;-----------------------------------------------------------------------------
461 ; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
462 ;-----------------------------------------------------------------------------
464 cglobal_mc %1, mc03, %2, 3,5,8
466 jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
471 ;-----------------------------------------------------------------------------
472 ; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
473 ;-----------------------------------------------------------------------------
474 %macro H_FILT_AVG 2-3
476 ;FILT_H with fewer registers and averaged with the FILT_V result
477 ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
478 ;unfortunately I need three registers, so m5 will have to be re-read from memory
485 psraw m5, 2 ; (a-b)/4
486 psubw m5, m6 ; (a-b)/4-b
489 paddw m5, m6 ; (a-b)/4-b+c
490 psraw m5, 2 ; ((a-b)/4-b+c)/4
491 paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
493 CLIPW m5, [pb_0], [pw_pixel_max]
526 ; this REALLY needs x86_64
527 cglobal_mc %1, mc11, %2, 3,6,8
539 call v_filt%2_ %+ i %+ _10
540 call h_filt%2_ %+ i %+ _10
553 ;-----------------------------------------------------------------------------
554 ; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
555 ;-----------------------------------------------------------------------------
557 cglobal_mc %1, mc31, %2, 3,6,8
560 jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
565 ;-----------------------------------------------------------------------------
566 ; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
567 ;-----------------------------------------------------------------------------
569 cglobal_mc %1, mc13, %2, 3,7,12
571 jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
576 ;-----------------------------------------------------------------------------
577 ; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
578 ;-----------------------------------------------------------------------------
580 cglobal_mc %1, mc33, %2, 3,6,8
583 jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
588 ;-----------------------------------------------------------------------------
589 ; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
590 ;-----------------------------------------------------------------------------
595 psubw %1, %2 ; a-5*b+4*c
597 paddw %1, %3 ; a-5*b+20*c
619 neg r2 ; This actually saves instructions
620 lea r1, [r1+r2*2-mmsize+PAD]
621 lea r4, [rsp+PAD+gprsize]
636 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
638 movu [r4+i*mmsize*3], m0
643 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
645 movu [r4+i*mmsize*3], m0
647 lea r1, [r1+r2*8+mmsize]
675 movu m1, [r1+mmsize-4]
676 movu m2, [r1+mmsize-2]
677 mova m3, [r1+mmsize+0]
678 movu m4, [r1+mmsize+2]
679 movu m5, [r1+mmsize+4]
680 movu m6, [r1+mmsize+6]
727 cglobal_mc %1, mc22, %2, 3,7,12
728 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
729 mov r6, rsp ; backup stack pointer
730 and rsp, ~(mmsize-1) ; align stack
736 mova m7, [pw_pixel_max]
753 mov rsp, r6 ; restore stack pointer
759 ;-----------------------------------------------------------------------------
760 ; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
761 ;-----------------------------------------------------------------------------
763 cglobal_mc %1, mc12, %2, 3,7,12
764 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
765 mov r6, rsp ; backup stack pointer
766 and rsp, ~(mmsize-1) ; align stack
775 mova m7, [pw_pixel_max]
786 movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
798 mov rsp, r6 ; restore stack pointer
804 ;-----------------------------------------------------------------------------
805 ; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
806 ;-----------------------------------------------------------------------------
808 cglobal_mc %1, mc32, %2, 3,7,12
809 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
810 mov r6, rsp ; backup stack pointer
811 and rsp, ~(mmsize-1) ; align stack
816 mov r4d, 2 ; sizeof(pixel)
817 jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
822 ;-----------------------------------------------------------------------------
823 ; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
824 ;-----------------------------------------------------------------------------
856 cglobal_mc %1, mc21, %2, 3,7,12
859 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
860 mov r6, rsp ; backup stack pointer
861 and rsp, ~(mmsize-1) ; align stack
869 mov r4d, PAD-mmsize ; H buffer
870 jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
875 ;-----------------------------------------------------------------------------
876 ; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
877 ;-----------------------------------------------------------------------------
879 cglobal_mc %1, mc23, %2, 3,7,12
881 jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body