1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3 ;*****************************************************************************
4 ;* Copyright (C) 2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Read-only constants for the 10-bit qpel filters (pixel range 0..1023).
33 pw_pixel_max: times 8 dw ((1 << 10)-1)
; Bias constants, multiples of the 10-bit pixel maximum (1023); presumably
; added to keep intermediate filter sums non-negative for unsigned
; pack/clip steps -- NOTE(review): confirm against the FILT_* macros.
35 pad10: times 8 dw 10*1023
36 pad20: times 8 dw 20*1023
37 pad30: times 8 dw 30*1023
; Dword-domain bias removal plus +512 rounding term.
38 depad: times 4 dd 32*20*1023 + 512
39 depad2: times 8 dw 20*1023 + 16*1022 + 16
40 unpad: times 8 dw 16*1022/32 ; needs to be mod 16
; Interleaved tap pairs of the 6-tap filter (1,-5,20,20,-5,1), laid out
; for pairwise word multiply-accumulate.
42 tap1: times 4 dw 1, -5
43 tap2: times 4 dw 20, 20
44 tap3: times 4 dw -5, 1
; Per-dword mask keeping only the low 16 bits (0x0000ffff).
45 pd_0f: times 4 dd 0xffff
68 psubw %1, %2 ; (a-b)/4-b
69 paddw %1, %3 ; (a-b)/4-b+c
70 psraw %1, 2 ; ((a-b)/4-b+c)/4
71 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
93 FILT_H %1, %7, %8, [pw_16]
95 CLIPW %1, [pb_0], [pw_pixel_max]
105 %define OP_MOV AVG_MOV
115 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
118 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
124 cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
125 call stub_%2_h264_qpel%4_%3_10_%1
130 call stub_%2_h264_qpel%4_%3_10_%1
135 call stub_%2_h264_qpel%4_%3_10_%1
138 lea r0, [r0+r2*%4+%4*2]
139 lea r1, [r1+r2*%4+%4*2]
140 call stub_%2_h264_qpel%4_%3_10_%1
143 cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8
147 call stub_%2_h264_qpel%4_%3_10_%1
149 lea r1, [r %+ p1+%4*2]
150 call stub_%2_h264_qpel%4_%3_10_%1
152 lea r1, [r %+ p1+r2*%4]
153 call stub_%2_h264_qpel%4_%3_10_%1
154 lea r0, [r%7+r2*%4+%4*2]
155 lea r1, [r %+ p1+r2*%4+%4*2]
156 %if UNIX64 == 0 ; fall through to function
157 call stub_%2_h264_qpel%4_%3_10_%1
163 ;cpu, put/avg, mc, 4/8, ...
166 MCAxA %1, %2, %3, %4, i, %5,%6,%7
168 cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
169 %if UNIX64 == 0 ; no prologue or epilogue for UNIX64
170 call stub_%2_h264_qpel%4_%3_10_%1
174 stub_%2_h264_qpel%4_%3_10_%1:
177 ;-----------------------------------------------------------------------------
178 ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
179 ;-----------------------------------------------------------------------------
193 cglobal_mc mmxext, %1, mc00, 4, 3,4,0
199 cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
207 cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
217 OP_MOV [r0+r2+16], m1
228 %define OP_MOV AVG_MOV
231 ;-----------------------------------------------------------------------------
232 ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
233 ;-----------------------------------------------------------------------------
236 %define PALIGNR PALIGNR_MMX
240 %1 sse2_cache64 , put, 8
241 %define PALIGNR PALIGNR_SSSE3
242 %1 ssse3_cache64, put, 8
245 %define OP_MOV AVG_MOV
246 %define PALIGNR PALIGNR_MMX
250 %1 sse2_cache64 , avg, 8
251 %define PALIGNR PALIGNR_SSSE3
252 %1 ssse3_cache64, avg, 8
257 cglobal_mc %1, %2, mc20, %3, 3,4,9
259 mova m1, [pw_pixel_max]
274 %else ; movu is slow on these processors
282 PALIGNR m3, m0, m2, 2, m5
283 PALIGNR m7, m0, m2, 8, m5
285 PALIGNR m4, m0, m2, 4, m5
286 PALIGNR m7, m0, m2, 6, m5
292 PALIGNR m3, m6, m2, 2, m5
294 PALIGNR m4, m6, m2, 4, m5
295 PALIGNR m7, m6, m2, 6, m5
301 FILT_H m2, m3, m4, p16
315 ;-----------------------------------------------------------------------------
316 ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
317 ;-----------------------------------------------------------------------------
319 cglobal_mc %1, %2, mc30, %3, 3,5,9
321 jmp stub_%2_h264_qpel%3_mc10_10_%1.body
326 ;-----------------------------------------------------------------------------
327 ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
328 ;-----------------------------------------------------------------------------
330 cglobal_mc %1, %2, mc10, %3, 3,5,9
334 mova m1, [pw_pixel_max]
349 %else ; movu is slow on these processors
357 PALIGNR m3, m0, m2, 2, m5
358 PALIGNR m7, m0, m2, 8, m5
360 PALIGNR m4, m0, m2, 4, m5
361 PALIGNR m7, m0, m2, 6, m5
367 PALIGNR m3, m6, m2, 2, m5
369 PALIGNR m4, m6, m2, 4, m5
370 PALIGNR m7, m6, m2, 6, m5
376 FILT_H m2, m3, m4, p16
393 ;-----------------------------------------------------------------------------
394 ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
395 ;-----------------------------------------------------------------------------
400 FILT_V m0, m1, m2, m3, m4, m5, m6, m7
410 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
419 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
425 cglobal_mc %1, %2, mc02, %3, 3,4,8
432 call v_filt%3_ %+ i %+ _10_%1.no_addr4
442 ;-----------------------------------------------------------------------------
443 ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
444 ;-----------------------------------------------------------------------------
446 cglobal_mc %1, %2, mc01, %3, 3,5,8
456 call v_filt%3_ %+ i %+ _10_%1
468 ;-----------------------------------------------------------------------------
469 ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
470 ;-----------------------------------------------------------------------------
472 cglobal_mc %1, %2, mc03, %3, 3,5,8
474 jmp stub_%2_h264_qpel%3_mc01_10_%1.body
479 ;-----------------------------------------------------------------------------
480 ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
481 ;-----------------------------------------------------------------------------
482 %macro H_FILT_AVG 3-4
484 ;FILT_H with fewer registers and averaged with the FILT_V result
485 ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
486 ;unfortunately I need three registers, so m5 will have to be re-read from memory
493 psraw m5, 2 ; (a-b)/4
494 psubw m5, m6 ; (a-b)/4-b
497 paddw m5, m6 ; (a-b)/4-b+c
498 psraw m5, 2 ; ((a-b)/4-b+c)/4
499 paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
501 CLIPW m5, [pb_0], [pw_pixel_max]
514 H_FILT_AVG mmxext, 4, i
518 H_FILT_AVG mmxext, 4, i, 0
525 H_FILT_AVG sse2, 8, i, 0
527 H_FILT_AVG sse2, 8, i
534 ; this REALLY needs x86_64
535 cglobal_mc %1, %2, mc11, %3, 3,6,8
547 call v_filt%3_ %+ i %+ _10_%1
548 call h_filt%3_ %+ i %+ _10_%1
561 ;-----------------------------------------------------------------------------
562 ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
563 ;-----------------------------------------------------------------------------
565 cglobal_mc %1, %2, mc31, %3, 3,6,8
568 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
573 ;-----------------------------------------------------------------------------
574 ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
575 ;-----------------------------------------------------------------------------
577 cglobal_mc %1, %2, mc13, %3, 3,7,12
579 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
584 ;-----------------------------------------------------------------------------
585 ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
586 ;-----------------------------------------------------------------------------
588 cglobal_mc %1, %2, mc33, %3, 3,6,8
591 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
596 ;-----------------------------------------------------------------------------
597 ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
598 ;-----------------------------------------------------------------------------
603 psubw %1, %2 ; a-5*b+4*c
605 paddw %1, %3 ; a-5*b+20*c
627 neg r2 ; This actually saves instructions
628 lea r1, [r1+r2*2-mmsize+PAD]
629 lea r4, [rsp+PAD+gprsize]
644 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
646 movu [r4+i*mmsize*3], m0
651 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
653 movu [r4+i*mmsize*3], m0
655 lea r1, [r1+r2*8+mmsize]
683 movu m1, [r1+mmsize-4]
684 movu m2, [r1+mmsize-2]
685 mova m3, [r1+mmsize+0]
686 movu m4, [r1+mmsize+2]
687 movu m5, [r1+mmsize+4]
688 movu m6, [r1+mmsize+6]
735 cglobal_mc %1, %2, mc22, %3, 3,7,12
736 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
737 mov r6, rsp ; backup stack pointer
738 and rsp, ~(mmsize-1) ; align stack
744 mova m7, [pw_pixel_max]
761 mov rsp, r6 ; restore stack pointer
767 ;-----------------------------------------------------------------------------
768 ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
769 ;-----------------------------------------------------------------------------
771 cglobal_mc %1, %2, mc12, %3, 3,7,12
772 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
773 mov r6, rsp ; backup stack pointer
774 and rsp, ~(mmsize-1) ; align stack
783 mova m7, [pw_pixel_max]
794 movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
806 mov rsp, r6 ; restore stack pointer
812 ;-----------------------------------------------------------------------------
813 ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
814 ;-----------------------------------------------------------------------------
816 cglobal_mc %1, %2, mc32, %3, 3,7,12
817 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
818 mov r6, rsp ; backup stack pointer
819 and rsp, ~(mmsize-1) ; align stack
824 mov r4d, 2 ; sizeof(pixel)
825 jmp stub_%2_h264_qpel%3_mc12_10_%1.body
830 ;-----------------------------------------------------------------------------
831 ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
832 ;-----------------------------------------------------------------------------
864 cglobal_mc %1, %2, mc21, %3, 3,7,12
867 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
868 mov r6, rsp ; backup stack pointer
869 and rsp, ~(mmsize-1) ; align stack
877 mov r4d, PAD-mmsize ; H buffer
878 jmp stub_%2_h264_qpel%3_mc12_10_%1.body
883 ;-----------------------------------------------------------------------------
884 ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
885 ;-----------------------------------------------------------------------------
887 cglobal_mc %1, %2, mc23, %3, 3,7,12
889 jmp stub_%2_h264_qpel%3_mc21_10_%1.body