1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3 ;*****************************************************************************
4 ;* Copyright (C) 2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
34 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 8 words of 1023, the largest 10-bit value; passed to CLIPW together with pb_0
36 pad10: times 8 dw 10*1023 ; 8 words of 10*1023; NOTE(review): presumably a bias term, users not visible here - confirm
37 pad20: times 8 dw 20*1023 ; 8 words of 20*1023
38 pad30: times 8 dw 30*1023 ; 8 words of 30*1023
39 depad: times 4 dd 32*20*1023 + 512 ; 4 dwords; NOTE(review): the +512 looks like a rounding term - confirm against the user
40 depad2: times 8 dw 20*1023 + 16*1022 + 16 ; 8 words; value is 36828, i.e. above 32767 if read as a signed word
41 unpad: times 8 dw 16*1022/32 ; 8 words of 16*1022/32; needs to be mod 16
43 tap1: times 4 dw 1, -5 ; word pair (1,-5) repeated 4 times
44 tap2: times 4 dw 20, 20 ; word pair (20,20) repeated 4 times
45 tap3: times 4 dw -5, 1 ; word pair (-5,1) repeated 4 times; tap1..tap3 in order read 1,-5,20,20,-5,1
46 pd_0f: times 4 dd 0xffff ; 4 dwords with only the low 16 bits set
69 psubw %1, %2 ; (a-b)/4-b
70 paddw %1, %3 ; (a-b)/4-b+c
71 psraw %1, 2 ; ((a-b)/4-b+c)/4
72 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
94 FILT_H %1, %7, %8, [pw_16]
96 CLIPW %1, [pb_0], [pw_pixel_max]
106 %define OP_MOV AVG_MOV
116 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
119 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
124 cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
126 call stub_%2_h264_qpel%4_%3_10_%1
131 call stub_%2_h264_qpel%4_%3_10_%1
136 call stub_%2_h264_qpel%4_%3_10_%1
139 lea r0, [r0+r2*%4+%4*2]
140 lea r1, [r1+r2*%4+%4*2]
141 call stub_%2_h264_qpel%4_%3_10_%1
146 call stub_%2_h264_qpel%4_%3_10_%1
149 call stub_%2_h264_qpel%4_%3_10_%1
152 call stub_%2_h264_qpel%4_%3_10_%1
153 lea r0, [r10+r2*%4+%4*2]
154 lea r1, [r11+r2*%4+%4*2]
155 %ifndef UNIX64 ; fall through to function
156 call stub_%2_h264_qpel%4_%3_10_%1
162 ;cpu, put/avg, mc, 4/8, ...
165 MCAxA %1, %2, %3, %4, i, %5,%6,%7
167 cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
168 %ifndef UNIX64 ; no prologue or epilogue for UNIX64
169 call stub_%2_h264_qpel%4_%3_10_%1
173 stub_%2_h264_qpel%4_%3_10_%1:
176 ;-----------------------------------------------------------------------------
177 ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
178 ;-----------------------------------------------------------------------------
192 cglobal_mc mmxext, %1, mc00, 4, 3,4,0
198 cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
206 cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
216 OP_MOV [r0+r2+16], m1
227 %define OP_MOV AVG_MOV
230 ;-----------------------------------------------------------------------------
231 ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
232 ;-----------------------------------------------------------------------------
235 %define PALIGNR PALIGNR_MMX
239 %1 sse2_cache64 , put, 8
240 %define PALIGNR PALIGNR_SSSE3
241 %1 ssse3_cache64, put, 8
244 %define OP_MOV AVG_MOV
245 %define PALIGNR PALIGNR_MMX
249 %1 sse2_cache64 , avg, 8
250 %define PALIGNR PALIGNR_SSSE3
251 %1 ssse3_cache64, avg, 8
256 cglobal_mc %1, %2, mc20, %3, 3,4,9
258 mova m1, [pw_pixel_max]
273 %else ; movu is slow on these processors
281 PALIGNR m3, m0, m2, 2, m5
282 PALIGNR m7, m0, m2, 8, m5
284 PALIGNR m4, m0, m2, 4, m5
285 PALIGNR m7, m0, m2, 6, m5
291 PALIGNR m3, m6, m2, 2, m5
293 PALIGNR m4, m6, m2, 4, m5
294 PALIGNR m7, m6, m2, 6, m5
300 FILT_H m2, m3, m4, p16
314 ;-----------------------------------------------------------------------------
315 ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
316 ;-----------------------------------------------------------------------------
318 cglobal_mc %1, %2, mc30, %3, 3,5,9
320 jmp stub_%2_h264_qpel%3_mc10_10_%1.body
325 ;-----------------------------------------------------------------------------
326 ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
327 ;-----------------------------------------------------------------------------
329 cglobal_mc %1, %2, mc10, %3, 3,5,9
333 mova m1, [pw_pixel_max]
348 %else ; movu is slow on these processors
356 PALIGNR m3, m0, m2, 2, m5
357 PALIGNR m7, m0, m2, 8, m5
359 PALIGNR m4, m0, m2, 4, m5
360 PALIGNR m7, m0, m2, 6, m5
366 PALIGNR m3, m6, m2, 2, m5
368 PALIGNR m4, m6, m2, 4, m5
369 PALIGNR m7, m6, m2, 6, m5
375 FILT_H m2, m3, m4, p16
392 ;-----------------------------------------------------------------------------
393 ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
394 ;-----------------------------------------------------------------------------
399 FILT_V m0, m1, m2, m3, m4, m5, m6, m7
409 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
418 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
424 cglobal_mc %1, %2, mc02, %3, 3,4,8
431 call v_filt%3_ %+ i %+ _10_%1.no_addr4
441 ;-----------------------------------------------------------------------------
442 ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
443 ;-----------------------------------------------------------------------------
445 cglobal_mc %1, %2, mc01, %3, 3,5,8
455 call v_filt%3_ %+ i %+ _10_%1
467 ;-----------------------------------------------------------------------------
468 ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
469 ;-----------------------------------------------------------------------------
471 cglobal_mc %1, %2, mc03, %3, 3,5,8
473 jmp stub_%2_h264_qpel%3_mc01_10_%1.body
478 ;-----------------------------------------------------------------------------
479 ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
480 ;-----------------------------------------------------------------------------
481 %macro H_FILT_AVG 3-4
483 ;FILT_H with fewer registers, averaged with the FILT_V result
484 ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are kept for use in the next iteration
485 ;unfortunately three registers are needed, so m5 will have to be re-read from memory
492 psraw m5, 2 ; (a-b)/4
493 psubw m5, m6 ; (a-b)/4-b
496 paddw m5, m6 ; (a-b)/4-b+c
497 psraw m5, 2 ; ((a-b)/4-b+c)/4
498 paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
500 CLIPW m5, [pb_0], [pw_pixel_max]
513 H_FILT_AVG mmxext, 4, i
517 H_FILT_AVG mmxext, 4, i, 0
524 H_FILT_AVG sse2, 8, i, 0
526 H_FILT_AVG sse2, 8, i
533 ; this REALLY needs x86_64
534 cglobal_mc %1, %2, mc11, %3, 3,6,8
546 call v_filt%3_ %+ i %+ _10_%1
547 call h_filt%3_ %+ i %+ _10_%1
560 ;-----------------------------------------------------------------------------
561 ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
562 ;-----------------------------------------------------------------------------
564 cglobal_mc %1, %2, mc31, %3, 3,6,8
567 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
572 ;-----------------------------------------------------------------------------
573 ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
574 ;-----------------------------------------------------------------------------
576 cglobal_mc %1, %2, mc13, %3, 3,7,12
578 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
583 ;-----------------------------------------------------------------------------
584 ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
585 ;-----------------------------------------------------------------------------
587 cglobal_mc %1, %2, mc33, %3, 3,6,8
590 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
595 ;-----------------------------------------------------------------------------
596 ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
597 ;-----------------------------------------------------------------------------
602 psubw %1, %2 ; a-5*b+4*c
604 paddw %1, %3 ; a-5*b+20*c
626 neg r2 ; This actually saves instructions
627 lea r1, [r1+r2*2-mmsize+PAD]
628 lea r4, [rsp+PAD+gprsize]
643 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
645 movu [r4+i*mmsize*3], m0
650 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
652 movu [r4+i*mmsize*3], m0
654 lea r1, [r1+r2*8+mmsize]
682 movu m1, [r1+mmsize-4]
683 movu m2, [r1+mmsize-2]
684 mova m3, [r1+mmsize+0]
685 movu m4, [r1+mmsize+2]
686 movu m5, [r1+mmsize+4]
687 movu m6, [r1+mmsize+6]
734 cglobal_mc %1, %2, mc22, %3, 3,7,12
735 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
736 mov r6, rsp ; backup stack pointer
737 and rsp, ~(mmsize-1) ; align stack
743 mova m7, [pw_pixel_max]
760 mov rsp, r6 ; restore stack pointer
766 ;-----------------------------------------------------------------------------
767 ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
768 ;-----------------------------------------------------------------------------
770 cglobal_mc %1, %2, mc12, %3, 3,7,12
771 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
772 mov r6, rsp ; backup stack pointer
773 and rsp, ~(mmsize-1) ; align stack
782 mova m7, [pw_pixel_max]
793 movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
805 mov rsp, r6 ; restore stack pointer
811 ;-----------------------------------------------------------------------------
812 ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
813 ;-----------------------------------------------------------------------------
815 cglobal_mc %1, %2, mc32, %3, 3,7,12
816 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel) (mc22/mc12 use the *4 variant)
817 mov r6, rsp ; backup stack pointer
818 and rsp, ~(mmsize-1) ; align stack
823 mov r4d, 2 ; sizeof(pixel)
824 jmp stub_%2_h264_qpel%3_mc12_10_%1.body
829 ;-----------------------------------------------------------------------------
830 ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
831 ;-----------------------------------------------------------------------------
863 cglobal_mc %1, %2, mc21, %3, 3,7,12
866 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel) (mc22/mc12 use the *4 variant)
867 mov r6, rsp ; backup stack pointer
868 and rsp, ~(mmsize-1) ; align stack
876 mov r4d, PAD-mmsize ; H buffer
877 jmp stub_%2_h264_qpel%3_mc12_10_%1.body
882 ;-----------------------------------------------------------------------------
883 ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
884 ;-----------------------------------------------------------------------------
886 cglobal_mc %1, %2, mc23, %3, 3,7,12
888 jmp stub_%2_h264_qpel%3_mc21_10_%1.body