1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3 ;*****************************************************************************
4 ;* Copyright (C) 2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; Constant tables for the 10-bit (sample range 0..1023) qpel filters.
34 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 1023 — clip limit for 10-bit samples
36 pad10: times 8 dw 10*1023 ; biases added so filter intermediates stay unsigned
37 pad20: times 8 dw 20*1023 ; (pad value = max |negative tap sum| * 1023)
38 pad30: times 8 dw 30*1023
39 depad: times 4 dd 32*20*1023 + 512 ; removes the accumulated bias; +512 rounds
40 depad2: times 8 dw 20*1023 + 16*1022 + 16
41 unpad: times 8 dw 16*1022/32 ; needs to be mod 16
; 6-tap half-pel filter coefficients (1,-5,20,20,-5,1), split for pmaddwd use
43 tap1: times 4 dw 1, -5
44 tap2: times 4 dw 20, 20
45 tap3: times 4 dw -5, 1
46 pd_0f: times 4 dd 0xffff ; mask: keep low 16 bits of each dword
; Core of the horizontal 6-tap filter: given a=%1, b=%2, c=%3 it computes
; (a - 5*b + 20*c) / 16 using only shifts and adds (no multiplies),
; then clips the result into the valid 10-bit range.
69 psubw %1, %2 ; (a-b)/4-b
70 paddw %1, %3 ; (a-b)/4-b+c
71 psraw %1, 2 ; ((a-b)/4-b+c)/4
72 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
94 FILT_H %1, %7, %8, [pw_16]
96 CLIPW %1, [pb_0], [pw_pixel_max] ; clamp to [0, 1023]
; Macro plumbing that builds a %5-wide MC function out of four calls to the
; %4-wide stub (top-left, top-right, bottom-left, bottom-right quadrants).
; NOTE(review): the %macro headers/%endmacro terminators are outside this
; excerpt — argument meanings inferred from the visible expansions.
106 %define OP_MOV AVG_MOV ; avg variants store with averaging instead of mov
116 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
119 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
125 cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
126 call stub_%2_h264_qpel%4_%3_10_%1
131 call stub_%2_h264_qpel%4_%3_10_%1
136 call stub_%2_h264_qpel%4_%3_10_%1
; advance dst/src to the bottom-right quadrant: %4 rows down, %4 pixels
; (= %4*2 bytes at 10 bits) to the right
139 lea r0, [r0+r2*%4+%4*2]
140 lea r1, [r1+r2*%4+%4*2]
141 call stub_%2_h264_qpel%4_%3_10_%1
144 cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8
148 call stub_%2_h264_qpel%4_%3_10_%1
150 lea r1, [r %+ p1+%4*2] ; %+ pastes tokens: r<p1> is the saved src pointer reg
151 call stub_%2_h264_qpel%4_%3_10_%1
153 lea r1, [r %+ p1+r2*%4]
154 call stub_%2_h264_qpel%4_%3_10_%1
155 lea r0, [r%7+r2*%4+%4*2]
156 lea r1, [r %+ p1+r2*%4+%4*2]
157 %if UNIX64 == 0 ; fall through to function
158 call stub_%2_h264_qpel%4_%3_10_%1
164 ;cpu, put/avg, mc, 4/8, ...
167 MCAxA %1, %2, %3, %4, i, %5,%6,%7
169 cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
170 %if UNIX64 == 0 ; no prologue or epilogue for UNIX64
171 call stub_%2_h264_qpel%4_%3_10_%1
; internal entry point shared by the MCAxA wrappers above
175 stub_%2_h264_qpel%4_%3_10_%1:
178 ;-----------------------------------------------------------------------------
179 ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
180 ;-----------------------------------------------------------------------------
; mc00 = full-pel position: no filtering, just copy (put) or average (avg)
; the source block into dst via OP_MOV.
194 cglobal_mc mmxext, %1, mc00, 4, 3,4,0
200 cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
208 cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
218 OP_MOV [r0+r2+16], m1 ; second 8 pixels (16 bytes) of the 16-wide row
229 %define OP_MOV AVG_MOV ; re-instantiate everything above for the avg variant
232 ;-----------------------------------------------------------------------------
233 ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
234 ;-----------------------------------------------------------------------------
; Horizontal half-pel filter. Instantiated per-ISA: PALIGNR_MMX is the
; shift/or fallback, PALIGNR_SSSE3 uses the real palignr instruction.
237 %define PALIGNR PALIGNR_MMX
241 %1 sse2_cache64 , put, 8
242 %define PALIGNR PALIGNR_SSSE3
243 %1 ssse3_cache64, put, 8
246 %define OP_MOV AVG_MOV ; same instantiations again for the avg variants
247 %define PALIGNR PALIGNR_MMX
251 %1 sse2_cache64 , avg, 8
252 %define PALIGNR PALIGNR_SSSE3
253 %1 ssse3_cache64, avg, 8
258 cglobal_mc %1, %2, mc20, %3, 3,4,9
260 mova m1, [pw_pixel_max] ; clip limit kept resident in a register
275 %else ; movu is slow on these processors
; build the shifted neighbor rows (offsets 2/4/6/8 bytes = 1..4 pixels)
; needed by the 6-tap filter out of two aligned loads
283 PALIGNR m3, m0, m2, 2, m5
284 PALIGNR m7, m0, m2, 8, m5
286 PALIGNR m4, m0, m2, 4, m5
287 PALIGNR m7, m0, m2, 6, m5
293 PALIGNR m3, m6, m2, 2, m5
295 PALIGNR m4, m6, m2, 4, m5
296 PALIGNR m7, m6, m2, 6, m5
302 FILT_H m2, m3, m4, p16
316 ;-----------------------------------------------------------------------------
317 ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
318 ;-----------------------------------------------------------------------------
; mc30 reuses mc10's shared body (tail-jump); the src adjustment that
; distinguishes the two positions is outside this excerpt — confirm there.
320 cglobal_mc %1, %2, mc30, %3, 3,5,9
322 jmp stub_%2_h264_qpel%3_mc10_10_%1.body
327 ;-----------------------------------------------------------------------------
328 ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
329 ;-----------------------------------------------------------------------------
; Quarter-pel horizontal: half-pel filter result averaged with the source.
331 cglobal_mc %1, %2, mc10, %3, 3,5,9
335 mova m1, [pw_pixel_max] ; clip limit kept resident in a register
350 %else ; movu is slow on these processors
; same shifted-row construction as mc20 (offsets 2/4/6/8 bytes)
358 PALIGNR m3, m0, m2, 2, m5
359 PALIGNR m7, m0, m2, 8, m5
361 PALIGNR m4, m0, m2, 4, m5
362 PALIGNR m7, m0, m2, 6, m5
368 PALIGNR m3, m6, m2, 2, m5
370 PALIGNR m4, m6, m2, 4, m5
371 PALIGNR m7, m6, m2, 6, m5
377 FILT_H m2, m3, m4, p16
394 ;-----------------------------------------------------------------------------
395 ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
396 ;-----------------------------------------------------------------------------
; Vertical half-pel filter: FILT_V consumes a sliding window of 6 rows
; held in m0..m7 (rotated each iteration by the V_FILT instantiations).
401 FILT_V m0, m1, m2, m3, m4, m5, m6, m7
411 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
420 V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
426 cglobal_mc %1, %2, mc02, %3, 3,4,8
433 call v_filt%3_ %+ i %+ _10_%1.no_addr4 ; %+ pastes the column index i into the name
443 ;-----------------------------------------------------------------------------
444 ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
445 ;-----------------------------------------------------------------------------
; Quarter-pel vertical: vertical filter result averaged with the source.
447 cglobal_mc %1, %2, mc01, %3, 3,5,8
457 call v_filt%3_ %+ i %+ _10_%1
469 ;-----------------------------------------------------------------------------
470 ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
471 ;-----------------------------------------------------------------------------
; mc03 reuses mc01's shared body; the differing src offset is set up in
; code outside this excerpt — confirm there.
473 cglobal_mc %1, %2, mc03, %3, 3,5,8
475 jmp stub_%2_h264_qpel%3_mc01_10_%1.body
480 ;-----------------------------------------------------------------------------
481 ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
482 ;-----------------------------------------------------------------------------
483 %macro H_FILT_AVG 3-4
485 ;FILT_H with fewer registers and averaged with the FILT_V result
486 ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
487 ;unfortunately I need three registers, so m5 will have to be re-read from memory
; same shift/add evaluation of (a-5*b+20*c)/16 as FILT_H, in-place in m5
494 psraw m5, 2 ; (a-b)/4
495 psubw m5, m6 ; (a-b)/4-b
498 paddw m5, m6 ; (a-b)/4-b+c
499 psraw m5, 2 ; ((a-b)/4-b+c)/4
500 paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
502 CLIPW m5, [pb_0], [pw_pixel_max] ; clamp to [0, 1023]
; per-ISA instantiations (4-wide mmxext, 8-wide sse2)
515 H_FILT_AVG mmxext, 4, i
519 H_FILT_AVG mmxext, 4, i, 0
526 H_FILT_AVG sse2, 8, i, 0
528 H_FILT_AVG sse2, 8, i
535 ; this REALLY needs x86_64
536 cglobal_mc %1, %2, mc11, %3, 3,6,8
; mc11 = diagonal quarter-pel: average of the vertical and horizontal
; half-pel filters, computed by chaining the two filter stubs
548 call v_filt%3_ %+ i %+ _10_%1
549 call h_filt%3_ %+ i %+ _10_%1
562 ;-----------------------------------------------------------------------------
563 ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
564 ;-----------------------------------------------------------------------------
; mc31/mc13/mc33 are the other three diagonal quarter-pel positions; each
; adjusts its src offset (outside this excerpt) and reuses mc11's body.
566 cglobal_mc %1, %2, mc31, %3, 3,6,8
569 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
574 ;-----------------------------------------------------------------------------
575 ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
576 ;-----------------------------------------------------------------------------
578 cglobal_mc %1, %2, mc13, %3, 3,7,12
580 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
585 ;-----------------------------------------------------------------------------
586 ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
587 ;-----------------------------------------------------------------------------
589 cglobal_mc %1, %2, mc33, %3, 3,6,8
592 jmp stub_%2_h264_qpel%3_mc11_10_%1.body
597 ;-----------------------------------------------------------------------------
598 ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
599 ;-----------------------------------------------------------------------------
; center half-pel position: vertical filter into a stack buffer (no
; rounding/clip), then horizontal filter over that buffer
604 psubw %1, %2 ; a-5*b+4*c
606 paddw %1, %3 ; a-5*b+20*c
628 neg r2 ; This actually saves instructions
629 lea r1, [r1+r2*2-mmsize+PAD]
630 lea r4, [rsp+PAD+gprsize] ; r4 = intermediate buffer on the stack
645 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
647 movu [r4+i*mmsize*3], m0 ; store unrounded vertical result
652 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
654 movu [r4+i*mmsize*3], m0
656 lea r1, [r1+r2*8+mmsize]
; horizontal pass: six overlapping loads around the current position
684 movu m1, [r1+mmsize-4]
685 movu m2, [r1+mmsize-2]
686 mova m3, [r1+mmsize+0]
687 movu m4, [r1+mmsize+2]
688 movu m5, [r1+mmsize+4]
689 movu m6, [r1+mmsize+6]
736 cglobal_mc %1, %2, mc22, %3, 3,7,12
737 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
738 mov r6, rsp ; backup stack pointer
739 and rsp, ~(mmsize-1) ; align stack
745 mova m7, [pw_pixel_max] ; clip limit kept resident in a register
762 mov rsp, r6 ; restore stack pointer
768 ;-----------------------------------------------------------------------------
769 ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
770 ;-----------------------------------------------------------------------------
; mc12: center half-pel result averaged with the vertical half-pel column;
; uses a stack scratch buffer like mc22
772 cglobal_mc %1, %2, mc12, %3, 3,7,12
773 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
774 mov r6, rsp ; backup stack pointer
775 and rsp, ~(mmsize-1) ; align stack
784 mova m7, [pw_pixel_max] ; clip limit kept resident in a register
795 movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
807 mov rsp, r6 ; restore stack pointer
813 ;-----------------------------------------------------------------------------
814 ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
815 ;-----------------------------------------------------------------------------
; mc32 sets up its own (smaller) stack buffer and src offset, then reuses
; mc12's shared body
817 cglobal_mc %1, %2, mc32, %3, 3,7,12
818 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
819 mov r6, rsp ; backup stack pointer
820 and rsp, ~(mmsize-1) ; align stack
825 mov r4d, 2 ; sizeof(pixel)
826 jmp stub_%2_h264_qpel%3_mc12_10_%1.body
831 ;-----------------------------------------------------------------------------
832 ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
833 ;-----------------------------------------------------------------------------
; mc21: center half-pel result averaged with the horizontal half-pel row;
; stores the H result in the top part of the stack buffer, then reuses
; mc12's shared body
865 cglobal_mc %1, %2, mc21, %3, 3,7,12
868 %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
869 mov r6, rsp ; backup stack pointer
870 and rsp, ~(mmsize-1) ; align stack
878 mov r4d, PAD-mmsize ; H buffer
879 jmp stub_%2_h264_qpel%3_mc12_10_%1.body
884 ;-----------------------------------------------------------------------------
885 ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
886 ;-----------------------------------------------------------------------------
; mc23 adjusts src (outside this excerpt) and reuses mc21's body
888 cglobal_mc %1, %2, mc23, %3, 3,7,12
890 jmp stub_%2_h264_qpel%3_mc21_10_%1.body