;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2015 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"
pw_1024:    times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf:  times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
v210_mask:        times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf:   times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
           dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
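; v210 background (informational note, not from the original source): v210
; packs three 10-bit components into each 32-bit dword (bits 0-9, 10-19 and
; 20-29; bits 30-31 are padding), cycling Cb/Y/Cr so that every group of four
; dwords carries six luma samples and three chroma pairs. v210_mask selects
; alternating 10-bit fields, and the shuffle/multiplier constants above align
; and scale the extracted fields to 16-bit samples.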
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
                       db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
pd_0f: times 4 dd 0xffff
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
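; tap1..tap3 are the H.264 half-pel kernel [1,-5,20,20,-5,1] split into the
; three word pairs consumed by pmaddwd-style multiply-accumulates:
; (1,-5), (20,20), (-5,1).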
pw_0xc000: times 8 dw 0xc000
    psubw      %1, %2  ; a-5*b+4*c
    paddw      %1, %3  ; a-5*b+20*c
    psraw      %1, 2   ; (a-b)/4
    psubw      %1, %2  ; (a-b)/4-b
    paddw      %1, %3  ; (a-b)/4-b+c
    psraw      %1, 2   ; ((a-b)/4-b+c)/4
    paddw      %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
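; Sanity check on the factorization above (informational; psraw truncates,
; which the surrounding padding/rounding constants are sized to absorb):
;   ((a-b)/4 - b + c)/4 + c = (a - b - 4*b + 4*c)/16 + c
;                           = (a - 5*b + 4*c)/16 + c
;                           = (a - 5*b + 20*c)/16
; i.e. two shifts and three add/subs reproduce the scaled tap sum with no
; multiplies at all.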
%if HIGH_BIT_DEPTH
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter temporal may be a win if the last level cache
;is big enough (preliminary benching suggests on the order of 4* framesize).
;%define movntps movaps
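; (Uncommenting the define above simply aliases the non-temporal store to a
; plain aligned store, so the temporal variant can be benchmarked without
; touching any of the code below.)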
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
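; Rough scalar model of this routine (a sketch for reference, not part of the
; build; exact rounding/clipping lives in FILT_PACK, and buf keeps a biased
; 16-bit intermediate for the later hpel_filter_c pass):
;   v      = src[x-2*s] - 5*src[x-s] + 20*src[x]
;          + 20*src[x+s] - 5*src[x+2*s] + src[x+3*s]   ; s = stride
;   buf[x] = v (biased to fit 16 bits)
;   dst[x] = clip( (v+16)>>5, 0, PIXEL_MAX )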
cglobal hpel_filter_v, 5,6,11
    mova       m7, [pw_pixel_max]
    mova       m5, [r1+r3+mmsize]
    mova       m6, [r1+r3*2+mmsize]
    paddw      m4, [r5+r3*2+mmsize]
    paddw      m5, [r5+r3+mmsize]
    paddw      m6, [r5+mmsize]
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       [r2+r4+mmsize], m4
    FILT_PACK  m1, m4, m6, 5, s10
    mova       [r0+r4+mmsize], m4
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
    CLIPW      m1, [pb_0], [pw_pixel_max]
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
    mova       m0, [pw_pixel_max]
    movu       m4, [src-4+mmsize]
    movu       m5, [src-2+mmsize]
    movu       m7, [src+4+mmsize]
    movu       m6, [src+6+mmsize]
    movu       m7, [src+2+mmsize]
    mova       m6, [src+0+mmsize]
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 1
    mova       [r0+r2+mmsize], m4
%endmacro ; HPEL_FILTER
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
    mova       m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20]
    pmaddubsw  m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7   ; c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       [r2+r4*2+mmsize/2], xm4
    vextracti128 [r2+r4*2+mmsize], m1, 1
    vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
    mova       [r2+r4*2+mmsize], m4
    FILT_PACK  m1, m4, m7, 5
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3
    paddw      m3, [src+2]  ; c0
    paddw      m4, [src+14] ; a1
    paddw      m5, [src+12] ; b1
    paddw      m6, [src+10] ; c1
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 6
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 1
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
%ifnidn cpuname, sse2
%define pw_rnd [pw_32]
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
    movu       m3, [src-4+mmsize]
    movu       m2, [src-2+mmsize]
    mova       m1, [src+0+mmsize]
    paddw      m3, [src+6+mmsize]
    paddw      m2, [src+4+mmsize]
    paddw      m1, [src+2+mmsize]
    FILT_H2    m4, m5, m6, m3, m2, m1
    PALIGNR    m4, m1, m0, 12, m7
    PALIGNR    m5, m1, m0, 14, m0
    PALIGNR    m0, m2, m1, 6, m7
    PALIGNR    m0, m2, m1, 4, m7
    PALIGNR    m6, m2, m1, 2, m7
    PALIGNR    m2, m1, 12, m7
    PALIGNR    m5, m1, 14, m1
    PALIGNR    m3, m1, m0, 6, m7
    PALIGNR    m6, m1, m0, 4, m7
    PALIGNR    m6, m1, m0, 2, m7
    FILT_PACK  m4, m3, pw_rnd, 6
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3,8
    mova       m7, [pw_1] ; FIXME xmm8
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 1
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
; Using unaligned loads instead of palignr is marginally slower on SB and significantly
; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
; the repeated loads of constants for pmaddubsw.
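; For reference, the rejected unaligned-load variant would look roughly like
; this (a sketch, not benchmarked here; assumes m1 holds [src]):
;   movu       m3, [src-2]
;   movu       m4, [src-1]
;   pmaddubsw  m3, m7 ; m7 = filt_mul15 kept resident instead of reloaded
; trading the palignr dependency chain below for extra load-port traffic.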
    palignr    m3, m1, m0, 14
    palignr    m4, m1, m0, 15
    palignr    m0, m2, m1, 2
    pmaddubsw  m3, [filt_mul15]
    pmaddubsw  m4, [filt_mul15]
    pmaddubsw  m0, [filt_mul51]
    palignr    m5, m2, m1, 1
    palignr    m6, m2, m1, 3
    pmaddubsw  m1, [filt_mul20]
    pmaddubsw  m5, [filt_mul20]
    pmaddubsw  m6, [filt_mul51]
    FILT_PACK  m3, m4, m7, 5
    pshufb     m3, [hpel_shuf]
cglobal hpel_filter_h, 3,3,8
    mova       m5, [filt_mul15]
    mova       m6, [filt_mul20]
    mova       m7, [filt_mul51]
    FILT_PACK  m0, m1, m2, 5
    pshufb     m0, [hpel_shuf]
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+mmsize is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+mmsize]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    vinserti128 %1, m1, xm4, 1
    vperm2i128 %2, m1, m4, q0301
    FILT_PACK  m1, m4, m15, 5
    movntps    [r8+r4+%5], m1
    vperm2i128 m3, %2, %1, q0003
    PALIGNR    m1, %2, %1, (mmsize-4), m3
    PALIGNR    m2, %2, %1, (mmsize-2), m3
    vperm2i128 %1, %3, %2, q0003
    PALIGNR    m3, %3, %2, 4, %1
    PALIGNR    m4, %3, %2, 2, %1
    PALIGNR    %3, %3, %2, 6, m2
    FILT_PACK  %3, %4, m15, 6
    vperm2i128 m3, %2, %1, q0003
    PALIGNR    m1, %2, %1, (mmsize-2), m3
    PALIGNR    m2, %2, %1, (mmsize-1), m3
    vperm2i128 m3, %3, %2, q0003
    PALIGNR    m4, %3, %2, 1, m3
    PALIGNR    m5, %3, %2, 2, m3
    PALIGNR    m6, %3, %2, 3, m3
    FILT_PACK  m1, m2, m15, 5
    pshufb     m1, [hpel_shuf]
    ADD8TO16   m1, m6, m12, m3, m0 ; a
    ADD8TO16   m2, m5, m12, m3, m0 ; b
    ADD8TO16   %2, m4, m12, m3, m0 ; c
    FILT_V2    m1, m2, %2, m6, m5, m4
    FILT_PACK  m1, m6, m15, 5
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
    mova       m0, [filt_mul51]
    mova       m12, [filt_mul15]
    mova       m14, [filt_mul20]
    DO_FILT_V  m8, m7, m13, m12, 0
    DO_FILT_V  m6, m5, m11, m12, mmsize
    psrlw      m15, 1  ; pw_512
    paddw      m15, m15 ; pw_32
    DO_FILT_C  m9, m8, m7, m6
    paddw      m15, m15 ; pw_1024
    DO_FILT_H  m10, m13, m11
; setup regs for next y
%endif ; !HIGH_BIT_DEPTH
%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
    %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
    %rep (%2+63) / 64  ; assume 64 byte cache lines
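; Worked example (from the two lines above): with mmsize == 16 the plane_copy
; loop below passes %2 = 4*mmsize = 64, so %%i = 256 and %rep runs
; (64+63)/64 = 1 time -- one prefetcht0 per 64-byte cache line, issued four
; iterations ahead of the current read position.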
;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, intptr_t i_dst,
;                       pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of mmsize, and i_dst>w
%macro PLANE_COPY_CORE 0
cglobal plane_copy_core, 6,7
    FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0
    lea        r6, [r4+4*mmsize]
    PREFETCHNT_ITER r2+r6, 4*mmsize
    movu       m0, [r2+r6-4*mmsize]
    movu       m1, [r2+r6-3*mmsize]
    movu       m2, [r2+r6-2*mmsize]
    movu       m3, [r2+r6-1*mmsize]
    movnta     [r0+r6-4*mmsize], m0
    movnta     [r0+r6-3*mmsize], m1
    movnta     [r0+r6-2*mmsize], m2
    movnta     [r0+r6-1*mmsize], m3
    PREFETCHNT_ITER r2+r6, 4*mmsize
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
    mov%4      m0, [%2+(x/2)*mmsize]
    mov%4      m1, [%3+(x/2)*mmsize]
    mov%5a     [%1+(x+0)*mmsize], m0
    mov%5a     [%1+(x+1)*mmsize], m2
%endif ; HIGH_BIT_DEPTH
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
    mova       m0, [%3+(n+0)*mmsize]
    mova       m1, [%3+(n+1)*mmsize]
    mov%6      [%1+(n/2)*mmsize], m0
    mov%6      [%2+(n/2)*mmsize], m2
%else ; !HIGH_BIT_DEPTH
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH
%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 6,9
    FIX_STRIDES r1, r3, r5, r6d
    shr        t1, SIZEOF_PIXEL
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add        r6, 16*SIZEOF_PIXEL
    movntq     [r0+r6*2+(n+ 0)], m0
    movntq     [r0+r6*2+(n+ 8)], m0
    movntq     [r0+r6*2+(n+16)], m0
    movntq     [r0+r6*2+(n+24)], m0
    movntdq    [r0+r6*2+(n+ 0)], m0
    movntdq    [r0+r6*2+(n+16)], m0
    add        r6, 16*SIZEOF_PIXEL
;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add        r2, FDEC_STRIDEB*2
    add        r3, FDEC_STRIDEB*2
%endmacro ; PLANE_INTERLEAVE
%macro DEINTERLEAVE_START 0
%elif cpuflag(ssse3)
    mova       m4, [deinterleave_shuf]
%endif ; HIGH_BIT_DEPTH
%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
    FIX_STRIDES r1, r3, r5, r6d
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add        r6, 16*SIZEOF_PIXEL
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE r0+ 0,           r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add        r0, FENC_STRIDEB*2
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE r0+ 0,           r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add        r0, FDEC_STRIDEB*2
%endmacro ; PLANE_DEINTERLEAVE
%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
    mova       m3, [deinterleave_rgb_shuf+(%1-3)*16]
    movu       m1, [%8+%1*mmsize/4]
    pshufb     m0, m3 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    pshufb     m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    punpcklqdq m0, m1 ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
    punpcklqdq m2, m1 ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
    punpckhbw  m1, m0, m3 ; b4 b5 g4 g5 r4 r5
    punpcklbw  m0, m3     ; b0 b1 g0 g1 r0 r1
    punpckhbw  m3, m2, m4 ; b6 b7 g6 g7 r6 r7
    punpcklbw  m2, m4     ; b2 b3 g2 g3 r2 r3
    punpcklwd  m0, m2     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd  m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    pshufd     m3, m0, q2301
    pshufd     m4, m1, q2301
    punpckhbw  m2, m0, m3 ; b2 b3 g2 g3 r2 r3
    punpcklbw  m0, m3     ; b0 b1 g0 g1 r0 r1
    punpckhbw  m3, m1, m4 ; b6 b7 g6 g7 r6 r7
    punpcklbw  m1, m4     ; b4 b5 g4 g5 r4 r5
    punpcklwd  m0, m2     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd  m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    punpckldq  m2, m0, m1 ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
    punpckhdq  m0, m1     ; r0 r1 r2 r3 r4 r5 r6 r7
%macro PLANE_DEINTERLEAVE_RGB 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
;                                        pixel *dstb, intptr_t i_dstb,
;                                        pixel *dstc, intptr_t i_dstc,
;                                        pixel *src,  intptr_t i_src, int pw, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave_rgb, 8,12
%define %%args r1, r3, r5, r7, r8, r9, r10, r11
cglobal plane_copy_deinterleave_rgb, 1,7
%define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
    PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
    PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA
%if HIGH_BIT_DEPTH == 0
PLANE_DEINTERLEAVE_RGB
PLANE_DEINTERLEAVE_RGB
%endif ; !HIGH_BIT_DEPTH
%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
;                                         uint16_t *dstc, intptr_t i_dstc,
;                                         uint32_t *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave_v210, 8,10,7
cglobal plane_copy_deinterleave_v210, 7,7,7
    FIX_STRIDES r1, r3, r6d
    mova       m2, [v210_mask]
    mova       m3, [v210_luma_shuf]
    mova       m4, [v210_chroma_shuf]
    mova       m5, [v210_mult] ; also functions as vpermd index for avx2
    pshufd     m6, m5, q1102
    pmulhrsw   m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
    pmulhrsw   m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%endmacro ; PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
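; Caller-side sketch of the implied contract (illustrative, not from the
; source): round the byte count up before calling, e.g.
;   n = (n + 127) & ~127;  /* memzero SSE: n must be a multiple of 128 */
;   n = (n +  15) & ~15;   /* memcpy SSE:  n must be a multiple of 16  */
; and keep the pointers 16-byte aligned; there is no tail handling here.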
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned, 3,3
    mova       m0, [r1+r2-1*mmsize]
    mova       m1, [r1+r2-2*mmsize]
    mova       [r0+r2-1*mmsize], m0
    mova       [r0+r2-2*mmsize], m1
    mova       m0, [r1+r2-1*mmsize]
    mova       m1, [r1+r2-2*mmsize]
    mova       m2, [r1+r2-3*mmsize]
    mova       m3, [r1+r2-4*mmsize]
    mova       [r0+r2-1*mmsize], m0
    mova       [r0+r2-2*mmsize], m1
    mova       [r0+r2-3*mmsize], m2
    mova       [r0+r2-4*mmsize], m3
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned, 2,2
    mova       [r0 + r1 + i], m0
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    paddw      m1, [r0+r2*2+mmsize]
    mova       [r3+r2*2+mmsize], m1
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    mpsadbw    m2, m0, m4, 100100b
    mpsadbw    m3, m1, m4, 100100b
    mpsadbw    m2, m0, m4, 100b
    mpsadbw    m3, m1, m4, 100b
    paddw      m1, [r0+r2*2+mmsize]
    mova       [r3+r2*2+mmsize], m1
%endif ; !HIGH_BIT_DEPTH
%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
    mova       m1, [r2+r1+mmsize]
    psubw      m1, [r0+r1+mmsize]
    mova       [r0+r1+mmsize], m1
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    shufpd     m0, [r0+r2+16], 1
    shufpd     m1, [r4+r2+16], 1
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    paddw      m0, m2, [r0+r2+8]
    pavgb      %4, [r0+r5*2+%7]
    PALIGNR    %1, %3, 1, m6
    PALIGNR    %2, %4, 1, m6
    pavgb      m2, m3, [r0+1]
    pavgb      m3, [r0+r5*2+1]
    mova       m3, [r0+r5+mmsize]
    pavgb      m2, m3, [r0+mmsize]
    movu       m5, [r0+r5+1+mmsize]
    pavgb      m4, m5, [r0+1+mmsize]
    pavgb      m3, [r0+r5*2+mmsize]
    pavgb      m5, [r0+r5*2+1+mmsize]
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq     m0, m0, q3120
    vpermq     m1, m4, q3120
    vpermq     m2, m2, q3120
    vpermq     m3, m5, q3120
    mova       m3, [r0+%4+mmsize]
    pavgb      m3, [r0+%4+r5+mmsize]
    pavgb      m2, [r0+%4+r5]
    PALIGNR    %1, m3, 1, m6
    PALIGNR    m3, m2, 1, m6
    vpperm     m5, m3, %1, m7
    vpperm     m3, m3, %1, m6
    pavgb      m3, [r0+%3+r5+8]
    pavgb      m2, [r0+%3+r5]
    pavgb      m1, [r0+%3+r5+9]
    pavgb      m0, [r0+%3+r5+1]
    pavgw      m3, [r0+%3+r5+8]
    pavgw      m2, [r0+%3+r5]
    pavgw      m1, [r0+%3+r5+10]
    pavgw      m0, [r0+%3+r5+2]
    mova       m3, [r0+%4+mmsize]
    pavgw      m3, [r0+%4+r5+mmsize]
    pavgw      m2, [r0+%4+r5]
    PALIGNR    %1, m3, 2, m6
    PALIGNR    m3, m2, 2, m6
    vpperm     m5, m3, %1, m7
    vpperm     m3, m3, %1, m6
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
    add        dword r7m, mmsize-1
    and        dword r7m, ~(mmsize-1)
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    mova       m6, [deinterleave_shuf32a]
    mova       m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
%ifidn cpuname, mmx2
    FILT8xA    m0, r1, r2, 0
    FILT8xA    m1, r3, r4, r5
%else ; !HIGH_BIT_DEPTH
    mova       m7, [deinterleave_shuf]
    mova       m6, [deinterleave_shuf32a]
    mova       m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
    FILT32x4U  r1, r2, r3, r4
    FILT8x4    m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4    m2, m3, m0, m1, m4, m5, 0
    vpperm     m4, m2, m8, m7
    vpperm     m2, m2, m8, m6
    vpperm     m5, m3, m9, m7
    vpperm     m3, m3, m9, m6
%elifidn cpuname, mmx2
    FILT16x2   m0, r1, r2, 0
    FILT16x2   m1, r3, r4, r5
%endif ; HIGH_BIT_DEPTH
%endmacro ; FRAME_INIT_LOWRES
%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost, 6,6,7
    movq       m2, [r2+r5] ; intra
    movq       m0, [r4+r5] ; invq
    movq       m3, [r3+r5] ; inter
    movq       m1, [r1+r5] ; prop
    fmaddps    m0, m0, m6, m1
    fnmaddps   m3, m1, m3, m2
    mulps      m0, m6 ; intra*invq*fps_factor>>8
    cvtdq2ps   m1, m1 ; prop
    addps      m0, m1 ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps   m1, m2 ; intra
    psubd      m2, m3 ; intra - inter
    cvtdq2ps   m2, m2 ; intra - inter
    rcpps      m3, m1 ; 1 / intra 1st approximation
    mulps      m1, m3 ; intra * (1/intra 1st approx)
    mulps      m1, m3 ; intra * (1/intra 1st approx)^2
    mulps      m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps      m3, m3 ; 2 * (1/intra 1st approx)
    subps      m3, m1 ; 2nd approximation for 1/intra
    mulps      m0, m3 ; / intra
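; The rcpps refinement above is one Newton-Raphson step for f(x) = 1/x - a:
;   x1 = x0*(2 - a*x0) = 2*x0 - a*x0^2
; computed here as m3 = (m3+m3) - intra*m3^2, which roughly doubles the
; ~12-bit accuracy of rcpps -- close enough for this purpose and cheaper
; than a full divps.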
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
%macro INT16_UNPACK 1
    punpckhwd  xm4, xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1
; FIXME: align loads to 16 bytes
cglobal mbtree_propagate_cost, 6,6,%1
    vbroadcastss m6, [r5]
%if notcpuflag(avx2)
    pmovzxwd   m0, [r2+r5] ; intra
    pmovzxwd   m1, [r4+r5] ; invq
    pmovzxwd   m2, [r1+r5] ; prop
    pand       xm3, xm5, [r3+r5] ; inter
    fmaddps    m1, m1, m6, m2
    fnmaddps   m4, m2, m3, m4
    pand       xm3, xm5, [r3+r5]
    mulps      m1, m6 ; intra*invq*fps_factor>>8
    addps      m1, m2 ; prop + (intra*invq*fps_factor>>8)
    rcpps      m3, m0 ; 1 / intra 1st approximation
    mulps      m2, m0, m3 ; intra * (1/intra 1st approx)
    mulps      m2, m3 ; intra * (1/intra 1st approx)^2
    mulps      m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps      m3, m3 ; 2 * (1/intra 1st approx)
    subps      m3, m2 ; 2nd approximation for 1/intra
    mulps      m1, m3 ; / intra
    vextractf128 xm2, m1, 1
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
;                                      int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
    movh       m6, [pw_0to15] ; mb_x
    punpcklwd  m6, m7 ; 0 y 1 y 2 y 3 y
    SPLATW     m7, m7 ; bipred_weight
    psllw      m7, 9  ; bipred_weight << 9
    mova       m5, [pw_0xc000]
    pmulhrsw   m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
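; Why the <<9 above works: pmulhrsw computes (a*b + 0x4000) >> 15, so with
; m7 = bipred_weight << 9 the product is
;   (propagate_amount * (bipred_weight<<9) + 2^14) >> 15
;     = (propagate_amount * bipred_weight + 32) >> 6
; which is exactly the rounding stated in the comment.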
    pblendvb   m5, m3, m5, m4
    por        m5, m4 ; if( lists_used == 3 )
    ;               propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    movu       m0, [r0+r4*4] ; x,y
    movu       m1, [r0+r4*4+mmsize]
    paddw      m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw      m6, m4 ; {mbx, mby} += {4, 0}
    paddw      m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw      m6, m4 ; {mbx, mby} += {4, 0}
    mova       [r3+mmsize*0], m2
    mova       [r3+mmsize*1], m3
    pand       m0, m3 ; x &= 31
    pand       m1, m3 ; y &= 31
    pandn      m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw
    psubw      m3, m0 ; 32 - x
    psubw      m4, m1 ; (32 - y) << 5
    pmullw     m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
    pmullw     m4, m0     ; idx1weight = (32-y)*x << 5
    pmullw     m0, m1     ; idx3weight = y*x << 5
    pmullw     m1, m3     ; idx2weight = y*(32-x) << 5
    ; avoid overflow in the input to pmulhrsw
    psubw      m2, m3 ; idx0weight -= (idx0weight == 32768)
    pmulhrsw   m2, m5 ; idx0weight * propagate_amount + 512 >> 10
    pmulhrsw   m4, m5 ; idx1weight * propagate_amount + 512 >> 10
    pmulhrsw   m1, m5 ; idx2weight * propagate_amount + 512 >> 10
    pmulhrsw   m0, m5 ; idx3weight * propagate_amount + 512 >> 10
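; Note that the four bilinear weights sum to
;   (32-x)*(32-y) + x*(32-y) + (32-x)*y + x*y = 32*32 = 1024
; so after the <<5 scaling and the pmulhrsw rounding, the four outputs
; redistribute approximately the whole propagate_amount across the four
; neighboring macroblocks, with no net gain.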
    SBUTTERFLY wd, 2, 4, 3
    SBUTTERFLY wd, 1, 0, 3
    mova       [r3+mmsize*2], m2
    mova       [r3+mmsize*3], m4
    mova       [r3+mmsize*4], m1
    mova       [r3+mmsize*5], m0
MBTREE_PROPAGATE_LIST
MBTREE_PROPAGATE_LIST