;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

SECTION_RODATA 32

pw_1024:    times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf:  times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
%if HIGH_BIT_DEPTH
copy_swap_shuf:   times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
v210_mask:        times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf:   times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged into the 3 LSBs of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
           dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
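; (vpermd only reads the low 3 bits of each index dword, so this constant
; doubles as the shuffle vector: the multipliers above are chosen so that
; the low 3 bits of each dword equal the required permute index.)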
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
                       db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1

deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
                         db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf:   db  1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14

pf_256:    times 4 dd 256.0
pf_inv256: times 4 dd 0.00390625
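; 256.0 and 1/256 = 0.00390625: scale factors for the 8.8 fixed-point
; ("fix8") format converted by mbtree_fix8_pack/unpack below.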
pd_0f: times 4 dd 0xffff

pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512

tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1

pw_0xc000: times 8 dw 0xc000
    psubw %1, %2 ; a-5*b+4*c
    paddw %1, %3 ; a-5*b+20*c

    psraw %1, 2  ; (a-b)/4
    psubw %1, %2 ; (a-b)/4-b
    paddw %1, %3 ; (a-b)/4-b+c
    psraw %1, 2  ; ((a-b)/4-b+c)/4
    paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
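; A scalar sketch of the arithmetic above (a, b and c are the outer, middle
; and inner tap-pair sums of the 6-tap hpel filter; the shift form equals
; (a - 5*b + 20*c)/16 up to the truncation inherent in psraw):
;   static int filt_h( int a, int b, int c )
;   {
;       return ((((a - b) >> 2) - b + c) >> 2) + c;
;   }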
%if HIGH_BIT_DEPTH
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter with temporal writes may be a win if the last level
;cache is big enough (preliminary benching suggests on the order of 4* framesize).

;%define movntps movaps
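; For reference (ISA facts, not code from this file): movntps is the
; streaming store _mm_stream_ps, which bypasses the cache; movaps is the
; ordinary cached store _mm_store_ps. Uncommenting the define above turns
; every non-temporal store in these routines into a temporal one.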
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,11
    mova  m7, [pw_pixel_max]
    mova  m5, [r1+r3+mmsize]
    mova  m6, [r1+r3*2+mmsize]
    paddw m4, [r5+r3*2+mmsize]
    paddw m5, [r5+r3+mmsize]
    paddw m6, [r5+mmsize]
    FILT_V2 m1, m2, m3, m4, m5, m6
    mova  [r2+r4+mmsize], m4
    FILT_PACK m1, m4, m6, 5, s10
    mova  [r0+r4+mmsize], m4

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
    CLIPW m1, [pb_0], [pw_pixel_max]

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
    mova  m0, [pw_pixel_max]
    movu  m4, [src-4+mmsize]
    movu  m5, [src-2+mmsize]
    movu  m7, [src+4+mmsize]
    movu  m6, [src+6+mmsize]
    movu  m7, [src+2+mmsize]
    mova  m6, [src+0+mmsize]
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 1
    mova  [r0+r2+mmsize], m4
%endmacro ; HPEL_FILTER

%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
    mova  m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw m3, [filt_mul20]
    pmaddubsw m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7     ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    FILT_V2 m1, m2, m3, m4, m5, m6
    mova  [r2+r4*2+mmsize/2], xm4
    vextracti128 [r2+r4*2+mmsize], m1, 1
    vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
    mova  [r2+r4*2+mmsize], m4
    FILT_PACK m1, m4, m7, 5

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3
    paddw m3, [src+2]  ; c0
    paddw m4, [src+14] ; a1
    paddw m5, [src+12] ; b1
    paddw m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 6

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 1

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
%ifnidn cpuname, sse2
%define pw_rnd [pw_32]
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
    movu  m3, [src-4+mmsize]
    movu  m2, [src-2+mmsize]
    mova  m1, [src+0+mmsize]
    paddw m3, [src+6+mmsize]
    paddw m2, [src+4+mmsize]
    paddw m1, [src+2+mmsize]
    FILT_H2 m4, m5, m6, m3, m2, m1
    PALIGNR m4, m1, m0, 12, m7
    PALIGNR m5, m1, m0, 14, m0
    PALIGNR m0, m2, m1, 6, m7
    PALIGNR m0, m2, m1, 4, m7
    PALIGNR m6, m2, m1, 2, m7
    PALIGNR m2, m1, 12, m7
    PALIGNR m5, m1, 14, m1
    PALIGNR m3, m1, m0, 6, m7
    PALIGNR m6, m1, m0, 4, m7
    PALIGNR m6, m1, m0, 2, m7
    FILT_PACK m4, m3, pw_rnd, 6

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3,8
    mova  m7, [pw_1] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 1

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
; Using unaligned loads instead of palignr is marginally slower on Sandy Bridge
; and significantly slower on Bulldozer, despite their fast load units -- even
; though it would let us avoid the repeated loads of constants for pmaddubsw.
    palignr m3, m1, m0, 14
    palignr m4, m1, m0, 15
    palignr m0, m2, m1, 2
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    palignr m5, m2, m1, 1
    palignr m6, m2, m1, 3
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    FILT_PACK m3, m4, m7, 5
    pshufb m3, [hpel_shuf]

cglobal hpel_filter_h, 3,3,8
    mova  m5, [filt_mul15]
    mova  m6, [filt_mul20]
    mova  m7, [filt_mul51]
    FILT_PACK m0, m1, m2, 5
    pshufb m0, [hpel_shuf]

;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+mmsize is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+mmsize]
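; Roughly _mm_prefetch( src + 2*stride + mmsize, _MM_HINT_T0 ) in intrinsics
; terms, assuming r1/r2 hold src/stride here -- a sketch of the distance,
; not a drop-in replacement.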
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2 m1, m2, m3, m4, m5, m6
    vinserti128 %1, m1, xm4, 1
    vperm2i128 %2, m1, m4, q0301
    FILT_PACK m1, m4, m15, 5
    movntps [r8+r4+%5], m1

    vperm2i128 m3, %2, %1, q0003
    PALIGNR m1, %2, %1, (mmsize-4), m3
    PALIGNR m2, %2, %1, (mmsize-2), m3
    vperm2i128 %1, %3, %2, q0003
    PALIGNR m3, %3, %2, 4, %1
    PALIGNR m4, %3, %2, 2, %1
    PALIGNR %3, %3, %2, 6, m2
    FILT_PACK %3, %4, m15, 6

    vperm2i128 m3, %2, %1, q0003
    PALIGNR m1, %2, %1, (mmsize-2), m3
    PALIGNR m2, %2, %1, (mmsize-1), m3
    vperm2i128 m3, %3, %2, q0003
    PALIGNR m4, %3, %2, 1, m3
    PALIGNR m5, %3, %2, 2, m3
    PALIGNR m6, %3, %2, 3, m3
    FILT_PACK m1, m2, m15, 5
    pshufb m1, [hpel_shuf]
    ADD8TO16 m1, m6, m12, m3, m0 ; a
    ADD8TO16 m2, m5, m12, m3, m0 ; b
    ADD8TO16 %2, m4, m12, m3, m0 ; c
    FILT_V2 m1, m2, %2, m6, m5, m4
    FILT_PACK m1, m6, m15, 5

;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
    mova  m0, [filt_mul51]
    mova  m12, [filt_mul15]
    mova  m14, [filt_mul20]
    DO_FILT_V m8, m7, m13, m12, 0
    DO_FILT_V m6, m5, m11, m12, mmsize
    psrlw m15, 1   ; pw_512
    paddw m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    paddw m15, m15 ; pw_1024
    DO_FILT_H m10, m13, m11
    ; setup regs for next y

%endif ; !HIGH_BIT_DEPTH
%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
    %assign %%i 4*(%2) ; prefetch 4 iterations ahead. Is this optimal?
    %rep (%2+63) / 64  ; assume 64-byte cache lines
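; A rough C equivalent of one expansion of this macro (a sketch; assumes a
; non-temporal hint, per the macro name, and the 4-iteration lookahead):
;   for( int off = 0; off < bytes_per_iter; off += 64 )
;       _mm_prefetch( src + 4*bytes_per_iter + off, _MM_HINT_NTA );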
;-----------------------------------------------------------------------------
; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst,
;                              pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of mmsize, and i_dst>w
%macro PLANE_COPY_CORE 1 ; swap
cglobal plane_copy_swap_core, 6,7
    mova  m4, [copy_swap_shuf]
cglobal plane_copy_core, 6,7
%if %1 && HIGH_BIT_DEPTH
%elif %1 || HIGH_BIT_DEPTH
    lea   r6, [r4+4*mmsize]
    PREFETCHNT_ITER r2+r6, 4*mmsize
    movu  m0, [r2+r6-4*mmsize]
    movu  m1, [r2+r6-3*mmsize]
    movu  m2, [r2+r6-2*mmsize]
    movu  m3, [r2+r6-1*mmsize]
    movnta [r0+r6-4*mmsize], m0
    movnta [r0+r6-3*mmsize], m1
    movnta [r0+r6-2*mmsize], m2
    movnta [r0+r6-1*mmsize], m3
    PREFETCHNT_ITER r2+r6, 4*mmsize
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
    mov%4 m0, [%2+(x/2)*mmsize]
    mov%4 m1, [%3+(x/2)*mmsize]
    punpckhwd m2, m0, m1
    mov%5a [%1+(x+0)*mmsize], m0
    mov%5a [%1+(x+1)*mmsize], m2
    punpckhbw m2, m0, m1
%endif ; HIGH_BIT_DEPTH

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
    mova  m0, [%3+(n+0)*mmsize]
    mova  m1, [%3+(n+1)*mmsize]
    mov%6 [%1+(n/2)*mmsize], m0
    mov%6 [%2+(n/2)*mmsize], m2
%else ; !HIGH_BIT_DEPTH
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH
%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 6,9
    FIX_STRIDES r1, r3, r5, r6d
    shr   t1, SIZEOF_PIXEL
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add   r6, 16*SIZEOF_PIXEL
    movntq  [r0+r6*2+(n+ 0)], m0
    movntq  [r0+r6*2+(n+ 8)], m0
    movntq  [r0+r6*2+(n+16)], m0
    movntq  [r0+r6*2+(n+24)], m0
    movntdq [r0+r6*2+(n+ 0)], m0
    movntdq [r0+r6*2+(n+16)], m0
    add   r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    INTERLEAVE r0+ 0, r2+ 0,           r3+ 0,           a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add   r2, FDEC_STRIDEB*2
    add   r3, FDEC_STRIDEB*2
%endmacro ; PLANE_INTERLEAVE
%macro DEINTERLEAVE_START 0
%elif cpuflag(ssse3)
    mova  m4, [deinterleave_shuf]
%endif ; HIGH_BIT_DEPTH

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
    FIX_STRIDES r1, r3, r5, r6d
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add   r6, 16*SIZEOF_PIXEL
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE r0+ 0,           r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add   r0, FENC_STRIDEB*2

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE r0+ 0,           r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add   r0, FDEC_STRIDEB*2
%endmacro ; PLANE_DEINTERLEAVE
%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
    mova  m3, [deinterleave_rgb_shuf+(%1-3)*16]
    movu  m1, [%8+%1*mmsize/4]
    pshufb m0, m3        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    pshufb m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    punpcklqdq m0, m1    ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5 __ __
    punpcklqdq m2, m1    ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7 __ __
    punpckhbw m1, m0, m3 ; b4 b5 g4 g5 r4 r5
    punpcklbw m0, m3     ; b0 b1 g0 g1 r0 r1
    punpckhbw m3, m2, m4 ; b6 b7 g6 g7 r6 r7
    punpcklbw m2, m4     ; b2 b3 g2 g3 r2 r3
    punpcklwd m0, m2     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    pshufd m3, m0, q2301
    pshufd m4, m1, q2301
    punpckhbw m2, m0, m3 ; b2 b3 g2 g3 r2 r3
    punpcklbw m0, m3     ; b0 b1 g0 g1 r0 r1
    punpckhbw m3, m1, m4 ; b6 b7 g6 g7 r6 r7
    punpcklbw m1, m4     ; b4 b5 g4 g5 r4 r5
    punpcklwd m0, m2     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    punpckldq m2, m0, m1 ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
    punpckhdq m0, m1     ; r0 r1 r2 r3 r4 r5 r6 r7
%macro PLANE_DEINTERLEAVE_RGB 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
;                                        pixel *dstb, intptr_t i_dstb,
;                                        pixel *dstc, intptr_t i_dstc,
;                                        pixel *src,  intptr_t i_src, int pw, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave_rgb, 8,12
%define %%args r1, r3, r5, r7, r8, r9, r10, r11
cglobal plane_copy_deinterleave_rgb, 1,7
%define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
    PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
    PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA

%if HIGH_BIT_DEPTH == 0
PLANE_DEINTERLEAVE_RGB
PLANE_DEINTERLEAVE_RGB
%endif ; !HIGH_BIT_DEPTH
%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
;                                         uint16_t *dstc, intptr_t i_dstc,
;                                         uint32_t *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
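; Scalar reference for the v210 unpacking done here (a sketch; each 32-bit
; word packs three 10-bit components, with the words alternating between
; chroma-luma-chroma and luma-chroma-luma order):
;   for( int n = 0; n < w; n += 3 )
;   {
;       uint32_t a = *src++, b = *src++;
;       *dstc++ =  a        & 0x3ff; /* u */
;       *dsty++ = (a >> 10) & 0x3ff; /* y */
;       *dstc++ = (a >> 20) & 0x3ff; /* v */
;       *dsty++ =  b        & 0x3ff; /* y */
;       *dstc++ = (b >> 10) & 0x3ff; /* u */
;       *dsty++ = (b >> 20) & 0x3ff; /* y */
;   }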
cglobal plane_copy_deinterleave_v210, 8,10,7
cglobal plane_copy_deinterleave_v210, 7,7,7
    FIX_STRIDES r1, r3, r6d
    mova  m2, [v210_mask]
    mova  m3, [v210_luma_shuf]
    mova  m4, [v210_chroma_shuf]
    mova  m5, [v210_mult] ; also functions as vpermd index for avx2
    pshufd m6, m5, q1102
    pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
    pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%endmacro ; PLANE_DEINTERLEAVE_V210

PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
; These functions are not general-use; not only do the SSE ones require aligned input,
; they will also fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
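; Caller-side contract in C terms (illustration only):
;   /* dst/src must be vector-aligned and n a multiple of 16 (a multiple of
;      128 for the SSE memzero), else behavior is undefined */
;   memcpy_aligned( dst, src, n );
;   memzero_aligned( dst, n );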
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned, 3,3
    mova  m0, [r1+r2-1*mmsize]
    mova  m1, [r1+r2-2*mmsize]
    mova  [r0+r2-1*mmsize], m0
    mova  [r0+r2-2*mmsize], m1
    mova  m0, [r1+r2-1*mmsize]
    mova  m1, [r1+r2-2*mmsize]
    mova  m2, [r1+r2-3*mmsize]
    mova  m3, [r1+r2-4*mmsize]
    mova  [r0+r2-1*mmsize], m0
    mova  [r0+r2-2*mmsize], m1
    mova  [r0+r2-3*mmsize], m2
    mova  [r0+r2-4*mmsize], m3

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned, 2,2
    mova  [r0+r1+i], m0
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
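; Scalar sketch of the routine (a running 4-tap horizontal sum added to the
; previous integral row; exact loop bounds are the C fallback's business):
;   int v = pix[0]+pix[1]+pix[2]+pix[3];
;   for( int x = 0; x < stride-4; x++ )
;   {
;       sum[x] = v + sum[x-stride];
;       v += pix[x+4] - pix[x];
;   }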
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    mova  xm1, [r1+r2+16]
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
    paddw m1, [r0+r2*2+mmsize]
    mova  [r3+r2*2+mmsize], m1

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    mova  xm1, [r1+r2+16]
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
    mpsadbw m2, m0, m4, 100100b
    mpsadbw m3, m1, m4, 100100b
    mpsadbw m2, m0, m4, 100b
    mpsadbw m3, m1, m4, 100b
    paddw m1, [r0+r2*2+mmsize]
    mova  [r3+r2*2+mmsize], m1
%endif ; !HIGH_BIT_DEPTH
%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
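; Scalar sketch (each output is the difference between the horizontal sums
; 8 rows apart, assuming rows of 'stride' words, as the loads below do):
;   for( int x = 0; x < stride; x++ )
;       sum8[x] = sum8[x+8*stride] - sum8[x];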
cglobal integral_init8v, 3,3
    mova  m1, [r2+r1+mmsize]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1+mmsize], m1

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    paddw m0, m2, [r0+r2+8]
    pavgb %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
    pavgb m2, m3, [r0+1]
    pavgb m3, [r0+r5*2+1]
    mova  m3, [r0+r5+mmsize]
    pavgb m2, m3, [r0+mmsize]
    movu  m5, [r0+r5+1+mmsize]
    pavgb m4, m5, [r0+1+mmsize]
    pavgb m3, [r0+r5*2+mmsize]
    pavgb m5, [r0+r5*2+1+mmsize]
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq m0, m0, q3120
    vpermq m1, m4, q3120
    vpermq m2, m2, q3120
    vpermq m3, m5, q3120
    mova  m3, [r0+%4+mmsize]
    pavgb m3, [r0+%4+r5+mmsize]
    pavgb m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
    pavgb m3, [r0+%3+r5+8]
    pavgb m2, [r0+%3+r5]
    pavgb m1, [r0+%3+r5+9]
    pavgb m0, [r0+%3+r5+1]
    pavgw m3, [r0+%3+r5+8]
    pavgw m2, [r0+%3+r5]
    pavgw m1, [r0+%3+r5+10]
    pavgw m0, [r0+%3+r5+2]
    mova  m3, [r0+%4+mmsize]
    pavgw m3, [r0+%4+r5+mmsize]
    pavgw m2, [r0+%4+r5]
    PALIGNR %1, m3, 2, m6
    PALIGNR m3, m2, 2, m6
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
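; Scalar reference for one output pixel of each lowres plane (a sketch in the
; style of x264's C fallback; the nested-pavgb rounding matches the asm and is
; not a plain 4-tap average). src1/src2 are the next two source rows:
;   #define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1)
;   dst0[x] = FILTER( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
;   dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;   dstv[x] = FILTER( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
;   dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );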
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
    add   dword r7m, mmsize-1
    and   dword r7m, ~(mmsize-1)
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    mova  m6, [deinterleave_shuf32a]
    mova  m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
%ifidn cpuname, mmx2
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%else ; !HIGH_BIT_DEPTH
    mova  m7, [deinterleave_shuf]
    mova  m6, [deinterleave_shuf32a]
    mova  m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
    FILT32x4U r1, r2, r3, r4
    FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4 m2, m3, m0, m1, m4, m5, 0
    vpperm m4, m2, m8, m7
    vpperm m2, m2, m8, m6
    vpperm m5, m3, m9, m7
    vpperm m3, m3, m9, m6
%elifidn cpuname, mmx2
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endif ; HIGH_BIT_DEPTH
%endmacro ; FRAME_INIT_LOWRES

%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost, 6,6,7
    movq  m2, [r2+r5] ; intra
    movq  m0, [r4+r5] ; invq
    movq  m3, [r3+r5] ; inter
    movq  m1, [r1+r5] ; prop
    fmaddps m0, m0, m6, m1
    fnmaddps m3, m1, m3, m2
    mulps m0, m6      ; intra*invq*fps_factor>>8
    cvtdq2ps m1, m1   ; prop
    addps m0, m1      ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps m1, m2   ; intra
    psubd m2, m3      ; intra - inter
    cvtdq2ps m2, m2   ; intra - inter
    rcpps m3, m1      ; 1 / intra 1st approximation
    mulps m1, m3      ; intra * (1/intra 1st approx)
    mulps m1, m3      ; intra * (1/intra 1st approx)^2
    mulps m0, m2      ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps m3, m3      ; 2 * (1/intra 1st approx)
    subps m3, m1      ; 2nd approximation for 1/intra
    mulps m0, m3      ; / intra
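; Scalar equivalent of the block above (a sketch assembled from the comments;
; the rcpps + Newton-Raphson pair approximates division: with x0 = rcpps(a),
; x1 = x0*(2 - a*x0) roughly doubles the precision of 1/a):
;   float fps    = *fps_factor / 256.0f; /* the ">>8" in the comments */
;   float intra  = intra_costs[i] * inv_qscales[i] * fps;
;   float amount = propagate_in[i] + intra;
;   dst[i]       = amount * (intra_costs[i] - inter_costs[i]) / intra_costs[i];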
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.

%macro INT16_UNPACK 1
    punpckhwd xm4, xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1

; FIXME: align loads to 16 bytes
cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
    vbroadcastss m6, [r5]
%if notcpuflag(avx2)
    pmovzxwd m0, [r2+r5]    ; intra
    pmovzxwd m1, [r4+r5]    ; invq
    pmovzxwd m2, [r1+r5]    ; prop
    pand  xm3, xm5, [r3+r5] ; inter
    fmaddps m1, m1, m6, m2
    fnmaddps m4, m2, m3, m4
    pand  xm3, xm5, [r3+r5]
    mulps m1, m6      ; intra*invq*fps_factor>>8
    addps m1, m2      ; prop + (intra*invq*fps_factor>>8)
    rcpps m3, m0      ; 1 / intra 1st approximation
    mulps m2, m0, m3  ; intra * (1/intra 1st approx)
    mulps m2, m3      ; intra * (1/intra 1st approx)^2
    mulps m1, m4      ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps m3, m3      ; 2 * (1/intra 1st approx)
    subps m3, m2      ; 2nd approximation for 1/intra
    mulps m1, m3      ; / intra
    vextractf128 xm2, m1, 1
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
;                                      int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
    movh  m6, [pw_0to15] ; mb_x
    punpcklwd m6, m7     ; 0 y 1 y 2 y 3 y
    SPLATW m7, m7        ; bipred_weight
    psllw m7, 9          ; bipred_weight << 9
    mova  m5, [pw_0xc000]
    pmulhrsw m5, m3, m7  ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    pblendvb m5, m3, m5, m4
    por   m5, m4         ; if( lists_used == 3 )
                         ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    movu  m0, [r0+r4*4]  ; x,y
    movu  m1, [r0+r4*4+mmsize]
    paddw m2, m6         ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw m6, m4         ; {mbx, mby} += {4, 0}
    paddw m3, m6         ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw m6, m4         ; {mbx, mby} += {4, 0}
    mova  [r3+mmsize*0], m2
    mova  [r3+mmsize*1], m3
    pand  m0, m3         ; x &= 31
    pand  m1, m3         ; y &= 31
    pandn m1, m3         ; y premultiplied by (1<<5) for later use of pmulhrsw
    psubw m3, m0         ; 32 - x
    psubw m4, m1         ; (32 - y) << 5
    pmullw m2, m3, m4    ; idx0weight = (32-y)*(32-x) << 5
    pmullw m4, m0        ; idx1weight = (32-y)*x << 5
    pmullw m0, m1        ; idx3weight = y*x << 5
    pmullw m1, m3        ; idx2weight = y*(32-x) << 5

    ; avoid overflow in the input to pmulhrsw
    psubw m2, m3         ; idx0weight -= (idx0weight == 32768)

    pmulhrsw m2, m5      ; idx0weight * propagate_amount + 512 >> 10
    pmulhrsw m4, m5      ; idx1weight * propagate_amount + 512 >> 10
    pmulhrsw m1, m5      ; idx2weight * propagate_amount + 512 >> 10
    pmulhrsw m0, m5      ; idx3weight * propagate_amount + 512 >> 10
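; pmulhrsw computes (a*b + 16384) >> 15 on signed words. The largest weight,
; (32-0)*(32-0) << 5 = 32768, does not fit in int16_t, hence the correction
; above. One weighted output in scalar terms (an illustrative sketch):
;   int16_t w   = (int16_t)X264_MIN( (32-x)*(32-y) << 5, 32767 );
;   int16_t out = (int16_t)(((int32_t)w * propagate_amount + 16384) >> 15);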
    SBUTTERFLY wd, 2, 4, 3
    SBUTTERFLY wd, 1, 0, 3
    mova  [r3+mmsize*2], m2
    mova  [r3+mmsize*3], m4
    mova  [r3+mmsize*4], m1
    mova  [r3+mmsize*5], m0

MBTREE_PROPAGATE_LIST
MBTREE_PROPAGATE_LIST

%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
;-----------------------------------------------------------------------------
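; Scalar equivalent (a sketch, up to rounding mode; endian_fix16 is x264's
; byte-swap helper, matching the swap done by mbtree_fix8_pack_shuf above):
;   for( int i = 0; i < count; i++ )
;       dst[i] = endian_fix16( (int16_t)(src[i] * 256.0f) );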
cglobal mbtree_fix8_pack, 3,4
    vbroadcastf128 m2, [pf_256]
    vbroadcasti128 m3, [mbtree_fix8_pack_shuf]
    mova  m3, [mbtree_fix8_pack_shuf]
    movsxdifnidn r2, r2d
    mulps m0, m2, [r1+4*r2]
    mulps m1, m2, [r1+4*r2+mmsize]
    vpermq m0, m0, q3120
    ; Do the remaining values in scalar in order to avoid overreading src.
    mulss xm0, xm2, [r1+4*r2+2*mmsize]
    mov   [r0+2*r2+mmsize], r3w
;-----------------------------------------------------------------------------
; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
;-----------------------------------------------------------------------------
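; Scalar equivalent (a sketch; the inverse of the pack above, with the
; sign-extension handled below by the shuffle + psrad pair):
;   for( int i = 0; i < count; i++ )
;       dst[i] = (int16_t)endian_fix16( src[i] ) * (1.0f/256.0f);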
cglobal mbtree_fix8_unpack, 3,4
    vbroadcastf128 m2, [pf_inv256]
    movaps m2, [pf_inv256]
    mova  m4, [mbtree_fix8_unpack_shuf+16]
    mova  m3, [mbtree_fix8_unpack_shuf]
    movsxdifnidn r2, r2d
    vbroadcasti128 m0, [r1+2*r2]
    vbroadcasti128 m1, [r1+2*r2+16]
    psrad m0, 16 ; sign-extend
    movaps [r0+4*r2], m0
    movaps [r0+4*r2+mmsize], m1
    movzx r3d, word [r1+2*r2+mmsize]
    ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid
    ; dependency on dst, which otherwise causes terrible performance when used in a loop.
    ; Blame Intel for poor design.
    cvtsi2ss xm0, xm2, r3d
    movss [r0+4*r2+2*mmsize], xm0