;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

SECTION_RODATA 32

pw_1024:    times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf:  times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15

%if HIGH_BIT_DEPTH
copy_swap_shuf:   times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
v210_mask:        times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf:   times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
           dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800

deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
                       db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1

deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH

pd_0f: times 4 dd 0xffff

pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512

tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1

pw_0xc000: times 8 dw 0xc000
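
; The constants above serve H.264's 6-tap half-pel kernel (1,-5,20,20,-5,1)/32.
; The symmetric taps are pre-added (a = p[-2]+p[3], b = p[-1]+p[2], c = p[0]+p[1]),
; so each output is, roughly, clip( (a - 5*b + 20*c + 16) >> 5 ).
; filt_mul15/filt_mul51/filt_mul20 hold the coefficients as signed byte pairs
; for pmaddubsw in the ssse3+ paths; tap1/tap2/tap3 hold the same pairs as
; words for the high-bit-depth paths, where pad10/pad20/pad30 and depad bias
; the partial sums so the intermediates stay unsigned.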
    psubw      %1, %2 ; a-5*b+4*c
    paddw      %1, %3 ; a-5*b+20*c

    psraw      %1, 2  ; (a-b)/4
    psubw      %1, %2 ; (a-b)/4-b
    paddw      %1, %3 ; (a-b)/4-b+c
    psraw      %1, 2  ; ((a-b)/4-b+c)/4
    paddw      %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
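
; Note that the factored form is exact: with arithmetic shifts,
; (((a-b)>>2 - b + c)>>2) + c == (a - 5*b + 20*c) >> 4 for all integers,
; since the two floor divisions by 4 compose into a floor division by 16.
; The point of factoring is range: every intermediate stays comfortably
; inside a signed 16-bit lane, whereas a direct 20*c term would overflow
; words on high-bit-depth input.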

%if HIGH_BIT_DEPTH
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter with temporal stores may be a win if the last-level
;cache is big enough (preliminary benching suggests on the order of 4*framesize).
;%define movntps movaps

;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,11
    mova       m7, [pw_pixel_max]
    mova       m5, [r1+r3+mmsize]
    mova       m6, [r1+r3*2+mmsize]
    paddw      m4, [r5+r3*2+mmsize]
    paddw      m5, [r5+r3+mmsize]
    paddw      m6, [r5+mmsize]
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       [r2+r4+mmsize], m4
    FILT_PACK  m1, m4, m6, 5, s10
    mova       [r0+r4+mmsize], m4

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
    CLIPW      m1, [pb_0], [pw_pixel_max]

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
    mova       m0, [pw_pixel_max]
    movu       m4, [src-4+mmsize]
    movu       m5, [src-2+mmsize]
    movu       m7, [src+4+mmsize]
    movu       m6, [src+6+mmsize]
    movu       m7, [src+2+mmsize]
    mova       m6, [src+0+mmsize]
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 1
    mova       [r0+r2+mmsize], m4
%endmacro ; HPEL_FILTER
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
    mova       m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20]
    pmaddubsw  m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7   ; c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       [r2+r4*2+mmsize/2], xm4
    vextracti128 [r2+r4*2+mmsize], m1, 1
    vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
    mova       [r2+r4*2+mmsize], m4
    FILT_PACK  m1, m4, m7, 5

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3
    paddw      m3, [src+2]  ; c0
    paddw      m4, [src+14] ; a1
    paddw      m5, [src+12] ; b1
    paddw      m6, [src+10] ; c1
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 6

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 1

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
%ifnidn cpuname, sse2
%define pw_rnd [pw_32]
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
    movu       m3, [src-4+mmsize]
    movu       m2, [src-2+mmsize]
    mova       m1, [src+0+mmsize]
    paddw      m3, [src+6+mmsize]
    paddw      m2, [src+4+mmsize]
    paddw      m1, [src+2+mmsize]
    FILT_H2    m4, m5, m6, m3, m2, m1
    PALIGNR    m4, m1, m0, 12, m7
    PALIGNR    m5, m1, m0, 14, m0
    PALIGNR    m0, m2, m1, 6, m7
    PALIGNR    m0, m2, m1, 4, m7
    PALIGNR    m6, m2, m1, 2, m7
    PALIGNR    m2, m1, 12, m7
    PALIGNR    m5, m1, 14, m1
    PALIGNR    m3, m1, m0, 6, m7
    PALIGNR    m6, m1, m0, 4, m7
    PALIGNR    m6, m1, m0, 2, m7
    FILT_PACK  m4, m3, pw_rnd, 6

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3,8
    mova       m7, [pw_1] ; FIXME xmm8
    FILT_H2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, m7, 1

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
; Using unaligned loads instead of palignr is marginally slower on SB and significantly
; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
; the repeated loads of constants for pmaddubsw.
    palignr    m3, m1, m0, 14
    palignr    m4, m1, m0, 15
    palignr    m0, m2, m1, 2
    pmaddubsw  m3, [filt_mul15]
    pmaddubsw  m4, [filt_mul15]
    pmaddubsw  m0, [filt_mul51]
    palignr    m5, m2, m1, 1
    palignr    m6, m2, m1, 3
    pmaddubsw  m1, [filt_mul20]
    pmaddubsw  m5, [filt_mul20]
    pmaddubsw  m6, [filt_mul51]
    FILT_PACK  m3, m4, m7, 5
    pshufb     m3, [hpel_shuf]

cglobal hpel_filter_h, 3,3,8
    mova       m5, [filt_mul15]
    mova       m6, [filt_mul20]
    mova       m7, [filt_mul51]
    FILT_PACK  m0, m1, m2, 5
    pshufb     m0, [hpel_shuf]

;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+mmsize is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+mmsize]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    vinserti128 %1, m1, xm4, 1
    vperm2i128 %2, m1, m4, q0301
    FILT_PACK  m1, m4, m15, 5
    movntps    [r8+r4+%5], m1
    vperm2i128 m3, %2, %1, q0003
    PALIGNR    m1, %2, %1, (mmsize-4), m3
    PALIGNR    m2, %2, %1, (mmsize-2), m3
    vperm2i128 %1, %3, %2, q0003
    PALIGNR    m3, %3, %2, 4, %1
    PALIGNR    m4, %3, %2, 2, %1
    PALIGNR    %3, %3, %2, 6, m2
    FILT_PACK  %3, %4, m15, 6

    vperm2i128 m3, %2, %1, q0003
    PALIGNR    m1, %2, %1, (mmsize-2), m3
    PALIGNR    m2, %2, %1, (mmsize-1), m3
    vperm2i128 m3, %3, %2, q0003
    PALIGNR    m4, %3, %2, 1, m3
    PALIGNR    m5, %3, %2, 2, m3
    PALIGNR    m6, %3, %2, 3, m3
    FILT_PACK  m1, m2, m15, 5
    pshufb     m1, [hpel_shuf]
    ADD8TO16   m1, m6, m12, m3, m0 ; a
    ADD8TO16   m2, m5, m12, m3, m0 ; b
    ADD8TO16   %2, m4, m12, m3, m0 ; c
    FILT_V2    m1, m2, %2, m6, m5, m4
    FILT_PACK  m1, m6, m15, 5

;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
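; All three half-pel planes are produced in one pass over the source.
; Per output pixel, roughly:
;   dstv[x] = FILT6( src[x-2*stride .. x+3*stride] ) ; vertical 6-tap
;   dsth[x] = FILT6( src[x-2 .. x+3] )               ; horizontal 6-tap
;   dstc[x] = FILT6( buf[x-2 .. x+3] )               ; horizontal 6-tap over the
;                                                    ; unrounded vertical sums
;                                                    ; kept in buf by DO_FILT_V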
cglobal hpel_filter, 7,9,16
    mova       m0, [filt_mul51]
    mova       m12, [filt_mul15]
    mova       m14, [filt_mul20]
    DO_FILT_V  m8, m7, m13, m12, 0
    DO_FILT_V  m6, m5, m11, m12, mmsize
    psrlw      m15, 1   ; pw_512
    paddw      m15, m15 ; pw_32
    DO_FILT_C  m9, m8, m7, m6
    paddw      m15, m15 ; pw_1024
    DO_FILT_H  m10, m13, m11
    ; setup regs for next y

%endif ; !HIGH_BIT_DEPTH

%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
%assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
%rep (%2+63) / 64  ; assume 64 byte cache lines

;-----------------------------------------------------------------------------
; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst,
;                              pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of mmsize, and i_dst>w
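; Scalar equivalent, roughly: for each of h rows, copy w pixels and advance
; each pointer by its own stride; the swap variant additionally exchanges
; every interleaved sample pair (UV <-> VU) via the copy_swap_shuf pshufb:
;   for( y = 0; y < h; y++, dst += i_dst, src += i_src )
;       for( x = 0; x < w; x += 2 )
;           { dst[x] = src[x+1]; dst[x+1] = src[x]; } ; plain copy: dst[x] = src[x]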
%macro PLANE_COPY_CORE 1 ; swap
cglobal plane_copy_swap_core, 6,7
    mova       m4, [copy_swap_shuf]
cglobal plane_copy_core, 6,7
%if %1 && HIGH_BIT_DEPTH
%elif %1 || HIGH_BIT_DEPTH
    lea        r6, [r4+4*mmsize]
    PREFETCHNT_ITER r2+r6, 4*mmsize
    movu       m0, [r2+r6-4*mmsize]
    movu       m1, [r2+r6-3*mmsize]
    movu       m2, [r2+r6-2*mmsize]
    movu       m3, [r2+r6-1*mmsize]
    movnta     [r0+r6-4*mmsize], m0
    movnta     [r0+r6-3*mmsize], m1
    movnta     [r0+r6-2*mmsize], m2
    movnta     [r0+r6-1*mmsize], m3
    PREFETCHNT_ITER r2+r6, 4*mmsize

%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
    mov%4      m0, [%2+(x/2)*mmsize]
    mov%4      m1, [%3+(x/2)*mmsize]
    punpckhwd  m2, m0, m1
    mov%5a     [%1+(x+0)*mmsize], m0
    mov%5a     [%1+(x+1)*mmsize], m2
    punpckhbw  m2, m0, m1
%endif ; HIGH_BIT_DEPTH

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
    mova       m0, [%3+(n+0)*mmsize]
    mova       m1, [%3+(n+1)*mmsize]
    mov%6      [%1+(n/2)*mmsize], m0
    mov%6      [%2+(n/2)*mmsize], m2
%else ; !HIGH_BIT_DEPTH
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH

%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
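; Scalar equivalent, roughly: dst[2*x] = srcu[x]; dst[2*x+1] = srcv[x];
; i.e. two separate chroma planes are packed into one NV12-style interleaved
; UV plane, row by row (punpckl/hbw and punpckl/hwd do the interleaving).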
cglobal plane_copy_interleave_core, 6,9
    FIX_STRIDES r1, r3, r5, r6d
    shr        t1, SIZEOF_PIXEL
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add        r6, 16*SIZEOF_PIXEL
    movntq     [r0+r6*2+(n+ 0)], m0
    movntq     [r0+r6*2+(n+ 8)], m0
    movntq     [r0+r6*2+(n+16)], m0
    movntq     [r0+r6*2+(n+24)], m0
    movntdq    [r0+r6*2+(n+ 0)], m0
    movntdq    [r0+r6*2+(n+16)], m0
    add        r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add        r2, FDEC_STRIDEB*2
    add        r3, FDEC_STRIDEB*2
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 0
%elif cpuflag(ssse3)
    mova       m4, [deinterleave_shuf]
%endif ; HIGH_BIT_DEPTH

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
    FIX_STRIDES r1, r3, r5, r6d
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add        r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE r0+ 0,           r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add        r0, FENC_STRIDEB*2

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE r0+ 0,           r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add        r0, FDEC_STRIDEB*2
%endmacro ; PLANE_DEINTERLEAVE

%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
    mova       m3, [deinterleave_rgb_shuf+(%1-3)*16]
    movu       m1, [%8+%1*mmsize/4]
    pshufb     m0, m3     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    pshufb     m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    punpcklqdq m0, m1     ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
    punpcklqdq m2, m1     ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
    punpckhbw  m1, m0, m3 ; b4 b5 g4 g5 r4 r5
    punpcklbw  m0, m3     ; b0 b1 g0 g1 r0 r1
    punpckhbw  m3, m2, m4 ; b6 b7 g6 g7 r6 r7
    punpcklbw  m2, m4     ; b2 b3 g2 g3 r2 r3
    punpcklwd  m0, m2     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd  m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    pshufd     m3, m0, q2301
    pshufd     m4, m1, q2301
    punpckhbw  m2, m0, m3 ; b2 b3 g2 g3 r2 r3
    punpcklbw  m0, m3     ; b0 b1 g0 g1 r0 r1
    punpckhbw  m3, m1, m4 ; b6 b7 g6 g7 r6 r7
    punpcklbw  m1, m4     ; b4 b5 g4 g5 r4 r5
    punpcklwd  m0, m2     ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd  m1, m3     ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
    punpckldq  m2, m0, m1 ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
    punpckhdq  m0, m1     ; r0 r1 r2 r3 r4 r5 r6 r7

%macro PLANE_DEINTERLEAVE_RGB 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
;                                        pixel *dstb, intptr_t i_dstb,
;                                        pixel *dstc, intptr_t i_dstc,
;                                        pixel *src,  intptr_t i_src, int pw, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave_rgb, 8,12
%define %%args r1, r3, r5, r7, r8, r9, r10, r11
cglobal plane_copy_deinterleave_rgb, 1,7
%define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
    PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
    PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA

%if HIGH_BIT_DEPTH == 0
PLANE_DEINTERLEAVE_RGB
PLANE_DEINTERLEAVE_RGB
%endif ; !HIGH_BIT_DEPTH

%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
;                                         uint16_t *dstc, intptr_t i_dstc,
;                                         uint32_t *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
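; v210 packs 10-bit 4:2:2 as three components per 32-bit little-endian dword
; (bits 0-9, 10-19, 20-29), six pixels per 16-byte group (low bits first):
;   dword 0: Cb0 Y0 Cr0     dword 1: Y1 Cb1 Y2
;   dword 2: Cr1 Y3 Cb2     dword 3: Y4 Cr2 Y5
; v210_mask/v210_mult and the two shuffles below split each group into six
; packed luma words and six interleaved CbCr words.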
cglobal plane_copy_deinterleave_v210, 8,10,7
cglobal plane_copy_deinterleave_v210, 7,7,7
    FIX_STRIDES r1, r3, r6d
    mova       m2, [v210_mask]
    mova       m3, [v210_luma_shuf]
    mova       m4, [v210_chroma_shuf]
    mova       m5, [v210_mult] ; also functions as vpermd index for avx2
    pshufd     m6, m5, q1102
    pmulhrsw   m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
    pmulhrsw   m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%endmacro ; PLANE_DEINTERLEAVE_V210

PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210

; These functions are not general-use; not only do the SSE ones require aligned
; input, but they will also fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned, 3,3
    mova       m0, [r1+r2-1*mmsize]
    mova       m1, [r1+r2-2*mmsize]
    mova       [r0+r2-1*mmsize], m0
    mova       [r0+r2-2*mmsize], m1

    mova       m0, [r1+r2-1*mmsize]
    mova       m1, [r1+r2-2*mmsize]
    mova       m2, [r1+r2-3*mmsize]
    mova       m3, [r1+r2-4*mmsize]
    mova       [r0+r2-1*mmsize], m0
    mova       [r0+r2-2*mmsize], m1
    mova       [r0+r2-3*mmsize], m2
    mova       [r0+r2-4*mmsize], m3

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned, 2,2
    mova       [r0+r1+i], m0

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
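; Builds one row of the summed-area table: each output word is a 4-pixel
; horizontal sum plus the entry directly above it, roughly
;   sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum_above[x]
; (mpsadbw forms the 4-pixel sums; the paddw against the previous row
; accumulates vertically).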
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    mova       xm1, [r1+r2+16]
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
    paddw      m1, [r0+r2*2+mmsize]
    mova       [r3+r2*2+mmsize], m1

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    mova       xm1, [r1+r2+16]
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
    mpsadbw    m2, m0, m4, 100100b
    mpsadbw    m3, m1, m4, 100100b
    mpsadbw    m2, m0, m4, 100b
    mpsadbw    m3, m1, m4, 100b
    paddw      m1, [r0+r2*2+mmsize]
    mova       [r3+r2*2+mmsize], m1

%endif ; !HIGH_BIT_DEPTH

%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
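; Converts rows of 8-pixel horizontal sums into 8x8 area sums by subtracting
; vertically displaced rows, roughly sum8[x] = sum8[x+8*stride] - sum8[x]
; (r2 is assumed to point 8 rows below r0 after the elided setup).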
cglobal integral_init8v, 3,3
    mova       m1, [r2+r1+mmsize]
    psubw      m1, [r0+r1+mmsize]
    mova       [r0+r1+mmsize], m1

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    shufpd     m0, [r0+r2+16], 1
    shufpd     m1, [r4+r2+16], 1
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    paddw      m0, m2, [r0+r2+8]

    pavgb      %4, [r0+r5*2+%7]
    PALIGNR    %1, %3, 1, m6
    PALIGNR    %2, %4, 1, m6
    pavgb      m2, m3, [r0+1]
    pavgb      m3, [r0+r5*2+1]
    mova       m3, [r0+r5+mmsize]
    pavgb      m2, m3, [r0+mmsize]
    movu       m5, [r0+r5+1+mmsize]
    pavgb      m4, m5, [r0+1+mmsize]
    pavgb      m3, [r0+r5*2+mmsize]
    pavgb      m5, [r0+r5*2+1+mmsize]
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq     m0, m0, q3120
    vpermq     m1, m4, q3120
    vpermq     m2, m2, q3120
    vpermq     m3, m5, q3120
    mova       m3, [r0+%4+mmsize]
    pavgb      m3, [r0+%4+r5+mmsize]
    pavgb      m2, [r0+%4+r5]
    PALIGNR    %1, m3, 1, m6
    PALIGNR    m3, m2, 1, m6
    vpperm     m5, m3, %1, m7
    vpperm     m3, m3, %1, m6
    pavgb      m3, [r0+%3+r5+8]
    pavgb      m2, [r0+%3+r5]
    pavgb      m1, [r0+%3+r5+9]
    pavgb      m0, [r0+%3+r5+1]
    pavgw      m3, [r0+%3+r5+8]
    pavgw      m2, [r0+%3+r5]
    pavgw      m1, [r0+%3+r5+10]
    pavgw      m0, [r0+%3+r5+2]
    mova       m3, [r0+%4+mmsize]
    pavgw      m3, [r0+%4+r5+mmsize]
    pavgw      m2, [r0+%4+r5]
    PALIGNR    %1, m3, 2, m6
    PALIGNR    m3, m2, 2, m6
    vpperm     m5, m3, %1, m7
    vpperm     m3, m3, %1, m6

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
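; Generates the four half-resolution planes used by the lookahead, one per
; half-pel phase, via chained pavg of 2x2 neighbourhoods. Roughly, with
; source rows s0,s1,s2 and avg(a,b) = (a+b+1)>>1:
;   dst0[x] = avg( avg(s0[2x],  s1[2x]),   avg(s0[2x+1], s1[2x+1]) )
;   dsth[x] = avg( avg(s0[2x+1],s1[2x+1]), avg(s0[2x+2], s1[2x+2]) )
;   dstv[x] = avg( avg(s1[2x],  s2[2x]),   avg(s1[2x+1], s2[2x+1]) )
;   dstc[x] = avg( avg(s1[2x+1],s2[2x+1]), avg(s1[2x+2], s2[2x+2]) )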
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
    add        dword r7m, mmsize-1
    and        dword r7m, ~(mmsize-1)
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    mova       m6, [deinterleave_shuf32a]
    mova       m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
%ifidn cpuname, mmx2
    FILT8xA    m0, r1, r2, 0
    FILT8xA    m1, r3, r4, r5
%else ; !HIGH_BIT_DEPTH
    mova       m7, [deinterleave_shuf]
    mova       m6, [deinterleave_shuf32a]
    mova       m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
    FILT32x4U  r1, r2, r3, r4
    FILT8x4    m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4    m2, m3, m0, m1, m4, m5, 0
    vpperm     m4, m2, m8, m7
    vpperm     m2, m2, m8, m6
    vpperm     m5, m3, m9, m7
    vpperm     m3, m3, m9, m6
%elifidn cpuname, mmx2
    FILT16x2   m0, r1, r2, 0
    FILT16x2   m1, r3, r4, r5
%endif ; HIGH_BIT_DEPTH
%endmacro ; FRAME_INIT_LOWRES

%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
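; Per macroblock, roughly:
;   dst[i] = (prop[i] + intra[i]*invq[i]*fps_factor/256)
;          * (intra[i] - inter[i]) / intra[i]
; i.e. the share of the accumulated cost that this block passes on to its
; references; the division is done with rcpps refined by one Newton-Raphson
; step (see below).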
cglobal mbtree_propagate_cost, 6,6,7
    movq       m2, [r2+r5] ; intra
    movq       m0, [r4+r5] ; invq
    movq       m3, [r3+r5] ; inter
    movq       m1, [r1+r5] ; prop
    fmaddps    m0, m0, m6, m1
    fnmaddps   m3, m1, m3, m2
    mulps      m0, m6 ; intra*invq*fps_factor>>8
    cvtdq2ps   m1, m1 ; prop
    addps      m0, m1 ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps   m1, m2 ; intra
    psubd      m2, m3 ; intra - inter
    cvtdq2ps   m2, m2 ; intra - inter
    rcpps      m3, m1 ; 1 / intra 1st approximation
    mulps      m1, m3 ; intra * (1/intra 1st approx)
    mulps      m1, m3 ; intra * (1/intra 1st approx)^2
    mulps      m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps      m3, m3 ; 2 * (1/intra 1st approx)
    subps      m3, m1 ; 2nd approximation for 1/intra
    mulps      m0, m3 ; / intra
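    ; rcpps is only accurate to ~12 bits, so the sequence above refines it
    ; with one Newton-Raphson iteration: x1 = x0*(2 - d*x0), computed here as
    ; 2*x0 - d*x0^2, roughly doubling the precision of the reciprocal.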

; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
%macro INT16_UNPACK 1
    punpckhwd  xm4, xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1

; FIXME: align loads to 16 bytes
cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
    vbroadcastss m6, [r5]
%if notcpuflag(avx2)
    pmovzxwd   m0, [r2+r5] ; intra
    pmovzxwd   m1, [r4+r5] ; invq
    pmovzxwd   m2, [r1+r5] ; prop
    pand       xm3, xm5, [r3+r5] ; inter
    fmaddps    m1, m1, m6, m2
    fnmaddps   m4, m2, m3, m4
    pand       xm3, xm5, [r3+r5]
    mulps      m1, m6     ; intra*invq*fps_factor>>8
    addps      m1, m2     ; prop + (intra*invq*fps_factor>>8)
    rcpps      m3, m0     ; 1 / intra 1st approximation
    mulps      m2, m0, m3 ; intra * (1/intra 1st approx)
    mulps      m2, m3     ; intra * (1/intra 1st approx)^2
    mulps      m1, m4     ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps      m3, m3     ; 2 * (1/intra 1st approx)
    subps      m3, m2     ; 2nd approximation for 1/intra
    mulps      m1, m3     ; / intra
    vextractf128 xm2, m1, 1

%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
;                                      int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
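; An MV generally straddles macroblock boundaries, so each block's
; propagate_amount is split among the four lowres MBs under the MV's
; footprint with bilinear weights, where (x,y) is the MV's offset within
; an 8x8 lowres MB in 1/32nd units (hence the 0..31 range):
;   idx0: (32-x)*(32-y)   idx1: x*(32-y)   idx2: (32-x)*y   idx3: x*y
; The outputs are the {mbx,mby} pairs and the four weighted amounts.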
cglobal mbtree_propagate_list_internal, 4,6,8
    movh       m6, [pw_0to15] ; mb_x
    punpcklwd  m6, m7     ; 0 y 1 y 2 y 3 y
    SPLATW     m7, m7     ; bipred_weight
    psllw      m7, 9      ; bipred_weight << 9
    mova       m5, [pw_0xc000]
    pmulhrsw   m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    pblendvb   m5, m3, m5, m4
    por        m5, m4     ; if( lists_used == 3 )
                          ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    movu       m0, [r0+r4*4] ; x,y
    movu       m1, [r0+r4*4+mmsize]
    paddw      m2, m6     ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw      m6, m4     ; {mbx, mby} += {4, 0}
    paddw      m3, m6     ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw      m6, m4     ; {mbx, mby} += {4, 0}
    mova       [r3+mmsize*0], m2
    mova       [r3+mmsize*1], m3
    pand       m0, m3     ; x &= 31
    pand       m1, m3     ; y &= 31
    pandn      m1, m3     ; y premultiplied by (1<<5) for later use of pmulhrsw
    psubw      m3, m0     ; 32 - x
    psubw      m4, m1     ; (32 - y) << 5
    pmullw     m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
    pmullw     m4, m0     ; idx1weight = (32-y)*x << 5
    pmullw     m0, m1     ; idx3weight = y*x << 5
    pmullw     m1, m3     ; idx2weight = y*(32-x) << 5
    ; avoid overflow in the input to pmulhrsw
    psubw      m2, m3     ; idx0weight -= (idx0weight == 32768)
    pmulhrsw   m2, m5     ; idx0weight * propagate_amount + 512 >> 10
    pmulhrsw   m4, m5     ; idx1weight * propagate_amount + 512 >> 10
    pmulhrsw   m1, m5     ; idx2weight * propagate_amount + 512 >> 10
    pmulhrsw   m0, m5     ; idx3weight * propagate_amount + 512 >> 10
    SBUTTERFLY wd, 2, 4, 3
    SBUTTERFLY wd, 1, 0, 3
    mova       [r3+mmsize*2], m2
    mova       [r3+mmsize*3], m4
    mova       [r3+mmsize*4], m1
    mova       [r3+mmsize*5], m0

MBTREE_PROPAGATE_LIST
MBTREE_PROPAGATE_LIST