1 ;*****************************************************************************
2 ;* mc-a2.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2015 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
9 ;* Mathieu Monnier <manao@melix.net>
10 ;* Oskar Arvidsson <oskar@irock.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; SIMD data constants for the half-pel filter and plane copy/deinterleave
; routines below.
; NOTE(review): this chunk is a sparse listing -- the %if HIGH_BIT_DEPTH /
; %else guards that separate the duplicate labels (copy_swap_shuf,
; deinterleave_shuf32a/b) are not visible here; confirm against upstream.
; rounding constant (1<<10)
35 pw_1024: times 16 dw 1024
; per-byte pmaddubsw multipliers for the 6-tap [1,-5,20,20,-5,1] hpel
; filter: center taps (20,20), leading pair (1,-5), trailing pair (-5,1)
36 filt_mul20: times 32 db 20
37 filt_mul15: times 16 db 1, -5
38 filt_mul51: times 16 db -5, 1
; pshufb mask interleaving bytes 0-7 with bytes 8-15 of each 16-byte lane
39 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
; pshufb mask gathering even bytes into the low half, odd bytes into the high half
40 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
; swaps the two 16-bit words of each dword (16-bit-depth U/V swap -- TODO confirm)
43 copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
; constants for unpacking the v210 packed 10-bit 4:2:2 format
44 v210_mask: times 4 dq 0xc00ffc003ff003ff
45 v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
46 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
47 ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
48 v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
49 dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
; word-granularity even/odd deinterleave masks (SHUFFLE_MASK_W comes from x86util.asm)
51 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
52 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
; swaps adjacent bytes (8-bit-depth variant of copy_swap_shuf)
54 copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
; pshufb masks gathering B/G/R planes from packed 3-byte (BGR) and 4-byte
; (BGRA) pixels; -1 lanes are zeroed by pshufb
55 deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
56 db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1
; full-width (32-byte, AVX2) even/odd byte deinterleave masks
58 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
59 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
60 %endif ; !HIGH_BIT_DEPTH
; constants for the high-bit-depth half-pel filter (PIXEL_MAX implies a
; HIGH_BIT_DEPTH build; the surrounding %if guard is not visible in this chunk)
; mask selecting the low 16 bits of each dword
; NOTE(review): label says "0f" but the value is 0xffff -- kept as upstream
63 pd_0f: times 4 dd 0xffff
; biases of 10/20/30 * PIXEL_MAX -- presumably added so the signed-word
; intermediates of the [1,-5,20,20,-5,1] filter stay non-negative; TODO confirm
65 pad10: times 8 dw 10*PIXEL_MAX
66 pad20: times 8 dw 20*PIXEL_MAX
67 pad30: times 8 dw 30*PIXEL_MAX
; undoes the accumulated pad; the +512 is presumably a rounding term (1<<9)
68 depad: times 4 dd 32*20*PIXEL_MAX + 512
; word-pair coefficients of the 6-tap filter, one pair per tap position
70 tap1: times 4 dw 1, -5
71 tap2: times 4 dw 20, 20
72 tap3: times 4 dw -5, 1
; mask of the top two bits of each word (loaded by mbtree_propagate_list below)
74 pw_0xc000: times 8 dw 0xc000
118 psubw %1, %2 ; a-5*b+4*c
122 paddw %1, %3 ; a-5*b+20*c
128 psraw %1, 2 ; (a-b)/4
129 psubw %1, %2 ; (a-b)/4-b
130 paddw %1, %3 ; (a-b)/4-b+c
131 psraw %1, 2 ; ((a-b)/4-b+c)/4
132 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
167 %if HIGH_BIT_DEPTH == 0
172 ;The hpel_filter routines use non-temporal writes for output.
173 ;The following defines may be uncommented for testing.
174 ;Doing the hpel_filter temporal may be a win if the last level cache
175 ;is big enough (preliminary benching suggests on the order of 4* framesize).
178 ;%define movntps movaps
182 ;-----------------------------------------------------------------------------
183 ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
184 ;-----------------------------------------------------------------------------
186 cglobal hpel_filter_v, 5,6,11
206 mova m7, [pw_pixel_max]
213 mova m5, [r1+r3+mmsize]
214 mova m6, [r1+r3*2+mmsize]
218 paddw m4, [r5+r3*2+mmsize]
219 paddw m5, [r5+r3+mmsize]
220 paddw m6, [r5+mmsize]
223 FILT_V2 m1, m2, m3, m4, m5, m6
228 mova [r2+r4+mmsize], m4
231 FILT_PACK m1, m4, m6, 5, s10
235 mova [r0+r4+mmsize], m4
240 ;-----------------------------------------------------------------------------
241 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
242 ;-----------------------------------------------------------------------------
243 cglobal hpel_filter_c, 3,3,10
283 CLIPW m1, [pb_0], [pw_pixel_max]
289 ;-----------------------------------------------------------------------------
290 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
291 ;-----------------------------------------------------------------------------
292 cglobal hpel_filter_h, 3,4,8
298 mova m0, [pw_pixel_max]
310 movu m4, [src-4+mmsize]
311 movu m5, [src-2+mmsize]
313 movu m7, [src+4+mmsize]
314 movu m6, [src+6+mmsize]
317 movu m7, [src+2+mmsize]
318 mova m6, [src+0+mmsize]
320 FILT_H2 m1, m2, m3, m4, m5, m6
323 FILT_PACK m1, m4, m7, 1
327 mova [r0+r2+mmsize], m4
331 %endmacro ; HPEL_FILTER
337 %endif ; HIGH_BIT_DEPTH
339 %if HIGH_BIT_DEPTH == 0
341 ;-----------------------------------------------------------------------------
342 ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
343 ;-----------------------------------------------------------------------------
344 cglobal hpel_filter_v, 5,6,%1
352 mova m0, [filt_mul15]
364 SBUTTERFLY bw, 1, 4, 7
365 SBUTTERFLY bw, 2, 5, 7
366 SBUTTERFLY bw, 3, 6, 7
371 pmaddubsw m3, [filt_mul20]
372 pmaddubsw m6, [filt_mul20]
379 LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
380 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
381 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
382 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
383 FILT_V2 m1, m2, m3, m4, m5, m6
388 mova [r2+r4*2+mmsize/2], xm4
389 vextracti128 [r2+r4*2+mmsize], m1, 1
390 vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
393 mova [r2+r4*2+mmsize], m4
395 FILT_PACK m1, m4, m7, 5
404 ;-----------------------------------------------------------------------------
405 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
406 ;-----------------------------------------------------------------------------
408 cglobal hpel_filter_c, 3,3
420 paddw m3, [src+2] ; c0
424 paddw m4, [src+14] ; a1
425 paddw m5, [src+12] ; b1
426 paddw m6, [src+10] ; c1
427 FILT_H2 m1, m2, m3, m4, m5, m6
428 FILT_PACK m1, m4, m7, 6
434 ;-----------------------------------------------------------------------------
435 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
436 ;-----------------------------------------------------------------------------
438 cglobal hpel_filter_h, 3,3
472 FILT_H2 m1, m2, m3, m4, m5, m6
473 FILT_PACK m1, m4, m7, 1
480 ;-----------------------------------------------------------------------------
481 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
482 ;-----------------------------------------------------------------------------
483 cglobal hpel_filter_c, 3,3,9
488 %ifnidn cpuname, sse2
499 %define pw_rnd [pw_32]
501 ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
507 movu m3, [src-4+mmsize]
508 movu m2, [src-2+mmsize]
509 mova m1, [src+0+mmsize]
513 paddw m3, [src+6+mmsize]
514 paddw m2, [src+4+mmsize]
515 paddw m1, [src+2+mmsize]
516 FILT_H2 m4, m5, m6, m3, m2, m1
522 PALIGNR m4, m1, m0, 12, m7
523 PALIGNR m5, m1, m0, 14, m0
524 PALIGNR m0, m2, m1, 6, m7
526 PALIGNR m0, m2, m1, 4, m7
528 PALIGNR m6, m2, m1, 2, m7
534 PALIGNR m2, m1, 12, m7
535 PALIGNR m5, m1, 14, m1
537 PALIGNR m3, m1, m0, 6, m7
539 PALIGNR m6, m1, m0, 4, m7
541 PALIGNR m6, m1, m0, 2, m7
545 FILT_PACK m4, m3, pw_rnd, 6
555 ;-----------------------------------------------------------------------------
556 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
557 ;-----------------------------------------------------------------------------
559 cglobal hpel_filter_h, 3,3,8
596 mova m7, [pw_1] ; FIXME xmm8
597 FILT_H2 m1, m2, m3, m4, m5, m6
598 FILT_PACK m1, m4, m7, 1
604 ;-----------------------------------------------------------------------------
605 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
606 ;-----------------------------------------------------------------------------
608 cglobal hpel_filter_h, 3,3
618 ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
619 ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
620 ; the repeated loads of constants for pmaddubsw.
621 palignr m3, m1, m0, 14
622 palignr m4, m1, m0, 15
623 palignr m0, m2, m1, 2
624 pmaddubsw m3, [filt_mul15]
625 pmaddubsw m4, [filt_mul15]
626 pmaddubsw m0, [filt_mul51]
627 palignr m5, m2, m1, 1
628 palignr m6, m2, m1, 3
631 pmaddubsw m1, [filt_mul20]
632 pmaddubsw m5, [filt_mul20]
633 pmaddubsw m6, [filt_mul51]
637 FILT_PACK m3, m4, m7, 5
638 pshufb m3, [hpel_shuf]
666 cglobal hpel_filter_h, 3,3,8
671 mova m5, [filt_mul15]
672 mova m6, [filt_mul20]
673 mova m7, [filt_mul51]
694 FILT_PACK m0, m1, m2, 5
695 pshufb m0, [hpel_shuf]
704 ;The optimum prefetch distance is difficult to determine in checkasm:
705 ;any prefetch seems slower than not prefetching.
706 ;In real use, the prefetch seems to be a slight win.
707 ;+mmsize is picked somewhat arbitrarily here based on the fact that even one
708 ;loop iteration is going to take longer than the prefetch.
709 prefetcht0 [r1+r2*2+mmsize]
736 LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
737 LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
738 LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
740 FILT_V2 m1, m2, m3, m4, m5, m6
745 vinserti128 %1, m1, xm4, 1
746 vperm2i128 %2, m1, m4, q0301
751 FILT_PACK m1, m4, m15, 5
752 movntps [r8+r4+%5], m1
757 vperm2i128 m3, %2, %1, q0003
759 PALIGNR m1, %2, %1, (mmsize-4), m3
760 PALIGNR m2, %2, %1, (mmsize-2), m3
762 vperm2i128 %1, %3, %2, q0003
764 PALIGNR m3, %3, %2, 4, %1
765 PALIGNR m4, %3, %2, 2, %1
771 PALIGNR %3, %3, %2, 6, m2
780 FILT_PACK %3, %4, m15, 6
798 vperm2i128 m3, %2, %1, q0003
800 PALIGNR m1, %2, %1, (mmsize-2), m3
801 PALIGNR m2, %2, %1, (mmsize-1), m3
803 vperm2i128 m3, %3, %2, q0003
805 PALIGNR m4, %3, %2, 1 , m3
806 PALIGNR m5, %3, %2, 2 , m3
807 PALIGNR m6, %3, %2, 3 , m3
820 FILT_PACK m1, m2, m15, 5
821 pshufb m1, [hpel_shuf]
823 ADD8TO16 m1, m6, m12, m3, m0 ; a
824 ADD8TO16 m2, m5, m12, m3, m0 ; b
825 ADD8TO16 %2, m4, m12, m3, m0 ; c
826 FILT_V2 m1, m2, %2, m6, m5, m4
827 FILT_PACK m1, m6, m15, 5
834 ;-----------------------------------------------------------------------------
835 ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
836 ; uint8_t *src, intptr_t stride, int width, int height )
837 ;-----------------------------------------------------------------------------
838 cglobal hpel_filter, 7,9,16
855 mova m0, [filt_mul51]
856 mova m12, [filt_mul15]
857 mova m14, [filt_mul20]
866 DO_FILT_V m8, m7, m13, m12, 0
869 DO_FILT_V m6, m5, m11, m12, mmsize
872 psrlw m15, 1 ; pw_512
874 paddw m15, m15 ; pw_32
876 DO_FILT_C m9, m8, m7, m6
878 paddw m15, m15 ; pw_1024
883 DO_FILT_H m10, m13, m11
888 ; setup regs for next y
916 %endif ; !HIGH_BIT_DEPTH
918 %macro PREFETCHNT_ITER 2 ; src, bytes/iteration
919 %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
920 %rep (%2+63) / 64 ; assume 64 byte cache lines
926 ;-----------------------------------------------------------------------------
927 ; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst,
928 ; pixel *src, intptr_t i_src, int w, int h )
929 ;-----------------------------------------------------------------------------
930 ; assumes i_dst and w are multiples of mmsize, and i_dst>w
931 %macro PLANE_COPY_CORE 1 ; swap
933 cglobal plane_copy_swap_core, 6,7
934 mova m4, [copy_swap_shuf]
936 cglobal plane_copy_core, 6,7
939 %if %1 && HIGH_BIT_DEPTH
941 %elif %1 || HIGH_BIT_DEPTH
950 lea r6, [r4+4*mmsize]
956 PREFETCHNT_ITER r2+r6, 4*mmsize
957 movu m0, [r2+r6-4*mmsize]
958 movu m1, [r2+r6-3*mmsize]
959 movu m2, [r2+r6-2*mmsize]
960 movu m3, [r2+r6-1*mmsize]
967 movnta [r0+r6-4*mmsize], m0
968 movnta [r0+r6-3*mmsize], m1
969 movnta [r0+r6-2*mmsize], m2
970 movnta [r0+r6-1*mmsize], m3
974 PREFETCHNT_ITER r2+r6, 4*mmsize
1003 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
1007 mov%4 m0, [%2+(x/2)*mmsize]
1008 mov%4 m1, [%3+(x/2)*mmsize]
1009 punpckhwd m2, m0, m1
1011 mov%5a [%1+(x+0)*mmsize], m0
1012 mov%5a [%1+(x+1)*mmsize], m2
1027 punpckhbw m2, m0, m1
1032 %endif ; HIGH_BIT_DEPTH
1035 %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
1039 mova m0, [%3+(n+0)*mmsize]
1040 mova m1, [%3+(n+1)*mmsize]
1047 mov%6 [%1+(n/2)*mmsize], m0
1048 mov%6 [%2+(n/2)*mmsize], m2
1051 %else ; !HIGH_BIT_DEPTH
1081 %endif ; mmsize == 16
1082 %endif ; HIGH_BIT_DEPTH
1085 %macro PLANE_INTERLEAVE 0
1086 ;-----------------------------------------------------------------------------
1087 ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
1088 ; uint8_t *srcu, intptr_t i_srcu,
1089 ; uint8_t *srcv, intptr_t i_srcv, int w, int h )
1090 ;-----------------------------------------------------------------------------
1091 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
1092 cglobal plane_copy_interleave_core, 6,9
1095 FIX_STRIDES r1, r3, r5, r6d
1109 shr t1, SIZEOF_PIXEL
1123 INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
1124 INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
1125 add r6, 16*SIZEOF_PIXEL
1131 movntq [r0+r6*2+(n+ 0)], m0
1132 movntq [r0+r6*2+(n+ 8)], m0
1133 movntq [r0+r6*2+(n+16)], m0
1134 movntq [r0+r6*2+(n+24)], m0
1136 movntdq [r0+r6*2+(n+ 0)], m0
1137 movntdq [r0+r6*2+(n+16)], m0
1141 add r6, 16*SIZEOF_PIXEL
1153 ;-----------------------------------------------------------------------------
1154 ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
1155 ;-----------------------------------------------------------------------------
1156 cglobal store_interleave_chroma, 5,5
1159 INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
1160 INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
1161 add r2, FDEC_STRIDEB*2
1162 add r3, FDEC_STRIDEB*2
1167 %endmacro ; PLANE_INTERLEAVE
1169 %macro DEINTERLEAVE_START 0
1172 %elif cpuflag(ssse3)
1173 mova m4, [deinterleave_shuf]
1176 %endif ; HIGH_BIT_DEPTH
1179 %macro PLANE_DEINTERLEAVE 0
1180 ;-----------------------------------------------------------------------------
1181 ; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
1182 ; pixel *dstv, intptr_t i_dstv,
1183 ; pixel *src, intptr_t i_src, int w, int h )
1184 ;-----------------------------------------------------------------------------
1185 cglobal plane_copy_deinterleave, 6,7
1188 FIX_STRIDES r1, r3, r5, r6d
1199 DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
1200 DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
1201 add r6, 16*SIZEOF_PIXEL
1210 ;-----------------------------------------------------------------------------
1211 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
1212 ;-----------------------------------------------------------------------------
1213 cglobal load_deinterleave_chroma_fenc, 4,4
1217 DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
1218 DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
1219 add r0, FENC_STRIDEB*2
1225 ;-----------------------------------------------------------------------------
1226 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
1227 ;-----------------------------------------------------------------------------
1228 cglobal load_deinterleave_chroma_fdec, 4,4
1232 DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
1233 DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
1234 add r0, FDEC_STRIDEB*2
1239 %endmacro ; PLANE_DEINTERLEAVE
1241 %macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
1243 mova m3, [deinterleave_rgb_shuf+(%1-3)*16]
1250 movu m1, [%8+%1*mmsize/4]
1252 pshufb m0, m3 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
1253 pshufb m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
1256 punpcklqdq m0, m1 ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
1258 punpcklqdq m2, m1 ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
1261 punpckhbw m1, m0, m3 ; b4 b5 g4 g5 r4 r5
1262 punpcklbw m0, m3 ; b0 b1 g0 g1 r0 r1
1263 punpckhbw m3, m2, m4 ; b6 b7 g6 g7 r6 r7
1264 punpcklbw m2, m4 ; b2 b3 g2 g3 r2 r3
1265 punpcklwd m0, m2 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
1266 punpcklwd m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
1268 pshufd m3, m0, q2301
1269 pshufd m4, m1, q2301
1270 punpckhbw m2, m0, m3 ; b2 b3 g2 g3 r2 r3
1271 punpcklbw m0, m3 ; b0 b1 g0 g1 r0 r1
1272 punpckhbw m3, m1, m4 ; b6 b7 g6 g7 r6 r7
1273 punpcklbw m1, m4 ; b4 b5 g4 g5 r4 r5
1274 punpcklwd m0, m2 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
1275 punpcklwd m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
1277 punpckldq m2, m0, m1 ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
1278 punpckhdq m0, m1 ; r0 r1 r2 r3 r4 r5 r6 r7
1293 %macro PLANE_DEINTERLEAVE_RGB 0
1294 ;-----------------------------------------------------------------------------
1295 ; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
1296 ; pixel *dstb, intptr_t i_dstb,
1297 ; pixel *dstc, intptr_t i_dstc,
1298 ; pixel *src, intptr_t i_src, int pw, int w, int h )
1299 ;-----------------------------------------------------------------------------
1301 cglobal plane_copy_deinterleave_rgb, 8,12
1302 %define %%args r1, r3, r5, r7, r8, r9, r10, r11
1310 cglobal plane_copy_deinterleave_rgb, 1,7
1311 %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
1325 PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
1328 PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA
1333 %if HIGH_BIT_DEPTH == 0
1335 PLANE_DEINTERLEAVE_RGB
1337 PLANE_DEINTERLEAVE_RGB
1338 %endif ; !HIGH_BIT_DEPTH
1340 %macro PLANE_DEINTERLEAVE_V210 0
1341 ;-----------------------------------------------------------------------------
1342 ; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
1343 ; uint16_t *dstc, intptr_t i_dstc,
1344 ; uint32_t *src, intptr_t i_src, int w, int h )
1345 ;-----------------------------------------------------------------------------
1347 cglobal plane_copy_deinterleave_v210, 8,10,7
1352 cglobal plane_copy_deinterleave_v210, 7,7,7
1357 FIX_STRIDES r1, r3, r6d
1364 mova m2, [v210_mask]
1365 mova m3, [v210_luma_shuf]
1366 mova m4, [v210_chroma_shuf]
1367 mova m5, [v210_mult] ; also functions as vpermd index for avx2
1368 pshufd m6, m5, q1102
1377 pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
1378 pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
1396 %endmacro ; PLANE_DEINTERLEAVE_V210
1407 PLANE_DEINTERLEAVE_V210
1411 PLANE_DEINTERLEAVE_V210
1413 PLANE_DEINTERLEAVE_V210
1426 ; These functions are not general-use; not only do the SSE ones require aligned input,
1427 ; but they also will fail if given a non-mod16 size.
1428 ; memzero SSE will fail for non-mod128.
1430 ;-----------------------------------------------------------------------------
1431 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1432 ;-----------------------------------------------------------------------------
1434 cglobal memcpy_aligned, 3,3
1445 mova m0, [r1+r2-1*mmsize]
1446 mova m1, [r1+r2-2*mmsize]
1447 mova [r0+r2-1*mmsize], m0
1448 mova [r0+r2-2*mmsize], m1
1454 mova m0, [r1+r2-1*mmsize]
1455 mova m1, [r1+r2-2*mmsize]
1456 mova m2, [r1+r2-3*mmsize]
1457 mova m3, [r1+r2-4*mmsize]
1458 mova [r0+r2-1*mmsize], m0
1459 mova [r0+r2-2*mmsize], m1
1460 mova [r0+r2-3*mmsize], m2
1461 mova [r0+r2-4*mmsize], m3
1473 ;-----------------------------------------------------------------------------
1474 ; void *memzero_aligned( void *dst, size_t n );
1475 ;-----------------------------------------------------------------------------
1477 cglobal memzero_aligned, 2,2
1488 mova [r0 + r1 + i], m0
1503 %if HIGH_BIT_DEPTH == 0
1504 ;-----------------------------------------------------------------------------
1505 ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
1506 ;-----------------------------------------------------------------------------
1507 %macro INTEGRAL_INIT4H 0
1508 cglobal integral_init4h, 3,4
1524 paddw m1, [r0+r2*2+mmsize]
1526 mova [r3+r2*2+mmsize], m1
1537 %macro INTEGRAL_INIT8H 0
1538 cglobal integral_init8h, 3,4
1547 mpsadbw m2, m0, m4, 100100b
1548 mpsadbw m3, m1, m4, 100100b
1552 mpsadbw m2, m0, m4, 100b
1553 mpsadbw m3, m1, m4, 100b
1558 paddw m1, [r0+r2*2+mmsize]
1562 mova [r3+r2*2+mmsize], m1
1574 %endif ; !HIGH_BIT_DEPTH
1576 %macro INTEGRAL_INIT_8V 0
1577 ;-----------------------------------------------------------------------------
1578 ; void integral_init8v( uint16_t *sum8, intptr_t stride )
1579 ;-----------------------------------------------------------------------------
1580 cglobal integral_init8v, 3,3
1587 mova m1, [r2+r1+mmsize]
1589 psubw m1, [r0+r1+mmsize]
1591 mova [r0+r1+mmsize], m1
1604 ;-----------------------------------------------------------------------------
1605 ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
1606 ;-----------------------------------------------------------------------------
1608 cglobal integral_init4v, 3,5
1630 cglobal integral_init4v, 3,5
1642 shufpd m0, [r0+r2+16], 1
1643 shufpd m1, [r4+r2+16], 1
1656 cglobal integral_init4v, 3,5
1682 cglobal integral_init4v, 3,5
1692 paddw m0, m2, [r0+r2+8]
1707 pavgb %4, [r0+r5*2+%7]
1708 PALIGNR %1, %3, 1, m6
1709 PALIGNR %2, %4, 1, m6
1727 pavgb m2, m3, [r0+1]
1729 pavgb m3, [r0+r5*2+1]
1733 mova m3, [r0+r5+mmsize]
1734 pavgb m2, m3, [r0+mmsize]
1735 movu m5, [r0+r5+1+mmsize]
1736 pavgb m4, m5, [r0+1+mmsize]
1737 pavgb m3, [r0+r5*2+mmsize]
1738 pavgb m5, [r0+r5*2+1+mmsize]
1746 punpckhqdq m4, m0, m2
1747 punpcklqdq m0, m0, m2
1748 punpckhqdq m5, m1, m3
1749 punpcklqdq m2, m1, m3
1750 vpermq m0, m0, q3120
1751 vpermq m1, m4, q3120
1752 vpermq m2, m2, q3120
1753 vpermq m3, m5, q3120
1761 mova m3, [r0+%4+mmsize]
1763 pavgb m3, [r0+%4+r5+mmsize]
1764 pavgb m2, [r0+%4+r5]
1765 PALIGNR %1, m3, 1, m6
1767 PALIGNR m3, m2, 1, m6
1770 vpperm m5, m3, %1, m7
1771 vpperm m3, m3, %1, m6
1788 pavgb m3, [r0+%3+r5+8]
1789 pavgb m2, [r0+%3+r5]
1792 pavgb m1, [r0+%3+r5+9]
1793 pavgb m0, [r0+%3+r5+1]
1809 pavgw m3, [r0+%3+r5+8]
1810 pavgw m2, [r0+%3+r5]
1813 pavgw m1, [r0+%3+r5+10]
1814 pavgw m0, [r0+%3+r5+2]
1828 mova m3, [r0+%4+mmsize]
1830 pavgw m3, [r0+%4+r5+mmsize]
1831 pavgw m2, [r0+%4+r5]
1832 PALIGNR %1, m3, 2, m6
1834 PALIGNR m3, m2, 2, m6
1837 vpperm m5, m3, %1, m7
1838 vpperm m3, m3, %1, m6
1852 ;-----------------------------------------------------------------------------
1853 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
1854 ; intptr_t src_stride, intptr_t dst_stride, int width, int height )
1855 ;-----------------------------------------------------------------------------
1856 %macro FRAME_INIT_LOWRES 0
1857 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
1864 add dword r7m, mmsize-1
1865 and dword r7m, ~(mmsize-1)
1867 ; src += 2*(height-1)*stride + 2*width
1873 ; dst += (height-1)*stride + width
1882 ; gap = stride - width
1886 %define dst_gap [rsp+gprsize]
1891 %define src_gap [rsp]
1894 mova m6, [deinterleave_shuf32a]
1895 mova m7, [deinterleave_shuf32b]
1902 %ifnidn cpuname, mmx2
1914 %ifidn cpuname, mmx2
1918 FILT8xA m0, r1, r2, 0
1919 FILT8xA m1, r3, r4, r5
1923 %else ; !HIGH_BIT_DEPTH
1925 mova m7, [deinterleave_shuf]
1927 mova m6, [deinterleave_shuf32a]
1928 mova m7, [deinterleave_shuf32b]
1935 %ifnidn cpuname, mmx2
1950 FILT32x4U r1, r2, r3, r4
1952 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
1955 FILT8x4 m2, m3, m0, m1, m4, m5, 0
1957 vpperm m4, m2, m8, m7
1958 vpperm m2, m2, m8, m6
1959 vpperm m5, m3, m9, m7
1960 vpperm m3, m3, m9, m6
1971 %elifidn cpuname, mmx2
1975 FILT16x2 m0, r1, r2, 0
1976 FILT16x2 m1, r3, r4, r5
1980 %endif ; HIGH_BIT_DEPTH
1993 %endmacro ; FRAME_INIT_LOWRES
1997 %if ARCH_X86_64 == 0
1998 INIT_MMX cache32, mmx2
2009 %if HIGH_BIT_DEPTH==0
2014 ;-----------------------------------------------------------------------------
2015 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
2016 ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
2017 ;-----------------------------------------------------------------------------
2019 cglobal mbtree_propagate_cost, 6,6,7
2033 movq m2, [r2+r5] ; intra
2034 movq m0, [r4+r5] ; invq
2035 movq m3, [r3+r5] ; inter
2036 movq m1, [r1+r5] ; prop
2047 fmaddps m0, m0, m6, m1
2055 fnmaddps m3, m1, m3, m2
2059 mulps m0, m6 ; intra*invq*fps_factor>>8
2060 cvtdq2ps m1, m1 ; prop
2061 addps m0, m1 ; prop + (intra*invq*fps_factor>>8)
2062 cvtdq2ps m1, m2 ; intra
2063 psubd m2, m3 ; intra - inter
2064 cvtdq2ps m2, m2 ; intra - inter
2065 rcpps m3, m1 ; 1 / intra 1st approximation
2066 mulps m1, m3 ; intra * (1/intra 1st approx)
2067 mulps m1, m3 ; intra * (1/intra 1st approx)^2
2068 mulps m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
2069 addps m3, m3 ; 2 * (1/intra 1st approx)
2070 subps m3, m1 ; 2nd approximation for 1/intra
2071 mulps m0, m3 ; / intra
2083 ; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
2087 %macro INT16_UNPACK 1
2088 punpckhwd xm4, xm%1, xm7
2090 vinsertf128 m%1, m%1, xm4, 1
2093 ; FIXME: align loads to 16 bytes
2095 cglobal mbtree_propagate_cost, 6,6,%1
2096 vbroadcastss m6, [r5]
2106 %if notcpuflag(avx2)
2111 pmovzxwd m0, [r2+r5] ; intra
2112 pmovzxwd m1, [r4+r5] ; invq
2113 pmovzxwd m2, [r1+r5] ; prop
2114 pand xm3, xm5, [r3+r5] ; inter
2123 fmaddps m1, m1, m6, m2
2128 fnmaddps m4, m2, m3, m4
2134 pand xm3, xm5, [r3+r5]
2146 mulps m1, m6 ; intra*invq*fps_factor>>8
2147 addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
2148 rcpps m3, m0 ; 1 / intra 1st approximation
2149 mulps m2, m0, m3 ; intra * (1/intra 1st approx)
2150 mulps m2, m3 ; intra * (1/intra 1st approx)^2
2151 mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
2152 addps m3, m3 ; 2 * (1/intra 1st approx)
2153 subps m3, m2 ; 2nd approximation for 1/intra
2154 mulps m1, m3 ; / intra
2157 vextractf128 xm2, m1, 1
2170 %macro MBTREE_PROPAGATE_LIST 0
2171 ;-----------------------------------------------------------------------------
2172 ; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
2173 ; int16_t *output, int bipred_weight, int mb_y, int len )
2174 ;-----------------------------------------------------------------------------
2175 cglobal mbtree_propagate_list_internal, 4,6,8
2176 movh m6, [pw_0to15] ; mb_x
2179 punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y
2181 SPLATW m7, m7 ; bipred_weight
2182 psllw m7, 9 ; bipred_weight << 9
2189 mova m5, [pw_0xc000]
2192 pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
2194 pblendvb m5, m3, m5, m4
2198 por m5, m4 ; if( lists_used == 3 )
2199 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
2202 movu m0, [r0+r4*4] ; x,y
2203 movu m1, [r0+r4*4+mmsize]
2208 paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
2209 paddw m6, m4 ; {mbx, mby} += {4, 0}
2210 paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
2211 paddw m6, m4 ; {mbx, mby} += {4, 0}
2213 mova [r3+mmsize*0], m2
2214 mova [r3+mmsize*1], m3
2217 pand m0, m3 ; x &= 31
2218 pand m1, m3 ; y &= 31
2223 pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw
2226 psubw m3, m0 ; 32 - x
2228 psubw m4, m1 ; (32 - y) << 5
2230 pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
2231 pmullw m4, m0 ; idx1weight = (32-y)*x << 5
2232 pmullw m0, m1 ; idx3weight = y*x << 5
2233 pmullw m1, m3 ; idx2weight = y*(32-x) << 5
2235 ; avoid overflow in the input to pmulhrsw
2237 psubw m2, m3 ; idx0weight -= (idx0weight == 32768)
2239 pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10
2240 pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10
2241 pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10
2242 pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10
2244 SBUTTERFLY wd, 2, 4, 3
2245 SBUTTERFLY wd, 1, 0, 3
2246 mova [r3+mmsize*2], m2
2247 mova [r3+mmsize*3], m4
2248 mova [r3+mmsize*4], m1
2249 mova [r3+mmsize*5], m0
2258 MBTREE_PROPAGATE_LIST
2260 MBTREE_PROPAGATE_LIST