;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2014 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

SECTION_RODATA 32

filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15

v210_mask: times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged into the 3 LSBs of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
           dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
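; v210 packs three 10-bit components into the low 30 bits of each little-endian
; dword (bits 0-9, 10-19, 20-29). A hedged scalar sketch of extracting the
; fields of one dword (illustration only; the SIMD path works differently):
;     uint32_t d = src[i];
;     uint16_t c0 =  d        & 0x3ff;
;     uint16_t c1 = (d >> 10) & 0x3ff;
;     uint16_t c2 = (d >> 20) & 0x3ff;
; The SIMD version masks with v210_mask, uses pmulhrsw against v210_mult to
; bring each field to a common bit alignment, then gathers the luma and chroma
; words with the two pshufb constants above.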
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif
pw_1024: times 16 dw 1024

pd_0f: times 4 dd 0xffff

pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512

tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
    psubw  %1, %2  ; a-5*b+4*c
    paddw  %1, %3  ; a-5*b+20*c

    psraw  %1, 2   ; (a-b)/4
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
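; Why the shift form equals the 6-tap sum (a worked check of the comments
; above, in exact arithmetic):
;     ((a-b)/4 - b + c)/4 + c
;   = (a-b)/16 - b/4 + c/4 + c
;   = (a - b - 4*b + 4*c + 16*c)/16
;   = (a - 5*b + 20*c)/16
; i.e. two arithmetic shifts replace the multiplies by 5 and 20; the
; truncation error this introduces is absorbed by the rounding in FILT_PACK.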
%if HIGH_BIT_DEPTH == 0

;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Making the hpel_filter writes temporal may be a win if the last-level cache
;is big enough (preliminary benching suggests on the order of 4x the frame size).

;%define movntps movaps
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
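; A hedged C sketch of the vertical pass (names are illustrative; cf. the
; scalar reference in common/mc.c). Each output applies the 6-tap
; (1,-5,20,20,-5,1) kernel down a column, keeps the unclipped sum in buf for
; hpel_filter_c to reuse, and stores the rounded, clipped result:
;     for( int x = 0; x < width; x++ )
;     {
;         int v = src[x-2*stride]  - 5*src[x-stride]   + 20*src[x]
;               + 20*src[x+stride] - 5*src[x+2*stride] +    src[x+3*stride];
;         buf[x] = v;  // high bit depth adds a pad offset so v fits in int16
;         dst[x] = clip( (v + 16) >> 5, 0, PIXEL_MAX );
;     }
; (see the pad10/pad20/depad constants above for the high-bit-depth offsets)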
cglobal hpel_filter_v, 5,6,11
    mova      m7, [pw_pixel_max]
    mova      m5, [r1+r3+mmsize]
    mova      m6, [r1+r3*2+mmsize]
    paddw     m4, [r5+r3*2+mmsize]
    paddw     m5, [r5+r3+mmsize]
    paddw     m6, [r5+mmsize]
    FILT_V2   m1, m2, m3, m4, m5, m6
    mova      [r2+r4+mmsize], m4
    FILT_PACK m1, m4, m6, 5, s10
    mova      [r0+r4+mmsize], m4

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
    CLIPW     m1, [pb_0], [pw_pixel_max]

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
    mova      m0, [pw_pixel_max]
    movu      m4, [src-4+mmsize]
    movu      m5, [src-2+mmsize]
    movu      m7, [src+4+mmsize]
    movu      m6, [src+6+mmsize]
    movu      m7, [src+2+mmsize]
    mova      m6, [src+0+mmsize]
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 1
    mova      [r0+r2+mmsize], m4
%endmacro ; HPEL_FILTER

%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
    mova       m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20]
    pmaddubsw  m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3, [r1+r3*2         ], [r5         ], m7 ; c0
    LOAD_ADD   m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       [r2+r4*2+mmsize/2], xm4
    vextracti128 [r2+r4*2+mmsize], m1, 1
    vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
    mova       [r2+r4*2+mmsize], m4
    FILT_PACK  m1, m4, m7, 5

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3
    paddw     m3, [src+2]  ; c0
    paddw     m4, [src+14] ; a1
    paddw     m5, [src+12] ; b1
    paddw     m6, [src+10] ; c1
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 6

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 1

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
%ifnidn cpuname, sse2
%define pw_rnd [pw_32]
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
    movu      m3, [src-4+mmsize]
    movu      m2, [src-2+mmsize]
    mova      m1, [src+0+mmsize]
    paddw     m3, [src+6+mmsize]
    paddw     m2, [src+4+mmsize]
    paddw     m1, [src+2+mmsize]
    FILT_H2   m4, m5, m6, m3, m2, m1
    PALIGNR   m4, m1, m0, 12, m7
    PALIGNR   m5, m1, m0, 14, m0
    PALIGNR   m0, m2, m1, 6, m7
    PALIGNR   m0, m2, m1, 4, m7
    PALIGNR   m6, m2, m1, 2, m7
    PALIGNR   m2, m1, 12, m7
    PALIGNR   m5, m1, 14, m1
    PALIGNR   m3, m1, m0, 6, m7
    PALIGNR   m6, m1, m0, 4, m7
    PALIGNR   m6, m1, m0, 2, m7
    FILT_PACK m4, m3, pw_rnd, 6

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3,8
    mova      m7, [pw_1] ; FIXME xmm8
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, m7, 1
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
; Using unaligned loads instead of palignr is marginally slower on Sandy Bridge
; and significantly slower on Bulldozer, despite their fast load units -- even
; though it would let us avoid the repeated loads of constants for pmaddubsw.
    palignr   m3, m1, m0, 14
    palignr   m4, m1, m0, 15
    palignr   m0, m2, m1, 2
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    palignr   m5, m2, m1, 1
    palignr   m6, m2, m1, 3
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    FILT_PACK m3, m4, m7, 5
    pshufb    m3, [hpel_shuf]

cglobal hpel_filter_h, 3,3,8
    mova      m5, [filt_mul15]
    mova      m6, [filt_mul20]
    mova      m7, [filt_mul51]
    FILT_PACK m0, m1, m2, 5
    pshufb    m0, [hpel_shuf]

;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+mmsize is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+mmsize]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    vinserti128 %1, m1, xm4, 1
    vperm2i128 %2, m1, m4, q0301
    FILT_PACK  m1, m4, m15, 5
    movntps    [r8+r4+%5], m1
    vperm2i128 m3, %2, %1, q0003
    PALIGNR    m1, %2, %1, (mmsize-4), m3
    PALIGNR    m2, %2, %1, (mmsize-2), m3
    vperm2i128 %1, %3, %2, q0003
    PALIGNR    m3, %3, %2, 4, %1
    PALIGNR    m4, %3, %2, 2, %1
    PALIGNR    %3, %3, %2, 6, m2
    FILT_PACK  %3, %4, m15, 6
    vperm2i128 m3, %2, %1, q0003
    PALIGNR    m1, %2, %1, (mmsize-2), m3
    PALIGNR    m2, %2, %1, (mmsize-1), m3
    vperm2i128 m3, %3, %2, q0003
    PALIGNR    m4, %3, %2, 1, m3
    PALIGNR    m5, %3, %2, 2, m3
    PALIGNR    m6, %3, %2, 3, m3
    FILT_PACK  m1, m2, m15, 5
    pshufb     m1, [hpel_shuf]
    ADD8TO16   m1, m6, m12, m3, m0 ; a
    ADD8TO16   m2, m5, m12, m3, m0 ; b
    ADD8TO16   %2, m4, m12, m3, m0 ; c
    FILT_V2    m1, m2, %2, m6, m5, m4
    FILT_PACK  m1, m6, m15, 5

;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
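; A hedged C sketch of the whole-frame filter this function implements (close
; to the scalar reference in common/mc.c; helper names are illustrative).
; TAPFILTER applies the 6-tap (1,-5,20,20,-5,1) kernel with step d:
;     #define TAPFILTER(pix, d) \
;         ( (pix)[x-2*(d)] - 5*(pix)[x-(d)] + 20*(pix)[x] \
;         + 20*(pix)[x+(d)] - 5*(pix)[x+2*(d)] + (pix)[x+3*(d)] )
;     for( int y = 0; y < height; y++, src += stride )
;     {
;         for( int x = -2; x < width+3; x++ )    // v: vertical taps, kept in buf
;             buf[x+2] = TAPFILTER( src, stride ), dstv[x] = clip( (buf[x+2]+16)>>5 );
;         for( int x = 0; x < width; x++ )       // c: horizontal taps over buf
;             dstc[x] = clip( (TAPFILTER( buf+2, 1 ) + 512) >> 10 );
;         for( int x = 0; x < width; x++ )       // h: horizontal taps over src
;             dsth[x] = clip( (TAPFILTER( src, 1 ) + 16) >> 5 );
;         dsth += stride; dstv += stride; dstc += stride;
;     }
; The asm interleaves the three passes per row (DO_FILT_V/C/H below) so the
; intermediate rows stay in registers instead of round-tripping through buf.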
cglobal hpel_filter, 7,9,16
    mova      m0, [filt_mul51]
    mova      m12, [filt_mul15]
    mova      m14, [filt_mul20]
    DO_FILT_V m8, m7, m13, m12, 0
    DO_FILT_V m6, m5, m11, m12, mmsize
    psrlw     m15, 1   ; pw_512
    paddw     m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    paddw     m15, m15 ; pw_1024
    DO_FILT_H m10, m13, m11
    ; setup regs for next y
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, intptr_t i_dst,
;                       pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
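; A hedged scalar equivalent (illustrative; the edge handling lives in the C
; wrapper in common/mc.c):
;     static void plane_copy_core( pixel *dst, intptr_t i_dst,
;                                  pixel *src, intptr_t i_src, int w, int h )
;     {
;         while( h-- )
;         {
;             memcpy( dst, src, w*sizeof(pixel) );
;             dst += i_dst;
;             src += i_src;
;         }
;     }
; Because w is a multiple of 16 and i_dst > w, the asm may write a little past
; w on each row without clobbering the start of the next row.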
cglobal plane_copy_core_mmx2, 6,7
    FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0

%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
    mov%4   m0, [%2+(x/2)*mmsize]
    mov%4   m1, [%3+(x/2)*mmsize]
    mov%5a  [%1+(x+0)*mmsize], m0
    mov%5a  [%1+(x+1)*mmsize], m2
%endif ; HIGH_BIT_DEPTH

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
    mova    m0, [%3+(n+0)*mmsize]
    mova    m1, [%3+(n+1)*mmsize]
    mov%6   [%1+(n/2)*mmsize], m0
    mov%6   [%2+(n/2)*mmsize], m2
%else ; !HIGH_BIT_DEPTH
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH

%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 6,9
    FIX_STRIDES r1, r3, r5, r6d
    shr     t1, SIZEOF_PIXEL
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add     r6, 16*SIZEOF_PIXEL
    movntq  [r0+r6*2+(n+ 0)], m0
    movntq  [r0+r6*2+(n+ 8)], m0
    movntq  [r0+r6*2+(n+16)], m0
    movntq  [r0+r6*2+(n+24)], m0
    movntdq [r0+r6*2+(n+ 0)], m0
    movntdq [r0+r6*2+(n+16)], m0
    add     r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add     r2, FDEC_STRIDEB*2
    add     r3, FDEC_STRIDEB*2
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 0
%elif cpuflag(ssse3)
    mova    m4, [deinterleave_shuf]
%endif ; HIGH_BIT_DEPTH

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
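; A hedged scalar equivalent (illustrative): split packed UVUVUV... chroma
; into separate U and V planes.
;     for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
;         for( int x = 0; x < w; x++ )
;         {
;             dstu[x] = src[2*x];
;             dstv[x] = src[2*x+1];
;         }
; The SIMD version handles 16 pixels at a time, using pshufb with
; deinterleave_shuf where SSSE3 is available.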
cglobal plane_copy_deinterleave, 6,7
    FIX_STRIDES r1, r3, r5, r6d
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add     r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE r0+ 0,           r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add     r0, FENC_STRIDEB*2

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE r0+ 0,           r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add     r0, FDEC_STRIDEB*2
%endmacro ; PLANE_DEINTERLEAVE

%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
;                                         uint16_t *dstc, intptr_t i_dstc,
;                                         uint32_t *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
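; A hedged scalar sketch of the v210 unpack (illustrative; cf. the C fallback
; in common/mc.c). Each pair of dwords yields 3 luma and 3 chroma samples:
;     for( int n = 0; n < w; n += 3 )
;     {
;         uint32_t a = *src++, b = *src++;
;         *dstc++ =  a        & 0x3ff;  // chroma (Cb/Cr alternate)
;         *dsty++ = (a >> 10) & 0x3ff;  // luma
;         *dstc++ = (a >> 20) & 0x3ff;
;         *dsty++ =  b        & 0x3ff;
;         *dstc++ = (b >> 10) & 0x3ff;
;         *dsty++ = (b >> 20) & 0x3ff;
;     }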
cglobal plane_copy_deinterleave_v210, 8,10,7
cglobal plane_copy_deinterleave_v210, 7,7,7
    FIX_STRIDES r1, r3, r6d
    mova     m2, [v210_mask]
    mova     m3, [v210_luma_shuf]
    mova     m4, [v210_chroma_shuf]
    mova     m5, [v210_mult] ; also functions as vpermd index for avx2
    pshufd   m6, m5, q1102
    pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
    pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%endmacro ; PLANE_DEINTERLEAVE_V210

PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210
PLANE_DEINTERLEAVE_V210

; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned, 3,3
    mova     m0, [r1+r2-1*mmsize]
    mova     m1, [r1+r2-2*mmsize]
    mova     [r0+r2-1*mmsize], m0
    mova     [r0+r2-2*mmsize], m1
    mova     m0, [r1+r2-1*mmsize]
    mova     m1, [r1+r2-2*mmsize]
    mova     m2, [r1+r2-3*mmsize]
    mova     m3, [r1+r2-4*mmsize]
    mova     [r0+r2-1*mmsize], m0
    mova     [r0+r2-2*mmsize], m1
    mova     [r0+r2-3*mmsize], m2
    mova     [r0+r2-4*mmsize], m3

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned, 2,2
    mova     [r0 + r1 + i], m0

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
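; A hedged C sketch (close to the scalar reference in common/mc.c): a sliding
; 4-pixel horizontal sum, accumulated on top of the previous row of the
; integral image.
;     int v = pix[0]+pix[1]+pix[2]+pix[3];
;     for( int x = 0; x < stride-4; x++ )
;     {
;         sum[x] = v + sum[x-stride];
;         v += pix[x+4] - pix[x];
;     }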
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    paddw   m1, [r0+r2*2+mmsize]
    mova    [r3+r2*2+mmsize], m1

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    mpsadbw m2, m0, m4, 100100b
    mpsadbw m3, m1, m4, 100100b
    mpsadbw m2, m0, m4, 100b
    mpsadbw m3, m1, m4, 100b
    paddw   m1, [r0+r2*2+mmsize]
    mova    [r3+r2*2+mmsize], m1
%endif ; !HIGH_BIT_DEPTH

%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
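; A hedged C sketch (cf. common/mc.c): the rows produced by integral_init8h
; are vertically cumulative, so an 8-tall box sum is the difference of two
; rows 8 apart:
;     for( int x = 0; x < stride; x++ )
;         sum8[x] = sum8[x+8*stride] - sum8[x];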
cglobal integral_init8v, 3,3
    mova    m1, [r2+r1+mmsize]
    psubw   m1, [r0+r1+mmsize]
    mova    [r0+r1+mmsize], m1

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
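; A hedged C sketch (cf. common/mc.c): sum4 gets 4-tall box sums, and in the
; same pass sum8 is rewritten as 8-tall sums widened from 4 to 8 columns:
;     for( int x = 0; x < stride; x++ )
;     {
;         sum4[x] = sum8[x+4*stride] - sum8[x];
;         sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
;     }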
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    shufpd  m0, [r0+r2+16], 1
    shufpd  m1, [r4+r2+16], 1
cglobal integral_init4v, 3,5
cglobal integral_init4v, 3,5
    paddw   m0, m2, [r0+r2+8]

    pavgb   %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
    pavgb   m2, m3, [r0+1]
    pavgb   m3, [r0+r5*2+1]
    mova    m3, [r0+r5+mmsize]
    pavgb   m2, m3, [r0+mmsize]
    movu    m5, [r0+r5+1+mmsize]
    pavgb   m4, m5, [r0+1+mmsize]
    pavgb   m3, [r0+r5*2+mmsize]
    pavgb   m5, [r0+r5*2+1+mmsize]
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq  m0, m0, q3120
    vpermq  m1, m4, q3120
    vpermq  m2, m2, q3120
    vpermq  m3, m5, q3120
    mova    m3, [r0+%4+mmsize]
    pavgb   m3, [r0+%4+r5+mmsize]
    pavgb   m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    vpperm  m5, m3, %1, m7
    vpperm  m3, m3, %1, m6
    pavgb   m3, [r0+%3+r5+8]
    pavgb   m2, [r0+%3+r5]
    pavgb   m1, [r0+%3+r5+9]
    pavgb   m0, [r0+%3+r5+1]
    pavgw   m3, [r0+%3+r5+8]
    pavgw   m2, [r0+%3+r5]
    pavgw   m1, [r0+%3+r5+10]
    pavgw   m0, [r0+%3+r5+2]
    mova    m3, [r0+%4+mmsize]
    pavgw   m3, [r0+%4+r5+mmsize]
    pavgw   m2, [r0+%4+r5]
    PALIGNR %1, m3, 2, m6
    PALIGNR m3, m2, 2, m6
    vpperm  m5, m3, %1, m7
    vpperm  m3, m3, %1, m6

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
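; A hedged C sketch (close to the scalar reference in common/mc.c): each
; lowres plane is a half-resolution downsample at one of the four half-pel
; phases, built from two rounds of pavg-style averaging:
;     #define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
;     for( int y = 0; y < height; y++ )
;     {
;         pixel *src1 = src0+src_stride, *src2 = src1+src_stride;
;         for( int x = 0; x < width; x++ )
;         {
;             dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
;             dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
;             dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
;             dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
;         }
;         src0 += src_stride*2;
;         dst0 += dst_stride; dsth += dst_stride;
;         dstv += dst_stride; dstc += dst_stride;
;     }
; which matches the pavgb/pavgw + PALIGNR pattern in the FILT* macros above.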
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
    add     dword r7m, mmsize-1
    and     dword r7m, ~(mmsize-1)
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    mova    m6, [deinterleave_shuf32a]
    mova    m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
%ifidn cpuname, mmx2
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%else ; !HIGH_BIT_DEPTH
    mova    m7, [deinterleave_shuf]
    mova    m6, [deinterleave_shuf32a]
    mova    m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
    FILT32x4U r1, r2, r3, r4
    FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4 m2, m3, m0, m1, m4, m5, 0
    vpperm  m4, m2, m8, m7
    vpperm  m2, m2, m8, m6
    vpperm  m5, m3, m9, m7
    vpperm  m3, m3, m9, m6
%elifidn cpuname, mmx2
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endif ; HIGH_BIT_DEPTH
%endmacro ; FRAME_INIT_LOWRES

%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
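; A hedged C sketch of the computation (close to the scalar reference in
; common/mc.c; the masking/clamping of inter_costs is simplified here):
;     float fps = *fps_factor / 256.f;
;     for( int i = 0; i < len; i++ )
;     {
;         float intra = intra_costs[i];
;         float prop  = propagate_in[i] + intra * inv_qscales[i] * fps;
;         dst[i] = (int)( prop * (intra - inter_costs[i]) / intra + 0.5f );
;     }
; The SIMD loops below compute the division with rcpps plus one
; Newton-Raphson step instead of divps.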
cglobal mbtree_propagate_cost, 6,6,7
    movq     m2, [r2+r5] ; intra
    movq     m0, [r4+r5] ; invq
    movq     m3, [r3+r5] ; inter
    movq     m1, [r1+r5] ; prop
    fmaddps  m0, m0, m6, m1
    fnmaddps m3, m1, m3, m2
    mulps    m0, m6     ; intra*invq*fps_factor>>8
    cvtdq2ps m1, m1     ; prop
    addps    m0, m1     ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps m1, m2     ; intra
    psubd    m2, m3     ; intra - inter
    cvtdq2ps m2, m2     ; intra - inter
    rcpps    m3, m1     ; 1 / intra 1st approximation
    mulps    m1, m3     ; intra * (1/intra 1st approx)
    mulps    m1, m3     ; intra * (1/intra 1st approx)^2
    mulps    m0, m2     ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps    m3, m3     ; 2 * (1/intra 1st approx)
    subps    m3, m1     ; 2nd approximation for 1/intra
    mulps    m0, m3     ; / intra
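; Note on the rcpps sequence above: rcpps only guarantees about 12 bits of
; precision, so one Newton-Raphson step refines the reciprocal:
;     x1 = x0 * (2 - d*x0) = 2*x0 - d*x0^2
; computed here as (2*x0) - (d*x0)*x0, which is accurate enough for the
; integer range of mbtree costs while staying much cheaper than divps.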
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.

%macro INT16_UNPACK 1
    punpckhwd   xm4, xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1

; FIXME: align loads to 16 bytes
cglobal mbtree_propagate_cost, 6,6,%1
    vbroadcastss m6, [r5]
%if notcpuflag(avx2)
    pmovzxwd m0, [r2+r5] ; intra
    pmovzxwd m1, [r4+r5] ; invq
    pmovzxwd m2, [r1+r5] ; prop
    pand     xm3, xm5, [r3+r5] ; inter
    fmaddps  m1, m1, m6, m2
    fnmaddps m4, m2, m3, m4
    pand     xm3, xm5, [r3+r5]
    mulps    m1, m6     ; intra*invq*fps_factor>>8
    addps    m1, m2     ; prop + (intra*invq*fps_factor>>8)
    rcpps    m3, m0     ; 1 / intra 1st approximation
    mulps    m2, m0, m3 ; intra * (1/intra 1st approx)
    mulps    m2, m3     ; intra * (1/intra 1st approx)^2
    mulps    m1, m4     ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps    m3, m3     ; 2 * (1/intra 1st approx)
    subps    m3, m2     ; 2nd approximation for 1/intra
    mulps    m1, m3     ; / intra