;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

SECTION_RODATA 32
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15

%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif
pd_0f: times 4 dd 0xffff
pf_inv256: times 8 dd 0.00390625

pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512

tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
    psubw  %1, %2  ; a-5*b+4*c
    paddw  %1, %3  ; a-5*b+20*c

    psraw  %1, 2   ; (a-b)/4
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
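
; A scalar sketch of the identity the comments above track (illustrative C,
; not part of x264): with a = p[-2]+p[3], b = p[-1]+p[2], c = p[0]+p[1], the
; target is (a - 5*b + 20*c)/16, and the shift decomposition keeps every
; intermediate within 16 bits:
;
;   static int filt_h_ref( int a, int b, int c )
;   {
;       int t = (a - b) >> 2;   /* psraw 2: (a-b)/4 */
;       t = (t - b + c) >> 2;   /* ((a-b)/4 - b + c)/4 */
;       return t + c;           /* = (a-5*b+20*c)/16, up to shift truncation */
;   }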
%macro FILT_PACK 4-6 b
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing hpel_filter with temporal writes may be a win if the last-level cache
;is big enough (preliminary benching suggests on the order of 4x the frame size).

;%define movntps movaps
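
; What that define toggles, sketched with intrinsics (hypothetical helper,
; not part of x264):
;
;   #include <xmmintrin.h>
;   static void store16f( float *dst, __m128 v, int temporal )
;   {
;       if( temporal )
;           _mm_store_ps( dst, v );  /* movaps: normal cached store */
;       else
;           _mm_stream_ps( dst, v ); /* movntps: write-combining, bypasses the cache */
;   }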
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,11
    mova      m7, [pw_pixel_max]
    mova      m5, [r1+r3+mmsize]
    mova      m6, [r1+r3*2+mmsize]
    paddw     m4, [r5+r3*2+mmsize]
    paddw     m5, [r5+r3+mmsize]
    paddw     m6, [r5+mmsize]
    FILT_V2   m1, m2, m3, m4, m5, m6
    mova      [r2+r4+mmsize], m4
    FILT_PACK m1, m4, 5, m6, w, s10
    mova      [r0+r4+mmsize], m4

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
    CLIPW     m1, [pb_0], [pw_pixel_max]

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
    mova      m0, [pw_pixel_max]
    movu      m4, [src-4+mmsize]
    movu      m5, [src-2+mmsize]
    movu      m7, [src+4+mmsize]
    movu      m6, [src+6+mmsize]
    movu      m7, [src+2+mmsize]
    mova      m6, [src+0+mmsize]
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7, w
    mova      [r0+r2+mmsize], m4
%endmacro ; HPEL_FILTER

%endif ; HIGH_BIT_DEPTH
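
; Rough C reference for the vertical pass above (after the scalar code in
; mc.c; PIXEL_MAX as in the constants earlier in this file).  The int16_t
; bias is what the pad10/pad20/depad constants and pw_pixel_max implement:
;
;   static void hpel_filter_v_ref( uint16_t *dst, uint16_t *src, int16_t *buf,
;                                  intptr_t stride, intptr_t width )
;   {
;       for( intptr_t x = 0; x < width; x++ )
;       {
;           int v = src[x-2*stride] + src[x+3*stride]
;                 - 5*(src[x-stride] + src[x+2*stride])
;                 + 20*(src[x]       + src[x+stride]);
;           buf[x] = (int16_t)(v - 10*PIXEL_MAX); /* bias so the sum fits in 16 bits */
;           int p = (v + 16) >> 5;
;           dst[x] = (uint16_t)(p < 0 ? 0 : p > PIXEL_MAX ? PIXEL_MAX : p);
;       }
;   }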
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
    mova       m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20]
    pmaddubsw  m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7     ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova         [r2+r4*2+mmsize/2], xm4
    vextracti128 [r2+r4*2+mmsize], m1, 1
    vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
    mova         [r2+r4*2+mmsize], m4
    FILT_PACK  m1, m4, 5, m7
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_mmx2, 3,3
    paddw     m3, [src+2]  ; c0
    paddw     m4, [src+14] ; a1
    paddw     m5, [src+12] ; b1
    paddw     m6, [src+10] ; c1
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 6, m7
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmx2, 3,3
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
%ifnidn cpuname, sse2
%define tpw_32 [pw_32]
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
%if cpuflag(misalign) || mmsize==32
    movu      m3, [src-4+mmsize]
    movu      m2, [src-2+mmsize]
    mova      m1, [src+0+mmsize]
    paddw     m3, [src+6+mmsize]
    paddw     m2, [src+4+mmsize]
    paddw     m1, [src+2+mmsize]
    FILT_H2   m4, m5, m6, m3, m2, m1
    PALIGNR   m4, m1, m0, 12, m7
    PALIGNR   m5, m1, m0, 14, m0
    PALIGNR   m0, m2, m1, 6, m7
    PALIGNR   m0, m2, m1, 4, m7
    PALIGNR   m6, m2, m1, 2, m7
    PALIGNR   m2, m1, 12, m7
    PALIGNR   m5, m1, 14, m1
    PALIGNR   m3, m1, m0, 6, m7
    PALIGNR   m6, m1, m0, 4, m7
    PALIGNR   m6, m1, m0, 2, m7
    FILT_PACK m4, m3, 6, tpw_32
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
    mova      m7, [pw_1] ; FIXME xmm8
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
; Using unaligned loads instead of palignr is marginally slower on SB and
; significantly slower on Bulldozer, despite their fast load units -- even
; though it would let us avoid the repeated loads of constants for pmaddubsw.
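;
; The two alternatives weighed above, as intrinsics (illustrative only): given
; aligned loads lo = src[0..15] and hi = src[16..31], a window at src+14 is
; formed either with palignr or with one unaligned load:
;
;   #include <stdint.h>
;   #include <tmmintrin.h>
;   static __m128i window14_palignr( __m128i lo, __m128i hi )
;   {
;       return _mm_alignr_epi8( hi, lo, 14 );  /* palignr hi, lo, 14 */
;   }
;   static __m128i window14_movdqu( const uint8_t *src )
;   {
;       return _mm_loadu_si128( (const __m128i*)(src+14) );  /* movdqu */
;   }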
    palignr   m3, m1, m0, 14
    palignr   m4, m1, m0, 15
    palignr   m0, m2, m1, 2
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    palignr   m5, m2, m1, 1
    palignr   m6, m2, m1, 3
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    FILT_PACK m3, m4, 5, m7
    pshufb    m3, [hpel_shuf]
INIT_XMM sse2, misalign

cglobal hpel_filter_h, 3,3,8
    mova      m5, [filt_mul15]
    mova      m6, [filt_mul20]
    mova      m7, [filt_mul51]
    FILT_PACK m0, m1, 5, m2
    pshufb    m0, [hpel_shuf]
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+16 is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+16]
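
; The same pattern in C intrinsics (sketch; the +16 distance is the arbitrary
; choice discussed above and should be tuned on real workloads, not checkasm):
;
;   #include <emmintrin.h>
;   #include <stdint.h>
;   static void copy_rows_prefetched( uint8_t *dst, const uint8_t *src, int n )
;   {
;       for( int i = 0; i < n; i += 16 )
;       {
;           _mm_prefetch( (const char*)(src+i+16), _MM_HINT_T0 ); /* prefetcht0 */
;           _mm_storeu_si128( (__m128i*)(dst+i),
;                             _mm_loadu_si128( (const __m128i*)(src+i) ) );
;       }
;   }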
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, 5, m15
    movntps    [r8+r4+%5], m1

    PALIGNR    m1, %2, %1, 12, m2
    PALIGNR    m2, %2, %1, 14, %1
    PALIGNR    m3, %3, %2, 4, %1
    PALIGNR    m4, %3, %2, 2, %1
    PALIGNR    %3, %2, 6, m2
    FILT_PACK  %3, %4, 6, m15

    PALIGNR    m1, %2, %1, 14, m3
    PALIGNR    m2, %2, %1, 15, m3
    PALIGNR    m4, %3, %2, 1 , m3
    PALIGNR    m5, %3, %2, 2 , m3
    PALIGNR    m6, %3, %2, 3 , m3
    FILT_PACK  m1, m2, 5, m15
    pshufb     m1, [hpel_shuf]
    ADD8TO16   m1, m6, m12, m3, m0 ; a
    ADD8TO16   m2, m5, m12, m3, m0 ; b
    ADD8TO16   %2, m4, m12, m3, m0 ; c
    FILT_V2    m1, m2, %2, m6, m5, m4
    FILT_PACK  m1, m6, 5, m15
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
    mova      m0, [filt_mul51]
    mova      m12, [filt_mul15]
    mova      m14, [filt_mul20]
    DO_FILT_V m8, m7, m13, m12, 0
    DO_FILT_V m6, m5, m11, m12, 16
    paddw     m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    DO_FILT_H m10, m13, m11
    ; setup regs for next y
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, intptr_t i_dst,
;                       pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
cglobal plane_copy_core_mmx2, 6,7
    FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0

%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
    mov%4  m0, [%2+(x/2)*mmsize]
    mov%4  m1, [%3+(x/2)*mmsize]
    mov%5a [%1+(x+0)*mmsize], m0
    mov%5a [%1+(x+1)*mmsize], m2
%endif ; HIGH_BIT_DEPTH
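
; Semantics of plane_copy_core in plain C (sketch; names are mine, not the
; x264 API): copy h rows of w pixels each.  The mod-16 and i_dst>w
; assumptions above let the asm round the row length up and write past the
; row end instead of handling a scalar tail:
;
;   #include <stdint.h>
;   #include <string.h>
;   static void plane_copy_ref( uint8_t *dst, intptr_t i_dst,
;                               uint8_t *src, intptr_t i_src, int w, int h )
;   {
;       for( ; h > 0; h--, dst += i_dst, src += i_src )
;           memcpy( dst, src, w );
;   }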
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
    mova  m0, [%3+(n+0)*mmsize]
    mova  m1, [%3+(n+1)*mmsize]
    mov%6 [%1+(n/2)*mmsize], m0
    mov%6 [%2+(n/2)*mmsize], m2
%else ; !HIGH_BIT_DEPTH
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH
%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 6,9
    FIX_STRIDES r1, r3, r5, r6d
    shr    t1, SIZEOF_PIXEL
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add    r6, 16*SIZEOF_PIXEL
    movntq  [r0+r6*2+(n+ 0)], m0
    movntq  [r0+r6*2+(n+ 8)], m0
    movntq  [r0+r6*2+(n+16)], m0
    movntq  [r0+r6*2+(n+24)], m0
    movntdq [r0+r6*2+(n+ 0)], m0
    movntdq [r0+r6*2+(n+16)], m0
    add    r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add    r2, FDEC_STRIDEB*2
    add    r3, FDEC_STRIDEB*2
%endmacro ; PLANE_INTERLEAVE
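
; Scalar equivalent of the interleave (sketch): the punpckl/hbw pairs in the
; INTERLEAVE macro do this 16 pixels at a time, and the trailing movnt stores
; pad out the mod-16 tail:
;
;   static void plane_copy_interleave_ref( uint8_t *dst,  intptr_t i_dst,
;                                          uint8_t *srcu, intptr_t i_srcu,
;                                          uint8_t *srcv, intptr_t i_srcv,
;                                          int w, int h )
;   {
;       for( ; h > 0; h--, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
;           for( int x = 0; x < w; x++ )
;           {
;               dst[2*x  ] = srcu[x];
;               dst[2*x+1] = srcv[x];
;           }
;   }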
%macro DEINTERLEAVE_START 0
%elif cpuflag(ssse3)
    mova   m4, [deinterleave_shuf]
%endif ; HIGH_BIT_DEPTH

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
    FIX_STRIDES r1, r3, r5, r6d
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add    r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE r0+ 0,           r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add    r0, FENC_STRIDEB*2

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE r0+ 0,           r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add    r0, FDEC_STRIDEB*2
%endmacro ; PLANE_DEINTERLEAVE
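
; And the inverse, matching the DEINTERLEAVE macro (sketch): the pand/psrlw
; (or pshufb) even/odd split replaces this scalar indexing:
;
;   static void plane_copy_deinterleave_ref( uint8_t *dstu, intptr_t i_dstu,
;                                            uint8_t *dstv, intptr_t i_dstv,
;                                            uint8_t *src,  intptr_t i_src,
;                                            int w, int h )
;   {
;       for( ; h > 0; h--, dstu += i_dstu, dstv += i_dstv, src += i_src )
;           for( int x = 0; x < w; x++ )
;           {
;               dstu[x] = src[2*x  ];
;               dstv[x] = src[2*x+1];
;           }
;   }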
; These functions are not general-use; not only do the SSE ones require aligned
; input, but they will also fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
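
; A checked wrapper making those preconditions explicit (hypothetical helper,
; not part of x264):
;
;   #include <assert.h>
;   #include <stdint.h>
;   #include <string.h>
;   static void *memcpy_aligned_checked( void *dst, const void *src, size_t n )
;   {
;       assert( !((uintptr_t)dst & 15) && !((uintptr_t)src & 15) && !(n & 15) );
;       return memcpy( dst, src, n );  /* the asm copies in whole mmsize blocks */
;   }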
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned, 3,3
    mova m0, [r1+r2-1*mmsize]
    mova m1, [r1+r2-2*mmsize]
    mova [r0+r2-1*mmsize], m0
    mova [r0+r2-2*mmsize], m1

    mova m0, [r1+r2-1*mmsize]
    mova m1, [r1+r2-2*mmsize]
    mova m2, [r1+r2-3*mmsize]
    mova m3, [r1+r2-4*mmsize]
    mova [r0+r2-1*mmsize], m0
    mova [r0+r2-2*mmsize], m1
    mova [r0+r2-3*mmsize], m2
    mova [r0+r2-4*mmsize], m3

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned, 2,2
    mova [r0 + r1 + i], m0
%if HIGH_BIT_DEPTH == 0

;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    paddw m1, [r0+r2*2+mmsize]
    mova  [r3+r2*2+mmsize], m1

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    mpsadbw m2, m0, m4, 100100b
    mpsadbw m3, m1, m4, 100100b
    mpsadbw m2, m0, m4, 100b
    mpsadbw m3, m1, m4, 100b
    paddw m1, [r0+r2*2+mmsize]
    mova  [r3+r2*2+mmsize], m1
%endif ; !HIGH_BIT_DEPTH
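
; Scalar reference for integral_init4h (after the C version in mc.c): each
; output is a sliding 4-pixel horizontal sum added to the row above, i.e.
; one row of the 4-wide integral image:
;
;   static void integral_init4h_ref( uint16_t *sum, uint8_t *pix, intptr_t stride )
;   {
;       int v = pix[0]+pix[1]+pix[2]+pix[3];
;       for( intptr_t x = 0; x < stride-4; x++ )
;       {
;           sum[x] = v + sum[x-stride];
;           v += pix[x+4] - pix[x];  /* slide the 4-wide window right by one */
;       }
;   }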
%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
    mova  m1, [r2+r1+mmsize]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1+mmsize], m1

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init4v, 3,5

cglobal integral_init4v, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1

cglobal integral_init4v, 3,5

cglobal integral_init4v, 3,5
    paddw  m0, m2, [r0+r2+8]
    pavgb %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6

    pavgb m2, m3, [r0+1]
    pavgb m3, [r0+r5*2+1]
    mova  m3, [r0+r5+mmsize]
    pavgb m2, m3, [r0+mmsize]
    movu  m5, [r0+r5+1+mmsize]
    pavgb m4, m5, [r0+1+mmsize]
    pavgb m3, [r0+r5*2+mmsize]
    pavgb m5, [r0+r5*2+1+mmsize]

    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq m0, m0, q3120
    vpermq m1, m4, q3120
    vpermq m2, m2, q3120
    vpermq m3, m5, q3120

    mova  m3, [r0+%4+mmsize]
    pavgb m3, [r0+%4+r5+mmsize]
    pavgb m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6

    pavgb m3, [r0+%3+r5+8]
    pavgb m2, [r0+%3+r5]
    pavgb m1, [r0+%3+r5+9]
    pavgb m0, [r0+%3+r5+1]

    pavgw m3, [r0+%3+r5+8]
    pavgw m2, [r0+%3+r5]
    pavgw m1, [r0+%3+r5+10]
    pavgw m0, [r0+%3+r5+2]

    mova  m3, [r0+%4+mmsize]
    pavgw m3, [r0+%4+r5+mmsize]
    pavgw m2, [r0+%4+r5]
    PALIGNR %1, m3, 2, m6
    PALIGNR m3, m2, 2, m6
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
    add dword r7m, mmsize-1
    and dword r7m, ~(mmsize-1)
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
    %define dst_gap [rsp+gprsize]
    %define src_gap [rsp]
    mova m6, [deinterleave_shuf32a]
    mova m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
%ifidn cpuname, mmx2
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%else ; !HIGH_BIT_DEPTH
    mova m7, [deinterleave_shuf]
    mova m6, [deinterleave_shuf32a]
    mova m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
    FILT32x4U r1, r2, r3, r4
    FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4 m2, m3, m0, m1, m4, m5, 0
    vpperm m4, m2, m8, m7
    vpperm m2, m2, m8, m6
    vpperm m5, m3, m9, m7
    vpperm m3, m3, m9, m6
%elifidn cpuname, mmx2
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endif ; HIGH_BIT_DEPTH
%endmacro ; FRAME_INIT_LOWRES
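
; Scalar reference for the lowres downsample (after the C version in mc.c):
; the chained pavgb rounding averages compute exactly this FILTER, producing
; the four half-pel-offset lowres planes in one pass:
;
;   static void frame_init_lowres_ref( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
;                                      uint8_t *dstv, uint8_t *dstc,
;                                      intptr_t src_stride, intptr_t dst_stride,
;                                      int width, int height )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           uint8_t *src1 = src0+src_stride, *src2 = src1+src_stride;
;           for( int x = 0; x < width; x++ )
;           {
;   #define FILTER(a,b,c,d) ( (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1 )
;               dst0[x] = FILTER( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
;               dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;               dstv[x] = FILTER( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
;               dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
;   #undef FILTER
;           }
;           src0 += src_stride*2;
;           dst0 += dst_stride; dsth += dst_stride;
;           dstv += dst_stride; dstc += dst_stride;
;       }
;   }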
%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost, 7,7,7
    shufps    xmm6, xmm6, 0
    mulps     xmm6, [pf_inv256]
    movdqa    xmm5, [pw_3fff]
    movq      xmm2, [r2+r6] ; intra
    movq      xmm0, [r4+r6] ; invq
    movq      xmm3, [r3+r6] ; inter
    movq      xmm1, [r1+r6] ; prop
    punpcklwd xmm2, xmm4
    punpcklwd xmm0, xmm4
    punpcklwd xmm1, xmm4
    punpcklwd xmm3, xmm4
    fmaddps   xmm0, xmm0, xmm6, xmm1
    addps     xmm2, xmm3, xmm3
    fnmaddps  xmm3, xmm1, xmm3, xmm2
    mulps     xmm0, xmm6 ; intra*invq*fps_factor>>8
    cvtdq2ps  xmm1, xmm1 ; prop
    addps     xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps  xmm1, xmm2 ; intra
    psubd     xmm2, xmm3 ; intra - inter
    cvtdq2ps  xmm2, xmm2 ; intra - inter
    rcpps     xmm3, xmm1 ; 1 / intra 1st approximation
    mulps     xmm1, xmm3 ; intra * (1/intra 1st approx)
    mulps     xmm1, xmm3 ; intra * (1/intra 1st approx)^2
    mulps     xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps     xmm3, xmm3 ; 2 * (1/intra 1st approx)
    subps     xmm3, xmm1 ; 2nd approximation for 1/intra
    mulps     xmm0, xmm3 ; / intra
    movdqa    [r0+r6*2], xmm0
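
; Scalar equivalent of the block above (sketch; 0x3fff is the pw_3fff mask
; applied to the inter costs, and exact rounding of the final conversion is
; left aside):
;
;   static void mbtree_propagate_cost_ref( int *dst, uint16_t *propagate_in,
;                                          uint16_t *intra_costs, uint16_t *inter_costs,
;                                          uint16_t *inv_qscales, float *fps_factor, int len )
;   {
;       float fps = *fps_factor * (1.f/256.f);  /* the mulps [pf_inv256] */
;       for( int i = 0; i < len; i++ )
;       {
;           float intra  = intra_costs[i];
;           float amount = propagate_in[i] + intra * inv_qscales[i] * fps;
;           float num    = intra - (inter_costs[i] & 0x3fff);
;           dst[i] = (int)( amount * num / intra ); /* asm divides via rcpps + NR */
;       }
;   }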
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.

%macro INT16_UNPACK 1
    vpunpckhwd  xm4, xm%1, xm7
    vpunpcklwd  xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1

; FIXME: align loads/stores to 16 bytes

cglobal mbtree_propagate_cost, 7,7,8
    vbroadcastss m6, [r5]
    mulps        m6, [pf_inv256]
%if notcpuflag(avx2)
    pmovzxwd     m0, [r2+r6] ; intra
    pmovzxwd     m1, [r4+r6] ; invq
    pmovzxwd     m2, [r1+r6] ; prop
    pand         xm3, xm5, [r3+r6] ; inter
    fmaddps      m1, m1, m6, m2
    fnmaddps     m4, m2, m3, m4
    pand         xm3, xm5, [r3+r6]
    mulps        m1, m6 ; intra*invq*fps_factor>>8
    addps        m1, m2 ; prop + (intra*invq*fps_factor>>8)
    rcpps        m3, m0 ; 1 / intra 1st approximation
    mulps        m2, m0, m3 ; intra * (1/intra 1st approx)
    mulps        m2, m3 ; intra * (1/intra 1st approx)^2
    mulps        m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps        m3, m3 ; 2 * (1/intra 1st approx)
    subps        m3, m2 ; 2nd approximation for 1/intra
    mulps        m1, m3 ; / intra
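
; The rcpps + Newton-Raphson division spelled out (sketch): rcpps returns a
; ~12-bit 1/x estimate; one refinement step x1 = 2*x0 - a*x0^2 roughly doubles
; the precision, which is what the mulps/mulps/addps/subps sequence computes:
;
;   #include <xmmintrin.h>
;   static __m128 rcp_nr_ps( __m128 a )
;   {
;       __m128 x   = _mm_rcp_ps( a );                       /* rcpps  */
;       __m128 ax2 = _mm_mul_ps( a, _mm_mul_ps( x, x ) );   /* a*x0^2 */
;       return _mm_sub_ps( _mm_add_ps( x, x ), ax2 );       /* 2*x0 - a*x0^2 */
;   }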