;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Holger Lubitz <holger@lubitz.org>
;* Mathieu Monnier <manao@melix.net>
;* Oskar Arvidsson <oskar@irock.se>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif
pd_0f: times 4 dd 0xffff
pf_inv256: times 8 dd 0.00390625

pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
depad: times 4 dd 32*20*PIXEL_MAX + 512

tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
psubw %1, %2 ; a-5*b+4*c
paddw %1, %3 ; a-5*b+20*c
psraw %1, 2  ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2  ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
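;
; The shift sequence above strength-reduces the 6-tap kernel: in exact arithmetic,
; ((a-b)/4 - b + c)/4 + c == (a - 5*b + 20*c)/16. A minimal scalar C sketch of the
; same factoring (hedged: psraw truncates toward negative infinity, so rounding
; differs slightly from true division):
;
;   static inline int filt_h( int a, int b, int c )
;   {
;       int x = (a - b) >> 2;     /* (a-b)/4             */
;       x = (x - b + c) >> 2;     /* ((a-b)/4 - b + c)/4 */
;       return x + c;             /* ~(a-5*b+20*c)/16    */
;   }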
%macro FILT_PACK 4-6 b

;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter with temporal writes may be a win if the last level cache
;is big enough (preliminary benching suggests on the order of 4* framesize).
;%define movntps movaps

;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
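;
; For orientation, a plain-C model of the three half-pel planes this file
; produces (a hedged sketch in the spirit of x264's C fallback; `pixel`,
; `clip_pixel`, and the function name are assumed helpers, and border/padding
; handling is elided -- at high bit depth the pad10/pad20/depad constants above
; exist to bias the intermediate sums into int16_t range):
;
;   #define TAP(p,x,d) ((p)[x-2*(d)] + (p)[x+3*(d)] \
;                       - 5*((p)[x-(d)] + (p)[x+2*(d)]) \
;                       + 20*((p)[x] + (p)[x+(d)]))
;   static void hpel_row_ref( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
;                             intptr_t stride, int width, int16_t *buf )
;   {
;       for( int x = -2; x < width+3; x++ )
;       {
;           int v = TAP( src, x, stride );              /* vertical 6-tap      */
;           dstv[x] = clip_pixel( (v + 16) >> 5 );
;           buf[x+2] = v;                               /* keep for the c-pass */
;       }
;       for( int x = 0; x < width; x++ )                /* center: filter buf  */
;           dstc[x] = clip_pixel( (TAP( buf+2, x, 1 ) + 512) >> 10 );
;       for( int x = 0; x < width; x++ )                /* horizontal 6-tap    */
;           dsth[x] = clip_pixel( (TAP( src, x, 1 ) + 16) >> 5 );
;   }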
cglobal hpel_filter_v, 5,6,11
mova m7, [pw_pixel_max]
mova m5, [r1+r3+mmsize]
mova m6, [r1+r3*2+mmsize]
paddw m4, [r5+r3*2+mmsize]
paddw m5, [r5+r3+mmsize]
paddw m6, [r5+mmsize]
FILT_V2 m1, m2, m3, m4, m5, m6
mova [r2+r4+mmsize], m4
FILT_PACK m1, m4, 5, m6, w, s10
mova [r0+r4+mmsize], m4

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
CLIPW m1, [pb_0], [pw_pixel_max]

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
mova m0, [pw_pixel_max]
movu m4, [src-4+mmsize]
movu m5, [src-2+mmsize]
movu m7, [src+4+mmsize]
movu m6, [src+6+mmsize]
movu m7, [src+2+mmsize]
mova m6, [src+0+mmsize]
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1, m7, w
mova [r0+r2+mmsize], m4
%endmacro ; HPEL_FILTER
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
mova m0, [filt_mul15]
SBUTTERFLY bw, 1, 4, 7
SBUTTERFLY bw, 2, 5, 7
SBUTTERFLY bw, 3, 6, 7
pmaddubsw m3, [filt_mul20]
pmaddubsw m6, [filt_mul20]
LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2 m1, m2, m3, m4, m5, m6
mova [r2+r4*2+mmsize], m4
FILT_PACK m1, m4, 5, m7

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_mmx2, 3,3
paddw m3, [src+2] ; c0
paddw m4, [src+14] ; a1
paddw m5, [src+12] ; b1
paddw m6, [src+10] ; c1
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 6, m7

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmx2, 3,3
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1, m7

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
%ifnidn cpuname, sse2
%define tpw_32 [pw_32]
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
%if cpuflag(misalign)
FILT_H2 m4, m5, m6, m3, m2, m1
PALIGNR m4, m1, m0, 12, m7
PALIGNR m5, m1, m0, 14, m0
PALIGNR m0, m2, m1, 6, m7
PALIGNR m0, m2, m1, 4, m7
PALIGNR m6, m2, m1, 2, m7
PALIGNR m2, m1, 12, m7
PALIGNR m5, m1, 14, m1
PALIGNR m3, m1, m0, 6, m7
PALIGNR m6, m1, m0, 4, m7
PALIGNR m6, m1, m0, 2, m7
FILT_PACK m4, m3, 6, tpw_32

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
mova m7, [pw_1] ; FIXME xmm8
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1, m7

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,3
; Using unaligned loads instead of palignr is marginally slower on Sandy Bridge and
; significantly slower on Bulldozer, despite their fast load units -- even though it
; would let us avoid the repeated loads of constants for pmaddubsw.
palignr m3, m1, m0, 14
palignr m4, m1, m0, 15
palignr m0, m2, m1, 2
pmaddubsw m3, [filt_mul15]
pmaddubsw m4, [filt_mul15]
pmaddubsw m0, [filt_mul51]
palignr m5, m2, m1, 1
palignr m6, m2, m1, 3
pmaddubsw m1, [filt_mul20]
pmaddubsw m5, [filt_mul20]
pmaddubsw m6, [filt_mul51]
FILT_PACK m3, m4, 5, m7
pshufb m3, [hpel_shuf]

INIT_XMM sse2, misalign

;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+16 is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
prefetcht0 [r1+r2*2+16]
LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
FILT_V2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 5, m15
movntps [r8+r4+%5], m1
PALIGNR m1, %2, %1, 12, m2
PALIGNR m2, %2, %1, 14, %1
PALIGNR m3, %3, %2, 4, %1
PALIGNR m4, %3, %2, 2, %1
PALIGNR %3, %2, 6, m2
FILT_PACK %3, %4, 6, m15
PALIGNR m1, %2, %1, 14, m3
PALIGNR m2, %2, %1, 15, m3
PALIGNR m4, %3, %2, 1 , m3
PALIGNR m5, %3, %2, 2 , m3
PALIGNR m6, %3, %2, 3 , m3
FILT_PACK m1, m2, 5, m15
pshufb m1, [hpel_shuf]
ADD8TO16 m1, m6, m12, m3, m0 ; a
ADD8TO16 m2, m5, m12, m3, m0 ; b
ADD8TO16 %2, m4, m12, m3, m0 ; c
FILT_V2 m1, m2, %2, m6, m5, m4
FILT_PACK m1, m6, 5, m15

;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
mova m0, [filt_mul51]
mova m12, [filt_mul15]
mova m14, [filt_mul20]
DO_FILT_V m8, m7, m13, m12, 0
DO_FILT_V m6, m5, m11, m12, 16
paddw m15, m15 ; pw_32
DO_FILT_C m9, m8, m7, m6
DO_FILT_H m10, m13, m11
; setup regs for next y
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, intptr_t i_dst,
;                       pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
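;
; Scalar semantics for reference (a hedged sketch; `pixel` as elsewhere in x264,
; function name illustrative): the mod-16 guarantees mean whole aligned vectors
; can be stored per row, with any overshoot landing in the stride padding since
; i_dst > w.
;
;   static void plane_copy_ref( pixel *dst, intptr_t i_dst,
;                               pixel *src, intptr_t i_src, int w, int h )
;   {
;       for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
;           memcpy( dst, src, w * sizeof(pixel) );   /* needs <string.h> */
;   }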
cglobal plane_copy_core_mmx2, 6,7
FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0

%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
mov%4 m0, [%2+(x/2)*mmsize]
mov%4 m1, [%3+(x/2)*mmsize]
mov%5a [%1+(x+0)*mmsize], m0
mov%5a [%1+(x+1)*mmsize], m2
%endif ; HIGH_BIT_DEPTH

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
mova m0, [%3+(n+0)*mmsize]
mova m1, [%3+(n+1)*mmsize]
mov%6 [%1+(n/2)*mmsize], m0
mov%6 [%2+(n/2)*mmsize], m2
%else ; !HIGH_BIT_DEPTH
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH

%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
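;
; What one row computes, in illustrative C (names hypothetical):
;
;   static void interleave_row_ref( pixel *dst, pixel *srcu, pixel *srcv, int w )
;   {
;       for( int x = 0; x < w; x++ )
;       {
;           dst[2*x  ] = srcu[x];   /* NV12-style UVUV... packing */
;           dst[2*x+1] = srcv[x];
;       }
;   }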
cglobal plane_copy_interleave_core, 6,9
FIX_STRIDES r1, r3, r5, r6d
INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
add r6, 16*SIZEOF_PIXEL
movntq [r0+r6*2+(n+ 0)], m0
movntq [r0+r6*2+(n+ 8)], m0
movntq [r0+r6*2+(n+16)], m0
movntq [r0+r6*2+(n+24)], m0
movntdq [r0+r6*2+(n+ 0)], m0
movntdq [r0+r6*2+(n+16)], m0
add r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
add r2, FDEC_STRIDEB*2
add r3, FDEC_STRIDEB*2
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 0
%elif cpuflag(ssse3)
mova m4, [deinterleave_shuf]
%endif ; HIGH_BIT_DEPTH

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
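;
; The inverse of the interleave above; illustrative C for one row (names
; hypothetical):
;
;   static void deinterleave_row_ref( pixel *dstu, pixel *dstv, pixel *src, int w )
;   {
;       for( int x = 0; x < w; x++ )
;       {
;           dstu[x] = src[2*x  ];   /* even samples -> U plane */
;           dstv[x] = src[2*x+1];   /* odd samples  -> V plane */
;       }
;   }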
cglobal plane_copy_deinterleave, 6,7
FIX_STRIDES r1, r3, r5, r6d
DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
add r6, 16*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
add r0, FENC_STRIDEB*2

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
add r0, FDEC_STRIDEB*2
%endmacro ; PLANE_DEINTERLEAVE

; These functions are not general-use; not only do the SSE ones require aligned input,
; but they will also fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
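;
; In other words, a caller is expected to uphold something like the following
; (a hedged C-style sketch; requires <assert.h> and <stdint.h>):
;
;   assert( ((uintptr_t)dst % 16) == 0 && ((uintptr_t)src % 16) == 0 );
;   assert( n % 16 == 0 );    /* memcpy_aligned              */
;   assert( n % 128 == 0 );   /* memzero_aligned (SSE paths) */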
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
movq mm0, [r1 + r2 - 16]
movq mm1, [r1 + r2 - 8]
movq [r0 + r2 - 16], mm0
movq [r0 + r2 - 8], mm1
movq mm0, [r1 + r2 - 32]
movq mm1, [r1 + r2 - 24]
movq mm2, [r1 + r2 - 16]
movq mm3, [r1 + r2 - 8]
movq [r0 + r2 - 32], mm0
movq [r0 + r2 - 24], mm1
movq [r0 + r2 - 16], mm2
movq [r0 + r2 - 8], mm3

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
movdqa xmm0, [r1 + r2 - 16]
movdqa [r0 + r2 - 16], xmm0
movdqa xmm0, [r1 + r2 - 32]
movdqa [r0 + r2 - 32], xmm0
movdqa xmm1, [r1 + r2 - 16]
movdqa [r0 + r2 - 16], xmm1
movdqa xmm0, [r1 + r2 - 64]
movdqa [r0 + r2 - 64], xmm0
movdqa xmm1, [r1 + r2 - 48]
movdqa [r0 + r2 - 48], xmm1
movdqa xmm2, [r1 + r2 - 32]
movdqa [r0 + r2 - 32], xmm2
movdqa xmm3, [r1 + r2 - 16]
movdqa [r0 + r2 - 16], xmm3

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned, 2,2
mova [r0 + r1 + i], m0

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
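;
; A sliding-window C model of what this computes (hedged; patterned after
; x264's C fallback, border handling elided): each output is a 4-wide
; horizontal pixel sum plus the running integral from the row above.
;
;   static void integral_init4h_ref( uint16_t *sum, uint8_t *pix, intptr_t stride )
;   {
;       int v = pix[0] + pix[1] + pix[2] + pix[3];
;       for( int x = 0; x < stride-4; x++ )
;       {
;           sum[x] = v + sum[x-stride];   /* accumulate with previous row */
;           v += pix[x+4] - pix[x];       /* slide the 4-tap window       */
;       }
;   }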
cglobal integral_init4h_sse4, 3,4
movdqa m1, [r1+r2+16]
paddw m1, [r0+r2*2+16]
movdqa [r3+r2*2 ], m0
movdqa [r3+r2*2+16], m1

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
movdqa m1, [r1+r2+16]
mpsadbw m2, m0, m4, 4
mpsadbw m3, m1, m4, 4
paddw m1, [r0+r2*2+16]
movdqa [r3+r2*2 ], m0
movdqa [r3+r2*2+16], m1
%endif ; !HIGH_BIT_DEPTH

%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
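;
; Reference semantics (a hedged sketch, name illustrative): turn running column
; sums into 8-row sums by differencing against the row 8 strides below, in place.
;
;   static void integral_init8v_ref( uint16_t *sum8, intptr_t stride )
;   {
;       for( int x = 0; x < stride; x++ )
;           sum8[x] = sum8[x+8*stride] - sum8[x];
;   }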
cglobal integral_init8v, 3,3
mova m1, [r2+r1+mmsize]
psubw m1, [r0+r1+mmsize]
mova [r0+r1+mmsize], m1

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
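;
; Illustrative C (hedged sketch): derive 4x4 sums by 4-row differencing, then
; combine two adjacent 4-wide columns 8 rows apart into 8x8 sums.
;
;   static void integral_init4v_ref( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;   {
;       for( int x = 0; x < stride-8; x++ )
;           sum4[x] = sum8[x+4*stride] - sum8[x];
;       for( int x = 0; x < stride-8; x++ )
;           sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
;   }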
cglobal integral_init4v_mmx, 3,5
cglobal integral_init4v_sse2, 3,5
shufpd m0, [r0+r2+16], 1
shufpd m1, [r4+r2+16], 1
cglobal integral_init4v_ssse3, 3,5
pavgb %4, [r0+r5*2+%7]
PALIGNR %1, %3, 1, m6
PALIGNR %2, %4, 1, m6
mova m3, [r0+%4+mmsize]
pavgb m3, [r0+%4+r5+mmsize]
pavgb m2, [r0+%4+r5]
PALIGNR %1, m3, 1, m6
PALIGNR m3, m2, 1, m6
vpperm m5, m3, %1, m7
vpperm m3, m3, %1, m6
pavgb m3, [r0+%3+r5+8]
pavgb m2, [r0+%3+r5]
pavgb m1, [r0+%3+r5+9]
pavgb m0, [r0+%3+r5+1]
pavgw m3, [r0+%3+r5+8]
pavgw m2, [r0+%3+r5]
pavgw m1, [r0+%3+r5+10]
pavgw m0, [r0+%3+r5+2]
mova m3, [r0+%4+mmsize]
pavgw m3, [r0+%4+r5+mmsize]
pavgw m2, [r0+%4+r5]
PALIGNR %1, m3, 2, m6
PALIGNR m3, m2, 2, m6
vpperm m5, m3, %1, m7
vpperm m3, m3, %1, m6

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
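;
; The lowres planes are half-resolution bilinear downsamples at four phase
; offsets (whole-, h-, v-, and hv-pel). A hedged C model of one output row,
; matching the pavgb/pavgw structure below (FILTER rounds each pairwise average
; the way pavg does; src0/src1/src2 are three consecutive source rows, names
; illustrative):
;
;   #define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1)
;   static void lowres_row_ref( pixel *src0, pixel *src1, pixel *src2,
;                               pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
;                               int width )
;   {
;       for( int x = 0; x < width; x++ )
;       {
;           dst0[x] = FILTER( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
;           dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;           dstv[x] = FILTER( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
;           dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
;       }
;   }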
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
; src += 2*(height-1)*stride + 2*width
; dst += (height-1)*stride + width
; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
mova m6, [deinterleave_shuf32a]
mova m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
%ifidn cpuname, mmx2
FILT8xA m0, r1, r2, 0
FILT8xA m1, r3, r4, r5
%else ; !HIGH_BIT_DEPTH
; adjust for the odd end case
mova m6, [deinterleave_shuf32a]
mova m7, [deinterleave_shuf32b]
%ifnidn cpuname, mmx2
FILT8x4 m0, m1, m2, m3, m4, m5, 0
vpperm m0, m4, m1, m6
vpperm m1, m4, m1, m7
FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
FILT8x4 m2, m3, m0, m1, m4, m5, 0
vpperm m4, m2, m8, m7
vpperm m2, m2, m8, m6
vpperm m5, m3, m9, m7
vpperm m3, m3, m9, m6
%elifidn cpuname, mmx2
FILT16x2 m0, r1, r2, 0
FILT16x2 m1, r3, r4, r5
%endif ; HIGH_BIT_DEPTH
%endmacro ; FRAME_INIT_LOWRES

%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
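;
; The computation in hedged scalar C (a sketch, name illustrative; the SIMD
; path below replaces the divide with rcpps plus one Newton-Raphson step, so
; rounding can differ slightly; 0x3fff is the pw_3fff mask applied to inter
; costs, and inter is assumed not to exceed intra here):
;
;   static void mbtree_propagate_cost_ref( int *dst, uint16_t *propagate_in,
;                                          uint16_t *intra_costs, uint16_t *inter_costs,
;                                          uint16_t *inv_qscales, float *fps_factor, int len )
;   {
;       float fps = *fps_factor / 256.f;                /* pf_inv256 scaling */
;       for( int i = 0; i < len; i++ )
;       {
;           int intra = intra_costs[i];
;           int inter = inter_costs[i] & 0x3fff;
;           float amount = propagate_in[i] + intra * inv_qscales[i] * fps;
;           dst[i] = (int)( amount * (intra - inter) / intra );
;       }
;   }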
cglobal mbtree_propagate_cost, 7,7,7
shufps xmm6, xmm6, 0
mulps xmm6, [pf_inv256]
movdqa xmm5, [pw_3fff]
movq xmm2, [r2+r6] ; intra
movq xmm0, [r4+r6] ; invq
movq xmm3, [r3+r6] ; inter
movq xmm1, [r1+r6] ; prop
punpcklwd xmm2, xmm4
punpcklwd xmm0, xmm4
punpcklwd xmm1, xmm4
punpcklwd xmm3, xmm4
vfmaddps xmm0, xmm0, xmm6, xmm1
addps xmm2, xmm3, xmm3
vfnmaddps xmm3, xmm1, xmm3, xmm2
mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
cvtdq2ps xmm1, xmm1 ; prop
addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
cvtdq2ps xmm1, xmm2 ; intra
psubd xmm2, xmm3 ; intra - inter
cvtdq2ps xmm2, xmm2 ; intra - inter
rcpps xmm3, xmm1 ; 1 / intra 1st approximation
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
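; (the three steps above are one Newton-Raphson refinement, x1 = x0*(2 - intra*x0),
;  roughly doubling the ~12-bit precision of rcpps)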
mulps xmm0, xmm3 ; / intra
movdqa [r0+r6*2], xmm0

; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.

%macro INT16_TO_FLOAT 1
vpunpckhwd xmm4, xmm%1, xmm7
vpunpcklwd xmm%1, xmm7
vinsertf128 ymm%1, ymm%1, xmm4, 1
vcvtdq2ps ymm%1, ymm%1

; FIXME: align loads/stores to 16 bytes
cglobal mbtree_propagate_cost, 7,7,8
vmovdqa xmm5, [pw_3fff]
vbroadcastss ymm6, [r5]
vmulps ymm6, ymm6, [pf_inv256]
vmovdqu xmm0, [r2+r6] ; intra
vmovdqu xmm1, [r4+r6] ; invq
vmovdqu xmm2, [r1+r6] ; prop
vpand xmm3, xmm5, [r3+r6] ; inter
vmulps ymm1, ymm1, ymm0
vsubps ymm4, ymm0, ymm3
vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
vmulps ymm1, ymm1, ymm3 ; / intra
vcvtps2dq ymm1, ymm1
vmovdqu [r0+r6*2], ymm1