1 ;*****************************************************************************
2 ;* mc-a2.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
9 ;* Mathieu Monnier <manao@melix.net>
10 ;* Oskar Arvidsson <oskar@irock.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; Constant tables for the H.264 6-tap half-pel filter (1,-5,20,20,-5,1)
; and the plane (de)interleave helpers below.
; filt_mul*: byte coefficients paired for pmaddubsw (center, leading, trailing).
35 filt_mul20: times 16 db 20
36 filt_mul15: times 8 db 1, -5
37 filt_mul51: times 8 db -5, 1
; hpel_shuf: interleaves bytes 0..7 with bytes 8..15 (0,8,1,9,...).
38 hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
; deinterleave_shuf: gathers even-indexed bytes first, then odd-indexed bytes.
39 deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
; pd_0f: per-dword mask of the low 16 bits (0x0000ffff in each dword).
42 pd_0f: times 4 dd 0xffff
; pf_inv256: 1.0/256.0 in each float lane.
43 pf_inv256: times 8 dd 0.00390625
; pad10/20/30 + depad: presumably bias values that keep HIGH_BIT_DEPTH
; intermediate filter sums representable before the final unbias/round
; (depad includes a +512 rounding term) — TODO confirm against FILT_* users.
45 pad10: times 8 dw 10*PIXEL_MAX
46 pad20: times 8 dw 20*PIXEL_MAX
47 pad30: times 8 dw 30*PIXEL_MAX
48 depad: times 4 dd 32*20*PIXEL_MAX + 512
; tap1..tap3: the word-sized 6-tap coefficients as pairs (1,-5),(20,20),(-5,1).
50 tap1: times 4 dw 1, -5
51 tap2: times 4 dw 20, 20
52 tap3: times 4 dw -5, 1
; Fragments of the FILT_* arithmetic macros (%1/%2/%3 hold the a/b/c tap sums).
; NOTE(review): the %macro headers fall outside this excerpt; the inline
; comments show the multiply-free factorization of a-5*b+20*c.
91 psubw %1, %2 ; a-5*b+4*c
95 paddw %1, %3 ; a-5*b+20*c
; Alternative factorization: ((a-b)/4 - b + c)/4 + c == (a-5*b+20*c)/16.
101 psraw %1, 2 ; (a-b)/4
102 psubw %1, %2 ; (a-b)/4-b
103 paddw %1, %3 ; (a-b)/4-b+c
104 psraw %1, 2 ; ((a-b)/4-b+c)/4
105 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
; FILT_PACK: rounds, shifts and re-packs two word registers; takes 4 required
; args plus up to 2 optional (6th defaults to 'b'); body not visible here.
123 %macro FILT_PACK 4-6 b
140 ;The hpel_filter routines use non-temporal writes for output.
141 ;The following defines may be uncommented for testing.
142 ;Using temporal (cached) stores in hpel_filter may be a win if the last-level cache
143 ;is big enough (preliminary benching suggests on the order of 4* framesize).
146 ;%define movntps movaps
149 %ifdef HIGH_BIT_DEPTH
150 ;-----------------------------------------------------------------------------
151 ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
152 ;-----------------------------------------------------------------------------
; Vertical 6-tap half-pel filter, 16-bit pixel variant.
; NOTE(review): this excerpt is non-contiguous; register setup and loop
; control between the visible lines are elided.
154 cglobal hpel_filter_v, 5,6,11
177 mova m7, [pw_pixel_max]
184 mova m5, [r1+r3+mmsize]
185 mova m6, [r1+r3*2+mmsize]
; Rows with equal tap weights are summed first (mirrored offsets on r1/r5 —
; presumably the two halves of the 6-tap window; confirm against full source).
189 paddw m4, [r5+r3*2+mmsize]
190 paddw m5, [r5+r3+mmsize]
191 paddw m6, [r5+mmsize]
194 FILT_V2 m1, m2, m3, m4, m5, m6
; Unclipped intermediate goes to buf (r2); packed/clipped result to dst (r0).
199 mova [r2+r4+mmsize], m4
202 FILT_PACK m1, m4, 5, m6, w, s10
206 mova [r0+r4+mmsize], m4
211 ;-----------------------------------------------------------------------------
212 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
213 ;-----------------------------------------------------------------------------
214 cglobal hpel_filter_c, 3,3,10
; Clamp results into the valid pixel range [0, PIXEL_MAX].
254 CLIPW m1, [pb_0], [pw_pixel_max]
260 ;-----------------------------------------------------------------------------
261 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
262 ;-----------------------------------------------------------------------------
263 cglobal hpel_filter_h, 3,4,8
269 mova m0, [pw_pixel_max]
; Unaligned loads of the horizontal taps around each pixel (src-4 .. src+6).
281 movu m4, [src-4+mmsize]
282 movu m5, [src-2+mmsize]
284 movu m7, [src+4+mmsize]
285 movu m6, [src+6+mmsize]
288 movu m7, [src+2+mmsize]
289 mova m6, [src+0+mmsize]
291 FILT_H2 m1, m2, m3, m4, m5, m6
294 FILT_PACK m1, m4, 1, m7, w
298 mova [r0+r2+mmsize], m4
302 %endmacro ; HPEL_FILTER
308 %endif ; HIGH_BIT_DEPTH
310 %ifndef HIGH_BIT_DEPTH
312 ;-----------------------------------------------------------------------------
313 ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
314 ;-----------------------------------------------------------------------------
; 8-bit vertical half-pel filter. NOTE(review): excerpt is non-contiguous;
; setup and loop control between the visible lines are elided.
315 cglobal hpel_filter_v, 5,6,%1
326 mova m0, [filt_mul15]
; SBUTTERFLY bw: widen bytes to words by interleaving low/high halves.
338 SBUTTERFLY bw, 1, 4, 7
339 SBUTTERFLY bw, 2, 5, 7
340 SBUTTERFLY bw, 3, 6, 7
; pmaddubsw applies the paired (20,20) center taps in one multiply-add.
345 pmaddubsw m3, [filt_mul20]
346 pmaddubsw m6, [filt_mul20]
352 LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
353 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
354 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
355 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
356 FILT_V2 m1, m2, m3, m4, m5, m6
360 mova [r2+r4*2+mmsize], m4
361 FILT_PACK m1, m4, 5, m7
370 ;-----------------------------------------------------------------------------
371 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
372 ;-----------------------------------------------------------------------------
374 cglobal hpel_filter_c_mmx2, 3,3
; Horizontal tap sums gathered from buf at word offsets (two 4-pixel groups).
386 paddw m3, [src+2] ; c0
390 paddw m4, [src+14] ; a1
391 paddw m5, [src+12] ; b1
392 paddw m6, [src+10] ; c1
393 FILT_H2 m1, m2, m3, m4, m5, m6
394 FILT_PACK m1, m4, 6, m7
400 ;-----------------------------------------------------------------------------
401 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
402 ;-----------------------------------------------------------------------------
403 cglobal hpel_filter_h_mmx2, 3,3
437 FILT_H2 m1, m2, m3, m4, m5, m6
438 FILT_PACK m1, m4, 1, m7
447 ;-----------------------------------------------------------------------------
448 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
449 ;-----------------------------------------------------------------------------
450 cglobal hpel_filter_c, 3,3,9
455 %ifnidn cpuname, sse2
462 %define tpw_32 [pw_32]
464 %if cpuflag(misalign)
478 FILT_H2 m4, m5, m6, m3, m2, m1
; PALIGNR synthesizes the shifted tap vectors from two adjacent registers
; instead of re-reading overlapping memory.
484 PALIGNR m4, m1, m0, 12, m7
485 PALIGNR m5, m1, m0, 14, m0
486 PALIGNR m0, m2, m1, 6, m7
488 PALIGNR m0, m2, m1, 4, m7
490 PALIGNR m6, m2, m1, 2, m7
496 PALIGNR m2, m1, 12, m7
497 PALIGNR m5, m1, 14, m1
499 PALIGNR m3, m1, m0, 6, m7
501 PALIGNR m6, m1, m0, 4, m7
503 PALIGNR m6, m1, m0, 2, m7
507 FILT_PACK m4, m3, 6, tpw_32
514 ;-----------------------------------------------------------------------------
515 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
516 ;-----------------------------------------------------------------------------
517 cglobal hpel_filter_h_sse2, 3,3,8
554 mova m7, [pw_1] ; FIXME xmm8
555 FILT_H2 m1, m2, m3, m4, m5, m6
556 FILT_PACK m1, m4, 1, m7
563 ;-----------------------------------------------------------------------------
564 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
565 ;-----------------------------------------------------------------------------
; SSSE3 variant: palignr builds the shifted source vectors, pmaddubsw applies
; paired taps, and pshufb re-orders the packed result back to pixel order.
566 cglobal hpel_filter_h_ssse3, 3,3
576 palignr m3, m1, m0, 14
577 palignr m4, m1, m0, 15
578 palignr m0, m2, m1, 2
579 pmaddubsw m3, [filt_mul15]
580 pmaddubsw m4, [filt_mul15]
581 pmaddubsw m0, [filt_mul51]
582 palignr m5, m2, m1, 1
583 palignr m6, m2, m1, 3
586 pmaddubsw m1, [filt_mul20]
587 pmaddubsw m5, [filt_mul20]
588 pmaddubsw m6, [filt_mul51]
592 FILT_PACK m3, m4, 5, m7
593 pshufb m3, [hpel_shuf]
599 %endif ; !ARCH_X86_64
605 INIT_XMM sse2, misalign
620 ;The optimum prefetch distance is difficult to determine in checkasm:
621 ;any prefetch seems slower than not prefetching.
622 ;In real use, the prefetch seems to be a slight win.
623 ;+16 is picked somewhat arbitrarily here based on the fact that even one
624 ;loop iteration is going to take longer than the prefetch.
625 prefetcht0 [r1+r2*2+16]
; NOTE(review): the three fragments below appear to belong to the DO_FILT_V /
; DO_FILT_C / DO_FILT_H helper macros used by hpel_filter further down —
; the %macro headers are not visible in this excerpt; confirm in full source.
; Fragment 1: vertical pass, result streamed with a non-temporal store.
652 LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
653 LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
654 LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
656 FILT_V2 m1, m2, m3, m4, m5, m6
662 FILT_PACK m1, m4, 5, m15
663 movntps [r11+r4+%5], m1
; Fragment 2: tap vectors built from shifted words of adjacent registers.
667 PALIGNR m1, %2, %1, 12, m2
668 PALIGNR m2, %2, %1, 14, %1
669 PALIGNR m3, %3, %2, 4, %1
670 PALIGNR m4, %3, %2, 2, %1
673 PALIGNR %3, %2, 6, m2
682 FILT_PACK %3, %4, 6, m15
; Fragment 3: byte-level shifts, then widen to words (ADD8TO16) and filter.
696 PALIGNR m1, %2, %1, 14, m3
697 PALIGNR m2, %2, %1, 15, m3
698 PALIGNR m4, %3, %2, 1 , m3
699 PALIGNR m5, %3, %2, 2 , m3
700 PALIGNR m6, %3, %2, 3 , m3
713 FILT_PACK m1, m2, 5, m15
714 pshufb m1, [hpel_shuf]
716 ADD8TO16 m1, m6, m12, m3, m0 ; a
717 ADD8TO16 m2, m5, m12, m3, m0 ; b
718 ADD8TO16 %2, m4, m12, m3, m0 ; c
719 FILT_V2 m1, m2, %2, m6, m5, m4
720 FILT_PACK m1, m6, 5, m15
727 ;-----------------------------------------------------------------------------
728 ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
729 ; uint8_t *src, int stride, int width, int height)
730 ;-----------------------------------------------------------------------------
; Combined h/v/c half-pel driver; keeps the filter coefficient vectors
; resident in m0/m12/m14 across the whole frame loop.
731 cglobal hpel_filter, 7,7,16
753 mova m0, [filt_mul51]
754 mova m12, [filt_mul15]
755 mova m14, [filt_mul20]
762 DO_FILT_V m8, m7, m13, m12, 0
765 DO_FILT_V m6, m5, m11, m12, 16
767 paddw m15, m15 ; pw_32
768 DO_FILT_C m9, m8, m7, m6
771 DO_FILT_H m10, m13, m11
776 ; setup regs for next y
802 %endif ; !HIGH_BIT_DEPTH
804 ;-----------------------------------------------------------------------------
805 ; void plane_copy_core( pixel *dst, int i_dst,
806 ; pixel *src, int i_src, int w, int h)
807 ;-----------------------------------------------------------------------------
808 ; assumes i_dst and w are multiples of 16, and i_dst>w
810 cglobal plane_copy_core_mmx2, 6,7
; FIX_STRIDES presumably scales pixel counts to byte units for
; HIGH_BIT_DEPTH builds — confirm its definition in x86util.asm.
811 FIX_STRIDES r1d, r3d, r4d
; INTERLEAVE: emit one interleaved-UV chunk from separate U (%2) / V (%3)
; sources into %1. %4 selects aligned/unaligned loads ('a'/'u'); optional %5
; requests non-temporal ('nt') stores.
864 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
865 %ifdef HIGH_BIT_DEPTH
868 mov%4 m0, [%2+(x/2)*mmsize]
869 mov%4 m1, [%3+(x/2)*mmsize]
872 mov%5a [%1+(x+0)*mmsize], m0
873 mov%5a [%1+(x+1)*mmsize], m2
893 %endif ; HIGH_BIT_DEPTH
; DEINTERLEAVE: split an interleaved-UV source (%3) into separate U (%1) and
; V (%2) destinations; %5 is the shuffle constant, %6 aligned/unaligned.
896 %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
897 %ifdef HIGH_BIT_DEPTH
900 mova m0, [%3+(n+0)*mmsize]
901 mova m1, [%3+(n+1)*mmsize]
908 mov%6 [%1+(n/2)*mmsize], m0
909 mov%6 [%2+(n/2)*mmsize], m2
912 %else ; !HIGH_BIT_DEPTH
942 %endif ; mmsize == 16
943 %endif ; HIGH_BIT_DEPTH
; Instantiates the plane-interleave entry points for the current ISA.
946 %macro PLANE_INTERLEAVE 0
947 ;-----------------------------------------------------------------------------
948 ; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
949 ; uint8_t *srcu, int i_srcu,
950 ; uint8_t *srcv, int i_srcv, int w, int h )
951 ;-----------------------------------------------------------------------------
952 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
953 cglobal plane_copy_interleave_core, 7,7
954 FIX_STRIDES r1d, r3d, r5d, r6d
955 %ifdef HIGH_BIT_DEPTH
968 DECLARE_REG_TMP 10,11
; Row loop body: 16 pixels of U and V interleaved per INTERLEAVE invocation,
; unaligned loads ('u') with non-temporal ('nt') stores.
987 INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
988 INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
989 add r6, 16*SIZEOF_PIXEL
; Non-temporal store variants: 8-byte movntq (MMX regs) vs. 16-byte movntdq
; (SSE2 regs); presumably a fill/tail path since all four stores reuse m0 —
; confirm against full source.
995 movntq [r0+r6*2+(n+ 0)], m0
996 movntq [r0+r6*2+(n+ 8)], m0
997 movntq [r0+r6*2+(n+16)], m0
998 movntq [r0+r6*2+(n+24)], m0
1000 movntdq [r0+r6*2+(n+ 0)], m0
1001 movntdq [r0+r6*2+(n+16)], m0
1005 add r6, 16*SIZEOF_PIXEL
1017 ;-----------------------------------------------------------------------------
1018 ; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
1019 ;-----------------------------------------------------------------------------
; Two rows per iteration; aligned ('a') INTERLEAVE variant.
1020 cglobal store_interleave_chroma, 5,5
1023 INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
1024 INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
1025 add r2, FDEC_STRIDEB*2
1026 add r3, FDEC_STRIDEB*2
1031 %endmacro ; PLANE_INTERLEAVE
; Per-ISA setup for the deinterleave loops.
1033 %macro DEINTERLEAVE_START 0
1034 %ifdef HIGH_BIT_DEPTH
1036 %elif cpuflag(ssse3)
; SSSE3 path keeps the byte-deinterleave shuffle mask resident in m4.
1037 mova m4, [deinterleave_shuf]
1040 %endif ; HIGH_BIT_DEPTH
1043 %macro PLANE_DEINTERLEAVE 0
1044 ;-----------------------------------------------------------------------------
1045 ; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
1046 ; pixel *dstv, int i_dstv,
1047 ; pixel *src, int i_src, int w, int h )
1048 ;-----------------------------------------------------------------------------
1049 cglobal plane_copy_deinterleave, 6,7
1052 FIX_STRIDES r1d, r3d, r5d, r6d
1053 %ifdef HIGH_BIT_DEPTH
; Sign-extend 32-bit stride arguments where the ABI leaves them 32-bit.
1056 movsxdifnidn r1, r1d
1057 movsxdifnidn r3, r3d
1058 movsxdifnidn r5, r5d
; 16 pixels per iteration, unaligned ('u') loads from the interleaved source.
1066 DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
1067 DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
1068 add r6, 16*SIZEOF_PIXEL
1077 ;-----------------------------------------------------------------------------
1078 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
1079 ;-----------------------------------------------------------------------------
; Two rows per iteration into the fixed-stride fenc buffer (aligned stores).
1080 cglobal load_deinterleave_chroma_fenc, 4,4
1084 DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
1085 DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
1086 add r0, FENC_STRIDEB*2
1092 ;-----------------------------------------------------------------------------
1093 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
1094 ;-----------------------------------------------------------------------------
; Same as the fenc variant but targets the fdec buffer layout.
1095 cglobal load_deinterleave_chroma_fdec, 4,4
1099 DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
1100 DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
1101 add r0, FDEC_STRIDEB*2
1106 %endmacro ; PLANE_DEINTERLEAVE
1108 %ifdef HIGH_BIT_DEPTH
1131 ; These functions are not general-use; not only do the SSE ones require aligned input,
1132 ; but they also will fail if given a non-mod16 size.
1133 ; memzero SSE will fail for non-mod128.
1135 ;-----------------------------------------------------------------------------
1136 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1137 ;-----------------------------------------------------------------------------
; NOTE(review): r2 appears to count remaining bytes and the copy runs
; back-to-front (negative offsets from r1+r2/r0+r2); which fragment is the
; tail step vs. the main loop is inferred — confirm against full source.
1139 cglobal memcpy_aligned_mmx, 3,3
1142 movq mm0, [r1 + r2 - 16]
1143 movq mm1, [r1 + r2 - 8]
1144 movq [r0 + r2 - 16], mm0
1145 movq [r0 + r2 - 8], mm1
; 32 bytes per iteration via four 8-byte MMX moves.
1151 movq mm0, [r1 + r2 - 32]
1152 movq mm1, [r1 + r2 - 24]
1153 movq mm2, [r1 + r2 - 16]
1154 movq mm3, [r1 + r2 - 8]
1155 movq [r0 + r2 - 32], mm0
1156 movq [r0 + r2 - 24], mm1
1157 movq [r0 + r2 - 16], mm2
1158 movq [r0 + r2 - 8], mm3
1164 ;-----------------------------------------------------------------------------
1165 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1166 ;-----------------------------------------------------------------------------
; SSE2 variant: aligned 16-byte moves; 16- and 32-byte steps plus a
; 64-byte-per-iteration bulk section.
1167 cglobal memcpy_aligned_sse2, 3,3
1170 movdqa xmm0, [r1 + r2 - 16]
1171 movdqa [r0 + r2 - 16], xmm0
1176 movdqa xmm0, [r1 + r2 - 32]
1177 movdqa [r0 + r2 - 32], xmm0
1178 movdqa xmm1, [r1 + r2 - 16]
1179 movdqa [r0 + r2 - 16], xmm1
1185 movdqa xmm0, [r1 + r2 - 64]
1186 movdqa [r0 + r2 - 64], xmm0
1187 movdqa xmm1, [r1 + r2 - 48]
1188 movdqa [r0 + r2 - 48], xmm1
1189 movdqa xmm2, [r1 + r2 - 32]
1190 movdqa [r0 + r2 - 32], xmm2
1191 movdqa xmm3, [r1 + r2 - 16]
1192 movdqa [r0 + r2 - 16], xmm3
1198 ;-----------------------------------------------------------------------------
1199 ; void *memzero_aligned( void *dst, size_t n );
1200 ;-----------------------------------------------------------------------------
1202 cglobal memzero_aligned, 2,2
; m0 presumably zeroed before this store loop (not visible here) — confirm.
1209 mova [r0 + r1 + i], m0
1224 %ifndef HIGH_BIT_DEPTH
1225 ;-----------------------------------------------------------------------------
1226 ; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
1227 ;-----------------------------------------------------------------------------
; Horizontal integral-image step: the current row's 4-pixel sums are added to
; the previous row of the running sum (paddw against [r0+...]).
1229 cglobal integral_init4h_sse4, 3,4
1236 movdqa m1, [r1+r2+16]
1241 paddw m1, [r0+r2*2+16]
1242 movdqa [r3+r2*2 ], m0
1243 movdqa [r3+r2*2+16], m1
1248 %macro INTEGRAL_INIT8H 0
1249 cglobal integral_init8h, 3,4
1256 movdqa m1, [r1+r2+16]
; mpsadbw against m4 (presumably zero — confirm) yields sliding 4-pixel sums
; of absolute values, i.e. plain 4-pixel sums for unsigned input.
1258 mpsadbw m2, m0, m4, 4
1259 mpsadbw m3, m1, m4, 4
1263 paddw m1, [r0+r2*2+16]
1266 movdqa [r3+r2*2 ], m0
1267 movdqa [r3+r2*2+16], m1
1277 %endif ; !HIGH_BIT_DEPTH
1279 %macro INTEGRAL_INIT_8V 0
1280 ;-----------------------------------------------------------------------------
1281 ; void integral_init8v( uint16_t *sum8, int stride )
1282 ;-----------------------------------------------------------------------------
; Vertical step: 8-row sums formed by subtracting rows 8 apart.
1283 cglobal integral_init8v, 3,3
1290 mova m1, [r2+r1+mmsize]
1292 psubw m1, [r0+r1+mmsize]
1294 mova [r0+r1+mmsize], m1
1305 ;-----------------------------------------------------------------------------
1306 ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
1307 ;-----------------------------------------------------------------------------
1309 cglobal integral_init4v_mmx, 3,5
1331 cglobal integral_init4v_sse2, 3,5
1343 shufpd m0, [r0+r2+16], 1
1344 shufpd m1, [r4+r2+16], 1
1356 cglobal integral_init4v_ssse3, 3,5
; FILT8x/FILT16x fragments for frame_init_lowres_core: pavgb/pavgw averages
; neighbouring pixels for the half-resolution planes; PALIGNR by 1 (bytes) or
; an offset of +1 pixel provides the horizontally shifted plane.
1385 pavgb %4, [r0+r5*2+%7]
1386 PALIGNR %1, %3, 1, m6
1387 PALIGNR %2, %4, 1, m6
1397 mova m3, [r0+%4+mmsize]
1399 pavgb m3, [r0+%4+r5+mmsize]
1400 pavgb m2, [r0+%4+r5]
1401 PALIGNR %1, m3, 1, m6
1403 PALIGNR m3, m2, 1, m6
1419 pavgb m3, [r0+%3+r5+8]
1420 pavgb m2, [r0+%3+r5]
1423 pavgb m1, [r0+%3+r5+9]
1424 pavgb m0, [r0+%3+r5+1]
; 16-bit pixel variants: pavgw with offsets doubled to word units.
1440 pavgw m3, [r0+%3+r5+8]
1441 pavgw m2, [r0+%3+r5]
1444 pavgw m1, [r0+%3+r5+10]
1445 pavgw m0, [r0+%3+r5+2]
1459 mova m3, [r0+%4+mmsize]
1461 pavgw m3, [r0+%4+r5+mmsize]
1462 pavgw m2, [r0+%4+r5]
1463 PALIGNR %1, m3, 2, m6
1465 PALIGNR m3, m2, 2, m6
1478 ;-----------------------------------------------------------------------------
1479 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
1480 ; int src_stride, int dst_stride, int width, int height )
1481 ;-----------------------------------------------------------------------------
1482 %macro FRAME_INIT_LOWRES 0
1483 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
1484 %ifdef HIGH_BIT_DEPTH
; Pointer setup comments below suggest the image is walked from the last row
; (src/dst advanced to the end first) — confirm against full source.
1492 ; src += 2*(height-1)*stride + 2*width
1498 ; dst += (height-1)*stride + width
1507 ; gap = stride - width
; Row-gap values kept on the stack; rsp-relative %defines give them names.
1511 %define dst_gap [rsp+gprsize]
1516 %define src_gap [rsp]
1517 %ifdef HIGH_BIT_DEPTH
1522 %ifnidn cpuname, mmx2
1534 %ifidn cpuname, mmx2
1538 FILT8xA m0, r1, r2, 0
1539 FILT8xA m1, r3, r4, r5
1543 %else ; !HIGH_BIT_DEPTH
1545 ; adjust for the odd end case
1558 %ifnidn cpuname, mmx2
1568 FILT8x4 m0, m1, m2, m3, m4, m5, 0
1587 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
1590 FILT8x4 m2, m3, m0, m1, m4, m5, 0
1599 %elifidn cpuname, mmx2
1603 FILT16x2 m0, r1, r2, 0
1604 FILT16x2 m1, r3, r4, r5
1608 %endif ; HIGH_BIT_DEPTH
1621 %endmacro ; FRAME_INIT_LOWRES
1626 INIT_MMX cache32, mmx2
1634 ;-----------------------------------------------------------------------------
1635 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
1636 ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
1637 ;-----------------------------------------------------------------------------
; Per macroblock (4 at a time in SSE2):
;   dst = (prop + intra*invq*fps_factor/256) * (intra - inter) / intra
; The division uses rcpps plus one refinement step (see below).
1639 cglobal mbtree_propagate_cost_sse2, 7,7,7
; Broadcast fps_factor and pre-scale by 1/256.
1649 shufps xmm6, xmm6, 0
1650 mulps xmm6, [pf_inv256]
1651 movdqa xmm5, [pw_3fff]
1653 movq xmm2, [r2+r6] ; intra
1654 movq xmm0, [r4+r6] ; invq
1655 movq xmm3, [r3+r6] ; inter
1656 movq xmm1, [r1+r6] ; prop
; Widen u16 lanes to u32 by interleaving with xmm4 (presumably zeroed
; earlier, not visible here — confirm) before int->float conversion.
1657 punpcklwd xmm2, xmm4
1658 punpcklwd xmm0, xmm4
1661 punpcklwd xmm1, xmm4
1662 punpcklwd xmm3, xmm4
1664 mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
1665 cvtdq2ps xmm1, xmm1 ; prop
1666 addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
1667 cvtdq2ps xmm1, xmm2 ; intra
1668 psubd xmm2, xmm3 ; intra - inter
1669 cvtdq2ps xmm2, xmm2 ; intra - inter
1670 rcpps xmm3, xmm1 ; 1 / intra 1st approximation
; Newton-Raphson step x1 = x0*(2 - intra*x0) sharpens rcpps' ~12-bit result.
1671 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
1672 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
1673 mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
1674 addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
1675 subps xmm3, xmm1 ; 2nd approximation for 1/intra
1676 mulps xmm0, xmm3 ; / intra
1678 movdqa [r0+r6*2], xmm0
; Widen 8 u16 lanes to 8 floats: high half interleaved into xmm4, low half in
; place (xmm7 presumably zero — confirm), recombined into ymm, then converted.
1683 %macro INT16_TO_FLOAT 1
1684 vpunpckhwd xmm4, xmm%1, xmm7
1685 vpunpcklwd xmm%1, xmm7
1686 vinsertf128 ymm%1, ymm%1, xmm4, 1
1687 vcvtdq2ps ymm%1, ymm%1
1690 ; FIXME: align loads/stores to 16 bytes
; AVX variant: 8 macroblocks per iteration; same formula as the SSE2 version.
1691 cglobal mbtree_propagate_cost_avx, 7,7,8
1699 vmovdqa xmm5, [pw_3fff]
1700 vbroadcastss ymm6, [r5]
1701 vmulps ymm6, ymm6, [pf_inv256]
1704 vmovdqu xmm0, [r2+r6] ; intra
1705 vmovdqu xmm1, [r4+r6] ; invq
1706 vmovdqu xmm2, [r1+r6] ; prop
; Inter costs masked to 14 bits (pw_3fff) while loading.
1707 vpand xmm3, xmm5, [r3+r6] ; inter
1712 vmulps ymm1, ymm1, ymm0
1713 vsubps ymm4, ymm0, ymm3
1714 vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
1715 vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
1716 vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
; Newton-Raphson refinement of the vrcpps estimate, as in the SSE2 version.
1717 vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
1718 vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
1719 vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
1720 vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
1721 vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
1722 vmulps ymm1, ymm1, ymm3 ; / intra
1723 vcvtps2dq ymm1, ymm1
1724 vmovdqu [r0+r6*2], ymm1