1 ;*****************************************************************************
2 ;* mc-a2.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2010 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
9 ;* Mathieu Monnier <manao@melix.net>
10 ;* Oskar Arvidsson <oskar@irock.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
35 filt_mul20: times 16 db 20          ; pmaddubsw coefficients: center taps (+20,+20) of the 6-tap filter
36 filt_mul15: times 8 db 1, -5        ; pmaddubsw coefficient pairs (+1,-5): outer/inner taps
37 filt_mul51: times 8 db -5, 1        ; pmaddubsw coefficient pairs (-5,+1): mirrored inner/outer taps
38 hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 ; pshufb control: interleave bytes of low and high qwords
39 deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 ; pshufb control: even bytes to low half, odd bytes to high half
42 pd_0f: times 4 dd 0xffff            ; per-dword mask selecting the low 16 bits
44 pad10: times 8 dw 10*PIXEL_MAX      ; NOTE(review): presumably DC bias keeping 6-tap intermediates non-negative (high-bit-depth path) — confirm against hpel users
45 pad20: times 8 dw 20*PIXEL_MAX      ; bias variant, 20*PIXEL_MAX per word
46 pad30: times 8 dw 30*PIXEL_MAX      ; bias variant, 30*PIXEL_MAX per word
47 depad: times 4 dd 32*20*PIXEL_MAX + 512 ; removes accumulated bias, +512 for rounding before the final shift
49 tap1: times 4 dw 1, -5              ; word-pair taps of the (1,-5,20,20,-5,1) filter: (+1,-5)
50 tap2: times 4 dw 20, 20             ; center taps (+20,+20)
51 tap3: times 4 dw -5, 1              ; mirrored taps (-5,+1)
93 psubw %1, %2 ; a-5*b+4*c
97 paddw %1, %3 ; a-5*b+20*c
103 psraw %1, 2 ; (a-b)/4
104 psubw %1, %2 ; (a-b)/4-b
105 paddw %1, %3 ; (a-b)/4-b+c
106 psraw %1, 2 ; ((a-b)/4-b+c)/4
107 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
125 %macro FILT_PACK 4-6 b
142 ;The hpel_filter routines use non-temporal writes for output.
143 ;The following defines may be uncommented for testing.
144 ;Using temporal stores for the hpel_filter may be a win if the last-level cache
145 ;is big enough (preliminary benchmarking suggests on the order of 4x the frame size).
148 ;%define movntps movaps
151 %ifdef HIGH_BIT_DEPTH
152 ;-----------------------------------------------------------------------------
153 ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
154 ;-----------------------------------------------------------------------------
156 cglobal hpel_filter_v_%1, 5,6,11*(mmsize/16)
179 mova m7, [pw_pixel_max]
186 mova m5, [r1+r3+mmsize]
187 mova m6, [r1+r3*2+mmsize]
191 paddw m4, [r5+r3*2+mmsize]
192 paddw m5, [r5+r3+mmsize]
193 paddw m6, [r5+mmsize]
196 FILT_V2 m1, m2, m3, m4, m5, m6
201 mova [r2+r4+mmsize], m4
205 FILT_PACK m1, m4, 5, m6, w, s10
208 mova [r0+r4-mmsize*2], m1
209 mova [r0+r4-mmsize*1], m4
213 ;-----------------------------------------------------------------------------
214 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
215 ;-----------------------------------------------------------------------------
216 cglobal hpel_filter_c_%1, 3,3,10*(mmsize/16)
256 CLIPW m1, [pb_0], [pw_pixel_max]
262 ;-----------------------------------------------------------------------------
263 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
264 ;-----------------------------------------------------------------------------
265 cglobal hpel_filter_h_%1, 3,4,8*(mmsize/16)
271 mova m0, [pw_pixel_max]
283 movu m4, [src-4+mmsize]
284 movu m5, [src-2+mmsize]
286 movu m7, [src+4+mmsize]
287 movu m6, [src+6+mmsize]
290 movu m7, [src+2+mmsize]
291 mova m6, [src+0+mmsize]
293 FILT_H2 m1, m2, m3, m4, m5, m6
297 FILT_PACK m1, m4, 1, m7, w
300 mova [r0+r2-mmsize*2], m1
301 mova [r0+r2-mmsize*1], m4
310 %endif ; HIGH_BIT_DEPTH
312 %ifndef HIGH_BIT_DEPTH
316 ;-----------------------------------------------------------------------------
317 ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
318 ;-----------------------------------------------------------------------------
319 cglobal hpel_filter_v_%1, 5,6,%2
332 mova m0, [filt_mul15]
342 SBUTTERFLY bw, 1, 4, 7
343 SBUTTERFLY bw, 2, 5, 7
344 SBUTTERFLY bw, 3, 6, 7
349 pmaddubsw m3, [filt_mul20]
350 pmaddubsw m6, [filt_mul20]
356 LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
357 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
358 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
359 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
360 FILT_V2 m1, m2, m3, m4, m5, m6
364 mova [r2+r4*2+mmsize], m4
365 FILT_PACK m1, m4, 5, m7
375 ;-----------------------------------------------------------------------------
376 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
377 ;-----------------------------------------------------------------------------
378 cglobal hpel_filter_c_mmxext, 3,3
390 paddw m3, [src+2] ; c0
394 paddw m4, [src+14] ; a1
395 paddw m5, [src+12] ; b1
396 paddw m6, [src+10] ; c1
397 FILT_H2 m1, m2, m3, m4, m5, m6
398 FILT_PACK m1, m4, 6, m7
404 ;-----------------------------------------------------------------------------
405 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
406 ;-----------------------------------------------------------------------------
407 cglobal hpel_filter_h_mmxext, 3,3
441 FILT_H2 m1, m2, m3, m4, m5, m6
442 FILT_PACK m1, m4, 1, m7
451 ;-----------------------------------------------------------------------------
452 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
453 ;-----------------------------------------------------------------------------
454 cglobal hpel_filter_c_%1, 3,3,9
466 %define tpw_32 [pw_32]
468 %ifidn %1,sse2_misalign
482 FILT_H2 m4, m5, m6, m3, m2, m1
489 PALIGNR m4, m0, 12, m7
491 PALIGNR m5, m0, 14, m0
493 PALIGNR m0, m1, 6, m7
496 PALIGNR m0, m1, 4, m7
499 PALIGNR m6, m1, 2, m7
505 PALIGNR m2, m1, 12, m7
506 PALIGNR m5, m1, 14, m1
509 PALIGNR m3, m0, 6, m7
512 PALIGNR m6, m0, 4, m7
515 PALIGNR m6, m0, 2, m7
519 FILT_PACK m4, m3, 6, tpw_32
526 ;-----------------------------------------------------------------------------
527 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
528 ;-----------------------------------------------------------------------------
529 cglobal hpel_filter_h_sse2, 3,3,8
566 mova m7, [pw_1] ; FIXME xmm8
567 FILT_H2 m1, m2, m3, m4, m5, m6
568 FILT_PACK m1, m4, 1, m7
575 ;-----------------------------------------------------------------------------
576 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
577 ;-----------------------------------------------------------------------------
578 cglobal hpel_filter_h_ssse3, 3,3
594 pmaddubsw m3, [filt_mul15]
595 pmaddubsw m4, [filt_mul15]
596 pmaddubsw m0, [filt_mul51]
603 pmaddubsw m1, [filt_mul20]
604 pmaddubsw m5, [filt_mul20]
605 pmaddubsw m6, [filt_mul51]
609 FILT_PACK m3, m4, 5, m7
610 pshufb m3, [hpel_shuf]
618 %define PALIGNR PALIGNR_MMX
624 %define PALIGNR PALIGNR_SSSE3
631 ;The optimum prefetch distance is difficult to determine in checkasm:
632 ;any prefetch seems slower than not prefetching.
633 ;In real use, the prefetch seems to be a slight win.
634 ;+16 is picked somewhat arbitrarily here based on the fact that even one
635 ;loop iteration is going to take longer than the prefetch.
636 prefetcht0 [r1+r2*2+16]
666 LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
667 LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
668 LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
670 FILT_V2 m1, m2, m3, m4, m5, m6
676 FILT_PACK m1, m4, 5, m15
677 movntps [r11+r4+%5], m1
682 PALIGNR m1, %1, 12, m2
684 PALIGNR m2, %1, 14, %1
686 PALIGNR m3, %2, 4, %1
688 PALIGNR m4, %2, 2, %1
691 PALIGNR %3, %2, 6, m2
700 FILT_PACK %3, %4, 6, m15
717 PALIGNR m1, %1, 14, m3
719 PALIGNR m2, %1, 15, m3
721 PALIGNR m4, %2, 1 , m3
723 PALIGNR m5, %2, 2 , m3
725 PALIGNR m6, %2, 3 , m3
728 ADD8TO16 m1, m6, m12, m3, m0 ; a
729 ADD8TO16 m2, m5, m12, m3, m0 ; b
730 ADD8TO16 %2, m4, m12, m3, m0 ; c
731 FILT_V2 m1, m2, %2, m6, m5, m4
732 FILT_PACK m1, m6, 5, m15
744 FILT_PACK m1, m2, 5, m15
745 pshufb m1, [hpel_shuf]
752 ;-----------------------------------------------------------------------------
753 ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
754 ; uint8_t *src, int stride, int width, int height)
755 ;-----------------------------------------------------------------------------
756 cglobal hpel_filter_%1, 7,7,16
780 mova m0, [filt_mul51]
781 mova m12, [filt_mul15]
782 mova m14, [filt_mul20]
787 DO_FILT_V m8, m7, m13, m12, 0, %1
790 DO_FILT_V m6, m5, m11, m12, 16, %1
792 paddw m15, m15 ; pw_32
793 DO_FILT_C m9, m8, m7, m6
796 DO_FILT_H m10, m13, m11, %1
801 ; setup regs for next y
816 %define PALIGNR PALIGNR_MMX
818 %define PALIGNR PALIGNR_SSSE3
825 %endif ; !HIGH_BIT_DEPTH
827 ;-----------------------------------------------------------------------------
828 ; void plane_copy_core( pixel *dst, int i_dst,
829 ; pixel *src, int i_src, int w, int h)
830 ;-----------------------------------------------------------------------------
831 ; assumes i_dst and w are multiples of 16, and i_dst>w
833 cglobal plane_copy_core_mmxext, 6,7
834 FIX_STRIDES r1d, r3d, r4d
887 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
888 %ifdef HIGH_BIT_DEPTH
891 mov%4 m0, [%2+(x/2)*mmsize]
892 mov%4 m1, [%3+(x/2)*mmsize]
896 mov%5a [%1+(x+0)*mmsize], m0
897 mov%5a [%1+(x+1)*mmsize], m2
918 %endif ; HIGH_BIT_DEPTH
921 %macro DEINTERLEAVE 7 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant, is aligned
922 %ifdef HIGH_BIT_DEPTH
925 mova m0, [%3+(n+0)*mmsize]
926 mova m1, [%3+(n+1)*mmsize]
935 mov%7 [%1+(n/2)*mmsize], m0
936 mov%7 [%2+(n/2)*mmsize], m2
939 %else ; !HIGH_BIT_DEPTH
969 %endif ; mmsize == 16
970 %endif ; HIGH_BIT_DEPTH
973 %macro PLANE_INTERLEAVE 1
974 ;-----------------------------------------------------------------------------
975 ; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
976 ; uint8_t *srcu, int i_srcu,
977 ; uint8_t *srcv, int i_srcv, int w, int h )
978 ;-----------------------------------------------------------------------------
979 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
980 cglobal plane_copy_interleave_core_%1, 7,7
981 FIX_STRIDES r1d, r3d, r5d, r6d
982 %ifdef HIGH_BIT_DEPTH
995 DECLARE_REG_TMP 10,11
1000 shr t1, SIZEOF_PIXEL
1014 INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
1015 INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
1016 add r6, 16*SIZEOF_PIXEL
1022 movntq [r0+r6*2+(n+ 0)], m0
1023 movntq [r0+r6*2+(n+ 8)], m0
1024 movntq [r0+r6*2+(n+16)], m0
1025 movntq [r0+r6*2+(n+24)], m0
1027 movntdq [r0+r6*2+(n+ 0)], m0
1028 movntdq [r0+r6*2+(n+16)], m0
1032 add r6, 16*SIZEOF_PIXEL
1044 ;-----------------------------------------------------------------------------
1045 ; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
1046 ;-----------------------------------------------------------------------------
1047 cglobal store_interleave_8x8x2_%1, 4,5
1051 INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
1052 INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
1053 add r2, FDEC_STRIDEB*2
1054 add r3, FDEC_STRIDEB*2
1059 %endmacro ; PLANE_INTERLEAVE
1061 %macro DEINTERLEAVE_START 1
1062 %ifdef HIGH_BIT_DEPTH
1065 mova m4, [deinterleave_shuf]
1068 %endif ; HIGH_BIT_DEPTH
1071 %macro PLANE_DEINTERLEAVE 1
1072 ;-----------------------------------------------------------------------------
1073 ; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
1074 ; pixel *dstv, int i_dstv,
1075 ; pixel *src, int i_src, int w, int h )
1076 ;-----------------------------------------------------------------------------
1077 cglobal plane_copy_deinterleave_%1, 6,7
1078 DEINTERLEAVE_START %1
1080 FIX_STRIDES r1d, r3d, r5d, r6d
1081 %ifdef HIGH_BIT_DEPTH
1084 movsxdifnidn r1, r1d
1085 movsxdifnidn r3, r3d
1086 movsxdifnidn r5, r5d
1094 DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, %1, m4, u
1095 DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, %1, m4, u
1096 add r6, 16*SIZEOF_PIXEL
1105 ;-----------------------------------------------------------------------------
1106 ; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
1107 ;-----------------------------------------------------------------------------
1108 cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
1109 DEINTERLEAVE_START %1
1113 DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, %1, m4, a
1114 DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, %1, m4, a
1115 add r0, FENC_STRIDEB*2
1121 ;-----------------------------------------------------------------------------
1122 ; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
1123 ;-----------------------------------------------------------------------------
1124 cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
1125 DEINTERLEAVE_START %1
1129 DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, %1, m4, a
1130 DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, %1, m4, a
1131 add r0, FDEC_STRIDEB*2
1136 %endmacro ; PLANE_DEINTERLEAVE
1138 %ifdef HIGH_BIT_DEPTH
1140 PLANE_INTERLEAVE mmxext
1141 PLANE_DEINTERLEAVE mmx
1143 PLANE_INTERLEAVE sse2
1144 PLANE_DEINTERLEAVE sse2
1147 PLANE_INTERLEAVE mmxext
1148 PLANE_DEINTERLEAVE mmx
1150 PLANE_INTERLEAVE sse2
1151 PLANE_DEINTERLEAVE sse2
1152 PLANE_DEINTERLEAVE ssse3
1155 ; These functions are not general-use; not only do the SSE ones require aligned input,
1156 ; but they also will fail if given a non-mod16 size or a size less than 64.
1157 ; memzero SSE will fail for non-mod128.
1159 ;-----------------------------------------------------------------------------
1160 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1161 ;-----------------------------------------------------------------------------
1162 cglobal memcpy_aligned_mmx, 3,3
1166 movq mm0, [r1 + r2 + 0]
1167 movq mm1, [r1 + r2 + 8]
1168 movq [r0 + r2 + 0], mm0
1169 movq [r0 + r2 + 8], mm1
1172 movq mm0, [r1 + r2 + 0]
1173 movq mm1, [r1 + r2 + 8]
1174 movq mm2, [r1 + r2 + 16]
1175 movq mm3, [r1 + r2 + 24]
1176 movq [r0 + r2 + 0], mm0
1177 movq [r0 + r2 + 8], mm1
1178 movq [r0 + r2 + 16], mm2
1179 movq [r0 + r2 + 24], mm3
1183 ;-----------------------------------------------------------------------------
1184 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1185 ;-----------------------------------------------------------------------------
1186 cglobal memcpy_aligned_sse2, 3,3
1190 movdqa xmm0, [r1 + r2]
1191 movdqa [r0 + r2], xmm0
1196 movdqa xmm0, [r1 + r2 + 0]
1197 movdqa [r0 + r2 + 0], xmm0
1198 movdqa xmm1, [r1 + r2 + 16]
1199 movdqa [r0 + r2 + 16], xmm1
1202 movdqa xmm0, [r1 + r2 + 0]
1203 movdqa [r0 + r2 + 0], xmm0
1204 movdqa xmm1, [r1 + r2 + 16]
1205 movdqa [r0 + r2 + 16], xmm1
1206 movdqa xmm2, [r1 + r2 + 32]
1207 movdqa [r0 + r2 + 32], xmm2
1208 movdqa xmm3, [r1 + r2 + 48]
1209 movdqa [r0 + r2 + 48], xmm3
1213 ;-----------------------------------------------------------------------------
1214 ; void *memzero_aligned( void *dst, size_t n );
1215 ;-----------------------------------------------------------------------------
1217 cglobal memzero_aligned_%1, 2,2
1224 mova [r0 + r1 + i], m0
1239 ;-----------------------------------------------------------------------------
1240 ; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
1241 ;-----------------------------------------------------------------------------
1242 cglobal integral_init4h_sse4, 3,4
1249 movdqa m1, [r1+r2+16]
1254 paddw m1, [r0+r2*2+16]
1255 movdqa [r3+r2*2 ], m0
1256 movdqa [r3+r2*2+16], m1
1261 cglobal integral_init8h_sse4, 3,4
1268 movdqa m1, [r1+r2+16]
1277 paddw m1, [r0+r2*2+16]
1280 movdqa [r3+r2*2 ], m0
1281 movdqa [r3+r2*2+16], m1
1286 %macro INTEGRAL_INIT_8V 1
1287 ;-----------------------------------------------------------------------------
1288 ; void integral_init8v( uint16_t *sum8, int stride )
1289 ;-----------------------------------------------------------------------------
1290 cglobal integral_init8v_%1, 3,3
1297 mova m1, [r2+r1+mmsize]
1299 psubw m1, [r0+r1+mmsize]
1301 mova [r0+r1+mmsize], m1
1308 INTEGRAL_INIT_8V mmx
1310 INTEGRAL_INIT_8V sse2
1312 ;-----------------------------------------------------------------------------
1313 ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
1314 ;-----------------------------------------------------------------------------
1316 cglobal integral_init4v_mmx, 3,5
1338 cglobal integral_init4v_sse2, 3,5
1350 shufpd m0, [r0+r2+16], 1
1351 shufpd m1, [r4+r2+16], 1
1363 cglobal integral_init4v_ssse3, 3,5
1392 pavgb %4, [r0+r5*2+%7]
1393 PALIGNR %1, %3, 1, m6
1394 PALIGNR %2, %4, 1, m6
1406 mova m3, [r0+%4+mmsize]
1408 pavgb m3, [r0+%4+r5+mmsize]
1409 pavgb m2, [r0+%4+r5]
1410 PALIGNR %1, m3, 1, m6
1412 PALIGNR m3, m2, 1, m6
1430 pavgb m3, [r0+%3+r5+8]
1431 pavgb m2, [r0+%3+r5]
1434 pavgb m1, [r0+%3+r5+9]
1435 pavgb m0, [r0+%3+r5+1]
1453 pavgw m3, [r0+%3+r5+8]
1454 pavgw m2, [r0+%3+r5]
1457 pavgw m1, [r0+%3+r5+10]
1458 pavgw m0, [r0+%3+r5+2]
1474 mova m3, [r0+%4+mmsize]
1476 pavgw m3, [r0+%4+r5+mmsize]
1477 pavgw m2, [r0+%4+r5]
1478 PALIGNR %1, m3, 2, m6
1480 PALIGNR m3, m2, 2, m6
1495 ;-----------------------------------------------------------------------------
1496 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
1497 ; int src_stride, int dst_stride, int width, int height )
1498 ;-----------------------------------------------------------------------------
1499 %macro FRAME_INIT_LOWRES 1
1500 cglobal frame_init_lowres_core_%1, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
1501 %ifdef HIGH_BIT_DEPTH
1509 ; src += 2*(height-1)*stride + 2*width
1515 ; dst += (height-1)*stride + width
1524 ; gap = stride - width
1528 %define dst_gap [rsp+gprsize]
1533 %define src_gap [rsp]
1534 %ifdef HIGH_BIT_DEPTH
1555 FILT8xA m0, r1, r2, 0
1556 FILT8xA m1, r3, r4, r5
1560 %else ; !HIGH_BIT_DEPTH
1562 ; adjust for the odd end case
1585 FILT8x4 m0, m1, m2, m3, m4, m5, 0
1603 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
1606 FILT8x4 m2, m3, m0, m1, m4, m5, 0
1619 FILT16x2 m0, r1, r2, 0
1620 FILT16x2 m1, r3, r4, r5
1624 %endif ; HIGH_BIT_DEPTH
1637 %endmacro ; FRAME_INIT_LOWRES
1640 %define PALIGNR PALIGNR_MMX
1641 FRAME_INIT_LOWRES mmxext
1643 FRAME_INIT_LOWRES cache32_mmxext
1646 FRAME_INIT_LOWRES sse2
1647 %define PALIGNR PALIGNR_SSSE3
1648 FRAME_INIT_LOWRES ssse3
1650 ;-----------------------------------------------------------------------------
1651 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
1652 ; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
1653 ;-----------------------------------------------------------------------------
1654 cglobal mbtree_propagate_cost_sse2, 6,6,7
1663 movdqa xmm6, [pw_3fff]
1664 movdqa xmm4, [pd_128]
1666 movq xmm2, [r2+r5] ; intra
1667 movq xmm0, [r4+r5] ; invq
1668 movq xmm3, [r3+r5] ; inter
1669 movq xmm1, [r1+r5] ; prop
1670 punpcklwd xmm2, xmm5
1671 punpcklwd xmm0, xmm5
1674 punpcklwd xmm1, xmm5
1675 punpcklwd xmm3, xmm5
1677 psrld xmm0, 8 ; intra*invq>>8
1678 paddd xmm0, xmm1 ; prop + (intra*invq>>8)
1679 cvtdq2ps xmm1, xmm2 ; intra
1680 psubd xmm2, xmm3 ; intra - inter
1681 rcpps xmm3, xmm1 ; 1 / intra 1st approximation
1683 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
1685 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
1686 mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
1687 addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
1688 subps xmm3, xmm1 ; 2nd approximation for 1/intra
1689 mulps xmm0, xmm3 ; / intra
1690 cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
1691 movdqa [r0+r5*2], xmm0