1 ;*****************************************************************************
2 ;* mc-a2.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2010 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <holger@lubitz.org>
9 ;* Mathieu Monnier <manao@melix.net>
10 ;* Oskar Arvidsson <oskar@irock.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; Byte-granularity taps of the H.264 6-tap half-pel filter (1,-5,20,20,-5,1),
; laid out in the interleavings pmaddubsw wants (see FILT_V2: a-5*b+20*c).
filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
; pshufb control: interleave bytes 0..7 with bytes 8..15 (low/high halves).
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
; pshufb control: gather even-indexed bytes first, then odd-indexed bytes.
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
; NOTE(review): despite the name, this is a low-16-bit dword mask (0xffff),
; not 0xf — name is inherited; confirm against users before renaming.
pd_0f: times 4 dd 0xffff
; Bias constants scaled by PIXEL_MAX; presumably added to keep intermediate
; filter sums nonnegative in the high-bit-depth path — TODO confirm at use sites.
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
pad30: times 8 dw 30*PIXEL_MAX
; Un-bias + rounding term for the final descale (32*20*PIXEL_MAX bias, +512 rounding).
depad: times 4 dd 32*20*PIXEL_MAX + 512
; Word-granularity tap pairs of the same 6-tap filter, for pmaddwd:
; (1,-5), (20,20), (-5,1).
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
92 psubw %1, %2 ; a-5*b+4*c
96 paddw %1, %3 ; a-5*b+20*c
102 psraw %1, 2 ; (a-b)/4
103 psubw %1, %2 ; (a-b)/4-b
104 paddw %1, %3 ; (a-b)/4-b+c
105 psraw %1, 2 ; ((a-b)/4-b+c)/4
106 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
124 %macro FILT_PACK 4-6 b
141 ;The hpel_filter routines use non-temporal writes for output.
142 ;The following defines may be uncommented for testing.
;Doing the hpel_filter temporally may be a win if the last level cache
144 ;is big enough (preliminary benching suggests on the order of 4* framesize).
147 ;%define movntps movaps
150 %ifdef HIGH_BIT_DEPTH
151 ;-----------------------------------------------------------------------------
152 ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
153 ;-----------------------------------------------------------------------------
155 cglobal hpel_filter_v_%1, 5,6,11*(mmsize/16)
178 mova m7, [pw_pixel_max]
185 mova m5, [r1+r3+mmsize]
186 mova m6, [r1+r3*2+mmsize]
190 paddw m4, [r5+r3*2+mmsize]
191 paddw m5, [r5+r3+mmsize]
192 paddw m6, [r5+mmsize]
195 FILT_V2 m1, m2, m3, m4, m5, m6
200 mova [r2+r4+mmsize], m4
204 FILT_PACK m1, m4, 5, m6, w, s10
207 mova [r0+r4-mmsize*2], m1
208 mova [r0+r4-mmsize*1], m4
212 ;-----------------------------------------------------------------------------
213 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
214 ;-----------------------------------------------------------------------------
215 cglobal hpel_filter_c_%1, 3,3,10*(mmsize/16)
255 CLIPW m1, [pb_0], [pw_pixel_max]
261 ;-----------------------------------------------------------------------------
262 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
263 ;-----------------------------------------------------------------------------
264 cglobal hpel_filter_h_%1, 3,4,8*(mmsize/16)
270 mova m0, [pw_pixel_max]
282 movu m4, [src-4+mmsize]
283 movu m5, [src-2+mmsize]
285 movu m7, [src+4+mmsize]
286 movu m6, [src+6+mmsize]
289 movu m7, [src+2+mmsize]
290 mova m6, [src+0+mmsize]
292 FILT_H2 m1, m2, m3, m4, m5, m6
296 FILT_PACK m1, m4, 1, m7, w
299 mova [r0+r2-mmsize*2], m1
300 mova [r0+r2-mmsize*1], m4
309 %endif ; HIGH_BIT_DEPTH
311 %ifndef HIGH_BIT_DEPTH
315 ;-----------------------------------------------------------------------------
316 ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
317 ;-----------------------------------------------------------------------------
318 cglobal hpel_filter_v_%1, 5,6,%2
331 mova m0, [filt_mul15]
341 SBUTTERFLY bw, 1, 4, 7
342 SBUTTERFLY bw, 2, 5, 7
343 SBUTTERFLY bw, 3, 6, 7
348 pmaddubsw m3, [filt_mul20]
349 pmaddubsw m6, [filt_mul20]
355 LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
356 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
357 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
358 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
359 FILT_V2 m1, m2, m3, m4, m5, m6
363 mova [r2+r4*2+mmsize], m4
364 FILT_PACK m1, m4, 5, m7
374 ;-----------------------------------------------------------------------------
375 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
376 ;-----------------------------------------------------------------------------
377 cglobal hpel_filter_c_mmxext, 3,3
389 paddw m3, [src+2] ; c0
393 paddw m4, [src+14] ; a1
394 paddw m5, [src+12] ; b1
395 paddw m6, [src+10] ; c1
396 FILT_H2 m1, m2, m3, m4, m5, m6
397 FILT_PACK m1, m4, 6, m7
403 ;-----------------------------------------------------------------------------
404 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
405 ;-----------------------------------------------------------------------------
406 cglobal hpel_filter_h_mmxext, 3,3
440 FILT_H2 m1, m2, m3, m4, m5, m6
441 FILT_PACK m1, m4, 1, m7
450 ;-----------------------------------------------------------------------------
451 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
452 ;-----------------------------------------------------------------------------
453 cglobal hpel_filter_c_%1, 3,3,9
465 %define tpw_32 [pw_32]
467 %ifidn %1,sse2_misalign
481 FILT_H2 m4, m5, m6, m3, m2, m1
488 PALIGNR m4, m0, 12, m7
490 PALIGNR m5, m0, 14, m0
492 PALIGNR m0, m1, 6, m7
495 PALIGNR m0, m1, 4, m7
498 PALIGNR m6, m1, 2, m7
504 PALIGNR m2, m1, 12, m7
505 PALIGNR m5, m1, 14, m1
508 PALIGNR m3, m0, 6, m7
511 PALIGNR m6, m0, 4, m7
514 PALIGNR m6, m0, 2, m7
518 FILT_PACK m4, m3, 6, tpw_32
525 ;-----------------------------------------------------------------------------
526 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
527 ;-----------------------------------------------------------------------------
528 cglobal hpel_filter_h_sse2, 3,3,8
565 mova m7, [pw_1] ; FIXME xmm8
566 FILT_H2 m1, m2, m3, m4, m5, m6
567 FILT_PACK m1, m4, 1, m7
574 ;-----------------------------------------------------------------------------
575 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
576 ;-----------------------------------------------------------------------------
577 cglobal hpel_filter_h_ssse3, 3,3
593 pmaddubsw m3, [filt_mul15]
594 pmaddubsw m4, [filt_mul15]
595 pmaddubsw m0, [filt_mul51]
602 pmaddubsw m1, [filt_mul20]
603 pmaddubsw m5, [filt_mul20]
604 pmaddubsw m6, [filt_mul51]
608 FILT_PACK m3, m4, 5, m7
609 pshufb m3, [hpel_shuf]
617 %define PALIGNR PALIGNR_MMX
623 %define PALIGNR PALIGNR_SSSE3
630 ;The optimum prefetch distance is difficult to determine in checkasm:
631 ;any prefetch seems slower than not prefetching.
632 ;In real use, the prefetch seems to be a slight win.
633 ;+16 is picked somewhat arbitrarily here based on the fact that even one
634 ;loop iteration is going to take longer than the prefetch.
635 prefetcht0 [r1+r2*2+16]
665 LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
666 LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
667 LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
669 FILT_V2 m1, m2, m3, m4, m5, m6
675 FILT_PACK m1, m4, 5, m15
676 movntps [r11+r4+%5], m1
681 PALIGNR m1, %1, 12, m2
683 PALIGNR m2, %1, 14, %1
685 PALIGNR m3, %2, 4, %1
687 PALIGNR m4, %2, 2, %1
690 PALIGNR %3, %2, 6, m2
699 FILT_PACK %3, %4, 6, m15
716 PALIGNR m1, %1, 14, m3
718 PALIGNR m2, %1, 15, m3
720 PALIGNR m4, %2, 1 , m3
722 PALIGNR m5, %2, 2 , m3
724 PALIGNR m6, %2, 3 , m3
727 ADD8TO16 m1, m6, m12, m3, m0 ; a
728 ADD8TO16 m2, m5, m12, m3, m0 ; b
729 ADD8TO16 %2, m4, m12, m3, m0 ; c
730 FILT_V2 m1, m2, %2, m6, m5, m4
731 FILT_PACK m1, m6, 5, m15
743 FILT_PACK m1, m2, 5, m15
744 pshufb m1, [hpel_shuf]
751 ;-----------------------------------------------------------------------------
752 ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
753 ; uint8_t *src, int stride, int width, int height)
754 ;-----------------------------------------------------------------------------
755 cglobal hpel_filter_%1, 7,7,16
779 mova m0, [filt_mul51]
780 mova m12, [filt_mul15]
781 mova m14, [filt_mul20]
786 DO_FILT_V m8, m7, m13, m12, 0, %1
789 DO_FILT_V m6, m5, m11, m12, 16, %1
791 paddw m15, m15 ; pw_32
792 DO_FILT_C m9, m8, m7, m6
795 DO_FILT_H m10, m13, m11, %1
800 ; setup regs for next y
815 %define PALIGNR PALIGNR_MMX
817 %define PALIGNR PALIGNR_SSSE3
824 %endif ; !HIGH_BIT_DEPTH
826 ;-----------------------------------------------------------------------------
827 ; void plane_copy_core( uint8_t *dst, int i_dst,
828 ; uint8_t *src, int i_src, int w, int h)
829 ;-----------------------------------------------------------------------------
830 ; assumes i_dst and w are multiples of 16, and i_dst>w
831 cglobal plane_copy_core_mmxext, 6,7
883 %ifdef HIGH_BIT_DEPTH
885 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
889 SBUTTERFLY wd, 0, 1, 2
895 SBUTTERFLY wd, 0, 1, 2
900 SBUTTERFLY wd, 0, 1, 2
906 %macro PLANE_INTERLEAVE 1
907 ;-----------------------------------------------------------------------------
908 ; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
909 ;-----------------------------------------------------------------------------
910 cglobal store_interleave_8x8x2_%1, 4,5
914 INTERLEAVE r0, r2, r3, a
922 %endmacro ; PLANE_INTERLEAVE
925 PLANE_INTERLEAVE mmxext
927 PLANE_INTERLEAVE sse2
929 %else ;!HIGH_BIT_DEPTH
931 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
951 %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant
984 %macro PLANE_INTERLEAVE 1
985 ;-----------------------------------------------------------------------------
986 ; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
987 ; uint8_t *srcu, int i_srcu,
988 ; uint8_t *srcv, int i_srcv, int w, int h )
989 ;-----------------------------------------------------------------------------
990 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
991 cglobal plane_copy_interleave_core_%1, 6,7
1000 DECLARE_REG_TMP 10,11
1019 INTERLEAVE r0+r6*2, r2+r6, r4+r6, u, nt
1020 INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
1025 movntq [r0+r6*2], m0
1026 movntq [r0+r6*2+8], m0
1027 movntq [r0+r6*2+16], m0
1028 movntq [r0+r6*2+24], m0
1030 movntdq [r0+r6*2], m0
1031 movntdq [r0+r6*2+16], m0
1045 ;-----------------------------------------------------------------------------
1046 ; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
1047 ;-----------------------------------------------------------------------------
1048 cglobal store_interleave_8x8x2_%1, 4,5
1051 INTERLEAVE r0, r2, r3, a
1052 INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
1053 add r2, FDEC_STRIDE*2
1054 add r3, FDEC_STRIDE*2
1059 %endmacro ; PLANE_INTERLEAVE
1061 %macro DEINTERLEAVE_START 1
1063 mova m4, [deinterleave_shuf]
1069 %macro PLANE_DEINTERLEAVE 1
1070 ;-----------------------------------------------------------------------------
1071 ; void plane_copy_deinterleave( uint8_t *dstu, int i_dstu,
1072 ; uint8_t *dstv, int i_dstv,
1073 ; uint8_t *src, int i_src, int w, int h )
1074 ;-----------------------------------------------------------------------------
1075 cglobal plane_copy_deinterleave_%1, 6,7
1076 DEINTERLEAVE_START %1
1078 movsxdifnidn r1, r1d
1079 movsxdifnidn r3, r3d
1080 movsxdifnidn r5, r5d
1088 DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, %1, m4
1089 DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m4
1099 ;-----------------------------------------------------------------------------
1100 ; void load_deinterleave_8x8x2_fenc( uint8_t *dst, uint8_t *src, int i_src )
1101 ;-----------------------------------------------------------------------------
1102 cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
1103 DEINTERLEAVE_START %1
1106 DEINTERLEAVE r0, r0+FENC_STRIDE/2, r1, 1, %1, m4
1107 DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE*3/2, r1+r2, 1, %1, m4
1108 add r0, FENC_STRIDE*2
1114 ;-----------------------------------------------------------------------------
1115 ; void load_deinterleave_8x8x2_fdec( uint8_t *dst, uint8_t *src, int i_src )
1116 ;-----------------------------------------------------------------------------
1117 cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
1118 DEINTERLEAVE_START %1
1121 DEINTERLEAVE r0, r0+FDEC_STRIDE/2, r1, 0, %1, m4
1122 DEINTERLEAVE r0+FDEC_STRIDE, r0+FDEC_STRIDE*3/2, r1+r2, 0, %1, m4
1123 add r0, FDEC_STRIDE*2
1128 %endmacro ; PLANE_DEINTERLEAVE
1131 PLANE_INTERLEAVE mmxext
1132 PLANE_DEINTERLEAVE mmx
1134 PLANE_INTERLEAVE sse2
1135 PLANE_DEINTERLEAVE sse2
1136 PLANE_DEINTERLEAVE ssse3
1138 %endif ; HIGH_BIT_DEPTH
1140 ; These functions are not general-use; not only do the SSE ones require aligned input,
1141 ; but they also will fail if given a non-mod16 size or a size less than 64.
1142 ; memzero SSE will fail for non-mod128.
1144 ;-----------------------------------------------------------------------------
1145 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1146 ;-----------------------------------------------------------------------------
1147 cglobal memcpy_aligned_mmx, 3,3
1151 movq mm0, [r1 + r2 + 0]
1152 movq mm1, [r1 + r2 + 8]
1153 movq [r0 + r2 + 0], mm0
1154 movq [r0 + r2 + 8], mm1
1157 movq mm0, [r1 + r2 + 0]
1158 movq mm1, [r1 + r2 + 8]
1159 movq mm2, [r1 + r2 + 16]
1160 movq mm3, [r1 + r2 + 24]
1161 movq [r0 + r2 + 0], mm0
1162 movq [r0 + r2 + 8], mm1
1163 movq [r0 + r2 + 16], mm2
1164 movq [r0 + r2 + 24], mm3
1168 ;-----------------------------------------------------------------------------
1169 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
1170 ;-----------------------------------------------------------------------------
1171 cglobal memcpy_aligned_sse2, 3,3
1175 movdqa xmm0, [r1 + r2]
1176 movdqa [r0 + r2], xmm0
1181 movdqa xmm0, [r1 + r2 + 0]
1182 movdqa [r0 + r2 + 0], xmm0
1183 movdqa xmm1, [r1 + r2 + 16]
1184 movdqa [r0 + r2 + 16], xmm1
1187 movdqa xmm0, [r1 + r2 + 0]
1188 movdqa [r0 + r2 + 0], xmm0
1189 movdqa xmm1, [r1 + r2 + 16]
1190 movdqa [r0 + r2 + 16], xmm1
1191 movdqa xmm2, [r1 + r2 + 32]
1192 movdqa [r0 + r2 + 32], xmm2
1193 movdqa xmm3, [r1 + r2 + 48]
1194 movdqa [r0 + r2 + 48], xmm3
1198 ;-----------------------------------------------------------------------------
1199 ; void *memzero_aligned( void *dst, size_t n );
1200 ;-----------------------------------------------------------------------------
1202 cglobal memzero_aligned_%1, 2,2
1209 mova [r0 + r1 + i], m0
1224 ;-----------------------------------------------------------------------------
1225 ; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
1226 ;-----------------------------------------------------------------------------
1227 cglobal integral_init4h_sse4, 3,4
1234 movdqa m1, [r1+r2+16]
1239 paddw m1, [r0+r2*2+16]
1240 movdqa [r3+r2*2 ], m0
1241 movdqa [r3+r2*2+16], m1
1246 cglobal integral_init8h_sse4, 3,4
1253 movdqa m1, [r1+r2+16]
1262 paddw m1, [r0+r2*2+16]
1265 movdqa [r3+r2*2 ], m0
1266 movdqa [r3+r2*2+16], m1
1271 %macro INTEGRAL_INIT_8V 1
1272 ;-----------------------------------------------------------------------------
1273 ; void integral_init8v( uint16_t *sum8, int stride )
1274 ;-----------------------------------------------------------------------------
1275 cglobal integral_init8v_%1, 3,3
1282 mova m1, [r2+r1+mmsize]
1284 psubw m1, [r0+r1+mmsize]
1286 mova [r0+r1+mmsize], m1
1293 INTEGRAL_INIT_8V mmx
1295 INTEGRAL_INIT_8V sse2
1297 ;-----------------------------------------------------------------------------
1298 ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
1299 ;-----------------------------------------------------------------------------
1301 cglobal integral_init4v_mmx, 3,5
1323 cglobal integral_init4v_sse2, 3,5
1335 shufpd m0, [r0+r2+16], 1
1336 shufpd m1, [r4+r2+16], 1
1348 cglobal integral_init4v_ssse3, 3,5
1377 pavgb %4, [r0+r5*2+%7]
1378 PALIGNR %1, %3, 1, m6
1379 PALIGNR %2, %4, 1, m6
1391 mova m3, [r0+%4+mmsize]
1393 pavgb m3, [r0+%4+r5+mmsize]
1394 pavgb m2, [r0+%4+r5]
1395 PALIGNR %1, m3, 1, m6
1397 PALIGNR m3, m2, 1, m6
1415 pavgb m3, [r0+%3+r5+8]
1416 pavgb m2, [r0+%3+r5]
1419 pavgb m1, [r0+%3+r5+9]
1420 pavgb m0, [r0+%3+r5+1]
1435 ;-----------------------------------------------------------------------------
1436 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
1437 ; int src_stride, int dst_stride, int width, int height )
1438 ;-----------------------------------------------------------------------------
1439 %macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
1440 cglobal frame_init_lowres_core_%1, 6,7,%2
1444 ; src += 2*(height-1)*stride + 2*width
1450 ; dst += (height-1)*stride + width
1459 ; gap = stride - width
1463 %define dst_gap [rsp+gprsize]
1468 %define src_gap [rsp]
1470 ; adjust for the odd end case
1493 FILT8x4 m0, m1, m2, m3, m4, m5, 0
1511 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
1514 FILT8x4 m2, m3, m0, m1, m4, m5, 0
1527 FILT16x2 m0, r1, r2, 0
1528 FILT16x2 m1, r3, r4, r5
1544 %endmacro ; FRAME_INIT_LOWRES
1547 %define PALIGNR PALIGNR_MMX
1548 FRAME_INIT_LOWRES mmxext
1550 FRAME_INIT_LOWRES cache32_mmxext
1553 FRAME_INIT_LOWRES sse2, 12
1554 %define PALIGNR PALIGNR_SSSE3
1555 FRAME_INIT_LOWRES ssse3, 12
1557 ;-----------------------------------------------------------------------------
1558 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
1559 ; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
1560 ;-----------------------------------------------------------------------------
1561 cglobal mbtree_propagate_cost_sse2, 6,6,7
1570 movdqa xmm6, [pw_3fff]
1571 movdqa xmm4, [pd_128]
1573 movq xmm2, [r2+r5] ; intra
1574 movq xmm0, [r4+r5] ; invq
1575 movq xmm3, [r3+r5] ; inter
1576 movq xmm1, [r1+r5] ; prop
1577 punpcklwd xmm2, xmm5
1578 punpcklwd xmm0, xmm5
1581 punpcklwd xmm1, xmm5
1582 punpcklwd xmm3, xmm5
1584 psrld xmm0, 8 ; intra*invq>>8
1585 paddd xmm0, xmm1 ; prop + (intra*invq>>8)
1586 cvtdq2ps xmm1, xmm2 ; intra
1587 psubd xmm2, xmm3 ; intra - inter
1588 rcpps xmm3, xmm1 ; 1 / intra 1st approximation
1590 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
1592 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
1593 mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
1594 addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
1595 subps xmm3, xmm1 ; 2nd approximation for 1/intra
1596 mulps xmm0, xmm3 ; / intra
1597 cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
1598 movdqa [r0+r5*2], xmm0