1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2010 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
25 ;* This program is also available under a commercial proprietary license.
26 ;* For more information, contact us at licensing@x264.com.
27 ;*****************************************************************************
30 %include "x86util.asm"
35 dw %1, %2, %1, %2, %2, %3, %2, %3
38 dw %1, %4, %5, %4, %1, %4, %5, %4
39 dw %4, %2, %6, %2, %4, %2, %6, %2
40 dw %5, %6, %3, %6, %5, %6, %3, %6
41 ; last line not used, just padding for power-of-2 stride
; Default dequant scale factors for the 8x8 transform, one DQM8 row per
; (i_qp % 6) value 0..5.  The six arguments are the distinct scale values;
; the DQM8 macro above expands them into the full 8x8 pattern.
; NOTE(review): values appear to match H.264's 8x8 dequant norm-adjust
; table -- confirm against the spec / x264's C-side dequant tables.
54 DQM8 20, 18, 32, 19, 25, 24
55 DQM8 22, 19, 35, 21, 28, 26
56 DQM8 26, 23, 42, 24, 33, 31
57 DQM8 28, 25, 45, 26, 35, 33
58 DQM8 32, 28, 51, 30, 40, 38
59 DQM8 36, 32, 58, 34, 46, 43
; Byte lookup table; the label that names it lies outside this excerpt.
; NOTE(review): presumably this is the decimate mask/score LUT referenced
; below as mask_table / decimate_mask_table4 (per-coefficient-mask summed
; decimate costs, max value 24) -- confirm against the label preceding
; these lines in the full file.
62 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
63 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
64 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
65 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
66 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
67 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
68 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
69 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
70 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
79 %macro QUANT_DC_START_MMX 0
88 %endif ; HIGH_BIT_DEPTH
91 %macro QUANT_DC_START_SSSE3 0
116 %macro PSIGNW_SSSE3 2
125 %macro PSIGND_SSSE3 2
141 %macro QUANT_END_MMX 0
154 cmp ecx, (1<<mmsize)-1
164 %macro QUANT_END_SSE4 0
170 %ifdef HIGH_BIT_DEPTH
171 %macro QUANT_ONE_DC_MMX 4
191 %macro QUANT_TWO_DC_MMX 4
192 QUANT_ONE_DC_MMX %1, %2, %3, %4
193 QUANT_ONE_DC_MMX %1+mmsize, %2, %3, %4+mmsize
196 %macro QUANT_ONE_DC_SSE4 4
211 %macro QUANT_TWO_DC_SSE4 4
234 %macro QUANT_ONE_AC_MMX 4
257 %macro QUANT_TWO_AC_MMX 4
258 QUANT_ONE_AC_MMX %1, %2, %3, %4
259 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
262 %macro QUANT_TWO_AC_SSE4 4
268 paddd m3, [%3+mmsize]
270 pmulld m3, [%2+mmsize]
285 ;-----------------------------------------------------------------------------
286 ; int quant_2x2_dc( int32_t dct[M*N], int mf, int bias )
287 ;-----------------------------------------------------------------------------
289 cglobal quant_%1x%2_dc_%3, 3,3,8*(mmsize/16)
291 %if %1*%2 <= mmsize/4
292 QUANT_ONE_DC r0, m6, m7, 0
295 %rep %1*%2/(mmsize/2)
296 QUANT_TWO_DC r0+x, m6, m7, x
304 ;-----------------------------------------------------------------------------
305 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
306 ;-----------------------------------------------------------------------------
308 cglobal quant_%1x%2_%3, 3,3,8*(mmsize/16)
310 %rep %1*%2/(mmsize/2)
311 QUANT_TWO_AC r0+x, r1+x, r2+x, x
318 %define QUANT_TWO_AC QUANT_TWO_AC_MMX
319 %define QUANT_ONE_DC QUANT_ONE_DC_MMX
320 %define QUANT_TWO_DC QUANT_TWO_DC_MMX
321 %define QUANT_END QUANT_END_MMX
322 %define PABSD PABSD_MMX
323 %define PSIGND PSIGND_MMX
325 QUANT_DC 2, 2, mmxext
326 QUANT_DC 4, 4, mmxext
335 %define PABSD PABSD_SSSE3
336 %define PSIGND PSIGND_SSSE3
342 %define QUANT_TWO_AC QUANT_TWO_AC_SSE4
343 %define QUANT_ONE_DC QUANT_ONE_DC_SSE4
344 %define QUANT_TWO_DC QUANT_TWO_DC_SSE4
345 %define QUANT_END QUANT_END_SSE4
357 %endif ; HIGH_BIT_DEPTH
359 %ifndef HIGH_BIT_DEPTH
361 ;;; %1 (m64) dct[y][x]
362 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
363 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
364 mova m1, %1 ; load dct coeffs
366 paddusw m0, %3 ; round
367 pmulhuw m0, %2 ; divide
368 PSIGNW m0, m1 ; restore sign
399 ;-----------------------------------------------------------------------------
400 ; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
401 ;-----------------------------------------------------------------------------
402 %macro QUANT_DC 2-3 0
406 QUANT_ONE [r0], m6, m7, 0
410 QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
418 ;-----------------------------------------------------------------------------
419 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
420 ;-----------------------------------------------------------------------------
425 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
433 %define QUANT_END QUANT_END_MMX
434 %define PABSW PABSW_MMX
435 %define PSIGNW PSIGNW_MMX
436 %define QUANT_DC_START QUANT_DC_START_MMX
437 QUANT_DC quant_2x2_dc_mmxext, 1
438 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
439 QUANT_DC quant_4x4_dc_mmxext, 4
440 QUANT_AC quant_4x4_mmx, 4
441 QUANT_AC quant_8x8_mmx, 16
445 QUANT_DC quant_4x4_dc_sse2, 2, 8
446 QUANT_AC quant_4x4_sse2, 2
447 QUANT_AC quant_8x8_sse2, 8
449 %define PABSW PABSW_SSSE3
450 %define PSIGNW PSIGNW_SSSE3
451 QUANT_DC quant_4x4_dc_ssse3, 2, 8
452 QUANT_AC quant_4x4_ssse3, 2
453 QUANT_AC quant_8x8_ssse3, 8
456 QUANT_DC quant_2x2_dc_ssse3, 1
457 %define QUANT_END QUANT_END_SSE4
458 ;Not faster on Conroe, so only used in SSE4 versions
459 %define QUANT_DC_START QUANT_DC_START_SSSE3
461 QUANT_DC quant_4x4_dc_sse4, 2, 8
462 QUANT_AC quant_4x4_sse4, 2
463 QUANT_AC quant_8x8_sse4, 8
464 %endif ; !HIGH_BIT_DEPTH
468 ;=============================================================================
470 ;=============================================================================
474 ;;; %2,%3 dequant_mf[i_mf][y][x]
477 %ifdef HIGH_BIT_DEPTH
490 ;;; %2,%3 dequant_mf[i_mf][y][x]
495 %ifdef HIGH_BIT_DEPTH
514 %macro DEQUANT_LOOP 3
518 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
519 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
524 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
525 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
530 %macro DEQUANT16_FLAT 2-5
548 DECLARE_REG_TMP 6,3,2
550 DECLARE_REG_TMP 4,3,2
552 DECLARE_REG_TMP 2,0,1
555 %macro DEQUANT_START 2
558 shr t0d, 8 ; i_qbits = i_qp / 6
561 sub t2d, t1d ; i_mf = i_qp % 6
564 add r1, t2 ; dequant_mf[i_mf]
566 add r1, r1mp ; dequant_mf[i_mf]
570 jl .rshift32 ; negative qbits => rightshift
573 ;-----------------------------------------------------------------------------
574 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
575 ;-----------------------------------------------------------------------------
577 cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
579 DEQUANT_START %3+2, %3
583 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
592 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
594 cglobal dequant_%2x%2_flat16_%1, 0,3
598 jl dequant_%2x%2_%1.skip_prologue
602 shr t0d, 8 ; i_qbits = i_qp / 6
605 sub t2d, t1d ; i_mf = i_qp % 6
608 lea r1, [dequant%2_scale]
611 lea r1, [dequant%2_scale + t2]
617 DEQUANT16_FLAT [r1], 0, 16
618 DEQUANT16_FLAT [r1+8], 8, 24
620 DEQUANT16_FLAT [r1], 0, 16
623 DEQUANT16_FLAT [r1], 0, 8, 64, 72
624 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
625 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
626 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
628 DEQUANT16_FLAT [r1], 0, 64
629 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
630 DEQUANT16_FLAT [r1+32], 32, 96
635 %ifdef HIGH_BIT_DEPTH
637 DEQUANT sse2, 4, 4, 1
638 DEQUANT sse4, 4, 4, 1
639 DEQUANT sse2, 8, 6, 1
640 DEQUANT sse4, 8, 6, 1
648 DEQUANT sse2, 4, 4, 2
649 DEQUANT sse2, 8, 6, 2
653 cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
660 %ifdef HIGH_BIT_DEPTH
663 %rep SIZEOF_PIXEL*16/mmsize
664 mova m0, [r0+mmsize*0+x]
665 mova m1, [r0+mmsize*1+x]
668 mova [r0+mmsize*0+x], m0
669 mova [r0+mmsize*1+x], m1
673 %else ; !HIGH_BIT_DEPTH
681 %rep SIZEOF_PIXEL*16/mmsize
682 mova m0, [r0+mmsize*0+x]
683 mova m1, [r0+mmsize*1+x]
686 mova [r0+mmsize*0+x], m0
687 mova [r0+mmsize*1+x], m1
690 %endif ; HIGH_BIT_DEPTH
702 %ifdef HIGH_BIT_DEPTH
704 %rep SIZEOF_PIXEL*32/mmsize
720 %rep SIZEOF_PIXEL*32/mmsize
737 %ifdef HIGH_BIT_DEPTH
748 %ifdef HIGH_BIT_DEPTH
749 ;-----------------------------------------------------------------------------
750 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
751 ;-----------------------------------------------------------------------------
752 %macro DENOISE_DCT 1-2 0
753 cglobal denoise_dct_%1, 4,5,%2
754 mov r4d, [r0] ; backup DC coefficient
758 mova m2, [r0+r3*4+0*mmsize]
759 mova m3, [r0+r3*4+1*mmsize]
764 psubd m0, [r2+r3*4+0*mmsize]
765 psubd m1, [r2+r3*4+1*mmsize]
774 mova [r0+r3*4+0*mmsize], m0
775 mova [r0+r3*4+1*mmsize], m1
776 paddd m4, [r1+r3*4+0*mmsize]
777 paddd m5, [r1+r3*4+1*mmsize]
778 mova [r1+r3*4+0*mmsize], m4
779 mova [r1+r3*4+1*mmsize], m5
781 mov [r0], r4d ; restore DC coefficient
785 %define PABSD PABSD_MMX
786 %define PSIGND PSIGND_MMX
793 %define PABSD PABSD_SSSE3
794 %define PSIGND PSIGND_SSSE3
797 %else ; !HIGH_BIT_DEPTH
799 ;-----------------------------------------------------------------------------
800 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
801 ;-----------------------------------------------------------------------------
802 %macro DENOISE_DCT 1-2 0
803 cglobal denoise_dct_%1, 4,5,%2
808 mova m2, [r0+r3*2+0*mmsize]
809 mova m3, [r0+r3*2+1*mmsize]
814 psubusw m0, [r2+r3*2+0*mmsize]
815 psubusw m1, [r2+r3*2+1*mmsize]
818 mova [r0+r3*2+0*mmsize], m0
819 mova [r0+r3*2+1*mmsize], m1
826 paddd m4, [r1+r3*4+0*mmsize]
827 paddd m2, [r1+r3*4+1*mmsize]
828 paddd m5, [r1+r3*4+2*mmsize]
829 paddd m3, [r1+r3*4+3*mmsize]
830 mova [r1+r3*4+0*mmsize], m4
831 mova [r1+r3*4+1*mmsize], m2
832 mova [r1+r3*4+2*mmsize], m5
833 mova [r1+r3*4+3*mmsize], m3
839 %define PABSW PABSW_MMX
840 %define PSIGNW PSIGNW_MMX
847 %define PABSW PABSW_SSSE3
848 %define PSIGNW PSIGNW_SSSE3
851 %endif ; !HIGH_BIT_DEPTH
853 ;-----------------------------------------------------------------------------
854 ; int decimate_score( dctcoef *dct )
855 ;-----------------------------------------------------------------------------
857 %macro DECIMATE_MASK_SSE2 7
858 %ifdef HIGH_BIT_DEPTH
861 packssdw xmm0, [%3+16]
862 packssdw xmm1, [%3+48]
867 ABS2_MMX xmm0, xmm1, xmm3, xmm4
876 ABS2_MMX xmm0, xmm1, xmm3, xmm4
887 %macro DECIMATE_MASK_MMX 7
888 %ifdef HIGH_BIT_DEPTH
893 packssdw mm0, [%3+ 8]
894 packssdw mm1, [%3+24]
895 packssdw mm2, [%3+40]
896 packssdw mm3, [%3+56]
903 ABS2_MMX mm0, mm1, mm6, mm7
904 ABS2_MMX mm2, mm3, mm6, mm7
923 cextern decimate_table4
924 cextern decimate_table8
928 ;A LUT is faster than bsf on AMD processors.
929 ;This is not true for score64.
930 cglobal decimate_score%1_%2, 1,3
932 lea r10, [decimate_table4]
933 lea r11, [decimate_mask_table4]
935 %define mask_table r11
937 %define table decimate_table4
938 %define mask_table decimate_mask_table4
940 DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
950 movzx eax, byte [mask_table + rcx]
959 add al, byte [table + rcx]
960 add al, byte [mask_table + rdx]
965 add al, byte [table + rcx]
978 %define DECIMATE_MASK DECIMATE_MASK_MMX
979 DECIMATE4x4 15, mmxext, 0, 0
980 DECIMATE4x4 16, mmxext, 0, 0
981 DECIMATE4x4 15, mmxext_slowctz, 1, 0
982 DECIMATE4x4 16, mmxext_slowctz, 1, 0
984 %define DECIMATE_MASK DECIMATE_MASK_SSE2
985 DECIMATE4x4 15, sse2, 0, 0
986 DECIMATE4x4 16, sse2, 0, 0
987 DECIMATE4x4 15, sse2_slowctz, 1, 0
988 DECIMATE4x4 16, sse2_slowctz, 1, 0
989 DECIMATE4x4 15, ssse3, 0, 1
990 DECIMATE4x4 16, ssse3, 0, 1
991 DECIMATE4x4 15, ssse3_slowctz, 1, 1
992 DECIMATE4x4 16, ssse3_slowctz, 1, 1
997 cglobal decimate_score64_%1, 1,4
999 lea r10, [decimate_table8]
1002 %define table decimate_table8
1005 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
1008 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
1011 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
1015 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
1025 add al, byte [table + rcx]
1036 cglobal decimate_score64_%1, 1,6
1038 cglobal decimate_score64_%1, 1,5
1041 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
1044 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
1047 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
1049 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
1057 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
1064 add r0b, byte [decimate_table8 + ecx]
1067 cmp r0, 6 ;score64's threshold is never higher than 6
1068 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
1096 %define DECIMATE_MASK DECIMATE_MASK_MMX
1097 DECIMATE8x8 mmxext, 0
1100 %define DECIMATE_MASK DECIMATE_MASK_SSE2
1102 DECIMATE8x8 ssse3, 1
1104 ;-----------------------------------------------------------------------------
1105 ; int coeff_last( dctcoef *dct )
1106 ;-----------------------------------------------------------------------------
1117 %ifdef HIGH_BIT_DEPTH
1118 %macro LAST_MASK4_MMX 2-3
1120 packssdw mm0, [%2+8]
1126 %macro LAST_MASK_SSE2 2-3
1127 movdqa xmm0, [%2+ 0]
1128 movdqa xmm1, [%2+32]
1129 packssdw xmm0, [%2+16]
1130 packssdw xmm1, [%2+48]
1136 %macro LAST_MASK_MMX 3
1139 packssdw mm0, [%2+ 8]
1140 packssdw mm1, [%2+24]
1143 packssdw mm3, [%2+40]
1144 packssdw mm4, [%2+56]
1155 %macro COEFF_LAST4 1
1156 cglobal coeff_last4_%1, 1,3
1158 LAST_MASK4_MMX r1d, r0
1165 %define LAST LAST_X86
1167 %define LAST LAST_SSE4A
1168 COEFF_LAST4 mmxext_lzcnt
1170 %else ; !HIGH_BIT_DEPTH
1171 %macro LAST_MASK4_MMX 2-3
1178 %macro LAST_MASK_SSE2 2-3
1179 movdqa xmm0, [%2+ 0]
1180 packsswb xmm0, [%2+16]
1185 %macro LAST_MASK_MMX 3
1188 packsswb mm0, [%2+ 8]
1189 packsswb mm1, [%2+24]
1198 %macro COEFF_LAST4 1
1200 cglobal coeff_last4_%1, 1,1
1201 LAST rax, [r0], 0x3f
1205 cglobal coeff_last4_%1, 0,3
1214 lea eax, [eax+ecx*2]
1219 %define LAST LAST_X86
1221 %define LAST LAST_SSE4A
1222 COEFF_LAST4 mmxext_lzcnt
1223 %endif ; HIGH_BIT_DEPTH
1226 cglobal coeff_last15_%1, 1,3
1228 LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
1234 cglobal coeff_last16_%1, 1,3
1236 LAST_MASK r1d, r0, r2d
1242 cglobal coeff_last64_%1, 1, 5-mmsize/16
1244 LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
1245 LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
1250 LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
1251 LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
1262 cglobal coeff_last64_%1, 1,4
1264 LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
1265 LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
1266 LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
1267 LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
1280 %define LAST LAST_X86
1283 %define LAST_MASK LAST_MASK_MMX
1287 %define LAST_MASK LAST_MASK_SSE2
1289 %define LAST LAST_SSE4A
1290 COEFF_LAST sse2_lzcnt
1292 ;-----------------------------------------------------------------------------
1293 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1294 ;-----------------------------------------------------------------------------
1296 %macro LZCOUNT_X86 3
1301 %macro LZCOUNT_SSE4A 3
1305 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
1307 DECLARE_REG_TMP 3,1,2,0,4,5,6
1308 %elifdef ARCH_X86_64
1309 DECLARE_REG_TMP 0,1,2,3,4,5,6
1311 DECLARE_REG_TMP 6,3,2,1,4,5,0
1314 %macro COEFF_LEVELRUN 2
1315 cglobal coeff_level_run%2_%1,0,7
1319 LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
1321 shl t5d, 32-((%2+1)&~1)
1323 LZCOUNT t3d, t5d, 0x1f
1330 LZCOUNT t3d, t5d, 0x1f
1331 %ifdef HIGH_BIT_DEPTH
1333 mov [t1+t6 +4+16*4], t3b
1334 mov [t1+t6*4+ 4], t2d
1337 mov [t1+t6 +4+16*2], t3b
1338 mov [t1+t6*2+ 4], t2w
1349 %define LZCOUNT LZCOUNT_X86
1351 %define LAST_MASK LAST_MASK_MMX
1352 COEFF_LEVELRUN mmxext, 15
1353 COEFF_LEVELRUN mmxext, 16
1355 %define LAST_MASK LAST_MASK4_MMX
1356 COEFF_LEVELRUN mmxext, 4
1358 %define LAST_MASK LAST_MASK_SSE2
1359 COEFF_LEVELRUN sse2, 15
1360 COEFF_LEVELRUN sse2, 16
1361 %define LZCOUNT LZCOUNT_SSE4A
1362 COEFF_LEVELRUN sse2_lzcnt, 15
1363 COEFF_LEVELRUN sse2_lzcnt, 16
1365 %define LAST_MASK LAST_MASK4_MMX
1366 COEFF_LEVELRUN mmxext_lzcnt, 4