;*****************************************************************************
;* quant-a.asm: x86 quantization and level-run
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

    dw %1, %2, %1, %2, %2, %3, %2, %3

    dw %1, %4, %5, %4, %1, %4, %5, %4
    dw %4, %2, %6, %2, %4, %2, %6, %2
    dw %5, %6, %3, %6, %5, %6, %3, %6
    ; last line not used, just padding for power-of-2 stride

    DQM8 20, 18, 32, 19, 25, 24
    DQM8 22, 19, 35, 21, 28, 26
    DQM8 26, 23, 42, 24, 33, 31
    DQM8 28, 25, 45, 26, 35, 33
    DQM8 32, 28, 51, 30, 40, 38
    DQM8 36, 32, 58, 34, 46, 43

    db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
    db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
    db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
    db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
    db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
    db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
    db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
    db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
    db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24

chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
chroma_dc_dct_mask:     dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask:     dw 1, 1,-1,-1, 1,-1,-1, 1
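
; The masks above provide the butterfly signs for the 2x2 chroma DC
; transform in optimize_chroma_2x2_dc below: ssse3 psignw treats 1 as
; "keep" and -1 as "negate", while the pre-SSSE3 PSIGNW emulation
; (pxor+psubw) requires 0/-1 masks, hence the separate *_mmx tables.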
%macro QUANT_DC_START 0
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
    cmp ecx, (1<<mmsize)-1
%macro QUANT_ONE_DC 4
%macro QUANT_TWO_DC 4
    QUANT_ONE_DC %1, %2, %3, %4
    QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
%macro QUANT_ONE_AC_MMX 5
%macro QUANT_TWO_AC 5
    paddd  m3, [%3+mmsize]
    pmulld m3, [%2+mmsize]
    ACCUM por, %5, 3, %4+mmsize
    QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize, %5

;-----------------------------------------------------------------------------
; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
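; Rough C model of the quantization performed here (a sketch of the scalar
; reference; the function name, loop bound and plain 32-bit arithmetic are
; illustrative, overflow handling omitted):
;
;   static int quant_dc( int32_t *dct, int n, int mf, int bias )
;   {
;       int nz = 0;
;       for( int i = 0; i < n; i++ )
;       {
;           if( dct[i] > 0 )
;               dct[i] =  ( (bias + dct[i]) * mf ) >> 16;
;           else
;               dct[i] = -( ( (bias - dct[i]) * mf ) >> 16 );
;           nz |= dct[i];
;       }
;       return !!nz;
;   }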
cglobal quant_%1x%2_dc, 3,3,8
%if %1*%2 <= mmsize/4
    QUANT_ONE_DC r0, m6, m7, 0
%rep %1*%2/(mmsize/2)
    QUANT_TWO_DC r0+x, m6, m7, x

;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
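; Same formula as the DC version above, except that mf[] and bias[] are
; per-coefficient tables rather than a single broadcast scalar.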
cglobal quant_%1x%2, 3,3,8
%rep %1*%2/(mmsize/2)
    QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
    QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, mmsize*0, %2
    QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, mmsize*2, %2
cglobal quant_4x4x4, 3,3,8
    packssdw m5, m5 ; AA BB CC DD
    packsswb m5, m5 ; A B C D
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;;; %1 (m64)     dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
    mova    m1, %1 ; load dct coeffs
    paddusw m0, %3 ; round
    pmulhuw m0, %2 ; divide
    PSIGNW  m0, m1 ; restore sign
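    ; pmulhuw keeps the high 16 bits of the unsigned product, so the
    ; sequence above computes sign(dct) * (((|dct| + bias) * mf) >> 16),
    ; with paddusw saturating the rounding add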
    ACCUM por, %8, 2, %7+mmsize

;-----------------------------------------------------------------------------
; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
    QUANT_ONE [r0], m6, m7, 0
    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x, 5

;-----------------------------------------------------------------------------
; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
    QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
    QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
    QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
cglobal quant_4x4x4, 3,3,7
    mova m8,  [r1+mmsize*0]
    mova m9,  [r1+mmsize*1]
    mova m10, [r2+mmsize*0]
    mova m11, [r2+mmsize*1]
    packssdw m4, m4 ; AA BB CC DD
    packsswb m4, m4 ; A B C D

QUANT_DC quant_2x2_dc, 1
%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
QUANT_AC quant_4x4, 4
QUANT_AC quant_8x8, 16
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_DC quant_2x2_dc, 1
; Not faster on Conroe, so only used in SSE4 versions
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; dequant
;=============================================================================

;;; %2,%3 dequant_mf[i_mf][y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
    pmadcswd m0, m0, %2, m3
    pmadcswd m0, m0, %2, m3
    pmadcswd m1, m1, %3, m3
%macro DEQUANT_LOOP 3
    %1 [r0+(t0     )*SIZEOF_PIXEL], [r1+t0*2      ], [r1+t0*2+ 8*%3]
    %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
    %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
    %1 [r0+(0   )*SIZEOF_PIXEL], [r1+0    ], [r1+ 8*%3]
%macro DEQUANT16_FLAT 2-5
DECLARE_REG_TMP 6,3,2
DECLARE_REG_TMP 4,3,2
DECLARE_REG_TMP 2,0,1
%macro DEQUANT_START 2
    shr t0d, 8     ; i_qbits = i_qp / 6
    sub t2d, t1d   ; i_mf = i_qp % 6
    add r1, t2     ; dequant_mf[i_mf]
    add r1, r1mp   ; dequant_mf[i_mf]
    jl .rshift32   ; negative qbits => rightshift

;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
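; Rough C model of the dequant paths generated below (a sketch of the
; scalar reference; the qbits bias of -4 applies to the 4x4 case, the
; 8x8 variant uses i_qp/6 - 6):
;
;   void dequant_4x4( int32_t dct[16], int dequant_mf[6][16], int i_qp )
;   {
;       const int i_mf    = i_qp % 6;
;       const int i_qbits = i_qp/6 - 4;
;       if( i_qbits >= 0 )                    /* .lshift path */
;           for( int i = 0; i < 16; i++ )
;               dct[i] = ( dct[i] * dequant_mf[i_mf][i] ) << i_qbits;
;       else                                  /* .rshift32 path */
;       {
;           const int f = 1 << (-i_qbits-1);
;           for( int i = 0; i < 16; i++ )
;               dct[i] = ( dct[i] * dequant_mf[i_mf][i] + f ) >> (-i_qbits);
;       }
;   }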
cglobal dequant_%1x%1, 0,3,6
    DEQUANT_START %2+2, %2
    DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
    DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3

%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
cglobal dequant_%1x%1_flat16, 0,3
    jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
    shr t0d, 8     ; i_qbits = i_qp / 6
    sub t2d, t1d   ; i_mf = i_qp % 6
    lea r1, [dequant%1_scale]
    lea r1, [dequant%1_scale + t2]
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1+8], 8, 24
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1], 0, 8, 64, 72
    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
    DEQUANT16_FLAT [r1], 0, 64
    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
    DEQUANT16_FLAT [r1+32], 32, 96
%endif ; !HIGH_BIT_DEPTH && !AVX
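
;-----------------------------------------------------------------------------
; void dequant_4x4dc( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
; DC dequant applies the single scale dequant_mf[i_mf][0][0] to all
; coefficients; a rough C model (simplified sketch, not the exact
; reference code):
;
;   void dequant_4x4_dc( int32_t dct[16], int dequant_mf[6][16], int i_qp )
;   {
;       const int i_qbits = i_qp/6 - 6;
;       if( i_qbits >= 0 )
;       {
;           const int dmf = dequant_mf[i_qp%6][0] << i_qbits;
;           for( int i = 0; i < 16; i++ )
;               dct[i] *= dmf;
;       }
;       else
;       {
;           const int dmf = dequant_mf[i_qp%6][0];
;           const int f = 1 << (-i_qbits-1);
;           for( int i = 0; i < 16; i++ )
;               dct[i] = ( dct[i] * dmf + f ) >> (-i_qbits);
;       }
;   }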
cglobal dequant_4x4dc, 0,3,6
%rep SIZEOF_PIXEL*16/mmsize
    mova m0, [r0+mmsize*0+x]
    mova m1, [r0+mmsize*1+x]
    mova [r0+mmsize*0+x], m0
    mova [r0+mmsize*1+x], m1
%rep SIZEOF_PIXEL*32/mmsize
    pmadcswd m0, m0, m2, m4
%else ; !HIGH_BIT_DEPTH
%rep SIZEOF_PIXEL*32/mmsize
%endif ; !HIGH_BIT_DEPTH
DEQUANT_DC d, pmaddwd
DEQUANT_DC d, pmaddwd

; t4 is eax for return value.
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
DECLARE_REG_TMP 4,1,2,3,0,5

;-----------------------------------------------------------------------------
; int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------
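; Roughly: shrink each DC level toward zero, keeping a change only if the
; dequantized 2x2 DC transform still rounds to the same reconstruction;
; a zero return means every coefficient could be rounded away (see the
; scalar reference in common/quant.c for the exact semantics).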
%macro OPTIMIZE_CHROMA_2x2_DC 0
%assign %%regs %%regs-1
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
    mova m3, [chroma_dc_dct_mask]
    mova m5, [chroma_dc_dmf_mask]
    mova m3, [chroma_dc_dct_mask_mmx]
    mova m5, [chroma_dc_dmf_mask_mmx]
    pshufd m0, m1, q0101      ;  1  0  3  2   1  0  3  2
    punpcklqdq m1, m1         ;  3  2  1  0   3  2  1  0
    mova m6, [pd_1024]        ; 32<<5, elements are shifted 5 bits to the left
    PSIGNW m0, m3             ; -1 -0  3  2  -1 -0  3  2
    PSIGNW m2, m5             ;  +  -  -  +   -  -  +  +
    paddw m0, m1              ; -1+3 -0+2 1+3 0+2  -1+3 -0+2 1+3 0+2
    pmaddwd m0, m2            ; 0-1-2+3  0-1+2-3  0+1-2-3  0+1+2+3  * dmf
    psrad m2, 16              ;  +  -  -  +
%if notcpuflag(ssse3)
    psrad m1, 31              ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
    jz .ret                   ; if the DC coefficients already round to zero, terminate early
    movsx t3d, word [t0+2*t1] ; dct[coeff]
    pshufd m1, m1, q2100      ; move the next element to high dword
    psubd m3, m5              ; coeff -= sign
    paddd m3, m5              ; coeff += sign
    pshufd m2, m2, q1320      ; - + - + / - - + +
    punpcklqdq m2, m2         ; + + + +

%if HIGH_BIT_DEPTH == 0
OPTIMIZE_CHROMA_2x2_DC
OPTIMIZE_CHROMA_2x2_DC
OPTIMIZE_CHROMA_2x2_DC
OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
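; Rough C model of the denoiser (a sketch of the scalar reference; the asm
; iterates from the end of the arrays, the result is the same):
;
;   void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;   {
;       for( int i = 0; i < size; i++ )
;       {
;           int32_t level = dct[i];
;           int32_t sign  = level >> 31;
;           level = (level + sign) ^ sign;    /* |level| */
;           sum[i] += level;                  /* accumulate noise statistics */
;           level -= offset[i];
;           dct[i] = level < 0 ? 0 : (level ^ sign) - sign;
;       }
;   }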
cglobal denoise_dct, 4,4,8
    mova m2, [r0+r3*4-2*mmsize]
    mova m3, [r0+r3*4-1*mmsize]
    psubd m0, [r2+r3*4-2*mmsize]
    psubd m1, [r2+r3*4-1*mmsize]
    mova [r0+r3*4-2*mmsize], m0
    mova [r0+r3*4-1*mmsize], m1
    paddd m4, [r1+r3*4-2*mmsize]
    paddd m5, [r1+r3*4-1*mmsize]
    mova [r1+r3*4-2*mmsize], m4
    mova [r1+r3*4-1*mmsize], m5
%else ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
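; Same algorithm as the high-bit-depth version above: the saturating
; psubusw computes max(0, |dct|-offset) without a branch, and the sums
; are still accumulated at 32-bit precision.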
cglobal denoise_dct, 4,4,7
    mova m2, [r0+r3*2-2*mmsize]
    mova m3, [r0+r3*2-1*mmsize]
    psubusw m4, m0, [r2+r3*2-2*mmsize]
    psubusw m5, m1, [r2+r3*2-1*mmsize]
    mova [r0+r3*2-2*mmsize], m4
    mova [r0+r3*2-1*mmsize], m5
    paddd m2, [r1+r3*4-4*mmsize]
    paddd m0, [r1+r3*4-3*mmsize]
    paddd m3, [r1+r3*4-2*mmsize]
    paddd m1, [r1+r3*4-1*mmsize]
    mova [r1+r3*4-4*mmsize], m2
    mova [r1+r3*4-3*mmsize], m0
    mova [r1+r3*4-2*mmsize], m3
    mova [r1+r3*4-1*mmsize], m1
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
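; Rough C model of the score (a sketch of the scalar reference; levels with
; magnitude > 1 disqualify the block immediately, otherwise each run of
; zeros preceding a +/-1 level is scored through decimate_table4/8):
;
;   int decimate_score( dctcoef *dct, int i_max )
;   {
;       const uint8_t *tab = (i_max == 64) ? decimate_table8 : decimate_table4;
;       int idx = i_max - 1, score = 0;
;       while( idx >= 0 && dct[idx] == 0 )
;           idx--;
;       while( idx >= 0 )
;       {
;           if( (unsigned)(dct[idx--] + 1) > 2 )  /* |level| > 1 */
;               return 9;
;           int run = 0;
;           while( idx >= 0 && dct[idx] == 0 )
;           {
;               idx--;
;               run++;
;           }
;           score += tab[run];
;       }
;       return score;
;   }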
%macro DECIMATE_MASK 5
    packssdw xmm0, [%3+16]
    packssdw xmm1, [%3+48]
    ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4
    ABSW  xmm0, [%3+ 0], xmm3
    ABSW  xmm1, [%3+16], xmm4
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packssdw mm2, [%3+40]
    packssdw mm3, [%3+56]
    ABSW2 mm0, mm1, mm0, mm1, mm6, mm7
    ABSW2 mm2, mm3, mm2, mm3, mm6, mm7

cextern decimate_table4
cextern decimate_table8

; A LUT is faster than bsf on older AMD processors.
; This is not true for score64.
cglobal decimate_score%1, 1,3
    lea r4, [decimate_table4]
    lea r5, [decimate_mask_table4]
%define mask_table r5
%define table decimate_table4
%define mask_table decimate_mask_table4
    DECIMATE_MASK edx, eax, r0, [pb_1], ecx
%if cpuflag(slowctz)
    movzx eax, byte [mask_table + rcx]
    add al, byte [table + rcx]
    add al, byte [mask_table + rdx]
    add al, byte [table + rcx]
%if ARCH_X86_64 == 0
INIT_MMX mmx2, slowctz
INIT_XMM sse2, slowctz
INIT_XMM ssse3, slowctz

%macro DECIMATE8x8 0
cglobal decimate_score64, 1,5
    lea r4, [decimate_table8]
%define table decimate_table8
    DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
    DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
    add al, byte [table + rcx]
cglobal decimate_score64, 1,6
cglobal decimate_score64, 1,5
    DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
    DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
    DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
    DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
    jne .ret9   ; r0 is zero at this point, so we don't need to zero it
    add r0b, byte [decimate_table8 + ecx]
    cmp r0, 6   ; score64's threshold is never higher than 6
    jge .ret9   ; this early termination is only useful on 32-bit because it can be done in the latency after shrd
%if ARCH_X86_64 == 0

;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
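; Rough C model (callers generally guarantee at least one nonzero
; coefficient; the element count is baked into each asm variant below):
;
;   int coeff_last( dctcoef *dct, int i_count )
;   {
;       int i = i_count - 1;
;       while( i >= 0 && dct[i] == 0 )
;           i--;
;       return i;    /* index of the last nonzero coefficient */
;   }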
%macro LAST_MASK 3-4
    packssdw mm0, [%3+8]
    movdqa   xmm0, [%3+ 0]
    packssdw xmm0, [%3+16]
    movdqa   xmm1, [%3+32]
    packssdw xmm0, [%3+16]
    packssdw xmm1, [%3+48]
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packssdw mm0, [%3+ 8]
    packssdw mm1, [%3+24]
    packssdw mm3, [%3+40]
    packssdw mm4, [%3+56]

%macro COEFF_LAST4 0
cglobal coeff_last4, 1,3
    LAST_MASK 4, r1d, r0
INIT_MMX mmx2, lzcnt

%macro COEFF_LAST8 0
cglobal coeff_last8, 1,3
    LAST_MASK 8, r1d, r0
%if ARCH_X86_64 == 0
INIT_XMM sse2, lzcnt

%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
    packsswb mm0, [%3+ 8]
    movdqa   xmm0, [%3+ 0]
    packsswb xmm0, [%3+16]
    packsswb mm0, [%3+ 8]
    packsswb mm1, [%3+24]

%macro COEFF_LAST48 0
cglobal coeff_last4, 1,1
cglobal coeff_last4, 0,3
    lea eax, [eax+ecx*2]
cglobal coeff_last8, 1,3
    LAST_MASK 8, r1d, r0, r2d
INIT_MMX mmx2, lzcnt
%endif ; HIGH_BIT_DEPTH

cglobal coeff_last15, 1,3
    LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
cglobal coeff_last16, 1,3
    LAST_MASK 16, r1d, r0, r2d
%if ARCH_X86_64 == 0
cglobal coeff_last64, 1, 5-mmsize/16
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32, r4d
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*48, r4d
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d
    LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d
cglobal coeff_last64, 1,3
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
    LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
%if ARCH_X86_64 == 0
INIT_XMM sse2, lzcnt

;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
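; Rough C model (simplified; the stores at offset +8 into the runlevel
; struct below skip its leading fields):
;
;   int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;   {
;       int i = coeff_last( dct );   /* index of the last nonzero coeff */
;       int total = 0;
;       runlevel->last = i;
;       do
;       {
;           runlevel->level[total++] = dct[i];
;           while( --i >= 0 && dct[i] == 0 )
;               ;
;       } while( i >= 0 );
;       return total;                /* number of nonzero levels written */
;   }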
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
DECLARE_REG_TMP 3,1,2,0,4,5,6
DECLARE_REG_TMP 0,1,2,3,4,5,6
DECLARE_REG_TMP 6,3,2,1,4,5,0

%macro COEFF_LEVELRUN 1
cglobal coeff_level_run%1,0,7
    LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
    LZCOUNT t3d, t5d, 0x1f
    LZCOUNT t3d, t5d, 0x1f
    mov [t1+t6*4+ 8], t2d
    mov [t1+t6*2+ 8], t2w
%if ARCH_X86_64 == 0
INIT_XMM sse2, lzcnt
INIT_MMX mmx2, lzcnt