1 ;*****************************************************************************
2 ;* quant-a.asm: x86 quantization and level-run
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Christian Heine <sennindemokrit@gmx.net>
9 ;* Oskar Arvidsson <oskar@irock.se>
10 ;* Henrik Gramner <hengar-6@student.ltu.se>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; NOTE(review): the dw rows below appear to be the body of a dequant
; scale-matrix macro (DQM4/DQM8-style: 6 distinct scale values expanded
; into a power-of-2-strided row layout) whose %macro line is not visible
; in this view — confirm against the full file.
36 dw %1, %2, %1, %2, %2, %3, %2, %3
39 dw %1, %4, %5, %4, %1, %4, %5, %4
40 dw %4, %2, %6, %2, %4, %2, %6, %2
41 dw %5, %6, %3, %6, %5, %6, %3, %6
42 ; last line not used, just padding for power-of-2 stride
; Six DQM8 invocations — presumably one per i_mf value (i_qp % 6),
; matching H.264's 6-entry dequant scale tables. TODO confirm: the
; DQM8 macro definition is elided from this view.
55 DQM8 20, 18, 32, 19, 25, 24
56 DQM8 22, 19, 35, 21, 28, 26
57 DQM8 26, 23, 42, 24, 33, 31
58 DQM8 28, 25, 45, 26, 35, 33
59 DQM8 32, 28, 51, 30, 40, 38
60 DQM8 36, 32, 58, 34, 46, 43
; NOTE(review): this byte table's label line is elided from this view.
; Judging by the DECIMATE4x4 code further down (which indexes
; "mask_table + rcx" with a byte load), this looks like the
; decimate_mask_table4 LUT: precomputed decimate scores per 8-bit
; coefficient mask — TODO confirm against the full file.
63 db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
64 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
65 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
66 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
67 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
68 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
69 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
70 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
71 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Sign masks consumed by optimize_chroma_dc (see the PSIGNW uses on m3/m5
; further down).  Two variants of each mask:
;  *_mmx:  0/-1 words — for the xor/sub PSIGNW emulation used on the
;          sse2/ssse3 paths (a 0 word leaves the element's sign unchanged).
;  plain:  1/-1 words — for the real SSSE3 psignw, where a 0 multiplier
;          would zero the element outright, so +1 is used instead.
73 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
74 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
75 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
76 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
86 %macro QUANT_DC_START_MMX 0
95 %endif ; HIGH_BIT_DEPTH
98 %macro QUANT_DC_START_SSSE3 0
123 %macro PSIGNW_SSSE3 2
127 %macro PSIGND_MMX 2-3
138 %macro PSIGND_SSSE3 2+
154 %macro QUANT_END_MMX 0
167 cmp ecx, (1<<mmsize)-1
177 %macro QUANT_END_SSE4 0
183 %ifdef HIGH_BIT_DEPTH
184 %macro QUANT_ONE_DC_MMX 4
204 %macro QUANT_TWO_DC_MMX 4
205 QUANT_ONE_DC_MMX %1, %2, %3, %4
206 QUANT_ONE_DC_MMX %1+mmsize, %2, %3, %4+mmsize
209 %macro QUANT_ONE_DC_SSE4 4
224 %macro QUANT_TWO_DC_SSE4 4
247 %macro QUANT_ONE_AC_MMX 4
270 %macro QUANT_TWO_AC_MMX 4
271 QUANT_ONE_AC_MMX %1, %2, %3, %4
272 QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
275 %macro QUANT_TWO_AC_SSE4 4
281 paddd m3, [%3+mmsize]
283 pmulld m3, [%2+mmsize]
298 ;-----------------------------------------------------------------------------
299 ; int quant_MxN_dc( int32_t dct[M*N], int mf, int bias )
300 ;-----------------------------------------------------------------------------
302 cglobal quant_%1x%2_dc_%3, 3,3,8*(mmsize/16)
304 %if %1*%2 <= mmsize/4
305 QUANT_ONE_DC r0, m6, m7, 0
308 %rep %1*%2/(mmsize/2)
309 QUANT_TWO_DC r0+x, m6, m7, x
317 ;-----------------------------------------------------------------------------
318 ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
319 ;-----------------------------------------------------------------------------
321 cglobal quant_%1x%2_%3, 3,3,8*(mmsize/16)
323 %rep %1*%2/(mmsize/2)
324 QUANT_TWO_AC r0+x, r1+x, r2+x, x
331 %define QUANT_TWO_AC QUANT_TWO_AC_MMX
332 %define QUANT_ONE_DC QUANT_ONE_DC_MMX
333 %define QUANT_TWO_DC QUANT_TWO_DC_MMX
334 %define QUANT_END QUANT_END_MMX
335 %define PABSD PABSD_MMX
336 %define PSIGND PSIGND_MMX
343 %define PABSD PABSD_SSSE3
344 %define PSIGND PSIGND_SSSE3
350 %define QUANT_TWO_AC QUANT_TWO_AC_SSE4
351 %define QUANT_ONE_DC QUANT_ONE_DC_SSE4
352 %define QUANT_TWO_DC QUANT_TWO_DC_SSE4
353 %define QUANT_END QUANT_END_SSE4
365 %endif ; HIGH_BIT_DEPTH
367 %ifndef HIGH_BIT_DEPTH
369 ;;; %1 (m64) dct[y][x]
370 ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
371 ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
372 mova m1, %1 ; load dct coeffs
374 paddusw m0, %3 ; round
375 pmulhuw m0, %2 ; divide
376 PSIGNW m0, m1 ; restore sign
407 ;-----------------------------------------------------------------------------
408 ; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
409 ;-----------------------------------------------------------------------------
410 %macro QUANT_DC 2-3 0
414 QUANT_ONE [r0], m6, m7, 0
418 QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
426 ;-----------------------------------------------------------------------------
427 ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
428 ;-----------------------------------------------------------------------------
433 QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
441 %define QUANT_END QUANT_END_MMX
442 %define PABSW PABSW_MMX
443 %define PSIGNW PSIGNW_MMX
444 %define QUANT_DC_START QUANT_DC_START_MMX
445 QUANT_DC quant_2x2_dc_mmx2, 1
446 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
447 QUANT_DC quant_4x4_dc_mmx2, 4
448 QUANT_AC quant_4x4_mmx, 4
449 QUANT_AC quant_8x8_mmx, 16
453 QUANT_DC quant_4x4_dc_sse2, 2, 8
454 QUANT_AC quant_4x4_sse2, 2
455 QUANT_AC quant_8x8_sse2, 8
457 %define PABSW PABSW_SSSE3
458 %define PSIGNW PSIGNW_SSSE3
459 QUANT_DC quant_4x4_dc_ssse3, 2, 8
460 QUANT_AC quant_4x4_ssse3, 2
461 QUANT_AC quant_8x8_ssse3, 8
464 QUANT_DC quant_2x2_dc_ssse3, 1
465 %define QUANT_END QUANT_END_SSE4
466 ;Not faster on Conroe, so only used in SSE4 versions
467 %define QUANT_DC_START QUANT_DC_START_SSSE3
469 QUANT_DC quant_4x4_dc_sse4, 2, 8
470 QUANT_AC quant_4x4_sse4, 2
471 QUANT_AC quant_8x8_sse4, 8
472 %endif ; !HIGH_BIT_DEPTH
476 ;=============================================================================
478 ;=============================================================================
482 ;;; %2,%3 dequant_mf[i_mf][y][x]
485 %ifdef HIGH_BIT_DEPTH
498 ;;; %2,%3 dequant_mf[i_mf][y][x]
503 %ifdef HIGH_BIT_DEPTH
521 %macro DEQUANT_LOOP 3
525 %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
526 %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
531 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
532 %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
537 %macro DEQUANT16_FLAT 2-5
555 DECLARE_REG_TMP 6,3,2
557 DECLARE_REG_TMP 4,3,2
559 DECLARE_REG_TMP 2,0,1
562 %macro DEQUANT_START 2
565 shr t0d, 8 ; i_qbits = i_qp / 6
568 sub t2d, t1d ; i_mf = i_qp % 6
571 add r1, t2 ; dequant_mf[i_mf]
573 add r1, r1mp ; dequant_mf[i_mf]
577 jl .rshift32 ; negative qbits => rightshift
580 ;-----------------------------------------------------------------------------
581 ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
582 ;-----------------------------------------------------------------------------
584 cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
586 DEQUANT_START %3+2, %3
590 DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
599 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
602 cglobal dequant_%2x%2_flat16_%1, 0,3
606 jl dequant_%2x%2_%1.skip_prologue
610 shr t0d, 8 ; i_qbits = i_qp / 6
613 sub t2d, t1d ; i_mf = i_qp % 6
616 lea r1, [dequant%2_scale]
619 lea r1, [dequant%2_scale + t2]
625 DEQUANT16_FLAT [r1], 0, 16
626 DEQUANT16_FLAT [r1+8], 8, 24
628 DEQUANT16_FLAT [r1], 0, 16
631 DEQUANT16_FLAT [r1], 0, 8, 64, 72
632 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
633 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
634 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
636 DEQUANT16_FLAT [r1], 0, 64
637 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
638 DEQUANT16_FLAT [r1+32], 32, 96
644 %ifdef HIGH_BIT_DEPTH
646 DEQUANT sse2, 4, 4, 1
647 DEQUANT sse4, 4, 4, 1
648 DEQUANT sse2, 8, 6, 1
649 DEQUANT sse4, 8, 6, 1
657 DEQUANT sse2, 4, 4, 2
658 DEQUANT sse2, 8, 6, 2
665 cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
672 %ifdef HIGH_BIT_DEPTH
675 %rep SIZEOF_PIXEL*16/mmsize
676 mova m0, [r0+mmsize*0+x]
677 mova m1, [r0+mmsize*1+x]
680 mova [r0+mmsize*0+x], m0
681 mova [r0+mmsize*1+x], m1
685 %else ; !HIGH_BIT_DEPTH
693 %rep SIZEOF_PIXEL*16/mmsize
694 mova m0, [r0+mmsize*0+x]
695 mova m1, [r0+mmsize*1+x]
698 mova [r0+mmsize*0+x], m0
699 mova [r0+mmsize*1+x], m1
702 %endif ; HIGH_BIT_DEPTH
714 %ifdef HIGH_BIT_DEPTH
716 %rep SIZEOF_PIXEL*32/mmsize
732 %rep SIZEOF_PIXEL*32/mmsize
748 %ifdef HIGH_BIT_DEPTH
763 ; t4 is eax for return value.
765 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
767 DECLARE_REG_TMP 4,1,2,3,0,5
770 ;-----------------------------------------------------------------------------
771 ; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
772 ;-----------------------------------------------------------------------------
774 ; %2 == 1 for sse2 or ssse3, 0 for sse4/avx
775 %macro OPTIMIZE_CHROMA_DC 2
778 %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
780 cglobal optimize_chroma_dc_%1, 0,%%regs,7
791 mova m3, [chroma_dc_dct_mask_mmx]
792 mova m5, [chroma_dc_dmf_mask_mmx]
794 mova m3, [chroma_dc_dct_mask]
795 mova m5, [chroma_dc_dmf_mask]
798 pshufd m0, m1, 00010001b ; 1 0 3 2 1 0 3 2
800 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
801 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
802 PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
803 PSIGNW m2, m5 ; + - - + - - + +
804 paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
805 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
807 psrad m2, 16 ; + - - +
812 psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
824 jz .ret ; if the DC coefficients already round to zero, terminate early
827 movsx t3d, word [t0+2*t1] ; dct[coeff]
828 pshufd m6, m1, 11111111b
829 pshufd m1, m1, 10010000b ; move the next element to high dword
838 psubd m3, m5 ; coeff -= sign
849 paddd m3, m5 ; coeff += sign
854 pshufd m2, m2, 01111000b ; - + - + / - - + +
865 punpcklqdq m2, m2 ; + + + +
873 %define PSIGNW PSIGNW_MMX
874 %define PSIGND PSIGND_MMX
875 OPTIMIZE_CHROMA_DC sse2, 1
876 %define PSIGNW PSIGNW_SSSE3
877 %define PSIGND PSIGND_SSSE3
878 OPTIMIZE_CHROMA_DC ssse3, 1
879 OPTIMIZE_CHROMA_DC sse4, 0
881 OPTIMIZE_CHROMA_DC avx, 0
883 %ifdef HIGH_BIT_DEPTH
884 ;-----------------------------------------------------------------------------
885 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
886 ;-----------------------------------------------------------------------------
887 %macro DENOISE_DCT 1-2 0
888 cglobal denoise_dct_%1, 4,4,%2
892 mova m2, [r0+r3*4+0*mmsize]
893 mova m3, [r0+r3*4+1*mmsize]
898 psubd m0, [r2+r3*4+0*mmsize]
899 psubd m1, [r2+r3*4+1*mmsize]
906 mova [r0+r3*4+0*mmsize], m0
907 mova [r0+r3*4+1*mmsize], m1
908 paddd m4, [r1+r3*4+0*mmsize]
909 paddd m5, [r1+r3*4+1*mmsize]
910 mova [r1+r3*4+0*mmsize], m4
911 mova [r1+r3*4+1*mmsize], m5
916 %define PABSD PABSD_MMX
917 %define PSIGND PSIGND_MMX
924 %define PABSD PABSD_SSSE3
925 %define PSIGND PSIGND_SSSE3
930 %else ; !HIGH_BIT_DEPTH
932 ;-----------------------------------------------------------------------------
933 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
934 ;-----------------------------------------------------------------------------
935 %macro DENOISE_DCT 1-2 0
936 cglobal denoise_dct_%1, 4,4,%2
940 mova m2, [r0+r3*2+0*mmsize]
941 mova m3, [r0+r3*2+1*mmsize]
944 psubusw m4, m0, [r2+r3*2+0*mmsize]
945 psubusw m5, m1, [r2+r3*2+1*mmsize]
948 mova [r0+r3*2+0*mmsize], m4
949 mova [r0+r3*2+1*mmsize], m5
954 paddd m2, [r1+r3*4+0*mmsize]
955 paddd m0, [r1+r3*4+1*mmsize]
956 paddd m3, [r1+r3*4+2*mmsize]
957 paddd m1, [r1+r3*4+3*mmsize]
958 mova [r1+r3*4+0*mmsize], m2
959 mova [r1+r3*4+1*mmsize], m0
960 mova [r1+r3*4+2*mmsize], m3
961 mova [r1+r3*4+3*mmsize], m1
966 %define PABSW PABSW_MMX
967 %define PSIGNW PSIGNW_MMX
974 %define PABSW PABSW_SSSE3
975 %define PSIGNW PSIGNW_SSSE3
980 %endif ; !HIGH_BIT_DEPTH
982 ;-----------------------------------------------------------------------------
983 ; int decimate_score( dctcoef *dct )
984 ;-----------------------------------------------------------------------------
986 %macro DECIMATE_MASK_SSE2 7
987 %ifdef HIGH_BIT_DEPTH
990 packssdw xmm0, [%3+16]
991 packssdw xmm1, [%3+48]
996 ABS2_MMX xmm0, xmm1, xmm3, xmm4
1003 movdqa xmm0, [%3+ 0]
1004 movdqa xmm1, [%3+16]
1005 ABS2_MMX xmm0, xmm1, xmm3, xmm4
1016 %macro DECIMATE_MASK_MMX 7
1017 %ifdef HIGH_BIT_DEPTH
1022 packssdw mm0, [%3+ 8]
1023 packssdw mm1, [%3+24]
1024 packssdw mm2, [%3+40]
1025 packssdw mm3, [%3+56]
1032 ABS2_MMX mm0, mm1, mm6, mm7
1033 ABS2_MMX mm2, mm3, mm6, mm7
1052 cextern decimate_table4
1053 cextern decimate_table8
1055 %macro DECIMATE4x4 4
1057 ;A LUT is faster than bsf on AMD processors.
1058 ;This is not true for score64.
1059 cglobal decimate_score%1_%2, 1,3
1061 lea r10, [decimate_table4]
1062 lea r11, [decimate_mask_table4]
1064 %define mask_table r11
1066 %define table decimate_table4
1067 %define mask_table decimate_mask_table4
1069 DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
1079 movzx eax, byte [mask_table + rcx]
1088 add al, byte [table + rcx]
1089 add al, byte [mask_table + rdx]
1094 add al, byte [table + rcx]
1108 %define DECIMATE_MASK DECIMATE_MASK_MMX
1109 DECIMATE4x4 15, mmx2, 0, 0
1110 DECIMATE4x4 16, mmx2, 0, 0
1111 DECIMATE4x4 15, mmx2_slowctz, 1, 0
1112 DECIMATE4x4 16, mmx2_slowctz, 1, 0
1115 %define DECIMATE_MASK DECIMATE_MASK_SSE2
1116 DECIMATE4x4 15, sse2, 0, 0
1117 DECIMATE4x4 16, sse2, 0, 0
1118 DECIMATE4x4 15, sse2_slowctz, 1, 0
1119 DECIMATE4x4 16, sse2_slowctz, 1, 0
1120 DECIMATE4x4 15, ssse3, 0, 1
1121 DECIMATE4x4 16, ssse3, 0, 1
1122 DECIMATE4x4 15, ssse3_slowctz, 1, 1
1123 DECIMATE4x4 16, ssse3_slowctz, 1, 1
1125 %macro DECIMATE8x8 2
1128 cglobal decimate_score64_%1, 1,4
1130 lea r10, [decimate_table8]
1133 %define table decimate_table8
1136 DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
1139 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
1142 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
1146 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
1156 add al, byte [table + rcx]
1167 cglobal decimate_score64_%1, 1,6
1169 cglobal decimate_score64_%1, 1,5
1172 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
1175 DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
1178 DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
1180 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
1188 jne .ret9 ;r0 is zero at this point, so we don't need to zero it
1195 add r0b, byte [decimate_table8 + ecx]
1198 cmp r0, 6 ;score64's threshold is never higher than 6
1199 jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
1227 %define DECIMATE_MASK DECIMATE_MASK_MMX
1231 %define DECIMATE_MASK DECIMATE_MASK_SSE2
1233 DECIMATE8x8 ssse3, 1
1235 ;-----------------------------------------------------------------------------
1236 ; int coeff_last( dctcoef *dct )
1237 ;-----------------------------------------------------------------------------
1248 %ifdef HIGH_BIT_DEPTH
1249 %macro LAST_MASK4_MMX 2-3
1251 packssdw mm0, [%2+8]
1257 %macro LAST_MASK_SSE2 2-3
1258 movdqa xmm0, [%2+ 0]
1259 movdqa xmm1, [%2+32]
1260 packssdw xmm0, [%2+16]
1261 packssdw xmm1, [%2+48]
1267 %macro LAST_MASK_MMX 3
1270 packssdw mm0, [%2+ 8]
1271 packssdw mm1, [%2+24]
1274 packssdw mm3, [%2+40]
1275 packssdw mm4, [%2+56]
1286 %macro COEFF_LAST4 1
1287 cglobal coeff_last4_%1, 1,3
1289 LAST_MASK4_MMX r1d, r0
1296 %define LAST LAST_X86
1298 %define LAST LAST_SSE4A
1299 COEFF_LAST4 mmx2_lzcnt
1301 %else ; !HIGH_BIT_DEPTH
1302 %macro LAST_MASK4_MMX 2-3
1309 %macro LAST_MASK_SSE2 2-3
1310 movdqa xmm0, [%2+ 0]
1311 packsswb xmm0, [%2+16]
1316 %macro LAST_MASK_MMX 3
1319 packsswb mm0, [%2+ 8]
1320 packsswb mm1, [%2+24]
1329 %macro COEFF_LAST4 1
1331 cglobal coeff_last4_%1, 1,1
1332 LAST rax, [r0], 0x3f
1336 cglobal coeff_last4_%1, 0,3
1345 lea eax, [eax+ecx*2]
1350 %define LAST LAST_X86
1352 %define LAST LAST_SSE4A
1353 COEFF_LAST4 mmx2_lzcnt
1354 %endif ; HIGH_BIT_DEPTH
1357 cglobal coeff_last15_%1, 1,3
1359 LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
1365 cglobal coeff_last16_%1, 1,3
1367 LAST_MASK r1d, r0, r2d
1373 cglobal coeff_last64_%1, 1, 5-mmsize/16
1375 LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
1376 LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
1381 LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
1382 LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
1393 cglobal coeff_last64_%1, 1,4
1395 LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
1396 LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
1397 LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
1398 LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
1411 %define LAST LAST_X86
1414 %define LAST_MASK LAST_MASK_MMX
1418 %define LAST_MASK LAST_MASK_SSE2
1420 %define LAST LAST_SSE4A
1421 COEFF_LAST sse2_lzcnt
1423 ;-----------------------------------------------------------------------------
1424 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
1425 ;-----------------------------------------------------------------------------
1427 %macro LZCOUNT_X86 3
1432 %macro LZCOUNT_SSE4A 3
1436 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
1438 DECLARE_REG_TMP 3,1,2,0,4,5,6
1439 %elifdef ARCH_X86_64
1440 DECLARE_REG_TMP 0,1,2,3,4,5,6
1442 DECLARE_REG_TMP 6,3,2,1,4,5,0
1445 %macro COEFF_LEVELRUN 2
1446 cglobal coeff_level_run%2_%1,0,7
1450 LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
1452 shl t5d, 32-((%2+1)&~1)
1454 LZCOUNT t3d, t5d, 0x1f
1461 LZCOUNT t3d, t5d, 0x1f
1462 %ifdef HIGH_BIT_DEPTH
1464 mov [t1+t6 +4+16*4], t3b
1465 mov [t1+t6*4+ 4], t2d
1468 mov [t1+t6 +4+16*2], t3b
1469 mov [t1+t6*2+ 4], t2w
1480 %define LZCOUNT LZCOUNT_X86
1482 %define LAST_MASK LAST_MASK_MMX
1483 COEFF_LEVELRUN mmx2, 15
1484 COEFF_LEVELRUN mmx2, 16
1486 %define LAST_MASK LAST_MASK4_MMX
1487 COEFF_LEVELRUN mmx2, 4
1489 %define LAST_MASK LAST_MASK_SSE2
1490 COEFF_LEVELRUN sse2, 15
1491 COEFF_LEVELRUN sse2, 16
1492 %define LZCOUNT LZCOUNT_SSE4A
1493 COEFF_LEVELRUN sse2_lzcnt, 15
1494 COEFF_LEVELRUN sse2_lzcnt, 16
1496 %define LAST_MASK LAST_MASK4_MMX
1497 COEFF_LEVELRUN mmx2_lzcnt, 4