+; Instantiations of the DEQUANT_DC template (macro defined earlier in the
+; file, outside this hunk) for each supported bit depth / instruction set.
+; HIGH_BIT_DEPTH builds use dword ('d') elements multiplied via pmaddwd;
+; 8-bit builds use word ('w') elements multiplied via pmullw.
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+DEQUANT_DC d, pmaddwd
+INIT_XMM xop
+DEQUANT_DC d, pmaddwd
+%else
+%if ARCH_X86_64 == 0
+; MMX fallback is only built on x86-32; x86-64 always has SSE2.
+INIT_MMX mmx2
+DEQUANT_DC w, pmullw
+%endif
+INIT_XMM sse2
+DEQUANT_DC w, pmullw
+INIT_XMM avx
+DEQUANT_DC w, pmullw
+%endif
+
+; t4 is eax for return value.
+; Map the abstract temporaries t0-t5 onto physical registers so the code
+; below is written once and works on both the 32-bit and 64-bit ABIs.
+%if ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
+%else
+ DECLARE_REG_TMP 4,1,2,3,0,5
+%endif
+
+;-----------------------------------------------------------------------------
+; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
+;-----------------------------------------------------------------------------
+
+; NOTE(review): from the visible code this decrements each nonzero dct[coeff]
+; toward zero (psubd "coeff -= sign" below) while checking, via the >>11
+; rounding mask, whether the biased dequantized transform still rounds to the
+; same values — confirm against the C reference x264_optimize_chroma_2x2_dc.
+; The macro body continues beyond this hunk.
+%macro OPTIMIZE_CHROMA_2x2_DC 0
+; Count the GPRs cglobal must provide: t0-t4, one fewer with SSE4 (ptest
+; replaces the pmovmskb/cmp sequence that needs t5).
+%assign %%regs 5
+%if cpuflag(sse4)
+ %assign %%regs %%regs-1
+%endif
+%if ARCH_X86_64 == 0
+ %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
+%endif
+cglobal optimize_chroma_2x2_dc, 0,%%regs,7
+ movifnidn t0, r0mp
+ movd m2, r1m
+ movq m1, [t0] ; load the four 16-bit DC coefficients
+%if cpuflag(sse4)
+ pcmpeqb m4, m4
+ pslld m4, 11 ; m4 = ~0x7ff per dword: the bits NOT discarded by >>11
+%else
+ pxor m4, m4 ; m4 = 0, comparand for pcmpeqd below
+%endif
+; Sign-mask tables (defined elsewhere in the file); the non-SSSE3 variants
+; are laid out for the PSIGN emulation path.
+%if cpuflag(ssse3)
+ mova m3, [chroma_dc_dct_mask]
+ mova m5, [chroma_dc_dmf_mask]
+%else
+ mova m3, [chroma_dc_dct_mask_mmx]
+ mova m5, [chroma_dc_dmf_mask_mmx]
+%endif
+ pshuflw m2, m2, 0 ; broadcast dequant_mf to the low 4 words...
+ pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
+ punpcklqdq m2, m2 ; ...and then to all 8 words
+ punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
+ mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
+ PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
+ PSIGNW m2, m5 ; + - - + - - + +
+ paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
+ pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
+ punpcklwd m1, m1 ; widen each coeff word to a dword slot
+ psrad m2, 16 ; + - - +
+ mov t1d, 3 ; t1 = coefficient index, starting at dct[3]
+ paddd m0, m6 ; add the rounding bias
+ xor t4d, t4d ; t4 (eax) = return value, initially 0
+%if notcpuflag(ssse3)
+ psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
+%endif
+; Early-out test: do all biased transform values have their >>11 bits clear?
+%if cpuflag(sse4)
+ ptest m0, m4
+%else
+ mova m6, m0
+ SWAP 0, 6 ; SWAP only exchanges register *names*; values are equal here
+ psrad m6, 11
+ pcmpeqd m6, m4
+ pmovmskb t5d, m6
+ cmp t5d, 0xffff
+%endif
+ jz .ret ; if the DC coefficients already round to zero, terminate early
+ mova m3, m0 ; m3 = working copy of the biased transform
+.outer_loop:
+ movsx t3d, word [t0+2*t1] ; dct[coeff]
+ pshufd m6, m1, q3333 ; select this coefficient's sign dword
+ pshufd m1, m1, q2100 ; move the next element to high dword
+ PSIGND m5, m2, m6 ; m5 = +/-dmf pattern for this coefficient
+ test t3d, t3d
+ jz .loop_end ; skip coefficients that are already zero
+.outer_loop_0:
+ mov t2d, t3d ; t2 = original coefficient value
+ sar t3d, 31
+ or t3d, 1 ; t3 = sign(coeff), i.e. -1 or +1
+.inner_loop:
+ psubd m3, m5 ; coeff -= sign
+ pxor m6, m0, m3 ; m6 = bits that changed vs. the original transform
+%if cpuflag(sse4)
+ ptest m6, m4