1 ;*****************************************************************************
2 ;* dct-a.asm: x86 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2010 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
10 ;* Fiona Glaser <fiona@x264.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
33 %macro SHUFFLE_16BIT 8
42 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
43 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
44 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
45 pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
46 pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
47 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
48 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
63 SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%6
64 SUMSUB_BADC %1, m%5, m%3, m%4, m%2, m%6
68 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
90 ;-----------------------------------------------------------------------------
91 ; void dct4x4dc( dctcoef d[4][4] )
92 ;-----------------------------------------------------------------------------
93 cglobal dct4x4dc_sse2, 1,1,5
98 WALSH4_1D d, 0,1,2,3,4
99 TRANSPOSE4x4D 0,1,2,3,4
101 WALSH4_1D d, 0,1,2,3,4
114 cglobal dct4x4dc_mmx, 1,1
119 movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
120 WALSH4_1D w, 0,1,2,3,4
121 TRANSPOSE4x4W 0,1,2,3,4
122 SUMSUB_BADC w, m1, m0, m3, m2, m4
132 %endif ; HIGH_BIT_DEPTH
134 %ifdef HIGH_BIT_DEPTH
135 ;-----------------------------------------------------------------------------
136 ; void idct4x4dc( int32_t d[4][4] )
137 ;-----------------------------------------------------------------------------
139 cglobal idct4x4dc_sse2, 1,1
144 WALSH4_1D d,0,1,2,3,4
145 TRANSPOSE4x4D 0,1,2,3,4
146 WALSH4_1D d,0,1,2,3,4
155 ;-----------------------------------------------------------------------------
156 ; void idct4x4dc( int16_t d[4][4] )
157 ;-----------------------------------------------------------------------------
158 cglobal idct4x4dc_mmx, 1,1
163 WALSH4_1D w,0,1,2,3,4
164 TRANSPOSE4x4W 0,1,2,3,4
165 WALSH4_1D w,0,1,2,3,4
171 %endif ; HIGH_BIT_DEPTH
174 %ifdef HIGH_BIT_DEPTH
175 ;-----------------------------------------------------------------------------
176 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
177 ;-----------------------------------------------------------------------------
178 cglobal sub4x4_dct_mmx, 3,3
180 LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
181 LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
182 LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
183 LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
185 TRANSPOSE4x4W 0,1,2,3,4
187 SUMSUB_BADC w, m3, m0, m2, m1
188 SUMSUB_BA w, m2, m3, m4
189 DCT_UNPACK m2, m4, m5
190 DCT_UNPACK m3, m6, m7
191 mova [r0+ 0], m2 ; s03 + s12
193 mova [r0+32], m3 ; s03 - s12
196 DCT_UNPACK m0, m2, m4
197 DCT_UNPACK m1, m3, m5
198 SUMSUB2_AB d, m0, m1, m4
199 SUMSUB2_AB d, m2, m3, m5
200 mova [r0+16], m0 ; d03*2 + d12
202 mova [r0+48], m4 ; d03 - 2*d12
208 cglobal sub4x4_dct_%1, 3,3
211 LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
212 LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
213 LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
214 LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
217 LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
220 TRANSPOSE4x4W 0,1,2,3,4
231 %endif ; HIGH_BIT_DEPTH
233 %ifdef HIGH_BIT_DEPTH
234 ;-----------------------------------------------------------------------------
235 ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
236 ;-----------------------------------------------------------------------------
237 %macro STORE_DIFFx2 6
244 CLIPW %1, %4, [pw_pixel_max]
250 cglobal add4x4_idct_sse2, 2,2,6
251 add r0, 4*FDEC_STRIDE
257 IDCT4_1D d,0,1,2,3,4,5
258 TRANSPOSE4x4D 0,1,2,3,4
260 IDCT4_1D d,0,1,2,3,4,5
262 STORE_DIFFx2 m0, m1, m4, m5, [r0-4*FDEC_STRIDE], [r0-2*FDEC_STRIDE]
263 STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
267 cglobal add4x4_idct_mmx, 2,2
274 IDCT4_1D w,0,1,2,3,4,5
275 TRANSPOSE4x4W 0,1,2,3,4
277 IDCT4_1D w,0,1,2,3,4,5
278 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
279 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
280 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
281 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
285 cglobal add4x4_idct_sse4, 2,2,6
286 mova m0, [r1+0x00] ; row1/row0
287 mova m2, [r1+0x10] ; row3/row2
288 mova m1, m0 ; row1/row0
289 psraw m0, 1 ; row1>>1/...
290 mova m3, m2 ; row3/row2
291 psraw m2, 1 ; row3>>1/...
292 movsd m0, m1 ; row1>>1/row0
293 movsd m2, m3 ; row3>>1/row2
294 psubw m0, m3 ; row1>>1-row3/row0-2
295 paddw m2, m1 ; row3>>1+row1/row0+2
296 SBUTTERFLY2 wd, 0, 2, 1
297 SUMSUB_BA w, m2, m0, m1
298 pshuflw m1, m2, 10110001b
299 pshufhw m2, m2, 10110001b
305 paddw m1, m0 ; row1/row0 corrected
306 psraw m0, 1 ; row1>>1/...
307 mova m3, m2 ; row3/row2
308 psraw m2, 1 ; row3>>1/...
309 movsd m0, m1 ; row1>>1/row0
310 movsd m2, m3 ; row3>>1/row2
311 psubw m0, m3 ; row1>>1-row3/row0-2
312 paddw m2, m1 ; row3>>1+row1/row0+2
313 SBUTTERFLY2 qdq, 0, 2, 1
314 SUMSUB_BA w, m2, m0, m1
316 movd m4, [r0+FDEC_STRIDE*0]
317 movd m1, [r0+FDEC_STRIDE*1]
318 movd m3, [r0+FDEC_STRIDE*2]
319 movd m5, [r0+FDEC_STRIDE*3]
320 punpckldq m1, m4 ; row0/row1
322 punpckldq m3, m5 ; row3/row2
329 packuswb m0, m2 ; row0/row1/row3/row2
330 pextrd [r0+FDEC_STRIDE*0], m0, 3
331 pextrd [r0+FDEC_STRIDE*1], m0, 2
332 movd [r0+FDEC_STRIDE*2], m0
333 pextrd [r0+FDEC_STRIDE*3], m0, 1
335 %endif ; HIGH_BIT_DEPTH
338 ;-----------------------------------------------------------------------------
339 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
340 ;-----------------------------------------------------------------------------
342 cglobal %1, 3,3,11*(mmsize/16)
343 %ifndef HIGH_BIT_DEPTH
347 add r2, 4*FDEC_STRIDE
350 %endif ; !HIGH_BIT_DEPTH
357 add r1, %4-%5-%6*FENC_STRIDE
358 add r2, %4-%5-%6*FDEC_STRIDE
361 add r1, (%4-%6)*FENC_STRIDE-%5-%4
362 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
365 add r1, %4-%5-%6*FENC_STRIDE
366 add r2, %4-%5-%6*FDEC_STRIDE
376 ;-----------------------------------------------------------------------------
377 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
378 ;-----------------------------------------------------------------------------
379 %macro ADD_NxN_IDCT 6-7
380 %ifdef HIGH_BIT_DEPTH
381 cglobal %1, 2,2,6*(mmsize/16)
383 cglobal %1, 2,2,11*(mmsize/16)
387 add r0, 4*FDEC_STRIDE
394 add r0, %4-%5-%6*FDEC_STRIDE
397 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
400 add r0, %4-%5-%6*FDEC_STRIDE
411 %ifdef HIGH_BIT_DEPTH
413 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
414 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
416 ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2.skip_prologue,64, 8, 0, 0
417 ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
418 %else ; !HIGH_BIT_DEPTH
420 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
421 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
422 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
423 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
425 cextern sub8x8_dct8_mmx.skip_prologue
426 cextern add8x8_idct8_mmx.skip_prologue
427 SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
428 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
433 cextern sub8x8_dct_sse2.skip_prologue
434 cextern sub8x8_dct_ssse3.skip_prologue
435 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
436 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
437 cextern add8x8_idct_sse2.skip_prologue
438 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
440 cextern sub8x8_dct8_sse2.skip_prologue
441 cextern add8x8_idct8_sse2.skip_prologue
442 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
443 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
445 cextern sub8x8_dct8_ssse3.skip_prologue
446 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
447 %endif ; HIGH_BIT_DEPTH
449 %ifdef HIGH_BIT_DEPTH
451 ;-----------------------------------------------------------------------------
452 ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
453 ;-----------------------------------------------------------------------------
455 mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
456 mova m1, [%1+FDEC_STRIDEB*1]
457 mova m2, [%1+FDEC_STRIDEB*2]
461 paddsw %2, [%1+FDEC_STRIDEB*3]
466 mova [%1+FDEC_STRIDEB*0], m0
467 mova [%1+FDEC_STRIDEB*1], m1
468 mova [%1+FDEC_STRIDEB*2], m2
469 mova [%1+FDEC_STRIDEB*3], %2
473 cglobal add8x8_idct_dc_sse2, 2,2,7
474 mova m6, [pw_pixel_max]
478 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
479 pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
480 pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
481 pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
482 pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
483 ADD_DC r0+FDEC_STRIDEB*0, m4
484 ADD_DC r0+FDEC_STRIDEB*4, m3
487 cglobal add16x16_idct_dc_sse2, 2,3,8
489 mova m6, [pw_pixel_max]
495 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
496 pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
497 pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
498 pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
499 pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
500 ADD_DC r0+FDEC_STRIDEB*0, m4
501 ADD_DC r0+SIZEOF_PIXEL*8, m3
503 add r0, 4*FDEC_STRIDEB
508 %else ;!HIGH_BIT_DEPTH
510 movq mm4, [%3+FDEC_STRIDE*0]
511 movq mm5, [%3+FDEC_STRIDE*1]
512 movq mm6, [%3+FDEC_STRIDE*2]
516 paddusb %1, [%3+FDEC_STRIDE*3]
521 movq [%3+FDEC_STRIDE*0], mm4
522 movq [%3+FDEC_STRIDE*1], mm5
523 movq [%3+FDEC_STRIDE*2], mm6
524 movq [%3+FDEC_STRIDE*3], %1
527 cglobal add8x8_idct_dc_mmx, 2,2
530 add r0, FDEC_STRIDE*4
538 pshufw mm2, mm0, 0xFA
539 pshufw mm3, mm1, 0xFA
542 ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
546 cglobal add8x8_idct_dc_ssse3, 2,2
549 add r0, FDEC_STRIDE*4
553 movdqa xmm5, [pb_idctdc_unpack]
558 movq xmm2, [r0+FDEC_STRIDE*-4]
559 movq xmm3, [r0+FDEC_STRIDE*-3]
560 movq xmm4, [r0+FDEC_STRIDE*-2]
561 movq xmm5, [r0+FDEC_STRIDE*-1]
562 movhps xmm2, [r0+FDEC_STRIDE* 0]
563 movhps xmm3, [r0+FDEC_STRIDE* 1]
564 movhps xmm4, [r0+FDEC_STRIDE* 2]
565 movhps xmm5, [r0+FDEC_STRIDE* 3]
574 movq [r0+FDEC_STRIDE*-4], xmm2
575 movq [r0+FDEC_STRIDE*-3], xmm3
576 movq [r0+FDEC_STRIDE*-2], xmm4
577 movq [r0+FDEC_STRIDE*-1], xmm5
578 movhps [r0+FDEC_STRIDE* 0], xmm2
579 movhps [r0+FDEC_STRIDE* 1], xmm3
580 movhps [r0+FDEC_STRIDE* 2], xmm4
581 movhps [r0+FDEC_STRIDE* 3], xmm5
584 cglobal add16x16_idct_dc_mmx, 2,3
596 pshufw mm2, mm0, 0xFA
597 pshufw mm3, mm1, 0xFA
601 ADD_DC mm2, mm3, r0+8
603 add r0, FDEC_STRIDE*4
608 %macro IDCT_DC_STORE 3
609 movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
610 movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
611 movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
612 movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
621 movdqa [r0+%1+FDEC_STRIDE*0], xmm4
622 movdqa [r0+%1+FDEC_STRIDE*1], xmm5
623 movdqa [r0+%1+FDEC_STRIDE*2], xmm6
624 movdqa [r0+%1+FDEC_STRIDE*3], xmm7
627 cglobal add16x16_idct_dc_sse2, 2,2,8
629 add r0, FDEC_STRIDE*4
635 add r0, FDEC_STRIDE*4
657 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
658 IDCT_DC_STORE 0, xmm2, xmm3
661 cglobal add16x16_idct_dc_ssse3, 2,2,8
663 add r0, FDEC_STRIDE*4
669 add r0, FDEC_STRIDE*4
676 movdqa xmm5, [ pb_idctdc_unpack]
677 movdqa xmm6, [pb_idctdc_unpack2]
686 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
687 IDCT_DC_STORE 0, xmm2, xmm3
690 %endif ; HIGH_BIT_DEPTH
692 ;-----------------------------------------------------------------------------
693 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
694 ;-----------------------------------------------------------------------------
696 %macro DCTDC_2ROW_MMX 3
697 movq %1, [r1+FENC_STRIDE*(0+%3)]
698 movq m1, [r1+FENC_STRIDE*(1+%3)]
699 movq m2, [r2+FDEC_STRIDE*(0+%3)]
700 movq m3, [r2+FDEC_STRIDE*(1+%3)]
716 %macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
717 pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
718 pshufw mm0, %2, 10110001b ; s3 __ s2 __
719 paddw mm1, %2 ; s1 s13 s0 s02
720 psubw mm1, mm0 ; d13 s13 d02 s02
721 pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
722 psrlq mm1, 32 ; __ __ d13 s13
723 paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
724 psllq mm1, 32 ; d13 s13
725 psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
729 cglobal sub8x8_dct_dc_mmxext, 3,3
730 DCTDC_2ROW_MMX m0, m4, 0
731 DCTDC_2ROW_MMX m5, m6, 2
735 add r1, FENC_STRIDE*4
736 add r2, FDEC_STRIDE*4
737 DCTDC_2ROW_MMX m7, m4, 0
738 DCTDC_2ROW_MMX m5, m6, 2
747 %macro DCTDC_2ROW_SSE2 3
748 movq m0, [r1+FENC_STRIDE*(0+%1)]
749 movq m1, [r1+FENC_STRIDE*(1+%1)]
750 movq m2, [r2+FDEC_STRIDE*(0+%1)]
751 movq m3, [r2+FDEC_STRIDE*(1+%1)]
765 cglobal sub8x8_dct_dc_sse2, 3,3,8
767 DCTDC_2ROW_SSE2 0, 0, m4
768 DCTDC_2ROW_SSE2 2, 1, m4
769 add r1, FENC_STRIDE*4
770 add r2, FDEC_STRIDE*4
772 DCTDC_2ROW_SSE2 0, 0, m5
773 DCTDC_2ROW_SSE2 2, 1, m5
783 ;-----------------------------------------------------------------------------
784 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
785 ;-----------------------------------------------------------------------------
787 cglobal zigzag_scan_8x8_frame_%1, 2,2,8
791 PALIGNR xmm1, xmm1, 14, xmm2
796 PALIGNR xmm2, xmm2, 12, xmm4
798 PALIGNR xmm3, xmm3, 10, xmm4
824 movdqa xmm7, [r1+112]
832 PALIGNR xmm4, xmm4, 14, xmm3
834 PALIGNR xmm5, xmm5, 12, xmm3
836 PALIGNR xmm6, xmm6, 10, xmm3
839 PALIGNR xmm7, xmm7, 8, xmm3
843 punpcklqdq xmm7, xmm7
863 pshufw mm4, mm4, 0x6c
877 pshufhw xmm0, xmm0, 0x1b
878 pshuflw xmm4, xmm4, 0x1b
879 pshufhw xmm3, xmm3, 0x1b
880 pshuflw xmm7, xmm7, 0x1b
882 movlps [r0+2*10], xmm0
883 movhps [r0+2*17], xmm0
884 movlps [r0+2*21], xmm3
885 movlps [r0+2*28], xmm4
886 movhps [r0+2*32], xmm3
887 movhps [r0+2*39], xmm4
888 movlps [r0+2*43], xmm7
889 movhps [r0+2*50], xmm7
894 %ifndef HIGH_BIT_DEPTH
896 %define PALIGNR PALIGNR_MMX
898 %define PALIGNR PALIGNR_SSSE3
902 ;-----------------------------------------------------------------------------
903 ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
904 ;-----------------------------------------------------------------------------
905 %macro SCAN_8x8_FRAME 6
906 cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
908 mova m1, [r1+ 8*SIZEOF_DCTCOEF]
909 movu m2, [r1+14*SIZEOF_DCTCOEF]
910 movu m3, [r1+21*SIZEOF_DCTCOEF]
911 mova m4, [r1+28*SIZEOF_DCTCOEF]
920 mova m7, [r1+52*SIZEOF_DCTCOEF]
921 mova m0, [r1+60*SIZEOF_DCTCOEF]
928 mova [r0+ 4*SIZEOF_DCTCOEF], m1
929 mova [r0+ 8*SIZEOF_DCTCOEF], m6
932 mova m1, [r1+32*SIZEOF_DCTCOEF]
933 movu m5, [r1+39*SIZEOF_DCTCOEF]
934 movu m2, [r1+46*SIZEOF_DCTCOEF]
935 movu [r0+35*SIZEOF_DCTCOEF], m3
936 movu [r0+47*SIZEOF_DCTCOEF], m4
943 mova [r0+52*SIZEOF_DCTCOEF], m6
944 movu [r0+13*SIZEOF_DCTCOEF], m5
945 movu m4, [r1+11*SIZEOF_DCTCOEF]
946 movu m6, [r1+25*SIZEOF_DCTCOEF]
950 mova m3, [r1+ 4*SIZEOF_DCTCOEF]
951 movu m7, [r1+18*SIZEOF_DCTCOEF]
953 movu [r0+25*SIZEOF_DCTCOEF], m1
966 movu m4, [r1+35*SIZEOF_DCTCOEF]
967 movu m1, [r1+49*SIZEOF_DCTCOEF]
970 mova [r0+60*SIZEOF_DCTCOEF], m0
971 mova [r0+56*SIZEOF_DCTCOEF], m2
972 movu m0, [r1+42*SIZEOF_DCTCOEF]
973 mova m2, [r1+56*SIZEOF_DCTCOEF]
974 movu [r0+17*SIZEOF_DCTCOEF], m3
975 mova [r0+32*SIZEOF_DCTCOEF], m7
976 movu [r0+10*SIZEOF_DCTCOEF], m6
977 movu [r0+21*SIZEOF_DCTCOEF], m5
992 mova [r0+28*SIZEOF_DCTCOEF], m4
993 movu [r0+43*SIZEOF_DCTCOEF], m1
994 movu [r0+39*SIZEOF_DCTCOEF], m2
995 movu [r0+50*SIZEOF_DCTCOEF], m7
999 %ifdef HIGH_BIT_DEPTH
1001 SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
1004 SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
1007 ;-----------------------------------------------------------------------------
1008 ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
1009 ;-----------------------------------------------------------------------------
1011 cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16
1013 mova m1, [r1+ 4*SIZEOF_DCTCOEF]
1014 mova m2, [r1+ 8*SIZEOF_DCTCOEF]
1015 mova m3, [r1+12*SIZEOF_DCTCOEF]
1033 mova [r0+ 4*SIZEOF_DCTCOEF], m5
1034 mova [r0+ 8*SIZEOF_DCTCOEF], m1
1035 mova [r0+12*SIZEOF_DCTCOEF], m3
1039 %ifdef HIGH_BIT_DEPTH
1041 SCAN_4x4 sse2, 4 , dq, qdq, dq
1044 SCAN_4x4 mmx , 16, q , dq , wd
1047 ;-----------------------------------------------------------------------------
1048 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
1049 ;-----------------------------------------------------------------------------
1050 cglobal zigzag_scan_4x4_frame_ssse3, 2,2
1051 movdqa xmm1, [r1+16]
1053 pshufb xmm1, [pb_scan4frameb]
1054 pshufb xmm0, [pb_scan4framea]
1057 palignr xmm2, xmm0, 6
1059 palignr xmm1, xmm0, 10
1061 movdqa [r0+16], xmm1
1064 %ifdef HIGH_BIT_DEPTH
1066 ;-----------------------------------------------------------------------------
1067 ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
1068 ;-----------------------------------------------------------------------------
1069 cglobal zigzag_scan_4x4_field_sse2, 2,3
1083 ;-----------------------------------------------------------------------------
1084 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
1085 ;-----------------------------------------------------------------------------
1086 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1087 cglobal zigzag_scan_4x4_field_mmxext, 2,3
1088 pshufw mm0, [r1+4], 0xd2
1099 %endif ; HIGH_BIT_DEPTH
1101 ;-----------------------------------------------------------------------------
1102 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
1103 ;-----------------------------------------------------------------------------
1106 ; 16 11 5 6 7 12 17 24
1107 ; 18 13 14 15 19 25 32 26
1108 ; 20 21 22 23 27 33 40 34
1109 ; 28 29 30 31 35 41 48 42
1110 ; 36 37 38 39 43 49 50 44
1111 ; 45 46 47 51 56 57 52 53
1112 ; 54 55 58 59 60 61 62 63
1115 cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
1116 mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
1117 mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
1118 mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
1119 pshuf%2 m3, m0, 011111111b ; 03 03 03 03
1121 pshuf%2 m2, m2, 000111001b ; 08 11 10 09
1122 punpckl%3 m3, m1 ; 05 03 04 03
1123 pinsr%2 m0, r2d, 3 ; 08 02 01 00
1125 punpckl%3 m2, m3 ; 04 10 03 09
1126 pshuf%2 m2, m2, 010110100b ; 10 04 03 09
1127 mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
1128 mova [r0+ 4*SIZEOF_DCTCOEF], m2 ; 10 04 03 09
1129 mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
1130 mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
1131 punpckl%4 m6, m5 ; 17 16 XX XX
1132 psrl%5 m1, %6 ; XX 07 06 05
1133 punpckh%3 m6, m4 ; 08 17 11 16
1134 punpckl%4 m6, m1 ; 06 05 11 16
1135 mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
1136 psrl%5 m1, %6 ; XX XX 07 06
1137 punpckl%3 m1, m5 ; 17 07 16 06
1138 mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
1139 mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
1141 punpckh%4 m1, m1 ; 17 07 17 07
1142 punpckl%3 m6, m2 ; 25 13 24 12
1144 mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
1145 punpckl%3 m1, m6 ; 24 17 12 07
1146 mova [r0+12*SIZEOF_DCTCOEF], m1
1147 pinsr%2 m3, r2d, 0 ; 15 14 13 18
1148 mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
1149 mova m7, [r1+28*SIZEOF_DCTCOEF]
1150 mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
1151 psrl%5 m5, %6*3 ; XX XX XX 19
1152 pshuf%2 m1, m2, 011111001b ; 27 27 26 25
1153 punpckl%3 m5, m0 ; 33 XX 32 19
1154 psrl%5 m2, %6*3 ; XX XX XX 27
1155 punpckl%3 m5, m1 ; 26 32 25 19
1156 mova [r0+32*SIZEOF_DCTCOEF], m7
1157 mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
1158 mova m7, [r1+36*SIZEOF_DCTCOEF]
1159 mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
1160 pshuf%2 m3, m0, 011111001b ; 35 35 34 33
1161 punpckl%3 m2, m1 ; 41 XX 40 27
1162 mova [r0+40*SIZEOF_DCTCOEF], m7
1163 punpckl%3 m2, m3 ; 34 40 33 27
1164 mova [r0+28*SIZEOF_DCTCOEF], m2
1165 mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
1166 mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
1167 psrl%5 m0, %6*3 ; XX XX XX 35
1168 punpckl%3 m0, m2 ; 49 XX 48 35
1169 pshuf%2 m3, m1, 011111001b ; 43 43 42 41
1170 punpckl%3 m0, m3 ; 42 48 41 35
1171 mova [r0+36*SIZEOF_DCTCOEF], m0
1172 pextr%2 r2d, m2, 3 ; 51
1173 psrl%5 m1, %6*3 ; XX XX XX 43
1174 punpckl%3 m1, m7 ; 45 XX 44 43
1175 psrl%5 m2, %6 ; XX 51 50 49
1176 punpckl%3 m1, m2 ; 50 44 49 43
1177 pshuf%2 m1, m1, 010110100b ; 44 50 49 43
1178 mova [r0+44*SIZEOF_DCTCOEF], m1
1179 psrl%5 m7, %6 ; XX 47 46 45
1180 pinsr%2 m7, r2d, 3 ; 51 47 46 45
1181 mova [r0+48*SIZEOF_DCTCOEF], m7
1182 mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
1183 mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
1185 mova m7, [r1+60*SIZEOF_DCTCOEF]
1186 punpckl%4 m2, m1 ; 53 52 57 56
1187 punpckh%4 m1, m0 ; 59 58 55 54
1188 mova [r0+52*SIZEOF_DCTCOEF], m2
1189 mova [r0+56*SIZEOF_DCTCOEF], m1
1190 mova [r0+60*SIZEOF_DCTCOEF], m7
1193 %ifdef HIGH_BIT_DEPTH
1195 SCAN_8x8 sse4 , d, dq, qdq, dq, 4
1198 SCAN_8x8 mmxext, w, wd, dq , q , 16
1201 ;-----------------------------------------------------------------------------
1202 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
1203 ;-----------------------------------------------------------------------------
1204 %macro ZIGZAG_SUB_4x4 2
1206 cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
1208 cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
1210 movd xmm0, [r1+0*FENC_STRIDE]
1211 movd xmm1, [r1+1*FENC_STRIDE]
1212 movd xmm2, [r1+2*FENC_STRIDE]
1213 movd xmm3, [r1+3*FENC_STRIDE]
1214 movd xmm4, [r2+0*FDEC_STRIDE]
1215 movd xmm5, [r2+1*FDEC_STRIDE]
1216 movd xmm6, [r2+2*FDEC_STRIDE]
1217 movd xmm7, [r2+3*FDEC_STRIDE]
1218 movd [r2+0*FDEC_STRIDE], xmm0
1219 movd [r2+1*FDEC_STRIDE], xmm1
1220 movd [r2+2*FDEC_STRIDE], xmm2
1221 movd [r2+3*FDEC_STRIDE], xmm3
1222 punpckldq xmm0, xmm1
1223 punpckldq xmm2, xmm3
1224 punpckldq xmm4, xmm5
1225 punpckldq xmm6, xmm7
1226 punpcklqdq xmm0, xmm2
1227 punpcklqdq xmm4, xmm6
1229 movdqa xmm7, [pb_sub4frame]
1231 movdqa xmm7, [pb_sub4field]
1238 punpcklbw xmm0, xmm6
1239 punpckhbw xmm1, xmm6
1240 punpcklbw xmm4, xmm6
1241 punpckhbw xmm5, xmm6
1246 pand xmm0, [pb_subacmask]
1250 movdqa [r0+16], xmm1
1262 ZIGZAG_SUB_4x4 , frame
1263 ZIGZAG_SUB_4x4 ac, frame
1264 ZIGZAG_SUB_4x4 , field
1265 ZIGZAG_SUB_4x4 ac, field
1267 ;-----------------------------------------------------------------------------
1268 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1269 ;-----------------------------------------------------------------------------
1272 movq m0, [r1+%1*4+ 0]
1273 movq m1, [r1+%1*4+ 8]
1274 movq m2, [r1+%1*4+16]
1275 movq m3, [r1+%1*4+24]
1276 TRANSPOSE4x4W 0,1,2,3,4
1295 cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
1312 %macro INTERLEAVE_XMM 1
1313 mova m0, [r1+%1*4+ 0]
1314 mova m1, [r1+%1*4+16]
1315 mova m4, [r1+%1*4+32]
1316 mova m5, [r1+%1*4+48]
1317 SBUTTERFLY wd, 0, 1, 6
1318 SBUTTERFLY wd, 4, 5, 7
1319 SBUTTERFLY wd, 0, 1, 6
1320 SBUTTERFLY wd, 4, 5, 7
1322 movhps [r0+%1+ 32], m0
1323 movq [r0+%1+ 64], m1
1324 movhps [r0+%1+ 96], m1
1326 movhps [r0+%1+ 40], m4
1327 movq [r0+%1+ 72], m5
1328 movhps [r0+%1+104], m5
1343 cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8