1 ;*****************************************************************************
2 ;* dct-a.asm: x86 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Min Chen <chenm001@163.com>
10 ;* Fiona Glaser <fiona@x264.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
34 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
35 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
36 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
37 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
38 pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
39 pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
40 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
41 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
42 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
43 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
45 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14
46 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14
47 pb_scan8framet3: SHUFFLE_MASK_W 0, 1, 5, 6, 8, 11, 12, 13
48 pb_scan8framet4: SHUFFLE_MASK_W 0, 3, 4, 5, 8, 11, 12, 15
49 pb_scan8framet5: SHUFFLE_MASK_W 1, 2, 6, 7, 9, 10, 13, 14
50 pb_scan8framet6: SHUFFLE_MASK_W 0, 3, 4, 5, 10, 11, 12, 15
51 pb_scan8framet7: SHUFFLE_MASK_W 1, 2, 6, 7, 8, 9, 14, 15
52 pb_scan8framet8: SHUFFLE_MASK_W 0, 1, 2, 7, 8, 10, 11, 14
53 pb_scan8framet9: SHUFFLE_MASK_W 1, 4, 5, 7, 8, 13, 14, 15
55 pb_scan8frame1: SHUFFLE_MASK_W 0, 8, 1, 2, 9, 12, 4, 13
56 pb_scan8frame2: SHUFFLE_MASK_W 4, 0, 1, 5, 8, 10, 12, 14
57 pb_scan8frame3: SHUFFLE_MASK_W 12, 10, 8, 6, 2, 3, 7, 9
58 pb_scan8frame4: SHUFFLE_MASK_W 0, 1, 8, 12, 4, 13, 9, 2
59 pb_scan8frame5: SHUFFLE_MASK_W 5, 14, 10, 3, 11, 15, 6, 7
60 pb_scan8frame6: SHUFFLE_MASK_W 6, 8, 12, 13, 9, 7, 5, 3
61 pb_scan8frame7: SHUFFLE_MASK_W 1, 3, 5, 7, 10, 14, 15, 11
62 pb_scan8frame8: SHUFFLE_MASK_W 10, 3, 11, 14, 5, 6, 15, 7
64 pb_scan8field1 : SHUFFLE_MASK_W 0, 1, 2, 8, 9, 3, 4, 10
65 pb_scan8field2a: SHUFFLE_MASK_W 0x80, 11, 5, 6, 7, 12,0x80,0x80
66 pb_scan8field2b: SHUFFLE_MASK_W 0,0x80,0x80,0x80,0x80,0x80, 1, 8
67 pb_scan8field3a: SHUFFLE_MASK_W 10, 5, 6, 7, 11,0x80,0x80,0x80
68 pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80, 1, 8, 2
69 pb_scan8field4a: SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80,0x80
70 pb_scan8field6 : SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80, 12
71 pb_scan8field7 : SHUFFLE_MASK_W 5, 6, 7, 11,0x80,0x80, 12, 13
88 SUMSUB_BADC %1, %5, %4, %3, %2, %6
89 SUMSUB_BADC %1, %5, %3, %4, %2, %6
93 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
114 ;-----------------------------------------------------------------------------
115 ; void dct4x4dc( dctcoef d[4][4] )
116 ;-----------------------------------------------------------------------------
118 cglobal dct4x4dc, 1,1,5
123 WALSH4_1D d, 0,1,2,3,4
124 TRANSPOSE4x4D 0,1,2,3,4
126 WALSH4_1D d, 0,1,2,3,4
136 %endmacro ; DCT4x4_DC
145 cglobal dct4x4dc, 1,1
150 movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
151 WALSH4_1D w, 0,1,2,3,4
152 TRANSPOSE4x4W 0,1,2,3,4
153 SUMSUB_BADC w, 1, 0, 3, 2, 4
163 %endif ; HIGH_BIT_DEPTH
166 ;-----------------------------------------------------------------------------
167 ; void idct4x4dc( int32_t d[4][4] )
168 ;-----------------------------------------------------------------------------
170 cglobal idct4x4dc, 1,1
175 WALSH4_1D d,0,1,2,3,4
176 TRANSPOSE4x4D 0,1,2,3,4
177 WALSH4_1D d,0,1,2,3,4
183 %endmacro ; IDCT4x4DC
191 ;-----------------------------------------------------------------------------
192 ; void idct4x4dc( int16_t d[4][4] )
193 ;-----------------------------------------------------------------------------
195 cglobal idct4x4dc, 1,1
200 WALSH4_1D w,0,1,2,3,4
201 TRANSPOSE4x4W 0,1,2,3,4
202 WALSH4_1D w,0,1,2,3,4
208 %endif ; HIGH_BIT_DEPTH
211 ;-----------------------------------------------------------------------------
212 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
213 ;-----------------------------------------------------------------------------
215 cglobal sub4x4_dct, 3,3
217 LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
218 LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
219 LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
220 LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
222 TRANSPOSE4x4W 0,1,2,3,4
224 SUMSUB_BADC w, 3, 0, 2, 1
226 DCT_UNPACK m2, m4, m5
227 DCT_UNPACK m3, m6, m7
228 mova [r0+ 0], m2 ; s03 + s12
230 mova [r0+32], m3 ; s03 - s12
233 DCT_UNPACK m0, m2, m4
234 DCT_UNPACK m1, m3, m5
235 SUMSUB2_AB d, 0, 1, 4
236 SUMSUB2_AB d, 2, 3, 5
237 mova [r0+16], m0 ; d03*2 + d12
239 mova [r0+48], m4 ; d03 - 2*d12
245 cglobal sub4x4_dct, 3,3
250 LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
252 TRANSPOSE4x4W 0,1,2,3,4
265 %endif ; HIGH_BIT_DEPTH
268 ;-----------------------------------------------------------------------------
269 ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
270 ;-----------------------------------------------------------------------------
271 %macro STORE_DIFFx2 6
278 CLIPW %1, %4, [pw_pixel_max]
284 cglobal add4x4_idct, 2,2,6
285 add r0, 2*FDEC_STRIDEB
291 IDCT4_1D d,0,1,2,3,4,5
292 TRANSPOSE4x4D 0,1,2,3,4
294 IDCT4_1D d,0,1,2,3,4,5
296 STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
297 STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
306 %else ; !HIGH_BIT_DEPTH
309 cglobal add4x4_idct, 2,2
316 IDCT4_1D w,0,1,2,3,4,5
317 TRANSPOSE4x4W 0,1,2,3,4
319 IDCT4_1D w,0,1,2,3,4,5
320 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
321 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
322 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
323 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
327 cglobal add4x4_idct, 2,2,6
328 mova m1, [r1+0x00] ; row1/row0
329 mova m3, [r1+0x10] ; row3/row2
330 psraw m0, m1, 1 ; row1>>1/...
331 psraw m2, m3, 1 ; row3>>1/...
332 movsd m0, m1 ; row1>>1/row0
333 movsd m2, m3 ; row3>>1/row2
334 psubw m0, m3 ; row1>>1-row3/row0-2
335 paddw m2, m1 ; row3>>1+row1/row0+2
336 SBUTTERFLY2 wd, 0, 2, 1
338 pshuflw m1, m2, q2301
339 pshufhw m2, m2, q2301
345 paddw m1, m0 ; row1/row0 corrected
346 psraw m0, 1 ; row1>>1/...
347 psraw m3, m2, 1 ; row3>>1/...
348 movsd m0, m1 ; row1>>1/row0
349 movsd m3, m2 ; row3>>1/row2
350 psubw m0, m2 ; row1>>1-row3/row0-2
351 paddw m3, m1 ; row3>>1+row1/row0+2
352 SBUTTERFLY2 qdq, 0, 3, 1
355 movd m4, [r0+FDEC_STRIDE*0]
356 movd m1, [r0+FDEC_STRIDE*1]
357 movd m2, [r0+FDEC_STRIDE*2]
358 movd m5, [r0+FDEC_STRIDE*3]
359 punpckldq m1, m4 ; row0/row1
361 punpckldq m2, m5 ; row3/row2
368 packuswb m0, m3 ; row0/row1/row3/row2
369 pextrd [r0+FDEC_STRIDE*0], m0, 3
370 pextrd [r0+FDEC_STRIDE*1], m0, 2
371 movd [r0+FDEC_STRIDE*2], m0
372 pextrd [r0+FDEC_STRIDE*3], m0, 1
380 %endif ; HIGH_BIT_DEPTH
383 ;-----------------------------------------------------------------------------
384 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
385 ;-----------------------------------------------------------------------------
388 %if HIGH_BIT_DEPTH == 0
392 add r2, 4*FDEC_STRIDE
395 %endif ; !HIGH_BIT_DEPTH
397 call %2.skip_prologue
399 add r1, %4-%5-%6*FENC_STRIDE
400 add r2, %4-%5-%6*FDEC_STRIDE
401 call %2.skip_prologue
403 add r1, (%4-%6)*FENC_STRIDE-%5-%4
404 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
405 call %2.skip_prologue
407 add r1, %4-%5-%6*FENC_STRIDE
408 add r2, %4-%5-%6*FDEC_STRIDE
409 TAIL_CALL %2.skip_prologue, 1
412 ;-----------------------------------------------------------------------------
413 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
414 ;-----------------------------------------------------------------------------
415 %macro ADD_NxN_IDCT 6-7
425 %if mmsize==16 && %3!=256
426 add r0, 4*FDEC_STRIDE
429 call %2.skip_prologue
430 add r0, %4-%5-%6*FDEC_STRIDE
432 call %2.skip_prologue
433 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
435 call %2.skip_prologue
436 add r0, %4-%5-%6*FDEC_STRIDE
438 TAIL_CALL %2.skip_prologue, 1
443 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
444 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
446 ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
447 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
448 ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
449 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
450 cextern add8x8_idct8_sse2.skip_prologue
451 cextern add8x8_idct8_avx.skip_prologue
452 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
453 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
454 cextern sub8x8_dct8_sse2.skip_prologue
455 cextern sub8x8_dct8_sse4.skip_prologue
456 cextern sub8x8_dct8_avx.skip_prologue
457 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
458 SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
459 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
460 %else ; !HIGH_BIT_DEPTH
463 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
464 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
465 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
466 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
468 cextern sub8x8_dct8_mmx.skip_prologue
469 cextern add8x8_idct8_mmx.skip_prologue
470 SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
471 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
475 cextern sub8x8_dct_sse2.skip_prologue
476 cextern sub8x8_dct_ssse3.skip_prologue
477 cextern sub8x8_dct_avx.skip_prologue
478 cextern sub8x8_dct_xop.skip_prologue
479 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
480 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
481 SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
482 SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
484 cextern add8x8_idct_sse2.skip_prologue
485 cextern add8x8_idct_avx.skip_prologue
486 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
487 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0
489 cextern add8x8_idct8_sse2.skip_prologue
490 cextern add8x8_idct8_avx.skip_prologue
491 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
492 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
494 cextern sub8x8_dct8_sse2.skip_prologue
495 cextern sub8x8_dct8_ssse3.skip_prologue
496 cextern sub8x8_dct8_avx.skip_prologue
497 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
498 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
499 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
500 %endif ; HIGH_BIT_DEPTH
503 ;-----------------------------------------------------------------------------
504 ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
505 ;-----------------------------------------------------------------------------
507 mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
508 mova m1, [%1+FDEC_STRIDEB*1]
509 mova m2, [%1+FDEC_STRIDEB*2]
513 paddsw %2, [%1+FDEC_STRIDEB*3]
518 mova [%1+FDEC_STRIDEB*0], m0
519 mova [%1+FDEC_STRIDEB*1], m1
520 mova [%1+FDEC_STRIDEB*2], m2
521 mova [%1+FDEC_STRIDEB*3], %2
525 cglobal add8x8_idct_dc, 2,2,7
526 mova m6, [pw_pixel_max]
530 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
531 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
532 pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
533 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
534 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
535 ADD_DC r0+FDEC_STRIDEB*0, m4
536 ADD_DC r0+FDEC_STRIDEB*4, m3
539 cglobal add16x16_idct_dc, 2,3,8
541 mova m6, [pw_pixel_max]
547 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
548 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
549 pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
550 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
551 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
552 ADD_DC r0+FDEC_STRIDEB*0, m4
553 ADD_DC r0+SIZEOF_PIXEL*8, m3
555 add r0, 4*FDEC_STRIDEB
559 %endmacro ; ADD_IDCT_DC
566 %else ;!HIGH_BIT_DEPTH
568 mova m4, [%3+FDEC_STRIDE*0]
569 mova m5, [%3+FDEC_STRIDE*1]
570 mova m6, [%3+FDEC_STRIDE*2]
574 paddusb %1, [%3+FDEC_STRIDE*3]
579 mova [%3+FDEC_STRIDE*0], m4
580 mova [%3+FDEC_STRIDE*1], m5
581 mova [%3+FDEC_STRIDE*2], m6
582 mova [%3+FDEC_STRIDE*3], %1
586 cglobal add8x8_idct_dc, 2,2
589 add r0, FDEC_STRIDE*4
601 ADD_DC m0, m1, r0-FDEC_STRIDE*4
606 cglobal add8x8_idct_dc, 2,2
609 add r0, FDEC_STRIDE*4
613 mova m5, [pb_idctdc_unpack]
618 movh m2, [r0+FDEC_STRIDE*-4]
619 movh m3, [r0+FDEC_STRIDE*-3]
620 movh m4, [r0+FDEC_STRIDE*-2]
621 movh m5, [r0+FDEC_STRIDE*-1]
622 movhps m2, [r0+FDEC_STRIDE* 0]
623 movhps m3, [r0+FDEC_STRIDE* 1]
624 movhps m4, [r0+FDEC_STRIDE* 2]
625 movhps m5, [r0+FDEC_STRIDE* 3]
634 movh [r0+FDEC_STRIDE*-4], m2
635 movh [r0+FDEC_STRIDE*-3], m3
636 movh [r0+FDEC_STRIDE*-2], m4
637 movh [r0+FDEC_STRIDE*-1], m5
638 movhps [r0+FDEC_STRIDE* 0], m2
639 movhps [r0+FDEC_STRIDE* 1], m3
640 movhps [r0+FDEC_STRIDE* 2], m4
641 movhps [r0+FDEC_STRIDE* 3], m5
645 cglobal add16x16_idct_dc, 2,3
664 add r0, FDEC_STRIDE*4
670 cglobal add16x16_idct_dc, 2,2,8
672 add r0, FDEC_STRIDE*4
675 add r0, FDEC_STRIDE*4
694 ADD_DC m0, m1, r0+FDEC_STRIDE*-4
699 cglobal add16x16_idct_dc, 2,2,8
701 add r0, FDEC_STRIDE*4
704 add r0, FDEC_STRIDE*4
711 mova m5, [ pb_idctdc_unpack]
712 mova m6, [pb_idctdc_unpack2]
719 ADD_DC m0, m1, r0+FDEC_STRIDE*-4
729 %endif ; HIGH_BIT_DEPTH
731 ;-----------------------------------------------------------------------------
732 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
733 ;-----------------------------------------------------------------------------
735 %macro DCTDC_2ROW_MMX 4
736 mova %1, [r1+FENC_STRIDE*(0+%3)]
737 mova m1, [r1+FENC_STRIDE*(1+%3)]
738 mova m2, [r2+FDEC_STRIDE*(0+%4)]
739 mova m3, [r2+FDEC_STRIDE*(1+%4)]
755 %macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
756 PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0
757 PSHUFLW m0, %2, q2301 ; s3 __ s2 __
758 paddw m1, %2 ; s1 s13 s0 s02
759 psubw m1, m0 ; d13 s13 d02 s02
760 PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02
761 psrlq m1, 32 ; __ __ d13 s13
762 paddw m0, m1 ; d02 s02 d02+d13 s02+s13
763 psllq m1, 32 ; d13 s13
764 psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
767 %if HIGH_BIT_DEPTH == 0
769 cglobal sub8x8_dct_dc, 3,3
770 DCTDC_2ROW_MMX m0, m4, 0, 0
771 DCTDC_2ROW_MMX m5, m6, 2, 2
775 add r2, FDEC_STRIDE*4
776 DCTDC_2ROW_MMX m7, m4, 4, 0
777 DCTDC_2ROW_MMX m5, m6, 6, 2
785 %macro DCTDC_2ROW_SSE2 4
786 movh m1, [r1+FENC_STRIDE*(0+%1)]
787 movh m2, [r1+FENC_STRIDE*(1+%1)]
789 movh m2, [r2+FDEC_STRIDE*(0+%2)]
790 punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
793 ACCUM paddd, %4, 1, %3
798 cglobal sub8x8_dct_dc, 3,3
800 DCTDC_2ROW_SSE2 0, 0, 0, 3
801 DCTDC_2ROW_SSE2 2, 2, 1, 3
802 add r2, FDEC_STRIDE*4
803 DCTDC_2ROW_SSE2 4, 0, 0, 4
804 DCTDC_2ROW_SSE2 6, 2, 1, 4
811 %macro SUB8x16_DCT_DC 0
812 cglobal sub8x16_dct_dc, 3,3
814 DCTDC_2ROW_SSE2 0, 0, 0, 3
815 DCTDC_2ROW_SSE2 2, 2, 1, 3
816 add r1, FENC_STRIDE*8
817 add r2, FDEC_STRIDE*8
818 DCTDC_2ROW_SSE2 -4, -4, 0, 4
819 DCTDC_2ROW_SSE2 -2, -2, 1, 4
821 DCTDC_2ROW_SSE2 0, 0, 0, 5
822 DCTDC_2ROW_SSE2 2, 2, 1, 5
823 add r2, FDEC_STRIDE*4
824 DCTDC_2ROW_SSE2 4, 0, 0, 4
825 DCTDC_2ROW_SSE2 6, 2, 1, 4
828 %define %%sign psignw
830 %define %%sign pmullw
834 pshuflw m0, m5, q2301
835 pshufhw m0, m0, q2301
836 %%sign m5, [pw_pmpmpmpm]
840 %%sign m1, [pw_ppppmmmm]
844 %endmacro ; SUB8x16_DCT_DC
851 %endif ; !HIGH_BIT_DEPTH
853 %macro DCTDC_4ROW_SSE2 2
854 mova %1, [r1+FENC_STRIDEB*%2]
855 mova m0, [r2+FDEC_STRIDEB*%2]
858 paddw %1, [r1+FENC_STRIDEB*Y]
859 paddw m0, [r2+FDEC_STRIDEB*Y]
868 %macro SUB8x8_DCT_DC_10 0
869 cglobal sub8x8_dct_dc, 3,3,3
870 DCTDC_4ROW_SSE2 m1, 0
871 DCTDC_4ROW_SSE2 m2, 4
872 mova m0, [pw_ppmmmmpp]
875 pshufd m0, m1, q2200 ; -1 -1 +0 +0
876 pshufd m1, m1, q0033 ; +0 +0 +1 +1
878 pshufd m0, m2, q1023 ; -2 +2 -3 +3
887 %macro SUB8x16_DCT_DC_10 0
888 cglobal sub8x16_dct_dc, 3,3,6
889 DCTDC_4ROW_SSE2 m1, 0
890 DCTDC_4ROW_SSE2 m2, 4
891 DCTDC_4ROW_SSE2 m3, 8
892 DCTDC_4ROW_SSE2 m4, 12
893 mova m0, [pw_ppmmmmpp]
896 pshufd m5, m1, q2200 ; -1 -1 +0 +0
897 pshufd m1, m1, q0033 ; +0 +0 +1 +1
899 pshufd m5, m2, q1023 ; -2 +2 -3 +3
901 paddd m1, m5 ; a6 a2 a4 a0
909 paddd m3, m5 ; a7 a3 a5 a1
914 punpcklqdq m2, m0, m1
926 ;-----------------------------------------------------------------------------
927 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
928 ;-----------------------------------------------------------------------------
930 cglobal zigzag_scan_8x8_frame, 2,2,8
934 PALIGNR xmm1, xmm1, 14, xmm2
939 PALIGNR xmm2, xmm2, 12, xmm4
941 PALIGNR xmm3, xmm3, 10, xmm4
967 movdqa xmm7, [r1+112]
975 PALIGNR xmm4, xmm4, 14, xmm3
977 PALIGNR xmm5, xmm5, 12, xmm3
979 PALIGNR xmm6, xmm6, 10, xmm3
982 PALIGNR xmm7, xmm7, 8, xmm3
986 punpcklqdq xmm7, xmm7
1006 pshufw mm4, mm4, q1230
1014 punpckhdq xmm3, xmm0, xmm2
1015 punpckldq xmm0, xmm2
1016 punpckhdq xmm7, xmm4, xmm6
1017 punpckldq xmm4, xmm6
1018 pshufhw xmm0, xmm0, q0123
1019 pshuflw xmm4, xmm4, q0123
1020 pshufhw xmm3, xmm3, q0123
1021 pshuflw xmm7, xmm7, q0123
1023 movlps [r0+2*10], xmm0
1024 movhps [r0+2*17], xmm0
1025 movlps [r0+2*21], xmm3
1026 movlps [r0+2*28], xmm4
1027 movhps [r0+2*32], xmm3
1028 movhps [r0+2*39], xmm4
1029 movlps [r0+2*43], xmm7
1030 movhps [r0+2*50], xmm7
1035 %if HIGH_BIT_DEPTH == 0
1042 ;-----------------------------------------------------------------------------
1043 ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
1044 ;-----------------------------------------------------------------------------
1046 ; 0 8 1 2 9 16 24 17
1047 ; 10 3 4 11 18 25 32 40
1048 ; 33 26 19 12 5 6 13 20
1049 ; 27 34 41 48 56 49 42 35
1050 ; 28 21 14 7 15 22 29 36
1051 ; 43 50 57 58 51 44 37 30
1052 ; 23 31 38 45 52 59 60 53
1053 ; 46 39 47 54 61 62 55 63
1054 %macro SCAN_8x8_FRAME 5
1055 cglobal zigzag_scan_8x8_frame, 2,2,8
1057 mova m1, [r1+ 8*SIZEOF_DCTCOEF]
1058 movu m2, [r1+14*SIZEOF_DCTCOEF]
1059 movu m3, [r1+21*SIZEOF_DCTCOEF]
1060 mova m4, [r1+28*SIZEOF_DCTCOEF]
1061 punpckl%4 m5, m0, m1
1063 punpckh%4 m6, m1, m0
1067 mova m7, [r1+52*SIZEOF_DCTCOEF]
1068 mova m0, [r1+60*SIZEOF_DCTCOEF]
1075 mova [r0+ 4*SIZEOF_DCTCOEF], m1
1076 mova [r0+ 8*SIZEOF_DCTCOEF], m6
1079 mova m1, [r1+32*SIZEOF_DCTCOEF]
1080 movu m5, [r1+39*SIZEOF_DCTCOEF]
1081 movu m2, [r1+46*SIZEOF_DCTCOEF]
1082 movu [r0+35*SIZEOF_DCTCOEF], m3
1083 movu [r0+47*SIZEOF_DCTCOEF], m4
1086 punpckh%3 m3, m5, m5
1089 mova [r0+52*SIZEOF_DCTCOEF], m6
1090 movu [r0+13*SIZEOF_DCTCOEF], m5
1091 movu m4, [r1+11*SIZEOF_DCTCOEF]
1092 movu m6, [r1+25*SIZEOF_DCTCOEF]
1096 mova m3, [r1+ 4*SIZEOF_DCTCOEF]
1097 movu m7, [r1+18*SIZEOF_DCTCOEF]
1099 movu [r0+25*SIZEOF_DCTCOEF], m1
1106 punpckh%3 m3, m6, m4
1107 punpckh%3 m7, m5, m1
1110 movu m4, [r1+35*SIZEOF_DCTCOEF]
1111 movu m1, [r1+49*SIZEOF_DCTCOEF]
1112 pshuf%5 m6, m6, q0123
1113 pshuf%5 m5, m5, q0123
1114 mova [r0+60*SIZEOF_DCTCOEF], m0
1115 mova [r0+56*SIZEOF_DCTCOEF], m2
1116 movu m0, [r1+42*SIZEOF_DCTCOEF]
1117 mova m2, [r1+56*SIZEOF_DCTCOEF]
1118 movu [r0+17*SIZEOF_DCTCOEF], m3
1119 mova [r0+32*SIZEOF_DCTCOEF], m7
1120 movu [r0+10*SIZEOF_DCTCOEF], m6
1121 movu [r0+21*SIZEOF_DCTCOEF], m5
1122 punpckh%4 m3, m0, m4
1123 punpckh%4 m7, m2, m1
1126 punpckl%3 m4, m2, m0
1127 punpckl%3 m1, m7, m3
1130 pshuf%5 m2, m2, q0123
1131 pshuf%5 m7, m7, q0123
1132 mova [r0+28*SIZEOF_DCTCOEF], m4
1133 movu [r0+43*SIZEOF_DCTCOEF], m1
1134 movu [r0+39*SIZEOF_DCTCOEF], m2
1135 movu [r0+50*SIZEOF_DCTCOEF], m7
1141 SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1143 SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1146 SCAN_8x8_FRAME 16, q , dq , wd, w
1149 ;-----------------------------------------------------------------------------
1150 ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
1151 ;-----------------------------------------------------------------------------
1153 cglobal zigzag_scan_4x4_frame, 2,2,6
1154 mova m0, [r1+ 0*SIZEOF_DCTCOEF]
1155 mova m1, [r1+ 4*SIZEOF_DCTCOEF]
1156 mova m2, [r1+ 8*SIZEOF_DCTCOEF]
1157 mova m3, [r1+12*SIZEOF_DCTCOEF]
1158 punpckl%4 m4, m0, m1
1161 mova [r0+ 0*SIZEOF_DCTCOEF], m4
1163 punpckh%4 m4, m2, m3
1166 punpckl%4 m5, m1, m3
1171 mova [r0+ 4*SIZEOF_DCTCOEF], m5
1172 mova [r0+ 8*SIZEOF_DCTCOEF], m1
1173 mova [r0+12*SIZEOF_DCTCOEF], m3
1179 SCAN_4x4 4, dq, qdq, dq
1181 SCAN_4x4 4, dq, qdq, dq
1184 SCAN_4x4 16, q , dq , wd
1186 ;-----------------------------------------------------------------------------
1187 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
1188 ;-----------------------------------------------------------------------------
1189 %macro SCAN_4x4_FRAME 0
1190 cglobal zigzag_scan_4x4_frame, 2,2
1193 pshufb m1, [pb_scan4frameb]
1194 pshufb m0, [pb_scan4framea]
1210 cglobal zigzag_scan_4x4_frame, 2,2
1213 vpperm m2, m0, m1, [pb_scan4frame2a]
1214 vpperm m1, m0, m1, [pb_scan4frame2b]
1218 %endif ; !HIGH_BIT_DEPTH
1221 ;-----------------------------------------------------------------------------
1222 ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
1223 ;-----------------------------------------------------------------------------
1225 cglobal zigzag_scan_4x4_field, 2,3
1227 pshufd m0, m4, q3102
1239 ;-----------------------------------------------------------------------------
1240 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
1241 ;-----------------------------------------------------------------------------
1242 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1244 cglobal zigzag_scan_4x4_field, 2,3
1245 pshufw m0, [r1+4], q3102
1256 %endif ; HIGH_BIT_DEPTH
1258 ;-----------------------------------------------------------------------------
1259 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
1260 ;-----------------------------------------------------------------------------
1263 ; 16 11 5 6 7 12 17 24
1264 ; 18 13 14 15 19 25 32 26
1265 ; 20 21 22 23 27 33 40 34
1266 ; 28 29 30 31 35 41 48 42
1267 ; 36 37 38 39 43 49 50 44
1268 ; 45 46 47 51 56 57 52 53
1269 ; 54 55 58 59 60 61 62 63
1272 cglobal zigzag_scan_8x8_field, 2,3,8
1273 mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
1274 mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
1275 mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
1276 pshuf%1 m3, m0, q3333 ; 03 03 03 03
1277 movd r2d, m2 ; 09 08
1278 pshuf%1 m2, m2, q0321 ; 08 11 10 09
1279 punpckl%2 m3, m1 ; 05 03 04 03
1280 pinsr%1 m0, r2d, 3 ; 08 02 01 00
1281 punpckl%2 m4, m2, m3 ; 04 10 03 09
1282 pshuf%1 m4, m4, q2310 ; 10 04 03 09
1283 mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
1284 mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
1285 mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
1286 mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
1287 punpckl%3 m6, m5 ; 17 16 XX XX
1288 psrl%4 m1, %5 ; XX 07 06 05
1289 punpckh%2 m6, m2 ; 08 17 11 16
1290 punpckl%3 m6, m1 ; 06 05 11 16
1291 mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
1292 psrl%4 m1, %5 ; XX XX 07 06
1293 punpckl%2 m1, m5 ; 17 07 16 06
1294 mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
1295 mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
1296 punpckh%3 m1, m1 ; 17 07 17 07
1297 punpckl%2 m6, m3, m2 ; 25 13 24 12
1299 mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
1300 punpckl%2 m1, m6 ; 24 17 12 07
1301 mova [r0+12*SIZEOF_DCTCOEF], m1
1302 pinsr%1 m3, r2d, 0 ; 15 14 13 18
1303 mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
1304 mova m7, [r1+28*SIZEOF_DCTCOEF]
1305 mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
1306 psrl%4 m5, %5*3 ; XX XX XX 19
1307 pshuf%1 m1, m2, q3321 ; 27 27 26 25
1308 punpckl%2 m5, m0 ; 33 XX 32 19
1309 psrl%4 m2, %5*3 ; XX XX XX 27
1310 punpckl%2 m5, m1 ; 26 32 25 19
1311 mova [r0+32*SIZEOF_DCTCOEF], m7
1312 mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
1313 mova m7, [r1+36*SIZEOF_DCTCOEF]
1314 mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
1315 pshuf%1 m3, m0, q3321 ; 35 35 34 33
1316 punpckl%2 m2, m1 ; 41 XX 40 27
1317 mova [r0+40*SIZEOF_DCTCOEF], m7
1318 punpckl%2 m2, m3 ; 34 40 33 27
1319 mova [r0+28*SIZEOF_DCTCOEF], m2
1320 mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
1321 mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
1322 psrl%4 m0, %5*3 ; XX XX XX 35
1323 punpckl%2 m0, m2 ; 49 XX 48 35
1324 pshuf%1 m3, m1, q3321 ; 43 43 42 41
1325 punpckl%2 m0, m3 ; 42 48 41 35
1326 mova [r0+36*SIZEOF_DCTCOEF], m0
1327 pextr%1 r2d, m2, 3 ; 51
1328 psrl%4 m1, %5*3 ; XX XX XX 43
1329 punpckl%2 m1, m7 ; 45 XX 44 43
1330 psrl%4 m2, %5 ; XX 51 50 49
1331 punpckl%2 m1, m2 ; 50 44 49 43
1332 pshuf%1 m1, m1, q2310 ; 44 50 49 43
1333 mova [r0+44*SIZEOF_DCTCOEF], m1
1334 psrl%4 m7, %5 ; XX 47 46 45
1335 pinsr%1 m7, r2d, 3 ; 51 47 46 45
1336 mova [r0+48*SIZEOF_DCTCOEF], m7
1337 mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
1338 mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
1339 mova m7, [r1+60*SIZEOF_DCTCOEF]
1340 punpckl%3 m2, m0, m1 ; 53 52 57 56
1341 punpckh%3 m1, m0 ; 59 58 55 54
1342 mova [r0+52*SIZEOF_DCTCOEF], m2
1343 mova [r0+56*SIZEOF_DCTCOEF], m1
1344 mova [r0+60*SIZEOF_DCTCOEF], m7
1349 SCAN_8x8 d, dq, qdq, dq, 4
1351 SCAN_8x8 d, dq, qdq, dq, 4
1354 SCAN_8x8 w, wd, dq , q , 16
1357 ;-----------------------------------------------------------------------------
1358 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
1359 ;-----------------------------------------------------------------------------
1360 %macro ZIGZAG_SUB_4x4 2
1362 cglobal zigzag_sub_4x4%1_%2, 4,4,8
1364 cglobal zigzag_sub_4x4%1_%2, 3,3,8
1366 movd m0, [r1+0*FENC_STRIDE]
1367 movd m1, [r1+1*FENC_STRIDE]
1368 movd m2, [r1+2*FENC_STRIDE]
1369 movd m3, [r1+3*FENC_STRIDE]
1370 movd m4, [r2+0*FDEC_STRIDE]
1371 movd m5, [r2+1*FDEC_STRIDE]
1372 movd m6, [r2+2*FDEC_STRIDE]
1373 movd m7, [r2+3*FDEC_STRIDE]
1374 movd [r2+0*FDEC_STRIDE], m0
1375 movd [r2+1*FDEC_STRIDE], m1
1376 movd [r2+2*FDEC_STRIDE], m2
1377 movd [r2+3*FDEC_STRIDE], m3
1384 mova m7, [pb_sub4%2]
1388 punpckhbw m1, m0, m4
1394 pand m0, [pb_subacmask]
1410 %if HIGH_BIT_DEPTH == 0
1412 ZIGZAG_SUB_4x4 , frame
1413 ZIGZAG_SUB_4x4 ac, frame
1414 ZIGZAG_SUB_4x4 , field
1415 ZIGZAG_SUB_4x4 ac, field
1417 ZIGZAG_SUB_4x4 , frame
1418 ZIGZAG_SUB_4x4 ac, frame
1419 ZIGZAG_SUB_4x4 , field
1420 ZIGZAG_SUB_4x4 ac, field
1421 %endif ; !HIGH_BIT_DEPTH
1423 %if HIGH_BIT_DEPTH == 0
1425 cglobal zigzag_scan_8x8_field, 2,3,7
1426 lea r2, [pb_scan8field1]
1427 %define off(m) (r2+m-pb_scan8field1)
1430 vpperm m5, m0, m1, [off(pb_scan8field1)]
1432 vpperm m0, m0, m1, [off(pb_scan8field2a)]
1435 vpperm m5, m2, m3, [off(pb_scan8field2b)]
1438 mova m4, [off(pb_scan8field3b)]
1439 vpperm m1, m1, m2, [off(pb_scan8field3a)]
1441 vpperm m5, m3, m0, m4
1444 ; 4b, 5b are the same as pb_scan8field3b.
1445 ; 5a is the same as pb_scan8field4a.
1446 mova m5, [off(pb_scan8field4a)]
1447 vpperm m2, m2, m3, m5
1449 vpperm m6, m0, m1, m4
1452 vpperm m3, m3, m0, m5
1454 vpperm m5, m1, m2, m4
1457 vpperm m5, m0, m1, [off(pb_scan8field6)]
1459 vpperm m5, m1, m2, [off(pb_scan8field7)]
1471 cglobal zigzag_scan_8x8_frame, 2,3,8
1472 lea r2, [pb_scan8frame1]
1473 %define off(m) (r2+m-pb_scan8frame1)
1476 vpperm m7, m7, m3, [off(pb_scan8framet1)] ; 8 9 14 15 16 17 21 22
1478 vpperm m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
1481 vpperm m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
1482 vpperm m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
1483 vpperm m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
1484 vpperm m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
1486 vpperm m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
1488 vpperm m2, m1, m3, [off(pb_scan8framet8)] ; 0 1 2 7 24 28 29 36
1489 vpperm m1, m2, m7, [off(pb_scan8frame1)] ; 0 8 1 2 9 16 24 17
1492 movhps m0, [r1+ 20] ; 3 4 5 6 10 11 12 13
1493 vpperm m1, m0, m6, [off(pb_scan8frame2)] ; 10 3 4 11 18 25 32 40
1495 vpperm m1, m0, m5, [off(pb_scan8frame3)] ; 33 26 19 12 5 6 13 20
1497 vpperm m1, m2, m7, [off(pb_scan8frame5)] ; 28 21 14 7 15 22 29 36
1500 movhps m0, [r1+114] ; 50 51 52 53 57 58 59 60
1501 vpperm m1, m5, m0, [off(pb_scan8frame6)] ; 43 50 57 58 51 44 37 30
1503 vpperm m1, m6, m0, [off(pb_scan8frame7)] ; 23 31 38 45 52 59 60 53
1506 vpperm m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
1507 vpperm m1, m0, m4, [off(pb_scan8frame4)] ; 27 34 41 48 56 49 42 35
1509 vpperm m1, m0, m4, [off(pb_scan8frame8)] ; 46 39 47 54 61 62 55 63
1515 ;-----------------------------------------------------------------------------
1516 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1517 ;-----------------------------------------------------------------------------
1519 mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
1520 mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
1521 mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
1522 mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
1523 TRANSPOSE4x4%2 0,1,2,3,4
1524 mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
1525 mova [r0+(%1+32)*SIZEOF_PIXEL], m1
1526 mova [r0+(%1+64)*SIZEOF_PIXEL], m2
1527 mova [r0+(%1+96)*SIZEOF_PIXEL], m3
1534 %macro ZIGZAG_8x8_CAVLC 1
1535 cglobal zigzag_interleave_8x8_cavlc, 3,3,8
1566 %macro INTERLEAVE_XMM 1
1567 mova m0, [r1+%1*4+ 0]
1568 mova m1, [r1+%1*4+16]
1569 mova m4, [r1+%1*4+32]
1570 mova m5, [r1+%1*4+48]
1571 SBUTTERFLY wd, 0, 1, 6
1572 SBUTTERFLY wd, 4, 5, 7
1573 SBUTTERFLY wd, 0, 1, 6
1574 SBUTTERFLY wd, 4, 5, 7
1576 movhps [r0+%1+ 32], m0
1577 movh [r0+%1+ 64], m1
1578 movhps [r0+%1+ 96], m1
1580 movhps [r0+%1+ 40], m4
1581 movh [r0+%1+ 72], m5
1582 movhps [r0+%1+104], m5
1589 %if HIGH_BIT_DEPTH == 0
1590 %macro ZIGZAG_8x8_CAVLC 0
1591 cglobal zigzag_interleave_8x8_cavlc, 3,3,8
1611 %endif ; !HIGH_BIT_DEPTH