;*****************************************************************************
;* dct-32.asm: x86_32 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
    %xdefine %%base %1
    %rep %0/2
    %xdefine %%tmp m%2
    %rotate %0/2
    mova [%%base + %2*16], %%tmp
    %rotate 1-%0/2
    %endrep
%endmacro

%macro UNSPILL_SHUFFLE 3-*
    %xdefine %%base %1
    %rep %0/2
    %xdefine %%tmp m%2
    %rotate %0/2
    mova %%tmp, [%%base + %2*16]
    %rotate 1-%0/2
    %endrep
%endmacro

%macro SPILL 2+ ; assume offsets are the same as reg numbers
    SPILL_SHUFFLE %1, %2, %2
%endmacro

%macro UNSPILL 2+
    UNSPILL_SHUFFLE %1, %2, %2
%endmacro
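
; A quick illustration of the spill convention (hypothetical operands, shown
; only as a usage sketch): each slot is 16 bytes, indexed from the base ptr.
;     SPILL_SHUFFLE   r0, 1,2, 4,5  ; m1 -> [r0+0x40], m2 -> [r0+0x50]
;     UNSPILL_SHUFFLE r0, 1,2, 4,5  ; [r0+0x40] -> m1, [r0+0x50] -> m2
;     SPILL           r0, 3         ; shorthand for SPILL_SHUFFLE r0, 3, 3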
; out: 0,4,6 in memory at %10,%11,%12, rest in regs
%macro DCT8_1D 12
    SUMSUB_BA %1, %9, %2 ; %9 = s07,  %2 = d07
    SUMSUB_BA %1, %8, %3 ; %8 = s16,  %3 = d16
    SUMSUB_BA %1, %7, %4 ; %7 = s25,  %4 = d25
    SUMSUB_BA %1, %6, %5 ; %6 = s34,  %5 = d34
    SUMSUB_BA %1, %6, %9 ; %6 = a0,   %9 = a2
    SUMSUB_BA %1, %7, %8 ; %7 = a1,   %8 = a3
    SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
    mova   [%10], m%7    ; dst0 -> mem
    mova   [%11], m%6    ; dst4 -> mem
    psra%1 m%7, m%8, 1   ; a3>>1
    padd%1 m%7, m%9      ; a2 + (a3>>1)
    psra%1 m%9, 1        ; a2>>1
    psub%1 m%9, m%8      ; (a2>>1) - a3
    mova   [%12], m%9    ; dst6 -> mem
    psra%1 m%6, m%4, 1   ; d25>>1
    padd%1 m%6, m%4      ; d25 + (d25>>1)
    psub%1 m%8, m%2, m%5 ; d07 - d34
    psub%1 m%8, m%6      ; a5 = d07 - d34 - (d25+(d25>>1))
    psra%1 m%6, m%3, 1   ; d16>>1
    padd%1 m%6, m%3      ; d16 + (d16>>1)
    padd%1 m%9, m%2, m%5 ; d07 + d34
    psub%1 m%9, m%6      ; a6 = d07 + d34 - (d16+(d16>>1))
    psra%1 m%6, m%2, 1   ; d07>>1
    padd%1 m%6, m%2      ; d07 + (d07>>1)
    padd%1 m%6, m%3      ; + d16
    padd%1 m%6, m%4      ; a4 = d16 + d25 + (d07+(d07>>1))
    psra%1 m%2, m%5, 1   ; d34>>1
    padd%1 m%2, m%5      ; d34 + (d34>>1)
    padd%1 m%2, m%3      ; + d16
    psub%1 m%2, m%4      ; a7 = d16 - d25 + (d34+(d34>>1))
    psra%1 m%5, m%2, 2   ; a7>>2
    padd%1 m%5, m%6      ; dst1 = a4 + (a7>>2)
    psra%1 m%4, m%9, 2   ; a6>>2
    padd%1 m%4, m%8      ; dst3 = a5 + (a6>>2)
    psra%1 m%6, 2        ; a4>>2
    psub%1 m%6, m%2      ; dst7 = (a4>>2) - a7
    psra%1 m%8, 2        ; a5>>2
    psub%1 m%9, m%8      ; dst5 = a6 - (a5>>2)
    SWAP %3, %5, %4, %7, %9, %6
%endmacro
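
; For reference, the scalar transform the butterflies above implement (this
; mirrors the C version in dct.c; sNM/dNM are the sums/differences formed by
; the first four SUMSUB_BA ops):
;     a0 = s07 + s34              a4 = d16 + d25 + (d07 + (d07>>1))
;     a1 = s16 + s25              a5 = d07 - d34 - (d25 + (d25>>1))
;     a2 = s07 - s34              a6 = d07 + d34 - (d16 + (d16>>1))
;     a3 = s16 - s25              a7 = d16 - d25 + (d34 + (d34>>1))
;     dst0 =  a0 + a1             dst1 =  a4 + (a7>>2)
;     dst2 =  a2 + (a3>>1)        dst3 =  a5 + (a6>>2)
;     dst4 =  a0 - a1             dst5 =  a6 - (a5>>2)
;     dst6 = (a2>>1) - a3         dst7 = (a4>>2) - a7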
; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11
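; For reference, the scalar inverse transform (mirroring the C version in
; dct.c), with src0..src7 the input coefficients:
;     a0 =  src0 + src4           a1 = -src3 + src5 - src7 - (src7>>1)
;     a2 =  src0 - src4           a3 =  src1 + src7 - src3 - (src3>>1)
;     a4 = (src2>>1) - src6       a5 = -src1 + src7 + src5 + (src5>>1)
;     a6 = (src6>>1) + src2       a7 =  src3 + src5 + src1 + (src1>>1)
;     b0 = a0 + a6   b2 = a2 + a4   b4 = a2 - a4   b6 = a0 - a6
;     b1 = (a7>>2) + a1           b3 =  a3 + (a5>>2)
;     b5 = (a3>>2) - a5           b7 =  a7 - (a1>>2)
;     dst0..7 = b0+b7, b2+b5, b4+b3, b6+b1, b6-b1, b4-b3, b2-b5, b0-b7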
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,8
global current_function %+ .skip_prologue
    LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
    LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
    DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50]
    TRANSPOSE4x4W 0,1,2,3,4
    DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0]
    TRANSPOSE4x4W 4,5,6,7,0
    DCT8_1D d, 4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0]
%endmacro ; SUB8x8_DCT8
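
; Note the pass structure above: DCT8_1D w runs on 16-bit columns, the
; TRANSPOSE4x4W calls reorient the data one 4x4 block at a time, and the
; second pass uses DCT8_1D d (32-bit lanes) because row-pass intermediates
; can overflow 16 bits at high bit depth.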
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
global current_function %+ .skip_prologue
    UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6
    IDCT8_1D d, 0,1,2,3,4,5,6,7, [r1-128],[r1+0]
    TRANSPOSE4x4D 0,1,2,3,4
    SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2
    TRANSPOSE4x4D 4,5,6,7,3
    SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6
    UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7
    IDCT8_1D d, 0,1,2,3,4,5,6,7, [r1-112],[r1+16]
    TRANSPOSE4x4D 0,1,2,3,4
    TRANSPOSE4x4D 4,5,6,7,0
    SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7
    UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2
    IDCT8_1D d, 4,5,6,7,0,1,2,3, [r1-128],[r1-112]
    SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1
    UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7
    IDCT8_1D d, 0,1,2,3,4,5,6,7, [r1+0],[r1+16]
    SPILL_SHUFFLE r1, 7,6,5, 7,6,5
    mova m7, [pw_pixel_max]
    STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB]
    STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB]
    STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB]
    STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB]
    STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB]
    STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB]
    STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB]
    STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB]
%endmacro ; ADD8x8_IDCT8
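
; pw_pixel_max is a vector of words holding (1<<BIT_DEPTH)-1; STORE_DIFF adds
; each row of residuals to the prediction and clamps the result into the
; [0, pixel_max] range before writing the row back.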
%else ; !HIGH_BIT_DEPTH
    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
    DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x40],[r0+0x60]
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8_mmx, 3,3
global sub8x8_dct8_mmx.skip_prologue
    call load_diff_4x8_mmx
    TRANSPOSE4x4W 0,1,2,3,4
    TRANSPOSE4x4W 4,5,6,7,0
    call load_diff_4x8_mmx
    TRANSPOSE4x4W 4,5,6,7,0
    TRANSPOSE4x4W 0,1,2,3,5
    SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
    movq mm4, m6 ; depends on the permutation to not produce conflicts
    UNSPILL r0+8, 4,5,6,7
    SPILL r0+8, 1,2,3,5,7
    UNSPILL r0, 0,1,2,3,4,5,6,7
    IDCT8_1D w, 0,1,2,3,4,5,6,7, [r1+0],[r1+64]
%macro ADD_STORE_ROW 3
    movq  m1, [r0+%1*FDEC_STRIDE]
    movq  m2, m1
    punpcklbw m1, m0  ; assumes m0 == 0: zero-extend low 4 pixels to words
    punpckhbw m2, m0  ; zero-extend high 4 pixels
    paddw m1, %2      ; add residual to low half
    paddw m2, %3      ; add residual to high half
    packuswb m1, m2   ; clip to [0,255] and repack to bytes
    movq  [r0+%1*FDEC_STRIDE], m1
%endmacro
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct8_mmx, 2,2
global add8x8_idct8_mmx.skip_prologue
    UNSPILL r1, 1,2,3,5,6,7
    TRANSPOSE4x4W 0,1,2,3,7
    TRANSPOSE4x4W 4,5,6,7,0
    UNSPILL r1+8, 1,2,3,5,6,7
    TRANSPOSE4x4W 0,1,2,3,7
    TRANSPOSE4x4W 4,5,6,7,0
; memory layout at this time:
    UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
    movq [r1+0x08], m0 ; mm4
    movq [r1+0x48], m4 ; mm5
    movq [r1+0x58], m5 ; mm0
    movq [r1+0x68], m6 ; mm2
    movq [r1+0x78], m7 ; mm6
    movq [r1+0x18], m1 ; mm1
    movq [r1+0x28], m2 ; mm7
    movq [r1+0x38], m3 ; mm3
    pxor m0, m0 ; zero register for ADD_STORE_ROW's unpacks
    ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
    ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
    ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
    ADD_STORE_ROW 3, m3, [r1+0x38]
    ADD_STORE_ROW 4, m4, [r1+0x48]
    ADD_STORE_ROW 5, m5, [r1+0x58]
    ADD_STORE_ROW 6, m6, [r1+0x68]
    ADD_STORE_ROW 7, m7, [r1+0x78]
cglobal sub8x8_dct, 3,3
    add r2, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
    LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
    DCT4_1D 0, 1, 2, 3, 7
    TRANSPOSE2x4x4W 0, 1, 2, 3, 7
    DCT4_1D 4, 5, 6, 7, 2
    TRANSPOSE2x4x4W 4, 5, 6, 7, 2
    DCT4_1D 0, 1, 2, 3, 6
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 3
    STORE_DCT 4, 5, 6, 7, r0, 64
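
; sub8x8_dct treats the 8x8 block as four 4x4 DCTs, two per register pass:
; DCT4_1D transforms the columns, TRANSPOSE2x4x4W transposes the two 4x4
; halves of each register independently, DCT4_1D then does the rows, and
; STORE_DCT writes the sub-blocks out in dct[4][4][4] order.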
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8, 3,3
    add r2, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
    LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
    LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
    LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
    LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
    LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x40],[r0+0x60]
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7, [r0+0x60],[r0+0x40], 1
    DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x40],[r0+0x60]
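
; x86_32 has only eight XMM registers, so an 8x8 transpose cannot be held
; entirely in regs: TRANSPOSE8x8W is passed two scratch slots (and a flag
; selecting the memory-assisted variant), and each DCT8_1D pass likewise
; leaves outputs 0/4/6 in its three trailing memory operands.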
%define movdqa movaps
%define punpcklqdq movlhps
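
; Size optimization: movaps/movlhps are functionally interchangeable with
; movdqa/punpcklqdq here (the data is only moved, never arithmetically
; reinterpreted) and each encodes one byte shorter, which adds up in this
; heavily macro-expanded 32-bit code.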
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct, 2,2
    add r0, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
    UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
    SBUTTERFLY qdq, 4, 5, 0
    SBUTTERFLY qdq, 6, 7, 0
    IDCT4_1D w, 0,1,2,3, r1
    TRANSPOSE2x4x4W 0,1,2,3,4
    IDCT4_1D w, 4,5,6,7, r1
    TRANSPOSE2x4x4W 4,5,6,7,0
    IDCT4_1D w, 0,1,2,3, r1
    IDCT4_1D w, 4,5,6,7, r1
    DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] ; m5
    DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] ; m5
    UNSPILL_SHUFFLE r1, 0,2, 6,7
    DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] ; m5
    DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] ; m5
    STORE_IDCT m1, m3, m5, m2
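
; The inverse mirrors sub8x8_dct: SBUTTERFLY qdq pairs coefficient quadwords
; so each register holds rows from two 4x4 sub-blocks, IDCT4_1D and
; TRANSPOSE2x4x4W then run two 4x4 idcts per pass, and DIFFx2/STORE_IDCT add
; the residual back into the frame with saturation.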
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
    add r0, 4*FDEC_STRIDE
global current_function %+ .skip_prologue
    UNSPILL r1, 1,2,3,5,6,7
    IDCT8_1D w, 0,1,2,3,4,5,6,7, [r1+0],[r1+64]
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7, [r1+0x60],[r1+0x40], 1
    IDCT8_1D w, 0,1,2,3,4,5,6,7, [r1+0],[r1+64]
    DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] ; m5
    DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] ; m5
    UNSPILL_SHUFFLE r1, 0,2, 6,7
    DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] ; m5
    DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] ; m5
    STORE_IDCT m1, m3, m5, m2
%endmacro ; ADD8x8_IDCT8

%endif ; !HIGH_BIT_DEPTH