1 ;*****************************************************************************
2 ;* dct-32.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
7 ;* Min Chen <chenm001@163.com> (converted to nasm)
8 ;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
9 ;* Loren Merritt <lorenm@u.washington.edu> (misc)
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
24 ;*****************************************************************************
55 ; input ABCD output ADTC
56 %macro TRANSPOSE4x4W 5
; Transpose a 4x4 block of 16-bit words held in regs %1-%4, with %5 as the
; temporary: two word-interleave passes, then two dword-interleave passes.
; Per the comment above, the transposed rows come out in order A,D,T,C
; (A=%1, D=%4, T=the temp %5, C=%3).
57 SBUTTERFLY q, wd, %1, %2, %5
58 SBUTTERFLY q, wd, %3, %4, %2
59 SBUTTERFLY q, dq, %1, %3, %4
60 SBUTTERFLY q, dq, %5, %2, %3
; NOTE(review): the closing %endmacro is not visible in this excerpt
; (embedded line numbers jump 60 -> 63) -- confirm against the full file.
63 ; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
77 %macro LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
83 %macro STORE_DIFF_8P 4
93 ;-----------------------------------------------------------------------------
94 ; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
95 ;-----------------------------------------------------------------------------
97 x264_pixel_sub_8x8_mmx:
; Per the LOAD_DIFF_8P contract noted earlier in the file, each invocation
; unpacks two rows of 8 unsigned bytes ([r1], [r2]) against the zero reg mm7
; and produces their int16 difference in mm0/mm1.
101 LOAD_DIFF_8P mm0, mm1, mm2, mm3, [r1], [r2], mm7
; NOTE(review): the remaining row iterations, the stores to the diff buffer
; and the ret (content lines 98-109) are elided from this excerpt -- confirm
; against the full file.
110 ;-----------------------------------------------------------------------------
111 ; void x264_ydct8_mmx( int16_t dest[8][8] );
112 ;-----------------------------------------------------------------------------
; In-place vertical 8x8 DCT over int16 coefficients at r0.  MMX regs hold 4
; words, so each pass handles 4 columns; i is the per-pass byte offset.
; sNM/dNM = butterfly sum/difference of rows N and M; a0..a7 = the standard
; H.264 8x8 forward-transform intermediates.
; NOTE(review): the embedded line numbers jump throughout -- the movq/psraw
; copy-and-shift instructions producing the >>1 and >>2 terms referenced in
; the comments below are elided from this excerpt; confirm against the full
; file.
115 ;-------------------------------------------------------------------------
116 ; vertical dct ( compute 4 columns at a time -> 2 loops )
117 ;-------------------------------------------------------------------------
; Stage 1: butterflies of mirrored row pairs (0/7, 1/6, 2/5, 3/4).
121 LOADSUMSUB mm2, mm3, [r0+i+0*16], [r0+i+7*16] ; mm2 = s07, mm3 = d07
122 LOADSUMSUB mm1, mm5, [r0+i+1*16], [r0+i+6*16] ; mm1 = s16, mm5 = d16
123 LOADSUMSUB mm0, mm6, [r0+i+2*16], [r0+i+5*16] ; mm0 = s25, mm6 = d25
124 LOADSUMSUB mm4, mm7, [r0+i+3*16], [r0+i+4*16] ; mm4 = s34, mm7 = d34
; Stage 2: even half (a0..a3) -> output rows 0, 4, 2, 6.
126 SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2
127 SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3
128 SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm4 = dst4
130 movq [r0+i+0*16], mm0
131 movq [r0+i+4*16], mm4
135 paddw mm0, mm2 ; a2 + (a3>>1)
137 psubw mm2, mm1 ; (a2>>1) - a3
139 movq [r0+i+2*16], mm0
140 movq [r0+i+6*16], mm2
; Stage 3: odd half (a4..a7) -> output rows 1, 3, 5, 7.
144 paddw mm0, mm6 ; d25+(d25>>1)
146 psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1))
151 paddw mm0, mm5 ; d16+(d16>>1)
153 paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1))
158 paddw mm0, mm3 ; d07+(d07>>1)
160 paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1))
164 paddw mm3, mm7 ; d34+(d34>>1)
166 psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1))
170 paddw mm7, mm0 ; a4 + (a7>>2)
174 paddw mm6, mm1 ; a5 + (a6>>2)
178 psubw mm0, mm3 ; (a4>>2) - a7
179 psubw mm2, mm1 ; a6 - (a5>>2)
181 movq [r0+i+1*16], mm7
182 movq [r0+i+3*16], mm6
183 movq [r0+i+5*16], mm2
184 movq [r0+i+7*16], mm0
190 ;-----------------------------------------------------------------------------
191 ; void x264_yidct8_mmx( int16_t dest[8][8] );
192 ;-----------------------------------------------------------------------------
; In-place vertical 8x8 inverse DCT over int16 coefficients at r0, 4 columns
; per pass.  dN = input row N; eN/fN/gN = the H.264 inverse-transform
; intermediate stages.
; NOTE(review): the embedded line numbers jump -- the psraw shift
; instructions and several register moves feeding the e/f computations are
; elided from this excerpt; confirm against the full file.
195 ;-------------------------------------------------------------------------
196 ; vertical idct ( compute 4 columns at a time -> 2 loops )
197 ;-------------------------------------------------------------------------
; Odd-half inputs (rows 1,3,5,7).
201 movq mm1, [r0+i+1*16] ; mm1 = d1
202 movq mm3, [r0+i+3*16] ; mm3 = d3
203 movq mm5, [r0+i+5*16] ; mm5 = d5
204 movq mm7, [r0+i+7*16] ; mm7 = d7
211 psubw mm0, mm3 ; mm0 = e1
218 paddw mm2, mm1 ; mm2 = e3
224 psubw mm4, mm1 ; mm4 = e5
230 paddw mm6, mm3 ; mm6 = e7
240 paddw mm1, mm6 ; mm1 = f1
241 paddw mm3, mm2 ; mm3 = f3
242 psubw mm5, mm4 ; mm5 = f5
243 psubw mm7, mm0 ; mm7 = f7
; Even-half inputs (rows 2,6 then 0,4).
245 movq mm2, [r0+i+2*16] ; mm2 = d2
246 movq mm6, [r0+i+6*16] ; mm6 = d6
251 psubw mm4, mm0 ; mm4 = a4
252 paddw mm6, mm2 ; mm6 = a6
254 movq mm2, [r0+i+0*16] ; mm2 = d0
255 movq mm0, [r0+i+4*16] ; mm0 = d4
256 SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
; Final butterflies combining even (f0..f6) and odd (f1..f7) halves.
258 SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
261 SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
263 SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
; Write back rows 0..7 (register -> row mapping per the g-value comments).
266 movq [r0+i+0*16], mm7
267 movq [r0+i+1*16], mm5
268 movq [r0+i+2*16], mm3
269 movq [r0+i+3*16], mm1
270 movq [r0+i+4*16], mm0
271 movq [r0+i+5*16], mm2
272 movq [r0+i+6*16], mm4
273 movq [r0+i+7*16], mm6
279 ;-----------------------------------------------------------------------------
280 ; void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
281 ;-----------------------------------------------------------------------------
283 x264_pixel_add_8x8_mmx:
; Per its signature, adds the int16 residual block src onto the uint8 dst
; pixels.
; NOTE(review): the entire body (content lines 284-302) is elided from this
; excerpt -- confirm behavior against the full file.
304 ;-----------------------------------------------------------------------------
305 ; void x264_transpose_8x8_mmx( int16_t src[8][8] );
306 ;-----------------------------------------------------------------------------
308 x264_transpose_8x8_mmx:
; In-place 8x8 word transpose composed of four 4x4 sub-transposes: MMX regs
; hold only 4 words, so the 8x8 matrix is handled as 2x2 tiles of 4x4.
313 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
323 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
333 TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
343 TRANSPOSE4x4W mm1, mm5, mm6, mm7, mm4
; NOTE(review): the loads/stores surrounding each sub-transpose and the ret
; are elided from this excerpt -- confirm against the full file.
350 ;-----------------------------------------------------------------------------
351 ; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
352 ;-----------------------------------------------------------------------------
353 cglobal x264_sub8x8_dct8_mmx, 3,3
; Forward 8x8 DCT: pixel difference, then a transform pass with a transpose
; between the vertical passes.
354 call x264_pixel_sub_8x8_mmx
356 call x264_transpose_8x8_mmx
; NOTE(review): content lines 355 and 357-358 (presumably the ydct8 calls
; and the ret) are elided from this excerpt -- confirm against the full
; file.
359 ;-----------------------------------------------------------------------------
360 ; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
361 ;-----------------------------------------------------------------------------
362 cglobal x264_add8x8_idct8_mmx, 0,1
366 call x264_transpose_8x8_mmx
; Tail-call: x264_pixel_add_8x8_mmx performs the final pixel add and its ret
; returns directly to our caller.
370 jmp x264_pixel_add_8x8_mmx
; NOTE(review): interior lines (content 363-365, 367-369 -- presumably
; argument setup and the yidct8 calls) are elided from this excerpt --
; confirm against the full file.
407 movdqa %1, [eax+0x00]
408 movdqa %6, [eax+0x40]
420 SBUTTERFLY dqa, wd, %1, %2, %8
423 SBUTTERFLY dqa, wd, %3, %4, %2
424 SBUTTERFLY dqa, wd, %5, %6, %4
425 SBUTTERFLY dqa, wd, %7, %8, %6
426 SBUTTERFLY dqa, dq, %1, %3, %8
429 SBUTTERFLY dqa, dq, %8, %2, %3
430 SBUTTERFLY dqa, dq, %5, %7, %2
431 SBUTTERFLY dqa, dq, %4, %6, %7
432 SBUTTERFLY dqa, qdq, %1, %5, %6
433 SBUTTERFLY dqa, qdq, %8, %4, %5
436 SBUTTERFLY dqa, qdq, %8, %2, %4
437 SBUTTERFLY dqa, qdq, %3, %7, %2
441 ;-----------------------------------------------------------------------------
442 ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
443 ;-----------------------------------------------------------------------------
444 cglobal x264_add8x8_idct8_sse2
; Usage below implies eax = dct coefficient base and ecx = p_dst; the
; argument loads and the movdqa of xmm0/xmm4 (content lines 445-446) are
; elided -- NOTE(review): confirm against the full file.
447 movdqa xmm1, [eax+0x10]
448 movdqa xmm2, [eax+0x20]
449 movdqa xmm3, [eax+0x30]
450 movdqa xmm5, [eax+0x50]
451 movdqa xmm6, [eax+0x60]
452 movdqa xmm7, [eax+0x70]
; First 1-D IDCT pass, then a full 8x8 word transpose (TRANSPOSE8 uses eax
; as its memory scratch base).
453 IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
454 TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax
; Add the +32 rounding bias before the second pass (the matching downshift
; is not visible in this excerpt -- NOTE(review): confirm it lives in
; STORE_DIFF_8P).
456 paddw xmm4, [pw_32 GLOBAL]
457 movdqa [eax+0x00], xmm4
458 movdqa [eax+0x40], xmm2
459 IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1
; Spill rows 6/7 so xmm6/xmm7 can be reused as STORE_DIFF_8P scratch.
460 movdqa [eax+0x60], xmm6
461 movdqa [eax+0x70], xmm7
463 STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7
464 STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7
465 STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7
466 STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7
467 STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7
468 STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7
; Reload the spilled rows for the last two stores.
469 movdqa xmm0, [eax+0x60]
470 movdqa xmm1, [eax+0x70]
471 STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
472 STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
; NOTE(review): the ret (content line ~473) is elided from this excerpt.
475 ;-----------------------------------------------------------------------------
476 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
477 ;-----------------------------------------------------------------------------
490 add dword [esp+0], %3
491 add dword [esp+4], %4*FENC_STRIDE-%4
492 add dword [esp+8], %4*FDEC_STRIDE-%4
494 add dword [esp+0], %3
495 add dword [esp+4], %4
496 add dword [esp+8], %4
502 ;-----------------------------------------------------------------------------
503 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
504 ;-----------------------------------------------------------------------------
505 %macro ADD_NxN_IDCT 4
; Builds an NxN idct wrapper (%1 = new function name) out of repeated calls
; to a half-size idct (%2), judging by the instantiations below; %3 appears
; to be the byte size of one sub-block of dct coefficients and %4 the
; sub-block pixel width.  The stack args [esp+0] (pix) and [esp+4] (dct) are
; advanced between sub-calls.
514 add dword [esp+0], %4*FDEC_STRIDE-%4
515 add dword [esp+4], %3
517 add dword [esp+0], %4
518 add dword [esp+4], %3
; NOTE(review): the call instructions, prologue/epilogue and %endmacro
; (content lines 506-513, 516, 519-523) are elided from this excerpt --
; confirm against the full file.
524 SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8
525 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
527 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8
529 ;-----------------------------------------------------------------------------
530 ; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
531 ;-----------------------------------------------------------------------------
532 cglobal x264_zigzag_scan_4x4_field_mmx
537 punpcklwd mm2, [edx+8]
538 punpckhwd mm3, [edx+8]
539 punpcklwd mm4, [edx+16]
540 punpckhwd mm5, [edx+16]
541 punpcklwd mm6, [edx+24]
542 punpckhwd mm7, [edx+24]