1 ;*****************************************************************************
2 ;* MMX/SSE2-optimized H.264 iDCT
3 ;*****************************************************************************
4 ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
5 ;* Copyright (C) 2003-2008 x264 project
7 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Loren Merritt <lorenm@u.washington.edu>
9 ;* Holger Lubitz <hal@duncan.ol.sub.de>
10 ;* Min Chen <chenm001.163.com>
12 ;* This file is part of FFmpeg.
14 ;* FFmpeg is free software; you can redistribute it and/or
15 ;* modify it under the terms of the GNU Lesser General Public
16 ;* License as published by the Free Software Foundation; either
17 ;* version 2.1 of the License, or (at your option) any later version.
19 ;* FFmpeg is distributed in the hope that it will be useful,
20 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 ;* Lesser General Public License for more details.
24 ;* You should have received a copy of the GNU Lesser General Public
25 ;* License along with FFmpeg; if not, write to the Free Software
26 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 ;*****************************************************************************
30 %include "x86util.asm"
34 ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
; scan8_mem: maps a 4x4 luma block index (0..15) to its offset in the 8-wide
; "scan8"/non-zero-count cache layout used by the H.264 decoder (row*8+col
; encoding, visible in the N+M*8 arithmetic below). Duplicate of the table in
; h264data.h per the FIXME above.
35 scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
36 db 6+1*8, 7+1*8, 6+2*8, 7+2*8
37 db 4+3*8, 5+3*8, 4+4*8, 5+4*8
38 db 6+3*8, 7+3*8, 6+4*8, 7+4*8
; Alias so code can reference the table as "scan8".
; NOTE(review): lines elided in this excerpt between the table and the %define.
46 %define scan8 scan8_mem
; 4x4 iDCT + add-to-destination body (the %macro line is elided in this excerpt).
; Classic separable 2-D transform: 1-D iDCT on rows, transpose, 1-D iDCT on
; columns, then add the residual to the destination pixels.
53 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; First 1-D pass over registers m0-m3 (m4, m5 as scratch).
61 IDCT4_1D 0, 1, 2, 3, 4, 5
63 TRANSPOSE4x4W 0, 1, 2, 3, 4
; Second 1-D pass (columns after the transpose).
65 IDCT4_1D 0, 1, 2, 3, 4, 5
; Add results to dst two rows at a time; the "6" argument is presumably the
; final >>6 descale applied by STORE_DIFFx2 — confirm in x86util.asm.
68 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
70 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
; Public entry: 4x4 iDCT + add for one block, MMX version.
; 3 args (dst, block, stride), 3 GPRs used, 0 XMM regs (pure MMX).
74 ; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
75 cglobal h264_idct_add_mmx, 3, 3, 0
; NOTE(review): large gap in this excerpt (orig lines 76-126 elided); the SWAP
; below belongs to elided 8-point-transform code, restoring natural register
; order as its own comment indicates.
127 SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
; IDCT8_1D_FULL: one full 1-D 8-point iDCT pass over a block in memory.
; %1 = base address; the two transform halves sit 64 bytes apart
; (8 rows * 8 int16 = 128 bytes total, split at +64).
; (body partially elided in this excerpt)
130 %macro IDCT8_1D_FULL 1
137 IDCT8_1D [%1], [%1+ 64]
; First half of the 8x8 MMX iDCT: transform + transpose into a scratch buffer.
; MMX regs are only 4 words wide, so the 8x8 transpose is done as two 4x4
; transposes (body partially elided in this excerpt).
140 ; %1=int16_t *block, %2=int16_t *dstblock
141 %macro IDCT8_ADD_MMX_START 2
144 TRANSPOSE4x4W 0, 1, 2, 3, 7
150 TRANSPOSE4x4W 4, 5, 6, 7, 3
; Second half of the 8x8 MMX iDCT: column pass + add to destination pixels,
; two rows per STORE_DIFFx2 ("6" = final descale shift — confirm in x86util.asm).
; (body partially elided in this excerpt)
157 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
158 %macro IDCT8_ADD_MMX_END 3
165 STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
167 STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
172 STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
174 STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
; Public entry: 8x8 iDCT + add, MMX version. Uses a 128-byte aligned stack
; scratch buffer; the block is processed as two 4-column halves (block and
; block+8 bytes) because MMX registers hold only 4 words.
178 ; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
179 cglobal h264_idct8_add_mmx, 3, 4, 0
; 128 bytes of scratch + padding to realign rsp to 8 (stack_offset from x86inc).
180 %assign pad 128+4-(stack_offset&7)
184 IDCT8_ADD_MMX_START r1 , rsp
185 IDCT8_ADD_MMX_START r1+8, rsp+64
187 IDCT8_ADD_MMX_END r0 , rsp, r2
; r3 presumably holds dst advanced by 4 rows (set in elided lines) — confirm.
188 IDCT8_ADD_MMX_END r3 , rsp+8, r2
; Full 8x8 iDCT + add in SSE2: whole 8-word rows fit in XMM regs, so this is
; transform / 8x8 transpose / transform / store, with the x86-32 variant of
; TRANSPOSE8x8W spilling through memory at [%2] (body partially elided).
193 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
194 %macro IDCT8_ADD_SSE 4
; Register-only transpose (needs a 9th reg, m8 — x86-64 path).
197 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
; Memory-assisted transpose variant (x86-32 path).
199 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
206 IDCT8_1D [%2], [%2+ 16]
; Add first four result rows to dst; %4 is presumably stride*3 (note the
; %3*2 then %4 progression) — confirm against the elided caller setup.
219 STORE_DIFF m0, m6, m7, [%1 ]
220 STORE_DIFF m1, m6, m7, [%1+%3 ]
221 STORE_DIFF m2, m6, m7, [%1+%3*2]
222 STORE_DIFF m3, m6, m7, [%1+%4 ]
; Last four rows (dst advance happens in elided lines).
231 STORE_DIFF m4, m6, m7, [%1 ]
232 STORE_DIFF m5, m6, m7, [%1+%3 ]
233 STORE_DIFF m0, m6, m7, [%1+%3*2]
234 STORE_DIFF m1, m6, m7, [%1+%4 ]
; Public entry: 8x8 iDCT + add, SSE2 version. 10 XMM regs declared for the
; register-only 8x8 transpose path; r3 is free as scratch for IDCT8_ADD_SSE.
238 ; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
239 cglobal h264_idct8_add_sse2, 3, 4, 10
240 IDCT8_ADD_SSE r0, r1, r2, r3
; DC-only shortcut helpers (bodies elided in this excerpt):
; DC_ADD_MMX2_INIT broadcasts the block's DC coefficient into +dc/-dc vectors;
; DC_ADD_MMX2_OP applies them to destination rows via saturating add/subtract.
; The optional 3rd/4th argument (2-3 / 3-4 ranges) is a scratch register.
243 %macro DC_ADD_MMX2_INIT 2-3
263 %macro DC_ADD_MMX2_OP 3-4
; Public entry: DC-only 4x4 "iDCT" + add (only block[0] is non-zero).
; movh = 4-byte stores, one 4x4 block.
283 ; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
284 cglobal h264_idct_dc_add_mmx2, 3, 3, 0
285 DC_ADD_MMX2_INIT r1, r2
286 DC_ADD_MMX2_OP movh, r0, r2, r1
; Public entry: DC-only 8x8 iDCT + add. mova = 8-byte stores; the op is issued
; twice to cover all 8 rows (dst advance happens in the elided line between).
289 ; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
290 cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
291 DC_ADD_MMX2_INIT r1, r2
292 DC_ADD_MMX2_OP mova, r0, r2, r1
294 DC_ADD_MMX2_OP mova, r0, r2, r1
; Loop over the 16 luma 4x4 blocks of a macroblock, adding the iDCT of each
; block whose nnzc (non-zero count) entry is set (loop skeleton elided).
297 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
298 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
299 cglobal h264_idct_add16_mmx, 5, 7, 0
; r5 = block index; scan8 maps it into the nnzc cache, then test nnzc[...]
305 movzx r6, byte [scan8+r5]
306 movzx r6, byte [r4+r6]
; Per-block destination offset from block_offset[r5].
309 mov r6d, dword [r1+r5*4]
; Loop over the four 8x8 luma blocks of a macroblock (MMX path), using a
; 128-byte stack scratch buffer like h264_idct8_add_mmx (skeleton elided).
319 ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
320 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
321 cglobal h264_idct8_add4_mmx, 5, 7, 0
322 %assign pad 128+4-(stack_offset&7)
; Skip blocks with no non-zero coefficients (nnzc lookup via scan8).
330 movzx r6, byte [scan8+r5]
331 movzx r6, byte [r4+r6]
334 mov r6d, dword [r1+r5*4]
; Two 4-column halves through the scratch buffer, then the two ENDs.
337 IDCT8_ADD_MMX_START r2 , rsp
338 IDCT8_ADD_MMX_START r2+8, rsp+64
339 IDCT8_ADD_MMX_END r6 , rsp, r3
; Reload the dst offset (r6 was clobbered); elided code presumably advances
; it by 4 rows for the second half — confirm.
340 mov r6d, dword [r1+r5*4]
342 IDCT8_ADD_MMX_END r6 , rsp+8, r3
; MMX2 variant of add16: additionally fast-paths DC-only blocks through
; DC_ADD_MMX2_* instead of the full 4x4 transform (loop skeleton elided).
351 ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
352 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
353 cglobal h264_idct_add16_mmx2, 5, 7, 0
359 movzx r6, byte [scan8+r5]
360 movzx r6, byte [r4+r6]
; DC-only path: broadcast the DC value, then add it to the block's pixels.
368 DC_ADD_MMX2_INIT r2, r3, r6
; On x86-64 a spare GPR (r10) holds dst; elided %else branch presumably
; covers x86-32 — confirm.
371 %define dst_regd r10d
376 mov dst_regd, dword [r1+r5*4]
377 lea dst_reg, [r0+dst_reg]
378 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
; Full-transform path for blocks with AC coefficients.
388 mov r6d, dword [r1+r5*4]
; Intra variant of add16 (MMX): per H.264, for intra16x16 the DC path is taken
; on different conditions than the inter case (skeleton elided in excerpt).
398 ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
399 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
400 cglobal h264_idct_add16intra_mmx, 5, 7, 0
406 movzx r6, byte [scan8+r5]
407 movzx r6, byte [r4+r6]
411 mov r6d, dword [r1+r5*4]
; Intra variant of add16 (MMX2): full 4x4 transform for AC blocks, DC-only
; shortcut otherwise (loop skeleton elided in this excerpt).
421 ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
422 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
423 cglobal h264_idct_add16intra_mmx2, 5, 7, 0
429 movzx r6, byte [scan8+r5]
430 movzx r6, byte [r4+r6]
433 mov r6d, dword [r1+r5*4]
; DC-only path.
445 DC_ADD_MMX2_INIT r2, r3, r6
; x86-64 spare-register destination (see add16_mmx2).
448 %define dst_regd r10d
453 mov dst_regd, dword [r1+r5*4]
454 lea dst_reg, [r0+dst_reg]
455 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
; MMX2 variant of the 4x 8x8 loop: DC-only 8x8 blocks take the broadcast-add
; shortcut (8 rows = two DC_ADD_MMX2_OP passes), others run the full
; scratch-buffer transform (loop skeleton elided in this excerpt).
466 ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
467 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
468 cglobal h264_idct8_add4_mmx2, 5, 7, 0
469 %assign pad 128+4-(stack_offset&7)
477 movzx r6, byte [scan8+r5]
478 movzx r6, byte [r4+r6]
; DC-only 8x8 path.
486 DC_ADD_MMX2_INIT r2, r3, r6
489 %define dst_regd r10d
494 mov dst_regd, dword [r1+r5*4]
495 lea dst_reg, [r0+dst_reg]
496 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
; Advance dst by 4 rows for the lower half of the 8x8 block.
497 lea dst_reg, [dst_reg+r3*4]
498 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
; Full-transform path (same structure as h264_idct8_add4_mmx).
510 mov r6d, dword [r1+r5*4]
513 IDCT8_ADD_MMX_START r2 , rsp
514 IDCT8_ADD_MMX_START r2+8, rsp+64
515 IDCT8_ADD_MMX_END r6 , rsp, r3
516 mov r6d, dword [r1+r5*4]
518 IDCT8_ADD_MMX_END r6 , rsp+8, r3
; SSE2 variant of the 4x 8x8 loop: DC-only blocks still use the MMX2 DC
; helpers; AC blocks run the one-shot SSE2 transform IDCT8_ADD_SSE
; (loop skeleton elided in this excerpt).
529 ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
530 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
531 cglobal h264_idct8_add4_sse2, 5, 7, 10
537 movzx r6, byte [scan8+r5]
538 movzx r6, byte [r4+r6]
; DC-only 8x8 path (MMX2 helpers, two passes of 4 rows).
547 DC_ADD_MMX2_INIT r2, r3, r6
550 %define dst_regd r10d
555 mov dst_regd, dword [r1+r5*4]
556 lea dst_reg, [r0+dst_reg]
557 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
558 lea dst_reg, [dst_reg+r3*4]
559 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
; Full SSE2 8x8 transform path; r6 is the scratch argument.
570 mov dst_regd, dword [r1+r5*4]
571 lea dst_reg, [r0+dst_reg]
572 IDCT8_ADD_SSE dst_reg, r2, r3, r6
; Internal helper (not cglobal): process the 4x4 blocks of one chroma plane.
; Called twice by h264_idct_add8_mmx, once per plane (body partially elided).
584 h264_idct_add8_mmx_plane:
586 movzx r6, byte [scan8+r5]
587 movzx r6, byte [r4+r6]
; x86-64 path: dst pointer array is addressable directly.
592 mov r0d, dword [r1+r5*4]
; x86-32 path: re-fetch dest from the caller's stack args.
595 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
597 add r0, dword [r1+r5*4]
; Public entry: chroma add (dest is uint8_t** — one pointer per plane).
; Calls the plane helper twice; per-plane setup is elided in this excerpt.
607 ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
608 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
609 cglobal h264_idct_add8_mmx, 5, 7, 0
618 call h264_idct_add8_mmx_plane
624 call h264_idct_add8_mmx_plane
; Internal helper (not cglobal): process the 4x4 blocks of one chroma plane,
; MMX2 version — adds a DC-only fast path via DC_ADD_MMX2_* on top of the
; mmx-plane logic (body partially elided in this excerpt).
; Fix: label now carries the trailing colon, consistent with the other helper
; labels in this file (h264_idct_add8_mmx_plane:, h264_idct_dc_add8_mmx2:, ...);
; colon-less labels trigger NASM's orphan-label warning.
627 h264_idct_add8_mmx2_plane:
629 movzx r6, byte [scan8+r5]
630 movzx r6, byte [r4+r6]
; x86-64 path: compute dst from the plane pointer + block_offset[r5].
634 mov r0d, dword [r1+r5*4]
; x86-32 path: re-fetch dest from the caller's stack args.
637 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
639 add r0, dword [r1+r5*4]
; DC-only fast path.
651 DC_ADD_MMX2_INIT r2, r3, r6
653 mov r0d, dword [r1+r5*4]
656 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
658 add r0, dword [r1+r5*4]
660 DC_ADD_MMX2_OP movh, r0, r3, r6
; Public entry: chroma add, MMX2 version — same two-plane structure as the
; MMX variant but using the mmx2 plane helper (per-plane setup elided).
668 ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
669 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
670 cglobal h264_idct_add8_mmx2, 5, 7, 0
679 call h264_idct_add8_mmx2_plane
685 call h264_idct_add8_mmx2_plane
; Internal helper: add the DC values of TWO adjacent 4x4 chroma blocks
; (at block and block+32) to an 8-pixel-wide destination in one pass.
; Builds a vector of +dc bytes and a vector of -dc bytes, then applies
; them with saturating adds via DC_ADD_MMX2_OP (some lines elided —
; presumably the +32 rounding and >>6 descale — confirm).
689 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
690 h264_idct_dc_add8_mmx2:
691 movd m0, [r2 ] ; 0 0 X D
692 punpcklwd m0, [r2+32] ; x X d D
695 punpcklwd m0, m0 ; d d D D
696 pxor m1, m1 ; 0 0 0 0
697 psubw m1, m0 ; -d-d-D-D
698 packuswb m0, m1 ; -d-d-D-D d d D D
699 pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
700 punpcklwd m0, m0 ; d d d d D D D D
702 DC_ADD_MMX2_OP movq, r0, r3, r6
; Internal helper (from x264): iDCT + add for two 4x4 blocks at once (8x4
; pixels), using SSE2's width to run both transforms in parallel
; (transform passes partially elided in this excerpt).
707 ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
708 x264_add8x4_idct_sse2:
; Transpose two interleaved 4x4 word matrices in one go.
718 TRANSPOSE2x4x4W 0,1,2,3,4
722 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
724 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
; One unrolled iteration of the add16 SSE2 loop: %1 = pair index (selects a
; 2-block chunk of `block`), %2 = byte offset into the nnzc array (a 16-bit
; load tests two blocks' nnzc entries at once). Body partially elided.
727 %macro add16_sse2_cycle 2
728 movzx r0, word [r4+%2]
731 mov r0d, dword [r1+%1*8]
737 call x264_add8x4_idct_sse2
; Public entry: add16, SSE2 — fully unrolled as 8 two-block cycles.
; The hex nnzc offsets follow the scan8 cache layout (interleaved rows),
; which is why they are non-monotonic (0xc, 0x14, 0xe, 0x16, ...).
744 ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
745 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
746 cglobal h264_idct_add16_sse2, 5, 5, 8
750 ; unrolling of the loop leads to an average performance gain of
752 add16_sse2_cycle 0, 0xc
753 add16_sse2_cycle 1, 0x14
754 add16_sse2_cycle 2, 0xe
755 add16_sse2_cycle 3, 0x16
756 add16_sse2_cycle 4, 0x1c
757 add16_sse2_cycle 5, 0x24
758 add16_sse2_cycle 6, 0x1e
759 add16_sse2_cycle 7, 0x26
; Intra variant of add16, SSE2: two blocks per iteration via the x264 helper;
; DC-only pairs go through the two-block MMX2 DC helper instead
; (loop skeleton elided in this excerpt).
762 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
763 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
764 cglobal h264_idct_add16intra_sse2, 5, 7, 8
; Word-sized nnzc load tests two adjacent blocks at once.
773 movzx r0, byte [scan8+r5]
774 movzx r0, word [r4+r0]
777 mov r0d, dword [r1+r5*4]
783 call x264_add8x4_idct_sse2
; DC-only path for the block pair.
793 mov r0d, dword [r1+r5*4]
799 call h264_idct_dc_add8_mmx2
; Internal helper: one chroma plane, SSE2 — two 4x4 blocks per iteration,
; with a DC-only shortcut through the MMX2 two-block DC helper
; (body partially elided in this excerpt).
807 h264_idct_add8_sse2_plane:
809 movzx r0, byte [scan8+r5]
810 movzx r0, word [r4+r0]
; x86-64 path: dst from plane pointer + block_offset[r5].
814 mov r0d, dword [r1+r5*4]
; x86-32 path: re-fetch dest from the caller's stack args.
817 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
819 add r0, dword [r1+r5*4]
821 call x264_add8x4_idct_sse2
; DC-only path for the block pair.
832 mov r0d, dword [r1+r5*4]
835 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
837 add r0, dword [r1+r5*4]
839 call h264_idct_dc_add8_mmx2
; Public entry: chroma add, SSE2 — two-plane structure like the MMX versions,
; delegating to the sse2 plane helper (per-plane setup elided).
847 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
848 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
849 cglobal h264_idct_add8_sse2, 5, 7, 8
858 call h264_idct_add8_sse2_plane
864 call h264_idct_add8_sse2_plane