;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
%define scan8 scan8_mem

cextern pw_32

SECTION .text
INIT_MMX
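; scan8[i] maps block index i (0-15 luma, 16-23 chroma) to its position in
; the decoder's 8-entries-per-row non-zero-count cache, so nnzc[scan8[i]]
; tells whether block i has any coefficients at all.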
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; load the dct coefficients
    movq         m0, [%2   ]
    movq         m1, [%2+ 8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      0, 1, 2, 3, 4, 5
    pxor         m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
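; Reference for one IDCT4_1D pass (cf. ff_h264_idct_add_c):
;     z0 = src0 + src2            z1 = src0 - src2
;     z2 = (src1 >> 1) - src3     z3 = src1 + (src3 >> 1)
;     dst0 = z0 + z3   dst1 = z1 + z2   dst2 = z1 - z2   dst3 = z0 - z3
; Two passes around a transpose; the +32 folded in via pw_32 makes
; STORE_DIFFx2's >>6 round to nearest before adding to dst with saturation.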
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
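; Reference for the 8-point transform (cf. ff_h264_idct8_add_c). Even half:
;     a0 = x0 + x4            a2 = x0 - x4
;     a4 = (x2 >> 1) - x6     a6 = x2 + (x6 >> 1)
;     b0 = a0 + a6   b2 = a2 + a4   b4 = a2 - a4   b6 = a0 - a6
; The odd half combines x1, x3, x5, x7 with >>1 terms and then >>2 cross
; terms, and each output row is b(even) +/- b(odd). The butterflies leave
; the rows permuted across m0-m7; SWAP above is assemble-time register
; renaming (x86util), so restoring the order costs no instructions.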
%macro IDCT8_1D_FULL 1
    IDCT8_1D   [%1], [%1+ 64]
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    TRANSPOSE4x4W 4, 5, 6, 7, 3
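; MMX registers hold only four coefficients, so the 8x8 transform is done as
; two 8x4 halves: _START runs the row pass and transposes the halves into a
; 128-byte stack scratch buffer, _END runs the column pass on each half and
; adds the result to dst.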
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_mmx, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
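; pad rounds the stack allocation so rsp stays 8-byte aligned after x86inc's
; pushes (tracked in stack_offset) while leaving 128 bytes of scratch for the
; two transposed 8x4 halves; the second _END call writes the right half at
; dst+4 (r3).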
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    IDCT8_1D   [%2], [%2+ 16]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]

    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
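; With 128-bit registers the whole 8x8 block fits in m0-m7, so SSE2 needs no
; stack buffer: one 1-D pass, one 8x8 transpose, a second pass, then eight
; STORE_DIFF rows (%4 carries 3*stride, so each group above covers four rows).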
INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
%macro DC_ADD_MMX2_INIT 2-3
%macro DC_ADD_MMX2_OP 3-4
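; DC-only blocks skip the full transform (cf. ff_h264_idct_dc_add_c):
;     dc = (block[0] + 32) >> 6
;     dst[x] += dc  for every pixel, clipped to [0,255]
; _INIT splats +dc and -dc as packed bytes; _OP then applies them to four
; rows at a time with saturating paddusb/psubusb, getting the clip for free.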
INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP   movh, r0, r2, r1
    RET
; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP   mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMX2_OP   mova, r0, r2, r1
    RET
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx, 5, 7, 0
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    mov         r6d, dword [r1+r5*4]
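; Loop pattern shared by the add16/add8/add4 entry points: r5 is the block
; index, nnzc[scan8[i]] is checked first so blocks without coefficients are
; skipped, and a surviving block's iDCT result is added at
; dst + block_offset[i] (block_offset is the int array in r1).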
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    mov         r6d, dword [r1+r5*4]
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx2, 5, 7, 0
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
    mov         r6d, dword [r1+r5*4]
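; Dispatch used by the mmx2/sse2 variants: a zero nnz byte with a nonzero DC
; coefficient takes the cheap DC_ADD path, anything else falls through to the
; full iDCT. dst_reg aliases a scratch register (r10 on x86-64) so r0 can
; keep the base dst pointer across blocks.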
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    mov         r6d, dword [r1+r5*4]
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    mov         r6d, dword [r1+r5*4]
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx2, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    mov         r6d, dword [r1+r5*4]
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_sse2, 5, 7, 10
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
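; The DC-only path stays MMX even in the sse2 function: a DC splat touches at
; most 8 bytes per row, so the mmx2 DC_ADD macros are simply reused.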
h264_idct_add8_mmx_plane:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    mov         r0d, dword [r1+r5*4]
    ; r1m below is actually r0m of the calling function: entering this helper
    ; via CALL pushes a return address, which shifts x86inc's stack-argument
    ; offsets by one slot.
    mov          r0, r1m
    add          r0, dword [r1+r5*4]

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    call h264_idct_add8_mmx_plane
    call h264_idct_add8_mmx_plane
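; The add8 functions handle the two chroma planes: dest is a uint8_t ** here,
; and the plane helper runs once for U (blocks 16-19 in scan8 order) and once
; for V (blocks 20-23).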
h264_idct_add8_mmx2_plane:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    mov         r0d, dword [r1+r5*4]
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    add          r0, dword [r1+r5*4]
    DC_ADD_MMX2_INIT r2, r3, r6
    mov         r0d, dword [r1+r5*4]
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    add          r0, dword [r1+r5*4]
    DC_ADD_MMX2_OP movh, r0, r3, r6
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    call h264_idct_add8_mmx2_plane
    call h264_idct_add8_mmx2_plane
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]          ;  0 0 X D
    punpcklwd    m0, [r2+32]          ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    DC_ADD_MMX2_OP movq, r0, r3, r6
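; Two horizontally adjacent 4x4 chroma blocks are handled at once: D is the
; DC of the block at r2, d the DC of the next block at [r2+32], and the
; pack/shuffle sequence builds 8 packed +dc / -dc bytes per row for DC_ADD.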
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
x264_add8x4_idct_sse2:
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
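; This x264-derived helper transforms two 4x4 blocks side by side in the xmm
; registers (TRANSPOSE2x4x4W interleaves them), so a single call covers an
; 8x4 area of the destination.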
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    mov         r0d, dword [r1+%1*8]
    call x264_add8x4_idct_sse2
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
    ; unrolling of the loop leads to an average performance gain of
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
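; The second argument of each cycle is scan8[2*i]: a 16-bit load from the
; nnz cache tests both blocks of a pair at once, since scan8[2*i+1] is
; always scan8[2*i]+1 (e.g. 0xc = 4+1*8 = scan8[0], 0x14 = 4+2*8 = scan8[2]).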
%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    mov         r0d, dword [r1+%1*8]
    call x264_add8x4_idct_sse2
    mov         r0d, dword [r1+%1*8]
    call h264_idct_dc_add8_mmx2
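; Intra cycles choose per 8x4 pair: nonzero nnz bytes trigger the full iDCT
; helper, otherwise only the DC coefficients are tested and the pair goes
; through h264_idct_dc_add8_mmx2's fast path.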
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    mov         r0d, dword [r1+%1*8+64]
    add          r0, dword [r1+%1*8+64]
    call x264_add8x4_idct_sse2
    mov         r0d, dword [r1+%1*8+64]
    add          r0, dword [r1+%1*8+64]
    call h264_idct_dc_add8_mmx2
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
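; The cycle offsets are chroma scan8 values: 0x09 = 1+1*8 = scan8[16] and
; 0x11 = 1+2*8 = scan8[18] (U plane), 0x21 = 1+4*8 = scan8[20] and
; 0x29 = 1+5*8 = scan8[22] (V plane); the +64 in the macro skips to
; block_offset's chroma entries.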
; void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
%macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
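; Each SUMSUB_BADC is one level of butterflies; the two together form a
; 4-point Walsh-Hadamard transform, which is its own inverse up to scaling
; and undoes the Hadamard that H.264 applies to the 16 luma DC coefficients
; of an intra-16x16 macroblock.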
%macro STORE_WORDS_MMX 5
%macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS_MMX m0,  0,  1,  4,  5
    STORE_WORDS_MMX m1,  2,  3,  6,  7

    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS_MMX m2,  8,  9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
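; Reference (cf. ff_h264_luma_dc_dequant_idct_c): each transformed DC is
; scaled as (dc * qmul + 128) >> 8; the shift is passed in (%1) so an
; oversized qmul can be pre-shifted, see IDCT_DC_DEQUANT below. STORE_WORDS
; scatters result n back to the DC slot of 4x4 block n, i.e. output[16*n],
; with the index lists encoding the raster-to-block-scan mapping.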
%macro STORE_WORDS_SSE 9
%macro DEQUANT_STORE_SSE2 1
    STORE_WORDS_SSE xmm0, 0, 1,  4,  5,  2,  3,  6,  7
    STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3, 4, %2
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

; shift, tmp, output, qmul
%ifdef WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif
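; t0-t3 are renamed so the variable shift count always lands in ecx: rcx is
; r0 on win64, r3 on unix64 and r1 on x86-32 in x86inc's register order.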
%ifidn %1, mmx
    DEQUANT_STORE_MMX m6
%else
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro

INIT_MMX
IDCT_DC_DEQUANT mmx,  0
INIT_XMM
IDCT_DC_DEQUANT sse2, 7