1 ;*****************************************************************************
2 ;* sad-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
32 ;=============================================================================
34 ;=============================================================================
36 %macro SAD_INC_2x16P 0
67 punpckldq mm1, [r0+r1]
68 punpckldq mm2, [r2+r3]
75 ;-----------------------------------------------------------------------------
76 ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
77 ;-----------------------------------------------------------------------------
79 cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
98 ;=============================================================================
100 ;=============================================================================
102 %macro SAD_END_SSE2 0
110 ;-----------------------------------------------------------------------------
111 ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
112 ;-----------------------------------------------------------------------------
113 cglobal x264_pixel_sad_16x16_%1, 4,4
177 ;-----------------------------------------------------------------------------
178 ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
179 ;-----------------------------------------------------------------------------
180 cglobal x264_pixel_sad_16x8_%1, 4,4
222 ;=============================================================================
224 ;=============================================================================
226 %macro SAD_X3_START_1x8P 0
249 %macro SAD_X3_START_2x4P 3
254 punpckldq mm3, [r0+FENC_STRIDE]
255 punpckldq %1, [r1+r4]
256 punpckldq %2, [r2+r4]
257 punpckldq %3, [r3+r4]
263 %macro SAD_X3_2x16P 1
270 SAD_X3_1x8P FENC_STRIDE, r4
271 SAD_X3_1x8P FENC_STRIDE+8, r4+8
272 add r0, 2*FENC_STRIDE
284 SAD_X3_1x8P FENC_STRIDE, r4
285 add r0, 2*FENC_STRIDE
293 SAD_X3_START_2x4P mm0, mm1, mm2
295 SAD_X3_START_2x4P mm4, mm5, mm6
300 add r0, 2*FENC_STRIDE
306 %macro SAD_X4_START_1x8P 0
333 %macro SAD_X4_START_2x4P 0
339 punpckldq mm7, [r0+FENC_STRIDE]
340 punpckldq mm0, [r1+r5]
341 punpckldq mm1, [r2+r5]
342 punpckldq mm2, [r3+r5]
343 punpckldq mm3, [r4+r5]
350 %macro SAD_X4_INC_2x4P 0
354 punpckldq mm7, [r0+FENC_STRIDE]
355 punpckldq mm4, [r1+r5]
356 punpckldq mm5, [r2+r5]
363 punpckldq mm4, [r3+r5]
364 punpckldq mm5, [r4+r5]
371 %macro SAD_X4_2x16P 1
378 SAD_X4_1x8P FENC_STRIDE, r5
379 SAD_X4_1x8P FENC_STRIDE+8, r5+8
380 add r0, 2*FENC_STRIDE
393 SAD_X4_1x8P FENC_STRIDE, r5
394 add r0, 2*FENC_STRIDE
407 add r0, 2*FENC_STRIDE
437 ;-----------------------------------------------------------------------------
438 ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
439 ; uint8_t *pix2, int i_stride, int scores[3] )
440 ;-----------------------------------------------------------------------------
442 cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
467 ;=============================================================================
469 ;=============================================================================
471 %macro SAD_X3_START_1x16P_SSE2 0
481 %macro SAD_X3_1x16P_SSE2 2
494 %macro SAD_X3_2x16P_SSE2 1
496 SAD_X3_START_1x16P_SSE2
498 SAD_X3_1x16P_SSE2 0, 0
500 SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
501 add r0, 2*FENC_STRIDE
507 %macro SAD_X4_START_1x16P_SSE2 0
519 %macro SAD_X4_1x16P_SSE2 2
547 %macro SAD_X4_2x16P_SSE2 1
549 SAD_X4_START_1x16P_SSE2
551 SAD_X4_1x16P_SSE2 0, 0
553 SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
554 add r0, 2*FENC_STRIDE
561 %macro SAD_X3_END_SSE2 0
581 %macro SAD_X4_END_SSE2 0
596 ;-----------------------------------------------------------------------------
597 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
598 ; uint8_t *pix2, int i_stride, int scores[3] )
599 ;-----------------------------------------------------------------------------
601 cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
; Instantiate the SSE2/SSE3 multi-reference SAD functions.
; SAD_X_SSE2 args: <x-count (3 or 4 candidate blocks)>, <width>, <height>,
; <cpu-suffix used in the exported symbol name> (per the cglobal line in the
; macro definition above).
; NOTE(review): lines between the sse2 and sse3 groups are elided in this
; excerpt -- presumably %define remaps for lddqu; confirm before editing.
609 SAD_X_SSE2 3, 16, 16, sse2
610 SAD_X_SSE2 3, 16, 8, sse2
611 SAD_X_SSE2 4, 16, 16, sse2
612 SAD_X_SSE2 4, 16, 8, sse2
615 SAD_X_SSE2 3, 16, 16, sse3
616 SAD_X_SSE2 3, 16, 8, sse3
617 SAD_X_SSE2 4, 16, 16, sse3
618 SAD_X_SSE2 4, 16, 8, sse3
623 ;=============================================================================
624 ; SAD cacheline split
625 ;=============================================================================
627 ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
628 ; unless the unaligned data spans the border between 2 cachelines, in which
629 ; case it's really slow. The exact numbers may differ, but all Intel cpus
630 ; have a large penalty for cacheline splits.
631 ; (8-byte alignment exactly half way between two cachelines is ok though.)
632 ; LDDQU was supposed to fix this, but it only works on Pentium 4.
633 ; So in the split case we load aligned data and explicitly perform the
634 ; alignment between registers. Like on archs that have only aligned loads,
635 ; except complicated by the fact that PALIGNR takes only an immediate, not
636 ; a variable alignment.
637 ; It is also possible to hoist the realignment to the macroblock level (keep
638 ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
639 ; needed for that method makes it often slower.
641 ; sad 16x16 costs on Core2:
642 ; good offsets: 49 cycles (50/64 of all mvs)
643 ; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
644 ; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
645 ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
647 ; computed jump assumes this loop is exactly 80 bytes
648 %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
650 sad_w16_align%1_sse2:
652 movdqa xmm2, [r2+r3+16]
668 jg sad_w16_align%1_sse2
672 ; computed jump assumes this loop is exactly 64 bytes
673 %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
675 sad_w16_align%1_ssse3:
677 movdqa xmm2, [r2+r3+16]
678 palignr xmm1, [r2], %1
679 palignr xmm2, [r2+r3], %1
687 jg sad_w16_align%1_ssse3
691 %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
692 cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
696 jle x264_pixel_sad_16x%2_sse2
701 shl r4d, 6 ; code size = 64
704 shl r4d, 4 ; code size = 80
706 %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
708 lea r5, [sad_w16_addr GLOBAL]
712 lea r5, [sad_w16_addr + r4 GLOBAL]
724 %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
726 and eax, 0x17|%1|(%4>>1)
727 cmp eax, 0x10|%1|(%4>>1)
728 jle x264_pixel_sad_%1x%2_mmxext
732 ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx
738 movd mm6, [sw_64 GLOBAL]
748 %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
749 cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
750 SAD_CACHELINE_START_MMX2 16, %1, %1, %2
774 %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
775 cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
776 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
800 ; sad_x3/x4_cache64: check each mv.
801 ; if they're all within a cacheline, use normal sad_x3/x4.
802 ; otherwise, send them individually to sad_cache64.
803 %macro CHECK_SPLIT 3 ; pix, width, cacheline
805 and eax, 0x17|%2|(%3>>1)
806 cmp eax, 0x10|%2|(%3>>1)
810 %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
811 cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
812 CHECK_SPLIT r1m, %1, %3
813 CHECK_SPLIT r2m, %1, %3
814 CHECK_SPLIT r3m, %1, %3
815 jmp x264_pixel_sad_x3_%1x%2_%4
825 call x264_pixel_sad_%1x%2_cache%3_%5
829 call x264_pixel_sad_%1x%2_cache%3_%5
833 call x264_pixel_sad_%1x%2_cache%3_%5
842 call x264_pixel_sad_%1x%2_cache%3_%5
846 call x264_pixel_sad_%1x%2_cache%3_%5
850 call x264_pixel_sad_%1x%2_cache%3_%5
858 %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
859 cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
860 CHECK_SPLIT r1m, %1, %3
861 CHECK_SPLIT r2m, %1, %3
862 CHECK_SPLIT r3m, %1, %3
863 CHECK_SPLIT r4m, %1, %3
864 jmp x264_pixel_sad_x4_%1x%2_%4
875 call x264_pixel_sad_%1x%2_cache%3_%5
879 call x264_pixel_sad_%1x%2_cache%3_%5
883 call x264_pixel_sad_%1x%2_cache%3_%5
887 call x264_pixel_sad_%1x%2_cache%3_%5
896 call x264_pixel_sad_%1x%2_cache%3_%5
900 call x264_pixel_sad_%1x%2_cache%3_%5
904 call x264_pixel_sad_%1x%2_cache%3_%5
908 call x264_pixel_sad_%1x%2_cache%3_%5
916 %macro SADX34_CACHELINE_FUNC 5
917 SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
918 SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
; Instantiate the cacheline-split-aware single-block SAD functions (mmxext).
; SAD16_CACHELINE_FUNC_MMX2 / SAD8_CACHELINE_FUNC_MMX2 args: height, cacheline
; (see the macro definitions above).
922 ; instantiate the aligned sads
925 SAD16_CACHELINE_FUNC_MMX2 8, 32
926 SAD16_CACHELINE_FUNC_MMX2 16, 32
927 SAD8_CACHELINE_FUNC_MMX2 4, 32
928 SAD8_CACHELINE_FUNC_MMX2 8, 32
929 SAD8_CACHELINE_FUNC_MMX2 16, 32
930 SAD16_CACHELINE_FUNC_MMX2 8, 64
931 SAD16_CACHELINE_FUNC_MMX2 16, 64
; NOTE(review): the %if matching this %endif (presumably %ifndef ARCH_X86_64,
; per its trailing comment) is not visible in this excerpt -- confirm before
; moving or reordering any of these instantiations.
932 %endif ; !ARCH_X86_64
933 SAD8_CACHELINE_FUNC_MMX2 4, 64
934 SAD8_CACHELINE_FUNC_MMX2 8, 64
935 SAD8_CACHELINE_FUNC_MMX2 16, 64
; Instantiate both x3 and x4 cacheline-split dispatchers (SADX34_CACHELINE_FUNC
; expands SADX3_CACHELINE_FUNC + SADX4_CACHELINE_FUNC).
; Args: width, height, cacheline, normal_ver, split_ver (per the macro
; definitions above).
938 SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
939 SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
940 SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
941 SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
942 SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
943 SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
; NOTE(review): matching %if for this %endif is outside this excerpt
; (presumably %ifndef ARCH_X86_64) -- confirm before editing.
944 %endif ; !ARCH_X86_64
945 SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
946 SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
; Instantiate the SSE2/SSSE3 cacheline-split SAD functions.
; SAD16_CACHELINE_FUNC args: cpu, height (per the macro definition above).
; NOTE(review): the "i" passed to SAD16_CACHELINE_LOOP_* below is presumably an
; %assign loop counter over the 16 possible alignments; the %assign/%rep driver
; lines are elided from this excerpt -- confirm before editing.
949 SAD16_CACHELINE_FUNC sse2, 8
950 SAD16_CACHELINE_FUNC sse2, 16
953 SAD16_CACHELINE_LOOP_SSE2 i
956 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
957 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
; NOTE(review): matching %if for this %endif is outside this excerpt -- confirm.
958 %endif ; !ARCH_X86_64
960 SAD16_CACHELINE_FUNC ssse3, 8
961 SAD16_CACHELINE_FUNC ssse3, 16
964 SAD16_CACHELINE_LOOP_SSSE3 i
; ssse3 split versions fall back to the sse2 "normal" path when all mvs are
; within a cacheline (normal_ver=sse2, split_ver=ssse3).
967 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
968 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3