1 ;*****************************************************************************
2 ;* sad-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Alex Izvorski <aizvorksi@gmail.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 ;*****************************************************************************
32 ;=============================================================================
34 ;=============================================================================
36 %macro SAD_INC_2x16P 0
67 punpckldq mm1, [r0+r1]
68 punpckldq mm2, [r2+r3]
75 ;-----------------------------------------------------------------------------
76 ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
77 ;-----------------------------------------------------------------------------
79 cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
98 ;=============================================================================
100 ;=============================================================================
102 %macro SAD_END_SSE2 0
110 ;-----------------------------------------------------------------------------
111 ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
112 ;-----------------------------------------------------------------------------
113 cglobal x264_pixel_sad_16x16_%1, 4,4
177 ;-----------------------------------------------------------------------------
178 ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
179 ;-----------------------------------------------------------------------------
180 cglobal x264_pixel_sad_16x8_%1, 4,4
224 ;=============================================================================
226 ;=============================================================================
228 %macro SAD_X3_START_1x8P 0
251 %macro SAD_X3_START_2x4P 3
256 punpckldq mm3, [r0+FENC_STRIDE]
257 punpckldq %1, [r1+r4]
258 punpckldq %2, [r2+r4]
259 punpckldq %3, [r3+r4]
265 %macro SAD_X3_2x16P 1
272 SAD_X3_1x8P FENC_STRIDE, r4
273 SAD_X3_1x8P FENC_STRIDE+8, r4+8
274 add r0, 2*FENC_STRIDE
286 SAD_X3_1x8P FENC_STRIDE, r4
287 add r0, 2*FENC_STRIDE
295 SAD_X3_START_2x4P mm0, mm1, mm2
297 SAD_X3_START_2x4P mm4, mm5, mm6
302 add r0, 2*FENC_STRIDE
308 %macro SAD_X4_START_1x8P 0
335 %macro SAD_X4_START_2x4P 0
341 punpckldq mm7, [r0+FENC_STRIDE]
342 punpckldq mm0, [r1+r5]
343 punpckldq mm1, [r2+r5]
344 punpckldq mm2, [r3+r5]
345 punpckldq mm3, [r4+r5]
352 %macro SAD_X4_INC_2x4P 0
356 punpckldq mm7, [r0+FENC_STRIDE]
357 punpckldq mm4, [r1+r5]
358 punpckldq mm5, [r2+r5]
365 punpckldq mm4, [r3+r5]
366 punpckldq mm5, [r4+r5]
373 %macro SAD_X4_2x16P 1
380 SAD_X4_1x8P FENC_STRIDE, r5
381 SAD_X4_1x8P FENC_STRIDE+8, r5+8
382 add r0, 2*FENC_STRIDE
395 SAD_X4_1x8P FENC_STRIDE, r5
396 add r0, 2*FENC_STRIDE
409 add r0, 2*FENC_STRIDE
439 ;-----------------------------------------------------------------------------
440 ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
441 ; uint8_t *pix2, int i_stride, int scores[3] )
442 ;-----------------------------------------------------------------------------
444 cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
469 ;=============================================================================
471 ;=============================================================================
473 %macro SAD_X3_START_1x16P_SSE2 0
483 %macro SAD_X3_1x16P_SSE2 2
496 %macro SAD_X3_2x16P_SSE2 1
498 SAD_X3_START_1x16P_SSE2
500 SAD_X3_1x16P_SSE2 0, 0
502 SAD_X3_1x16P_SSE2 FENC_STRIDE, r4
503 add r0, 2*FENC_STRIDE
509 %macro SAD_X4_START_1x16P_SSE2 0
521 %macro SAD_X4_1x16P_SSE2 2
549 %macro SAD_X4_2x16P_SSE2 1
551 SAD_X4_START_1x16P_SSE2
553 SAD_X4_1x16P_SSE2 0, 0
555 SAD_X4_1x16P_SSE2 FENC_STRIDE, r5
556 add r0, 2*FENC_STRIDE
563 %macro SAD_X3_END_SSE2 0
583 %macro SAD_X4_END_SSE2 0
598 ;-----------------------------------------------------------------------------
599 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
600 ; uint8_t *pix2, int i_stride, int scores[3] )
601 ;-----------------------------------------------------------------------------
603 cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
611 SAD_X_SSE2 3, 16, 16, sse2
612 SAD_X_SSE2 3, 16, 8, sse2
613 SAD_X_SSE2 4, 16, 16, sse2
614 SAD_X_SSE2 4, 16, 8, sse2
618 SAD_X_SSE2 3, 16, 16, sse3
619 SAD_X_SSE2 3, 16, 8, sse3
620 SAD_X_SSE2 4, 16, 16, sse3
621 SAD_X_SSE2 4, 16, 8, sse3
627 ;=============================================================================
628 ; SAD cacheline split
629 ;=============================================================================
631 ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
632 ; unless the unaligned data spans the border between 2 cachelines, in which
633 ; case it's really slow. The exact numbers may differ, but all Intel cpus
634 ; have a large penalty for cacheline splits.
635 ; (8-byte alignment exactly halfway between two cachelines is ok though.)
636 ; LDDQU was supposed to fix this, but it only works on Pentium 4.
637 ; So in the split case we load aligned data and explicitly perform the
638 ; alignment between registers. Like on archs that have only aligned loads,
639 ; except complicated by the fact that PALIGNR takes only an immediate, not
640 ; a variable alignment.
641 ; It is also possible to hoist the realignment to the macroblock level (keep
642 ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
643 ; needed for that method makes it often slower.
645 ; sad 16x16 costs on Core2:
646 ; good offsets: 49 cycles (50/64 of all mvs)
647 ; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
648 ; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
649 ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
651 ; computed jump assumes this loop is exactly 80 bytes
652 %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
654 sad_w16_align%1_sse2:
656 movdqa xmm2, [r2+r3+16]
672 jg sad_w16_align%1_sse2
676 ; computed jump assumes this loop is exactly 64 bytes
677 %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
679 sad_w16_align%1_ssse3:
681 movdqa xmm2, [r2+r3+16]
682 palignr xmm1, [r2], %1
683 palignr xmm2, [r2+r3], %1
691 jg sad_w16_align%1_ssse3
695 %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
696 cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
700 jle x264_pixel_sad_16x%2_sse2
705 shl r4d, 6 ; code size = 64
708 shl r4d, 4 ; code size = 80
710 %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
712 lea r5, [sad_w16_addr GLOBAL]
716 lea r5, [sad_w16_addr + r4 GLOBAL]
728 %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
730 and eax, 0x17|%1|(%4>>1)
731 cmp eax, 0x10|%1|(%4>>1)
732 jle x264_pixel_sad_%1x%2_mmxext
736 ; both versions work, but picgetgot is slower than gpr->mmx, which is slower than mem->mmx
742 movd mm6, [sw_64 GLOBAL]
752 %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
753 cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
754 SAD_CACHELINE_START_MMX2 16, %1, %1, %2
778 %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
779 cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
780 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
804 ; sad_x3/x4_cache64: check each mv.
805 ; if they're all within a cacheline, use normal sad_x3/x4.
806 ; otherwise, send them individually to sad_cache64.
807 %macro CHECK_SPLIT 3 ; pix, width, cacheline
809 and eax, 0x17|%2|(%3>>1)
810 cmp eax, 0x10|%2|(%3>>1)
814 %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
815 cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
816 CHECK_SPLIT r1m, %1, %3
817 CHECK_SPLIT r2m, %1, %3
818 CHECK_SPLIT r3m, %1, %3
819 jmp x264_pixel_sad_x3_%1x%2_%4
829 call x264_pixel_sad_%1x%2_cache%3_%5
833 call x264_pixel_sad_%1x%2_cache%3_%5
837 call x264_pixel_sad_%1x%2_cache%3_%5
846 call x264_pixel_sad_%1x%2_cache%3_%5
850 call x264_pixel_sad_%1x%2_cache%3_%5
854 call x264_pixel_sad_%1x%2_cache%3_%5
862 %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
863 cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
864 CHECK_SPLIT r1m, %1, %3
865 CHECK_SPLIT r2m, %1, %3
866 CHECK_SPLIT r3m, %1, %3
867 CHECK_SPLIT r4m, %1, %3
868 jmp x264_pixel_sad_x4_%1x%2_%4
879 call x264_pixel_sad_%1x%2_cache%3_%5
883 call x264_pixel_sad_%1x%2_cache%3_%5
887 call x264_pixel_sad_%1x%2_cache%3_%5
891 call x264_pixel_sad_%1x%2_cache%3_%5
900 call x264_pixel_sad_%1x%2_cache%3_%5
904 call x264_pixel_sad_%1x%2_cache%3_%5
908 call x264_pixel_sad_%1x%2_cache%3_%5
912 call x264_pixel_sad_%1x%2_cache%3_%5
920 %macro SADX34_CACHELINE_FUNC 5
921 SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
922 SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
926 ; instantiate the aligned sads
929 SAD16_CACHELINE_FUNC_MMX2 8, 32
930 SAD16_CACHELINE_FUNC_MMX2 16, 32
931 SAD8_CACHELINE_FUNC_MMX2 4, 32
932 SAD8_CACHELINE_FUNC_MMX2 8, 32
933 SAD8_CACHELINE_FUNC_MMX2 16, 32
934 SAD16_CACHELINE_FUNC_MMX2 8, 64
935 SAD16_CACHELINE_FUNC_MMX2 16, 64
936 %endif ; !ARCH_X86_64
937 SAD8_CACHELINE_FUNC_MMX2 4, 64
938 SAD8_CACHELINE_FUNC_MMX2 8, 64
939 SAD8_CACHELINE_FUNC_MMX2 16, 64
942 SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
943 SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
944 SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
945 SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
946 SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
947 SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
948 %endif ; !ARCH_X86_64
949 SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
950 SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
953 SAD16_CACHELINE_FUNC sse2, 8
954 SAD16_CACHELINE_FUNC sse2, 16
957 SAD16_CACHELINE_LOOP_SSE2 i
960 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
961 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
962 %endif ; !ARCH_X86_64
965 SAD16_CACHELINE_FUNC ssse3, 8
966 SAD16_CACHELINE_FUNC ssse3, 16
969 SAD16_CACHELINE_LOOP_SSSE3 i
972 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
973 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3