;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
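; (Sanity check on the two constants above: 0.01^2 * 255^2 * 64 = 416.16,
;  truncated to 416, and 0.03^2 * 255^2 * 64 * 63 = 235962.7, i.e. 235963 --
;  the standard SSIM c1/c2, pre-scaled to match the integer sums consumed by
;  the ssim_end code further down in this file.)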
mask_ff: times 16 db 0xff

%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
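; (pmaddwd against a vector of 1s is the usual horizontal-add idiom: e.g.
;  words [w0 w1 w2 w3] * [1 1 1 1] -> dwords [w0+w1, w2+w3], which is how
;  these reduction helpers fold 16-bit lanes into 32-bit sums.)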
;=============================================================================
; SSD
;=============================================================================
%macro SSD_INC_1x16P 0
%macro SSD_INC_2x16P 0
%macro SSD_INC_2x4P 0

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
    pxor mm0, mm0 ; mm0 holds the sum
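; A rough C equivalent of what the SSD primitives in this section compute
; (an illustrative sketch with an assumed helper name, not code from x264):
;     static int ssd_wxh( uint8_t *pix1, int stride1,
;                         uint8_t *pix2, int stride2, int w, int h )
;     {
;         int ssd = 0;
;         for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < w; x++ )
;             {
;                 int d = pix1[x] - pix2[x];
;                 ssd += d * d;
;             }
;         return ssd;
;     }
; The asm unrolls this per block size and keeps the running sum in a register
; (mm0 in the MMX version, per the comment above).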
%macro SSD_INC_2x16P_SSE2 0
%macro SSD_INC_2x8P_SSE2 0

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
;=============================================================================
; SATD
;=============================================================================
%macro LOAD_DIFF_4P 4 ; dst, tmp, [pix1], [pix2]
%macro LOAD_DIFF_8P 4 ; dst, tmp, [pix1], [pix2]

%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [r0],      [r2]
    LOAD_DIFF_8P %2, %6, [r0+r1],   [r2+r3]
    LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3]
    LOAD_DIFF_8P %4, %6, [r0+r4],   [r2+r5]

; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
; whereas phaddw-based transform doesn't care what order the coefs end up in.
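; (Illustrative: the final SATD step is just
;      for( i = 0; i < 16; i++ ) sum += abs( coef[i] );
;  which yields the same total for any permutation of coef[], so the
;  phaddw-ordered transform needs no reordering fixup.)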
%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc

%macro HADAMARD4_1D 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4

%macro HADAMARD8_1D 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
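; An illustrative scalar model of the 1-D 8-point transform above, given
; int16_t d[8] holding one row/column (an unnormalized Walsh-Hadamard; the
; output ordering may differ from the asm, which is irrelevant once absolute
; values are summed):
;     for( int len = 1; len < 8; len <<= 1 )
;         for( int i = 0; i < 8; i += 2*len )
;             for( int j = i; j < i+len; j++ )
;             {
;                 int a = d[j], b = d[j+len];
;                 d[j]     = a + b;
;                 d[j+len] = a - b;
;             }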
%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers

%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
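; (Illustrative data flow for the 4x4 word transpose, with rows a..d and one
;  letter per 16-bit element:
;      punpcklwd a,b -> a0 b0 a1 b1        punpckhwd a,b -> a2 b2 a3 b3
;      punpckldq of (a0 b0 a1 b1),(c0 d0 c1 d1) -> a0 b0 c0 d0   i.e. column 0
;  so two interleave levels produce the transposed rows, left in the permuted
;  register order noted in the macro comments.)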
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3

%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3

%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2

%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb
    SBUTTERFLY dqa, wd,  %1, %2, %8
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %8, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %8, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%macro ABS1_MMX 2 ; a, tmp
%macro ABS2_MMX 4 ; a, b, tmp0, tmp1

%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX

%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
    HADAMARD4_1D  mm4, mm5, mm6, mm7
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
    HADAMARD4_1D  mm4, mm7, %1, mm6
    ABS2          mm4, mm7, mm3, mm5
    ABS2          %1,  mm6, mm3, mm5
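; An illustrative C model of the 4x4 block sum computed above (hypothetical
; helper, not from the original source; ignores whatever final scaling the
; callers apply to the accumulated total):
;     static int hadamard4x4_sum( int16_t d[4][4] ) // d = pix1 - pix2 diffs
;     {
;         int tmp[4][4], sum = 0;
;         for( int i = 0; i < 4; i++ )              // 1-D transform on rows
;         {
;             int s01 = d[i][0]+d[i][1], d01 = d[i][0]-d[i][1];
;             int s23 = d[i][2]+d[i][3], d23 = d[i][2]-d[i][3];
;             tmp[i][0] = s01+s23;  tmp[i][2] = s01-s23;
;             tmp[i][1] = d01+d23;  tmp[i][3] = d01-d23;
;         }
;         for( int i = 0; i < 4; i++ )              // columns, then abs-sum
;         {
;             int s01 = tmp[0][i]+tmp[1][i], d01 = tmp[0][i]-tmp[1][i];
;             int s23 = tmp[2][i]+tmp[3][i], d23 = tmp[2][i]-tmp[3][i];
;             sum += abs(s01+s23) + abs(s01-s23) + abs(d01+d23) + abs(d01-d23);
;         }
;         return sum;
;     }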
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2

%macro SATD_4x4_MMX 3
    LOAD_DIFF_4P mm4, mm3, [r0+%2],      [r2+%2]
    LOAD_DIFF_4P mm5, mm3, [r0+r1+%2],   [r2+r3+%2]
    LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF_4P mm7, mm3, [r0+r4+%2],   [r2+r5+%2]

%macro SATD_8x4_START 1
    SATD_4x4_MMX mm0, 0, 0
    SATD_4x4_MMX mm1, 4, %1

%macro SATD_8x4_INC 1
    SATD_4x4_MMX mm2, 0, 0
    SATD_4x4_MMX mm1, 4, %1

%macro SATD_16x4_START 1
    SATD_4x4_MMX mm0,  0, 0
    SATD_4x4_MMX mm1,  4, 0
    SATD_4x4_MMX mm2,  8, 0
    SATD_4x4_MMX mm1, 12, %1

%macro SATD_16x4_INC 1
    SATD_4x4_MMX mm2,  0, 0
    SATD_4x4_MMX mm1,  4, 0
    SATD_4x4_MMX mm2,  8, 0
    SATD_4x4_MMX mm1, 12, %1

%macro SATD_8x4_SSE2 1
    LOAD_DIFF_8x4P  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    HADAMARD4_1D    xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD4_1D    xmm0, xmm1, xmm2, xmm3
    ABS4            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

%macro SATD_8x4_PHADD 1
    LOAD_DIFF_8x4P      xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    HADAMARD4_1D        xmm0, xmm1, xmm2, xmm3
    HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
    ABS4                xmm0, xmm3, xmm4, xmm2, xmm1, xmm5

%macro SATD_START_MMX 0
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2

%macro SATD_END_MMX 0
    pshufw mm1, mm0, 01001110b
    pshufw mm1, mm0, 10110001b

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_mmxext, 4,6
    pshufw mm1, mm0, 01001110b
    pshufw mm1, mm0, 01001110b

cglobal x264_pixel_satd_16x8_mmxext, 4,6
cglobal x264_pixel_satd_8x16_mmxext, 4,6
cglobal x264_pixel_satd_8x8_mmxext, 4,6
cglobal x264_pixel_satd_8x4_mmxext, 4,6

cglobal x264_pixel_satd_4x8_%1, 4,6
    SATD_4x4_MMX mm0, 0, 1
    SATD_4x4_MMX mm1, 0, 0

cglobal x264_pixel_satd_4x4_%1, 4,6
    SATD_4x4_MMX mm0, 0, 0

%macro SATD_START_SSE2 0
%macro SATD_END_SSE2 0
%macro BACKUP_POINTERS 0
%macro RESTORE_AND_INC_POINTERS 0

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1, 4,6
    RESTORE_AND_INC_POINTERS

cglobal x264_pixel_satd_16x8_%1, 4,6
    RESTORE_AND_INC_POINTERS

cglobal x264_pixel_satd_8x16_%1, 4,6
cglobal x264_pixel_satd_8x8_%1, 4,6
cglobal x264_pixel_satd_8x4_%1, 4,6
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sa8d_8x8_%1
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
    LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
    HADAMARD8_1D   xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8W  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD8_1D   xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
    ABS4           xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
    ABS4           xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
    add r10d, eax ; preserve rounding for 16x16

cglobal x264_pixel_sa8d_16x16_%1
    call x264_pixel_sa8d_8x8_%1 ; pix[0]
    call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride]
    neg r4 ; it's already r1*3
    call x264_pixel_sa8d_8x8_%1 ; pix[8]
    call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride+8]

cglobal x264_pixel_sa8d_8x8_%1, 4,7
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
    LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
    HADAMARD8_1D   xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8W  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
    HADAMARD8_1D   xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
    movdqa [esp+16], xmm7
    ABS2 xmm2, xmm3, xmm6, xmm7
    ABS2 xmm0, xmm1, xmm6, xmm7
    movdqa xmm7, [esp+16]
    ABS2 xmm4, xmm5, xmm2, xmm3
    ABS2 xmm6, xmm7, xmm2, xmm3
    mov ecx, eax ; preserve rounding for 16x16
%endmacro ; SATDS_SSE2

%macro SA8D_16x16_32 1
cglobal x264_pixel_sa8d_16x16_%1
    push dword [esp+20] ; stride2
    push dword [esp+20] ; pix2
    push dword [esp+20] ; stride1
    push dword [esp+20] ; pix1
    call x264_pixel_sa8d_8x8_%1
    add dword [esp+0], 8 ; pix1+8
    add dword [esp+8], 8 ; pix2+8
    call x264_pixel_sa8d_8x8_%1
    add [esp+0], eax ; pix1+8*stride1+8
    add [esp+8], edx ; pix2+8*stride2+8
    call x264_pixel_sa8d_8x8_%1
    sub dword [esp+0], 8 ; pix1+8*stride1
    sub dword [esp+8], 8 ; pix2+8*stride2
    call x264_pixel_sa8d_8x8_%1
%endif ; !ARCH_X86_64
%endmacro ; SA8D_16x16_32
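; A note on the "preserve rounding for 16x16" comments above: an assumed
; model (consistent with x264's C reference as far as I recall it, so treat
; as illustrative) is that each 8x8 call also hands back its raw transform
; sum and only the outermost caller rounds:
;     sa8d_8x8   = (sum8 + 2) >> 2
;     sa8d_16x16 = (sum8[0] + sum8[1] + sum8[2] + sum8[3] + 2) >> 2
; i.e. the four sub-block sums are accumulated before a single final rounding
; instead of being rounded individually.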
;=============================================================================
; INTRA SATD
;=============================================================================
%macro INTRA_SA8D_SSE2 1
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1
    movq xmm0, [r0+0*FENC_STRIDE]
    movq xmm7, [r0+1*FENC_STRIDE]
    movq xmm6, [r0+2*FENC_STRIDE]
    movq xmm3, [r0+3*FENC_STRIDE]
    movq xmm5, [r0+4*FENC_STRIDE]
    movq xmm1, [r0+5*FENC_STRIDE]
    movq xmm8, [r0+6*FENC_STRIDE]
    movq xmm2, [r0+7*FENC_STRIDE]
    HADAMARD8_1D  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD8_1D  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movzx edi, word [r1+0]
    ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
    ABS2 xmm10, xmm11, xmm13, xmm14
    movdqa xmm14, xmm15 ; 7x8 sum
    movdqa xmm8, [r1+0] ; left edge
    ABS1 xmm9, xmm11 ; 1x8 sum
    punpcklqdq xmm0, xmm4 ; transpose
    movdqa xmm1, [r1+16] ; top edge
    psrldq xmm2, 2 ; 8x7 sum
    psubw xmm0, xmm1 ; 8x1 sum
    movdqa xmm7, [pw_1 GLOBAL]
    punpckldq xmm2, xmm14
    punpckhdq xmm3, xmm14
    pshufd xmm5, xmm15, 0xf5
    punpcklqdq xmm2, xmm5
    punpckhqdq xmm3, xmm5
    movq [r2], xmm3 ; i8x8_v, i8x8_h
    movd [r2+8], xmm3 ; i8x8_dc
%endmacro ; INTRA_SA8D_SSE2
; out: mm0..mm3 = hadamard coefs
    movd mm0, [r0+0*FENC_STRIDE]
    movd mm4, [r0+1*FENC_STRIDE]
    movd mm3, [r0+2*FENC_STRIDE]
    movd mm1, [r0+3*FENC_STRIDE]
    HADAMARD4_1D  mm0, mm4, mm3, mm1
    TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
    HADAMARD4_1D  mm0, mm1, mm2, mm3
%macro SCALAR_SUMSUB 4

%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
    shl %1d, 5 ; log2(FDEC_STRIDE)
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [left_1d+2*%1+0], %2w
    mov [left_1d+2*%1+2], %3w
    mov [left_1d+2*%1+4], %4w
    mov [left_1d+2*%1+6], %5w
%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [top_1d+2*%1+0], %2w
    mov [top_1d+2*%1+2], %3w
    mov [top_1d+2*%1+4], %4w
    mov [top_1d+2*%1+6], %5w

%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b

    mov qword [sums+0],  0
    mov qword [sums+8],  0
    mov qword [sums+16], 0
    ABS2 mm4, mm5, mm6, mm7

; in:  mm0..mm3 (4x4), mm7 (3x4)
; out: mm0 v, mm4 h, mm5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq mm0, mm2 ; transpose
    ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
    ABS1 mm0, mm1 ; 4x1 sum

%macro INTRA_SATDS_MMX 1
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_4x4_%1, 2,6
; stack is 16-byte aligned because the ABI says so
%define top_1d  rsp-8  ; size 8
%define left_1d rsp-16 ; size 8
; stack is 16-byte aligned at least with gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d esp+8
    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    SCALAR_HADAMARD_TOP  0, r0, r3, r4, r5
    lea t0d, [t0d + r0d + 4]
    SUM4x3 t0d, [left_1d], [top_1d]
    psrlq mm1, 16 ; 4x3 sum
    SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
    movd [r2+0], mm0 ; i4x4_v satd
    movd [r2+4], mm4 ; i4x4_h satd
    movd [r2+8], mm5 ; i4x4_dc satd

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%assign stack_pad 88
%assign stack_pad 88 + ((stack_offset+88+4)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+64 ; size 24
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    SCALAR_HADAMARD_TOP  t0, r3, r4, r5, r6
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    paddw mm0, [sums+0]  ; i16x16_v satd
    paddw mm4, [sums+8]  ; i16x16_h satd
    paddw mm5, [sums+16] ; i16x16_dc satd
    add r0, 4*FENC_STRIDE-16
    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    movd [r2+8], mm2 ; i16x16_dc satd
    movd [r2+4], mm1 ; i16x16_h satd
    movd [r2+0], mm0 ; i16x16_v satd

;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums    rsp+48 ; size 24
%define dc_1d   rsp+32 ; size 16
%define top_1d  rsp+16 ; size 16
%define left_1d rsp    ; size 16
    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP  t0, t2, r3, r4, r5
    movzx t2d, word [left_1d+0]
    movzx r3d, word [top_1d+0]
    movzx r4d, word [left_1d+8]
    movzx r5d, word [top_1d+8]
    mov [dc_1d+ 0], t2d ; tl
    mov [dc_1d+ 4], r5d ; tr
    mov [dc_1d+ 8], r4d ; bl
    mov [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    paddw mm0, [sums+16] ; i4x4_v satd
    paddw mm4, [sums+8]  ; i4x4_h satd
    paddw mm5, [sums+0]  ; i4x4_dc satd
    add r0, 4*FENC_STRIDE-8
    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    movd [r2+0], mm0 ; i8x8c_dc satd
    movd [r2+4], mm1 ; i8x8c_h satd
    movd [r2+8], mm2 ; i8x8c_v satd

cextern x264_pixel_sa8d_8x8_mmxext
SA8D_16x16_32 mmxext

%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
%define SATD_8x4_SSE2 SATD_8x4_PHADD
SATDS_SSE2 ssse3_phadd
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
    punpcklbw xmm5, xmm0
    punpcklbw xmm6, xmm0
    movdqa xmm7, [pw_1 GLOBAL]
    pshufd xmm5, xmm3, 0xb1
    pshufd xmm6, xmm4, 0xb1
    pshufd xmm1, xmm1, 0xd8
    punpckldq xmm3, xmm4
    punpckhdq xmm5, xmm4
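; An illustrative C model of the per-window sums this core accumulates (the
; function handles two horizontally adjacent 4x4 windows per call; the exact
; layout of sums[2][4] is assumed here, not taken from the elided code):
;     int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;     for( int y = 0; y < 4; y++ )
;         for( int x = 0; x < 4; x++ )
;         {
;             int a = pix1[x + y*stride1];
;             int b = pix2[x + y*stride2];
;             s1  += a;          s2  += b;
;             ss  += a*a + b*b;  s12 += a*b;
;         }
;     // sums[z] = { s1, s2, ss, s12 } for window z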
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_end4_sse2, 3,3
    movdqa xmm0, [r0+ 0]
    movdqa xmm1, [r0+16]
    movdqa xmm2, [r0+32]
    movdqa xmm3, [r0+48]
    movdqa xmm4, [r0+64]
    movdqa xmm5, [ssim_c1 GLOBAL]
    movdqa xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4

; s1=mm0, s2=mm3, ss=mm4, s12=mm2
    pmaddwd xmm1, xmm0 ; s1*s2
    pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
    psubd xmm2, xmm1 ; covar*2
    psubd xmm4, xmm0 ; vars
    cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
    divps xmm1, xmm0 ; ssim
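; The register comments above follow the usual per-block SSIM formula (with
; ssim_c1/ssim_c2 the pre-scaled constants from the data section).  Roughly,
; and as an inference from these comments rather than a restatement of the
; elided code:
;     vars  = 64*ss  - s1*s1 - s2*s2
;     covar = 64*s12 - s1*s2
;     ssim  = (2*s1*s2 + c1) * (2*covar + c2)
;           / ((s1*s1 + s2*s2 + c1) * (vars + c2))
; which is why c1/c2 carry the extra *64 (and *63) scaling factors.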
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea r3, [mask_ff + 16 GLOBAL]
    movdqu xmm3, [r3 + r2*4]
    movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
    pshuflw xmm1, xmm0, 0xE

;=============================================================================
; Successive Elimination ADS
;=============================================================================
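; A rough C equivalent of the ads4 kernel implemented below (an illustrative
; sketch following x264's C reference as far as I recall it; the sums[]
; offsets in particular should be treated as assumptions).  The asm versions
; split this work into a vectorized thresholding pass that writes a mask
; buffer, plus x264_pixel_ads_mvs (see the pseudocode further down) to
; compact the surviving candidates:
;     static int ads4( int enc_dc[4], uint16_t *sums, int delta,
;                      uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;     {
;         int nmv = 0;
;         for( int i = 0; i < width; i++, sums++ )
;         {
;             int ads = abs( enc_dc[0] - sums[0] )
;                     + abs( enc_dc[1] - sums[8] )
;                     + abs( enc_dc[2] - sums[delta] )
;                     + abs( enc_dc[3] - sums[delta+8] )
;                     + cost_mvx[i];
;             if( ads < thresh )
;                 mvs[nmv++] = i;
;         }
;         return nmv;
;     }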
%macro ADS_START 1 ; unroll_size

%define ABS1 ABS1_MMX

;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
    pshufw mm4, mm4, 0xAA
    movq mm3, [r1+r2+16]
    pshufw mm1, [r10+8], 0
    pshufw mm1, [ebp+stack_offset+28], 0

cglobal x264_pixel_ads2_mmxext, 4,7
    pshufw mm6, mm6, 0xAA

cglobal x264_pixel_ads1_mmxext, 4,7

cglobal x264_pixel_ads4_%1, 4,7
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, 0xAA
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu xmm11, [r1+r2]
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r1+r2+16]
    movdqu xmm1, [r1+16]
    movdqu xmm2, [r1+r2]
    movdqu xmm3, [r1+r2+16]
    movd xmm1, [ebp+stack_offset+28]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1

cglobal x264_pixel_ads2_%1, 4,7
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, 0xAA
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu xmm1, [r1+r2]

cglobal x264_pixel_ads1_%1, 4,7
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r3+16]

%define ABS1 ABS1_SSSE3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv = 0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i = 0; i < width; i += 8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j = 0; j < 8; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
cglobal x264_pixel_ads_mvs
; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
    mov dword [rsp+r5], 0
    test edi, 0xff<<(%1*8)
; no PROLOGUE, inherit from x264_pixel_ads1
    mov ebx, [ebp+stack_offset+20] ; mvs
    mov edi, [ebp+stack_offset+24] ; width
    mov dword [esp+edi], 0
    mov ebp, [esp+esi+4]
    mov edx, [esp+esi+8]
    test %2, 0xff<<(%1*8)