;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorski@gmail.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
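; Worked check of the constants above (assuming the standard SSIM parameters
; K1=.01, K2=.03 and 8-bit dynamic range L=255, scaled to this code's
; summation window):
;   ssim_c1 = (.01*255)^2 * 64      = 416.16    -> 416
;   ssim_c2 = (.03*255)^2 * 64 * 63 = 235962.72 -> 235963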
mask_ff: times 16 db 0xff
%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]
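; pmaddwd against a register of all-ones words is the horizontal-add idiom used
; here: each pair of adjacent 16-bit values collapses into one 32-bit lane, e.g.
;   words { a, b, c, d }  ->  dwords { a+b, c+d }
; a dword horizontal reduction then finishes the sum.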
;=============================================================================
;=============================================================================
%macro SSD_INC_1x16P 0
%macro SSD_INC_2x16P 0
%macro SSD_INC_2x4P 0
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
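; What the SSD functions compute, as a C sketch (a hedged reference, not part
; of this file's build; W and H stand for the block size in the function name):
;
; static int ssd_WxH( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 )
; {
;     int sum = 0;
;     for( int y = 0; y < H; y++, pix1 += i_stride1, pix2 += i_stride2 )
;         for( int x = 0; x < W; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             sum += d*d;
;         }
;     return sum;
; }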
cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
    pxor mm0, mm0 ; mm0 holds the sum
%macro SSD_INC_2x16P_SSE2 0
%macro SSD_INC_2x8P_SSE2 0
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
;=============================================================================
;=============================================================================
%macro LOAD_DIFF_4P 4 ; dst, tmp, [pix1], [pix2]
%macro LOAD_DIFF_8P 4 ; dst, tmp, [pix1], [pix2]
%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [r0], [r2]
    LOAD_DIFF_8P %2, %6, [r0+r1], [r2+r3]
    LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3]
    LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
;    PHSUMSUB %1, %2, %5
;    PHSUMSUB %3, %4, %2
;    PHSUMSUB %1, %3, %4
;    PHSUMSUB %5, %2, %3
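; HADAMARD4_1D below builds a 4-point transform out of two SUMSUB_BADC butterfly
; stages (SUMSUB_BADC itself is not shown in this excerpt). Up to output order
; and signs, which are irrelevant once absolute values are summed, the effect on
; a row (a,b,c,d) is:
;   stage 1: (a+b, a-b, c+d, c-d)
;   stage 2: (a+b+c+d, a+b-c-d, a-b+c-d, a-b-c+d)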
%macro HADAMARD4_1D 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%macro HADAMARD8_1D 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4x2 to not shuffle registers
%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
    SBUTTERFLY dqa, dq, %1, %2, %5
    SBUTTERFLY dqa, dq, %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
    SBUTTERFLY dqa, wd, %1, %2, %5
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, dq, %1, %3, %4
    SBUTTERFLY2 dqa, dq, %5, %2, %3
    SBUTTERFLY dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
    SBUTTERFLY dqa, wd, %1, %2, %9
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, wd, %5, %6, %4
    SBUTTERFLY dqa, wd, %7, %8, %6
    SBUTTERFLY dqa, dq, %1, %3, %8
    SBUTTERFLY dqa, dq, %9, %2, %3
    SBUTTERFLY dqa, dq, %5, %7, %2
    SBUTTERFLY dqa, dq, %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb
    SBUTTERFLY dqa, wd, %1, %2, %8
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, wd, %5, %6, %4
    SBUTTERFLY dqa, wd, %7, %8, %6
    SBUTTERFLY dqa, dq, %1, %3, %8
    SBUTTERFLY dqa, dq, %8, %2, %3
    SBUTTERFLY dqa, dq, %5, %7, %2
    SBUTTERFLY dqa, dq, %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %8, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%macro ABS1_MMX 2 ; a, tmp
%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
    HADAMARD4_1D mm4, mm5, mm6, mm7
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
    HADAMARD4_1D mm4, mm7, %1, mm6
    ABS2 mm4, mm7, mm3, mm5
    ABS2 %1, mm6, mm3, mm5
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
    LOAD_DIFF_4P mm4, mm3, [r0+%2], [r2+%2]
    LOAD_DIFF_4P mm5, mm3, [r0+r1+%2], [r2+r3+%2]
    LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF_4P mm7, mm3, [r0+r4+%2], [r2+r5+%2]
%macro SATD_8x4_START 1
    SATD_4x4_MMX mm0, 0, 0
    SATD_4x4_MMX mm1, 4, %1
%macro SATD_8x4_INC 1
    SATD_4x4_MMX mm2, 0, 0
    SATD_4x4_MMX mm1, 4, %1
%macro SATD_16x4_START 1
    SATD_4x4_MMX mm0, 0, 0
    SATD_4x4_MMX mm1, 4, 0
    SATD_4x4_MMX mm2, 8, 0
    SATD_4x4_MMX mm1, 12, %1
%macro SATD_16x4_INC 1
    SATD_4x4_MMX mm2, 0, 0
    SATD_4x4_MMX mm1, 4, 0
    SATD_4x4_MMX mm2, 8, 0
    SATD_4x4_MMX mm1, 12, %1
%macro SATD_8x4_SSE2 1
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
    ABS4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
%macro SATD_START_MMX 0
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%macro SATD_END_MMX 0
    pshufw mm1, mm0, 01001110b
    pshufw mm1, mm0, 10110001b
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
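; What one 4x4 SATD block computes, as a C sketch (a hedged reference, not part
; of this file's build; x264's C code conventionally halves the accumulated
; total, and where that >>1 happens is not shown in this excerpt):
;
; #include <stdlib.h> /* abs */
; static int satd_4x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
; {
;     int16_t d[4][4], m[4][4];
;     int satd = 0;
;     for( int y = 0; y < 4; y++ )
;         for( int x = 0; x < 4; x++ )
;             d[y][x] = pix1[y*i_pix1+x] - pix2[y*i_pix2+x];
;     for( int y = 0; y < 4; y++ )
;     {   /* horizontal 4-point Hadamard */
;         int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
;         int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
;         m[y][0] = a0+a2; m[y][1] = a1+a3; m[y][2] = a0-a2; m[y][3] = a1-a3;
;     }
;     for( int x = 0; x < 4; x++ )
;     {   /* vertical 4-point Hadamard, then sum of absolute coefficients */
;         int a0 = m[0][x] + m[1][x], a1 = m[0][x] - m[1][x];
;         int a2 = m[2][x] + m[3][x], a3 = m[2][x] - m[3][x];
;         satd += abs(a0+a2) + abs(a1+a3) + abs(a0-a2) + abs(a1-a3);
;     }
;     return satd;
; }
; The larger block sizes are sums of such 4x4 (or 8x4) sub-blocks.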
cglobal x264_pixel_satd_16x16_mmxext, 4,6
    pshufw mm1, mm0, 01001110b
    pshufw mm1, mm0, 01001110b
cglobal x264_pixel_satd_16x8_mmxext, 4,6
cglobal x264_pixel_satd_8x16_mmxext, 4,6
cglobal x264_pixel_satd_8x8_mmxext, 4,6
cglobal x264_pixel_satd_8x4_mmxext, 4,6
cglobal x264_pixel_satd_4x8_%1, 4,6
    SATD_4x4_MMX mm0, 0, 1
    SATD_4x4_MMX mm1, 0, 0
cglobal x264_pixel_satd_4x4_%1, 4,6
    SATD_4x4_MMX mm0, 0, 0
%macro SATD_START_SSE2 0
%macro SATD_END_SSE2 0
%macro BACKUP_POINTERS 0
%macro RESTORE_AND_INC_POINTERS 0
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1, 4,6
    RESTORE_AND_INC_POINTERS
cglobal x264_pixel_satd_16x8_%1, 4,6
    RESTORE_AND_INC_POINTERS
cglobal x264_pixel_satd_8x16_%1, 4,6
cglobal x264_pixel_satd_8x8_%1, 4,6
cglobal x264_pixel_satd_8x4_%1, 4,6
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
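; What the 8x8 SA8D cost is, as a C sketch (a hedged reference; hadamard8 is an
; illustrative helper, abs() is from <stdlib.h>, and the (sum+2)>>2
; normalization is assumed to match x264's C fallback):
;
; static void hadamard8( int16_t d[8] )  /* in-place, unnormalized */
; {
;     for( int s = 1; s < 8; s <<= 1 )
;         for( int i = 0; i < 8; i += 2*s )
;             for( int j = i; j < i+s; j++ )
;             {
;                 int a = d[j], b = d[j+s];
;                 d[j]   = a+b;
;                 d[j+s] = a-b;
;             }
; }
;
; static int sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
; {
;     int16_t d[8][8];
;     int sum = 0;
;     for( int y = 0; y < 8; y++ )
;         for( int x = 0; x < 8; x++ )
;             d[y][x] = pix1[y*i_pix1+x] - pix2[y*i_pix2+x];
;     for( int y = 0; y < 8; y++ )
;         hadamard8( d[y] );
;     for( int x = 0; x < 8; x++ )
;     {
;         int16_t col[8];
;         for( int y = 0; y < 8; y++ ) col[y] = d[y][x];
;         hadamard8( col );
;         for( int y = 0; y < 8; y++ ) sum += abs( col[y] );
;     }
;     return (sum+2)>>2;
; }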
cglobal x264_pixel_sa8d_8x8_%1
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
    LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
    HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
    ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
    ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
    add r10d, eax ; preserve rounding for 16x16
cglobal x264_pixel_sa8d_16x16_%1
    call x264_pixel_sa8d_8x8_%1 ; pix[0]
    call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride]
    neg r4 ; it's already r1*3
    call x264_pixel_sa8d_8x8_%1 ; pix[8]
    call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride+8]
cglobal x264_pixel_sa8d_8x8_%1, 4,7
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
    LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
    HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
    HADAMARD8_1D xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
    movdqa [esp+16], xmm7
    ABS2 xmm2, xmm3, xmm6, xmm7
    ABS2 xmm0, xmm1, xmm6, xmm7
    movdqa xmm7, [esp+16]
    ABS2 xmm4, xmm5, xmm2, xmm3
    ABS2 xmm6, xmm7, xmm2, xmm3
    mov ecx, eax ; preserve rounding for 16x16
%endmacro ; SATDS_SSE2
%macro SA8D_16x16_32 1
cglobal x264_pixel_sa8d_16x16_%1
    push dword [esp+20] ; stride2
    push dword [esp+20] ; pix2
    push dword [esp+20] ; stride1
    push dword [esp+20] ; pix1
    call x264_pixel_sa8d_8x8_%1
    add dword [esp+0], 8 ; pix1+8
    add dword [esp+8], 8 ; pix2+8
    call x264_pixel_sa8d_8x8_%1
    add [esp+0], eax ; pix1+8*stride1+8
    add [esp+8], edx ; pix2+8*stride2+8
    call x264_pixel_sa8d_8x8_%1
    sub dword [esp+0], 8 ; pix1+8*stride1
    sub dword [esp+8], 8 ; pix2+8*stride2
    call x264_pixel_sa8d_8x8_%1
%endif ; !ARCH_X86_64
%endmacro ; SA8D_16x16_32
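; Note on the "preserve rounding for 16x16" comments above: the intent is that
; the 16x16 cost be rounded once at the end, i.e. ((s0+s1+s2+s3)+2)>>2 over the
; four raw 8x8 sums, rather than summing four individually rounded 8x8 results
; (an assumption based on those comments; the final reduction is not shown in
; this excerpt).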
;=============================================================================
;=============================================================================
%macro INTRA_SA8D_SSE2 1
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1
    movq xmm0, [r0+0*FENC_STRIDE]
    movq xmm7, [r0+1*FENC_STRIDE]
    movq xmm6, [r0+2*FENC_STRIDE]
    movq xmm3, [r0+3*FENC_STRIDE]
    movq xmm5, [r0+4*FENC_STRIDE]
    movq xmm1, [r0+5*FENC_STRIDE]
    movq xmm8, [r0+6*FENC_STRIDE]
    movq xmm2, [r0+7*FENC_STRIDE]
    HADAMARD8_1D xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movzx edi, word [r1+0]
    ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
    ABS2 xmm10, xmm11, xmm13, xmm14
    movdqa xmm14, xmm15 ; 7x8 sum
    movdqa xmm8, [r1+0] ; left edge
    ABS1 xmm9, xmm11 ; 1x8 sum
    punpcklqdq xmm0, xmm4 ; transpose
    movdqa xmm1, [r1+16] ; top edge
    psrldq xmm2, 2 ; 8x7 sum
    psubw xmm0, xmm1 ; 8x1 sum
    movdqa xmm7, [pw_1 GLOBAL]
    punpckldq xmm2, xmm14
    punpckhdq xmm3, xmm14
    pshufd xmm5, xmm15, 0xf5
    punpcklqdq xmm2, xmm5
    punpckhqdq xmm3, xmm5
    movq [r2], xmm3 ; i8x8_v, i8x8_h
    movd [r2+8], xmm3 ; i8x8_dc
%endmacro ; INTRA_SA8D_SSE2
; out: mm0..mm3 = hadamard coefs
    movd mm0, [r0+0*FENC_STRIDE]
    movd mm4, [r0+1*FENC_STRIDE]
    movd mm3, [r0+2*FENC_STRIDE]
    movd mm1, [r0+3*FENC_STRIDE]
    HADAMARD4_1D mm0, mm4, mm3, mm1
    TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
    HADAMARD4_1D mm0, mm1, mm2, mm3
%macro SCALAR_SUMSUB 4
%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
    shl %1d, 5 ; log2(FDEC_STRIDE)
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [left_1d+2*%1+0], %2w
    mov [left_1d+2*%1+2], %3w
    mov [left_1d+2*%1+4], %4w
    mov [left_1d+2*%1+6], %5w
%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [top_1d+2*%1+0], %2w
    mov [top_1d+2*%1+2], %3w
    mov [top_1d+2*%1+4], %4w
    mov [top_1d+2*%1+6], %5w
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    mov qword [sums+0], 0
    mov qword [sums+8], 0
    mov qword [sums+16], 0
    ABS2 mm4, mm5, mm6, mm7
; in: mm0..mm3 (4x4), mm7 (3x4)
; out: mm0 v, mm4 h, mm5 dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq mm0, mm2 ; transpose
    ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
    ABS1 mm0, mm1 ; 4x1 sum
%macro INTRA_SATDS_MMX 1
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
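; Semantically this computes, for the 4x4 block at fenc (a sketch; the
; predict_4x4_* names are illustrative shorthand, not calls the asm makes --
; it works from 1-D Hadamards of the fenc block and of the fdec edges instead):
;   res[0] = satd_4x4( fenc, predict_4x4_v( fdec ) );   /* vertical   */
;   res[1] = satd_4x4( fenc, predict_4x4_h( fdec ) );   /* horizontal */
;   res[2] = satd_4x4( fenc, predict_4x4_dc( fdec ) );  /* DC         */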
cglobal x264_intra_satd_x3_4x4_%1, 2,6
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define left_1d rsp-16 ; size 8
; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
%define top_1d esp+8
    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
    lea t0d, [t0d + r0d + 4]
    SUM4x3 t0d, [left_1d], [top_1d]
    psrlq mm1, 16 ; 4x3 sum
    SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
    movd [r2+0], mm0 ; i4x4_v satd
    movd [r2+4], mm4 ; i4x4_h satd
    movd [r2+8], mm5 ; i4x4_dc satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%assign stack_pad 88
%assign stack_pad 88 + ((stack_offset+88+4)&15)
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums rsp+64 ; size 24
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    paddw mm0, [sums+0] ; i16x16_v satd
    paddw mm4, [sums+8] ; i16x16_h satd
    paddw mm5, [sums+16] ; i16x16_dc satd
    add r0, 4*FENC_STRIDE-16
    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    movd [r2+8], mm2 ; i16x16_dc satd
    movd [r2+4], mm1 ; i16x16_h satd
    movd [r2+0], mm0 ; i16x16_v satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
%define sums rsp+48 ; size 24
%define dc_1d rsp+32 ; size 16
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
    movzx t2d, word [left_1d+0]
    movzx r3d, word [top_1d+0]
    movzx r4d, word [left_1d+8]
    movzx r5d, word [top_1d+8]
    mov [dc_1d+ 0], t2d ; tl
    mov [dc_1d+ 4], r5d ; tr
    mov [dc_1d+ 8], r4d ; bl
    mov [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    paddw mm0, [sums+16] ; i4x4_v satd
    paddw mm4, [sums+8] ; i4x4_h satd
    paddw mm5, [sums+0] ; i4x4_dc satd
    add r0, 4*FENC_STRIDE-8
    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    movd [r2+0], mm0 ; i8x8c_dc satd
    movd [r2+4], mm1 ; i8x8c_h satd
    movd [r2+8], mm2 ; i8x8c_v satd
cextern x264_pixel_sa8d_8x8_mmxext
SA8D_16x16_32 mmxext
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
;=============================================================================
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
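; What the core accumulates, as a C sketch (a hedged reference; for each of two
; horizontally adjacent 4x4 blocks it stores the pixel sums, the sum of squares
; and the sum of products into sums[z][0..3]):
;
; static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                              const uint8_t *pix2, int stride2, int sums[2][4] )
; {
;     for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x+y*stride1];
;                 int b = pix2[x+y*stride2];
;                 s1  += a;
;                 s2  += b;
;                 ss  += a*a + b*b;
;                 s12 += a*b;
;             }
;         sums[z][0] = s1;
;         sums[z][1] = s2;
;         sums[z][2] = ss;
;         sums[z][3] = s12;
;     }
; }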
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
    punpcklbw xmm5, xmm0
    punpcklbw xmm6, xmm0
    movdqa xmm7, [pw_1 GLOBAL]
    pshufd xmm5, xmm3, 0xb1
    pshufd xmm6, xmm4, 0xb1
    pshufd xmm1, xmm1, 0xd8
    punpckldq xmm3, xmm4
    punpckhdq xmm5, xmm4
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
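; Per-block formula evaluated below (reconstructed from the inline comments;
; the *64 scaling of vars/covar is an assumption consistent with the
; window-scaled ssim_c1/ssim_c2 constants at the top of the file):
;   vars  = ss*64  - s1*s1 - s2*s2
;   covar = s12*64 - s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; x264_pixel_ssim_end4 sums this over width adjacent block positions and
; returns the total as a float.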
cglobal x264_pixel_ssim_end4_sse2, 3,3
    movdqa xmm0, [r0+ 0]
    movdqa xmm1, [r0+16]
    movdqa xmm2, [r0+32]
    movdqa xmm3, [r0+48]
    movdqa xmm4, [r0+64]
    movdqa xmm5, [ssim_c1 GLOBAL]
    movdqa xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
; s1=mm0, s2=mm3, ss=mm4, s12=mm2
    pmaddwd xmm1, xmm0 ; s1*s2
    pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
    psubd xmm2, xmm1 ; covar*2
    psubd xmm4, xmm0 ; vars
    cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
    divps xmm1, xmm0 ; ssim
    je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
    lea r3, [mask_ff + 16 GLOBAL]
    movdqu xmm3, [r3 + r2*4]
    movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
    pshuflw xmm1, xmm0, 0xE
;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_START 1 ; unroll_size
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
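; Contract of the ADS kernels, as a C sketch (a hedged reference; the sums[]
; offsets mirror the loads below, and ads2/ads1 use two / one enc_dc
; component(s) in the same way). Note the asm builds a byte mask per candidate
; and then converts it to an mv list via x264_pixel_ads_mvs (see its C
; pseudocode further down) rather than writing mvs directly in this loop.
;
; static int ads4( int enc_dc[4], uint16_t *sums, int delta,
;                  uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
; {
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;
; }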
cglobal x264_pixel_ads4_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
    pshufw mm4, mm4, 0xAA
    movq mm3, [r1+r2+16]
    pshufw mm1, [r10+8], 0
    pshufw mm1, [ebp+stack_offset+28], 0
cglobal x264_pixel_ads2_mmxext, 4,7
    pshufw mm6, mm6, 0xAA
cglobal x264_pixel_ads1_mmxext, 4,7
cglobal x264_pixel_ads4_%1, 4,7
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, 0xAA
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu xmm11, [r1+r2]
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r1+r2+16]
    movdqu xmm1, [r1+16]
    movdqu xmm2, [r1+r2]
    movdqu xmm3, [r1+r2+16]
    movd xmm1, [ebp+stack_offset+28]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
cglobal x264_pixel_ads2_%1, 4,7
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, 0xAA
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu xmm1, [r1+r2]
cglobal x264_pixel_ads1_%1, 4,7
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r3+16]
%define ABS1 ABS1_SSSE3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255<<j*8) )
cglobal x264_pixel_ads_mvs
; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
    mov dword [rsp+r5], 0
    test edi, 0xff<<(%1*8)
; no PROLOGUE, inherit from x264_pixel_ads1
    mov ebx, [ebp+stack_offset+20] ; mvs
    mov edi, [ebp+stack_offset+24] ; width
    mov dword [esp+edi], 0
    mov ebp, [esp+esi+4]
    mov edx, [esp+esi+8]
    test %2, 0xff<<(%1*8)