;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
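; The two constants above are the standard SSIM stabilizers C1 = (K1*L)^2 and
; C2 = (K2*L)^2 (K1 = .01, K2 = .03, L = 255), prescaled for the integer sums
; used by the SSIM routines below. A sketch of the arithmetic:
;     ssim_c1 = (int)(.01*.01*255*255*64 + .5);    /* = 416    */
;     ssim_c2 = (int)(.03*.03*255*255*64*63 + .5); /* = 235963 */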
mask_ff: times 16 db 0xff
%macro HADDD 2 ; %1 = sum (dwords added horizontally in place), %2 = scratch
    pmaddwd %1, [pw_1 GLOBAL]
;=============================================================================
; SSD
;=============================================================================
%macro SSD_INC_1x16P 0
    por       mm1, mm2 ; mm1 = 8bit abs diff
    punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
%macro SSD_INC_1x4P 0
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
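; Reference C for the SSD template below (a sketch, not taken from pixel.c:
; width/height stand in for the %1x%2 parameters, argument names are
; illustrative):
;     int ssd_WxH( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
;     {
;         int sum = 0;
;         for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < width; x++ )
;             {
;                 int d = pix1[x] - pix2[x];
;                 sum += d*d;
;             }
;         return sum;
;     }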
cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
    pxor mm0, mm0 ; mm0 holds the sum
%macro SSD_INC_2x16P_SSE2 0
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
;=============================================================================
; SATD
;=============================================================================
%macro LOAD_DIFF_4P 4 ; dst, tmp, [pix1], [pix2]
%macro LOAD_DIFF_8P 4 ; dst, tmp, [pix1], [pix2]
%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [r0],      [r2]
    LOAD_DIFF_8P %2, %6, [r0+r1],   [r2+r3]
    LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3]
    LOAD_DIFF_8P %4, %6, [r0+r4],   [r2+r5]
;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
;    PHSUMSUB %1, %2, %5
;    PHSUMSUB %3, %4, %2
;    PHSUMSUB %1, %3, %4
;    PHSUMSUB %5, %2, %3
%macro HADAMARD4_1D 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
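; Up to output ordering and signs (neither matters once absolute values are
; summed), HADAMARD4_1D is an unnormalized 4-point Hadamard butterfly:
;     (a,b,c,d) -> (a+b+c+d, a+b-c-d, a-b+c-d, a-b-c+d)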
%macro HADAMARD8_1D 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%macro SBUTTERFLY2 5 ; not really needed, but allows TRANSPOSE2x4x4W to not shuffle registers
%macro TRANSPOSE4x4W 5 ; abcd-t -> adtc
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
    SBUTTERFLY dqa, dq,  %1, %2, %5
    SBUTTERFLY dqa, dq,  %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3
%macro TRANSPOSE2x4x4W 5 ; abcd-t -> abcd
    SBUTTERFLY  dqa, wd,  %1, %2, %5
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, dq,  %1, %3, %4
    SBUTTERFLY2 dqa, dq,  %5, %2, %3
    SBUTTERFLY  dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3
%macro TRANSPOSE8x8W 9 ; abcdefgh-t -> afhdtecb
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%macro TRANSPOSE8x8W 9 ; abcdefgh -> afhdgecb (%9 = memory spill slot, x86_32)
    SBUTTERFLY dqa, wd,  %1, %2, %8
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %8, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %8, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%macro ABS1_MMX 2 ; a, tmp
%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
    HADAMARD4_1D  mm4, mm5, mm6, mm7
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
    HADAMARD4_1D  mm4, mm7, %1, mm6
    ABS2          mm4, mm7, mm3, mm5
    ABS2          %1,  mm6, mm3, mm5
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
%macro SATD_4x4_MMX 3
    LOAD_DIFF_4P mm4, mm3, [r0+%2],      [r2+%2]
    LOAD_DIFF_4P mm5, mm3, [r0+r1+%2],   [r2+r3+%2]
    LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF_4P mm7, mm3, [r0+r4+%2],   [r2+r5+%2]
%macro SATD_8x4_START 1
    SATD_4x4_MMX mm0, 0, 0
    SATD_4x4_MMX mm1, 4, %1
%macro SATD_8x4_INC 1
    SATD_4x4_MMX mm2, 0, 0
    SATD_4x4_MMX mm1, 4, %1
%macro SATD_16x4_START 1
    SATD_4x4_MMX mm0,  0, 0
    SATD_4x4_MMX mm1,  4, 0
    SATD_4x4_MMX mm2,  8, 0
    SATD_4x4_MMX mm1, 12, %1
%macro SATD_16x4_INC 1
    SATD_4x4_MMX mm2,  0, 0
    SATD_4x4_MMX mm1,  4, 0
    SATD_4x4_MMX mm2,  8, 0
    SATD_4x4_MMX mm1, 12, %1
%macro SATD_8x4_SSE2 1
    LOAD_DIFF_8x4P  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    HADAMARD4_1D    xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD4_1D    xmm0, xmm1, xmm2, xmm3
    ABS4            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
%macro SATD_START_MMX 0
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%macro SATD_END_MMX 0
    pshufw mm1, mm0, 01001110b
    pshufw mm1, mm0, 10110001b
; FIXME: avoid spilling regs to hold 3*stride;
; for small blocks on x86_32, modify the pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
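; Reference C for the SATD kernel (a sketch: d[][] holds the 4x4 block of
; pix1-pix2 differences, abs() needs <stdlib.h>; the Hadamard here is
; unnormalized and any final scaling done by the full functions is omitted):
;     int satd_4x4( int d[4][4] )
;     {
;         int t[4][4], sum = 0;
;         for( int i = 0; i < 4; i++ )  /* horizontal 1D Hadamard */
;         {
;             int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
;             int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
;             t[i][0] = s01 + s23;  t[i][1] = s01 - s23;
;             t[i][2] = d01 + d23;  t[i][3] = d01 - d23;
;         }
;         for( int i = 0; i < 4; i++ )  /* vertical 1D Hadamard + sum of abs */
;         {
;             int s01 = t[0][i] + t[1][i], d01 = t[0][i] - t[1][i];
;             int s23 = t[2][i] + t[3][i], d23 = t[2][i] - t[3][i];
;             sum += abs(s01+s23) + abs(s01-s23) + abs(d01+d23) + abs(d01-d23);
;         }
;         return sum;
;     }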
cglobal x264_pixel_satd_16x16_mmxext, 4,6
    pshufw mm1, mm0, 01001110b
    pshufw mm1, mm0, 01001110b
cglobal x264_pixel_satd_16x8_mmxext, 4,6
cglobal x264_pixel_satd_8x16_mmxext, 4,6
cglobal x264_pixel_satd_8x8_mmxext, 4,6
cglobal x264_pixel_satd_8x4_mmxext, 4,6
cglobal x264_pixel_satd_4x8_mmxext, 4,6
    SATD_4x4_MMX mm0, 0, 1
    SATD_4x4_MMX mm1, 0, 0
cglobal x264_pixel_satd_4x4_mmxext, 4,6
    SATD_4x4_MMX mm0, 0, 0
%macro SATD_START_SSE2 0
%macro SATD_END_SSE2 0
%macro BACKUP_POINTERS 0
%macro RESTORE_AND_INC_POINTERS 0
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1, 4,6
    RESTORE_AND_INC_POINTERS
cglobal x264_pixel_satd_16x8_%1, 4,6
    RESTORE_AND_INC_POINTERS
cglobal x264_pixel_satd_8x16_%1, 4,6
cglobal x264_pixel_satd_8x8_%1, 4,6
cglobal x264_pixel_satd_8x4_%1, 4,6
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
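; sa8d is the 8x8 analogue of satd: the sum of absolute values of an 8x8
; Hadamard transform of the pix1-pix2 differences,
;     sa8d = sum_{i,j} | (H8 * (pix1 - pix2) * H8)[i][j] |
; (a sketch; final rounding is left to the callers, which is why the 8x8
; kernels below "preserve rounding for 16x16").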
cglobal x264_pixel_sa8d_8x8_%1
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm8, xmm9
    LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
    HADAMARD8_1D  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD8_1D  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
    ABS4 xmm0, xmm1, xmm2, xmm3, xmm6, xmm9
    ABS4 xmm4, xmm5, xmm7, xmm8, xmm6, xmm9
    add r10d, eax ; preserve rounding for 16x16
cglobal x264_pixel_sa8d_16x16_%1
    call x264_pixel_sa8d_8x8_%1          ; pix[0]
    call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride]
    neg  r4                              ; it's already r1*3
    call x264_pixel_sa8d_8x8_%1          ; pix[8]
    call x264_pixel_sa8d_8x8_%1.skip_lea ; pix[8*stride+8]
cglobal x264_pixel_sa8d_8x8_%1, 4,7
    LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm6, xmm7
    LOAD_DIFF_8x4P xmm4, xmm5, xmm6, xmm7, xmm2, xmm2
    HADAMARD8_1D  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8W xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, esp
    HADAMARD8_1D  xmm0, xmm5, xmm7, xmm3, xmm6, xmm4, xmm2, xmm1
    movdqa [esp+16], xmm7
    ABS2   xmm2, xmm3, xmm6, xmm7
    ABS2   xmm0, xmm1, xmm6, xmm7
    movdqa xmm7, [esp+16]
    ABS2   xmm4, xmm5, xmm2, xmm3
    ABS2   xmm6, xmm7, xmm2, xmm3
    mov ecx, eax ; preserve rounding for 16x16
%endmacro ; SATDS_SSE2
%macro SA8D_16x16_32 1
cglobal x264_pixel_sa8d_16x16_%1
    push dword [esp+20] ; stride2
    push dword [esp+20] ; pix2
    push dword [esp+20] ; stride1
    push dword [esp+20] ; pix1
    call x264_pixel_sa8d_8x8_%1
    add  dword [esp+0], 8 ; pix1+8
    add  dword [esp+8], 8 ; pix2+8
    call x264_pixel_sa8d_8x8_%1
    add  [esp+0], eax ; pix1+8*stride1+8
    add  [esp+8], edx ; pix2+8*stride2+8
    call x264_pixel_sa8d_8x8_%1
    sub  dword [esp+0], 8 ; pix1+8*stride1
    sub  dword [esp+8], 8 ; pix2+8*stride2
    call x264_pixel_sa8d_8x8_%1
%endif ; !ARCH_X86_64
%endmacro ; SA8D_16x16_32
;=============================================================================
; INTRA SATD
;=============================================================================
%macro INTRA_SA8D_SSE2 1
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
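; All three predictions share one transform: V/H/DC predictions are constant
; along columns/rows/the whole block, so their 8x8 Hadamard transforms are
; nonzero only in the first row, first column, or DC coefficient respectively.
; One transform of fenc plus the precomputed edge sums in edges[2][8] is
; therefore enough to derive (a sketch; order matches the stores below):
;     res[0] = i8x8_v cost,  res[1] = i8x8_h cost,  res[2] = i8x8_dc cost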
cglobal x264_intra_sa8d_x3_8x8_core_%1
    movq xmm0, [r0+0*FENC_STRIDE]
    movq xmm7, [r0+1*FENC_STRIDE]
    movq xmm6, [r0+2*FENC_STRIDE]
    movq xmm3, [r0+3*FENC_STRIDE]
    movq xmm5, [r0+4*FENC_STRIDE]
    movq xmm1, [r0+5*FENC_STRIDE]
    movq xmm8, [r0+6*FENC_STRIDE]
    movq xmm2, [r0+7*FENC_STRIDE]
    HADAMARD8_1D  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8W xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD8_1D  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movzx edi, word [r1+0]
    ABS4 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13
    ABS2 xmm10, xmm11, xmm13, xmm14
    movdqa xmm14, xmm15   ; 7x8 sum
    movdqa xmm8, [r1+0]   ; left edge
    ABS1 xmm9, xmm11      ; 1x8 sum
    punpcklqdq xmm0, xmm4 ; transpose
    movdqa xmm1, [r1+16]  ; top edge
    psrldq xmm2, 2        ; 8x7 sum
    psubw  xmm0, xmm1     ; 8x1 sum
    movdqa xmm7, [pw_1 GLOBAL]
    punpckldq xmm2, xmm14
    punpckhdq xmm3, xmm14
    pshufd xmm5, xmm15, 0xf5
    punpcklqdq xmm2, xmm5
    punpckhqdq xmm3, xmm5
    movq [r2],   xmm3 ; i8x8_v, i8x8_h
    movd [r2+8], xmm3 ; i8x8_dc
%endmacro ; INTRA_SA8D_SSE2
; out: mm0..mm3 = hadamard coefs
    movd mm0, [r0+0*FENC_STRIDE]
    movd mm4, [r0+1*FENC_STRIDE]
    movd mm3, [r0+2*FENC_STRIDE]
    movd mm1, [r0+3*FENC_STRIDE]
    HADAMARD4_1D  mm0, mm4, mm3, mm1
    TRANSPOSE4x4W mm0, mm4, mm3, mm1, mm2
    HADAMARD4_1D  mm0, mm1, mm2, mm3
%macro SCALAR_SUMSUB 4
%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
    shl   %1d, 5 ; log2(FDEC_STRIDE)
    movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
    movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
    movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
    movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [left_1d+2*%1+0], %2w
    mov [left_1d+2*%1+2], %3w
    mov [left_1d+2*%1+4], %4w
    mov [left_1d+2*%1+6], %5w
%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
    movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
    movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
    movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
    movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
    SCALAR_SUMSUB %2d, %3d, %4d, %5d
    SCALAR_SUMSUB %2d, %4d, %3d, %5d
    mov [top_1d+2*%1+0], %2w
    mov [top_1d+2*%1+2], %3w
    mov [top_1d+2*%1+4], %4w
    mov [top_1d+2*%1+6], %5w
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    pshufw %4, %1, 01001110b
    pshufw %5, %2, 01001110b
    pshufw %6, %3, 01001110b
    mov qword [sums+0],  0
    mov qword [sums+8],  0
    mov qword [sums+16], 0
    ABS2 mm4, mm5, mm6, mm7
; in:  mm0..mm3 (4x4), mm7 (3x4)
; out: mm0 = v, mm4 = h, mm5 = dc
%macro SUM4x3 3 ; dc, left, top
    punpckldq mm0, mm2      ; transpose
    ABS2 mm4, mm5, mm2, mm3 ; 1x4 sum
    ABS1 mm0, mm1           ; 4x1 sum
%macro INTRA_SATDS_MMX 1
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_4x4_%1, 2,6
; stack is 16-byte aligned because the ABI says so
%define  top_1d  rsp-8  ; size 8
%define  left_1d rsp-16 ; size 8
; stack is 16-byte aligned at least with gcc, and we've pushed 3 regs + the
; return address (16 bytes total), so it's still aligned
%define  top_1d  esp+8
    SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
    SCALAR_HADAMARD_TOP  0, r0, r3, r4, r5
    lea t0d, [t0d + r0d + 4]
    SUM4x3 t0d, [left_1d], [top_1d]
    psrlq mm1, 16 ; 4x3 sum
    SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
    movd [r2+0], mm0 ; i4x4_v satd
    movd [r2+4], mm4 ; i4x4_h satd
    movd [r2+8], mm5 ; i4x4_dc satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%assign stack_pad 88
%assign stack_pad 88 + ((stack_offset+88+4)&15)
; not really needed on x86_64, just shuts up valgrind about storing data
; below the stack across a function call
%define sums    rsp+64 ; size 24
%define top_1d  rsp+32 ; size 32
%define left_1d rsp    ; size 32
    SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
    SCALAR_HADAMARD_TOP  t0, r3, r4, r5, r6
    SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
    paddw mm0, [sums+0]  ; i16x16_v satd
    paddw mm4, [sums+8]  ; i16x16_h satd
    paddw mm5, [sums+16] ; i16x16_dc satd
    add r0, 4*FENC_STRIDE-16
    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    movd [r2+8], mm2 ; i16x16_dc satd
    movd [r2+4], mm1 ; i16x16_h satd
    movd [r2+0], mm0 ; i16x16_v satd
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data
; below the stack across a function call
%define sums    rsp+48 ; size 24
%define dc_1d   rsp+32 ; size 16
%define top_1d  rsp+16 ; size 16
%define left_1d rsp    ; size 16
    SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
    SCALAR_HADAMARD_TOP  t0, t2, r3, r4, r5
    movzx t2d, word [left_1d+0]
    movzx r3d, word [top_1d+0]
    movzx r4d, word [left_1d+8]
    movzx r5d, word [top_1d+8]
    mov [dc_1d+ 0], t2d ; tl
    mov [dc_1d+ 4], r5d ; tr
    mov [dc_1d+ 8], r4d ; bl
    mov [dc_1d+12], r3d ; br
    SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
    paddw mm0, [sums+16] ; i4x4_v satd
    paddw mm4, [sums+8]  ; i4x4_h satd
    paddw mm5, [sums+0]  ; i4x4_dc satd
    add r0, 4*FENC_STRIDE-8
    SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    movd [r2+0], mm0 ; i8x8c_dc satd
    movd [r2+4], mm1 ; i8x8c_h satd
    movd [r2+8], mm2 ; i8x8c_v satd
; FIXME width4 can benefit from pabsw even if not sse2
cextern x264_pixel_sa8d_8x8_mmxext
SA8D_16x16_32 mmxext
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
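; A sketch of the equivalent C: per call, two adjacent 4x4 blocks are each
; reduced to (sum1, sum2, sum of squares, cross sum):
;     static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                                  const uint8_t *pix2, int stride2, int sums[2][4] )
;     {
;         for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;         {
;             int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;             for( int y = 0; y < 4; y++ )
;                 for( int x = 0; x < 4; x++ )
;                 {
;                     int a = pix1[x+y*stride1], b = pix2[x+y*stride2];
;                     s1 += a;  s2 += b;  ss += a*a + b*b;  s12 += a*b;
;                 }
;             sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;         }
;     }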
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
    punpcklbw xmm5, xmm0
    punpcklbw xmm6, xmm0
    movdqa xmm7, [pw_1 GLOBAL]
    pshufd xmm5, xmm3, 0xb1
    pshufd xmm6, xmm4, 0xb1
    pshufd xmm1, xmm1, 0xd8
    punpckldq xmm3, xmm4
    punpckhdq xmm5, xmm4
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
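; Per output value this evaluates the SSIM formula on sums over an 8x8 window
; assembled from neighbouring 4x4 blocks (a sketch in C, matching the
; register comments in the code below):
;     int vars  = ss*64  - s1*s1 - s2*s2;
;     int covar = s12*64 - s1*s2;
;     ssim = (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;          / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));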
cglobal x264_pixel_ssim_end4_sse2, 3,3
    movdqa xmm0, [r0+ 0]
    movdqa xmm1, [r0+16]
    movdqa xmm2, [r0+32]
    movdqa xmm3, [r0+48]
    movdqa xmm4, [r0+64]
    movdqa xmm5, [ssim_c1 GLOBAL]
    movdqa xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4
; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
    pmaddwd xmm1, xmm0 ; s1*s2
    pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
    psubd   xmm2, xmm1 ; covar*2
    psubd   xmm4, xmm0 ; vars
    cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
    divps xmm1, xmm0 ; ssim
    je .skip ; faster only if this is the common case; remove the branch if ssim is used at the macroblock level
    lea    r3, [mask_ff + 16 GLOBAL]
    movdqu xmm3, [r3 + r2*4]
    movdqu xmm3, [mask_ff + r2*4 + 16 GLOBAL]
    pshuflw xmm1, xmm0, 0xE
;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_START 1 ; unroll_size
    jmp x264_pixel_ads_mvs
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
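; What ads4 computes, as a C sketch (the asm produces a per-position mask and
; defers list building to x264_pixel_ads_mvs; argument names as in the
; prototype above, sub-block offsets as in the loads below):
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;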
cglobal x264_pixel_ads4_mmxext, 4,5
    pshufw mm6, mm6, 0xAA
    pshufw mm4, mm4, 0xAA
    movq   mm3, [r1+r2+16]
    pshufw mm1, [r10+8], 0
    pshufw mm1, [ebp+stack_offset+28], 0
cglobal x264_pixel_ads2_mmxext, 4,5
    pshufw mm6, mm6, 0xAA
cglobal x264_pixel_ads1_mmxext, 4,5
cglobal x264_pixel_ads4_%1, 4,5
    pshuflw xmm7, xmm4, 0
    pshuflw xmm6, xmm4, 0xAA
    pshufhw xmm5, xmm4, 0
    pshufhw xmm4, xmm4, 0xAA
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpckhqdq xmm5, xmm5
    punpckhqdq xmm4, xmm4
    pshuflw    xmm8, r6m, 0
    punpcklqdq xmm8, xmm8
    movdqu xmm11, [r1+r2]
    movdqu xmm1,  [r1+16]
    movdqu xmm3,  [r1+r2+16]
    movdqu xmm1,  [r1+16]
    movdqu xmm2,  [r1+r2]
    movdqu xmm3,  [r1+r2+16]
    movd    xmm1, [ebp+stack_offset+28]
    pshuflw xmm1, xmm1, 0
    punpcklqdq xmm1, xmm1
cglobal x264_pixel_ads2_%1, 4,5
    pshuflw xmm7, xmm6, 0
    pshuflw xmm6, xmm6, 0xAA
    pshuflw xmm5, xmm5, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    punpcklqdq xmm5, xmm5
    movdqu xmm1, [r1+r2]
cglobal x264_pixel_ads1_%1, 4,5
    pshuflw xmm7, xmm7, 0
    pshuflw xmm6, xmm6, 0
    punpcklqdq xmm7, xmm7
    punpcklqdq xmm6, xmm6
    movdqu xmm1, [r1+16]
    movdqu xmm3, [r3+16]
%define ABS1 ABS1_SSSE3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv = 0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i = 0; i < width; i += 8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j = 0; j < 8; j++ )
;             if( mask & ((uint64_t)255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
cglobal x264_pixel_ads_mvs
; clear the last block in case width isn't divisible by 8
; (assume it's divisible by 4, so clearing 4 bytes is enough)
    mov dword [rsp+r5], 0
    test edi, 0xff<<(%1*8)
cglobal x264_pixel_ads_mvs
; no PROLOGUE, inherit from x264_pixel_ads1
    mov ebx, [ebp+stack_offset+20] ; mvs
    mov edi, [ebp+stack_offset+24] ; width
    mov dword [esp+edi], 0
    mov ebp, [esp+esi+4]
    mov edx, [esp+esi+8]
    test %2, 0xff<<(%1*8)