1 ;*****************************************************************************
2 ;* pixel.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003 x264 project
5 ;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
7 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 ;*****************************************************************************
26 ;=============================================================================
27 ; Macros and other preprocessor constants
28 ;=============================================================================
;-----------------------------------------------------------------------------
; SAD / SSD inner-loop helper macros.
; NOTE(review): the macro bodies and %endmacro lines are not visible in this
; excerpt (truncated) -- confirm against the complete file before editing.
;-----------------------------------------------------------------------------

; presumably accumulates SAD over two 16-pixel-wide rows -- TODO confirm
39 %macro SAD_INC_2x16P 0

; presumably accumulates SSD over one 16-pixel-wide row -- TODO confirm
94 %macro SSD_INC_1x16P 0

; presumably accumulates SSD over one 8-pixel-wide row -- TODO confirm
128 %macro SSD_INC_1x8P 0
; the two surviving body lines below widen an 8-bit absolute difference to
; 16 bits; mm7 is presumably held at zero for the unpack -- verify
135 por mm1, mm2 ; mm1 = 8bit abs diff
139 punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff

; remaining SSD row/block helpers; presumably built from the 1-row variants
; above (names indicate the block geometry) -- TODO confirm
149 %macro SSD_INC_1x4P 0
165 %macro SSD_INC_8x16P 0
176 %macro SSD_INC_4x8P 0
183 %macro SSD_INC_4x4P 0
; LOAD_DIFF_4P: presumably loads 4 pixels from each of [pix1]/[pix2] and
; leaves their 16-bit difference in MMP (MMT = scratch, MMZ = zero for the
; widening unpack) -- body not visible in this excerpt; TODO confirm.
190 %macro LOAD_DIFF_4P 5 ; MMP, MMT, MMZ, [pix1], [pix2]

; Load a 4x4 block of pixel differences into %1-%4 (one row per register).
; %5 = temp, %6 = zero, %7/%8 = pix1/i_pix1, %9/%10 = pix2/i_pix2,
; %11 = column (byte) offset into both rows.
; NOTE(review): the second row pair re-reads [%7+%11]/[%9+%11], so lines
; advancing %7/%9 by two strides must exist between them in the full file
; (they are not visible in this excerpt) -- confirm.
198 %macro LOAD_DIFF_INC_4x4 11 ; p1,p2,p3,p4, t, z, pix1, i_pix1, pix2, i_pix2, offset
199 LOAD_DIFF_4P %1, %5, %6, [%7+%11], [%9+%11]
200 LOAD_DIFF_4P %2, %5, %6, [%7+%8+%11], [%9+%10+%11]
203 LOAD_DIFF_4P %3, %5, %6, [%7+%11], [%9+%11]
204 LOAD_DIFF_4P %4, %5, %6, [%7+%8+%11], [%9+%10+%11]
; HADAMARD4_SUB_BADC: one butterfly stage over four registers (the B,A,D,C
; name suggests pairwise sum/difference with swapped outputs) -- body not
; visible in this excerpt; TODO confirm.
209 %macro HADAMARD4_SUB_BADC 4
; NOTE(review): the two invocations below are the body of a separate macro
; (presumably HADAMARD4x4, composing a full 4-point Hadamard transform from
; two butterfly stages); its %macro header line is not visible here.
219 HADAMARD4_SUB_BADC %1, %2, %3, %4
220 HADAMARD4_SUB_BADC %1, %3, %2, %4
; Interleave ("butterfly") helpers; the wd/dq suffixes mirror the
; punpckl/punpckh word and dword element sizes. Bodies are not visible in
; this excerpt -- TODO confirm exact register roles.
223 %macro SBUTTERFLYwd 3
229 %macro SBUTTERFLYdq 3
; Transpose a 4x4 matrix of 16-bit elements held in %1-%4 (%5 = temp) using
; two passes of word then dword butterflies. Per the header comment, the
; result rows come out permuted: inputs a,b,c,d (+t) end up in a,d,t,c.
; NOTE(review): %endmacro is not visible in this excerpt.
235 %macro TRANSPOSE4x4 5 ; abcd-t -> adtc
236 SBUTTERFLYwd %1, %2, %5
237 SBUTTERFLYwd %3, %4, %2
238 SBUTTERFLYdq %1, %3, %4
239 SBUTTERFLYdq %5, %2, %3
; MMX absolute-value / accumulation helpers; bodies are not visible in this
; excerpt. Presumed roles (from names and argument comments) -- TODO confirm:
;   MMX_ABS      mma <- |mma|             (mmt = scratch)
;   MMX_ABS_SUM  mms <- mms + sum|mma|    (mmt = scratch)
;   MMX_SUM_MM   horizontal sum of mmv    (mmt = scratch)
242 %macro MMX_ABS 2 ; mma, mmt
248 %macro MMX_ABS_SUM 3 ; mma, mmt, mms
256 %macro MMX_SUM_MM 2 ; mmv, mmt
; First 4x4 SATD tile: Hadamard-transform the rows in mm0-mm3, transpose
; (after which, per TRANSPOSE4x4's abcd-t -> adtc ordering, the rows live in
; mm0,mm3,mm4,mm2), transform the columns, then start accumulating absolute
; coefficient values into mm0.
; NOTE(review): a step between the second HADAMARD4x4 and the three
; MMX_ABS_SUMs (presumably taking |mm0| itself so mm0 can serve as the
; accumulator) is not visible in this excerpt, nor is %endmacro -- confirm
; against the full file.
268 %macro HADAMARD4x4_FIRST 0
269 HADAMARD4x4 mm0, mm1, mm2, mm3
270 TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
271 HADAMARD4x4 mm0, mm3, mm4, mm2
273 MMX_ABS_SUM mm3, mm7, mm0
274 MMX_ABS_SUM mm4, mm7, mm0
275 MMX_ABS_SUM mm2, mm7, mm0
; Subsequent 4x4 SATD tile: same transform/transpose/transform pipeline as
; HADAMARD4x4_FIRST but operating on mm1-mm5, with all four absolute row
; sums folded into the mm0 accumulator carried over from the previous tile.
; After the transpose the rows live in mm1,mm4,mm5,mm3 (abcd-t -> adtc).
; NOTE(review): %endmacro is not visible in this excerpt.
278 %macro HADAMARD4x4_NEXT 0
279 HADAMARD4x4 mm1, mm2, mm3, mm4
280 TRANSPOSE4x4 mm1, mm2, mm3, mm4, mm5
281 HADAMARD4x4 mm1, mm4, mm5, mm3
282 MMX_ABS_SUM mm1, mm7, mm0
283 MMX_ABS_SUM mm4, mm7, mm0
284 MMX_ABS_SUM mm5, mm7, mm0
285 MMX_ABS_SUM mm3, mm7, mm0
288 ;=============================================================================
; Code
290 ;=============================================================================
; Exported entry points (MMX-extended implementations), grouped by metric.

; --- SAD: sum of absolute differences ---
294 cglobal x264_pixel_sad_16x16_mmxext
295 cglobal x264_pixel_sad_16x8_mmxext
296 cglobal x264_pixel_sad_8x16_mmxext
297 cglobal x264_pixel_sad_8x8_mmxext
298 cglobal x264_pixel_sad_8x4_mmxext
299 cglobal x264_pixel_sad_4x8_mmxext
300 cglobal x264_pixel_sad_4x4_mmxext

; --- SSD: sum of squared differences ---
302 cglobal x264_pixel_ssd_16x16_mmxext
303 cglobal x264_pixel_ssd_16x8_mmxext
304 cglobal x264_pixel_ssd_8x16_mmxext
305 cglobal x264_pixel_ssd_8x8_mmxext
306 cglobal x264_pixel_ssd_8x4_mmxext
307 cglobal x264_pixel_ssd_4x8_mmxext
308 cglobal x264_pixel_ssd_4x4_mmxext

; --- SATD: sum of absolute (Hadamard-)transformed differences ---
310 cglobal x264_pixel_satd_4x4_mmxext
311 cglobal x264_pixel_satd_4x8_mmxext
312 cglobal x264_pixel_satd_8x4_mmxext
313 cglobal x264_pixel_satd_8x8_mmxext
314 cglobal x264_pixel_satd_16x8_mmxext
315 cglobal x264_pixel_satd_8x16_mmxext
316 cglobal x264_pixel_satd_16x16_mmxext
; Shared prologue fragment: load the four cdecl arguments.
; NOTE(review): offsets start at esp+8, implying one saved register sits
; between the return address and the arguments (presumably a push ebx, since
; ebx is callee-saved and clobbered here); the enclosing %macro/prologue
; lines are not visible in this excerpt -- confirm.
321 mov eax, [esp+ 8] ; pix1
322 mov ebx, [esp+12] ; stride1
323 mov ecx, [esp+16] ; pix2
324 mov edx, [esp+20] ; stride2
; SAD entry points. Each prototype comment documents the signature of the
; label that follows it; the function bodies are not visible in this excerpt
; (truncated) -- presumably each expands the SAD_INC_* loop macros above.
336 ;-----------------------------------------------------------------------------
337 ; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
338 ;-----------------------------------------------------------------------------
339 x264_pixel_sad_16x16_mmxext:
352 ;-----------------------------------------------------------------------------
353 ; int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int )
354 ;-----------------------------------------------------------------------------
355 x264_pixel_sad_16x8_mmxext:
364 ;-----------------------------------------------------------------------------
365 ; int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int )
366 ;-----------------------------------------------------------------------------
367 x264_pixel_sad_8x16_mmxext:
380 ;-----------------------------------------------------------------------------
381 ; int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int )
382 ;-----------------------------------------------------------------------------
383 x264_pixel_sad_8x8_mmxext:
392 ;-----------------------------------------------------------------------------
393 ; int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int )
394 ;-----------------------------------------------------------------------------
395 x264_pixel_sad_8x4_mmxext:
402 ;-----------------------------------------------------------------------------
403 ; int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int )
404 ;-----------------------------------------------------------------------------
405 x264_pixel_sad_4x8_mmxext:
414 ;-----------------------------------------------------------------------------
415 ; int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int )
416 ;-----------------------------------------------------------------------------
417 x264_pixel_sad_4x4_mmxext:
; Prologue fragment: load the four cdecl arguments and clear the running-sum
; accumulator. Presumably shared by the SSD functions below (the enclosing
; %macro header is not visible in this excerpt) -- TODO confirm.
; As above, esp+8 as the first-arg offset implies one register was pushed
; before this point.
428 mov eax, [esp+ 8] ; pix1
429 mov ebx, [esp+12] ; stride1
430 mov ecx, [esp+16] ; pix2
431 mov edx, [esp+20] ; stride2
434 pxor mm0, mm0 ; mm0 holds the sum
; SSD entry points; bodies are not visible in this excerpt (truncated).
; Only the 16x16 variant retains its prototype comment here; the remaining
; labels presumably share the same
; int __cdecl (uint8_t *, int, uint8_t *, int) signature -- TODO confirm.
448 ;-----------------------------------------------------------------------------
449 ; int __cdecl x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
450 ;-----------------------------------------------------------------------------
451 x264_pixel_ssd_16x16_mmxext:
458 x264_pixel_ssd_16x8_mmxext:
464 x264_pixel_ssd_8x16_mmxext:
473 x264_pixel_ssd_8x8_mmxext:
480 x264_pixel_ssd_8x4_mmxext:
486 x264_pixel_ssd_4x8_mmxext:
493 x264_pixel_ssd_4x4_mmxext:
501 ;-----------------------------------------------------------------------------
502 ; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
503 ;-----------------------------------------------------------------------------
504 x264_pixel_satd_4x4_mmxext:
; load cdecl args; esp+8 as the first-arg offset implies one register was
; pushed before this point (prologue lines not visible in this excerpt)
507 mov eax, [esp+ 8] ; pix1
508 mov ebx, [esp+12] ; stride1
509 mov ecx, [esp+16] ; pix2
510 mov edx, [esp+20] ; stride2
; load the four rows of pixel differences into mm0-mm3
; (mm7 presumably zeroed earlier for LOAD_DIFF_4P's unpack -- verify)
514 LOAD_DIFF_4P mm0, mm6, mm7, [eax], [ecx]
515 LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx], [ecx+edx]
516 LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx]
; NOTE(review): this line re-reads [eax+2*ebx]/[ecx+2*edx]; pointer-advance
; lines between rows 3 and 4 must exist in the full file (not visible here).
; The transform/sum epilogue is also truncated in this excerpt.
519 LOAD_DIFF_4P mm3, mm6, mm7, [eax+2*ebx], [ecx+2*edx]
528 ;-----------------------------------------------------------------------------
529 ; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
530 ;-----------------------------------------------------------------------------
531 x264_pixel_satd_4x8_mmxext:
; load cdecl args (one register presumably pushed before this point)
534 mov eax, [esp+ 8] ; pix1
535 mov ebx, [esp+12] ; stride1
536 mov ecx, [esp+16] ; pix2
537 mov edx, [esp+20] ; stride2
; two stacked 4x4 tiles at column offset 0; the second call re-uses offset 0
; because LOAD_DIFF_INC_4x4 presumably advances the row pointers internally.
; NOTE(review): the HADAMARD4x4_* invocations and the summing epilogue
; between/after these loads are not visible in this excerpt.
541 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
544 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
552 ;-----------------------------------------------------------------------------
553 ; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
554 ;-----------------------------------------------------------------------------
555 x264_pixel_satd_8x4_mmxext:
; load cdecl args (one register presumably pushed before this point)
558 mov eax, [esp+ 8] ; pix1
559 mov ebx, [esp+12] ; stride1
560 mov ecx, [esp+16] ; pix2
561 mov edx, [esp+20] ; stride2
; left 4x4 tile (column offset 0)
565 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
; rewind the row pointers to the block top, then the right tile (offset 4)
568 mov eax, [esp+ 8] ; pix1
569 mov ecx, [esp+16] ; pix2
; NOTE(review): the transform invocations and summing epilogue around these
; loads are not visible in this excerpt.
571 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
579 ;-----------------------------------------------------------------------------
580 ; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
581 ;-----------------------------------------------------------------------------
582 x264_pixel_satd_8x8_mmxext:
; load cdecl args (one register presumably pushed before this point)
585 mov eax, [esp+ 8] ; pix1
586 mov ebx, [esp+12] ; stride1
587 mov ecx, [esp+16] ; pix2
588 mov edx, [esp+20] ; stride2
; left 8x4 column: two stacked 4x4 tiles at column offset 0
592 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
595 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
; rewind to the block top, then the right 8x4 column (offset 4)
598 mov eax, [esp+ 8] ; pix1
599 mov ecx, [esp+16] ; pix2
; NOTE(review): transform invocations and the summing epilogue are not
; visible in this excerpt.
601 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
604 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
612 ;-----------------------------------------------------------------------------
613 ; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
614 ;-----------------------------------------------------------------------------
615 x264_pixel_satd_16x8_mmxext:
; load cdecl args; first arg at esp+12 implies TWO dwords (presumably two
; pushed registers, or a push plus a spill slot) above the args here, unlike
; the esp+8 functions above -- prologue lines not visible; confirm.
619 mov eax, [esp+12] ; pix1
620 mov ebx, [esp+16] ; stride1
621 mov ecx, [esp+20] ; pix2
622 mov edx, [esp+24] ; stride2
; the 16x8 block is processed as four 4-wide columns (offsets 0,4,8,12),
; each as two stacked 4x4 tiles, rewinding the row pointers between columns.
; NOTE(review): transform invocations and the summing epilogue are not
; visible in this excerpt.
627 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
630 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
633 mov eax, [esp+12] ; pix1
634 mov ecx, [esp+20] ; pix2
636 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
639 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
645 mov eax, [esp+12] ; pix1
646 mov ecx, [esp+20] ; pix2
648 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8
651 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
654 mov eax, [esp+12] ; pix1
655 mov ecx, [esp+20] ; pix2
657 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
660 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
671 ;-----------------------------------------------------------------------------
672 ; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
673 ;-----------------------------------------------------------------------------
674 x264_pixel_satd_8x16_mmxext:
; load cdecl args; esp+12 first-arg offset implies two dwords above the
; args (prologue not visible) -- confirm.
678 mov eax, [esp+12] ; pix1
679 mov ebx, [esp+16] ; stride1
680 mov ecx, [esp+20] ; pix2
681 mov edx, [esp+24] ; stride2
; left 4x16 column: four stacked 4x4 tiles at column offset 0
; NOTE(review): transform invocations and the summing epilogue are not
; visible in this excerpt.
686 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
689 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
692 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
695 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
; rewind to the block top, then the right 4x16 column (offset 4)
701 mov eax, [esp+12] ; pix1
702 mov ecx, [esp+20] ; pix2
704 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4
707 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
710 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
713 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
724 ;-----------------------------------------------------------------------------
725 ; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
726 ;-----------------------------------------------------------------------------
727 x264_pixel_satd_16x16_mmxext:
; load cdecl args; esp+12 first-arg offset implies two dwords above the
; args (prologue not visible) -- confirm.
731 mov eax, [esp+12] ; pix1
732 mov ebx, [esp+16] ; stride1
733 mov ecx, [esp+20] ; pix2
734 mov edx, [esp+24] ; stride2
; the 16x16 block is processed as four 4x16 columns (offsets 0,4,8,12), each
; as four stacked 4x4 tiles, rewinding the row pointers between columns.
; NOTE(review): transform invocations and the summing epilogue are not
; visible in this excerpt.
739 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 0
742 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
745 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
748 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 0
754 mov eax, [esp+12] ; pix1
755 mov ecx, [esp+20] ; pix2
757 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 4
760 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
763 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
766 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 4
772 mov eax, [esp+12] ; pix1
773 mov ecx, [esp+20] ; pix2
775 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 8
778 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
781 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
784 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 8
790 mov eax, [esp+12] ; pix1
791 mov ecx, [esp+20] ; pix2
793 LOAD_DIFF_INC_4x4 mm0, mm1, mm2, mm3, mm6, mm7, eax, ebx, ecx, edx, 12
796 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
799 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12
802 LOAD_DIFF_INC_4x4 mm1, mm2, mm3, mm4, mm6, mm7, eax, ebx, ecx, edx, 12