1 ; MMX/SSE optimized routines for SAD of 16*16 macroblocks
2 ; Copyright (C) Juan J. Sierralta P. <juanjo@atmlab.utfsm.cl>
4 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
5 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
8 ; This program is free software; you can redistribute it and/or
9 ; modify it under the terms of the GNU General Public License
10 ; as published by the Free Software Foundation; either version 2
11 ; of the License, or (at your option) any later version.
13 ; This program is distributed in the hope that it will be useful,
14 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ; GNU General Public License for more details.
18 ; You should have received a copy of the GNU General Public License
19 ; along with this program; if not, write to the Free Software
20 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 global pix_abs16x16_mmx
; Sum of absolute differences (SAD) between two 16-byte-wide pixel rows sets,
; computed with MMX registers (file header: "SAD of 16*16 macroblocks").
; Arguments are read from the stack relative to ebp; the result is returned in eax.
; NOTE(review): the embedded listing numbers skip, so the entry label and a number
; of instructions of this routine are not visible here; comments below only
; describe the instructions that are shown.
25 ; int pix_abs16x16_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
26 ; esi = p1 (init: pix1)
27 ; edi = p2 (init: pix2)
28 ; ecx = rowsleft (init: h)
31 ; mm0 = distance accumulators (4 words)
32 ; mm1 = distance accumulators (4 words)
43 push ebp ; save caller's frame pointer
46 push ebx ; save ebx (callee-saved register in the
47 push ecx ; x86 GCC convention, it seems); ecx is saved as well
52 pxor mm0, mm0 ; zero accumulator mm0
55 mov esi, [ebp+8] ; esi := pix1 (1st argument)
56 mov edi, [ebp+12] ; edi := pix2 (2nd argument)
57 mov edx, [ebp+16] ; edx := lx (3rd argument; added to esi to step one row)
58 mov ecx, [ebp+20] ; ecx := rowsleft (4th argument, h)
63 ; First 8 bytes of the row
65 movq mm4, [edi] ; load first 8 bytes of pix2 row
66 movq mm5, [esi] ; load first 8 bytes of pix1 row
67 movq mm3, mm4 ; keep a copy of mm4 in mm3; goal of this step: mm4 := abs(mm4-mm5)
68 movq mm2,[esi+8] ; load last 8 bytes of pix1 row
70 movq mm7,[edi+8] ; load last 8 bytes of pix2 row
74 ; Last 8 bytes of the row
76 movq mm3, mm7 ; keep a copy of mm7 in mm3; goal of this step: mm7 := abs(mm7-mm2)
81 ; Now mm4 and mm7 have 16 absdiffs to add
83 ; First 8 bytes of the second row ("row2")
87 movq mm2, [edi] ; load first 8 bytes of pix2 row
89 movq mm5, [esi] ; load first 8 bytes of pix1 row
93 movq mm3, mm2 ; keep a copy of mm2 in mm3; goal of this step: mm2 := abs(mm2-mm5)
95 movq mm6,[esi+8] ; load last 8 bytes of pix1 row
99 ; Last 8 bytes of the second row ("row2")
101 movq mm5,[edi+8] ; load last 8 bytes of pix2 row
104 movq mm3, mm5 ; keep a copy of mm5 in mm3; goal of this step: mm5 := abs(mm5-mm6)
109 ; Now mm2, mm4, mm5, mm7 have 32 absdiffs
113 pxor mm6, mm6 ; mm6 := 0, used as the zero source for byte->word unpacking
115 punpcklbw mm3, mm6 ; low 4 bytes of mm3 -> 4 words (zero-extended), to be added
121 punpcklbw mm3, mm6 ; low 4 bytes of mm3 -> 4 words (zero-extended), to be added
125 paddusw mm0, mm7 ; add to the accumulator (mm0), unsigned saturating words
126 paddusw mm1, mm5 ; add to the accumulator (mm1), unsigned saturating words
130 punpcklbw mm3, mm6 ; low 4 bytes of mm3 -> 4 words (zero-extended), to be added
137 punpcklbw mm5, mm6 ; low 4 bytes of mm5 -> 4 words (zero-extended), to be added
143 add esi, edx ; advance the pix1 pointer by lx (next row)
144 paddusw mm0, mm4 ; add to the accumulator (mm0), unsigned saturating words
147 paddusw mm1, mm2 ; add to the accumulator (mm1), unsigned saturating words
148 test ecx, ecx ; set flags from rowsleft (zero means no rows remain)
152 movq mm2, mm0 ; copy mm0 to mm2 (start of folding the accumulator words together)
154 paddusw mm0, mm2 ; fold the copy back into mm0 (unsigned saturating word add)
158 movd eax, mm0 ; return value := low 32 bits of mm0
167 pop ebp ; restore caller's frame pointer
169 ;emms ; (disabled) emms would clear the MMX state; as written it is not executed here -- NOTE(review): the caller presumably has to issue emms before using x87 code
172 global pix_abs16x16_sse
; Sum of absolute differences (SAD) between two 16-byte-wide pixel blocks,
; using psadbw on MMX registers (an SSE-era integer instruction) so that each
; 8-byte half row is reduced by a single instruction.
; The visible code handles four rows per pass (four load/psadbw/paddw groups).
; NOTE(review): the embedded listing numbers skip, so the entry label, the
; pointer stepping between rows and the mm1 updates are not visible here.
174 ; int pix_abs16x16_sse(unsigned char *pix1,unsigned char *pix2, int lx, int h);
175 ; esi = p1 (init: pix1)
176 ; edi = p2 (init: pix2)
177 ; ecx = rowsleft (init: h)
180 ; mm0 = distance accumulator (receives the SADs of the first 8 bytes of each row)
181 ; mm1 = distance accumulator (summed into mm0 at the end)
192 push ebp ; save caller's frame pointer
195 push ebx ; save ebx (callee-saved register in the
196 push ecx ; x86 GCC convention, it seems); ecx is saved as well
201 pxor mm0, mm0 ; zero accumulator mm0
203 mov esi, [ebp+8] ; esi := pix1 (1st argument)
204 mov edi, [ebp+12] ; edi := pix2 (2nd argument)
205 mov edx, [ebp+16] ; edx := lx (3rd argument; added to esi to step one row)
206 mov ecx, [ebp+20] ; ecx := rowsleft (4th argument, h)
213 movq mm4, [edi] ; load first 8 bytes of pix2 row
214 movq mm5, [edi+8] ; load last 8 bytes of pix2 row
215 psadbw mm4, [esi] ; SAD of first 8 bytes (pix2 vs pix1)
216 psadbw mm5, [esi+8] ; SAD of last 8 bytes (pix2 vs pix1)
217 paddw mm0, mm4 ; add first-half SAD to accumulator mm0
225 movq mm6, [edi] ; load first 8 bytes of pix2 row
226 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
227 psadbw mm6, [esi] ; SAD of first 8 bytes (pix2 vs pix1)
228 psadbw mm7, [esi+8] ; SAD of last 8 bytes (pix2 vs pix1)
229 paddw mm0, mm6 ; add first-half SAD to accumulator mm0
237 movq mm4, [edi] ; load first 8 bytes of pix2 row
238 movq mm5, [edi+8] ; load last 8 bytes of pix2 row
239 psadbw mm4, [esi] ; SAD of first 8 bytes (pix2 vs pix1)
240 psadbw mm5, [esi+8] ; SAD of last 8 bytes (pix2 vs pix1)
241 paddw mm0, mm4 ; add first-half SAD to accumulator mm0
249 movq mm6, [edi] ; load first 8 bytes of pix2 row
250 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
251 psadbw mm6, [esi] ; SAD of first 8 bytes (pix2 vs pix1)
252 psadbw mm7, [esi+8] ; SAD of last 8 bytes (pix2 vs pix1)
253 paddw mm0, mm6 ; add first-half SAD to accumulator mm0
258 add esi, edx ; advance the pix1 pointer by lx (next row)
261 test ecx, ecx ; set flags from rowsleft (zero means no rows remain)
264 paddd mm0, mm1 ; sum the two accumulators into mm0
265 movd eax, mm0 ; return value := low 32 bits of mm0
273 pop ebp ; restore caller's frame pointer
275 ;emms ; (disabled) emms would clear the MMX state; as written it is not executed here -- NOTE(review): the caller presumably has to issue emms before using x87 code
278 global pix_abs16x16_x2_mmx
; SAD of a pix1 row against a pix2 row that is first combined with its
; right-hand neighbour: each pix2 byte [edi+k] is added to [edi+k+1] in words,
; packed back to bytes and then compared with pix1 (horizontal interpolation).
; NOTE(review): the embedded listing numbers skip, so the entry label, the
; zeroing of mm6, the rounding/shift of the sums and the abs-diff sequences are
; not visible here; mm6 is presumably zero wherever it is used for unpacking.
280 ; int pix_abs16x16_x2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
281 ; esi = p1 (init: pix1)
282 ; edi = p2 (init: pix2)
283 ; ecx = rowsleft (init: h)
286 ; mm0 = distance accumulators (4 words)
287 ; mm1 = distance accumulators (4 words)
298 push ebp ; save caller's frame pointer
301 push ebx ; save ebx (callee-saved register in the
302 push ecx ; x86 GCC convention, it seems); ecx is saved as well
307 pxor mm0, mm0 ; zero accumulator mm0
310 mov esi, [ebp+8] ; esi := pix1 (1st argument)
311 mov edi, [ebp+12] ; edi := pix2 (2nd argument)
312 mov edx, [ebp+16] ; edx := lx (3rd argument; added to esi to step one row)
313 mov ecx, [ebp+20] ; ecx := rowsleft (4th argument, h)
318 ; First 8 bytes of the row
320 movq mm4, [edi] ; load bytes 0-7 of pix2 row
321 movq mm5, [edi+1] ; load bytes 1-8 of pix2 row
323 movq mm2, mm4 ; copy mm4 to mm2 (kept for the high 4 bytes)
324 movq mm3, mm5 ; copy mm5 to mm3 (kept for the high 4 bytes)
325 punpcklbw mm4, mm6 ; low 4 bytes of [edi] -> words in mm4
326 punpcklbw mm5, mm6 ; low 4 bytes of [edi+1] -> words in mm5
327 paddusw mm4, mm5 ; mm4 := [edi] + [edi+1] for the low 4 bytes, in words
330 punpckhbw mm2, mm6 ; high 4 bytes of [edi] -> words in mm2
331 punpckhbw mm3, mm6 ; high 4 bytes of [edi+1] -> words in mm3
332 paddusw mm2, mm3 ; mm2 := [edi] + [edi+1] for the high 4 bytes, in words
335 packuswb mm4, mm2 ; pack the 8 interpolated values back to bytes in mm4
336 movq mm5,[esi] ; load first 8 bytes of pix1 row
338 movq mm3, mm4 ; keep a copy of mm4 in mm3; goal of this step: mm4 := abs(mm4-mm5)
343 ; Last 8 bytes of the row
345 movq mm7, [edi+8] ; load bytes 8-15 of pix2 row
346 movq mm5, [edi+9] ; load bytes 9-16 of pix2 row
348 movq mm2, mm7 ; copy mm7 to mm2 (kept for the high 4 bytes)
349 movq mm3, mm5 ; copy mm5 to mm3 (kept for the high 4 bytes)
350 punpcklbw mm7, mm6 ; low 4 bytes of [edi+8] -> words in mm7
351 punpcklbw mm5, mm6 ; low 4 bytes of [edi+9] -> words in mm5
352 paddusw mm7, mm5 ; mm7 := [edi+8] + [edi+9] for the low 4 bytes, in words
355 punpckhbw mm2, mm6 ; high 4 bytes of [edi+8] -> words in mm2
356 punpckhbw mm3, mm6 ; high 4 bytes of [edi+9] -> words in mm3
357 paddusw mm2, mm3 ; mm2 := [edi+8] + [edi+9] for the high 4 bytes, in words
360 packuswb mm7, mm2 ; pack the 8 interpolated values back to bytes in mm7
361 movq mm5,[esi+8] ; load last 8 bytes of pix1 row
363 movq mm3, mm7 ; keep a copy of mm7 in mm3; goal of this step: mm7 := abs(mm7-mm5)
368 ; Now mm4 and mm7 have 16 absdiffs to add
370 movq mm3, mm4 ; copy the absdiff bytes so both halves can be unpacked
373 punpcklbw mm4, mm6 ; low 4 absdiff bytes -> words
376 paddusw mm0, mm4 ; add to the accumulator (mm0), unsigned saturating words
378 punpckhbw mm3, mm6 ; high 4 absdiff bytes -> words
381 paddusw mm1, mm3 ; add to the accumulator (mm1), unsigned saturating words
385 add esi, edx ; advance the pix1 pointer by lx (next row)
389 test ecx, ecx ; set flags from rowsleft (zero means no rows remain)
394 movq mm1, mm0 ; copy mm0 to mm1 (start of folding the accumulator words together)
396 paddusw mm0, mm1 ; fold the copy back into mm0 (unsigned saturating word add)
400 movd eax, mm0 ; return value := low 32 bits of mm0
409 pop ebp ; restore caller's frame pointer
411 emms ; leave MMX state so the caller can use x87 again
414 global pix_abs16x16_y2_mmx
; SAD of a pix1 row against a pix2 row that is first combined with the bytes
; read through ebx: each pix2 byte [edi+k] is added to [ebx+k] in words, packed
; back to bytes and then compared with pix1 (vertical interpolation).
; NOTE(review): the embedded listing numbers skip, so the entry label, the setup
; of ebx (presumably edi+lx, i.e. the row below), the zeroing of mm6, the
; rounding/shift of the sums and the abs-diff sequences are not visible here.
416 ; int pix_abs16x16_y2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
417 ; esi = p1 (init: pix1)
418 ; edi = p2 (init: pix2)
420 ; ecx = rowsleft (init: h)
423 ; mm0 = distance accumulators (4 words)
424 ; mm1 = distance accumulators (4 words)
435 push ebp ; save caller's frame pointer
438 push ebx ; save ebx (callee-saved register in the
439 push ecx ; x86 GCC convention, it seems); ecx is saved as well
444 pxor mm0, mm0 ; zero accumulator mm0
447 mov esi, [ebp+8] ; esi := pix1 (1st argument)
448 mov edi, [ebp+12] ; edi := pix2 (2nd argument)
449 mov edx, [ebp+16] ; edx := lx (3rd argument; added to esi to step one row)
450 mov ecx, [ebp+20] ; ecx := rowsleft (4th argument, h)
457 ; First 8 bytes of the row
459 movq mm4, [edi] ; load first 8 bytes of pix2 row
460 movq mm5, [ebx] ; load first 8 bytes at ebx (presumably the pix2 row below)
462 movq mm2, mm4 ; copy mm4 to mm2 (kept for the high 4 bytes)
463 movq mm3, mm5 ; copy mm5 to mm3 (kept for the high 4 bytes)
464 punpcklbw mm4, mm6 ; low 4 bytes of [edi] -> words in mm4
465 punpcklbw mm5, mm6 ; low 4 bytes of [ebx] -> words in mm5
466 paddusw mm4, mm5 ; mm4 := [edi] + [ebx] for the low 4 bytes, in words
469 punpckhbw mm2, mm6 ; high 4 bytes of [edi] -> words in mm2
470 punpckhbw mm3, mm6 ; high 4 bytes of [ebx] -> words in mm3
471 paddusw mm2, mm3 ; mm2 := [edi] + [ebx] for the high 4 bytes, in words
474 packuswb mm4, mm2 ; pack the 8 interpolated values back to bytes in mm4
475 movq mm5,[esi] ; load first 8 bytes of pix1 row
477 movq mm3, mm4 ; keep a copy of mm4 in mm3; goal of this step: mm4 := abs(mm4-mm5)
482 ; Last 8 bytes of the row
484 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
485 movq mm5, [ebx+8] ; load last 8 bytes at ebx (presumably the pix2 row below)
487 movq mm2, mm7 ; copy mm7 to mm2 (kept for the high 4 bytes)
488 movq mm3, mm5 ; copy mm5 to mm3 (kept for the high 4 bytes)
489 punpcklbw mm7, mm6 ; low 4 bytes of [edi+8] -> words in mm7
490 punpcklbw mm5, mm6 ; low 4 bytes of [ebx+8] -> words in mm5
491 paddusw mm7, mm5 ; mm7 := [edi+8] + [ebx+8] for the low 4 bytes, in words
494 punpckhbw mm2, mm6 ; high 4 bytes of [edi+8] -> words in mm2
495 punpckhbw mm3, mm6 ; high 4 bytes of [ebx+8] -> words in mm3
496 paddusw mm2, mm3 ; mm2 := [edi+8] + [ebx+8] for the high 4 bytes, in words
499 packuswb mm7, mm2 ; pack the 8 interpolated values back to bytes in mm7
500 movq mm5,[esi+8] ; load last 8 bytes of pix1 row
502 movq mm3, mm7 ; keep a copy of mm7 in mm3; goal of this step: mm7 := abs(mm7-mm5)
507 ; Now mm4 and mm7 have 16 absdiffs to add
509 movq mm3, mm4 ; copy the absdiff bytes so both halves can be unpacked
512 punpcklbw mm4, mm6 ; low 4 absdiff bytes -> words
515 paddusw mm0, mm4 ; add to the accumulator (mm0), unsigned saturating words
517 punpckhbw mm3, mm6 ; high 4 absdiff bytes -> words
520 paddusw mm1, mm3 ; add to the accumulator (mm1), unsigned saturating words
524 add esi, edx ; advance the pix1 pointer by lx (next row)
528 test ecx, ecx ; set flags from rowsleft (zero means no rows remain)
533 movq mm1, mm0 ; copy mm0 to mm1 (start of folding the accumulator words together)
535 paddusw mm0, mm1 ; fold the copy back into mm0 (unsigned saturating word add)
539 movd eax, mm0 ; return value := low 32 bits of mm0
548 pop ebp ; restore caller's frame pointer
550 emms ; leave MMX state so the caller can use x87 again
553 global pix_abs16x16_xy2_mmx
; SAD between one block and a second block that is interpolated from four
; neighbours (x, x+1, x+lx, x+lx+1; the sum is shifted right by 2).
; The absolute difference is built in words from two one-sided differences,
; each selected with a pcmpgtw mask and added to mm0.
; NOTE(review): the embedded listing numbers skip, so the loop label
; .nextrowmm11, the setup of ebx (presumably esi+lx), the zeroing of mm7 and
; several add/mask instructions are not visible here.
555 ; int pix_abs16x16_xy2_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
; NOTE(review): in the register comments below "p1" is the pointer held in esi,
; which is loaded from [ebp+12] (the 2nd prototype parameter), and "p2" is the
; pointer held in edi, loaded from [ebp+8] (the 1st parameter) -- i.e. the names
; are swapped relative to the prototype; confirm this is intended.
557 ; esi = p1 (interpolated block; loaded from [ebp+12])
558 ; edi = p2 (block compared against; loaded from [ebp+8])
560 ; ecx = rowsleft (init: h)
563 ; mm0 = distance accumulators (4 words)
567 ; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
568 ; but I don't think that's going to happen in IA-32 land...
569 ; mm4 = temp 4 bytes in words interpolating p1, p1+1
570 ; mm5 = temp 4 bytes in words from p2
571 ; mm6 = temp comparison bit mask p1,p2
572 ; mm7 = temp comparison bit mask p2,p1
576 pix_abs16x16_xy2_mmx:
577 push ebp ; save caller's frame pointer
578 mov ebp, esp ; set up the frame so arguments are at [ebp+8], [ebp+12], ...
580 push ebx ; save ebx (callee-saved register in the
581 push ecx ; x86 GCC convention, it seems); ecx is saved as well
586 pxor mm0, mm0 ; zero accumulator mm0
588 mov esi, [ebp+12] ; esi := 2nd argument (called "p1" in the comments below)
589 mov edi, [ebp+8] ; edi := 1st argument (called "p2" in the comments below)
590 mov edx, [ebp+16] ; edx := lx
591 mov ecx, [ebp+20] ; rowsleft := h
594 jmp .nextrowmm11 ; enter the row loop
599 ;; First 8 bytes of row
602 ;; First 4 bytes of 8
604 movq mm4, [esi] ; mm4 := 8 bytes at p1 (low 4 are used first)
606 movq mm2, mm4 ; mm2 records all 8 bytes
607 punpcklbw mm4, mm7 ; low 4 bytes of p1 -> words (mm7 presumably zero here)
609 movq mm6, [ebx] ; mm6 := 8 bytes at p1+lx (low 4 are used first)
610 movq mm3, mm6 ; mm3 records all 8 bytes
615 movq mm5, [esi+1] ; mm5 := 8 bytes at p1+1
616 punpcklbw mm5, mm7 ; low 4 bytes of p1+1 -> words
618 movq mm6, [ebx+1] ; mm6 := 8 bytes at p1+lx+1
622 psrlw mm4, 2 ; mm4 := first 4 bytes interpolated, in words (sum >> 2)
624 movq mm5, [edi] ; mm5 := 8 bytes of p2 (low 4 wanted in words)
629 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] (signed per-word compare mask)
631 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
635 paddw mm0, mm6 ; add to accumulator
637 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
639 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
642 paddw mm0, mm5 ; add to accumulator
644 ;; Second 4 bytes of 8
646 movq mm4, mm2 ; mm4 := second 4 bytes of p1, to become words (from the saved copy)
649 movq mm6, mm3 ; mm6 := second 4 bytes of p1+lx, to become words (from the saved copy)
653 movq mm5, [esi+1] ; mm5 := 8 bytes at p1+1
654 punpckhbw mm5, mm7 ; high 4 bytes of p1+1 -> words
656 movq mm6, [ebx+1] ; mm6 := 8 bytes at p1+lx+1
660 psrlw mm4, 2 ; mm4 := second 4 bytes interpolated, in words (sum >> 2)
662 movq mm5, mm1 ; mm5 := mm1 (presumably the saved p2 bytes; second 4 wanted in words)
666 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] (signed per-word compare mask)
668 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
672 paddw mm0, mm6 ; add to accumulator
674 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
676 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
679 paddw mm0, mm5 ; add to accumulator
683 ;; Second 8 bytes of row
685 ;; First 4 bytes of 8
687 movq mm4, [esi+8] ; mm4 := 8 bytes at p1+8 (low 4 are used first)
689 movq mm2, mm4 ; mm2 records all 8 bytes
690 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 -> words (mm7 presumably zero here)
692 movq mm6, [ebx+8] ; mm6 := 8 bytes at p1+lx+8 (low 4 are used first)
693 movq mm3, mm6 ; mm3 records all 8 bytes
698 movq mm5, [esi+9] ; mm5 := 8 bytes at p1+9
699 punpcklbw mm5, mm7 ; low 4 bytes of p1+9 -> words
701 movq mm6, [ebx+9] ; mm6 := 8 bytes at p1+lx+9
705 psrlw mm4, 2 ; mm4 := first 4 bytes interpolated, in words (sum >> 2)
707 movq mm5, [edi+8] ; mm5 := 8 bytes of p2+8 (low 4 wanted in words)
712 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] (signed per-word compare mask)
714 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
718 paddw mm0, mm6 ; add to accumulator
720 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
722 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
725 paddw mm0, mm5 ; add to accumulator
727 ;; Second 4 bytes of 8
729 movq mm4, mm2 ; mm4 := second 4 bytes of p1+8, to become words (from the saved copy)
732 movq mm6, mm3 ; mm6 := second 4 bytes of p1+lx+8, to become words (from the saved copy)
736 movq mm5, [esi+9] ; mm5 := 8 bytes at p1+9
737 punpckhbw mm5, mm7 ; high 4 bytes of p1+9 -> words
739 movq mm6, [ebx+9] ; mm6 := 8 bytes at p1+lx+9
743 psrlw mm4, 2 ; mm4 := second 4 bytes interpolated, in words (sum >> 2)
745 movq mm5, mm1 ; mm5 := mm1 (presumably the saved p2 bytes; second 4 wanted in words)
749 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] (signed per-word compare mask)
751 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
755 paddw mm0, mm6 ; add to accumulator
757 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
759 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
762 paddw mm0, mm5 ; add to accumulator
766 ;; Loop termination condition... and stepping
769 add esi, edx ; advance the esi ("p1") pointer by lx (next row)
774 test ecx, ecx ; set flags from rowsleft
775 jnz near .nextrowmm11 ; loop again while rowsleft is non-zero
777 ;; Sum the Accumulators
784 movd eax, mm0 ; return value := low 32 bits of mm0
793 pop ebp ; restore caller's frame pointer
795 emms ; leave MMX state so the caller can use x87 again
796 ret ; we now return you to your regular programming