2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec Ec
28 doHorizDefFilter Ec Ec Ec
30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a
34 LinIpolDeinterlace e E E*
35 CubicIpolDeinterlace a e e*
36 LinBlendDeinterlace e E E*
37 MedianDeinterlace Ec Ec
40 * I don't have a 3dnow CPU -> it's untested
41 E = Exact implementation
42 e = almost exact implementation (slightly different rounding,...)
43 a = alternative / approximate impl
44 c = checked against the other implementations (-vo md5)
49 verify that everything works as it should (how?)
50 reduce the time wasted on the mem transfer
52 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP
56 write a faster and higher quality deblocking filter :)
57 do something about the speed of the horizontal filters
58 make the mainloop more flexible (variable number of blocks at once
59 (the if/else stuff per block is slowing things down)
60 compare the quality & speed of all filters
62 fix warnings (unused vars, ...)
63 noise reduction filters
71 //Changelog: use the CVS log
77 #include "../config.h"
84 #include "postprocess.h"
// Generic arithmetic helper macros. Each argument can be evaluated more than
// once, so callers must not pass expressions with side effects.
86 #define MIN(a,b) ((a) > (b) ? (b) : (a))
87 #define MAX(a,b) ((a) < (b) ? (b) : (a))
88 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// SIGN yields -1 for zero as well as for negative values.
89 #define SIGN(a) ((a) > 0 ? 1 : -1)
// Assembly-text macros used inside the inline asm strings of this file. Each
// one has several variants; the surviving #elif lines show that the variant is
// picked by the HAVE_* CPU feature macros.
//
// PAVGB(a,b): emits a packed byte average instruction (pavgb, or pavgusb in
// the 3DNow variant) with a as first and b as second operand.
92 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
93 #elif defined (HAVE_3DNOW)
94 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// PMINUB: packed unsigned byte minimum; the result ends up in the SECOND macro
// argument in both variants. The plain MMX variant names its parameters
// (b,a,t) and computes a - saturate(a - b) using t as a scratch register; the
// pminub variant ignores t.
98 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
99 #elif defined (HAVE_MMX)
100 #define PMINUB(b,a,t) \
101 "movq " #a ", " #t " \n\t"\
102 "psubusb " #b ", " #t " \n\t"\
103 "psubb " #t ", " #a " \n\t"
// PMAXUB: packed unsigned byte maximum, result in the second argument. The
// plain MMX variant computes saturate(b - a) + a.
107 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
108 #elif defined (HAVE_MMX)
109 #define PMAXUB(a,b) \
110 "psubusb " #a ", " #b " \n\t"\
111 "paddb " #a ", " #b " \n\t"
// Size limits; their users are not visible in this part of the file
// (presumably the option/mode string parser -- confirm against the callers).
115 #define GET_MODE_BUFFER_SIZE 500
116 #define OPTIONS_ARRAY_SIZE 10
// 64-bit constants and scratch slots that the inline assembly below refers to
// by symbol name (e.g. "movq b7E, %%mm7", "movq pQPb, %%mm0",
// "movq %%mm0, temp0").
// Naming, as shown by the values:
//   wNN        - the 16-bit word 0xNN repeated in all four words
//   bmXXXXXXXX - byte mask; each digit is one byte, most significant first,
//                1 = 0xFF and 0 = 0x00
//   bNN        - the byte 0xNN repeated in all eight bytes
119 static uint64_t packedYOffset= 0x0000000000000000LL;
120 static uint64_t packedYScale= 0x0100010001000100LL;
121 static uint64_t w05= 0x0005000500050005LL;
122 static uint64_t w20= 0x0020002000200020LL;
123 static uint64_t w1400= 0x1400140014001400LL;
124 static uint64_t bm00000001= 0x00000000000000FFLL;
125 static uint64_t bm00010000= 0x000000FF00000000LL;
126 static uint64_t bm00001000= 0x00000000FF000000LL;
127 static uint64_t bm10000000= 0xFF00000000000000LL;
128 static uint64_t bm10000001= 0xFF000000000000FFLL;
129 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
130 static uint64_t bm00000011= 0x000000000000FFFFLL;
131 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
132 static uint64_t bm11000000= 0xFFFF000000000000LL;
133 static uint64_t bm00011000= 0x000000FFFF000000LL;
134 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
135 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
136 static uint64_t b00= 0x0000000000000000LL;
137 static uint64_t b01= 0x0101010101010101LL;
138 static uint64_t b02= 0x0202020202020202LL;
139 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
140 static uint64_t b04= 0x0404040404040404LL;
141 static uint64_t b08= 0x0808080808080808LL;
142 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
143 static uint64_t b20= 0x2020202020202020LL;
144 static uint64_t b80= 0x8080808080808080LL;
145 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
146 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
147 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
// Scratch storage written and re-read by the assembly (doVertDefFilter spills
// intermediate sums to temp0..temp3).
148 static uint64_t temp0=0;
149 static uint64_t temp1=0;
150 static uint64_t temp2=0;
151 static uint64_t temp3=0;
152 static uint64_t temp4=0;
153 static uint64_t temp5=0;
// pQPb is read by the filters as "QP,..., QP", i.e. QP in every byte; where it
// is filled in is not visible in this part of the file. pQPb2 is not
// referenced in the visible code.
154 static uint64_t pQPb=0;
155 static uint64_t pQPb2=0;
156 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
// Flatness thresholds (non-static, so visible to other translation units).
// isVertDC() reports a block as flat when its count of near-equal vertical
// pixel pairs exceeds vFlatnessThreshold; the use of hFlatnessThreshold is not
// visible in this part of the file.
158 int hFlatnessThreshold= 56 - 16;
159 int vFlatnessThreshold= 56 - 16;
161 //amount of "black" you are willing to lose to get a brightness-corrected picture
162 double maxClippedThreshold= 0.01;
// Table of the available filters. Each entry starts with a short name and a
// long name and ends with a filter mask constant; struct PPFilter is declared
// elsewhere (presumably postprocess.h), so the meaning of the three integer
// fields in between is not visible here. The table ends with an all-NULL/zero
// entry.
167 static struct PPFilter filters[]=
169 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
170 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
// NOTE(review): "vr"/"rkvdeblock" reads as a vertical filter and shares its
// numeric fields with the other vertical entries, yet it is mapped to
// H_RK1_FILTER -- confirm against the mask definitions in postprocess.h.
171 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
172 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
173 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
174 {"dr", "dering", 1, 5, 6, DERING},
175 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
176 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
177 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
178 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
179 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
180 {NULL, NULL,0,0,0,0} //End Marker
// Flat list of (alias, expansion) string pairs: even entries are preset names,
// the following odd entry is the filter list the preset stands for. "de" and
// "fa" are short forms with the same expansion as "default" and "fast". The
// end of the initializer is not visible here.
183 static char *replaceTable[]=
185 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels",
186 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels",
187 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
188 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
// Exists only to reference the file-scope constants from C code so that the
// compiler does not warn about them being unused (they are otherwise named
// only inside asm strings). The sum has no meaning; the b00=0 assignment is a
// no-op in value. b04, b08 and pQPb2 are not part of the expression.
192 static inline void unusedVariableWarningFixer()
195 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
196 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
197 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
198 + bFF + b20 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
199 + temp5 + pQPb== 0) b00=0;
// Executes the x86 rdtsc instruction (time-stamp counter). The operand list
// and the return statement are not visible here; presumably the counter value
// is returned as a long long -- confirm.
203 static inline long long rdtsc()
206 asm volatile( "rdtsc\n\t"
209 // printf("%d\n", int(l/1000));
// Emits a prefetchnta instruction on the address in asm operand %0
// (presumably p; the operand list is not visible here).
215 static inline void prefetchnta(void *p)
217 asm volatile( "prefetchnta (%0)\n\t"
// Emits a prefetcht0 instruction on the address in asm operand %0
// (presumably p; the operand list is not visible here).
222 static inline void prefetcht0(void *p)
224 asm volatile( "prefetcht0 (%0)\n\t"
// Emits a prefetcht1 instruction on the address in asm operand %0
// (presumably p; the operand list is not visible here).
229 static inline void prefetcht1(void *p)
231 asm volatile( "prefetcht1 (%0)\n\t"
// Emits a prefetcht2 instruction on the address in asm operand %0
// (presumably p; the operand list is not visible here).
236 static inline void prefetcht2(void *p)
238 asm volatile( "prefetcht2 (%0)\n\t"
244 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
246 * Check if the middle 8x8 Block in the given 8x16 block is flat
// Flatness test for the middle 8 rows of an 8x16 block.
// Counts the vertically adjacent pixel pairs whose values differ by at most 1
// and reports the block as flat when that count exceeds vFlatnessThreshold.
//   src    - start of the 8x16 block; advanced by 4 rows before pixels are read
//   stride - distance between vertically adjacent pixels, in bytes
// Returns 1 if the count is greater than vFlatnessThreshold, otherwise 0.
// An MMX counting path and a plain C counting path both appear below;
// presumably they are alternatives selected by preprocessor conditionals that
// are not visible here.
248 static inline int isVertDC(uint8_t src[], int stride){
253 src+= stride*4; // src points to begin of the 8x8 Block
256 "leal (%1, %2), %%eax \n\t"
257 "leal (%%eax, %2, 4), %%ebx \n\t"
258 // 0 1 2 3 4 5 6 7 8 9
259 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
// For each pair of rows: diff + 0x7E compared (signed) against 0x7C is true
// exactly for diff in {-1, 0, 1}, giving 0xFF (-1) in that byte; the per-pair
// masks are accumulated in mm0.
260 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
261 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
262 "movq (%1), %%mm0 \n\t"
263 "movq (%%eax), %%mm1 \n\t"
264 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
265 "paddb %%mm7, %%mm0 \n\t"
266 "pcmpgtb %%mm6, %%mm0 \n\t"
268 "movq (%%eax,%2), %%mm2 \n\t"
269 "psubb %%mm2, %%mm1 \n\t"
270 "paddb %%mm7, %%mm1 \n\t"
271 "pcmpgtb %%mm6, %%mm1 \n\t"
272 "paddb %%mm1, %%mm0 \n\t"
274 "movq (%%eax, %2, 2), %%mm1 \n\t"
275 "psubb %%mm1, %%mm2 \n\t"
276 "paddb %%mm7, %%mm2 \n\t"
277 "pcmpgtb %%mm6, %%mm2 \n\t"
278 "paddb %%mm2, %%mm0 \n\t"
280 "movq (%1, %2, 4), %%mm2 \n\t"
281 "psubb %%mm2, %%mm1 \n\t"
282 "paddb %%mm7, %%mm1 \n\t"
283 "pcmpgtb %%mm6, %%mm1 \n\t"
284 "paddb %%mm1, %%mm0 \n\t"
286 "movq (%%ebx), %%mm1 \n\t"
287 "psubb %%mm1, %%mm2 \n\t"
288 "paddb %%mm7, %%mm2 \n\t"
289 "pcmpgtb %%mm6, %%mm2 \n\t"
290 "paddb %%mm2, %%mm0 \n\t"
292 "movq (%%ebx, %2), %%mm2 \n\t"
293 "psubb %%mm2, %%mm1 \n\t"
294 "paddb %%mm7, %%mm1 \n\t"
295 "pcmpgtb %%mm6, %%mm1 \n\t"
296 "paddb %%mm1, %%mm0 \n\t"
298 "movq (%%ebx, %2, 2), %%mm1 \n\t"
299 "psubb %%mm1, %%mm2 \n\t"
300 "paddb %%mm7, %%mm2 \n\t"
301 "pcmpgtb %%mm6, %%mm2 \n\t"
302 "paddb %%mm2, %%mm0 \n\t"
// Horizontal reduction: add the eight byte counters of mm0 into one byte.
// Both a pshufw-based and a shift-based sequence appear; presumably only one
// is compiled in (the conditionals are not visible here).
305 "movq %%mm0, %%mm1 \n\t"
306 "psrlw $8, %%mm0 \n\t"
307 "paddb %%mm1, %%mm0 \n\t"
309 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
310 "paddb %%mm1, %%mm0 \n\t"
311 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
313 "movq %%mm0, %%mm1 \n\t"
314 "psrlq $16, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t"
316 "movq %%mm0, %%mm1 \n\t"
317 "psrlq $32, %%mm0 \n\t"
319 "paddb %%mm1, %%mm0 \n\t"
320 "movd %%mm0, %0 \n\t"
322 : "r" (src), "r" (stride)
// Every matching pair added 0xFF (-1) to the byte sum, so negating modulo 256
// turns the sum into the number of matches.
326 numEq= (256 - numEq) &0xFF;
// C path: ((a - b + 1) & 0xFFFF) < 3 holds exactly when a - b is -1, 0 or 1.
329 for(y=0; y<BLOCK_SIZE-1; y++)
331 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
332 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
333 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
334 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
335 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
336 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
337 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
338 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
342 /* if(abs(numEq - asmEq) > 0)
344 printf("\nasm:%d c:%d\n", asmEq, numEq);
345 for(int y=0; y<8; y++)
347 for(int x=0; x<8; x++)
349 printf("%d ", temp[x + y*stride]);
355 // for(int i=0; i<numEq/8; i++) src[i]=255;
356 return (numEq > vFlatnessThreshold) ? 1 : 0;
// Checks, for each of the 8 columns, that the pixel in row 1 (src + stride)
// and the pixel in row 8 (src + 8*stride) differ by no more than 2*QP.
//   src    - start of the block
//   stride - distance between vertically adjacent pixels, in bytes
//   QP     - quantizer value; the C path uses this argument, the MMX path
//            reads the same value from the global pQPb
// The return statement is not visible here; presumably the result is nonzero
// when all columns pass -- confirm. The MMX and C paths are presumably
// preprocessor alternatives (conditionals not visible).
359 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
366 "movq (%1, %2), %%mm0 \n\t"
367 "movq (%1, %2, 8), %%mm1 \n\t"
368 "movq %%mm0, %%mm2 \n\t"
369 "psubusb %%mm1, %%mm0 \n\t"
370 "psubusb %%mm2, %%mm1 \n\t"
371 "por %%mm1, %%mm0 \n\t" // ABS Diff
373 "movq pQPb, %%mm7 \n\t" // QP,..., QP
374 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
375 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
376 "pcmpeqd b00, %%mm0 \n\t"
377 "psrlq $16, %%mm0 \n\t"
378 "pcmpeqd bFF, %%mm0 \n\t"
379 // "movd %%mm0, (%1, %2, 4)\n\t"
380 "movd %%mm0, %0 \n\t"
382 : "r" (src), "r" (stride)
// C path: clears isOk2 as soon as one column exceeds the 2*QP limit.
390 for(x=0; x<BLOCK_SIZE; x++)
392 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
394 /* if(isOk && !isOk2 || !isOk && isOk2)
396 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
397 for(int y=0; y<9; y++)
399 for(int x=0; x<8; x++)
401 printf("%d ", src[x + y*stride]);
413 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
414 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
// Vertical low-pass filter: rewrites rows 1..8 of the block in place (rows l1
// to l8 in the C path) from rows 0..9.
//   src    - start of the block (row 0)
//   stride - distance between vertically adjacent pixels, in bytes
//   QP     - quantizer value; used to decide whether the outermost rows (0 and
//            9) or their inner neighbours (1 and 8) are used as edge values
// The C path computes the weighted sums exactly with rounding (+8 before >>4);
// the MMX2/3DNow path builds the same weights out of repeated PAVGB averages,
// whose per-step rounding can differ slightly from the exact sums.
// NOTE(review): the assembly keeps the outer row when the difference is
// <= QP, while the C path tests ABS(...) < QP, so the two paths differ when
// the difference equals QP.
416 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
418 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
420 asm volatile( //"movv %0 %1 %2\n\t"
421 "movq pQPb, %%mm0 \n\t" // QP,..., QP
423 "movq (%0), %%mm6 \n\t"
424 "movq (%0, %1), %%mm5 \n\t"
425 "movq %%mm5, %%mm1 \n\t"
426 "movq %%mm6, %%mm2 \n\t"
427 "psubusb %%mm6, %%mm5 \n\t"
428 "psubusb %%mm1, %%mm2 \n\t"
429 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
430 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
431 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
433 "pand %%mm2, %%mm6 \n\t"
434 "pandn %%mm1, %%mm2 \n\t"
435 "por %%mm2, %%mm6 \n\t"// First Line to Filter
437 "movq (%0, %1, 8), %%mm5 \n\t"
438 "leal (%0, %1, 4), %%eax \n\t"
439 "leal (%0, %1, 8), %%ebx \n\t"
440 "subl %1, %%ebx \n\t"
// NOTE(review): %0 is written here, but src appears only in the operand list
// after the single ':' below, and %0 being src implies there are no output
// operands. GCC does not allow an input-only operand to be modified --
// confirm the constraints.
441 "addl %1, %0 \n\t" // %0 points to line 1 not 0
442 "movq (%0, %1, 8), %%mm7 \n\t"
443 "movq %%mm5, %%mm1 \n\t"
444 "movq %%mm7, %%mm2 \n\t"
445 "psubusb %%mm7, %%mm5 \n\t"
446 "psubusb %%mm1, %%mm2 \n\t"
447 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
448 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
449 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
451 "pand %%mm2, %%mm7 \n\t"
452 "pandn %%mm1, %%mm2 \n\t"
453 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
457 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
// In the comments below the digit columns show the weight each input row has
// accumulated so far, and "X" marks the store of a finished output row.
462 "movq (%0, %1), %%mm0 \n\t" // 1
463 "movq %%mm0, %%mm1 \n\t" // 1
464 PAVGB(%%mm6, %%mm0) //1 1 /2
465 PAVGB(%%mm6, %%mm0) //3 1 /4
467 "movq (%0, %1, 4), %%mm2 \n\t" // 1
468 "movq %%mm2, %%mm5 \n\t" // 1
469 PAVGB((%%eax), %%mm2) // 11 /2
470 PAVGB((%0, %1, 2), %%mm2) // 211 /4
471 "movq %%mm2, %%mm3 \n\t" // 211 /4
472 "movq (%0), %%mm4 \n\t" // 1
473 PAVGB(%%mm4, %%mm3) // 4 211 /8
474 PAVGB(%%mm0, %%mm3) //642211 /16
475 "movq %%mm3, (%0) \n\t" // X
476 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
477 "movq %%mm1, %%mm0 \n\t" // 1
478 PAVGB(%%mm6, %%mm0) //1 1 /2
479 "movq %%mm4, %%mm3 \n\t" // 1
480 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
481 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
482 PAVGB((%%eax), %%mm5) // 211 /4
483 PAVGB(%%mm5, %%mm3) // 2 2211 /8
484 PAVGB(%%mm0, %%mm3) //4242211 /16
485 "movq %%mm3, (%0,%1) \n\t" // X
486 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
487 PAVGB(%%mm4, %%mm6) //11 /2
488 "movq (%%ebx), %%mm0 \n\t" // 1
489 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
490 "movq %%mm0, %%mm3 \n\t" // 11/2
491 PAVGB(%%mm1, %%mm0) // 2 11/4
492 PAVGB(%%mm6, %%mm0) //222 11/8
493 PAVGB(%%mm2, %%mm0) //22242211/16
494 "movq (%0, %1, 2), %%mm2 \n\t" // 1
495 "movq %%mm0, (%0, %1, 2) \n\t" // X
496 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
497 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
498 PAVGB((%%ebx), %%mm0) // 11 /2
499 PAVGB(%%mm0, %%mm6) //11 11 /4
500 PAVGB(%%mm1, %%mm4) // 11 /2
501 PAVGB(%%mm2, %%mm1) // 11 /2
502 PAVGB(%%mm1, %%mm6) //1122 11 /8
503 PAVGB(%%mm5, %%mm6) //112242211 /16
504 "movq (%%eax), %%mm5 \n\t" // 1
505 "movq %%mm6, (%%eax) \n\t" // X
506 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
507 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
508 PAVGB(%%mm7, %%mm6) // 11 /2
509 PAVGB(%%mm4, %%mm6) // 11 11 /4
510 PAVGB(%%mm3, %%mm6) // 11 2211 /8
511 PAVGB(%%mm5, %%mm2) // 11 /2
512 "movq (%0, %1, 4), %%mm4 \n\t" // 1
513 PAVGB(%%mm4, %%mm2) // 112 /4
514 PAVGB(%%mm2, %%mm6) // 112242211 /16
515 "movq %%mm6, (%0, %1, 4) \n\t" // X
516 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
517 PAVGB(%%mm7, %%mm1) // 11 2 /4
518 PAVGB(%%mm4, %%mm5) // 11 /2
519 PAVGB(%%mm5, %%mm0) // 11 11 /4
520 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
521 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
522 PAVGB(%%mm0, %%mm1) // 11224222 /16
523 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
524 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
525 PAVGB((%%ebx), %%mm2) // 112 4 /8
526 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
527 PAVGB(%%mm0, %%mm6) // 1 1 /2
528 PAVGB(%%mm7, %%mm6) // 1 12 /4
529 PAVGB(%%mm2, %%mm6) // 1122424 /4
530 "movq %%mm6, (%%ebx) \n\t" // X
531 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
532 PAVGB(%%mm7, %%mm5) // 11 2 /4
533 PAVGB(%%mm7, %%mm5) // 11 6 /8
535 PAVGB(%%mm3, %%mm0) // 112 /4
536 PAVGB(%%mm0, %%mm5) // 112246 /16
537 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
541 : "r" (src), "r" (stride)
// C path. l1..l9 are the offsets of rows 1..9 relative to src.
545 const int l1= stride;
546 const int l2= stride + l1;
547 const int l3= stride + l2;
548 const int l4= stride + l3;
549 const int l5= stride + l4;
550 const int l6= stride + l5;
551 const int l7= stride + l6;
552 const int l8= stride + l7;
553 const int l9= stride + l8;
556 for(x=0; x<BLOCK_SIZE; x++)
// Edge values: take the outermost row only if it is within QP of its inner
// neighbour, otherwise repeat the neighbour.
558 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
559 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[i] is the sum of two vertically adjacent input rows; all of them are
// taken before any output row is written.
562 sums[0] = first + src[l1];
563 sums[1] = src[l1] + src[l2];
564 sums[2] = src[l2] + src[l3];
565 sums[3] = src[l3] + src[l4];
566 sums[4] = src[l4] + src[l5];
567 sums[5] = src[l5] + src[l6];
568 sums[6] = src[l6] + src[l7];
569 sums[7] = src[l7] + src[l8];
570 sums[8] = src[l8] + last;
// Each output is a weighted sum with total weight 16, rounded by the +8.
572 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
573 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
574 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
575 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
576 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
577 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
578 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
579 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
588 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
589 * values are correctly clipped (MMX2)
590 * values are wraparound (C)
591 conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
// Vertical deblocking step across the boundary between rows 4 and 5.
// The MMX2/3DNow path adjusts rows 4 and 5 by a half-difference term and rows
// 3 and 6 by an eighth-difference term (see the inline comments), and applies
// the correction only in columns where |row4 - row5| does not exceed roughly
// 1.25*QP.
//   src    - start of the block (row 0)
//   stride - distance between vertically adjacent pixels, in bytes
//   QP     - quantizer value; the MMX path reads it from the global pQPb
// Only the beginning of the C path is visible here (threshold and row
// difference); the rest of its body is missing from this view.
598 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
600 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
604 "pxor %%mm7, %%mm7 \n\t" // 0
605 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
606 "leal (%0, %1), %%eax \n\t"
607 "leal (%%eax, %1, 4), %%ebx \n\t"
608 // 0 1 2 3 4 5 6 7 8 9
609 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// Threshold: QP + ((QP + 2) >> 2) per byte. psrlw shifts whole words, so the
// pand with 0x3F removes the bits that leaked in from the neighbouring byte.
610 "movq pQPb, %%mm0 \n\t" // QP,..., QP
611 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
612 "paddusb b02, %%mm0 \n\t"
613 "psrlw $2, %%mm0 \n\t"
614 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
615 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
616 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
617 "movq (%%ebx), %%mm3 \n\t" // line 5
618 "movq %%mm2, %%mm4 \n\t" // line 4
619 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
620 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
622 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
623 "psubusb %%mm3, %%mm4 \n\t"
624 "psubusb %%mm2, %%mm3 \n\t"
625 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
// mm4 becomes an all-ones mask in the columns where |l4 - l5| <= threshold.
626 "psubusb %%mm0, %%mm4 \n\t"
627 "pcmpeqb %%mm7, %%mm4 \n\t"
628 "pand %%mm4, %%mm5 \n\t" // d/2
630 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
631 "paddb %%mm5, %%mm2 \n\t"
632 // "psubb %%mm6, %%mm2 \n\t"
633 "movq %%mm2, (%0,%1, 4) \n\t"
635 "movq (%%ebx), %%mm2 \n\t"
636 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
637 "psubb %%mm5, %%mm2 \n\t"
638 // "psubb %%mm6, %%mm2 \n\t"
639 "movq %%mm2, (%%ebx) \n\t"
641 "paddb %%mm6, %%mm5 \n\t"
642 "psrlw $2, %%mm5 \n\t"
643 "pand b3F, %%mm5 \n\t"
644 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
// Rows 3 and 6 are biased by 0x80 so the signed saturating add/sub clamps
// correctly, then biased back.
646 "movq (%%eax, %1, 2), %%mm2 \n\t"
647 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
648 "paddsb %%mm5, %%mm2 \n\t"
649 "psubb %%mm6, %%mm2 \n\t"
650 "movq %%mm2, (%%eax, %1, 2) \n\t"
652 "movq (%%ebx, %1), %%mm2 \n\t"
653 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
654 "psubsb %%mm5, %%mm2 \n\t"
655 "psubb %%mm6, %%mm2 \n\t"
656 "movq %%mm2, (%%ebx, %1) \n\t"
659 : "r" (src), "r" (stride)
// C path. l1..l6 are the offsets of rows 1..6 relative to src.
663 const int l1= stride;
664 const int l2= stride + l1;
665 const int l3= stride + l2;
666 const int l4= stride + l3;
667 const int l5= stride + l4;
668 const int l6= stride + l5;
669 // const int l7= stride + l6;
670 // const int l8= stride + l7;
671 // const int l9= stride + l8;
// QP15 = QP + QP/4 (integer), i.e. about 1.25*QP.
673 const int QP15= QP + (QP>>2);
675 for(x=0; x<BLOCK_SIZE; x++)
677 const int v = (src[x+l5] - src[x+l4]);
692 * Experimental Filter 1
693 * will not damage linear gradients
694 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
695 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
696 * MMX2 version does correct clipping, C version doesn't
// Vertical deblocking filter across the boundary between rows 4 and 5.
// Per column the MMX2/3DNow path computes
//     d = MAX(0, |l4 - l5| - (|l3 - l4| + |l5 - l6|)/2)
// keeps it only where d <= QP, subtracts 1 (saturating), and then moves rows
// 4/5 towards each other by about 3*d/8, rows 3/6 by about d/4 and rows 2/7 by
// about d/8, using saturating byte arithmetic.
//   src    - start of the block (row 0)
//   stride - distance between vertically adjacent pixels, in bytes
//   QP     - quantizer value; the MMX path reads it from the global pQPb
// Only the beginning of the C path is visible here. The lines after it that
// redeclare l1..l9 and use "for(int x=...)" look like a disabled alternative
// implementation -- the delimiters that would show this are not visible.
698 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
700 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
704 "pxor %%mm7, %%mm7 \n\t" // 0
705 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
706 "leal (%0, %1), %%eax \n\t"
707 "leal (%%eax, %1, 4), %%ebx \n\t"
708 // 0 1 2 3 4 5 6 7 8 9
709 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
710 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
711 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
712 "movq %%mm1, %%mm2 \n\t" // line 4
713 "psubusb %%mm0, %%mm1 \n\t"
714 "psubusb %%mm2, %%mm0 \n\t"
715 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
716 "movq (%%ebx), %%mm3 \n\t" // line 5
717 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
718 "movq %%mm3, %%mm5 \n\t" // line 5
719 "psubusb %%mm4, %%mm3 \n\t"
720 "psubusb %%mm5, %%mm4 \n\t"
721 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
722 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
723 "movq %%mm2, %%mm1 \n\t" // line 4
724 "psubusb %%mm5, %%mm2 \n\t"
725 "movq %%mm2, %%mm4 \n\t"
726 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
727 "psubusb %%mm1, %%mm5 \n\t"
728 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
729 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
730 "movq %%mm4, %%mm3 \n\t" // d
731 "psubusb pQPb, %%mm4 \n\t"
732 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
733 "psubusb b01, %%mm3 \n\t"
734 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
736 PAVGB(%%mm7, %%mm3) // d/2
737 "movq %%mm3, %%mm1 \n\t" // d/2
738 PAVGB(%%mm7, %%mm3) // d/4
739 PAVGB(%%mm1, %%mm3) // 3*d/8
// mm2 is the sign mask from above. XOR-ing a row with it before and after the
// saturating add/sub turns the operation into its mirror image for the
// columns where l4 <= l5, so one instruction serves both directions.
741 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
742 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
743 "psubusb %%mm3, %%mm0 \n\t"
744 "pxor %%mm2, %%mm0 \n\t"
745 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
747 "movq (%%ebx), %%mm0 \n\t" // line 5
748 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
749 "paddusb %%mm3, %%mm0 \n\t"
750 "pxor %%mm2, %%mm0 \n\t"
751 "movq %%mm0, (%%ebx) \n\t" // line 5
753 PAVGB(%%mm7, %%mm1) // d/4
755 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
756 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
757 "psubusb %%mm1, %%mm0 \n\t"
758 "pxor %%mm2, %%mm0 \n\t"
759 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
761 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
762 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
763 "paddusb %%mm1, %%mm0 \n\t"
764 "pxor %%mm2, %%mm0 \n\t"
765 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
767 PAVGB(%%mm7, %%mm1) // d/8
769 "movq (%%eax, %1), %%mm0 \n\t" // line 2
770 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
771 "psubusb %%mm1, %%mm0 \n\t"
772 "pxor %%mm2, %%mm0 \n\t"
773 "movq %%mm0, (%%eax, %1) \n\t" // line 2
775 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
776 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
777 "paddusb %%mm1, %%mm0 \n\t"
778 "pxor %%mm2, %%mm0 \n\t"
779 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
782 : "r" (src), "r" (stride)
// C path. l1..l7 are the offsets of rows 1..7 relative to src.
787 const int l1= stride;
788 const int l2= stride + l1;
789 const int l3= stride + l2;
790 const int l4= stride + l3;
791 const int l5= stride + l4;
792 const int l6= stride + l5;
793 const int l7= stride + l6;
794 // const int l8= stride + l7;
795 // const int l9= stride + l8;
799 for(x=0; x<BLOCK_SIZE; x++)
// a, b, c: differences across rows 3/4, 4/5 (the block boundary) and 5/6.
801 int a= src[l3] - src[l4];
802 int b= src[l4] - src[l5];
803 int c= src[l5] - src[l6];
805 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
// SIGN(-b) is -1 when b is zero or positive.
810 int v = d * SIGN(-b);
823 const int l1= stride;
824 const int l2= stride + l1;
825 const int l3= stride + l2;
826 const int l4= stride + l3;
827 const int l5= stride + l4;
828 const int l6= stride + l5;
829 const int l7= stride + l6;
830 const int l8= stride + l7;
831 const int l9= stride + l8;
832 for(int x=0; x<BLOCK_SIZE; x++)
841 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
843 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
844 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
845 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
846 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
855 * Experimental Filter 1 (Horizontal)
856 * will not damage linear gradients
857 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
858 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
859 * MMX2 version does correct clipping, C version doesn't
860 * not identical with the vertical one
// Horizontal deblocking filter: processes the pixels of each row around the
// vertical block edge.
//   src    - start of the block (row 0)
//   stride - distance between vertically adjacent pixels, in bytes
//   QP     - quantizer value; the MMX path reads it from the global pQPb
//
// MMX2 path: for every row a signed edge-step value is derived from the
// differences of neighbouring pixels and used as an index into lut. Each lut
// entry is 8 bytes that are added (with signed saturation, after biasing the
// pixels by 0x80) to 8 consecutive pixels: the low four bytes hold the negated
// corrections A,B,C,D, the high four bytes hold d,c,b,a, so the correction is
// antisymmetric around the middle of the 8 pixels.
//
// FIX: D was computed from c instead of d, so byte 3 of every entry held the
// negated c correction and the table was not the mirror image of its upper
// half. D is now derived from d.
//
// NOTE(review): the result of memalign() is used without a NULL check in the
// visible lines; the surrounding control flow is missing from this view.
// NOTE(review): two sets of a/b/c/d definitions and two macro bodies (HX1old,
// HX1b) appear below; presumably they are preprocessor alternatives whose
// conditionals are not visible. HX1old addresses the table through %3 although
// the visible operand list only provides %0..%2.
862 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
865 static uint64_t *lut= NULL;
869 lut= (uint64_t*)memalign(8, 256*8);
// i is the table index interpreted as a signed byte; v is twice that value.
872 int v= i < 128 ? 2*i : 2*(i-256);
874 //Simulate 112242211 9-Tap filter
875 uint64_t a= (v/16) & 0xFF;
876 uint64_t b= (v/8) & 0xFF;
877 uint64_t c= (v/4) & 0xFF;
878 uint64_t d= (3*v/8) & 0xFF;
880 //Simulate piecewise linear interpolation
881 uint64_t a= (v/16) & 0xFF;
882 uint64_t b= (v*3/16) & 0xFF;
883 uint64_t c= (v*5/16) & 0xFF;
884 uint64_t d= (7*v/16) & 0xFF;
// A..D are the byte-wise negations (mod 256) of a..d.
885 uint64_t A= (0x100 - a)&0xFF;
886 uint64_t B= (0x100 - b)&0xFF;
887 uint64_t C= (0x100 - c)&0xFF;
888 uint64_t D= (0x100 - d)&0xFF;
890 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
891 (D<<24) | (C<<16) | (B<<8) | (A);
892 //lut[i] = (v<<32) | (v<<24);
898 "pxor %%mm7, %%mm7 \n\t" // 0
899 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
900 "leal (%0, %1), %%eax \n\t"
901 "leal (%%eax, %1, 4), %%ebx \n\t"
903 "movq b80, %%mm6 \n\t"
904 "movd pQPb, %%mm5 \n\t" // QP
905 "movq %%mm5, %%mm4 \n\t"
906 "paddusb %%mm5, %%mm5 \n\t" // 2QP
907 "paddusb %%mm5, %%mm4 \n\t" // 3QP
908 "pxor %%mm5, %%mm5 \n\t" // 0
909 "psubb %%mm4, %%mm5 \n\t" // -3QP
910 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
911 "psllq $24, %%mm5 \n\t"
913 // 0 1 2 3 4 5 6 7 8 9
914 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
917 "movd " #a ", %%mm0 \n\t"\
918 "movd 4" #a ", %%mm1 \n\t"\
919 "punpckldq %%mm1, %%mm0 \n\t"\
920 "movq %%mm0, %%mm1 \n\t"\
921 "movq %%mm0, %%mm2 \n\t"\
922 "psrlq $8, %%mm1 \n\t"\
923 "psubusb %%mm1, %%mm2 \n\t"\
924 "psubusb %%mm0, %%mm1 \n\t"\
925 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
926 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
927 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
928 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
929 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
930 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
931 "paddb %%mm5, %%mm1 \n\t"\
932 "psubusb %%mm5, %%mm1 \n\t"\
934 "pxor %%mm2, %%mm1 \n\t"\
935 "psubb %%mm2, %%mm1 \n\t"\
936 "psrlq $24, %%mm1 \n\t"\
937 "movd %%mm1, %%ecx \n\t"\
938 "paddb %%mm6, %%mm0 \n\t"\
939 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
940 "paddb %%mm6, %%mm0 \n\t"\
941 "movq %%mm0, " #a " \n\t"\
947 HX1old((%%eax, %1, 2))
951 HX1old((%%ebx, %1, 2))
954 //FIXME add some comments, it's unreadable ...
// HX1b(a, c, b, d): a..d are the addresses of four rows. The edge-step index
// derived from row a is also used for row c, and the index derived from row b
// is also used for row d; each row gets its lut entry added with the 0x80
// bias / signed-saturating add / 0x80 bias sequence.
955 #define HX1b(a, c, b, d) \
956 "movd " #a ", %%mm0 \n\t"\
957 "movd 4" #a ", %%mm1 \n\t"\
958 "punpckldq %%mm1, %%mm0 \n\t"\
959 "movd " #b ", %%mm4 \n\t"\
960 "movq %%mm0, %%mm1 \n\t"\
961 "movq %%mm0, %%mm2 \n\t"\
962 "psrlq $8, %%mm1 \n\t"\
963 "movd 4" #b ", %%mm3 \n\t"\
964 "psubusb %%mm1, %%mm2 \n\t"\
965 "psubusb %%mm0, %%mm1 \n\t"\
966 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
967 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
968 "punpckldq %%mm3, %%mm4 \n\t"\
969 "movq %%mm1, %%mm3 \n\t"\
970 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
971 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
972 "paddb %%mm6, %%mm0 \n\t"\
973 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
974 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
975 "movq %%mm4, %%mm3 \n\t"\
976 "paddb %%mm5, %%mm1 \n\t"\
977 "psubusb %%mm5, %%mm1 \n\t"\
978 "psrlq $8, %%mm3 \n\t"\
980 "pxor %%mm2, %%mm1 \n\t"\
981 "psubb %%mm2, %%mm1 \n\t"\
982 "movq %%mm4, %%mm2 \n\t"\
983 "psrlq $24, %%mm1 \n\t"\
984 "psubusb %%mm3, %%mm2 \n\t"\
985 "movd %%mm1, %%ecx \n\t"\
986 "psubusb %%mm4, %%mm3 \n\t"\
987 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
988 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
989 "paddb %%mm6, %%mm0 \n\t"\
990 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
991 "movq %%mm3, %%mm1 \n\t"\
992 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
993 "movq %%mm0, " #a " \n\t"\
994 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
995 "paddb %%mm6, %%mm4 \n\t"\
996 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
997 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
998 "paddb %%mm5, %%mm3 \n\t"\
999 "psubusb %%mm5, %%mm3 \n\t"\
1000 PAVGB(%%mm7, %%mm3)\
1001 "pxor %%mm2, %%mm3 \n\t"\
1002 "psubb %%mm2, %%mm3 \n\t"\
1003 "psrlq $24, %%mm3 \n\t"\
1004 "movd " #c ", %%mm0 \n\t"\
1005 "movd 4" #c ", %%mm1 \n\t"\
1006 "punpckldq %%mm1, %%mm0 \n\t"\
1007 "paddb %%mm6, %%mm0 \n\t"\
1008 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
1009 "paddb %%mm6, %%mm0 \n\t"\
1010 "movq %%mm0, " #c " \n\t"\
1011 "movd %%mm3, %%ecx \n\t"\
1012 "movd " #d ", %%mm0 \n\t"\
1013 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
1014 "movd 4" #d ", %%mm1 \n\t"\
1015 "paddb %%mm6, %%mm4 \n\t"\
1016 "punpckldq %%mm1, %%mm0 \n\t"\
1017 "movq %%mm4, " #b " \n\t"\
1018 "paddb %%mm6, %%mm0 \n\t"\
1019 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
1020 "paddb %%mm6, %%mm0 \n\t"\
1021 "movq %%mm0, " #d " \n\t"\
1023 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1024 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1028 : "r" (src), "r" (stride), "r" (lut)
1029 : "%eax", "%ebx", "%ecx"
1033 //FIXME (has little in common with the mmx2 version)
// C path: one row per iteration. a, b, c are the differences across pixels
// 1/2, 3/4 (the block edge) and 5/6; d is the edge step in excess of the
// average neighbouring step, clamped at 0.
1034 for(y=0; y<BLOCK_SIZE; y++)
1036 int a= src[1] - src[2];
1037 int b= src[3] - src[4];
1038 int c= src[5] - src[6];
1040 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
// SIGN(-b) is -1 when b is zero or positive.
1044 int v = d * SIGN(-b);
1060 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1064 //FIXME try pmul for *5 stuff
1067 "pxor %%mm7, %%mm7 \n\t"
1068 "leal (%0, %1), %%eax \n\t"
1069 "leal (%%eax, %1, 4), %%ebx \n\t"
1071 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1072 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1074 "movq (%0), %%mm0 \n\t"
1075 "movq %%mm0, %%mm1 \n\t"
1076 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1077 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1079 "movq (%%eax), %%mm2 \n\t"
1080 "movq %%mm2, %%mm3 \n\t"
1081 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1082 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1084 "movq (%%eax, %1), %%mm4 \n\t"
1085 "movq %%mm4, %%mm5 \n\t"
1086 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1087 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1089 "paddw %%mm0, %%mm0 \n\t" // 2L0
1090 "paddw %%mm1, %%mm1 \n\t" // 2H0
1091 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1092 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1093 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1094 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1096 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1097 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1098 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1099 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1101 "movq (%%eax, %1, 2), %%mm2 \n\t"
1102 "movq %%mm2, %%mm3 \n\t"
1103 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1104 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1106 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1107 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1108 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1109 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1110 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1111 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1113 "movq (%0, %1, 4), %%mm0 \n\t"
1114 "movq %%mm0, %%mm1 \n\t"
1115 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1116 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1118 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1119 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1120 "movq %%mm2, temp2 \n\t" // L3 - L4
1121 "movq %%mm3, temp3 \n\t" // H3 - H4
1122 "paddw %%mm4, %%mm4 \n\t" // 2L2
1123 "paddw %%mm5, %%mm5 \n\t" // 2H2
1124 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1125 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1127 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1128 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1129 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1130 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1132 "movq (%%ebx), %%mm2 \n\t"
1133 "movq %%mm2, %%mm3 \n\t"
1134 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1135 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1136 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1137 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1138 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1139 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1141 "movq (%%ebx, %1), %%mm6 \n\t"
1142 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1143 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1144 "movq (%%ebx, %1), %%mm6 \n\t"
1145 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1146 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1148 "paddw %%mm0, %%mm0 \n\t" // 2L4
1149 "paddw %%mm1, %%mm1 \n\t" // 2H4
1150 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1151 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1153 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1154 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1155 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1156 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1158 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1159 "movq %%mm2, %%mm3 \n\t"
1160 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1161 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1163 "paddw %%mm2, %%mm2 \n\t" // 2L7
1164 "paddw %%mm3, %%mm3 \n\t" // 2H7
1165 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1166 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1168 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1169 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1172 "movq %%mm7, %%mm6 \n\t" // 0
1173 "psubw %%mm0, %%mm6 \n\t"
1174 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1175 "movq %%mm7, %%mm6 \n\t" // 0
1176 "psubw %%mm1, %%mm6 \n\t"
1177 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1178 "movq %%mm7, %%mm6 \n\t" // 0
1179 "psubw %%mm2, %%mm6 \n\t"
1180 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1181 "movq %%mm7, %%mm6 \n\t" // 0
1182 "psubw %%mm3, %%mm6 \n\t"
1183 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1185 "movq %%mm7, %%mm6 \n\t" // 0
1186 "pcmpgtw %%mm0, %%mm6 \n\t"
1187 "pxor %%mm6, %%mm0 \n\t"
1188 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1189 "movq %%mm7, %%mm6 \n\t" // 0
1190 "pcmpgtw %%mm1, %%mm6 \n\t"
1191 "pxor %%mm6, %%mm1 \n\t"
1192 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1193 "movq %%mm7, %%mm6 \n\t" // 0
1194 "pcmpgtw %%mm2, %%mm6 \n\t"
1195 "pxor %%mm6, %%mm2 \n\t"
1196 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1197 "movq %%mm7, %%mm6 \n\t" // 0
1198 "pcmpgtw %%mm3, %%mm6 \n\t"
1199 "pxor %%mm6, %%mm3 \n\t"
1200 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1204 "pminsw %%mm2, %%mm0 \n\t"
1205 "pminsw %%mm3, %%mm1 \n\t"
1207 "movq %%mm0, %%mm6 \n\t"
1208 "psubusw %%mm2, %%mm6 \n\t"
1209 "psubw %%mm6, %%mm0 \n\t"
1210 "movq %%mm1, %%mm6 \n\t"
1211 "psubusw %%mm3, %%mm6 \n\t"
1212 "psubw %%mm6, %%mm1 \n\t"
1215 "movq %%mm7, %%mm6 \n\t" // 0
1216 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1217 "pxor %%mm6, %%mm4 \n\t"
1218 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1219 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1220 "pxor %%mm7, %%mm5 \n\t"
1221 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1223 "movd %2, %%mm2 \n\t" // QP
1224 "punpcklwd %%mm2, %%mm2 \n\t"
1225 "punpcklwd %%mm2, %%mm2 \n\t"
1226 "psllw $3, %%mm2 \n\t" // 8QP
1227 "movq %%mm2, %%mm3 \n\t" // 8QP
1228 "pcmpgtw %%mm4, %%mm2 \n\t"
1229 "pcmpgtw %%mm5, %%mm3 \n\t"
1230 "pand %%mm2, %%mm4 \n\t"
1231 "pand %%mm3, %%mm5 \n\t"
1234 "psubusw %%mm0, %%mm4 \n\t" // hd
1235 "psubusw %%mm1, %%mm5 \n\t" // ld
1238 "movq w05, %%mm2 \n\t" // 5
1239 "pmullw %%mm2, %%mm4 \n\t"
1240 "pmullw %%mm2, %%mm5 \n\t"
1241 "movq w20, %%mm2 \n\t" // 32
1242 "paddw %%mm2, %%mm4 \n\t"
1243 "paddw %%mm2, %%mm5 \n\t"
1244 "psrlw $6, %%mm4 \n\t"
1245 "psrlw $6, %%mm5 \n\t"
1248 "movq w06, %%mm2 \n\t" // 6
1249 "paddw %%mm2, %%mm4 \n\t"
1250 "paddw %%mm2, %%mm5 \n\t"
1251 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1252 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1253 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1254 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1257 "movq temp2, %%mm0 \n\t" // L3 - L4
1258 "movq temp3, %%mm1 \n\t" // H3 - H4
1260 "pxor %%mm2, %%mm2 \n\t"
1261 "pxor %%mm3, %%mm3 \n\t"
1263 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1264 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1265 "pxor %%mm2, %%mm0 \n\t"
1266 "pxor %%mm3, %%mm1 \n\t"
1267 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1268 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1269 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1270 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1272 "pxor %%mm6, %%mm2 \n\t"
1273 "pxor %%mm7, %%mm3 \n\t"
1274 "pand %%mm2, %%mm4 \n\t"
1275 "pand %%mm3, %%mm5 \n\t"
1278 "pminsw %%mm0, %%mm4 \n\t"
1279 "pminsw %%mm1, %%mm5 \n\t"
1281 "movq %%mm4, %%mm2 \n\t"
1282 "psubusw %%mm0, %%mm2 \n\t"
1283 "psubw %%mm2, %%mm4 \n\t"
1284 "movq %%mm5, %%mm2 \n\t"
1285 "psubusw %%mm1, %%mm2 \n\t"
1286 "psubw %%mm2, %%mm5 \n\t"
1288 "pxor %%mm6, %%mm4 \n\t"
1289 "pxor %%mm7, %%mm5 \n\t"
1290 "psubw %%mm6, %%mm4 \n\t"
1291 "psubw %%mm7, %%mm5 \n\t"
1292 "packsswb %%mm5, %%mm4 \n\t"
1293 "movq (%%eax, %1, 2), %%mm0 \n\t"
1294 "paddb %%mm4, %%mm0 \n\t"
1295 "movq %%mm0, (%%eax, %1, 2) \n\t"
1296 "movq (%0, %1, 4), %%mm0 \n\t"
1297 "psubb %%mm4, %%mm0 \n\t"
1298 "movq %%mm0, (%0, %1, 4) \n\t"
1301 : "r" (src), "r" (stride), "r" (QP)
1305 const int l1= stride;
1306 const int l2= stride + l1;
1307 const int l3= stride + l2;
1308 const int l4= stride + l3;
1309 const int l5= stride + l4;
1310 const int l6= stride + l5;
1311 const int l7= stride + l6;
1312 const int l8= stride + l7;
1313 // const int l9= stride + l8;
1316 for(x=0; x<BLOCK_SIZE; x++)
1318 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1319 if(ABS(middleEnergy) < 8*QP)
1321 const int q=(src[l4] - src[l5])/2;
1322 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1323 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1325 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1329 d*= SIGN(-middleEnergy);
1350 //FIXME? |255-0| = 1
1352 * Check if the given 8x8 Block is mostly "flat"
/*
 * isHorizDC: decide whether the block at src is "flat" in the horizontal direction.
 *
 * src    - first pixel of the block
 * stride - distance in bytes between two rows
 *
 * C path: for the pixels 0..7 of a row, counts the neighbouring pairs whose
 * difference is -1, 0 or +1 and accumulates the count in numEq.
 * MMX path: applies a byte-wise threshold comparison of each pixel against its
 * right neighbour on 8 rows and, as a side effect, copies every row into
 * tempBlock (8 bytes per row, byte offsets 0..56).
 *
 * Returns non-zero when numEq is greater than hFlatnessThreshold.
 */
1354 static inline int isHorizDC(uint8_t src[], int stride)
1361 "leal (%1, %2), %%ecx \n\t"
1362 "leal (%%ecx, %2, 4), %%ebx \n\t"
1363 // 0 1 2 3 4 5 6 7 8 9
1364 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
1365 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F  NOTE(review): the constant is named b7E - confirm the value
1366 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D  NOTE(review): the constant is named b7C - confirm the value
1367 "pxor %%mm0, %%mm0 \n\t" // mm0 accumulates the per-byte comparison results
1368 "movl %1, %%eax \n\t"
1369 "andl $0x1F, %%eax \n\t" // eax = src & 0x1F
1370 "cmpl $24, %%eax \n\t" // presumably selects one of the two copy variants below (branch not visible here)
1371 "leal tempBlock, %%eax \n\t" // eax = destination of the row copies
1374 #define HDC_CHECK_AND_CPY(src, dst) \
1375 "movd " #src ", %%mm2 \n\t"\
1376 "punpckldq 4" #src ", %%mm2 \n\t" /* (%1) */\
1377 "movq %%mm2, %%mm1 \n\t"\
1378 "psrlq $8, %%mm2 \n\t" /* byte i now holds pixel i+1 */\
1379 "psubb %%mm1, %%mm2 \n\t" /* p(i+1) - p(i) */\
1380 "paddb %%mm7, %%mm2 \n\t"\
1381 "pcmpgtb %%mm6, %%mm2 \n\t"\
1382 "paddb %%mm2, %%mm0 \n\t"\
1383 "movq %%mm1," #dst "(%%eax) \n\t"
1385 HDC_CHECK_AND_CPY((%1),0)
1386 HDC_CHECK_AND_CPY((%%ecx),8)
1387 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1388 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1389 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1390 HDC_CHECK_AND_CPY((%%ebx),40)
1391 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1392 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1395 // src does not cross a 32 byte cache line so don't waste time with alignment
1396 #define HDC_CHECK_AND_CPY2(src, dst) \
1397 "movq " #src ", %%mm2 \n\t"\
1398 "movq " #src ", %%mm1 \n\t"\
1399 "psrlq $8, %%mm2 \n\t"\
1400 "psubb %%mm1, %%mm2 \n\t"\
1401 "paddb %%mm7, %%mm2 \n\t"\
1402 "pcmpgtb %%mm6, %%mm2 \n\t"\
1403 "paddb %%mm2, %%mm0 \n\t"\
1404 "movq %%mm1," #dst "(%%eax) \n\t"
1406 HDC_CHECK_AND_CPY2((%1),0)
1407 HDC_CHECK_AND_CPY2((%%ecx),8)
1408 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1409 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1410 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1411 HDC_CHECK_AND_CPY2((%%ebx),40)
1412 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1413 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1415 "psllq $8, %%mm0 \n\t" // remove dummy value
// the following shift/add steps sum the bytes of mm0 (mod 256)
1416 "movq %%mm0, %%mm1 \n\t"
1417 "psrlw $8, %%mm0 \n\t"
1418 "paddb %%mm1, %%mm0 \n\t"
1419 "movq %%mm0, %%mm1 \n\t"
1420 "psrlq $16, %%mm0 \n\t"
1421 "paddb %%mm1, %%mm0 \n\t"
1422 "movq %%mm0, %%mm1 \n\t"
1423 "psrlq $32, %%mm0 \n\t"
1424 "paddb %%mm1, %%mm0 \n\t"
1425 "movd %%mm0, %0 \n\t" // result goes to operand %0 (presumably numEq; the output constraint is not visible here)
1427 : "r" (src), "r" (stride)
1428 : "%eax", "%ebx", "%ecx"
1430 // printf("%d\n", numEq);
// negate modulo 256 and keep only the low byte
1431 numEq= (256 - numEq) &0xFF;
1434 for(y=0; y<BLOCK_SIZE; y++)
// (a - b + 1) is 0, 1 or 2 exactly when a - b is -1, 0 or +1
1436 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1437 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1438 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1439 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1440 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1441 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1442 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1446 /* if(abs(numEq - asmEq) > 0)
1448 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1449 for(int y=0; y<8; y++)
1451 for(int x=0; x<8; x++)
1453 printf("%d ", src[x + y*stride]);
1459 // printf("%d\n", numEq);
1460 return numEq > hFlatnessThreshold;
/*
 * isHorizMinMaxOk: range check on one 8-pixel row.
 * Returns 0 when the first and the last pixel of the row differ by more than 2*QP.
 * Only the rejecting branch is visible in this excerpt; stride is not used by it.
 */
1463 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1465 if(abs(src[0] - src[7]) > 2*QP) return 0; // |first - last| too large for this QP
/*
 * doHorizDefFilter: default-mode horizontal deblocking of the edge that lies
 * between pixel 3 and pixel 4 of each row.
 *
 * dst    - first pixel of the rows to filter; the MMX path stores its results here
 * stride - distance in bytes between two rows
 * QP     - quantiser; the C path only touches a row when |middleEnergy| < 8*QP
 *
 * MMX path: every row is read from tempBlock (byte offset given as the first
 * HDF argument) and the filtered row is written to the address given as the
 * second HDF argument. Two HDF variants are defined: one uses pshufw/pminub,
 * the other emulates the byte minimum with psubusb/psubb.
 * C path: evaluates the energies 5*(p[i+2] - p[i+1]) + 2*(p[i] - p[i+3]) for
 * i = 0 (left), 2 (middle) and 4 (right) and derives the correction d from them.
 */
1470 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1474 "leal (%0, %1), %%ecx \n\t"
1475 "leal (%%ecx, %1, 4), %%ebx \n\t"
1476 // 0 1 2 3 4 5 6 7 8 9
1477 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1478 "pxor %%mm7, %%mm7 \n\t"
1479 "movq bm00001000, %%mm6 \n\t"
1480 "movd %2, %%mm5 \n\t" // QP
1481 "movq %%mm5, %%mm4 \n\t"
1482 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1483 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1484 "psllq $24, %%mm4 \n\t" // move 3QP into byte 3
1485 "pxor %%mm5, %%mm5 \n\t" // 0
1486 "psubb %%mm4, %%mm5 \n\t" // -3QP (in byte 3)
1487 "leal tempBlock, %%eax \n\t"
1489 //FIXME? "unroll by 2" and mix
1491 #define HDF(src, dst) \
1492 "movq " #src "(%%eax), %%mm0 \n\t"\
1493 "movq " #src "(%%eax), %%mm1 \n\t"\
1494 "movq " #src "(%%eax), %%mm2 \n\t"\
1495 "psrlq $8, %%mm1 \n\t"\
1496 "psubusb %%mm1, %%mm2 \n\t"\
1497 "psubusb %%mm0, %%mm1 \n\t"\
1498 "por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
1499 "pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
1500 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p'5 = |p1 - p2| */\
1501 "pminub %%mm1, %%mm3 \n\t" /* p'5 = min(|p2-p1|, |p6-p5|)*/\
1502 "psrlq $16, %%mm3 \n\t" /* p'3 = min(|p2-p1|, |p6-p5|)*/\
1503 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1504 "paddb %%mm5, %%mm1 \n\t"\
1505 "psubusb %%mm5, %%mm1 \n\t"\
1506 "psrlw $2, %%mm1 \n\t"\
1507 "pxor %%mm2, %%mm1 \n\t"\
1508 "psubb %%mm2, %%mm1 \n\t"\
1509 "pand %%mm6, %%mm1 \n\t"\
1510 "psubb %%mm1, %%mm0 \n\t"\
1511 "psllq $8, %%mm1 \n\t"\
1512 "paddb %%mm1, %%mm0 \n\t"\
1513 "movd %%mm0, " #dst" \n\t"\
1514 "psrlq $32, %%mm0 \n\t"\
1515 "movd %%mm0, 4" #dst" \n\t"
1517 #define HDF(src, dst)\
1518 "movq " #src "(%%eax), %%mm0 \n\t"\
1519 "movq %%mm0, %%mm1 \n\t"\
1520 "movq %%mm0, %%mm2 \n\t"\
1521 "psrlq $8, %%mm1 \n\t"\
1522 "psubusb %%mm1, %%mm2 \n\t"\
1523 "psubusb %%mm0, %%mm1 \n\t"\
1524 "por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
1525 "pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
1526 "movq %%mm1, %%mm3 \n\t"\
1527 "psllq $32, %%mm3 \n\t"\
1528 "movq %%mm3, %%mm4 \n\t"\
1529 "psubusb %%mm1, %%mm4 \n\t"\
1530 "psubb %%mm4, %%mm3 \n\t"\
1531 "psrlq $16, %%mm3 \n\t" /* p'3 = min(|p2-p1|, |p6-p5|)*/\
1532 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1533 "paddb %%mm5, %%mm1 \n\t"\
1534 "psubusb %%mm5, %%mm1 \n\t"\
1535 "psrlw $2, %%mm1 \n\t"\
1536 "pxor %%mm2, %%mm1 \n\t"\
1537 "psubb %%mm2, %%mm1 \n\t"\
1538 "pand %%mm6, %%mm1 \n\t"\
1539 "psubb %%mm1, %%mm0 \n\t"\
1540 "psllq $8, %%mm1 \n\t"\
1541 "paddb %%mm1, %%mm0 \n\t"\
1542 "movd %%mm0, " #dst " \n\t"\
1543 "psrlq $32, %%mm0 \n\t"\
1544 "movd %%mm0, 4" #dst " \n\t"
1549 HDF(24,(%%ecx, %1, 2))
1553 HDF(56,(%%ebx, %1, 2))
1555 : "r" (dst), "r" (stride), "r" (QP)
1556 : "%eax", "%ebx", "%ecx"
1560 for(y=0; y<BLOCK_SIZE; y++)
// energy centred on the dst[3]|dst[4] edge: same 5*(p[i+2]-p[i+1]) + 2*(p[i]-p[i+3])
// pattern as leftEnergy (i = 0) and rightEnergy (i = 4), here with i = 2
1562 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
1564 if(ABS(middleEnergy) < 8*QP)
1566 const int q=(dst[3] - dst[4])/2; // half of the step across the edge
1567 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1568 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1570 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1574 d*= SIGN(-middleEnergy); // give d the sign opposite to middleEnergy
1596 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1597 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1598 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
/*
 * doHorizLowPass: horizontal smoothing of 8-pixel rows at dst.
 *
 * dst    - first pixel of the rows to filter (filtered in place)
 * stride - distance in bytes between two rows
 * QP     - quantiser, used by the C path to decide whether the pixels just
 *          outside the row (dst[-1], dst[8]) take part in the filter
 *
 * MMX path: every row is read from tempBlock (byte offset = first HLP
 * argument) and the result is stored to the row address given as second HLP
 * argument; HLP is mapped to NEW_HLP in the visible code.
 * C path: builds the pairwise sums of neighbouring pixels and computes each
 * output as a weighted sum with rounding (+8) followed by >>4.
 */
1600 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1605 "leal (%0, %1), %%ecx \n\t"
1606 "leal (%%ecx, %1, 4), %%ebx \n\t"
1607 // 0 1 2 3 4 5 6 7 8 9
1608 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1609 "pxor %%mm7, %%mm7 \n\t"
1610 "leal tempBlock, %%eax \n\t"
// HLP1 / HLP2: row filters that load from and store to (%0) directly
1612 #define HLP1 "movq (%0), %%mm0 \n\t"\
1613 "movq %%mm0, %%mm1 \n\t"\
1614 "psllq $8, %%mm0 \n\t"\
1615 PAVGB(%%mm1, %%mm0)\
1616 "psrlw $8, %%mm0 \n\t"\
1617 "pxor %%mm1, %%mm1 \n\t"\
1618 "packuswb %%mm1, %%mm0 \n\t"\
1619 "movq %%mm0, %%mm1 \n\t"\
1620 "movq %%mm0, %%mm2 \n\t"\
1621 "psllq $32, %%mm0 \n\t"\
1622 "paddb %%mm0, %%mm1 \n\t"\
1623 "psllq $16, %%mm2 \n\t"\
1624 PAVGB(%%mm2, %%mm0)\
1625 "movq %%mm0, %%mm3 \n\t"\
1626 "pand bm11001100, %%mm0 \n\t"\
1627 "paddusb %%mm0, %%mm3 \n\t"\
1628 "psrlq $8, %%mm3 \n\t"\
1629 PAVGB(%%mm1, %%mm4)\
1630 PAVGB(%%mm3, %%mm2)\
1631 "psrlq $16, %%mm2 \n\t"\
1632 "punpcklbw %%mm2, %%mm2 \n\t"\
1633 "movq %%mm2, (%0) \n\t"\
1635 #define HLP2 "movq (%0), %%mm0 \n\t"\
1636 "movq %%mm0, %%mm1 \n\t"\
1637 "psllq $8, %%mm0 \n\t"\
1638 PAVGB(%%mm1, %%mm0)\
1639 "psrlw $8, %%mm0 \n\t"\
1640 "pxor %%mm1, %%mm1 \n\t"\
1641 "packuswb %%mm1, %%mm0 \n\t"\
1642 "movq %%mm0, %%mm2 \n\t"\
1643 "psllq $32, %%mm0 \n\t"\
1644 "psllq $16, %%mm2 \n\t"\
1645 PAVGB(%%mm2, %%mm0)\
1646 "movq %%mm0, %%mm3 \n\t"\
1647 "pand bm11001100, %%mm0 \n\t"\
1648 "paddusb %%mm0, %%mm3 \n\t"\
1649 "psrlq $8, %%mm3 \n\t"\
1650 PAVGB(%%mm3, %%mm2)\
1651 "psrlq $16, %%mm2 \n\t"\
1652 "punpcklbw %%mm2, %%mm2 \n\t"\
1653 "movq %%mm2, (%0) \n\t"\
1655 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1657 Implemented Exact 7-Tap
// HLP3, pshufw variant: the row comes from tempBlock, the edge bytes are replicated with masks
1670 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1671 "movq %%mm0, %%mm1 \n\t"\
1672 "movq %%mm0, %%mm2 \n\t"\
1673 "movq %%mm0, %%mm3 \n\t"\
1674 "movq %%mm0, %%mm4 \n\t"\
1675 "psllq $8, %%mm1 \n\t"\
1676 "psrlq $8, %%mm2 \n\t"\
1677 "pand bm00000001, %%mm3 \n\t"\
1678 "pand bm10000000, %%mm4 \n\t"\
1679 "por %%mm3, %%mm1 \n\t"\
1680 "por %%mm4, %%mm2 \n\t"\
1681 PAVGB(%%mm2, %%mm1)\
1682 PAVGB(%%mm1, %%mm0)\
1684 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1685 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1686 PAVGB(%%mm3, %%mm4)\
1687 PAVGB(%%mm4, %%mm0)\
1688 "movd %%mm0, (%0) \n\t"\
1689 "psrlq $32, %%mm0 \n\t"\
1690 "movd %%mm0, 4(%0) \n\t"
// HLP3, variant without pshufw: the neighbours outside the row are fetched from -4(%0) and 8(%0)
1692 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1693 "movq %%mm0, %%mm1 \n\t"\
1694 "movq %%mm0, %%mm2 \n\t"\
1695 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1696 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1697 "psllq $8, %%mm1 \n\t"\
1698 "psrlq $8, %%mm2 \n\t"\
1699 "psrlq $24, %%mm3 \n\t"\
1700 "psllq $56, %%mm4 \n\t"\
1701 "por %%mm3, %%mm1 \n\t"\
1702 "por %%mm4, %%mm2 \n\t"\
1703 PAVGB(%%mm2, %%mm1)\
1704 PAVGB(%%mm1, %%mm0)\
1706 "movq %%mm0, %%mm3 \n\t"\
1707 "movq %%mm0, %%mm4 \n\t"\
1708 "movq %%mm0, %%mm5 \n\t"\
1709 "psrlq $16, %%mm3 \n\t"\
1710 "psllq $16, %%mm4 \n\t"\
1711 "pand bm11000000, %%mm5 \n\t"\
1712 "por %%mm5, %%mm3 \n\t"\
1713 "movq %%mm0, %%mm5 \n\t"\
1714 "pand bm00000011, %%mm5 \n\t"\
1715 "por %%mm5, %%mm4 \n\t"\
1716 PAVGB(%%mm3, %%mm4)\
1717 PAVGB(%%mm4, %%mm0)\
1718 "movd %%mm0, (%0) \n\t"\
1719 "psrlq $32, %%mm0 \n\t"\
1720 "movd %%mm0, 4(%0) \n\t"
1723 /* uses the 7-Tap Filter: 1112111 */
1724 #define NEW_HLP(src, dst)\
1725 "movq " #src "(%%eax), %%mm1 \n\t"\
1726 "movq " #src "(%%eax), %%mm2 \n\t"\
1727 "psllq $8, %%mm1 \n\t"\
1728 "psrlq $8, %%mm2 \n\t"\
1729 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
1730 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
1731 "psrlq $24, %%mm3 \n\t" /* keep only the byte at dst[-1] */\
1732 "psllq $56, %%mm4 \n\t" /* move the byte at dst[8] into the top byte */\
1733 "por %%mm3, %%mm1 \n\t"\
1734 "por %%mm4, %%mm2 \n\t"\
1735 "movq %%mm1, %%mm5 \n\t"\
1736 PAVGB(%%mm2, %%mm1)\
1737 "movq " #src "(%%eax), %%mm0 \n\t"\
1738 PAVGB(%%mm1, %%mm0)\
1739 "psllq $8, %%mm5 \n\t"\
1740 "psrlq $8, %%mm2 \n\t"\
1741 "por %%mm3, %%mm5 \n\t"\
1742 "por %%mm4, %%mm2 \n\t"\
1743 "movq %%mm5, %%mm1 \n\t"\
1744 PAVGB(%%mm2, %%mm5)\
1745 "psllq $8, %%mm1 \n\t"\
1746 "psrlq $8, %%mm2 \n\t"\
1747 "por %%mm3, %%mm1 \n\t"\
1748 "por %%mm4, %%mm2 \n\t"\
1749 PAVGB(%%mm2, %%mm1)\
1750 PAVGB(%%mm1, %%mm5)\
1751 PAVGB(%%mm5, %%mm0)\
1752 "movd %%mm0, " #dst " \n\t"\
1753 "psrlq $32, %%mm0 \n\t"\
1754 "movd %%mm0, 4" #dst " \n\t"
1756 /* uses the 9-Tap Filter: 112242211 */
1757 #define NEW_HLP2(i)\
1758 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
1759 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
1760 "movq %%mm0, %%mm2 \n\t" /*0001000*/\
1761 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1762 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1763 "psllq $8, %%mm1 \n\t"\
1764 "psrlq $8, %%mm2 \n\t"\
1765 "psrlq $24, %%mm3 \n\t"\
1766 "psllq $56, %%mm4 \n\t"\
1767 "por %%mm3, %%mm1 \n\t" /*0010000*/\
1768 "por %%mm4, %%mm2 \n\t" /*0000100*/\
1769 "movq %%mm1, %%mm5 \n\t" /*0010000*/\
1770 PAVGB(%%mm2, %%mm1) /*0010100*/\
1771 PAVGB(%%mm1, %%mm0) /*0012100*/\
1772 "psllq $8, %%mm5 \n\t"\
1773 "psrlq $8, %%mm2 \n\t"\
1774 "por %%mm3, %%mm5 \n\t" /*0100000*/\
1775 "por %%mm4, %%mm2 \n\t" /*0000010*/\
1776 "movq %%mm5, %%mm1 \n\t" /*0100000*/\
1777 PAVGB(%%mm2, %%mm5) /*0100010*/\
1778 "psllq $8, %%mm1 \n\t"\
1779 "psrlq $8, %%mm2 \n\t"\
1780 "por %%mm3, %%mm1 \n\t" /*1000000*/\
1781 "por %%mm4, %%mm2 \n\t" /*0000001*/\
1782 "movq %%mm1, %%mm6 \n\t" /*1000000*/\
1783 PAVGB(%%mm2, %%mm1) /*1000001*/\
1784 "psllq $8, %%mm6 \n\t"\
1785 "psrlq $8, %%mm2 \n\t"\
1786 "por %%mm3, %%mm6 \n\t"/*100000000*/\
1787 "por %%mm4, %%mm2 \n\t"/*000000001*/\
1788 PAVGB(%%mm2, %%mm6) /*100000001*/\
1789 PAVGB(%%mm6, %%mm1) /*110000011*/\
1790 PAVGB(%%mm1, %%mm5) /*112000211*/\
1791 PAVGB(%%mm5, %%mm0) /*112242211*/\
1792 "movd %%mm0, (%0) \n\t"\
1793 "psrlq $32, %%mm0 \n\t"\
1794 "movd %%mm0, 4(%0) \n\t"
// the row filter that is actually instantiated below
1796 #define HLP(src, dst) NEW_HLP(src, dst)
1800 HLP(16, (%%ecx, %1))
1801 HLP(24, (%%ecx, %1, 2))
1802 HLP(32, (%0, %1, 4))
1804 HLP(48, (%%ebx, %1))
1805 HLP(56, (%%ebx, %1, 2))
1808 : "r" (dst), "r" (stride)
1809 : "%eax", "%ebx", "%ecx"
1814 for(y=0; y<BLOCK_SIZE; y++)
// first/last replace the pixels beyond the row ends: the outer neighbour is
// used only if it differs from the edge pixel by less than QP
1816 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1817 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// sums[k] = pixel(k-1) + pixel(k), with first/last standing in at the ends
1820 sums[0] = first + dst[0];
1821 sums[1] = dst[0] + dst[1];
1822 sums[2] = dst[1] + dst[2];
1823 sums[3] = dst[2] + dst[3];
1824 sums[4] = dst[3] + dst[4];
1825 sums[5] = dst[4] + dst[5];
1826 sums[6] = dst[5] + dst[6];
1827 sums[7] = dst[6] + dst[7];
1828 sums[8] = dst[7] + last;
// every output only reads sums[] and pixels that have not been overwritten yet
1830 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1831 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1832 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1833 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1834 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1835 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1836 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1837 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
/*
 * dering: deringing filter for the block at src.
 *
 * src    - first pixel of the block
 * stride - distance in bytes between two rows
 * QP     - quantiser; the C path limits the change of a pixel to +-2*QP
 *
 * MMX2/3DNOW path: FIND_MIN_MAX collects the byte-wise minimum (mm6) and
 * maximum (mm7) over rows 1..8, both are reduced to a single value, their
 * average is broadcast and stored in temp0, and DERING_CORE is then
 * instantiated for rows 1..8 with rotating register roles.
 * C path (only partly visible here): finds min and max, derives
 * avg = (min + max + 1)/2, builds bit masks of the pixels above avg and
 * evaluates a 3x3 weighted sum (1 2 1 / 2 4 2 / 1 2 1) around *p.
 */
1844 static inline void dering(uint8_t src[], int stride, int QP)
1846 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1848 "movq pQPb, %%mm0 \n\t"
1849 "paddusb %%mm0, %%mm0 \n\t"
1850 "movq %%mm0, pQPb2 \n\t" // pQPb2 = 2*pQPb (saturating)
1852 "leal (%0, %1), %%eax \n\t"
1853 "leal (%%eax, %1, 4), %%ebx \n\t"
1854 // 0 1 2 3 4 5 6 7 8 9
1855 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1857 "pcmpeqb %%mm6, %%mm6 \n\t" // mm6 = all ones: start value for the minimum
1858 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0: start value for the maximum
// two definitions of FIND_MIN_MAX follow: one based on pminub/pmaxub, one that
// emulates them with psubusb/paddb/psubb (presumably selected by a preprocessor
// conditional that is not visible here)
1860 #define FIND_MIN_MAX(addr)\
1861 "movq " #addr ", %%mm0 \n\t"\
1862 "pminub %%mm0, %%mm6 \n\t"\
1863 "pmaxub %%mm0, %%mm7 \n\t"
1865 #define FIND_MIN_MAX(addr)\
1866 "movq " #addr ", %%mm0 \n\t"\
1867 "movq %%mm6, %%mm1 \n\t"\
1868 "psubusb %%mm0, %%mm7 \n\t"\
1869 "paddb %%mm0, %%mm7 \n\t"\
1870 "psubusb %%mm0, %%mm1 \n\t"\
1871 "psubb %%mm1, %%mm6 \n\t"
1874 FIND_MIN_MAX((%%eax))
1875 FIND_MIN_MAX((%%eax, %1))
1876 FIND_MIN_MAX((%%eax, %1, 2))
1877 FIND_MIN_MAX((%0, %1, 4))
1878 FIND_MIN_MAX((%%ebx))
1879 FIND_MIN_MAX((%%ebx, %1))
1880 FIND_MIN_MAX((%%ebx, %1, 2))
1881 FIND_MIN_MAX((%0, %1, 8))
// horizontal reduction of the 8 column minima in mm6
1883 "movq %%mm6, %%mm4 \n\t"
1884 "psrlq $8, %%mm6 \n\t"
1886 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1887 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1888 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1889 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1890 "pminub %%mm4, %%mm6 \n\t"
1892 "movq %%mm6, %%mm1 \n\t"
1893 "psubusb %%mm4, %%mm1 \n\t"
1894 "psubb %%mm1, %%mm6 \n\t"
1895 "movq %%mm6, %%mm4 \n\t"
1896 "psrlq $16, %%mm6 \n\t"
1897 "movq %%mm6, %%mm1 \n\t"
1898 "psubusb %%mm4, %%mm1 \n\t"
1899 "psubb %%mm1, %%mm6 \n\t"
1900 "movq %%mm6, %%mm4 \n\t"
1901 "psrlq $32, %%mm6 \n\t"
1902 "movq %%mm6, %%mm1 \n\t"
1903 "psubusb %%mm4, %%mm1 \n\t"
1904 "psubb %%mm1, %%mm6 \n\t"
// horizontal reduction of the 8 column maxima in mm7
1908 "movq %%mm7, %%mm4 \n\t"
1909 "psrlq $8, %%mm7 \n\t"
1911 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1912 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1913 "pmaxub %%mm4, %%mm7 \n\t"
1914 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1915 "pmaxub %%mm4, %%mm7 \n\t"
1917 "psubusb %%mm4, %%mm7 \n\t"
1918 "paddb %%mm4, %%mm7 \n\t"
1919 "movq %%mm7, %%mm4 \n\t"
1920 "psrlq $16, %%mm7 \n\t"
1921 "psubusb %%mm4, %%mm7 \n\t"
1922 "paddb %%mm4, %%mm7 \n\t"
1923 "movq %%mm7, %%mm4 \n\t"
1924 "psrlq $32, %%mm7 \n\t"
1925 "psubusb %%mm4, %%mm7 \n\t"
1926 "paddb %%mm4, %%mm7 \n\t"
1928 PAVGB(%%mm6, %%mm7) // a=(max + min)/2
// replicate the low byte of mm7 into all 8 bytes
1929 "punpcklbw %%mm7, %%mm7 \n\t"
1930 "punpcklbw %%mm7, %%mm7 \n\t"
1931 "punpcklbw %%mm7, %%mm7 \n\t"
1932 "movq %%mm7, temp0 \n\t" // temp0 = a, re-read by DERING_CORE
1934 "movq (%0), %%mm0 \n\t" // L10
1935 "movq %%mm0, %%mm1 \n\t" // L10
1936 "movq %%mm0, %%mm2 \n\t" // L10
1937 "psllq $8, %%mm1 \n\t"
1938 "psrlq $8, %%mm2 \n\t"
1939 "movd -4(%0), %%mm3 \n\t"
1940 "movd 8(%0), %%mm4 \n\t"
1941 "psrlq $24, %%mm3 \n\t"
1942 "psllq $56, %%mm4 \n\t"
1943 "por %%mm3, %%mm1 \n\t" // L00
1944 "por %%mm4, %%mm2 \n\t" // L20
1945 "movq %%mm1, %%mm3 \n\t" // L00
1946 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1947 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1948 "psubusb %%mm7, %%mm0 \n\t"
1949 "psubusb %%mm7, %%mm2 \n\t"
1950 "psubusb %%mm7, %%mm3 \n\t"
1951 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
1952 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
1953 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
1954 "paddb %%mm2, %%mm0 \n\t"
1955 "paddb %%mm3, %%mm0 \n\t"
1957 "movq (%%eax), %%mm2 \n\t" // L11
1958 "movq %%mm2, %%mm3 \n\t" // L11
1959 "movq %%mm2, %%mm4 \n\t" // L11
1960 "psllq $8, %%mm3 \n\t"
1961 "psrlq $8, %%mm4 \n\t"
1962 "movd -4(%%eax), %%mm5 \n\t"
1963 "movd 8(%%eax), %%mm6 \n\t"
1964 "psrlq $24, %%mm5 \n\t"
1965 "psllq $56, %%mm6 \n\t"
1966 "por %%mm5, %%mm3 \n\t" // L01
1967 "por %%mm6, %%mm4 \n\t" // L21
1968 "movq %%mm3, %%mm5 \n\t" // L01
1969 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1970 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1971 "psubusb %%mm7, %%mm2 \n\t"
1972 "psubusb %%mm7, %%mm4 \n\t"
1973 "psubusb %%mm7, %%mm5 \n\t"
1974 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
1975 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
1976 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
1977 "paddb %%mm4, %%mm2 \n\t"
1978 "paddb %%mm5, %%mm2 \n\t"
// DERING_CORE: loads row `src`, forms its horizontal (1 2 1)/4 average and its
// comparison counts against a (temp0), then writes row `dst`; temp1 is used as a spill slot for lx
1980 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1981 "movq " #src ", " #sx " \n\t" /* src[0] */\
1982 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1983 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1984 "psllq $8, " #lx " \n\t"\
1985 "psrlq $8, " #t0 " \n\t"\
1986 "movd -4" #src ", " #t1 " \n\t"\
1987 "psrlq $24, " #t1 " \n\t"\
1988 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1989 "movd 8" #src ", " #t1 " \n\t"\
1990 "psllq $56, " #t1 " \n\t"\
1991 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1992 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1993 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1994 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1996 "movq " #lx ", temp1 \n\t"\
1997 "movq temp0, " #lx " \n\t"\
1998 "psubusb " #lx ", " #t1 " \n\t"\
1999 "psubusb " #lx ", " #t0 " \n\t"\
2000 "psubusb " #lx ", " #sx " \n\t"\
2001 "movq b00, " #lx " \n\t"\
2002 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
2003 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
2004 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
2005 "paddb " #t1 ", " #t0 " \n\t"\
2006 "paddb " #t0 ", " #sx " \n\t"\
2008 PAVGB(plx, pplx) /* filtered */\
2009 "movq " #dst ", " #t0 " \n\t" /* dst */\
2010 "movq " #t0 ", " #t1 " \n\t" /* dst */\
2011 "psubusb pQPb2, " #t0 " \n\t" /* dst - pQPb2 (saturating) */\
2012 "paddusb pQPb2, " #t1 " \n\t" /* dst + pQPb2 (saturating) */\
2014 PMINUB(t1, pplx, t0)\
2015 "paddb " #sx ", " #ppsx " \n\t"\
2016 "paddb " #psx ", " #ppsx " \n\t"\
2017 "#paddb b02, " #ppsx " \n\t" /* NOTE(review): the leading '#' presumably makes the assembler treat this line as a comment */\
2018 "pand b08, " #ppsx " \n\t"\
2019 "pcmpeqb " #lx ", " #ppsx " \n\t"\
2020 "pand " #ppsx ", " #pplx " \n\t"\
2021 "pandn " #dst ", " #ppsx " \n\t"\
2022 "por " #pplx ", " #ppsx " \n\t"\
2023 "movq " #ppsx ", " #dst " \n\t"\
2024 "movq temp1, " #lx " \n\t"
2041 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
2042 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
2043 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2044 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
2045 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
2046 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2047 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
2048 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
2049 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2052 : : "r" (src), "r" (stride), "r" (QP)
2070 if(*p > max) max= *p;
2071 if(*p < min) min= *p;
2074 avg= (min + max + 1)/2; // rounded midpoint between darkest and brightest pixel
2083 if(*p > avg) t |= (1<<x); // bit x set when the pixel in column x is above avg
2087 t &= (t<<1) & (t>>1); // keep bit x only if bits x-1, x and x+1 are all set
2094 int t = s[y-1] & s[y] & s[y+1]; // combine the masks of three consecutive rows
// 3x3 weighted sum around p with weights 1 2 1 / 2 4 2 / 1 2 1 (they add up to 16)
2103 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
2104 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
2105 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
// move *p towards f by at most 2*QP
2108 if (*p + 2*QP < f) *p= *p + 2*QP;
2109 else if(*p - 2*QP > f) *p= *p - 2*QP;
2119 * Deinterlaces the given block
2120 * will be called for every 8x8 block and can read & write from line 4-15
2121 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2122 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
/*
 * deInterlaceInterpolateLinear: linear-interpolation deinterlacer.
 *
 * src    - first pixel of the block
 * stride - distance in bytes between two rows
 *
 * C path: replaces rows 1, 3, 5 and 7 by the truncated average of the row
 * above and the row below. The MMX2/3DNOW path loads rows 0, 2, 4, 6, 8 and
 * stores to rows 1, 3, 5, 7 (the averaging steps are not visible in this excerpt).
 */
2124 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
2126 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2129 "leal (%0, %1), %%eax \n\t"
2130 "leal (%%eax, %1, 4), %%ebx \n\t"
2131 // 0 1 2 3 4 5 6 7 8 9
2132 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2134 "movq (%0), %%mm0 \n\t" // row 0
2135 "movq (%%eax, %1), %%mm1 \n\t" // row 2
2137 "movq %%mm0, (%%eax) \n\t" // -> row 1
2138 "movq (%0, %1, 4), %%mm0 \n\t" // row 4
2140 "movq %%mm1, (%%eax, %1, 2) \n\t" // -> row 3
2141 "movq (%%ebx, %1), %%mm1 \n\t" // row 6
2143 "movq %%mm0, (%%ebx) \n\t" // -> row 5
2144 "movq (%0, %1, 8), %%mm0 \n\t" // row 8
2146 "movq %%mm1, (%%ebx, %1, 2) \n\t" // -> row 7
2148 : : "r" (src), "r" (stride)
2156 src[stride] = (src[0] + src[stride*2])>>1;
2157 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
2158 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
2159 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
2166 * Deinterlaces the given block
2167 * will be called for every 8x8 block and can read & write from line 4-15
2168 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2169 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2170 * this filter will read lines 3-15 and write 7-13
2171 * no clipping in C version
/*
 * deInterlaceInterpolateCubic: cubic-interpolation deinterlacer.
 *
 * src    - first pixel of the block
 * stride - distance in bytes between two rows
 *
 * Rows 3, 5, 7 and 9 are replaced by (-a + 9*b + 9*d - e)/16, where b and d
 * are the rows directly above and below and a and e the rows three lines
 * above and below. The MMX path packs the result with unsigned saturation
 * (packuswb); the C path stores the shifted value without clamping.
 */
2173 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
2175 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2178 "leal (%0, %1), %%eax \n\t"
2179 "leal (%%eax, %1, 4), %%ebx \n\t"
2180 "leal (%%ebx, %1, 4), %%ecx \n\t"
2181 "addl %1, %%ecx \n\t" // ecx = ebx + 5*%1, i.e. row 10
2182 "pxor %%mm7, %%mm7 \n\t" // zero, used to unpack bytes to words
2183 // 0 1 2 3 4 5 6 7 8 9 10
2184 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
2186 #define DEINT_CUBIC(a,b,c,d,e)\
2187 "movq " #a ", %%mm0 \n\t"\
2188 "movq " #b ", %%mm1 \n\t"\
2189 "movq " #d ", %%mm2 \n\t"\
2190 "movq " #e ", %%mm3 \n\t"\
2191 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
2192 PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
2193 "movq %%mm0, %%mm2 \n\t"\
2194 "punpcklbw %%mm7, %%mm0 \n\t"\
2195 "punpckhbw %%mm7, %%mm2 \n\t"\
2196 "movq %%mm1, %%mm3 \n\t"\
2197 "punpcklbw %%mm7, %%mm1 \n\t"\
2198 "punpckhbw %%mm7, %%mm3 \n\t"\
2199 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
2200 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
2201 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
2202 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
2203 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
2204 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
2205 "packuswb %%mm3, %%mm1 \n\t"\
2206 "movq %%mm1, " #c " \n\t"
// arguments are rows (a, b, c, d, e) = (n-3, n-1, n, n+1, n+3) for n = 3, 5, 7, 9
2208 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
2209 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
2210 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
2211 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
2213 : : "r" (src), "r" (stride)
2214 : "%eax", "%ebx", "ecx"
2221 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
2222 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
2223 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
2224 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2231 * Deinterlaces the given block
2232 * will be called for every 8x8 block and can read & write from line 4-15
2233 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2234 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2235 * will shift the image up by 1 line (FIXME if this is a problem)
2236 * this filter will read lines 4-13 and write 4-11
/*
 * deInterlaceBlendLinear: vertical (1 2 1)/4 blend deinterlacer.
 *
 * src    - first pixel of the block
 * stride - distance in bytes between two rows
 *
 * Each of the rows 0..7 is replaced by (row[n] + 2*row[n+1] + row[n+2])/4.
 * The rows are processed top-down and in place, so every output only reads
 * its own row and rows further down that have not been rewritten yet.
 * The MMX2/3DNOW path builds the same weighting from two chained PAVGB steps.
 */
2238 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
2240 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2243 "leal (%0, %1), %%eax \n\t"
2244 "leal (%%eax, %1, 4), %%ebx \n\t"
2245 // 0 1 2 3 4 5 6 7 8 9
2246 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2248 "movq (%0), %%mm0 \n\t" // L0
2249 "movq (%%eax, %1), %%mm1 \n\t" // L2
2250 PAVGB(%%mm1, %%mm0) // L0+L2
2251 "movq (%%eax), %%mm2 \n\t" // L1
2253 "movq %%mm0, (%0) \n\t" // store row 0
2254 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
2255 PAVGB(%%mm0, %%mm2) // L1+L3
2256 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
2257 "movq %%mm2, (%%eax) \n\t" // store row 1
2258 "movq (%0, %1, 4), %%mm2 \n\t" // L4
2259 PAVGB(%%mm2, %%mm1) // L2+L4
2260 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2261 "movq %%mm1, (%%eax, %1) \n\t" // store row 2
2262 "movq (%%ebx), %%mm1 \n\t" // L5
2263 PAVGB(%%mm1, %%mm0) // L3+L5
2264 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2265 "movq %%mm0, (%%eax, %1, 2) \n\t" // store row 3
2266 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2267 PAVGB(%%mm0, %%mm2) // L4+L6
2268 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2269 "movq %%mm2, (%0, %1, 4) \n\t" // store row 4
2270 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2271 PAVGB(%%mm2, %%mm1) // L5+L7
2272 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2273 "movq %%mm1, (%%ebx) \n\t" // store row 5
2274 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2275 PAVGB(%%mm1, %%mm0) // L6+L8
2276 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2277 "movq %%mm0, (%%ebx, %1) \n\t" // store row 6
2278 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2279 PAVGB(%%mm0, %%mm2) // L7+L9
2280 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2281 "movq %%mm2, (%%ebx, %1, 2) \n\t" // store row 7
2284 : : "r" (src), "r" (stride)
2292 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2293 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2294 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2295 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2296 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2297 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2298 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2299 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2306 * Deinterlaces the given block
2307 * will be called for every 8x8 block and can read & write from line 4-15,
2308 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2309 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
// Median deinterlacer for one block. src points at line 0 of the block and
// stride is the distance in bytes between two lines; lines 0..8 are read.
// In the two asm variants each odd line (1, 3, 5, 7) is overwritten in place
// with a per-byte value computed from itself and the lines directly above and
// below it. Three variants are present: MMX2 (pmaxub / pminub), plain MMX
// (the MEDIAN macro) and a C fallback.
2311 static inline void deInterlaceMedian(uint8_t src[], int stride)
// MMX2 variant. eax = src + stride (line 1), ebx = src + 5*stride (line 5);
// both registers are overwritten.
2317 "leal (%0, %1), %%eax \n\t"
2318 "leal (%%eax, %1, 4), %%ebx \n\t"
2319 // 0 1 2 3 4 5 6 7 8 9
2320 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2322 "movq (%0), %%mm0 \n\t" // L0
2323 "movq (%%eax, %1), %%mm2 \n\t" // L2
2324 "movq (%%eax), %%mm1 \n\t" // L1
2325 "movq %%mm0, %%mm3 \n\t" // L0
2326 "pmaxub %%mm1, %%mm0 \n\t" // max(L0, L1)
2327 "pminub %%mm3, %%mm1 \n\t" // min(L0, L1)
2328 "pmaxub %%mm2, %%mm1 \n\t" // max(min(L0, L1), L2)
2329 "pminub %%mm1, %%mm0 \n\t" // median(L0, L1, L2)
2330 "movq %%mm0, (%%eax) \n\t" // store to line 1
2332 "movq (%0, %1, 4), %%mm0 \n\t" // L4
2333 "movq (%%eax, %1, 2), %%mm1 \n\t" // L3
2334 "movq %%mm2, %%mm3 \n\t" // L2 (still in mm2 from above)
2335 "pmaxub %%mm1, %%mm2 \n\t" // max(L2, L3)
2336 "pminub %%mm3, %%mm1 \n\t" // min(L2, L3)
2337 "pmaxub %%mm0, %%mm1 \n\t" // max(min(L2, L3), L4)
2338 "pminub %%mm1, %%mm2 \n\t" // median(L2, L3, L4)
2339 "movq %%mm2, (%%eax, %1, 2) \n\t" // store to line 3
2341 "movq (%%ebx), %%mm2 \n\t" // L5
2342 "movq (%%ebx, %1), %%mm1 \n\t" // L6
2343 "movq %%mm2, %%mm3 \n\t" // L5
2344 "pmaxub %%mm0, %%mm2 \n\t" // max(L4, L5)
2345 "pminub %%mm3, %%mm0 \n\t" // min(L4, L5)
2346 "pmaxub %%mm1, %%mm0 \n\t" // max(min(L4, L5), L6)
2347 "pminub %%mm0, %%mm2 \n\t" // median(L4, L5, L6)
2348 "movq %%mm2, (%%ebx) \n\t" // store to line 5
2350 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2351 "movq (%0, %1, 8), %%mm0 \n\t" // L8
2352 "movq %%mm2, %%mm3 \n\t" // L7
2353 "pmaxub %%mm0, %%mm2 \n\t" // max(L7, L8)
2354 "pminub %%mm3, %%mm0 \n\t" // min(L7, L8)
2355 "pmaxub %%mm1, %%mm0 \n\t" // max(min(L7, L8), L6), L6 still in mm1
2356 "pminub %%mm0, %%mm2 \n\t" // median(L6, L7, L8)
2357 "movq %%mm2, (%%ebx, %1, 2) \n\t" // store to line 7
2360 : : "r" (src), "r" (stride)
2364 #else // MMX without MMX2
// Same address setup as above: eax = line 1, ebx = line 5.
2366 "leal (%0, %1), %%eax \n\t"
2367 "leal (%%eax, %1, 4), %%ebx \n\t"
2368 // 0 1 2 3 4 5 6 7 8 9
2369 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2370 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0, compared against by pcmpeqb in MEDIAN
// MEDIAN(a,b,c): loads a, b and c, builds per-byte comparison masks with
// saturating subtraction (psubusb) followed by pcmpeqb against the zeroed mm7,
// merges the masks with pxor / por / pand and stores the resulting bytes to b.
// Uses mm0-mm6 as scratch.
// NOTE(review): intended to select the per-byte median of a, b, c - confirm
// against the MMX2 variant.
2372 #define MEDIAN(a,b,c)\
2373 "movq " #a ", %%mm0 \n\t"\
2374 "movq " #b ", %%mm2 \n\t"\
2375 "movq " #c ", %%mm1 \n\t"\
2376 "movq %%mm0, %%mm3 \n\t"\
2377 "movq %%mm1, %%mm4 \n\t"\
2378 "movq %%mm2, %%mm5 \n\t"\
2379 "psubusb %%mm1, %%mm3 \n\t"\
2380 "psubusb %%mm2, %%mm4 \n\t"\
2381 "psubusb %%mm0, %%mm5 \n\t"\
2382 "pcmpeqb %%mm7, %%mm3 \n\t"\
2383 "pcmpeqb %%mm7, %%mm4 \n\t"\
2384 "pcmpeqb %%mm7, %%mm5 \n\t"\
2385 "movq %%mm3, %%mm6 \n\t"\
2386 "pxor %%mm4, %%mm3 \n\t"\
2387 "pxor %%mm5, %%mm4 \n\t"\
2388 "pxor %%mm6, %%mm5 \n\t"\
2389 "por %%mm3, %%mm1 \n\t"\
2390 "por %%mm4, %%mm2 \n\t"\
2391 "por %%mm5, %%mm0 \n\t"\
2392 "pand %%mm2, %%mm0 \n\t"\
2393 "pand %%mm1, %%mm0 \n\t"\
2394 "movq %%mm0, " #b " \n\t"
2396 MEDIAN((%0), (%%eax), (%%eax, %1)) // lines 0, 1, 2 -> line 1
2397 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) // lines 2, 3, 4 -> line 3
2398 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1)) // lines 4, 5, 6 -> line 5
2399 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8)) // lines 6, 7, 8 -> line 7
2401 : : "r" (src), "r" (stride)
// C fallback.
// NOTE(review): these statements compute (a + 2*b + c) >> 2 over three
// consecutive lines and write every line 0..7, which is the same weighted
// average used by the linear blend code earlier in this file and not a
// median - confirm whether that is intentional.
2411 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2412 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2413 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2414 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2415 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2416 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2417 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2418 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2426 * Transposes and shifts the given 8x8 block into dst1 and dst2
// Transposes the 8x8 byte block at src (rows srcStride bytes apart) and
// scatters the resulting columns into two buffers addressed in 16-byte steps:
//   dst1 receives source columns 0-4 at byte offsets 128, 144, 160, 176, 192
//   dst2 receives source columns 3-7 at byte offsets 48, 64, 80, 96, 112
// The first half handles source rows 0-3 and writes bytes 0-3 at each of those
// offsets; the second half handles source rows 4-7 and writes bytes 4-7.
// Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2.
// eax and ebx are overwritten (eax = src + srcStride, ebx = src + 5*srcStride).
2428 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2431 "leal (%0, %1), %%eax \n\t"
2432 "leal (%%eax, %1, 4), %%ebx \n\t"
2433 // 0 1 2 3 4 5 6 7 8 9
2434 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2435 "movq (%0), %%mm0 \n\t" // 12345678
2436 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2437 "movq %%mm0, %%mm2 \n\t" // 12345678
2438 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2439 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2441 "movq (%%eax, %1), %%mm1 \n\t" // row 2
2442 "movq (%%eax, %1, 2), %%mm3 \n\t" // row 3
2443 "movq %%mm1, %%mm4 \n\t" // row 2
2444 "punpcklbw %%mm3, %%mm1 \n\t" // rows 2,3 interleaved, columns 0-3
2445 "punpckhbw %%mm3, %%mm4 \n\t" // rows 2,3 interleaved, columns 4-7
2447 "movq %%mm0, %%mm3 \n\t"
2448 "punpcklwd %%mm1, %%mm0 \n\t" // columns 0,1 of rows 0-3
2449 "punpckhwd %%mm1, %%mm3 \n\t" // columns 2,3 of rows 0-3
2450 "movq %%mm2, %%mm1 \n\t"
2451 "punpcklwd %%mm4, %%mm2 \n\t" // columns 4,5 of rows 0-3
2452 "punpckhwd %%mm4, %%mm1 \n\t" // columns 6,7 of rows 0-3
2454 "movd %%mm0, 128(%2) \n\t" // column 0 -> dst1
2455 "psrlq $32, %%mm0 \n\t"
2456 "movd %%mm0, 144(%2) \n\t" // column 1 -> dst1
2457 "movd %%mm3, 160(%2) \n\t" // column 2 -> dst1
2458 "psrlq $32, %%mm3 \n\t"
2459 "movd %%mm3, 176(%2) \n\t" // column 3 -> dst1
2460 "movd %%mm3, 48(%3) \n\t" // column 3 -> dst2
2461 "movd %%mm2, 192(%2) \n\t" // column 4 -> dst1
2462 "movd %%mm2, 64(%3) \n\t" // column 4 -> dst2
2463 "psrlq $32, %%mm2 \n\t"
2464 "movd %%mm2, 80(%3) \n\t" // column 5 -> dst2
2465 "movd %%mm1, 96(%3) \n\t" // column 6 -> dst2
2466 "psrlq $32, %%mm1 \n\t"
2467 "movd %%mm1, 112(%3) \n\t" // column 7 -> dst2
// Second half: same sequence for source rows 4-7, stored 4 bytes further on.
2469 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2470 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2471 "movq %%mm0, %%mm2 \n\t" // 12345678
2472 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2473 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2475 "movq (%%ebx, %1), %%mm1 \n\t" // row 6
2476 "movq (%%ebx, %1, 2), %%mm3 \n\t" // row 7
2477 "movq %%mm1, %%mm4 \n\t" // row 6
2478 "punpcklbw %%mm3, %%mm1 \n\t" // rows 6,7 interleaved, columns 0-3
2479 "punpckhbw %%mm3, %%mm4 \n\t" // rows 6,7 interleaved, columns 4-7
2481 "movq %%mm0, %%mm3 \n\t"
2482 "punpcklwd %%mm1, %%mm0 \n\t" // columns 0,1 of rows 4-7
2483 "punpckhwd %%mm1, %%mm3 \n\t" // columns 2,3 of rows 4-7
2484 "movq %%mm2, %%mm1 \n\t"
2485 "punpcklwd %%mm4, %%mm2 \n\t" // columns 4,5 of rows 4-7
2486 "punpckhwd %%mm4, %%mm1 \n\t" // columns 6,7 of rows 4-7
2488 "movd %%mm0, 132(%2) \n\t" // column 0 -> dst1
2489 "psrlq $32, %%mm0 \n\t"
2490 "movd %%mm0, 148(%2) \n\t" // column 1 -> dst1
2491 "movd %%mm3, 164(%2) \n\t" // column 2 -> dst1
2492 "psrlq $32, %%mm3 \n\t"
2493 "movd %%mm3, 180(%2) \n\t" // column 3 -> dst1
2494 "movd %%mm3, 52(%3) \n\t" // column 3 -> dst2
2495 "movd %%mm2, 196(%2) \n\t" // column 4 -> dst1
2496 "movd %%mm2, 68(%3) \n\t" // column 4 -> dst2
2497 "psrlq $32, %%mm2 \n\t"
2498 "movd %%mm2, 84(%3) \n\t" // column 5 -> dst2
2499 "movd %%mm1, 100(%3) \n\t" // column 6 -> dst2
2500 "psrlq $32, %%mm1 \n\t"
2501 "movd %%mm1, 116(%3) \n\t" // column 7 -> dst2
2504 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2510 * transposes the given 8x8 block
// Transposes an 8x8 byte block read from src, whose rows are 16 bytes apart,
// and writes it to dst, whose rows are dstStride bytes apart.
// The first half reads source rows 0-3 (byte offsets 0, 16, 32, 48) and writes
// bytes 0-3 of the eight destination rows; the second half reads source rows
// 4-7 (offsets 64, 80, 96, 112) and writes bytes 4-7.
// Operands: %0 = dst, %1 = dstStride, %2 = src.
// eax and ebx are overwritten (eax = dst + dstStride, ebx = dst + 5*dstStride).
2512 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2515 "leal (%0, %1), %%eax \n\t"
2516 "leal (%%eax, %1, 4), %%ebx \n\t"
2517 // 0 1 2 3 4 5 6 7 8 9
2518 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2519 "movq (%2), %%mm0 \n\t" // 12345678
2520 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2521 "movq %%mm0, %%mm2 \n\t" // 12345678
2522 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2523 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2525 "movq 32(%2), %%mm1 \n\t" // source row 2
2526 "movq 48(%2), %%mm3 \n\t" // source row 3
2527 "movq %%mm1, %%mm4 \n\t" // source row 2
2528 "punpcklbw %%mm3, %%mm1 \n\t" // rows 2,3 interleaved, columns 0-3
2529 "punpckhbw %%mm3, %%mm4 \n\t" // rows 2,3 interleaved, columns 4-7
2531 "movq %%mm0, %%mm3 \n\t"
2532 "punpcklwd %%mm1, %%mm0 \n\t" // columns 0,1 of source rows 0-3
2533 "punpckhwd %%mm1, %%mm3 \n\t" // columns 2,3 of source rows 0-3
2534 "movq %%mm2, %%mm1 \n\t"
2535 "punpcklwd %%mm4, %%mm2 \n\t" // columns 4,5 of source rows 0-3
2536 "punpckhwd %%mm4, %%mm1 \n\t" // columns 6,7 of source rows 0-3
2538 "movd %%mm0, (%0) \n\t" // column 0 -> dst row 0
2539 "psrlq $32, %%mm0 \n\t"
2540 "movd %%mm0, (%%eax) \n\t" // column 1 -> dst row 1
2541 "movd %%mm3, (%%eax, %1) \n\t" // column 2 -> dst row 2
2542 "psrlq $32, %%mm3 \n\t"
2543 "movd %%mm3, (%%eax, %1, 2) \n\t" // column 3 -> dst row 3
2544 "movd %%mm2, (%0, %1, 4) \n\t" // column 4 -> dst row 4
2545 "psrlq $32, %%mm2 \n\t"
2546 "movd %%mm2, (%%ebx) \n\t" // column 5 -> dst row 5
2547 "movd %%mm1, (%%ebx, %1) \n\t" // column 6 -> dst row 6
2548 "psrlq $32, %%mm1 \n\t"
2549 "movd %%mm1, (%%ebx, %1, 2) \n\t" // column 7 -> dst row 7
// Second half: source rows 4-7, written to bytes 4-7 of each destination row.
2552 "movq 64(%2), %%mm0 \n\t" // 12345678
2553 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2554 "movq %%mm0, %%mm2 \n\t" // 12345678
2555 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2556 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2558 "movq 96(%2), %%mm1 \n\t" // source row 6
2559 "movq 112(%2), %%mm3 \n\t" // source row 7
2560 "movq %%mm1, %%mm4 \n\t" // source row 6
2561 "punpcklbw %%mm3, %%mm1 \n\t" // rows 6,7 interleaved, columns 0-3
2562 "punpckhbw %%mm3, %%mm4 \n\t" // rows 6,7 interleaved, columns 4-7
2564 "movq %%mm0, %%mm3 \n\t"
2565 "punpcklwd %%mm1, %%mm0 \n\t" // columns 0,1 of source rows 4-7
2566 "punpckhwd %%mm1, %%mm3 \n\t" // columns 2,3 of source rows 4-7
2567 "movq %%mm2, %%mm1 \n\t"
2568 "punpcklwd %%mm4, %%mm2 \n\t" // columns 4,5 of source rows 4-7
2569 "punpckhwd %%mm4, %%mm1 \n\t" // columns 6,7 of source rows 4-7
2571 "movd %%mm0, 4(%0) \n\t" // column 0 -> dst row 0
2572 "psrlq $32, %%mm0 \n\t"
2573 "movd %%mm0, 4(%%eax) \n\t" // column 1 -> dst row 1
2574 "movd %%mm3, 4(%%eax, %1) \n\t" // column 2 -> dst row 2
2575 "psrlq $32, %%mm3 \n\t"
2576 "movd %%mm3, 4(%%eax, %1, 2) \n\t" // column 3 -> dst row 3
2577 "movd %%mm2, 4(%0, %1, 4) \n\t" // column 4 -> dst row 4
2578 "psrlq $32, %%mm2 \n\t"
2579 "movd %%mm2, 4(%%ebx) \n\t" // column 5 -> dst row 5
2580 "movd %%mm1, 4(%%ebx, %1) \n\t" // column 6 -> dst row 6
2581 "psrlq $32, %%mm1 \n\t"
2582 "movd %%mm1, 4(%%ebx, %1, 2) \n\t" // column 7 -> dst row 7
2584 :: "r" (dst), "r" (dstStride), "r" (src)
2590 #ifdef HAVE_ODIVX_POSTPROCESS
2591 #include "../opendivx/postprocess.h"
// Forward declaration of the per-plane filter routine defined further down in
// this file; the entry points below call it once per plane, passing 0, 1 or 2
// as isColor.
2595 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2596 QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2598 /* -pp Command line Help
2599 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2601 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2604 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2606 -pp vb:a,hb:a,lb -pp de,-vb
2609 short long name short long option Description
2610 * * a autoq cpu power dependent enabler
2611 c chrom chrominance filtering enabled
2612 y nochrom chrominance filtering disabled
2613 hb hdeblock horizontal deblocking filter
2614 vb vdeblock vertical deblocking filter
2616 h1 x1hdeblock Experimental horizontal deblock filter 1
2617 v1 x1vdeblock Experimental vertical deblock filter 1
2618 dr dering not implemented yet
2619 al autolevels automatic brightness / contrast fixer
2620 f fullyrange stretch luminance range to (0..255)
2621 lb linblenddeint linear blend deinterlacer
2622 li linipoldeint linear interpolating deinterlacer
2623 ci cubicipoldeint cubic interpolating deinterlacer
2624 md mediandeint median deinterlacer
2625 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2626 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2630 * returns a PPMode struct which will have a nonzero error variable if an error occurred
2631 * name is the string after "-pp" on the command line
2632 * quality is a number from 0 to GET_PP_QUALITY_MAX
// Parses a -pp filter string (filters separated by ",", each followed by
// optional ":"-separated options) into a struct PPMode.
//   name    - the filter string; it is copied into the local buffer temp
//             before being tokenized.
//   quality - used as the per-filter quality value q when the autoq / a option
//             is given; otherwise q stays at GET_PP_QUALITY_MAX.
// Returns the PPMode by value. ppMode.error is incremented for every filter
// name that matched neither replaceTable nor filters, and by the number of
// options that were not recognised.
// NOTE(review): strtok keeps hidden static state, so this function is not
// reentrant.
2634 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2636 char temp[GET_MODE_BUFFER_SIZE];
2638 char *filterDelimiters= ",";
2639 char *optionDelimiters= ":";
2640 struct PPMode ppMode= {0,0,0,0,0,0};
// NOTE(review): strncpy leaves temp without a terminating NUL when name is
// GET_MODE_BUFFER_SIZE bytes or longer; no explicit termination follows here -
// confirm callers keep name shorter than the buffer.
2643 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2647 int q= GET_PP_QUALITY_MAX;
2650 char *options[OPTIONS_ARRAY_SIZE];
2653 int numOfUnknownOptions=0;
2654 int enable=1; //does the user want us to enable or disable the filter
// Take the next comma separated filter, then step p past it so that the next
// pass continues behind this token.
2656 filterToken= strtok(p, filterDelimiters);
2657 if(filterToken == NULL) break;
2658 p+= strlen(filterToken) + 1;
2659 filterName= strtok(filterToken, optionDelimiters);
2660 printf("%s::%s\n", filterToken, filterName); // NOTE(review): looks like leftover debug output
// presumably a leading minus sign marks a filter that should be switched off -
// confirm against the branch body
2662 if(*filterName == '-')
2667 for(;;){ //for all options
2668 option= strtok(NULL, optionDelimiters);
2669 if(option == NULL) break;
2671 printf("%s\n", option); // NOTE(review): looks like leftover debug output
2672 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2673 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2674 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
// Options are appended to a NULL-terminated list that the LEVEL_FIX handling
// below scans; presumably only options not matched above end up here.
2677 options[numOfUnknownOptions] = option;
2678 numOfUnknownOptions++;
2679 options[numOfUnknownOptions] = NULL;
2681 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; // list is full
2684 /* replace stuff from the replace Table */
// replaceTable holds pairs: entry 2*i is a name, entry 2*i + 1 the text that
// is spliced into temp at p in its place.
2685 for(i=0; replaceTable[2*i]!=NULL; i++)
2687 if(!strcmp(replaceTable[2*i], filterName))
2689 int newlen= strlen(replaceTable[2*i + 1]);
2693 if(p==NULL) p= temp, *p=0; //last filter
2694 else p--, *p=','; //not last filter
// NOTE(review): casting pointers to int truncates where pointers are wider
// than int; a pointer difference (p - temp) would avoid that.
2697 spaceLeft= (int)p - (int)temp + plen;
2698 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// Make room by moving the remaining text (including its NUL) newlen bytes to
// the right, then copy the replacement in front of it.
2703 memmove(p + newlen, p, plen+1);
2704 memcpy(p, replaceTable[2*i + 1], newlen);
// Look the name up in the filters table (long or short name).
2709 for(i=0; filters[i].shortName!=NULL; i++)
2711 if( !strcmp(filters[i].longName, filterName)
2712 || !strcmp(filters[i].shortName, filterName))
// Clear the filter bit first so that a disabled filter stays off and an
// enabled one is re-evaluated against q below.
2714 ppMode.lumMode &= ~filters[i].mask;
2715 ppMode.chromMode &= ~filters[i].mask;
2718 if(!enable) break; // user wants to disable it
2720 if(q >= filters[i].minLumQuality)
2721 ppMode.lumMode|= filters[i].mask;
// chrom == -1 means no chrom / nochrom option was given: use the table default
2722 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2723 if(q >= filters[i].minChromQuality)
2724 ppMode.chromMode|= filters[i].mask;
2726 if(filters[i].mask == LEVEL_FIX)
// Default allowed luma range; widened to 0..255 by the fullyrange / f option.
2729 ppMode.minAllowedY= 16;
2730 ppMode.maxAllowedY= 234;
2731 for(o=0; options[o]!=NULL; o++)
2732 if( !strcmp(options[o],"fullyrange")
2733 ||!strcmp(options[o],"f"))
2735 ppMode.minAllowedY= 0;
2736 ppMode.maxAllowedY= 255;
2737 numOfUnknownOptions--; // this option was recognised after all
2742 if(!filterNameOk) ppMode.error++;
2743 ppMode.error += numOfUnknownOptions;
// Mirror the deblock / dering bits into the PP_* flags kept in oldMode.
2746 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2747 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2748 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2749 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2750 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2751 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
// Entry point that post-processes a planar picture: plane 0 is filtered at
// full size, planes 1 and 2 at half width and half height.
//   src, dst                        - arrays of plane pointers
//   src_stride, dst_stride          - line strides handed to the plane filter
//   horizontal_size, vertical_size  - size of plane 0
//   QP_store, QP_stride             - quantizer table handed to the plane filter
// NOTE(review): several alternative paths appear below (a PPMode based call
// into postprocess2, the odivx filter, the internal postProcess filter and a
// plain memcpy); confirm which preprocessor conditions select each of them.
2759 void postprocess(unsigned char * src[], int src_stride,
2760 unsigned char * dst[], int dst_stride,
2761 int horizontal_size, int vertical_size,
2762 QP_STORE_T *QP_store, int QP_stride,
// Builds a PPMode from a fixed filter string and forwards to postprocess2.
2768 struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2771 printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error); // NOTE(review): looks like debug output
2772 postprocess2(src, src_stride, dst, dst_stride,
2773 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2778 #ifdef HAVE_ODIVX_POSTPROCESS
2779 // Note: I could make this shit outside of this file, but it would mean one
2780 // more function call...
2782 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
// Plane 0 (isColor = 0) at full size.
2787 postProcess(src[0], src_stride, dst[0], dst_stride,
2788 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
// Planes 1 and 2 are half as wide and half as high.
2790 horizontal_size >>= 1;
2791 vertical_size >>= 1;
// Move bits 4-7 of mode down to bits 0-3 and keep bits 8 and above unchanged;
// presumably the chroma flags live in the upper nibble of the low byte.
2794 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2795 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
2796 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
2800 postProcess(src[1], src_stride, dst[1], dst_stride,
2801 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2802 postProcess(src[2], src_stride, dst[2], dst_stride,
2803 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
// Unfiltered copy of planes 1 and 2.
// NOTE(review): the byte count is src_stride*horizontal_size; a plane of
// vertical_size lines would be stride*vertical_size bytes, and dst_stride is
// not taken into account - confirm.
2807 memcpy(dst[1], src[1], src_stride*horizontal_size);
2808 memcpy(dst[2], src[2], src_stride*horizontal_size);
// Post-processes a planar picture with separate filter masks taken from *mode:
// mode->lumMode for plane 0 and mode->chromMode for planes 1 and 2, which are
// processed at half width and half height.
//   src, dst                        - arrays of plane pointers
//   src_stride, dst_stride          - line strides handed to the plane filter
//   horizontal_size, vertical_size  - size of plane 0
//   QP_store, QP_stride             - quantizer table handed to the plane filter
//   mode                            - filter selection, see getPPModeByNameAndQuality
2812 void postprocess2(unsigned char * src[], int src_stride,
2813 unsigned char * dst[], int dst_stride,
2814 int horizontal_size, int vertical_size,
2815 QP_STORE_T *QP_store, int QP_stride,
2816 struct PPMode *mode)
2819 #ifdef HAVE_ODIVX_POSTPROCESS
2820 // Note: I could make this shit outside of this file, but it would mean one
2821 // more function call...
2823 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
// Plane 0 (isColor = 0) with the luma filter mask.
2829 postProcess(src[0], src_stride, dst[0], dst_stride,
2830 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
// Planes 1 and 2 are half as wide and half as high.
2832 horizontal_size >>= 1;
2833 vertical_size >>= 1;
// NOTE(review): planes 1 and 2 are given the same src_stride / dst_stride as
// plane 0 - confirm that the chroma planes really use the luma stride.
2837 postProcess(src[1], src_stride, dst[1], dst_stride,
2838 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2839 postProcess(src[2], src_stride, dst[2], dst_stride,
2840 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2845 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
// Returns the filter flag set for a quality level by indexing a table with
// 1+GET_PP_QUALITY_MAX entries; each visible entry adds one more filter flag
// to the previous one. When use_old_pp is set (odivx build) the equivalent
// PP_* flag table is used instead.
// NOTE(review): quality is used directly as an array index with no range check
// in sight - confirm callers only pass 0..GET_PP_QUALITY_MAX.
2848 int getPpModeForQuality(int quality){
2849 int modes[1+GET_PP_QUALITY_MAX]= {
2852 // horizontal filters first
2854 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2855 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2856 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2857 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2858 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2860 // vertical filters first
// Alternative ordering: the vertical chroma deblock flag is added one level
// before the horizontal one.
2862 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2863 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2864 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2865 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2866 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2870 #ifdef HAVE_ODIVX_POSTPROCESS
// Same progression expressed with the PP_* flags passed to odivx_postprocess.
2871 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2874 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2875 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2876 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2877 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2878 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2880 if(use_old_pp) return odivx_modes[quality];
2882 return modes[quality];
2886 * Copies a block from src to dst and fixes the blacklevel
2887 * numLines must be a multiple of 4
2888 * levelFix == 0 -> don't touch the brightness & contrast
// Copies numLines lines of a block from src to dst.
// In the asm paths two lines are handled per step: SCALED_CPY rescales every
// byte with the global packedYOffset / packedYScale values, SIMPLE_CPY copies
// the bytes unchanged. The C paths copy BLOCK_SIZE bytes per line with memcpy.
//   dst, dstStride - destination block and its line stride in bytes
//   src, srcStride - source block and its line stride in bytes
//   numLines       - number of lines to copy
//   levelFix       - nonzero selects the level-correcting copy
// NOTE(review): judging by the addressing, the asm operands are presumably
// %0 = src, %1 = dst, %2 = srcStride, %3 = dstStride - confirm against the
// operand lists.
2890 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2891 int numLines, int levelFix)
2900 "leal (%2,%2), %%eax \n\t" // eax = 2 * %2
2901 "leal (%3,%3), %%ebx \n\t" // ebx = 2 * %3
2902 "movq packedYOffset, %%mm2 \n\t"
2903 "movq packedYScale, %%mm3 \n\t"
2904 "pxor %%mm4, %%mm4 \n\t" // mm4 = 0, used to widen bytes to words
// SCALED_CPY handles two lines: widen the 8 bytes of each line to 16-bit
// words, subtract packedYOffset, shift left by 6, keep the high word of the
// signed product with packedYScale, pack back to bytes with unsigned
// saturation and store to (%1) and (%1, %3). %0 is advanced by eax inside the
// macro; %1 is advanced by the addl that follows each use.
2906 #define SCALED_CPY \
2907 "movq (%0), %%mm0 \n\t"\
2908 "movq (%0), %%mm5 \n\t"\
2909 "punpcklbw %%mm4, %%mm0 \n\t"\
2910 "punpckhbw %%mm4, %%mm5 \n\t"\
2911 "psubw %%mm2, %%mm0 \n\t"\
2912 "psubw %%mm2, %%mm5 \n\t"\
2913 "movq (%0,%2), %%mm1 \n\t"\
2914 "psllw $6, %%mm0 \n\t"\
2915 "psllw $6, %%mm5 \n\t"\
2916 "pmulhw %%mm3, %%mm0 \n\t"\
2917 "movq (%0,%2), %%mm6 \n\t"\
2918 "pmulhw %%mm3, %%mm5 \n\t"\
2919 "punpcklbw %%mm4, %%mm1 \n\t"\
2920 "punpckhbw %%mm4, %%mm6 \n\t"\
2921 "psubw %%mm2, %%mm1 \n\t"\
2922 "psubw %%mm2, %%mm6 \n\t"\
2923 "psllw $6, %%mm1 \n\t"\
2924 "psllw $6, %%mm6 \n\t"\
2925 "pmulhw %%mm3, %%mm1 \n\t"\
2926 "pmulhw %%mm3, %%mm6 \n\t"\
2927 "addl %%eax, %0 \n\t"\
2928 "packuswb %%mm5, %%mm0 \n\t"\
2929 "packuswb %%mm6, %%mm1 \n\t"\
2930 "movq %%mm0, (%1) \n\t"\
2931 "movq %%mm1, (%1, %3) \n\t"\
2934 "addl %%ebx, %1 \n\t" // advance %1 by two lines
2936 "addl %%ebx, %1 \n\t" // advance %1 by two lines
2938 "addl %%ebx, %1 \n\t" // advance %1 by two lines
// C path: plain copy of BLOCK_SIZE bytes per line.
// NOTE(review): no level scaling is applied by these statements - confirm
// that this is the intended fallback for levelFix != 0.
2948 for(i=0; i<numLines; i++)
2949 memcpy( &(dst[dstStride*i]),
2950 &(src[srcStride*i]), BLOCK_SIZE);
2957 "movl %4, %%eax \n\t"
2958 "movl %%eax, temp0\n\t" // operand %4 is stored in the global temp0
2961 "leal (%2,%2), %%eax \n\t" // eax = 2 * %2
2962 "leal (%3,%3), %%ebx \n\t" // ebx = 2 * %3
2963 "movq packedYOffset, %%mm2 \n\t" // NOTE(review): mm2 is not referenced by SIMPLE_CPY
2964 "movq packedYScale, %%mm3 \n\t" // NOTE(review): mm3 is not referenced by SIMPLE_CPY
// SIMPLE_CPY copies 8 bytes of two consecutive lines from (%0) and (%0,%2) to
// (%1) and (%1,%3) without modifying them; the pointers are advanced by the
// addl pairs that follow each use.
2966 #define SIMPLE_CPY \
2967 "movq (%0), %%mm0 \n\t"\
2968 "movq (%0,%2), %%mm1 \n\t"\
2969 "movq %%mm0, (%1) \n\t"\
2970 "movq %%mm1, (%1, %3) \n\t"\
2974 "addl %%eax, %0 \n\t" // advance %0 by two lines
2975 "addl %%ebx, %1 \n\t" // advance %1 by two lines
2977 "addl %%eax, %0 \n\t" // advance %0 by two lines
2978 "addl %%ebx, %1 \n\t" // advance %1 by two lines
// C path: plain copy of BLOCK_SIZE bytes per line.
2992 for(i=0; i<numLines; i++)
2993 memcpy( &(dst[dstStride*i]),
2994 &(src[srcStride*i]), BLOCK_SIZE);
3001 * Filters array of bytes (Y or U or V values)
3003 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3004 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
3007 /* we need 64bit here otherwise we´ll going to have a problem
3008 after watching a black picture for 5 hours*/
3009 static uint64_t *yHistogram= NULL;
3010 int black=0, white=255; // blackest black and whitest white in the picture
3011 int QPCorrecture= 256;
3013 /* Temporary buffers for handling the last row(s) */
3014 static uint8_t *tempDst= NULL;
3015 static uint8_t *tempSrc= NULL;
3017 /* Temporary buffers for handling the last block */
3018 static uint8_t *tempDstBlock= NULL;
3019 static uint8_t *tempSrcBlock= NULL;
3021 #ifdef PP_FUNNY_STRIDE
3022 uint8_t *dstBlockPtrBackup;
3023 uint8_t *srcBlockPtrBackup;
3027 long long T0, T1, diffTime=0;
3030 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3037 tempDst= (uint8_t*)memalign(8, 1024*24);
3038 tempSrc= (uint8_t*)memalign(8, 1024*24);
3039 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3040 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3046 yHistogram= (uint64_t*)malloc(8*256);
3047 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3049 if(mode & FULL_Y_RANGE)
3060 static int framenum= -1;
3061 uint64_t maxClipped;
3066 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3068 for(i=0; i<256; i++)
3070 sum+= yHistogram[i];
3071 // printf("%d ", yHistogram[i]);
3075 /* we allways get a completly black picture first */
3076 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3079 for(black=255; black>0; black--)
3081 if(clipped < maxClipped) break;
3082 clipped-= yHistogram[black];
3086 for(white=0; white<256; white++)
3088 if(clipped < maxClipped) break;
3089 clipped-= yHistogram[white];
3092 packedYOffset= (black - minAllowedY) & 0xFFFF;
3093 packedYOffset|= packedYOffset<<32;
3094 packedYOffset|= packedYOffset<<16;
3096 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3098 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3099 packedYScale|= packedYScale<<32;
3100 packedYScale|= packedYScale<<16;
3104 packedYScale= 0x0100010001000100LL;
3108 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
3109 else QPCorrecture= 256;
3111 /* line before the first one */
3114 //1% speedup if these are here instead of the inner loop
3115 uint8_t *srcBlock= &(src[y*srcStride]);
3116 uint8_t *dstBlock= &(dst[y*dstStride]);
3118 dstBlock= tempDst + dstStride;
3120 // From this point on it is guranteed that we can read and write 16 lines downward
3121 // finish 1 block before the next otherwise we´ll might have a problem
3122 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3123 for(x=0; x<width; x+=BLOCK_SIZE)
3128 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3129 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3130 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3131 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3134 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3135 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3136 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3137 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3141 "movl %4, %%eax \n\t"
3142 "shrl $2, %%eax \n\t"
3143 "andl $6, %%eax \n\t"
3144 "addl $8, %%eax \n\t"
3145 "movl %%eax, %%ebx \n\t"
3146 "imul %1, %%eax \n\t"
3147 "imul %3, %%ebx \n\t"
3148 "prefetchnta 32(%%eax, %0) \n\t"
3149 "prefetcht0 32(%%ebx, %2) \n\t"
3150 "addl %1, %%eax \n\t"
3151 "addl %3, %%ebx \n\t"
3152 "prefetchnta 32(%%eax, %0) \n\t"
3153 "prefetcht0 32(%%ebx, %2) \n\t"
3154 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3159 #elif defined(HAVE_3DNOW)
3160 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3161 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3162 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3163 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3164 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3168 blockCopy(dstBlock + dstStride*8, dstStride,
3169 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3171 if(mode & LINEAR_IPOL_DEINT_FILTER)
3172 deInterlaceInterpolateLinear(dstBlock, dstStride);
3173 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3174 deInterlaceBlendLinear(dstBlock, dstStride);
3175 else if(mode & MEDIAN_DEINT_FILTER)
3176 deInterlaceMedian(dstBlock, dstStride);
3177 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3178 deInterlaceInterpolateCubic(dstBlock, dstStride);
3179 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3180 deInterlaceBlendCubic(dstBlock, dstStride);
3185 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride );
3188 for(y=0; y<height; y+=BLOCK_SIZE)
3190 //1% speedup if these are here instead of the inner loop
3191 uint8_t *srcBlock= &(src[y*srcStride]);
3192 uint8_t *dstBlock= &(dst[y*dstStride]);
3194 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3195 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3196 int QPFrac= QPDelta;
3197 uint8_t *tempBlock1= tempBlocks;
3198 uint8_t *tempBlock2= tempBlocks + 8;
3200 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3201 if not than use a temporary buffer */
3204 /* copy from line 8 to 15 of src, these will be copied with
3205 blockcopy to dst later */
3206 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8,
3207 srcStride*MAX(height-y-8, 0) );
3209 /* duplicate last line to fill the void upto line 15 */
3213 for(i=height-y; i<=15; i++)
3214 memcpy(tempSrc + srcStride*i,
3215 src + srcStride*(height-1), srcStride);
3218 /* copy up to 9 lines of dst */
3219 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) );
3220 dstBlock= tempDst + dstStride;
3224 // From this point on it is guranteed that we can read and write 16 lines downward
3225 // finish 1 block before the next otherwise we´ll might have a problem
3226 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3227 for(x=0; x<width; x+=BLOCK_SIZE)
3229 const int stride= dstStride;
3235 "sbbl %%eax, %%eax \n\t"
3236 "shll $2, %%eax \n\t"
3237 "subl %%eax, %0 \n\t"
3238 : "+r" (QPptr), "+m" (QPFrac)
3244 QPs[(y>>3)*QPStride + (x>>3)]:
3245 QPs[(y>>4)*QPStride + (x>>4)];
3249 QP= (QP* QPCorrecture)>>8;
3250 yHistogram[ srcBlock[srcStride*4 + 4] ]++;
3254 "movd %0, %%mm7 \n\t"
3255 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3256 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3257 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3258 "movq %%mm7, pQPb \n\t"
3269 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3270 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3271 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3272 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3275 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3276 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3277 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3278 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3282 "movl %4, %%eax \n\t"
3283 "shrl $2, %%eax \n\t"
3284 "andl $6, %%eax \n\t"
3285 "addl $8, %%eax \n\t"
3286 "movl %%eax, %%ebx \n\t"
3287 "imul %1, %%eax \n\t"
3288 "imul %3, %%ebx \n\t"
3289 "prefetchnta 32(%%eax, %0) \n\t"
3290 "prefetcht0 32(%%ebx, %2) \n\t"
3291 "addl %1, %%eax \n\t"
3292 "addl %3, %%ebx \n\t"
3293 "prefetchnta 32(%%eax, %0) \n\t"
3294 "prefetcht0 32(%%ebx, %2) \n\t"
3295 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3300 #elif defined(HAVE_3DNOW)
3301 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3302 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3303 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3304 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3305 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3309 #ifdef PP_FUNNY_STRIDE
3310 //can we mess with a 8x16 block, if not use a temp buffer, yes again
3314 dstBlockPtrBackup= dstBlock;
3315 srcBlockPtrBackup= srcBlock;
3317 for(i=0;i<BLOCK_SIZE*2; i++)
3319 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3320 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3323 dstBlock= tempDstBlock;
3324 srcBlock= tempSrcBlock;
3328 blockCopy(dstBlock + dstStride*8, dstStride,
3329 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3331 if(mode & LINEAR_IPOL_DEINT_FILTER)
3332 deInterlaceInterpolateLinear(dstBlock, dstStride);
3333 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3334 deInterlaceBlendLinear(dstBlock, dstStride);
3335 else if(mode & MEDIAN_DEINT_FILTER)
3336 deInterlaceMedian(dstBlock, dstStride);
3337 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3338 deInterlaceInterpolateCubic(dstBlock, dstStride);
3339 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3340 deInterlaceBlendCubic(dstBlock, dstStride);
3343 /* only deblock if we have 2 blocks */
3351 if(mode & V_RK1_FILTER)
3352 vertRK1Filter(dstBlock, stride, QP);
3353 else if(mode & V_X1_FILTER)
3354 vertX1Filter(dstBlock, stride, QP);
3355 else if(mode & V_DEBLOCK)
3357 if( isVertDC(dstBlock, stride))
3359 if(isVertMinMaxOk(dstBlock, stride, QP))
3360 doVertLowPass(dstBlock, stride, QP);
3363 doVertDefFilter(dstBlock, stride, QP);
3373 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3375 /* check if we have a previous block to deblock it with dstBlock */
3382 if(mode & H_RK1_FILTER)
3383 vertRK1Filter(tempBlock1, 16, QP);
3384 else if(mode & H_X1_FILTER)
3385 vertX1Filter(tempBlock1, 16, QP);
3386 else if(mode & H_DEBLOCK)
3388 if( isVertDC(tempBlock1, 16))
3390 if(isVertMinMaxOk(tempBlock1, 16, QP))
3391 doVertLowPass(tempBlock1, 16, QP);
3394 doVertDefFilter(tempBlock1, 16, QP);
3397 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3400 if(mode & H_X1_FILTER)
3401 horizX1Filter(dstBlock-4, stride, QP);
3402 else if(mode & H_DEBLOCK)
3404 if( isHorizDC(dstBlock-4, stride))
3406 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3407 doHorizLowPass(dstBlock-4, stride, QP);
3410 doHorizDefFilter(dstBlock-4, stride, QP);
3420 //FIXME filter first line
3421 if(y>0) dering(dstBlock - stride - 8, stride, QP);
3424 else if(mode & DERING)
3426 //FIXME y+15 is required cuz of the tempBuffer thing -> bottom right block isnt filtered
3427 if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3431 #ifdef PP_FUNNY_STRIDE
3432 /* did we use a tmp-block buffer */
3436 dstBlock= dstBlockPtrBackup;
3437 srcBlock= srcBlockPtrBackup;
3439 for(i=0;i<BLOCK_SIZE*2; i++)
3441 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3450 tmpXchg= tempBlock1;
3451 tempBlock1= tempBlock2;
3452 tempBlock2 = tmpXchg;
3456 /* did we use a tmp buffer for the last lines*/
3459 uint8_t *dstBlock= &(dst[y*dstStride]);
3460 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3464 asm volatile("femms");
3465 #elif defined (HAVE_MMX)
3466 asm volatile("emms");
3470 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3471 sumTime= rdtsc() - sumTime;
3473 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
3474 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3475 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)