2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec e e
28 doHorizDefFilter Ec Ec e e
30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a
34 LinIpolDeinterlace e E E*
35 CubicIpolDeinterlace a e e*
36 LinBlendDeinterlace e E E*
37 MedianDeinterlace# Ec Ec
40 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
41 # more or less self-invented filters, so the exactness isn't too meaningful
42 E = Exact implementation
43 e = almost exact implementation (slightly different rounding,...)
44 a = alternative / approximate impl
45 c = checked against the other implementations (-vo md5)
50 verify that everything works as it should (how?)
51 reduce the time wasted on the mem transfer
52 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP
56 write a faster and higher quality deblocking filter :)
57 make the mainloop more flexible (variable number of blocks at once
58 (the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
65 commandline option for the deblock thresholds
69 //Changelog: use the CVS log
71 #include "../config.h"
82 //#define DEBUG_BRIGHTNESS
83 #include "postprocess.h"
// Generic arithmetic helpers. These are function-like macros: every argument is
// substituted textually and may be evaluated more than once, so do not pass
// expressions with side effects (e.g. i++).
85 #define MIN(a,b) ((a) > (b) ? (b) : (a))
86 #define MAX(a,b) ((a) < (b) ? (b) : (a))
87 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// Note: SIGN(0) evaluates to -1, not 0.
88 #define SIGN(a) ((a) > 0 ? 1 : -1)
// Builders for inline-asm instruction strings (AT&T operand order: source first,
// destination second). The #if lines that open each conditional group are not
// visible in this excerpt; only the #elif lines are.
// PAVGB(a,b): byte-wise average of a and b, result left in b. The first variant emits
// "pavgb", the HAVE_3DNOW variant emits "pavgusb".
91 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
92 #elif defined (HAVE_3DNOW)
93 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// PMINUB(x,y,t): unsigned byte-wise minimum of x and y, result left in the second argument.
// The first variant emits a native "pminub" and does not use t.
97 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
98 #elif defined (HAVE_MMX)
// Plain-MMX emulation. Note the parameter names are swapped to (b,a,t):
//   t = a;  t = saturating(t - b);  a = a - t   ==>   a = min(a, b).   t is clobbered.
99 #define PMINUB(b,a,t) \
100 "movq " #a ", " #t " \n\t"\
101 "psubusb " #b ", " #t " \n\t"\
102 "psubb " #t ", " #a " \n\t"
// PMAXUB(a,b): unsigned byte-wise maximum of a and b, result left in b.
// The first variant emits a native "pmaxub".
106 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
107 #elif defined (HAVE_MMX)
// Plain-MMX emulation:  b = saturating(b - a);  b = b + a   ==>   b = max(a, b).
108 #define PMAXUB(a,b) \
109 "psubusb " #a ", " #b " \n\t"\
110 "paddb " #a ", " #b " \n\t"
// Size limits; the code that uses them is not visible in this excerpt.
114 #define GET_MODE_BUFFER_SIZE 500
115 #define OPTIONS_ARRAY_SIZE 10
// 64-bit, 8-byte-aligned values that the inline MMX asm in this file refers to by symbol name.
// packedYOffset starts at 0 and packedYScale at 0x0100 in each 16-bit word
// (presumably a luminance offset/scale pair for brightness correction -- TODO confirm, users not visible here).
118 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
119 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
// wNN: the 16-bit hex value in the name replicated in all four words.
120 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
121 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
122 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
// bmXXXXXXXX: byte masks. Each digit of the name stands for one byte, most significant
// byte first; 1 -> 0xFF, 0 -> 0x00.
123 static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
124 static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
125 static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
126 static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
127 static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
128 static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
129 static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
130 static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
131 static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
132 static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
133 static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
134 static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
// bNN: the byte value 0xNN replicated in all eight bytes.
135 static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
136 static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
137 static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
138 static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
139 static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
140 static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
141 static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
142 static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
143 static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
144 static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
145 static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
146 static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
// Zero-initialised scratch quadwords; their users are not visible in this excerpt.
147 static uint64_t __attribute__((aligned(8))) temp0=0;
148 static uint64_t __attribute__((aligned(8))) temp1=0;
149 static uint64_t __attribute__((aligned(8))) temp2=0;
150 static uint64_t __attribute__((aligned(8))) temp3=0;
151 static uint64_t __attribute__((aligned(8))) temp4=0;
152 static uint64_t __attribute__((aligned(8))) temp5=0;
// pQPb: the asm below loads it as "QP,..., QP" (QP in every byte); it is written elsewhere.
// pQPb2: users not visible in this excerpt.
153 static uint64_t __attribute__((aligned(8))) pQPb=0;
154 static uint64_t __attribute__((aligned(8))) pQPb2=0;
155 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
156 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
// Second, non-aligned and non-volatile definitions of three of the symbols above.
// Presumably the non-MMX branch of a preprocessor conditional whose #if/#else/#endif
// lines are not visible in this excerpt -- TODO confirm.
158 static uint64_t packedYOffset= 0x0000000000000000LL;
159 static uint64_t packedYScale= 0x0100010001000100LL;
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
// Flatness thresholds, both initialised to 40 (56 - 16). isVertDC() reports a block as
// flat when its count of near-equal vertical neighbours exceeds vFlatnessThreshold.
163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16;
166 //amount of "black" you are willing to lose to get a brightness-corrected picture
167 double maxClippedThreshold= 0.01;
// Table of the selectable filters, terminated by an all-NULL/0 entry.
// Each row holds a short name, a long name, three integer fields and a filter mask constant.
// The meaning of the integer fields is defined by struct PPFilter, which is declared
// outside this excerpt.
172 static struct PPFilter filters[]=
174 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
175 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
// NOTE(review): "vr"/"rkvdeblock" reads as a vertical filter but is mapped to H_RK1_FILTER -- confirm this is intended.
176 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
177 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
178 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
179 {"dr", "dering", 1, 5, 6, DERING},
180 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
181 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
185 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
186 {NULL, NULL,0,0,0,0} //End Marker
// Alias table stored as consecutive (name, expansion) string pairs: "default" and "de"
// expand to the same filter list, as do "fast" and "fa" (which use the x1 deblocking filters).
// The end of the table is not visible in this excerpt.
189 static char *replaceTable[]=
191 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
192 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
193 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
194 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
// Mentions every file-scope constant in one expression so that each of them counts as
// "used" (as the name says, to silence unused-variable warnings). The only effect of the
// visible code is that b00 is set to 0 when the sum is 0.
// The line opening the condition is not visible in this excerpt.
199 static inline void unusedVariableWarningFixer()
202 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
203 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
204 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
205 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
206 + temp5 + pQPb== 0) b00=0;
// Executes the x86 "rdtsc" (read time-stamp counter) instruction; declared to return long long.
// The asm operand list and the return statement are not visible in this excerpt.
211 static inline long long rdtsc()
214 asm volatile( "rdtsc\n\t"
217 // printf("%d\n", int(l/1000));
// Thin wrappers that issue one x86 prefetch instruction for the address p
// (prefetchnta / prefetcht0 / prefetcht1 / prefetcht2). They do not return anything.
// The asm operand lists are not visible in this excerpt.
223 static inline void prefetchnta(void *p)
225 asm volatile( "prefetchnta (%0)\n\t"
230 static inline void prefetcht0(void *p)
232 asm volatile( "prefetcht0 (%0)\n\t"
237 static inline void prefetcht1(void *p)
239 asm volatile( "prefetcht1 (%0)\n\t"
244 static inline void prefetcht2(void *p)
246 asm volatile( "prefetcht2 (%0)\n\t"
252 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
254 * Check if the middle 8x8 Block in the given 8x16 block is flat
// Returns 1 if the 8x8 block starting 4 lines below src is "flat", else 0.
// It counts, over the 7 vertically adjacent line pairs and 8 columns, how many pixel pairs
// differ by at most 1, and compares that count with vFlatnessThreshold.
//   src    - top-left pixel of the 8x16 area; advanced by 4 lines before use
//   stride - distance in bytes between two lines
// MMX path: per byte, (difference + 0x7E) > 0x7C as signed bytes holds exactly for wrapped
// differences of -1, 0 and +1; each hit contributes 0xFF (-1), the lanes are summed, and
// "(256 - numEq) & 0xFF" turns the negative sum into a positive count.
// C path: "((a - b + 1) & 0xFFFF) < 3" holds for a - b in {-1, 0, 1}.
// Declarations, preprocessor conditionals and the asm output/clobber lists are not visible
// in this excerpt.
256 static inline int isVertDC(uint8_t src[], int stride){
261 src+= stride*4; // src points to begin of the 8x8 Block
264 "leal (%1, %2), %%eax \n\t"
265 "leal (%%eax, %2, 4), %%ebx \n\t"
266 // 0 1 2 3 4 5 6 7 8 9
267 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
268 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
269 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
270 "movq (%1), %%mm0 \n\t"
271 "movq (%%eax), %%mm1 \n\t"
272 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
273 "paddb %%mm7, %%mm0 \n\t"
274 "pcmpgtb %%mm6, %%mm0 \n\t"
276 "movq (%%eax,%2), %%mm2 \n\t"
277 "psubb %%mm2, %%mm1 \n\t"
278 "paddb %%mm7, %%mm1 \n\t"
279 "pcmpgtb %%mm6, %%mm1 \n\t"
280 "paddb %%mm1, %%mm0 \n\t"
282 "movq (%%eax, %2, 2), %%mm1 \n\t"
283 "psubb %%mm1, %%mm2 \n\t"
284 "paddb %%mm7, %%mm2 \n\t"
285 "pcmpgtb %%mm6, %%mm2 \n\t"
286 "paddb %%mm2, %%mm0 \n\t"
288 "movq (%1, %2, 4), %%mm2 \n\t"
289 "psubb %%mm2, %%mm1 \n\t"
290 "paddb %%mm7, %%mm1 \n\t"
291 "pcmpgtb %%mm6, %%mm1 \n\t"
292 "paddb %%mm1, %%mm0 \n\t"
294 "movq (%%ebx), %%mm1 \n\t"
295 "psubb %%mm1, %%mm2 \n\t"
296 "paddb %%mm7, %%mm2 \n\t"
297 "pcmpgtb %%mm6, %%mm2 \n\t"
298 "paddb %%mm2, %%mm0 \n\t"
300 "movq (%%ebx, %2), %%mm2 \n\t"
301 "psubb %%mm2, %%mm1 \n\t"
302 "paddb %%mm7, %%mm1 \n\t"
303 "pcmpgtb %%mm6, %%mm1 \n\t"
304 "paddb %%mm1, %%mm0 \n\t"
306 "movq (%%ebx, %2, 2), %%mm1 \n\t"
307 "psubb %%mm1, %%mm2 \n\t"
308 "paddb %%mm7, %%mm2 \n\t"
309 "pcmpgtb %%mm6, %%mm2 \n\t"
310 "paddb %%mm2, %%mm0 \n\t"
// Horizontal add of the 8 byte lanes of mm0. Two variants follow (one using pshufw, one
// using 64-bit shifts); the conditionals selecting between them are not visible here.
313 "movq %%mm0, %%mm1 \n\t"
314 "psrlw $8, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t"
317 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
318 "paddb %%mm1, %%mm0 \n\t"
319 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t"
327 "paddb %%mm1, %%mm0 \n\t"
328 "movd %%mm0, %0 \n\t"
330 : "r" (src), "r" (stride)
// Each hit was accumulated as -1 per byte; convert the low byte into a positive count.
334 numEq= (256 - numEq) &0xFF;
// C path: one row pair per iteration, all 8 columns unrolled.
337 for(y=0; y<BLOCK_SIZE-1; y++)
339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
346 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
350 /* if(abs(numEq - asmEq) > 0)
352 printf("\nasm:%d c:%d\n", asmEq, numEq);
353 for(int y=0; y<8; y++)
355 for(int x=0; x<8; x++)
357 printf("%d ", temp[x + y*stride]);
363 // for(int i=0; i<numEq/8; i++) src[i]=255;
364 return (numEq > vFlatnessThreshold) ? 1 : 0;
// Checks, for all 8 columns, whether line 1 and line 8 below src differ by no more than 2*QP
// (src[x + stride] versus src[x + 8*stride]).
// The C loop clears isOk2 as soon as one column differs by more than 2*QP. The MMX path takes
// QP from pQPb instead of the QP argument and moves its result into output operand %0.
// Variable declarations, the asm output/clobber lists and the return statement are not
// visible in this excerpt.
367 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
374 "movq (%1, %2), %%mm0 \n\t" // line 1
375 "movq (%1, %2, 8), %%mm1 \n\t" // line 8
376 "movq %%mm0, %%mm2 \n\t"
377 "psubusb %%mm1, %%mm0 \n\t"
378 "psubusb %%mm2, %%mm1 \n\t"
379 "por %%mm1, %%mm0 \n\t" // ABS Diff
381 "movq pQPb, %%mm7 \n\t" // QP,..., QP
382 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
383 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// Each 32-bit half becomes all-ones iff it was zero; after the 16-bit shift the low dword
// is all-ones only when both halves were, i.e. when all 8 differences were <= 2QP.
384 "pcmpeqd b00, %%mm0 \n\t"
385 "psrlq $16, %%mm0 \n\t"
386 "pcmpeqd bFF, %%mm0 \n\t"
387 // "movd %%mm0, (%1, %2, 4)\n\t"
388 "movd %%mm0, %0 \n\t"
390 : "r" (src), "r" (stride)
398 for(x=0; x<BLOCK_SIZE; x++)
400 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
402 /* if(isOk && !isOk2 || !isOk && isOk2)
404 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
405 for(int y=0; y<9; y++)
407 for(int x=0; x<8; x++)
409 printf("%d ", src[x + y*stride]);
421 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
422 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
// Vertical low-pass over lines 1..8 below src, 8 columns wide, written back in place.
//   src    - top-left pixel of the area; lines 0 and 9 are only read
//   stride - distance in bytes between two lines
//   QP     - threshold for choosing the edge values (the asm reads it from pQPb instead)
// Edge handling: "first" is line 0 when it is close to line 1, otherwise line 1, and "last"
// is line 9 when it is close to line 8, otherwise line 8. The asm accepts a difference
// <= QP, while the C code tests for a difference < QP.
// The asm builds each output from chained PAVGB averages (running weights are shown in the
// trailing comments); the C code computes exact weighted sums.
// Declarations, preprocessor conditionals, the asm clobber list and the per-column pointer
// advance are not visible in this excerpt.
424 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
426 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
428 asm volatile( //"movv %0 %1 %2\n\t"
429 "movq pQPb, %%mm0 \n\t" // QP,..., QP
431 "movq (%0), %%mm6 \n\t"
432 "movq (%0, %1), %%mm5 \n\t"
433 "movq %%mm5, %%mm1 \n\t"
434 "movq %%mm6, %%mm2 \n\t"
435 "psubusb %%mm6, %%mm5 \n\t"
436 "psubusb %%mm1, %%mm2 \n\t"
437 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
438 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
439 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
441 "pand %%mm2, %%mm6 \n\t"
442 "pandn %%mm1, %%mm2 \n\t"
443 "por %%mm2, %%mm6 \n\t"// First Line to Filter
445 "movq (%0, %1, 8), %%mm5 \n\t"
446 "leal (%0, %1, 4), %%eax \n\t"
447 "leal (%0, %1, 8), %%ebx \n\t"
448 "subl %1, %%ebx \n\t"
// NOTE(review): this writes to %0, which is listed as an input operand below; GCC expects
// modified operands to be declared as outputs -- confirm against the full operand list.
449 "addl %1, %0 \n\t" // %0 points to line 1 not 0
450 "movq (%0, %1, 8), %%mm7 \n\t"
451 "movq %%mm5, %%mm1 \n\t"
452 "movq %%mm7, %%mm2 \n\t"
453 "psubusb %%mm7, %%mm5 \n\t"
454 "psubusb %%mm1, %%mm2 \n\t"
455 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
456 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
457 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
459 "pand %%mm2, %%mm7 \n\t"
460 "pandn %%mm1, %%mm2 \n\t"
461 "por %%mm2, %%mm7 \n\t" // Last Line to Filter (line 9 if close to line 8, else line 8)
465 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
470 "movq (%0, %1), %%mm0 \n\t" // 1
471 "movq %%mm0, %%mm1 \n\t" // 1
472 PAVGB(%%mm6, %%mm0) //1 1 /2
473 PAVGB(%%mm6, %%mm0) //3 1 /4
475 "movq (%0, %1, 4), %%mm2 \n\t" // 1
476 "movq %%mm2, %%mm5 \n\t" // 1
477 PAVGB((%%eax), %%mm2) // 11 /2
478 PAVGB((%0, %1, 2), %%mm2) // 211 /4
479 "movq %%mm2, %%mm3 \n\t" // 211 /4
480 "movq (%0), %%mm4 \n\t" // 1
481 PAVGB(%%mm4, %%mm3) // 4 211 /8
482 PAVGB(%%mm0, %%mm3) //642211 /16
483 "movq %%mm3, (%0) \n\t" // X
484 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
485 "movq %%mm1, %%mm0 \n\t" // 1
486 PAVGB(%%mm6, %%mm0) //1 1 /2
487 "movq %%mm4, %%mm3 \n\t" // 1
488 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
489 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
490 PAVGB((%%eax), %%mm5) // 211 /4
491 PAVGB(%%mm5, %%mm3) // 2 2211 /8
492 PAVGB(%%mm0, %%mm3) //4242211 /16
493 "movq %%mm3, (%0,%1) \n\t" // X
494 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
495 PAVGB(%%mm4, %%mm6) //11 /2
496 "movq (%%ebx), %%mm0 \n\t" // 1
497 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
498 "movq %%mm0, %%mm3 \n\t" // 11/2
499 PAVGB(%%mm1, %%mm0) // 2 11/4
500 PAVGB(%%mm6, %%mm0) //222 11/8
501 PAVGB(%%mm2, %%mm0) //22242211/16
502 "movq (%0, %1, 2), %%mm2 \n\t" // 1
503 "movq %%mm0, (%0, %1, 2) \n\t" // X
504 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
505 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
506 PAVGB((%%ebx), %%mm0) // 11 /2
507 PAVGB(%%mm0, %%mm6) //11 11 /4
508 PAVGB(%%mm1, %%mm4) // 11 /2
509 PAVGB(%%mm2, %%mm1) // 11 /2
510 PAVGB(%%mm1, %%mm6) //1122 11 /8
511 PAVGB(%%mm5, %%mm6) //112242211 /16
512 "movq (%%eax), %%mm5 \n\t" // 1
513 "movq %%mm6, (%%eax) \n\t" // X
514 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
515 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
516 PAVGB(%%mm7, %%mm6) // 11 /2
517 PAVGB(%%mm4, %%mm6) // 11 11 /4
518 PAVGB(%%mm3, %%mm6) // 11 2211 /8
519 PAVGB(%%mm5, %%mm2) // 11 /2
520 "movq (%0, %1, 4), %%mm4 \n\t" // 1
521 PAVGB(%%mm4, %%mm2) // 112 /4
522 PAVGB(%%mm2, %%mm6) // 112242211 /16
523 "movq %%mm6, (%0, %1, 4) \n\t" // X
524 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
525 PAVGB(%%mm7, %%mm1) // 11 2 /4
526 PAVGB(%%mm4, %%mm5) // 11 /2
527 PAVGB(%%mm5, %%mm0) // 11 11 /4
528 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
529 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
530 PAVGB(%%mm0, %%mm1) // 11224222 /16
531 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
532 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
533 PAVGB((%%ebx), %%mm2) // 112 4 /8
534 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
535 PAVGB(%%mm0, %%mm6) // 1 1 /2
536 PAVGB(%%mm7, %%mm6) // 1 12 /4
537 PAVGB(%%mm2, %%mm6) // 1122424 /4
538 "movq %%mm6, (%%ebx) \n\t" // X
539 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
540 PAVGB(%%mm7, %%mm5) // 11 2 /4
541 PAVGB(%%mm7, %%mm5) // 11 6 /8
543 PAVGB(%%mm3, %%mm0) // 112 /4
544 PAVGB(%%mm0, %%mm5) // 112246 /16
545 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
549 : "r" (src), "r" (stride)
// C path: lN is the byte offset of line N relative to src.
553 const int l1= stride;
554 const int l2= stride + l1;
555 const int l3= stride + l2;
556 const int l4= stride + l3;
557 const int l5= stride + l4;
558 const int l6= stride + l5;
559 const int l7= stride + l6;
560 const int l8= stride + l7;
561 const int l9= stride + l8;
564 for(x=0; x<BLOCK_SIZE; x++)
// Edge values: take the outer line (0 / 9) only if it is within QP of its inner neighbour.
566 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
567 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[i]: sum of two vertically adjacent samples, with first/last standing in at the ends.
// They are all computed before any line is overwritten.
570 sums[0] = first + src[l1];
571 sums[1] = src[l1] + src[l2];
572 sums[2] = src[l2] + src[l3];
573 sums[3] = src[l3] + src[l4];
574 sums[4] = src[l4] + src[l5];
575 sums[5] = src[l5] + src[l6];
576 sums[6] = src[l6] + src[l7];
577 sums[7] = src[l7] + src[l8];
578 sums[8] = src[l8] + last;
// Each output is a weighted sum whose weights total 16; "+ 8" rounds to nearest before ">>4".
580 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
581 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
582 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
583 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
584 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
585 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
586 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
587 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
596 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
597 * values are correctly clipped (MMX2)
598 * values are wraparound (C)
599 conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
// Vertical deblocking across the boundary between line 4 and line 5 below src, in place.
//   src    - top-left pixel of the area
//   stride - distance in bytes between two lines
//   QP     - quantiser; the asm reads it from pQPb, the C code uses the argument
// Visible asm behaviour: where |l4 - l5| <= QP + (QP+2)/4, a correction (per the existing
// comments, (l5-l4)/2) is added to line 4 and subtracted from line 5, and a quarter of it
// ("(l5-l4)/8") is added to line 3 and subtracted from line 6 with signed saturation.
// The C path uses the threshold QP + (QP>>2); the rest of its loop body, some asm lines and
// the asm clobber list are not visible in this excerpt.
606 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
608 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
612 "pxor %%mm7, %%mm7 \n\t" // 0
613 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
614 "leal (%0, %1), %%eax \n\t"
615 "leal (%%eax, %1, 4), %%ebx \n\t"
616 // 0 1 2 3 4 5 6 7 8 9
617 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
618 "movq pQPb, %%mm0 \n\t" // QP,..., QP
619 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
620 "paddusb b02, %%mm0 \n\t"
621 "psrlw $2, %%mm0 \n\t"
622 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
623 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
624 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
625 "movq (%%ebx), %%mm3 \n\t" // line 5
626 "movq %%mm2, %%mm4 \n\t" // line 4
627 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
628 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
630 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
631 "psubusb %%mm3, %%mm4 \n\t"
632 "psubusb %%mm2, %%mm3 \n\t"
633 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
634 "psubusb %%mm0, %%mm4 \n\t"
635 "pcmpeqb %%mm7, %%mm4 \n\t"
636 "pand %%mm4, %%mm5 \n\t" // d/2
638 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
639 "paddb %%mm5, %%mm2 \n\t"
640 // "psubb %%mm6, %%mm2 \n\t"
641 "movq %%mm2, (%0,%1, 4) \n\t"
643 "movq (%%ebx), %%mm2 \n\t"
644 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
645 "psubb %%mm5, %%mm2 \n\t"
646 // "psubb %%mm6, %%mm2 \n\t"
647 "movq %%mm2, (%%ebx) \n\t"
649 "paddb %%mm6, %%mm5 \n\t"
650 "psrlw $2, %%mm5 \n\t"
651 "pand b3F, %%mm5 \n\t"
652 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
654 "movq (%%eax, %1, 2), %%mm2 \n\t"
655 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
656 "paddsb %%mm5, %%mm2 \n\t"
657 "psubb %%mm6, %%mm2 \n\t"
658 "movq %%mm2, (%%eax, %1, 2) \n\t"
660 "movq (%%ebx, %1), %%mm2 \n\t"
661 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
662 "psubsb %%mm5, %%mm2 \n\t"
663 "psubb %%mm6, %%mm2 \n\t"
664 "movq %%mm2, (%%ebx, %1) \n\t"
667 : "r" (src), "r" (stride)
// C path: lN is the byte offset of line N relative to src.
671 const int l1= stride;
672 const int l2= stride + l1;
673 const int l3= stride + l2;
674 const int l4= stride + l3;
675 const int l5= stride + l4;
676 const int l6= stride + l5;
677 // const int l7= stride + l6;
678 // const int l8= stride + l7;
679 // const int l9= stride + l8;
// Despite the name, this is QP + QP/4 (1.25 * QP), matching the "QP*1.25" asm comment above.
681 const int QP15= QP + (QP>>2);
683 for(x=0; x<BLOCK_SIZE; x++)
// v: step across the block boundary (line 5 minus line 4) in column x
685 const int v = (src[x+l5] - src[x+l4]);
700 * Experimental Filter 1
701 * will not damage linear gradients
702 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
703 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
704 * The MMX2 version does correct clipping, the C version doesn't
// Vertical "X1" deblocking across the boundary between line 4 and line 5 below src, in place.
//   src    - top-left pixel of the area
//   stride - distance in bytes between two lines
//   QP     - quantiser; the asm reads it from pQPb
// Visible asm behaviour: d = max(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2). Where d <= QP, lines 4/5
// are moved towards each other by about 3d/8, lines 3/6 by about d/4 and lines 2/7 by about
// d/8, using saturating byte arithmetic. The direction comes from the sign mask in mm2,
// applied by XOR-ing each line before and after the saturating add or subtract.
// The C path computes the same a/b/c differences; the rest of its loop body, the
// preprocessor conditionals and the asm clobber list are not visible in this excerpt.
706 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
708 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
712 "pxor %%mm7, %%mm7 \n\t" // 0
713 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
714 "leal (%0, %1), %%eax \n\t"
715 "leal (%%eax, %1, 4), %%ebx \n\t"
716 // 0 1 2 3 4 5 6 7 8 9
717 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
718 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
719 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
720 "movq %%mm1, %%mm2 \n\t" // line 4
721 "psubusb %%mm0, %%mm1 \n\t"
722 "psubusb %%mm2, %%mm0 \n\t"
723 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
724 "movq (%%ebx), %%mm3 \n\t" // line 5
725 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
726 "movq %%mm3, %%mm5 \n\t" // line 5
727 "psubusb %%mm4, %%mm3 \n\t"
728 "psubusb %%mm5, %%mm4 \n\t"
729 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
730 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
731 "movq %%mm2, %%mm1 \n\t" // line 4
732 "psubusb %%mm5, %%mm2 \n\t"
733 "movq %%mm2, %%mm4 \n\t"
734 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
735 "psubusb %%mm1, %%mm5 \n\t"
736 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
737 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
738 "movq %%mm4, %%mm3 \n\t" // d
739 "psubusb pQPb, %%mm4 \n\t"
740 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
741 "psubusb b01, %%mm3 \n\t"
742 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
744 PAVGB(%%mm7, %%mm3) // d/2
745 "movq %%mm3, %%mm1 \n\t" // d/2
746 PAVGB(%%mm7, %%mm3) // d/4
747 PAVGB(%%mm1, %%mm3) // 3*d/8
749 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
750 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
751 "psubusb %%mm3, %%mm0 \n\t"
752 "pxor %%mm2, %%mm0 \n\t"
753 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
755 "movq (%%ebx), %%mm0 \n\t" // line 5
756 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
757 "paddusb %%mm3, %%mm0 \n\t"
758 "pxor %%mm2, %%mm0 \n\t"
759 "movq %%mm0, (%%ebx) \n\t" // line 5
761 PAVGB(%%mm7, %%mm1) // d/4
763 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
764 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
765 "psubusb %%mm1, %%mm0 \n\t"
766 "pxor %%mm2, %%mm0 \n\t"
767 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
769 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
770 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
771 "paddusb %%mm1, %%mm0 \n\t"
772 "pxor %%mm2, %%mm0 \n\t"
773 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
775 PAVGB(%%mm7, %%mm1) // d/8
777 "movq (%%eax, %1), %%mm0 \n\t" // line 2
778 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
779 "psubusb %%mm1, %%mm0 \n\t"
780 "pxor %%mm2, %%mm0 \n\t"
781 "movq %%mm0, (%%eax, %1) \n\t" // line 2
783 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
784 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
785 "paddusb %%mm1, %%mm0 \n\t"
786 "pxor %%mm2, %%mm0 \n\t"
787 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
790 : "r" (src), "r" (stride)
// C path: lN is the byte offset of line N relative to src.
795 const int l1= stride;
796 const int l2= stride + l1;
797 const int l3= stride + l2;
798 const int l4= stride + l3;
799 const int l5= stride + l4;
800 const int l6= stride + l5;
801 const int l7= stride + l6;
802 // const int l8= stride + l7;
803 // const int l9= stride + l8;
807 for(x=0; x<BLOCK_SIZE; x++)
// a, b, c: vertical steps l3-l4, l4-l5 (across the block boundary) and l5-l6
809 int a= src[l3] - src[l4];
810 int b= src[l4] - src[l5];
811 int c= src[l5] - src[l6];
// d: how much the boundary step |b| exceeds the mean of its two neighbouring steps
813 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
// v: +d when b < 0, otherwise -d (SIGN(0) is -1)
818 int v = d * SIGN(-b);
// NOTE(review): the lines below look like a second, alternative implementation; it is
// presumably disabled (comment delimiters are not visible in this excerpt) -- TODO confirm.
831 const int l1= stride;
832 const int l2= stride + l1;
833 const int l3= stride + l2;
834 const int l4= stride + l3;
835 const int l5= stride + l4;
836 const int l6= stride + l5;
837 const int l7= stride + l6;
838 const int l8= stride + l7;
839 const int l9= stride + l8;
840 for(int x=0; x<BLOCK_SIZE; x++)
849 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
851 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
852 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
853 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
854 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
863 * Experimental Filter 1 (Horizontal)
864 * will not damage linear gradients
865 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
866 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
867 * The MMX2 version does correct clipping, the C version doesn't
868 * not identical with the vertical one
// Horizontal "X1" deblocking filter. Only its C loop is visible in this excerpt.
//   src    - first pixel of the current row
//   stride - distance in bytes between two rows
//   QP     - quantiser (not used by the visible lines)
// The code that applies v to the pixels and advances src per row is not visible here.
870 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
873 //FIXME (has little in common with the mmx2 version)
874 for(y=0; y<BLOCK_SIZE; y++)
// a, b, c: differences of the pixel pairs (1,2), (3,4) and (5,6) of the current row
876 int a= src[1] - src[2];
877 int b= src[3] - src[4];
878 int c= src[5] - src[6];
// d: how much the middle step |b| exceeds the mean of the two outer steps, clamped at 0
880 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
// v: +d when b < 0, otherwise -d (SIGN(0) is -1)
884 int v = d * SIGN(-b);
899 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
901 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
904 const int l1= stride;
905 const int l2= stride + l1;
906 const int l3= stride + l2;
907 const int l4= (int)tmp - (int)src - stride*3;
908 const int l5= (int)tmp - (int)src - stride*3 + 8;
909 const int l6= stride*3 + l3;
910 const int l7= stride + l6;
911 const int l8= stride + l7;
913 memcpy(tmp, src+stride*7, 8);
914 memcpy(tmp+8, src+stride*8, 8);
919 #if 0 //sligtly more accurate and slightly slower
920 "pxor %%mm7, %%mm7 \n\t" // 0
921 "leal (%0, %1), %%eax \n\t"
922 "leal (%%eax, %1, 4), %%ebx \n\t"
924 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
925 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
928 "movq (%0, %1, 2), %%mm0 \n\t" // l2
929 "movq (%0), %%mm1 \n\t" // l0
930 "movq %%mm0, %%mm2 \n\t" // l2
931 PAVGB(%%mm7, %%mm0) // ~l2/2
932 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
933 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
935 "movq (%%eax), %%mm1 \n\t" // l1
936 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
937 "movq %%mm1, %%mm4 \n\t" // l1
938 PAVGB(%%mm7, %%mm1) // ~l1/2
939 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
940 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
942 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
943 "psubusb %%mm1, %%mm0 \n\t"
944 "psubusb %%mm4, %%mm1 \n\t"
945 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
946 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
948 "movq (%0, %1, 4), %%mm0 \n\t" // l4
949 "movq %%mm0, %%mm4 \n\t" // l4
950 PAVGB(%%mm7, %%mm0) // ~l4/2
951 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
952 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
954 "movq (%%ebx), %%mm2 \n\t" // l5
955 "movq %%mm3, %%mm5 \n\t" // l3
956 PAVGB(%%mm7, %%mm3) // ~l3/2
957 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
958 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
960 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
961 "psubusb %%mm3, %%mm0 \n\t"
962 "psubusb %%mm6, %%mm3 \n\t"
963 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
964 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
965 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
967 "movq (%%ebx, %1), %%mm6 \n\t" // l6
968 "movq %%mm6, %%mm5 \n\t" // l6
969 PAVGB(%%mm7, %%mm6) // ~l6/2
970 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
971 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
973 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
974 "movq %%mm2, %%mm4 \n\t" // l5
975 PAVGB(%%mm7, %%mm2) // ~l5/2
976 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
977 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
979 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
980 "psubusb %%mm2, %%mm6 \n\t"
981 "psubusb %%mm4, %%mm2 \n\t"
982 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
983 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
986 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
987 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
988 "paddusb b01, %%mm4 \n\t"
989 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
990 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
991 "pand %%mm4, %%mm3 \n\t"
993 "movq %%mm3, %%mm1 \n\t"
994 // "psubusb b01, %%mm3 \n\t"
997 "paddusb %%mm1, %%mm3 \n\t"
998 // "paddusb b01, %%mm3 \n\t"
1000 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
1001 "movq (%0, %1, 4), %%mm5 \n\t" //l4
1002 "movq (%0, %1, 4), %%mm4 \n\t" //l4
1003 "psubusb %%mm6, %%mm5 \n\t"
1004 "psubusb %%mm4, %%mm6 \n\t"
1005 "por %%mm6, %%mm5 \n\t" // |l3-l4|
1006 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
1007 "pxor %%mm6, %%mm0 \n\t"
1008 "pand %%mm0, %%mm3 \n\t"
1009 PMINUB(%%mm5, %%mm3, %%mm0)
1011 "psubusb b01, %%mm3 \n\t"
1014 "movq (%%eax, %1, 2), %%mm0 \n\t"
1015 "movq (%0, %1, 4), %%mm2 \n\t"
1016 "pxor %%mm6, %%mm0 \n\t"
1017 "pxor %%mm6, %%mm2 \n\t"
1018 "psubb %%mm3, %%mm0 \n\t"
1019 "paddb %%mm3, %%mm2 \n\t"
1020 "pxor %%mm6, %%mm0 \n\t"
1021 "pxor %%mm6, %%mm2 \n\t"
1022 "movq %%mm0, (%%eax, %1, 2) \n\t"
1023 "movq %%mm2, (%0, %1, 4) \n\t"
1026 "leal (%0, %1), %%eax \n\t"
1027 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
1029 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1030 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1033 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
1034 "movq (%0, %1, 4), %%mm0 \n\t" // l4
1035 "pxor %%mm6, %%mm1 \n\t" // -l3-1
1036 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
1037 // mm1=-l3-1, mm0=128-q
1039 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
1040 "movq (%%eax, %1), %%mm3 \n\t" // l2
1041 "pxor %%mm6, %%mm2 \n\t" // -l5-1
1042 "movq %%mm2, %%mm5 \n\t" // -l5-1
1043 "movq b80, %%mm4 \n\t" // 128
1044 "leal (%%eax, %1, 4), %%ebx \n\t"
1045 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
1046 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
1047 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
1048 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1049 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1051 "movq (%%eax), %%mm2 \n\t" // l1
1052 "pxor %%mm6, %%mm2 \n\t" // -l1-1
1053 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
1054 PAVGB((%0), %%mm1) // (l0-l3+256)/2
1055 "movq b80, %%mm3 \n\t" // 128
1056 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
1057 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
1058 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1059 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1061 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
1062 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
1063 "pxor %%mm6, %%mm1 \n\t" // -l7-1
1064 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
1065 "movq b80, %%mm2 \n\t" // 128
1066 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
1067 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
1068 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1069 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1071 "movq b00, %%mm1 \n\t" // 0
1072 "movq b00, %%mm5 \n\t" // 0
1073 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
1074 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
1075 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
1076 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
1077 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
1079 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1081 "movq b00, %%mm7 \n\t" // 0
1082 "movq pQPb, %%mm2 \n\t" // QP
1083 PAVGB(%%mm6, %%mm2) // 128 + QP/2
1084 "psubb %%mm6, %%mm2 \n\t"
1086 "movq %%mm4, %%mm1 \n\t"
1087 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
1088 "pxor %%mm1, %%mm4 \n\t"
1089 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
1090 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
1091 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1092 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1094 "movq %%mm4, %%mm3 \n\t" // d
1095 "psubusb b01, %%mm4 \n\t"
1096 PAVGB(%%mm7, %%mm4) // d/32
1097 PAVGB(%%mm7, %%mm4) // (d + 32)/64
1098 "paddb %%mm3, %%mm4 \n\t" // 5d/64
1099 "pand %%mm2, %%mm4 \n\t"
1101 "movq b80, %%mm5 \n\t" // 128
1102 "psubb %%mm0, %%mm5 \n\t" // q
1103 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
1104 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
1105 "pxor %%mm7, %%mm5 \n\t"
1107 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
1108 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
1110 "pand %%mm7, %%mm4 \n\t"
1111 "movq (%%eax, %1, 2), %%mm0 \n\t"
1112 "movq (%0, %1, 4), %%mm2 \n\t"
1113 "pxor %%mm1, %%mm0 \n\t"
1114 "pxor %%mm1, %%mm2 \n\t"
1115 "paddb %%mm4, %%mm0 \n\t"
1116 "psubb %%mm4, %%mm2 \n\t"
1117 "pxor %%mm1, %%mm0 \n\t"
1118 "pxor %%mm1, %%mm2 \n\t"
1119 "movq %%mm0, (%%eax, %1, 2) \n\t"
1120 "movq %%mm2, (%0, %1, 4) \n\t"
1123 : "r" (src), "r" (stride)
1131 for(x=0; x<BLOCK_SIZE; x++)
1133 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1134 if(ABS(middleEnergy)< 8*QP)
1136 const int q=(src[l4] - src[l5])/2;
1137 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1138 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1140 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1144 d*= SIGN(-middleEnergy);
1168 int d= src[x+y*stride] - tmp[x+(y-4)*8];
1179 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1183 if(num%1000000 == 0)
1185 printf(" %d %d %d %d\n", num, sum, max, bias);
1191 #elif defined (HAVE_MMX)
1195 "pxor %%mm7, %%mm7 \n\t"
1196 "leal (%0, %1), %%eax \n\t"
1197 "leal (%%eax, %1, 4), %%ebx \n\t"
1199 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1200 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1202 "movq (%0), %%mm0 \n\t"
1203 "movq %%mm0, %%mm1 \n\t"
1204 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1205 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1207 "movq (%%eax), %%mm2 \n\t"
1208 "movq %%mm2, %%mm3 \n\t"
1209 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1210 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1212 "movq (%%eax, %1), %%mm4 \n\t"
1213 "movq %%mm4, %%mm5 \n\t"
1214 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1215 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1217 "paddw %%mm0, %%mm0 \n\t" // 2L0
1218 "paddw %%mm1, %%mm1 \n\t" // 2H0
1219 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1220 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1221 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1222 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1224 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1225 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1226 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1227 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1229 "movq (%%eax, %1, 2), %%mm2 \n\t"
1230 "movq %%mm2, %%mm3 \n\t"
1231 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1232 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1234 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1235 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1236 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1237 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1238 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1239 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1241 "movq (%0, %1, 4), %%mm0 \n\t"
1242 "movq %%mm0, %%mm1 \n\t"
1243 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1244 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1246 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1247 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1248 "movq %%mm2, temp2 \n\t" // L3 - L4
1249 "movq %%mm3, temp3 \n\t" // H3 - H4
1250 "paddw %%mm4, %%mm4 \n\t" // 2L2
1251 "paddw %%mm5, %%mm5 \n\t" // 2H2
1252 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1253 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1255 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1256 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1257 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1258 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1260 "movq (%%ebx), %%mm2 \n\t"
1261 "movq %%mm2, %%mm3 \n\t"
1262 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1263 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1264 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1265 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1266 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1267 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1269 "movq (%%ebx, %1), %%mm6 \n\t"
1270 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1271 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1272 "movq (%%ebx, %1), %%mm6 \n\t"
1273 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1274 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1276 "paddw %%mm0, %%mm0 \n\t" // 2L4
1277 "paddw %%mm1, %%mm1 \n\t" // 2H4
1278 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1279 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1281 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1282 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1283 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1284 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1286 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1287 "movq %%mm2, %%mm3 \n\t"
1288 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1289 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1291 "paddw %%mm2, %%mm2 \n\t" // 2L7
1292 "paddw %%mm3, %%mm3 \n\t" // 2H7
1293 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1294 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1296 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1297 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1300 "movq %%mm7, %%mm6 \n\t" // 0
1301 "psubw %%mm0, %%mm6 \n\t"
1302 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1303 "movq %%mm7, %%mm6 \n\t" // 0
1304 "psubw %%mm1, %%mm6 \n\t"
1305 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1306 "movq %%mm7, %%mm6 \n\t" // 0
1307 "psubw %%mm2, %%mm6 \n\t"
1308 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1309 "movq %%mm7, %%mm6 \n\t" // 0
1310 "psubw %%mm3, %%mm6 \n\t"
1311 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1313 "movq %%mm7, %%mm6 \n\t" // 0
1314 "pcmpgtw %%mm0, %%mm6 \n\t"
1315 "pxor %%mm6, %%mm0 \n\t"
1316 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1317 "movq %%mm7, %%mm6 \n\t" // 0
1318 "pcmpgtw %%mm1, %%mm6 \n\t"
1319 "pxor %%mm6, %%mm1 \n\t"
1320 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1321 "movq %%mm7, %%mm6 \n\t" // 0
1322 "pcmpgtw %%mm2, %%mm6 \n\t"
1323 "pxor %%mm6, %%mm2 \n\t"
1324 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1325 "movq %%mm7, %%mm6 \n\t" // 0
1326 "pcmpgtw %%mm3, %%mm6 \n\t"
1327 "pxor %%mm6, %%mm3 \n\t"
1328 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1332 "pminsw %%mm2, %%mm0 \n\t"
1333 "pminsw %%mm3, %%mm1 \n\t"
1335 "movq %%mm0, %%mm6 \n\t"
1336 "psubusw %%mm2, %%mm6 \n\t"
1337 "psubw %%mm6, %%mm0 \n\t"
1338 "movq %%mm1, %%mm6 \n\t"
1339 "psubusw %%mm3, %%mm6 \n\t"
1340 "psubw %%mm6, %%mm1 \n\t"
1343 "movq %%mm7, %%mm6 \n\t" // 0
1344 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1345 "pxor %%mm6, %%mm4 \n\t"
1346 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1347 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1348 "pxor %%mm7, %%mm5 \n\t"
1349 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1351 "movd %2, %%mm2 \n\t" // QP
1352 "punpcklwd %%mm2, %%mm2 \n\t"
1353 "punpcklwd %%mm2, %%mm2 \n\t"
1354 "psllw $3, %%mm2 \n\t" // 8QP
1355 "movq %%mm2, %%mm3 \n\t" // 8QP
1356 "pcmpgtw %%mm4, %%mm2 \n\t"
1357 "pcmpgtw %%mm5, %%mm3 \n\t"
1358 "pand %%mm2, %%mm4 \n\t"
1359 "pand %%mm3, %%mm5 \n\t"
1362 "psubusw %%mm0, %%mm4 \n\t" // hd
1363 "psubusw %%mm1, %%mm5 \n\t" // ld
1366 "movq w05, %%mm2 \n\t" // 5
1367 "pmullw %%mm2, %%mm4 \n\t"
1368 "pmullw %%mm2, %%mm5 \n\t"
1369 "movq w20, %%mm2 \n\t" // 32
1370 "paddw %%mm2, %%mm4 \n\t"
1371 "paddw %%mm2, %%mm5 \n\t"
1372 "psrlw $6, %%mm4 \n\t"
1373 "psrlw $6, %%mm5 \n\t"
1376 "movq w06, %%mm2 \n\t" // 6
1377 "paddw %%mm2, %%mm4 \n\t"
1378 "paddw %%mm2, %%mm5 \n\t"
1379 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1380 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1381 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1382 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1385 "movq temp2, %%mm0 \n\t" // L3 - L4
1386 "movq temp3, %%mm1 \n\t" // H3 - H4
1388 "pxor %%mm2, %%mm2 \n\t"
1389 "pxor %%mm3, %%mm3 \n\t"
1391 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1392 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1393 "pxor %%mm2, %%mm0 \n\t"
1394 "pxor %%mm3, %%mm1 \n\t"
1395 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1396 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1397 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1398 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1400 "pxor %%mm6, %%mm2 \n\t"
1401 "pxor %%mm7, %%mm3 \n\t"
1402 "pand %%mm2, %%mm4 \n\t"
1403 "pand %%mm3, %%mm5 \n\t"
1406 "pminsw %%mm0, %%mm4 \n\t"
1407 "pminsw %%mm1, %%mm5 \n\t"
1409 "movq %%mm4, %%mm2 \n\t"
1410 "psubusw %%mm0, %%mm2 \n\t"
1411 "psubw %%mm2, %%mm4 \n\t"
1412 "movq %%mm5, %%mm2 \n\t"
1413 "psubusw %%mm1, %%mm2 \n\t"
1414 "psubw %%mm2, %%mm5 \n\t"
1416 "pxor %%mm6, %%mm4 \n\t"
1417 "pxor %%mm7, %%mm5 \n\t"
1418 "psubw %%mm6, %%mm4 \n\t"
1419 "psubw %%mm7, %%mm5 \n\t"
1420 "packsswb %%mm5, %%mm4 \n\t"
1421 "movq (%%eax, %1, 2), %%mm0 \n\t"
1422 "paddb %%mm4, %%mm0 \n\t"
1423 "movq %%mm0, (%%eax, %1, 2) \n\t"
1424 "movq (%0, %1, 4), %%mm0 \n\t"
1425 "psubb %%mm4, %%mm0 \n\t"
1426 "movq %%mm0, (%0, %1, 4) \n\t"
1429 : "r" (src), "r" (stride), "r" (QP)
1433 const int l1= stride;
1434 const int l2= stride + l1;
1435 const int l3= stride + l2;
1436 const int l4= stride + l3;
1437 const int l5= stride + l4;
1438 const int l6= stride + l5;
1439 const int l7= stride + l6;
1440 const int l8= stride + l7;
1441 // const int l9= stride + l8;
1444 for(x=0; x<BLOCK_SIZE; x++)
1446 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1447 if(ABS(middleEnergy) < 8*QP)
1449 const int q=(src[l4] - src[l5])/2;
1450 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1451 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1453 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1457 d*= SIGN(-middleEnergy);
1479 * Check if the given 8x8 Block is mostly "flat"
/*
 * Flatness test in the horizontal direction.
 * Each visible comparison looks at two horizontally adjacent pixels. Adding 1
 * and masking with 0xFFFF maps a difference of -1, 0 or +1 to 0, 1 or 2, while
 * any more negative result wraps to a large value, so the test "< 3" counts
 * the pairs that differ by at most 1. Seven pairs are checked per iteration.
 * Returns non-zero when the count exceeds hFlatnessThreshold.
 * NOTE(review): the declarations of y and numEq and the per-row advance of src
 * are not visible in this excerpt; presumably src moves by stride - confirm.
 */
1481 static inline int isHorizDC(uint8_t src[], int stride)
1485 for(y=0; y<BLOCK_SIZE; y++)
1487 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1488 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1489 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1490 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1491 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1492 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1493 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
// the block is reported as flat only if more than hFlatnessThreshold pairs matched
1496 return numEq > hFlatnessThreshold;
/*
 * Range check for a row of 8 pixels: returns 0 when the first and the last
 * pixel (src[0] and src[7]) differ by more than 2*QP.
 * NOTE(review): the rest of the body (including the value returned when the
 * check passes and any use of stride) is not visible in this excerpt.
 */
1499 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1501 if(abs(src[0] - src[7]) > 2*QP) return 0;
/*
 * Default horizontal deblocking filter (C version).
 * For each of the BLOCK_SIZE rows the filter looks at 8 pixels dst[0..7]; the
 * block edge lies between dst[3] and dst[4] (q is half of their difference).
 * Three "energies" are computed with the 4-tap kernel [2 -5 5 -2]:
 *   leftEnergy   over dst[0..3]
 *   middleEnergy over dst[2..5]  (straddles the edge)
 *   rightEnergy  over dst[4..7]
 * The correction d is derived from |middleEnergy| minus the smaller of the two
 * side energies and only applied when |middleEnergy| < 8*QP.
 * NOTE(review): the lines that clamp and scale d, apply it to the pixels and
 * advance dst to the next row are not visible in this excerpt.
 */
1506 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1509 for(y=0; y<BLOCK_SIZE; y++)
// Kernel [2 -5 5 -2] on dst[2..5]: 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]).
// The previous code subtracted dst[5] in the first term as well, which does
// not match leftEnergy/rightEnergy below nor the vertical filter.
1511 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
1513 if(ABS(middleEnergy) < 8*QP)
1515 const int q=(dst[3] - dst[4])/2;
1516 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1517 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1519 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
// give d the sign opposite to middleEnergy
1523 d*= SIGN(-middleEnergy);
1544 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1545 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
/*
 * Horizontal low pass over one row of 8 pixels dst[0..7], repeated for
 * BLOCK_SIZE rows.
 * first/last are the virtual neighbours outside the row: the real outside
 * pixel (dst[-1] / dst[8]) is used when it differs from the edge pixel by less
 * than QP, otherwise the edge pixel itself is replicated.
 * sums[i] holds the sum of two adjacent samples so that every output can be
 * assembled from a few additions; each output is rounded (+8) and divided by
 * 16 (>>4).
 * NOTE(review): the declarations of y and sums and the per-row advance of dst
 * are not visible in this excerpt.
 */
1547 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1551 for(y=0; y<BLOCK_SIZE; y++)
// pick the padding values for the left and right side of the row
1553 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1554 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// pairwise sums of neighbouring samples (first and last act as dst[-1] and dst[8])
1557 sums[0] = first + dst[0];
1558 sums[1] = dst[0] + dst[1];
1559 sums[2] = dst[1] + dst[2];
1560 sums[3] = dst[2] + dst[3];
1561 sums[4] = dst[3] + dst[4];
1562 sums[5] = dst[4] + dst[5];
1563 sums[6] = dst[5] + dst[6];
1564 sums[7] = dst[6] + dst[7];
1565 sums[8] = dst[7] + last;
// All sums were taken before the first store, so the in-place writes below
// only ever mix original sample values. Near the row ends the taps that would
// fall outside the padded row are replaced by extra copies of first / last.
1567 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1568 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1569 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1570 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1571 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1572 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1573 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1574 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
/*
 * Deringing filter.
 * Visible structure of the SIMD path:
 *   1. pQPb is doubled with saturation and kept in pQPb2.
 *   2. The byte-wise minimum (mm6) and maximum (mm7) over rows 1..8 are
 *      collected with FIND_MIN_MAX and then reduced horizontally.
 *   3. a = average of min and max is broadcast to all bytes and saved in temp0.
 *   4. DERING_CORE is run for 8 destination rows; per row it builds the
 *      horizontally smoothed row (src[-1] + 2*src[0] + src[+1])/4 and the
 *      per-pixel comparisons against a, averages the smoothed rows vertically,
 *      limits the result with pQPb2 around the old pixel value and merges it
 *      into dst under a mask.
 * Visible structure of the C path: min/max search, avg = (min + max + 1)/2,
 * a bit mask of pixels above avg, and a 3x3 (1 2 1 / 2 4 2 / 1 2 1) filter
 * whose result is limited to the range *p +- 2*QP.
 * NOTE(review): several lines (asm statement openers, clobber lists, the
 * preprocessor lines selecting between the two FIND_MIN_MAX variants and the
 * two reduction sequences, and most of the C loops) are not visible here.
 */
1581 static inline void dering(uint8_t src[], int stride, int QP)
1583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// pQPb2 = pQPb + pQPb with unsigned saturation
1585 "movq pQPb, %%mm0 \n\t"
1586 "paddusb %%mm0, %%mm0 \n\t"
1587 "movq %%mm0, pQPb2 \n\t"
// eax = %0 + %1, ebx = eax + 4*%1 (row addressing, see the table below)
1589 "leal (%0, %1), %%eax \n\t"
1590 "leal (%%eax, %1, 4), %%ebx \n\t"
1591 // 0 1 2 3 4 5 6 7 8 9
1592 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm6 = all ones (start value of the running minimum), mm7 = 0 (running maximum)
1594 "pcmpeqb %%mm6, %%mm6 \n\t"
1595 "pxor %%mm7, %%mm7 \n\t"
// FIND_MIN_MAX variant using the native unsigned byte min/max instructions
1597 #define FIND_MIN_MAX(addr)\
1598 "movq " #addr ", %%mm0 \n\t"\
1599 "pminub %%mm0, %%mm6 \n\t"\
1600 "pmaxub %%mm0, %%mm7 \n\t"
// FIND_MIN_MAX variant built from saturating subtract plus add/subtract
1602 #define FIND_MIN_MAX(addr)\
1603 "movq " #addr ", %%mm0 \n\t"\
1604 "movq %%mm6, %%mm1 \n\t"\
1605 "psubusb %%mm0, %%mm7 \n\t"\
1606 "paddb %%mm0, %%mm7 \n\t"\
1607 "psubusb %%mm0, %%mm1 \n\t"\
1608 "psubb %%mm1, %%mm6 \n\t"
// accumulate min/max over rows 1..8
1611 FIND_MIN_MAX((%%eax))
1612 FIND_MIN_MAX((%%eax, %1))
1613 FIND_MIN_MAX((%%eax, %1, 2))
1614 FIND_MIN_MAX((%0, %1, 4))
1615 FIND_MIN_MAX((%%ebx))
1616 FIND_MIN_MAX((%%ebx, %1))
1617 FIND_MIN_MAX((%%ebx, %1, 2))
1618 FIND_MIN_MAX((%0, %1, 8))
// horizontal reduction of the 8 per-column minima in mm6
1620 "movq %%mm6, %%mm4 \n\t"
1621 "psrlq $8, %%mm6 \n\t"
1623 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1624 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1625 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1626 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1627 "pminub %%mm4, %%mm6 \n\t"
1629 "movq %%mm6, %%mm1 \n\t"
1630 "psubusb %%mm4, %%mm1 \n\t"
1631 "psubb %%mm1, %%mm6 \n\t"
1632 "movq %%mm6, %%mm4 \n\t"
1633 "psrlq $16, %%mm6 \n\t"
1634 "movq %%mm6, %%mm1 \n\t"
1635 "psubusb %%mm4, %%mm1 \n\t"
1636 "psubb %%mm1, %%mm6 \n\t"
1637 "movq %%mm6, %%mm4 \n\t"
1638 "psrlq $32, %%mm6 \n\t"
1639 "movq %%mm6, %%mm1 \n\t"
1640 "psubusb %%mm4, %%mm1 \n\t"
1641 "psubb %%mm1, %%mm6 \n\t"
// horizontal reduction of the 8 per-column maxima in mm7
1645 "movq %%mm7, %%mm4 \n\t"
1646 "psrlq $8, %%mm7 \n\t"
1648 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1649 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1650 "pmaxub %%mm4, %%mm7 \n\t"
1651 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1652 "pmaxub %%mm4, %%mm7 \n\t"
1654 "psubusb %%mm4, %%mm7 \n\t"
1655 "paddb %%mm4, %%mm7 \n\t"
1656 "movq %%mm7, %%mm4 \n\t"
1657 "psrlq $16, %%mm7 \n\t"
1658 "psubusb %%mm4, %%mm7 \n\t"
1659 "paddb %%mm4, %%mm7 \n\t"
1660 "movq %%mm7, %%mm4 \n\t"
1661 "psrlq $32, %%mm7 \n\t"
1662 "psubusb %%mm4, %%mm7 \n\t"
1663 "paddb %%mm4, %%mm7 \n\t"
// average of min and max, then replicate the lowest byte into all 8 bytes and keep it in temp0
1665 PAVGB(%%mm6, %%mm7) // a=(max + min)/2
1666 "punpcklbw %%mm7, %%mm7 \n\t"
1667 "punpcklbw %%mm7, %%mm7 \n\t"
1668 "punpcklbw %%mm7, %%mm7 \n\t"
1669 "movq %%mm7, temp0 \n\t"
// Row 0: build the row shifted by one pixel to each side. The byte at offset
// -1 is taken from the dword at -4 (shifted right by 24 bits) and the byte at
// offset 8 from the dword at 8 (shifted left by 56 bits).
1671 "movq (%0), %%mm0 \n\t" // L10
1672 "movq %%mm0, %%mm1 \n\t" // L10
1673 "movq %%mm0, %%mm2 \n\t" // L10
1674 "psllq $8, %%mm1 \n\t"
1675 "psrlq $8, %%mm2 \n\t"
1676 "movd -4(%0), %%mm3 \n\t"
1677 "movd 8(%0), %%mm4 \n\t"
1678 "psrlq $24, %%mm3 \n\t"
1679 "psllq $56, %%mm4 \n\t"
1680 "por %%mm3, %%mm1 \n\t" // L00
1681 "por %%mm4, %%mm2 \n\t" // L20
1682 "movq %%mm1, %%mm3 \n\t" // L00
1683 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1684 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1685 "psubusb %%mm7, %%mm0 \n\t"
1686 "psubusb %%mm7, %%mm2 \n\t"
1687 "psubusb %%mm7, %%mm3 \n\t"
1688 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
1689 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
1690 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
1691 "paddb %%mm2, %%mm0 \n\t"
1692 "paddb %%mm3, %%mm0 \n\t"
// Row 1: same preparation as for row 0, using mm2..mm6
1694 "movq (%%eax), %%mm2 \n\t" // L11
1695 "movq %%mm2, %%mm3 \n\t" // L11
1696 "movq %%mm2, %%mm4 \n\t" // L11
1697 "psllq $8, %%mm3 \n\t"
1698 "psrlq $8, %%mm4 \n\t"
1699 "movd -4(%%eax), %%mm5 \n\t"
1700 "movd 8(%%eax), %%mm6 \n\t"
1701 "psrlq $24, %%mm5 \n\t"
1702 "psllq $56, %%mm6 \n\t"
1703 "por %%mm5, %%mm3 \n\t" // L01
1704 "por %%mm6, %%mm4 \n\t" // L21
1705 "movq %%mm3, %%mm5 \n\t" // L01
1706 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1707 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1708 "psubusb %%mm7, %%mm2 \n\t"
1709 "psubusb %%mm7, %%mm4 \n\t"
1710 "psubusb %%mm7, %%mm5 \n\t"
1711 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
1712 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
1713 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
1714 "paddb %%mm4, %%mm2 \n\t"
1715 "paddb %%mm5, %%mm2 \n\t"
// DERING_CORE arguments:
//   dst            row that is rewritten
//   src            next row, loaded and prepared for the following invocation
//   ppsx, psx, sx  comparison sums of the two previous rows and of src
//   pplx, plx, lx  horizontally smoothed two previous rows and src
//   t0, t1         scratch registers
// temp1 is used to park lx while lx temporarily holds a and then zero.
1717 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1718 "movq " #src ", " #sx " \n\t" /* src[0] */\
1719 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1720 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1721 "psllq $8, " #lx " \n\t"\
1722 "psrlq $8, " #t0 " \n\t"\
1723 "movd -4" #src ", " #t1 " \n\t"\
1724 "psrlq $24, " #t1 " \n\t"\
1725 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1726 "movd 8" #src ", " #t1 " \n\t"\
1727 "psllq $56, " #t1 " \n\t"\
1728 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1729 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1730 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1731 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1733 "movq " #lx ", temp1 \n\t"\
1734 "movq temp0, " #lx " \n\t"\
1735 "psubusb " #lx ", " #t1 " \n\t"\
1736 "psubusb " #lx ", " #t0 " \n\t"\
1737 "psubusb " #lx ", " #sx " \n\t"\
1738 "movq b00, " #lx " \n\t"\
1739 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1740 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1741 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1742 "paddb " #t1 ", " #t0 " \n\t"\
1743 "paddb " #t0 ", " #sx " \n\t"\
1745 PAVGB(plx, pplx) /* filtered */\
1746 "movq " #dst ", " #t0 " \n\t" /* dst */\
1747 "movq " #t0 ", " #t1 " \n\t" /* dst */\
1748 "psubusb pQPb2, " #t0 " \n\t"\
1749 "paddusb pQPb2, " #t1 " \n\t"\
1751 PMINUB(t1, pplx, t0)\
1752 "paddb " #sx ", " #ppsx " \n\t"\
1753 "paddb " #psx ", " #ppsx " \n\t"\
1754 "#paddb b02, " #ppsx " \n\t"\
1755 "pand b08, " #ppsx " \n\t"\
1756 "pcmpeqb " #lx ", " #ppsx " \n\t"\
1757 "pand " #ppsx ", " #pplx " \n\t"\
1758 "pandn " #dst ", " #ppsx " \n\t"\
1759 "por " #pplx ", " #ppsx " \n\t"\
1760 "movq " #ppsx ", " #dst " \n\t"\
1761 "movq temp1, " #lx " \n\t"
// Eight invocations, one per destination row 1..8; the three register groups
// rotate from call to call so that the values prepared for src in one call
// become the previous-row values of the next call.
1778 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1779 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1780 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1781 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1782 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1783 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1784 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1785 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1786 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
// operands: %0 = src, %1 = stride, %2 = QP
1789 : : "r" (src), "r" (stride), "r" (QP)
// C path: track the largest and smallest pixel value seen
1807 if(*p > max) max= *p;
1808 if(*p < min) min= *p;
// threshold halfway between min and max, rounded up
1811 avg= (min + max + 1)/2;
// set bit x of the row mask when the pixel is above the threshold
1820 if(*p > avg) t |= (1<<x);
// keep only bits whose left and right neighbour bits are set as well
1824 t &= (t<<1) & (t>>1);
// combine the masks of the rows above, at and below y
1831 int t = s[y-1] & s[y] & s[y+1];
// 3x3 weighted sum around p, the weights add up to 16
1840 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1841 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1842 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
// limit the change of the pixel to at most 2*QP in either direction
// NOTE(review): the normalisation of f and the final else branch are not visible here
1845 if (*p + 2*QP < f) *p= *p + 2*QP;
1846 else if(*p - 2*QP > f) *p= *p - 2*QP;
1856 * Deinterlaces the given block
1857 * will be called for every 8x8 block and can read & write from line 4-15
1858 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1859 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
/*
 * Linear interpolating deinterlacer.
 * C path: rows 1, 3, 5 and 7 are replaced by the truncated average of the
 * rows directly above and below them (rows 0, 2, 4, 6, 8 are only read).
 * SIMD path: loads the even rows and stores into the odd rows in between.
 * NOTE(review): the averaging steps between the visible loads and stores of
 * the SIMD path, the clobber list and the C loop header are not visible in
 * this excerpt.
 */
1861 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1863 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// eax = %0 + %1, ebx = eax + 4*%1 (row addressing, see the table below)
1866 "leal (%0, %1), %%eax \n\t"
1867 "leal (%%eax, %1, 4), %%ebx \n\t"
1868 // 0 1 2 3 4 5 6 7 8 9
1869 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// rows 0 and 2 are loaded, the result goes to row 1
1871 "movq (%0), %%mm0 \n\t"
1872 "movq (%%eax, %1), %%mm1 \n\t"
1874 "movq %%mm0, (%%eax) \n\t"
// row 4 is loaded, the result goes to row 3
1875 "movq (%0, %1, 4), %%mm0 \n\t"
1877 "movq %%mm1, (%%eax, %1, 2) \n\t"
// row 6 is loaded, the result goes to row 5
1878 "movq (%%ebx, %1), %%mm1 \n\t"
1880 "movq %%mm0, (%%ebx) \n\t"
// row 8 is loaded, the result goes to row 7
1881 "movq (%0, %1, 8), %%mm0 \n\t"
1883 "movq %%mm1, (%%ebx, %1, 2) \n\t"
// operands: %0 = src, %1 = stride
1885 : : "r" (src), "r" (stride)
// C path: each odd row becomes the mean of its two even neighbours
1893 src[stride] = (src[0] + src[stride*2])>>1;
1894 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1895 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1896 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1903 * Deinterlaces the given block
1904 * will be called for every 8x8 block and can read & write from line 4-15
1905 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1906 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1907 * this filter will read lines 3-15 and write 7-13
1908 * no clipping in C version
/*
 * Cubic interpolating deinterlacer.
 * Rows 3, 5, 7 and 9 (relative to src) are recomputed from the two even rows
 * above and the two even rows below with the weights (-1, 9, 9, -1)/16.
 * The C path writes the shifted sum directly into the byte, the SIMD path
 * packs the 16 bit result with unsigned saturation (packuswb).
 * NOTE(review): the asm statement opener and the C loop header are not
 * visible in this excerpt.
 */
1910 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
1912 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// eax = row 1, ebx = row 5, ecx = ebx + 4*%1 + %1 = row 10; mm7 = 0 for unpacking
1915 "leal (%0, %1), %%eax \n\t"
1916 "leal (%%eax, %1, 4), %%ebx \n\t"
1917 "leal (%%ebx, %1, 4), %%ecx \n\t"
1918 "addl %1, %%ecx \n\t"
1919 "pxor %%mm7, %%mm7 \n\t"
1920 // 0 1 2 3 4 5 6 7 8 9 10
1921 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
// DEINT_CUBIC(a,b,c,d,e): a and e are the outer rows, b and d the inner rows,
// c is the row that gets written. Uses mm0..mm3 as scratch and mm7 as zero.
1923 #define DEINT_CUBIC(a,b,c,d,e)\
1924 "movq " #a ", %%mm0 \n\t"\
1925 "movq " #b ", %%mm1 \n\t"\
1926 "movq " #d ", %%mm2 \n\t"\
1927 "movq " #e ", %%mm3 \n\t"\
1928 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1929 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
1930 "movq %%mm0, %%mm2 \n\t"\
1931 "punpcklbw %%mm7, %%mm0 \n\t"\
1932 "punpckhbw %%mm7, %%mm2 \n\t"\
1933 "movq %%mm1, %%mm3 \n\t"\
1934 "punpcklbw %%mm7, %%mm1 \n\t"\
1935 "punpckhbw %%mm7, %%mm3 \n\t"\
1936 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1937 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1938 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1939 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1940 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1941 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1942 "packuswb %%mm3, %%mm1 \n\t"\
1943 "movq %%mm1, " #c " \n\t"
// row 3 from rows 0,2,4,6 / row 5 from 2,4,6,8 / row 7 from 4,6,8,10 / row 9 from 6,8,10,12
1945 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1946 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1947 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1948 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
// operands: %0 = src, %1 = stride; eax, ebx and ecx are declared as clobbered
1950 : : "r" (src), "r" (stride)
1951 : "%eax", "%ebx", "ecx"
// C path: same four rows with the same weights, arithmetic shift by 4
1958 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1959 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1960 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1961 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1968 * Deinterlaces the given block
1969 * will be called for every 8x8 block and can read & write from line 4-15
1970 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1971 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1972 * will shift the image up by 1 line (FIXME if this is a problem)
1973 * this filter will read lines 4-13 and write 4-11
/*
 * Linear blend deinterlacer.
 * C path: rows 0..7 are rewritten in place, top to bottom, as
 * (row[i] + 2*row[i+1] + row[i+2]) >> 2. Every row is written only after the
 * rows above it, and each result reads row i and the two rows below it, so
 * only unmodified input rows are read.
 * SIMD path: the same 1-2-1 blend is assembled from two chained PAVGB
 * averages per row, with mm0..mm2 rotating through the three rows involved.
 * NOTE(review): the asm statement opener, one instruction between the load of
 * L1 and the first store, the clobber list and the C loop header are not
 * visible in this excerpt.
 */
1975 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
1977 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// eax = %0 + %1, ebx = eax + 4*%1 (row addressing, see the table below)
1980 "leal (%0, %1), %%eax \n\t"
1981 "leal (%%eax, %1, 4), %%ebx \n\t"
1982 // 0 1 2 3 4 5 6 7 8 9
1983 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1985 "movq (%0), %%mm0 \n\t" // L0
1986 "movq (%%eax, %1), %%mm1 \n\t" // L2
1987 PAVGB(%%mm1, %%mm0) // L0+L2
1988 "movq (%%eax), %%mm2 \n\t" // L1
1990 "movq %%mm0, (%0) \n\t"
1991 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
1992 PAVGB(%%mm0, %%mm2) // L1+L3
1993 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1994 "movq %%mm2, (%%eax) \n\t"
1995 "movq (%0, %1, 4), %%mm2 \n\t" // L4
1996 PAVGB(%%mm2, %%mm1) // L2+L4
1997 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1998 "movq %%mm1, (%%eax, %1) \n\t"
1999 "movq (%%ebx), %%mm1 \n\t" // L5
2000 PAVGB(%%mm1, %%mm0) // L3+L5
2001 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2002 "movq %%mm0, (%%eax, %1, 2) \n\t"
2003 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2004 PAVGB(%%mm0, %%mm2) // L4+L6
2005 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2006 "movq %%mm2, (%0, %1, 4) \n\t"
2007 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2008 PAVGB(%%mm2, %%mm1) // L5+L7
2009 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2010 "movq %%mm1, (%%ebx) \n\t"
2011 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2012 PAVGB(%%mm1, %%mm0) // L6+L8
2013 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2014 "movq %%mm0, (%%ebx, %1) \n\t"
2015 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2016 PAVGB(%%mm0, %%mm2) // L7+L9
2017 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2018 "movq %%mm2, (%%ebx, %1, 2) \n\t"
// operands: %0 = src, %1 = stride
2021 : : "r" (src), "r" (stride)
// C path: 1-2-1 blend of each row with the two rows below it
2029 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2030 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2031 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2032 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2033 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2034 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2035 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2036 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2043 * Deinterlaces the given block
2044 * will be called for every 8x8 block and can read & write from line 4-15,
2045 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2046 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
/*
 * Median deinterlacer.
 * First SIMD path (pmaxub/pminub): rows 1, 3, 5 and 7 are replaced by the
 * byte-wise median of the row itself and the rows directly above and below,
 * computed as min(max(x, y), max(min(x, y), z)).
 * Second SIMD path (plain MMX): the MEDIAN(a,b,c) macro derives the median
 * from saturating subtractions and compare masks and stores it to b; it is
 * applied to the same four row triples.
 * NOTE(review): the C lines visible at the end are the same 1-2-1 blend as in
 * deInterlaceBlendLinear and not a median - confirm whether this fallback is
 * intentional. The preprocessor line opening the first SIMD path, the asm
 * statement openers and the clobber lists are not visible in this excerpt.
 */
2048 static inline void deInterlaceMedian(uint8_t src[], int stride)
// eax = %0 + %1, ebx = eax + 4*%1 (row addressing, see the table below)
2054 "leal (%0, %1), %%eax \n\t"
2055 "leal (%%eax, %1, 4), %%ebx \n\t"
2056 // 0 1 2 3 4 5 6 7 8 9
2057 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// row 1 = median(row 0, row 1, row 2); row 2 stays in mm2 for the next step
2059 "movq (%0), %%mm0 \n\t" //
2060 "movq (%%eax, %1), %%mm2 \n\t" //
2061 "movq (%%eax), %%mm1 \n\t" //
2062 "movq %%mm0, %%mm3 \n\t"
2063 "pmaxub %%mm1, %%mm0 \n\t" //
2064 "pminub %%mm3, %%mm1 \n\t" //
2065 "pmaxub %%mm2, %%mm1 \n\t" //
2066 "pminub %%mm1, %%mm0 \n\t"
2067 "movq %%mm0, (%%eax) \n\t"
// row 3 = median(row 2, row 3, row 4); row 4 stays in mm0 for the next step
2069 "movq (%0, %1, 4), %%mm0 \n\t" //
2070 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2071 "movq %%mm2, %%mm3 \n\t"
2072 "pmaxub %%mm1, %%mm2 \n\t" //
2073 "pminub %%mm3, %%mm1 \n\t" //
2074 "pmaxub %%mm0, %%mm1 \n\t" //
2075 "pminub %%mm1, %%mm2 \n\t"
2076 "movq %%mm2, (%%eax, %1, 2) \n\t"
// row 5 = median(row 4, row 5, row 6); row 6 stays in mm1 for the next step
2078 "movq (%%ebx), %%mm2 \n\t" //
2079 "movq (%%ebx, %1), %%mm1 \n\t" //
2080 "movq %%mm2, %%mm3 \n\t"
2081 "pmaxub %%mm0, %%mm2 \n\t" //
2082 "pminub %%mm3, %%mm0 \n\t" //
2083 "pmaxub %%mm1, %%mm0 \n\t" //
2084 "pminub %%mm0, %%mm2 \n\t"
2085 "movq %%mm2, (%%ebx) \n\t"
// row 7 = median(row 6, row 7, row 8)
2087 "movq (%%ebx, %1, 2), %%mm2 \n\t" //
2088 "movq (%0, %1, 8), %%mm0 \n\t" //
2089 "movq %%mm2, %%mm3 \n\t"
2090 "pmaxub %%mm0, %%mm2 \n\t" //
2091 "pminub %%mm3, %%mm0 \n\t" //
2092 "pmaxub %%mm1, %%mm0 \n\t" //
2093 "pminub %%mm0, %%mm2 \n\t"
2094 "movq %%mm2, (%%ebx, %1, 2) \n\t"
// operands: %0 = src, %1 = stride
2097 : : "r" (src), "r" (stride)
2101 #else // MMX without MMX2
2103 "leal (%0, %1), %%eax \n\t"
2104 "leal (%%eax, %1, 4), %%ebx \n\t"
2105 // 0 1 2 3 4 5 6 7 8 9
2106 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm7 = 0, used as the comparison value for the pcmpeqb instructions in MEDIAN
2107 "pxor %%mm7, %%mm7 \n\t"
// MEDIAN(a,b,c): loads rows a, b and c, builds three pairwise comparison masks
// with psubusb/pcmpeqb, combines them with pxor/por/pand and stores the result
// to b. Uses mm0..mm6 as scratch and expects mm7 to be zero.
2109 #define MEDIAN(a,b,c)\
2110 "movq " #a ", %%mm0 \n\t"\
2111 "movq " #b ", %%mm2 \n\t"\
2112 "movq " #c ", %%mm1 \n\t"\
2113 "movq %%mm0, %%mm3 \n\t"\
2114 "movq %%mm1, %%mm4 \n\t"\
2115 "movq %%mm2, %%mm5 \n\t"\
2116 "psubusb %%mm1, %%mm3 \n\t"\
2117 "psubusb %%mm2, %%mm4 \n\t"\
2118 "psubusb %%mm0, %%mm5 \n\t"\
2119 "pcmpeqb %%mm7, %%mm3 \n\t"\
2120 "pcmpeqb %%mm7, %%mm4 \n\t"\
2121 "pcmpeqb %%mm7, %%mm5 \n\t"\
2122 "movq %%mm3, %%mm6 \n\t"\
2123 "pxor %%mm4, %%mm3 \n\t"\
2124 "pxor %%mm5, %%mm4 \n\t"\
2125 "pxor %%mm6, %%mm5 \n\t"\
2126 "por %%mm3, %%mm1 \n\t"\
2127 "por %%mm4, %%mm2 \n\t"\
2128 "por %%mm5, %%mm0 \n\t"\
2129 "pand %%mm2, %%mm0 \n\t"\
2130 "pand %%mm1, %%mm0 \n\t"\
2131 "movq %%mm0, " #b " \n\t"
// rows 1, 3, 5 and 7 are rewritten from the triples (0,1,2), (2,3,4), (4,5,6), (6,7,8)
2133 MEDIAN((%0), (%%eax), (%%eax, %1))
2134 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2135 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2136 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
// operands: %0 = src, %1 = stride
2138 : : "r" (src), "r" (stride)
// C path as visible here: 1-2-1 blend of each row with the two rows below it
2148 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2149 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2150 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2151 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2152 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2153 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2154 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2155 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2163 * transposes and shifts the given 8x8 Block into dst1 and dst2
/*
 * Transposes an 8x8 byte block read from src (row pitch srcStride) and stores
 * it into dst1 and dst2.
 * Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2.
 * The block is handled in two halves of four source rows each. Byte and word
 * interleaving turns four source rows into eight 4-byte groups, one per source
 * column. Each group is stored with movd; consecutive columns land 16 bytes
 * apart in the destination:
 *   columns 0..4 -> dst1 at byte offsets 128, 144, 160, 176, 192
 *   columns 3..7 -> dst2 at byte offsets  48,  64,  80,  96, 112
 * (columns 3 and 4 are written to both buffers). The second half, source rows
 * 4..7, uses the same offsets plus 4.
 * NOTE(review): the asm statement opener and the clobber list are not visible
 * in this excerpt.
 */
2165 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
// eax = %0 + %1, ebx = eax + 4*%1 (row addressing, see the table below)
2168 "leal (%0, %1), %%eax \n\t"
2169 "leal (%%eax, %1, 4), %%ebx \n\t"
2170 // 0 1 2 3 4 5 6 7 8 9
2171 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// first half: source rows 0..3
2172 "movq (%0), %%mm0 \n\t" // 12345678
2173 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2174 "movq %%mm0, %%mm2 \n\t" // 12345678
2175 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2176 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2178 "movq (%%eax, %1), %%mm1 \n\t"
2179 "movq (%%eax, %1, 2), %%mm3 \n\t"
2180 "movq %%mm1, %%mm4 \n\t"
2181 "punpcklbw %%mm3, %%mm1 \n\t"
2182 "punpckhbw %%mm3, %%mm4 \n\t"
// word interleave: mm0 = columns 0,1  mm3 = columns 2,3  mm2 = columns 4,5  mm1 = columns 6,7
2184 "movq %%mm0, %%mm3 \n\t"
2185 "punpcklwd %%mm1, %%mm0 \n\t"
2186 "punpckhwd %%mm1, %%mm3 \n\t"
2187 "movq %%mm2, %%mm1 \n\t"
2188 "punpcklwd %%mm4, %%mm2 \n\t"
2189 "punpckhwd %%mm4, %%mm1 \n\t"
// store the low dword, shift the high dword down, store again
2191 "movd %%mm0, 128(%2) \n\t"
2192 "psrlq $32, %%mm0 \n\t"
2193 "movd %%mm0, 144(%2) \n\t"
2194 "movd %%mm3, 160(%2) \n\t"
2195 "psrlq $32, %%mm3 \n\t"
2196 "movd %%mm3, 176(%2) \n\t"
2197 "movd %%mm3, 48(%3) \n\t"
2198 "movd %%mm2, 192(%2) \n\t"
2199 "movd %%mm2, 64(%3) \n\t"
2200 "psrlq $32, %%mm2 \n\t"
2201 "movd %%mm2, 80(%3) \n\t"
2202 "movd %%mm1, 96(%3) \n\t"
2203 "psrlq $32, %%mm1 \n\t"
2204 "movd %%mm1, 112(%3) \n\t"
// second half: source rows 4..7, stored 4 bytes further right
2206 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2207 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2208 "movq %%mm0, %%mm2 \n\t" // 12345678
2209 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2210 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2212 "movq (%%ebx, %1), %%mm1 \n\t"
2213 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2214 "movq %%mm1, %%mm4 \n\t"
2215 "punpcklbw %%mm3, %%mm1 \n\t"
2216 "punpckhbw %%mm3, %%mm4 \n\t"
2218 "movq %%mm0, %%mm3 \n\t"
2219 "punpcklwd %%mm1, %%mm0 \n\t"
2220 "punpckhwd %%mm1, %%mm3 \n\t"
2221 "movq %%mm2, %%mm1 \n\t"
2222 "punpcklwd %%mm4, %%mm2 \n\t"
2223 "punpckhwd %%mm4, %%mm1 \n\t"
2225 "movd %%mm0, 132(%2) \n\t"
2226 "psrlq $32, %%mm0 \n\t"
2227 "movd %%mm0, 148(%2) \n\t"
2228 "movd %%mm3, 164(%2) \n\t"
2229 "psrlq $32, %%mm3 \n\t"
2230 "movd %%mm3, 180(%2) \n\t"
2231 "movd %%mm3, 52(%3) \n\t"
2232 "movd %%mm2, 196(%2) \n\t"
2233 "movd %%mm2, 68(%3) \n\t"
2234 "psrlq $32, %%mm2 \n\t"
2235 "movd %%mm2, 84(%3) \n\t"
2236 "movd %%mm1, 100(%3) \n\t"
2237 "psrlq $32, %%mm1 \n\t"
2238 "movd %%mm1, 116(%3) \n\t"
2241 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2247 * transposes the given 8x8 block
/*
 * Transposes an 8x8 byte block from src into dst.
 * Operands: %0 = dst, %1 = dstStride, %2 = src.
 * The source rows are read at a fixed pitch of 16 bytes (offsets 0, 16, ...,
 * 112 from src); the destination rows are addressed through dstStride.
 * The block is handled in two halves of four source rows each: byte and word
 * interleaving produces one 4-byte group per destination row, which is stored
 * with movd. The first half fills bytes 0..3 of the eight destination rows,
 * the second half fills bytes 4..7.
 * NOTE(review): the asm statement opener and the clobber list are not visible
 * in this excerpt.
 */
2249 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
// eax = %0 + %1, ebx = eax + 4*%1 (destination row addressing, see the table below)
2252 "leal (%0, %1), %%eax \n\t"
2253 "leal (%%eax, %1, 4), %%ebx \n\t"
2254 // 0 1 2 3 4 5 6 7 8 9
2255 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// first half: source rows at offsets 0, 16, 32 and 48
2256 "movq (%2), %%mm0 \n\t" // 12345678
2257 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2258 "movq %%mm0, %%mm2 \n\t" // 12345678
2259 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2260 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2262 "movq 32(%2), %%mm1 \n\t"
2263 "movq 48(%2), %%mm3 \n\t"
2264 "movq %%mm1, %%mm4 \n\t"
2265 "punpcklbw %%mm3, %%mm1 \n\t"
2266 "punpckhbw %%mm3, %%mm4 \n\t"
2268 "movq %%mm0, %%mm3 \n\t"
2269 "punpcklwd %%mm1, %%mm0 \n\t"
2270 "punpckhwd %%mm1, %%mm3 \n\t"
2271 "movq %%mm2, %%mm1 \n\t"
2272 "punpcklwd %%mm4, %%mm2 \n\t"
2273 "punpckhwd %%mm4, %%mm1 \n\t"
// bytes 0..3 of destination rows 0..7
2275 "movd %%mm0, (%0) \n\t"
2276 "psrlq $32, %%mm0 \n\t"
2277 "movd %%mm0, (%%eax) \n\t"
2278 "movd %%mm3, (%%eax, %1) \n\t"
2279 "psrlq $32, %%mm3 \n\t"
2280 "movd %%mm3, (%%eax, %1, 2) \n\t"
2281 "movd %%mm2, (%0, %1, 4) \n\t"
2282 "psrlq $32, %%mm2 \n\t"
2283 "movd %%mm2, (%%ebx) \n\t"
2284 "movd %%mm1, (%%ebx, %1) \n\t"
2285 "psrlq $32, %%mm1 \n\t"
2286 "movd %%mm1, (%%ebx, %1, 2) \n\t"
// second half: source rows at offsets 64, 80, 96 and 112
2289 "movq 64(%2), %%mm0 \n\t" // 12345678
2290 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2291 "movq %%mm0, %%mm2 \n\t" // 12345678
2292 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2293 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2295 "movq 96(%2), %%mm1 \n\t"
2296 "movq 112(%2), %%mm3 \n\t"
2297 "movq %%mm1, %%mm4 \n\t"
2298 "punpcklbw %%mm3, %%mm1 \n\t"
2299 "punpckhbw %%mm3, %%mm4 \n\t"
2301 "movq %%mm0, %%mm3 \n\t"
2302 "punpcklwd %%mm1, %%mm0 \n\t"
2303 "punpckhwd %%mm1, %%mm3 \n\t"
2304 "movq %%mm2, %%mm1 \n\t"
2305 "punpcklwd %%mm4, %%mm2 \n\t"
2306 "punpckhwd %%mm4, %%mm1 \n\t"
// bytes 4..7 of destination rows 0..7
2308 "movd %%mm0, 4(%0) \n\t"
2309 "psrlq $32, %%mm0 \n\t"
2310 "movd %%mm0, 4(%%eax) \n\t"
2311 "movd %%mm3, 4(%%eax, %1) \n\t"
2312 "psrlq $32, %%mm3 \n\t"
2313 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2314 "movd %%mm2, 4(%0, %1, 4) \n\t"
2315 "psrlq $32, %%mm2 \n\t"
2316 "movd %%mm2, 4(%%ebx) \n\t"
2317 "movd %%mm1, 4(%%ebx, %1) \n\t"
2318 "psrlq $32, %%mm1 \n\t"
2319 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
2321 :: "r" (dst), "r" (dstStride), "r" (src)
2326 //static int test=0;
// Temporal noise reducer for one block of 8 rows by 8 bytes.
// The asm path measures how much the src block differs from the same block in tempBlured,
// folds that measure into a running value stored at *tempBluredPast (smoothed with its four
// neighbours), compares it with the maxTmpNoise thresholds and then either copies src into
// tempBlured or blends the two blocks and writes the blend to both.
//   src            - current picture block, rows are stride bytes apart
//   tempBlured     - blurred history block, accessed with the same stride
//   tempBluredPast - points at the history value of this block
//   maxNoise       - NOTE(review): the asm path compares against the global symbol maxTmpNoise,
//                    not this parameter; its use in the C path is not visible in this excerpt
// NOTE(review): brace lines, branch instructions, labels and #else / #endif lines are missing
// from this excerpt, so the exact branch structure must be confirmed against the complete file.
2328 static void inline tempNoiseReducer(uint8_t *src, int stride,
2329 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
// Metric selection: FAST_L2_DIFF picks the second asm variant below (presumably an
// approximation of the squared difference - TODO confirm); L1_DIFF would pick psadbw.
2331 #define FAST_L2_DIFF
2332 //#define L1_DIFF //u should change the thresholds too if u try that one
2333 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// Row offset helpers: eax = 3*stride, ebx = 5*stride, ecx = 7*stride.
2335 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
2336 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride
2337 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2338 // 0 1 2 3 4 5 6 7 8 9
2339 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2
// Variant 1: sum of absolute differences over the 8 rows, accumulated in mm0.
2341 #ifdef L1_DIFF //needs mmx2
2342 "movq (%0), %%mm0 \n\t" // L0
2343 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2344 "movq (%0, %2), %%mm1 \n\t" // L1
2345 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2346 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2347 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2348 "movq (%0, %%eax), %%mm3 \n\t" // L3
2349 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
2351 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2352 "paddw %%mm1, %%mm0 \n\t"
2353 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2354 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2355 "paddw %%mm2, %%mm0 \n\t"
2356 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5|
2357 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2358 "paddw %%mm3, %%mm0 \n\t"
2359 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
2360 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2361 "paddw %%mm4, %%mm0 \n\t"
2362 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
2363 "paddw %%mm5, %%mm6 \n\t"
2364 "paddw %%mm7, %%mm6 \n\t"
2365 "paddw %%mm6, %%mm0 \n\t"
// Variant 2: per row, average a with the bitwise complement of b, add the b80 constant,
// square via pmaddwd and accumulate the shifted sums in mm0 (mm7 = all ones, mm6 = b80).
2366 #elif defined (FAST_L2_DIFF)
2367 "pcmpeqb %%mm7, %%mm7 \n\t"
2368 "movq b80, %%mm6 \n\t"
2369 "pxor %%mm0, %%mm0 \n\t"
2370 #define L2_DIFF_CORE(a, b)\
2371 "movq " #a ", %%mm5 \n\t"\
2372 "movq " #b ", %%mm2 \n\t"\
2373 "pxor %%mm7, %%mm2 \n\t"\
2374 PAVGB(%%mm2, %%mm5)\
2375 "paddb %%mm6, %%mm5 \n\t"\
2376 "movq %%mm5, %%mm2 \n\t"\
2377 "psllw $8, %%mm5 \n\t"\
2378 "pmaddwd %%mm5, %%mm5 \n\t"\
2379 "pmaddwd %%mm2, %%mm2 \n\t"\
2380 "paddd %%mm2, %%mm5 \n\t"\
2381 "psrld $14, %%mm5 \n\t"\
2382 "paddd %%mm5, %%mm0 \n\t"
2384 L2_DIFF_CORE((%0), (%1))
2385 L2_DIFF_CORE((%0, %2), (%1, %2))
2386 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2387 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2388 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2389 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2390 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2391 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
// Variant 3 (presumably the #else branch - the directive is not visible here): widen both
// rows to 16 bit, subtract, and accumulate the squared differences in mm0 (mm7 = 0).
2394 "pxor %%mm7, %%mm7 \n\t"
2395 "pxor %%mm0, %%mm0 \n\t"
2396 #define L2_DIFF_CORE(a, b)\
2397 "movq " #a ", %%mm5 \n\t"\
2398 "movq " #b ", %%mm2 \n\t"\
2399 "movq %%mm5, %%mm1 \n\t"\
2400 "movq %%mm2, %%mm3 \n\t"\
2401 "punpcklbw %%mm7, %%mm5 \n\t"\
2402 "punpckhbw %%mm7, %%mm1 \n\t"\
2403 "punpcklbw %%mm7, %%mm2 \n\t"\
2404 "punpckhbw %%mm7, %%mm3 \n\t"\
2405 "psubw %%mm2, %%mm5 \n\t"\
2406 "psubw %%mm3, %%mm1 \n\t"\
2407 "pmaddwd %%mm5, %%mm5 \n\t"\
2408 "pmaddwd %%mm1, %%mm1 \n\t"\
2409 "paddd %%mm1, %%mm5 \n\t"\
2410 "paddd %%mm5, %%mm0 \n\t"
2412 L2_DIFF_CORE((%0), (%1))
2413 L2_DIFF_CORE((%0, %2), (%1, %2))
2414 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2415 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2416 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2417 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2418 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2419 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
// Add the two 32 bit halves of mm0 and move the total d into ecx.
2423 "movq %%mm0, %%mm4 \n\t"
2424 "psrlq $32, %%mm0 \n\t"
2425 "paddd %%mm0, %%mm4 \n\t"
2426 "movd %%mm4, %%ecx \n\t"
// d = (4*d + past[-1] + past[+1] + past[-256] + past[+256] + 4) >> 3, with past being
// tempBluredPast seen as uint32_t (byte offsets -4, +4, -1024, +1024); the result is
// written back to *tempBluredPast. ebx temporarily holds the tempBluredPast pointer.
2427 "shll $2, %%ecx \n\t"
2428 "movl %3, %%ebx \n\t"
2429 "addl -4(%%ebx), %%ecx \n\t"
2430 "addl 4(%%ebx), %%ecx \n\t"
2431 "addl -1024(%%ebx), %%ecx \n\t"
2432 "addl $4, %%ecx \n\t"
2433 "addl 1024(%%ebx), %%ecx \n\t"
2434 "shrl $3, %%ecx \n\t"
2435 "movl %%ecx, (%%ebx) \n\t"
// Restore ebx to 5*stride (eax is still 3*stride).
2436 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride
2438 // "movl %3, %%ecx \n\t"
2439 // "movl %%ecx, test \n\t"
// Compare d with the second and third threshold (byte offsets 4 and 8 of maxTmpNoise).
// NOTE(review): the conditional jumps that follow each cmpl are not visible in this excerpt.
2441 "cmpl 4+maxTmpNoise, %%ecx \n\t"
2443 "cmpl 8+maxTmpNoise, %%ecx \n\t"
// Case: copy the 8 src rows into tempBlured unchanged (src itself is not modified).
// ecx is recomputed as 7*stride because it was used for d above.
2446 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2447 "movq (%0), %%mm0 \n\t" // L0
2448 "movq (%0, %2), %%mm1 \n\t" // L1
2449 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2450 "movq (%0, %%eax), %%mm3 \n\t" // L3
2451 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2452 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2453 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2454 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2455 "movq %%mm0, (%1) \n\t" // L0
2456 "movq %%mm1, (%1, %2) \n\t" // L1
2457 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2458 "movq %%mm3, (%1, %%eax) \n\t" // L3
2459 "movq %%mm4, (%1, %2, 4) \n\t" // L4
2460 "movq %%mm5, (%1, %%ebx) \n\t" // L5
2461 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
2462 "movq %%mm7, (%1, %%ecx) \n\t" // L7
// Case: rounded byte average of src and tempBlured (pavgb), written to both buffers.
2466 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2467 "movq (%0), %%mm0 \n\t" // L0
2468 "pavgb (%1), %%mm0 \n\t" // L0
2469 "movq (%0, %2), %%mm1 \n\t" // L1
2470 "pavgb (%1, %2), %%mm1 \n\t" // L1
2471 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2472 "pavgb (%1, %2, 2), %%mm2 \n\t" // L2
2473 "movq (%0, %%eax), %%mm3 \n\t" // L3
2474 "pavgb (%1, %%eax), %%mm3 \n\t" // L3
2475 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2476 "pavgb (%1, %2, 4), %%mm4 \n\t" // L4
2477 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2478 "pavgb (%1, %%ebx), %%mm5 \n\t" // L5
2479 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2480 "pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6
2481 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2482 "pavgb (%1, %%ecx), %%mm7 \n\t" // L7
2483 "movq %%mm0, (%1) \n\t" // R0
2484 "movq %%mm1, (%1, %2) \n\t" // R1
2485 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2486 "movq %%mm3, (%1, %%eax) \n\t" // R3
2487 "movq %%mm4, (%1, %2, 4) \n\t" // R4
2488 "movq %%mm5, (%1, %%ebx) \n\t" // R5
2489 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
2490 "movq %%mm7, (%1, %%ecx) \n\t" // R7
2491 "movq %%mm0, (%0) \n\t" // L0
2492 "movq %%mm1, (%0, %2) \n\t" // L1
2493 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2494 "movq %%mm3, (%0, %%eax) \n\t" // L3
2495 "movq %%mm4, (%0, %2, 4) \n\t" // L4
2496 "movq %%mm5, (%0, %%ebx) \n\t" // L5
2497 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
2498 "movq %%mm7, (%0, %%ecx) \n\t" // L7
// Compare d with the first threshold (maxTmpNoise at offset 0); the jump is not visible here.
2502 "cmpl maxTmpNoise, %%ecx \n\t"
// Case: blend rows 0-3 and then rows 4-7 of src (mm0-mm3) with tempBlured (mm4-mm7) and
// store the result to both buffers.
// NOTE(review): the blend instructions between the loads and the stores are missing from
// this excerpt; presumably this matches the (ref*3 + cur + 2)>>2 C fallback - confirm.
2505 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2506 "movq (%0), %%mm0 \n\t" // L0
2507 "movq (%0, %2), %%mm1 \n\t" // L1
2508 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2509 "movq (%0, %%eax), %%mm3 \n\t" // L3
2510 "movq (%1), %%mm4 \n\t" // R0
2511 "movq (%1, %2), %%mm5 \n\t" // R1
2512 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2513 "movq (%1, %%eax), %%mm7 \n\t" // R3
2522 "movq %%mm0, (%1) \n\t" // R0
2523 "movq %%mm1, (%1, %2) \n\t" // R1
2524 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2525 "movq %%mm3, (%1, %%eax) \n\t" // R3
2526 "movq %%mm0, (%0) \n\t" // L0
2527 "movq %%mm1, (%0, %2) \n\t" // L1
2528 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2529 "movq %%mm3, (%0, %%eax) \n\t" // L3
2531 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2532 "movq (%0, %%ebx), %%mm1 \n\t" // L5
2533 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2534 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2535 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2536 "movq (%1, %%ebx), %%mm5 \n\t" // R5
2537 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2538 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2547 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2548 "movq %%mm1, (%1, %%ebx) \n\t" // R5
2549 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2550 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2551 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2552 "movq %%mm1, (%0, %%ebx) \n\t" // L5
2553 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2554 "movq %%mm3, (%0, %%ecx) \n\t" // L7
// Case: same load / blend / store layout as above. More source lines are missing between the
// loads and the stores here, presumably a stronger blend such as (ref*7 + cur + 4)>>3 - confirm.
2558 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2559 "movq (%0), %%mm0 \n\t" // L0
2560 "movq (%0, %2), %%mm1 \n\t" // L1
2561 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2562 "movq (%0, %%eax), %%mm3 \n\t" // L3
2563 "movq (%1), %%mm4 \n\t" // R0
2564 "movq (%1, %2), %%mm5 \n\t" // R1
2565 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2566 "movq (%1, %%eax), %%mm7 \n\t" // R3
2579 "movq %%mm0, (%1) \n\t" // R0
2580 "movq %%mm1, (%1, %2) \n\t" // R1
2581 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2582 "movq %%mm3, (%1, %%eax) \n\t" // R3
2583 "movq %%mm0, (%0) \n\t" // L0
2584 "movq %%mm1, (%0, %2) \n\t" // L1
2585 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2586 "movq %%mm3, (%0, %%eax) \n\t" // L3
2588 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2589 "movq (%0, %%ebx), %%mm1 \n\t" // L5
2590 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2591 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2592 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2593 "movq (%1, %%ebx), %%mm5 \n\t" // R5
2594 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2595 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2608 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2609 "movq %%mm1, (%1, %%ebx) \n\t" // R5
2610 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2611 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2612 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2613 "movq %%mm1, (%0, %%ebx) \n\t" // L5
2614 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2615 "movq %%mm3, (%0, %%ecx) \n\t" // L7
// asm operands: %0 = src, %1 = tempBlured, %2 = stride, %3 = tempBluredPast (memory operand)
2619 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2620 : "%eax", "%ebx", "%ecx", "memory"
2622 //printf("%d\n", test);
// C fallback (presumably the #else branch of the HAVE_MMX2 / HAVE_3DNOW test above).
// NOTE(review): the x / y loops and the difference accumulation are not visible here.
2634 int ref= tempBlured[ x + y*stride ];
2635 int cur= src[ x + y*stride ];
2637 // if(x==0 || x==7) d1+= d1>>1;
2638 // if(y==0 || y==7) d1+= d1>>1;
// Neighbour history values: same offsets as the asm path (256 uint32_t = 1024 bytes).
2647 +(*(tempBluredPast-256))
2648 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2649 +(*(tempBluredPast+256))
2652 // ((*tempBluredPast)*3 + d + 2)>>2;
2654 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2658 64 32 16 8 4 2 1 (1)
2659 64 48 36 27 20 15 11 (33) (approx)
2660 64 56 49 43 37 33 29 (200) (approx)
2671 int ref= tempBlured[ x + y*stride ];
2672 int cur= src[ x + y*stride ];
2673 tempBlured[ x + y*stride ]=
2674 src[ x + y*stride ]=
// Plain copy of the current pixel into the history buffer.
2686 tempBlured[ x + y*stride ]= src[ x + y*stride ];
// Blend with weight 7 on the history and 1 on the current pixel, written to both buffers.
2700 int ref= tempBlured[ x + y*stride ];
2701 int cur= src[ x + y*stride ];
2702 tempBlured[ x + y*stride ]=
2703 src[ x + y*stride ]=
2704 (ref*7 + cur + 4)>>3;
// Blend with weight 3 on the history and 1 on the current pixel, written to both buffers.
2715 int ref= tempBlured[ x + y*stride ];
2716 int cur= src[ x + y*stride ];
2717 tempBlured[ x + y*stride ]=
2718 src[ x + y*stride ]=
2719 (ref*3 + cur + 2)>>2;
// Pull in the old OpenDivX postprocessing interface only when it is compiled in.
2727 #ifdef HAVE_ODIVX_POSTPROCESS
2728 #include "../opendivx/postprocess.h"
// Forward declaration: postProcess() filters a single plane and is defined further down
// in this file, after the public entry points that call it.
2732 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2733 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2735 /* -pp Command line Help
2736 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2738 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2741 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2743 -pp vb:a,hb:a,lb -pp de,-vb
2748 short long name short long option Description
2749 * * a autoq cpu power dependent enabler
2750 c chrom chrominance filtering enabled
2751 y nochrom chrominance filtering disabled
2752 hb hdeblock horizontal deblocking filter
2753 vb vdeblock vertical deblocking filter
2755 h1 x1hdeblock Experimental horizontal deblock filter 1
2756 v1 x1vdeblock Experimental vertical deblock filter 1
2757 dr dering not implemented yet
2758 al autolevels automatic brightness / contrast fixer
2759 f fullyrange stretch luminance range to (0..255)
2760 lb linblenddeint linear blend deinterlacer
2761 li linipoldeint linear interpolating deinterlacer
2762 ci cubicipoldeint cubic interpolating deinterlacer
2763 md mediandeint median deinterlacer
2764 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2765 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2766 tn tmpnoise (3 Thresholds) Temporal Noise Reducer
2770 * returns a PPMode struct which will have a non-zero error variable if an error occurred
2771 * name is the string after "-pp" on the command line
2772 * quality is a number from 0 to GET_PP_QUALITY_MAX
// Parses a -pp filter string into a PPMode.
//   name    - comma separated filters, each optionally followed by colon separated options
//   quality - used as the quality level q for filters that carry the autoq / a option
// Returns the PPMode by value; ppMode.error is incremented for every unknown filter name,
// every option that no filter consumed, and every alias expansion that would not fit.
// The string is copied into temp first because strtok() writes into the buffer it scans.
// NOTE(review): several declarations and brace lines of this function are not visible in
// this excerpt.
2774 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2776 char temp[GET_MODE_BUFFER_SIZE];
2778 char *filterDelimiters= ",";
2779 char *optionDelimiters= ":";
2780 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
// NOTE(review): strncpy leaves temp without a terminating NUL when strlen(name) is
// GET_MODE_BUFFER_SIZE or more; the strtok / strlen calls below would then run past temp.
2783 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
// Debug output of the raw filter string.
2785 printf("%s\n", name);
// q defaults to a huge value so the minLumQuality / minChromQuality checks below always
// pass unless autoq replaces it with the quality argument.
2789 int q= 1000000; //GET_PP_QUALITY_MAX;
2792 char *options[OPTIONS_ARRAY_SIZE];
2795 int numOfUnknownOptions=0;
2796 int enable=1; //does the user want us to enabled or disabled the filter
// strtok is restarted from p on every pass because the nested strtok(filterToken, ...)
// call below replaces its saved scan position.
2798 filterToken= strtok(p, filterDelimiters);
2799 if(filterToken == NULL) break;
2800 p+= strlen(filterToken) + 1; // p points to next filterToken
2801 filterName= strtok(filterToken, optionDelimiters);
2802 printf("%s::%s\n", filterToken, filterName);
// A leading minus sign presumably clears enable and is skipped - the body of this if is
// not visible here.
2804 if(*filterName == '-')
2810 for(;;){ //for all options
2811 option= strtok(NULL, optionDelimiters);
2812 if(option == NULL) break;
2814 printf("%s\n", option);
// Generic options are handled here; everything else is collected in options[] so the
// matching filter can consume it further down.
2815 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2816 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2817 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2820 options[numOfUnknownOptions] = option;
2821 numOfUnknownOptions++;
// Stop one slot early so the NULL terminator written after the loop still fits.
2823 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2825 options[numOfUnknownOptions] = NULL;
2827 /* replace stuff from the replace Table */
// replaceTable holds (alias, replacement) string pairs terminated by a NULL alias. A match
// splices the replacement text into temp at p so the outer loop parses it as further filters.
2828 for(i=0; replaceTable[2*i]!=NULL; i++)
2830 if(!strcmp(replaceTable[2*i], filterName))
2832 int newlen= strlen(replaceTable[2*i + 1]);
// NOTE(review): no visible statement ever sets p to NULL, so the first branch looks
// unreachable - confirm against the complete file.
2836 if(p==NULL) p= temp, *p=0; //last filter
2837 else p--, *p=','; //not last filter
// NOTE(review): casting pointers to int truncates on targets with 64 bit pointers; the
// pointer difference p - temp would express the same offset.
2840 spaceLeft= (int)p - (int)temp + plen;
2841 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// Shift the unparsed tail (including its NUL) right and copy the replacement in front of it.
2846 memmove(p + newlen, p, plen+1);
2847 memcpy(p, replaceTable[2*i + 1], newlen);
// Look the name up in the filters table (terminated by a NULL shortName).
2852 for(i=0; filters[i].shortName!=NULL; i++)
2854 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
2855 if( !strcmp(filters[i].longName, filterName)
2856 || !strcmp(filters[i].shortName, filterName))
// Clear the filter bits first; they are set again below only if the filter is enabled
// and q is high enough.
2858 ppMode.lumMode &= ~filters[i].mask;
2859 ppMode.chromMode &= ~filters[i].mask;
2862 if(!enable) break; // user wants to disable it
2864 if(q >= filters[i].minLumQuality)
2865 ppMode.lumMode|= filters[i].mask;
// chrom == -1 means the user gave neither chrom nor nochrom, so the table default applies.
2866 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2867 if(q >= filters[i].minChromQuality)
2868 ppMode.chromMode|= filters[i].mask;
// Level fix: allowed luma range defaults to 16..234 and is widened to 0..255 by the
// fullyrange / f option; a consumed option no longer counts as unknown.
2870 if(filters[i].mask == LEVEL_FIX)
2873 ppMode.minAllowedY= 16;
2874 ppMode.maxAllowedY= 234;
2875 for(o=0; options[o]!=NULL; o++)
2876 if( !strcmp(options[o],"fullyrange")
2877 ||!strcmp(options[o],"f"))
2879 ppMode.minAllowedY= 0;
2880 ppMode.maxAllowedY= 255;
2881 numOfUnknownOptions--;
// Temporal noise reducer: defaults 150 / 200 / 400, overridden by up to three numeric options.
2884 else if(filters[i].mask == TEMP_NOISE_FILTER)
2888 ppMode.maxTmpNoise[0]= 150;
2889 ppMode.maxTmpNoise[1]= 200;
2890 ppMode.maxTmpNoise[2]= 400;
2892 for(o=0; options[o]!=NULL; o++)
// NOTE(review): the strtol result is stored before the tail check, so a non numeric option
// overwrites the current default threshold with 0.
2895 ppMode.maxTmpNoise[numOfNoises]=
2896 strtol(options[o], &tail, 0);
2897 if(tail!=options[o])
2900 numOfUnknownOptions--;
2901 if(numOfNoises >= 3) break;
// Unknown filter names and unconsumed options are reported through the error counter.
2907 if(!filterNameOk) ppMode.error++;
2908 ppMode.error += numOfUnknownOptions;
// Mirror the deblock / dering selection into the flag set of the old OpenDivX postprocessor.
2911 #ifdef HAVE_ODIVX_POSTPROCESS
2912 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2913 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2914 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2915 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2916 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2917 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2924 * Obsolete, do not use it, use postprocess2() instead
// Legacy entry point: builds a PPMode from a flag word and runs postProcess() on the three
// planes in src[0..2] / dst[0..2].
// NOTE(review): the last parameter line, the local declarations and all if / else / brace
// lines are missing from this excerpt, so the guards described below are assumptions.
2926 void postprocess(unsigned char * src[], int src_stride,
2927 unsigned char * dst[], int dst_stride,
2928 int horizontal_size, int vertical_size,
2929 QP_STORE_T *QP_store, int QP_stride,
2932 struct PPMode ppMode;
// All zero QP table used as a stand-in further down.
2933 static QP_STORE_T zeroArray[2048/8];
// Hard coded filter string routed through postprocess2(); looks like a test path.
// The condition guarding it is not visible here.
2937 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
2941 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
2942 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
2943 postprocess2(src, src_stride, dst, dst_stride,
2944 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
// Presumably only done when the caller passed no QP table - guard not visible.
2950 QP_store= zeroArray;
// Luma uses the flag word as is. For chroma the high nibble of the low byte is shifted
// down into the low nibble while bits 8 and above are kept unchanged.
2954 ppMode.lumMode= mode;
2955 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2956 ppMode.chromMode= mode;
2957 ppMode.maxTmpNoise[0]= 700;
2958 ppMode.maxTmpNoise[1]= 1500;
2959 ppMode.maxTmpNoise[2]= 3000;
2961 #ifdef HAVE_ODIVX_POSTPROCESS
2962 // Note: I could make this shit outside of this file, but it would mean one
2963 // more function call...
2965 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
// Plane 0 (luma) at full size.
2970 postProcess(src[0], src_stride, dst[0], dst_stride,
2971 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
// The two chroma planes are processed at half width and half height.
2973 horizontal_size >>= 1;
2974 vertical_size >>= 1;
2980 postProcess(src[1], src_stride, dst[1], dst_stride,
2981 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
2982 postProcess(src[2], src_stride, dst[2], dst_stride,
2983 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
// Alternative path: fill both chroma planes with the constant 128 (presumably neutral
// chroma); the condition selecting it is not visible here.
2987 memset(dst[1], 128, dst_stride*vertical_size);
2988 memset(dst[2], 128, dst_stride*vertical_size);
2989 // memcpy(dst[1], src[1], src_stride*horizontal_size);
2990 // memcpy(dst[2], src[2], src_stride*horizontal_size);
// Postprocesses one picture: runs postProcess() on plane 0 at full size and on planes 1 and 2
// at half width and half height, using the filter selection in *mode.
// NOTE(review): brace lines, guards and the tail of the odivx_postprocess call are missing
// from this excerpt.
2994 void postprocess2(unsigned char * src[], int src_stride,
2995 unsigned char * dst[], int dst_stride,
2996 int horizontal_size, int vertical_size,
2997 QP_STORE_T *QP_store, int QP_stride,
2998 struct PPMode *mode)
// All zero QP table; presumably substituted only when QP_store is NULL - guard not visible.
3001 static QP_STORE_T zeroArray[2048/8];
3004 QP_store= zeroArray;
3008 #ifdef HAVE_ODIVX_POSTPROCESS
3009 // Note: I could make this shit outside of this file, but it would mean one
3010 // more function call...
3012 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
// Plane 0 (isColor argument 0).
3018 postProcess(src[0], src_stride, dst[0], dst_stride,
3019 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
// Planes 1 and 2 use half the width and height.
3021 horizontal_size >>= 1;
3022 vertical_size >>= 1;
3026 postProcess(src[1], src_stride, dst[1], dst_stride,
3027 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3028 postProcess(src[2], src_stride, dst[2], dst_stride,
3029 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
3034 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
// Maps a quality level to a set of filter flags by indexing a lookup table; each higher
// level adds one more deblock / dering flag to the previous one.
// NOTE(review): quality is used as an array index without a range check; callers must keep
// it within 0..GET_PP_QUALITY_MAX.
3037 int getPpModeForQuality(int quality){
3038 int modes[1+GET_PP_QUALITY_MAX]= {
// Two orderings of the same table follow; presumably a preprocessor switch that is not
// visible in this excerpt selects one of them.
3041 // horizontal filters first
3043 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3044 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3045 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3046 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3047 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
3049 // vertical filters first
3051 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3052 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3053 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3054 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3055 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
// Equivalent table in the flag set of the old OpenDivX postprocessor, returned instead
// when use_old_pp is set.
3059 #ifdef HAVE_ODIVX_POSTPROCESS
3060 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3063 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3064 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3065 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3066 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3067 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3069 if(use_old_pp) return odivx_modes[quality];
3071 return modes[quality];
3075 * Copies a block from src to dst and fixes the black level
3076 * numLines must be a multiple of 4
3077 * levelFix == 0 -> do not touch the brightness & contrast
// Copies rows of 8 bytes from src to dst, either with the level correction applied
// (SCALED_CPY) or unchanged (SIMPLE_CPY / memcpy).
// NOTE(review): the second line of the parameter list, the if / else and #ifdef lines, the
// loops around memcpy and the asm operand lists are missing from this excerpt. The asm uses
// %0 / %2 as source pointer and stride and %1 / %3 as destination pointer and stride.
3079 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
// Scaled path: eax = src + srcStride, ebx = dst + dstStride, mm2 = packedYOffset,
// mm3 = packedYScale, mm4 = 0 (used to widen bytes to 16 bit words).
3089 "leal (%0,%2), %%eax \n\t"
3090 "leal (%1,%3), %%ebx \n\t"
3091 "movq packedYOffset, %%mm2 \n\t"
3092 "movq packedYScale, %%mm3 \n\t"
3093 "pxor %%mm4, %%mm4 \n\t"
// SCALED_CPY processes two rows: widen to words, subtract the offset, shift left by 6,
// take the high word of the product with the scale, then pack back to bytes with
// unsigned saturation.
3095 #define SCALED_CPY(src1, src2, dst1, dst2) \
3096 "movq " #src1 ", %%mm0 \n\t"\
3097 "movq " #src1 ", %%mm5 \n\t"\
3098 "punpcklbw %%mm4, %%mm0 \n\t"\
3099 "punpckhbw %%mm4, %%mm5 \n\t"\
3100 "psubw %%mm2, %%mm0 \n\t"\
3101 "psubw %%mm2, %%mm5 \n\t"\
3102 "movq " #src2 ", %%mm1 \n\t"\
3103 "psllw $6, %%mm0 \n\t"\
3104 "psllw $6, %%mm5 \n\t"\
3105 "pmulhw %%mm3, %%mm0 \n\t"\
3106 "movq " #src2 ", %%mm6 \n\t"\
3107 "pmulhw %%mm3, %%mm5 \n\t"\
3108 "punpcklbw %%mm4, %%mm1 \n\t"\
3109 "punpckhbw %%mm4, %%mm6 \n\t"\
3110 "psubw %%mm2, %%mm1 \n\t"\
3111 "psubw %%mm2, %%mm6 \n\t"\
3112 "psllw $6, %%mm1 \n\t"\
3113 "psllw $6, %%mm6 \n\t"\
3114 "pmulhw %%mm3, %%mm1 \n\t"\
3115 "pmulhw %%mm3, %%mm6 \n\t"\
3116 "packuswb %%mm5, %%mm0 \n\t"\
3117 "packuswb %%mm6, %%mm1 \n\t"\
3118 "movq %%mm0, " #dst1 " \n\t"\
3119 "movq %%mm1, " #dst2 " \n\t"\
3121 SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3122 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3123 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
// The three calls above cover rows 0-5; advance eax / ebx by 4 rows so the last call
// reaches rows 6 and 7.
3124 "leal (%%eax,%2,4), %%eax \n\t"
3125 "leal (%%ebx,%3,4), %%ebx \n\t"
3126 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
// Fallback: copy BLOCK_SIZE bytes of row i (the loop over i is not visible here).
3137 memcpy( &(dst[dstStride*i]),
3138 &(src[srcStride*i]), BLOCK_SIZE);
// Unscaled path: same row addressing as above, plain 8 byte moves.
3145 "leal (%0,%2), %%eax \n\t"
3146 "leal (%1,%3), %%ebx \n\t"
3148 #define SIMPLE_CPY(src1, src2, dst1, dst2) \
3149 "movq " #src1 ", %%mm0 \n\t"\
3150 "movq " #src2 ", %%mm1 \n\t"\
3151 "movq %%mm0, " #dst1 " \n\t"\
3152 "movq %%mm1, " #dst2 " \n\t"\
3154 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3155 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3156 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3157 "leal (%%eax,%2,4), %%eax \n\t"
3158 "leal (%%ebx,%3,4), %%ebx \n\t"
3159 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
// Fallback for the unscaled path, one memcpy per row.
3169 memcpy( &(dst[dstStride*i]),
3170 &(src[srcStride*i]), BLOCK_SIZE);
3177 * Filters array of bytes (Y or U or V values)
3179 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3180 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3183 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3185 /* we need 64bit here otherwise we´ll going to have a problem
3186 after watching a black picture for 5 hours*/
3187 static uint64_t *yHistogram= NULL;
3188 int black=0, white=255; // blackest black and whitest white in the picture
3189 int QPCorrecture= 256;
3191 /* Temporary buffers for handling the last row(s) */
3192 static uint8_t *tempDst= NULL;
3193 static uint8_t *tempSrc= NULL;
3195 /* Temporary buffers for handling the last block */
3196 static uint8_t *tempDstBlock= NULL;
3197 static uint8_t *tempSrcBlock= NULL;
3199 /* Temporal noise reducing buffers */
3200 static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
3201 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
3205 #ifdef PP_FUNNY_STRIDE
3206 uint8_t *dstBlockPtrBackup;
3207 uint8_t *srcBlockPtrBackup;
3211 long long T0, T1, diffTime=0;
3214 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3219 maxTmpNoise[0]= ppMode->maxTmpNoise[0];
3220 maxTmpNoise[1]= ppMode->maxTmpNoise[1];
3221 maxTmpNoise[2]= ppMode->maxTmpNoise[2];
3224 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3225 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
3226 else if( (mode & V_DEBLOCK)
3227 || (mode & LINEAR_IPOL_DEINT_FILTER)
3228 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
3229 else if(mode & V_X1_FILTER) copyAhead=11;
3230 else if(mode & V_RK1_FILTER) copyAhead=10;
3231 else if(mode & DERING) copyAhead=9;
3238 tempDst= (uint8_t*)memalign(8, 1024*24);
3239 tempSrc= (uint8_t*)memalign(8, 1024*24);
3240 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3241 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3244 if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
3246 // printf("%d %d %d\n", isColor, dstStride, height);
3247 //FIXME works only as long as the size doesnt increase
3248 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
3249 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
3250 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
3252 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
3253 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
3259 yHistogram= (uint64_t*)malloc(8*256);
3260 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3262 if(mode & FULL_Y_RANGE)
3273 static int framenum= -1;
3274 uint64_t maxClipped;
3279 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3281 for(i=0; i<256; i++)
3283 sum+= yHistogram[i];
3284 // printf("%d ", yHistogram[i]);
3288 /* we allways get a completly black picture first */
3289 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3292 for(black=255; black>0; black--)
3294 if(clipped < maxClipped) break;
3295 clipped-= yHistogram[black];
3299 for(white=0; white<256; white++)
3301 if(clipped < maxClipped) break;
3302 clipped-= yHistogram[white];
3305 packedYOffset= (black - minAllowedY) & 0xFFFF;
3306 packedYOffset|= packedYOffset<<32;
3307 packedYOffset|= packedYOffset<<16;
3309 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3311 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3312 packedYScale|= packedYScale<<32;
3313 packedYScale|= packedYScale<<16;
3317 packedYScale= 0x0100010001000100LL;
3321 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
3322 else QPCorrecture= 256;
3324 /* copy & deinterlace first row of blocks */
3327 //1% speedup if these are here instead of the inner loop
3328 uint8_t *srcBlock= &(src[y*srcStride]);
3329 uint8_t *dstBlock= &(dst[y*dstStride]);
3331 dstBlock= tempDst + dstStride;
3333 // From this point on it is guranteed that we can read and write 16 lines downward
3334 // finish 1 block before the next otherwise we´ll might have a problem
3335 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3336 for(x=0; x<width; x+=BLOCK_SIZE)
3341 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3342 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3343 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3344 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3348 "movl %4, %%eax \n\t"
3349 "shrl $2, %%eax \n\t"
3350 "andl $6, %%eax \n\t"
3351 "addl %5, %%eax \n\t"
3352 "movl %%eax, %%ebx \n\t"
3353 "imul %1, %%eax \n\t"
3354 "imul %3, %%ebx \n\t"
3355 "prefetchnta 32(%%eax, %0) \n\t"
3356 "prefetcht0 32(%%ebx, %2) \n\t"
3357 "addl %1, %%eax \n\t"
3358 "addl %3, %%ebx \n\t"
3359 "prefetchnta 32(%%eax, %0) \n\t"
3360 "prefetcht0 32(%%ebx, %2) \n\t"
3361 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3362 "m" (x), "m" (copyAhead)
3366 #elif defined(HAVE_3DNOW)
3367 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3368 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3369 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3370 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3371 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3375 blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3376 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3378 if(mode & LINEAR_IPOL_DEINT_FILTER)
3379 deInterlaceInterpolateLinear(dstBlock, dstStride);
3380 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3381 deInterlaceBlendLinear(dstBlock, dstStride);
3382 else if(mode & MEDIAN_DEINT_FILTER)
3383 deInterlaceMedian(dstBlock, dstStride);
3384 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3385 deInterlaceInterpolateCubic(dstBlock, dstStride);
3386 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3387 deInterlaceBlendCubic(dstBlock, dstStride);
3392 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
3395 for(y=0; y<height; y+=BLOCK_SIZE)
3397 //1% speedup if these are here instead of the inner loop
3398 uint8_t *srcBlock= &(src[y*srcStride]);
3399 uint8_t *dstBlock= &(dst[y*dstStride]);
3401 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3402 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3403 int QPFrac= QPDelta;
3404 uint8_t *tempBlock1= tempBlocks;
3405 uint8_t *tempBlock2= tempBlocks + 8;
3408 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3409 if not than use a temporary buffer */
3413 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3414 blockcopy to dst later */
3415 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3416 srcStride*MAX(height-y-copyAhead, 0) );
3418 /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3419 for(i=MAX(height-y, 8); i<copyAhead+8; i++)
3420 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
3422 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3423 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
3425 /* duplicate last line of dst to fill the void upto line (copyAhead) */
3426 for(i=height-y+1; i<=copyAhead; i++)
3427 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
3429 dstBlock= tempDst + dstStride;
3433 // From this point on it is guranteed that we can read and write 16 lines downward
3434 // finish 1 block before the next otherwise we´ll might have a problem
3435 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3436 for(x=0; x<width; x+=BLOCK_SIZE)
3438 const int stride= dstStride;
3444 "sbbl %%eax, %%eax \n\t"
3445 "shll $2, %%eax \n\t"
3446 "subl %%eax, %0 \n\t"
3447 : "+r" (QPptr), "+m" (QPFrac)
3453 QPs[(y>>3)*QPStride + (x>>3)]:
3454 QPs[(y>>4)*QPStride + (x>>4)];
3458 QP= (QP* QPCorrecture)>>8;
3459 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3463 "movd %0, %%mm7 \n\t"
3464 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3465 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3466 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3467 "movq %%mm7, pQPb \n\t"
3478 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3479 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3480 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3481 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3485 "movl %4, %%eax \n\t"
3486 "shrl $2, %%eax \n\t"
3487 "andl $6, %%eax \n\t"
3488 "addl %5, %%eax \n\t"
3489 "movl %%eax, %%ebx \n\t"
3490 "imul %1, %%eax \n\t"
3491 "imul %3, %%ebx \n\t"
3492 "prefetchnta 32(%%eax, %0) \n\t"
3493 "prefetcht0 32(%%ebx, %2) \n\t"
3494 "addl %1, %%eax \n\t"
3495 "addl %3, %%ebx \n\t"
3496 "prefetchnta 32(%%eax, %0) \n\t"
3497 "prefetcht0 32(%%ebx, %2) \n\t"
3498 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3499 "m" (x), "m" (copyAhead)
3503 #elif defined(HAVE_3DNOW)
3504 //FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ...
3505 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3506 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3507 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3508 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3512 #ifdef PP_FUNNY_STRIDE
3513 //can we mess with an 8x16 block? if not, use a temp buffer (yes, again)
3517 dstBlockPtrBackup= dstBlock;
3518 srcBlockPtrBackup= srcBlock;
3520 for(i=0;i<BLOCK_SIZE*2; i++)
3522 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3523 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3526 dstBlock= tempDstBlock;
3527 srcBlock= tempSrcBlock;
3531 blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3532 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3534 if(mode & LINEAR_IPOL_DEINT_FILTER)
3535 deInterlaceInterpolateLinear(dstBlock, dstStride);
3536 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3537 deInterlaceBlendLinear(dstBlock, dstStride);
3538 else if(mode & MEDIAN_DEINT_FILTER)
3539 deInterlaceMedian(dstBlock, dstStride);
3540 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3541 deInterlaceInterpolateCubic(dstBlock, dstStride);
3542 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3543 deInterlaceBlendCubic(dstBlock, dstStride);
3546 /* only deblock if we have 2 blocks */
3554 if(mode & V_RK1_FILTER)
3555 vertRK1Filter(dstBlock, stride, QP);
3556 else if(mode & V_X1_FILTER)
3557 vertX1Filter(dstBlock, stride, QP);
3558 else if(mode & V_DEBLOCK)
3560 if( isVertDC(dstBlock, stride))
3562 if(isVertMinMaxOk(dstBlock, stride, QP))
3563 doVertLowPass(dstBlock, stride, QP);
3566 doVertDefFilter(dstBlock, stride, QP);
3576 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3578 /* check if we have a previous block to deblock it with dstBlock */
3585 if(mode & H_RK1_FILTER)
3586 vertRK1Filter(tempBlock1, 16, QP);
3587 else if(mode & H_X1_FILTER)
3588 vertX1Filter(tempBlock1, 16, QP);
3589 else if(mode & H_DEBLOCK)
3591 if( isVertDC(tempBlock1, 16) )
3593 if(isVertMinMaxOk(tempBlock1, 16, QP))
3594 doVertLowPass(tempBlock1, 16, QP);
3597 doVertDefFilter(tempBlock1, 16, QP);
3600 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3603 if(mode & H_X1_FILTER)
3604 horizX1Filter(dstBlock-4, stride, QP);
3605 else if(mode & H_DEBLOCK)
3607 if( isHorizDC(dstBlock-4, stride))
3609 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3610 doHorizLowPass(dstBlock-4, stride, QP);
3613 doHorizDefFilter(dstBlock-4, stride, QP);
3623 //FIXME filter first line
3624 if(y>0) dering(dstBlock - stride - 8, stride, QP);
3627 if(mode & TEMP_NOISE_FILTER)
3629 tempNoiseReducer(dstBlock-8, stride,
3630 tempBlured[isColor] + y*dstStride + x,
3631 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3632 ppMode->maxTmpNoise);
3636 #ifdef PP_FUNNY_STRIDE
3637 /* did we use a tmp-block buffer */
3641 dstBlock= dstBlockPtrBackup;
3642 srcBlock= srcBlockPtrBackup;
3644 for(i=0;i<BLOCK_SIZE*2; i++)
3646 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3655 tmpXchg= tempBlock1;
3656 tempBlock1= tempBlock2;
3657 tempBlock2 = tmpXchg;
3663 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP);
3666 if((mode & TEMP_NOISE_FILTER))
3668 tempNoiseReducer(dstBlock-8, dstStride,
3669 tempBlured[isColor] + y*dstStride + x,
3670 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3671 ppMode->maxTmpNoise);
3674 /* did we use a tmp buffer for the last lines*/
3677 uint8_t *dstBlock= &(dst[y*dstStride]);
3678 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3681 for(x=0; x<width; x+=32)
3684 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3685 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3686 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3687 // + dstBlock[x +13*dstStride]
3688 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3692 asm volatile("femms");
3693 #elif defined (HAVE_MMX)
3694 asm volatile("emms");
3698 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3699 sumTime= rdtsc() - sumTime;
3701 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
3702 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3703 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3706 #ifdef DEBUG_BRIGHTNESS
3711 for(i=0; i<256; i++)
3712 if(yHistogram[i] > max) max=yHistogram[i];
3714 for(i=1; i<256; i++)
3717 int start=yHistogram[i-1]/(max/256+1);
3718 int end=yHistogram[i]/(max/256+1);
3719 int inc= end > start ? 1 : -1;
3720 for(x=start; x!=end+inc; x+=inc)
3721 dst[ i*dstStride + x]+=128;
3724 for(i=0; i<100; i+=2)
3726 dst[ (white)*dstStride + i]+=128;
3727 dst[ (black)*dstStride + i]+=128;