2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec e e
28 doHorizDefFilter Ec Ec e e
30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a
34 LinIpolDeinterlace e E E*
35 CubicIpolDeinterlace a e e*
36 LinBlendDeinterlace e E E*
37 MedianDeinterlace# Ec Ec
40 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
41 # more or less self-invented filters, so the exactness isn't too meaningful
42 E = Exact implementation
43 e = almost exact implementation (slightly different rounding, ...)
44 a = alternative / approximate implementation
45 c = checked against the other implementations (-vo md5)
50 reduce the time wasted on the mem transfer
51 implement everything in C at least (done at the moment but ...)
52 unroll stuff if instructions depend too much on the prior one
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54 move YScale thing to the end instead of fixing QP
55 write a faster and higher quality deblocking filter :)
56 make the main loop more flexible (variable number of blocks at once);
57 the if/else stuff per block is slowing things down
58 compare the quality & speed of all filters
62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
64 commandline option for the deblock / dering thresholds
65 memcpy chrominance if no chroma filtering is done
69 //Changelog: use the CVS log
71 #include "../config.h"
82 //#define DEBUG_BRIGHTNESS
83 #include "postprocess.h"
// Generic arithmetic helper macros. Every argument may be evaluated more than
// once, so do not pass expressions with side effects (e.g. MIN(i++, j)).
85 #define MIN(a,b) ((a) > (b) ? (b) : (a))
86 #define MAX(a,b) ((a) < (b) ? (b) : (a))
87 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// NOTE: SIGN(0) evaluates to -1, not 0.
88 #define SIGN(a) ((a) > 0 ? 1 : -1)
// Inline-asm string fragments. The opening #if/#ifdef of each group and the
// closing #endif are not visible in this listing.
// PAVGB(a,b): emits "pavgb a, b" (or "pavgusb a, b" when HAVE_3DNOW is used).
91 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
92 #elif defined (HAVE_3DNOW)
93 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// PMINUB(x,y,t): unsigned byte minimum, result left in the second argument.
// The native form ignores the scratch register t.
97 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
98 #elif defined (HAVE_MMX)
// Plain-MMX emulation; note the parameter names are swapped to (b,a,t):
// t = a; t = saturate(t - b); a -= t  =>  a = min(a, b). Clobbers t.
99 #define PMINUB(b,a,t) \
100 "movq " #a ", " #t " \n\t"\
101 "psubusb " #b ", " #t " \n\t"\
102 "psubb " #t ", " #a " \n\t"
// PMAXUB(a,b): unsigned byte maximum, result left in b.
106 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
107 #elif defined (HAVE_MMX)
// Plain-MMX emulation: b = saturate(b - a); b += a  =>  b = max(a, b).
108 #define PMAXUB(a,b) \
109 "psubusb " #a ", " #b " \n\t"\
110 "paddb " #a ", " #b " \n\t"
// Size constants; the code that uses them is outside this listing.
114 #define GET_MODE_BUFFER_SIZE 500
115 #define OPTIONS_ARRAY_SIZE 10
// 8-byte aligned 64-bit memory operands, referenced by name from the inline asm
// in this file. packedYScale defaults to 0x0100 in each of its four 16-bit words.
118 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
119 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
// wNN: the 16-bit hex value NN replicated in all four words.
120 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
121 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
122 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
// bmXXXXXXXX: byte masks. Each digit of the name stands for one byte, most
// significant byte first; 1 means 0xFF and 0 means 0x00.
123 static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
124 static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
125 static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
126 static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
127 static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
128 static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
129 static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
130 static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
131 static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
132 static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
133 static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
134 static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
// bNN: the hex byte value NN replicated in all eight bytes.
135 static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
136 static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
137 static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
138 static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
139 static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
140 static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
141 static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
142 static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
143 static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
144 static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
145 static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
146 static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
// temp0..temp5: presumably scratch storage for asm code; no use other than
// unusedVariableWarningFixer() is visible in this listing.
147 static uint64_t __attribute__((aligned(8))) temp0=0;
148 static uint64_t __attribute__((aligned(8))) temp1=0;
149 static uint64_t __attribute__((aligned(8))) temp2=0;
150 static uint64_t __attribute__((aligned(8))) temp3=0;
151 static uint64_t __attribute__((aligned(8))) temp4=0;
152 static uint64_t __attribute__((aligned(8))) temp5=0;
// pQPb is loaded by the asm filters as "QP,..., QP" (the quantizer in every
// byte); the code that stores it is outside this listing.
153 static uint64_t __attribute__((aligned(8))) pQPb=0;
154 static uint64_t __attribute__((aligned(8))) pQPb2=0;
155 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
156 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
// The same three names defined again without alignment attributes: presumably
// the non-MMX branch of a preprocessor conditional whose directives are not
// visible in this listing (TODO confirm).
158 static uint64_t packedYOffset= 0x0000000000000000LL;
159 static uint64_t packedYScale= 0x0100010001000100LL;
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
// Tunable thresholds with external linkage.
// isVertDC() reports a block as flat when its count of near-equal vertical
// neighbours exceeds vFlatnessThreshold; hFlatnessThreshold is presumably the
// horizontal counterpart (its user is not visible in this listing).
163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16;
165 int deringThreshold= 20;
167 //amount of "black" you are willing to lose to get a brightness-corrected picture
168 double maxClippedThreshold= 0.01;
// Table of the available filters, terminated by an all-NULL/zero entry.
// Each row starts with a short and a long filter name and ends with a filter
// flag constant. The meaning of the three integers in between is defined by
// struct PPFilter, which is not visible here (TODO confirm in postprocess.h).
173 static struct PPFilter filters[]=
175 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
176 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
// NOTE(review): "vr"/"rkvdeblock" is named like a vertical filter and carries
// the same 1, 2, 4 values as the other v* rows, yet it maps to the H_ flag.
// Verify whether a vertical RK1 flag was intended here.
177 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
178 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
179 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
180 {"dr", "dering", 1, 5, 6, DERING},
181 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
182 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
183 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
184 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
185 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
186 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
187 {NULL, NULL,0,0,0,0} //End Marker
// Flat list of string pairs: an alias followed by the filter string it stands
// for. "default"/"de" share one expansion and "fast"/"fa" share another, which
// differs only in using the x1 deblock filters.
// The terminator and the code consuming this table are not visible here.
190 static char *replaceTable[]=
192 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
193 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
194 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
195 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
// Mentions every file-scope asm constant in one C expression; going by the
// name, this exists only to silence "unused variable" warnings for objects
// that are otherwise referenced solely from inline asm.
// The only side effect is b00=0 when the sum is 0.
// The opening of the condition is not visible in this listing.
200 static inline void unusedVariableWarningFixer()
203 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
204 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
205 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
206 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
207 + temp5 + pQPb== 0) b00=0;
// Executes the x86 "rdtsc" instruction; the declared return type is long long.
// The lines that collect and return the result are not visible in this listing.
212 static inline long long rdtsc()
215 asm volatile( "rdtsc\n\t"
218 // printf("%d\n", int(l/1000));
// Issues a "prefetchnta" hint. The asm operand list is not visible in this
// listing; %0 is presumably bound to p.
224 static inline void prefetchnta(void *p)
226 asm volatile( "prefetchnta (%0)\n\t"
// Issues a "prefetcht0" hint. The asm operand list is not visible in this
// listing; %0 is presumably bound to p.
231 static inline void prefetcht0(void *p)
233 asm volatile( "prefetcht0 (%0)\n\t"
// Issues a "prefetcht1" hint. The asm operand list is not visible in this
// listing; %0 is presumably bound to p.
238 static inline void prefetcht1(void *p)
240 asm volatile( "prefetcht1 (%0)\n\t"
// Issues a "prefetcht2" hint. The asm operand list is not visible in this
// listing; %0 is presumably bound to p.
245 static inline void prefetcht2(void *p)
247 asm volatile( "prefetcht2 (%0)\n\t"
253 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
255 * Check if the middle 8x8 Block in the given 8x16 block is flat
// Returns 1 if the count of vertically adjacent pixel pairs that differ by at
// most 1 exceeds vFlatnessThreshold, otherwise 0. The pairs are taken from the
// lines starting 4 lines below src.
// src: start of the surrounding block; stride: distance between two lines.
257 static inline int isVertDC(uint8_t src[], int stride){
262 src+= stride*4; // src points to begin of the 8x8 Block
// asm path: %1 = src, %2 = stride (see the input operand list below).
265 "leal (%1, %2), %%eax \n\t"
266 "leal (%%eax, %2, 4), %%ebx \n\t"
267 // 0 1 2 3 4 5 6 7 8 9
268 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
269 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
270 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
// For each pair of adjacent lines: (a - b) + 0x7E > 0x7C as signed bytes is
// true exactly when a - b is -1, 0 or 1; such bytes become 0xFF (-1) and are
// accumulated per column in mm0.
271 "movq (%1), %%mm0 \n\t"
272 "movq (%%eax), %%mm1 \n\t"
273 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
274 "paddb %%mm7, %%mm0 \n\t"
275 "pcmpgtb %%mm6, %%mm0 \n\t"
277 "movq (%%eax,%2), %%mm2 \n\t"
278 "psubb %%mm2, %%mm1 \n\t"
279 "paddb %%mm7, %%mm1 \n\t"
280 "pcmpgtb %%mm6, %%mm1 \n\t"
281 "paddb %%mm1, %%mm0 \n\t"
283 "movq (%%eax, %2, 2), %%mm1 \n\t"
284 "psubb %%mm1, %%mm2 \n\t"
285 "paddb %%mm7, %%mm2 \n\t"
286 "pcmpgtb %%mm6, %%mm2 \n\t"
287 "paddb %%mm2, %%mm0 \n\t"
289 "movq (%1, %2, 4), %%mm2 \n\t"
290 "psubb %%mm2, %%mm1 \n\t"
291 "paddb %%mm7, %%mm1 \n\t"
292 "pcmpgtb %%mm6, %%mm1 \n\t"
293 "paddb %%mm1, %%mm0 \n\t"
295 "movq (%%ebx), %%mm1 \n\t"
296 "psubb %%mm1, %%mm2 \n\t"
297 "paddb %%mm7, %%mm2 \n\t"
298 "pcmpgtb %%mm6, %%mm2 \n\t"
299 "paddb %%mm2, %%mm0 \n\t"
301 "movq (%%ebx, %2), %%mm2 \n\t"
302 "psubb %%mm2, %%mm1 \n\t"
303 "paddb %%mm7, %%mm1 \n\t"
304 "pcmpgtb %%mm6, %%mm1 \n\t"
305 "paddb %%mm1, %%mm0 \n\t"
307 "movq (%%ebx, %2, 2), %%mm1 \n\t"
308 "psubb %%mm1, %%mm2 \n\t"
309 "paddb %%mm7, %%mm2 \n\t"
310 "pcmpgtb %%mm6, %%mm2 \n\t"
311 "paddb %%mm2, %%mm0 \n\t"
// Two alternative horizontal sums of the per-column counters follow (psadbw
// versus shift-and-add); the preprocessor directives that choose between them
// are not visible in this listing.
315 "pxor %%mm7, %%mm7 \n\t"
316 "psadbw %%mm7, %%mm0 \n\t"
318 "movq %%mm0, %%mm1 \n\t"
319 "psrlw $8, %%mm0 \n\t"
320 "paddb %%mm1, %%mm0 \n\t"
321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t"
326 "paddb %%mm1, %%mm0 \n\t"
328 "movd %%mm0, %0 \n\t"
330 : "r" (src), "r" (stride)
// Every match contributed 0xFF (-1), so negating the low byte yields the count.
333 numEq= (-numEq) &0xFF;
// C path: ((a - b + 1) & 0xFFFF) < 3 holds exactly when a - b is -1, 0 or 1.
336 for(y=0; y<BLOCK_SIZE-1; y++)
338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
339 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
349 /* if(abs(numEq - asmEq) > 0)
351 printf("\nasm:%d c:%d\n", asmEq, numEq);
352 for(int y=0; y<8; y++)
354 for(int x=0; x<8; x++)
356 printf("%d ", temp[x + y*stride]);
362 // for(int i=0; i<numEq/8; i++) src[i]=255;
363 return (numEq > vFlatnessThreshold) ? 1 : 0;
// Compares the line at src+stride with the line at src+8*stride and checks
// whether every column differs by at most 2*QP. In the C path a larger
// difference in any column clears isOk2.
// The return statements are not visible in this listing.
366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
// asm path: %1 = src, %2 = stride; QP is taken from pQPb, not from the parameter.
373 "movq (%1, %2), %%mm0 \n\t"
374 "movq (%1, %2, 8), %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "psubusb %%mm1, %%mm0 \n\t"
377 "psubusb %%mm2, %%mm1 \n\t"
378 "por %%mm1, %%mm0 \n\t" // ABS Diff
380 "movq pQPb, %%mm7 \n\t" // QP,..., QP
381 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
382 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// Collapse the eight per-byte results into the single value stored to %0.
383 "pcmpeqd b00, %%mm0 \n\t"
384 "psrlq $16, %%mm0 \n\t"
385 "pcmpeqd bFF, %%mm0 \n\t"
386 // "movd %%mm0, (%1, %2, 4)\n\t"
387 "movd %%mm0, %0 \n\t"
389 : "r" (src), "r" (stride)
// C path: the same per-column test on lines 1 and 8.
397 for(x=0; x<BLOCK_SIZE; x++)
399 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
401 /* if(isOk && !isOk2 || !isOk && isOk2)
403 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
404 for(int y=0; y<9; y++)
406 for(int x=0; x<8; x++)
408 printf("%d ", src[x + y*stride]);
420 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
421 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
// Vertical low-pass filter: rewrites lines 1..8 below src (l1..l8 in the C
// path) from a window that also reads line 0 and line 9.
// src: top of the 8x16 block; stride: distance between lines; QP: quantizer
// (the asm path reads it from pQPb instead of the parameter).
423 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// asm path: %0 = src, %1 = stride. mm6 ends up holding the "first" edge line
// and mm7 the "last" edge line, matching first/last in the C path below.
427 asm volatile( //"movv %0 %1 %2\n\t"
428 "movq pQPb, %%mm0 \n\t" // QP,..., QP
430 "movq (%0), %%mm6 \n\t"
431 "movq (%0, %1), %%mm5 \n\t"
432 "movq %%mm5, %%mm1 \n\t"
433 "movq %%mm6, %%mm2 \n\t"
434 "psubusb %%mm6, %%mm5 \n\t"
435 "psubusb %%mm1, %%mm2 \n\t"
436 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
437 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
438 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
440 "pand %%mm2, %%mm6 \n\t"
441 "pandn %%mm1, %%mm2 \n\t"
442 "por %%mm2, %%mm6 \n\t"// First Line to Filter
444 "movq (%0, %1, 8), %%mm5 \n\t"
445 "leal (%0, %1, 4), %%eax \n\t"
446 "leal (%0, %1, 8), %%ebx \n\t"
447 "subl %1, %%ebx \n\t"
448 "addl %1, %0 \n\t" // %0 points to line 1 not 0
449 "movq (%0, %1, 8), %%mm7 \n\t"
450 "movq %%mm5, %%mm1 \n\t"
451 "movq %%mm7, %%mm2 \n\t"
452 "psubusb %%mm7, %%mm5 \n\t"
453 "psubusb %%mm1, %%mm2 \n\t"
454 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
455 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
456 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
458 "pand %%mm2, %%mm7 \n\t"
459 "pandn %%mm1, %%mm2 \n\t"
460 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
464 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
// The filter taps are built from cascaded PAVGB byte averages; the trailing
// comments track the accumulated weights per line, "X" marks a store.
469 "movq (%0, %1), %%mm0 \n\t" // 1
470 "movq %%mm0, %%mm1 \n\t" // 1
471 PAVGB(%%mm6, %%mm0) //1 1 /2
472 PAVGB(%%mm6, %%mm0) //3 1 /4
474 "movq (%0, %1, 4), %%mm2 \n\t" // 1
475 "movq %%mm2, %%mm5 \n\t" // 1
476 PAVGB((%%eax), %%mm2) // 11 /2
477 PAVGB((%0, %1, 2), %%mm2) // 211 /4
478 "movq %%mm2, %%mm3 \n\t" // 211 /4
479 "movq (%0), %%mm4 \n\t" // 1
480 PAVGB(%%mm4, %%mm3) // 4 211 /8
481 PAVGB(%%mm0, %%mm3) //642211 /16
482 "movq %%mm3, (%0) \n\t" // X
483 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
484 "movq %%mm1, %%mm0 \n\t" // 1
485 PAVGB(%%mm6, %%mm0) //1 1 /2
486 "movq %%mm4, %%mm3 \n\t" // 1
487 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
488 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
489 PAVGB((%%eax), %%mm5) // 211 /4
490 PAVGB(%%mm5, %%mm3) // 2 2211 /8
491 PAVGB(%%mm0, %%mm3) //4242211 /16
492 "movq %%mm3, (%0,%1) \n\t" // X
493 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
494 PAVGB(%%mm4, %%mm6) //11 /2
495 "movq (%%ebx), %%mm0 \n\t" // 1
496 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
497 "movq %%mm0, %%mm3 \n\t" // 11/2
498 PAVGB(%%mm1, %%mm0) // 2 11/4
499 PAVGB(%%mm6, %%mm0) //222 11/8
500 PAVGB(%%mm2, %%mm0) //22242211/16
501 "movq (%0, %1, 2), %%mm2 \n\t" // 1
502 "movq %%mm0, (%0, %1, 2) \n\t" // X
503 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
504 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
505 PAVGB((%%ebx), %%mm0) // 11 /2
506 PAVGB(%%mm0, %%mm6) //11 11 /4
507 PAVGB(%%mm1, %%mm4) // 11 /2
508 PAVGB(%%mm2, %%mm1) // 11 /2
509 PAVGB(%%mm1, %%mm6) //1122 11 /8
510 PAVGB(%%mm5, %%mm6) //112242211 /16
511 "movq (%%eax), %%mm5 \n\t" // 1
512 "movq %%mm6, (%%eax) \n\t" // X
513 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
514 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
515 PAVGB(%%mm7, %%mm6) // 11 /2
516 PAVGB(%%mm4, %%mm6) // 11 11 /4
517 PAVGB(%%mm3, %%mm6) // 11 2211 /8
518 PAVGB(%%mm5, %%mm2) // 11 /2
519 "movq (%0, %1, 4), %%mm4 \n\t" // 1
520 PAVGB(%%mm4, %%mm2) // 112 /4
521 PAVGB(%%mm2, %%mm6) // 112242211 /16
522 "movq %%mm6, (%0, %1, 4) \n\t" // X
523 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
524 PAVGB(%%mm7, %%mm1) // 11 2 /4
525 PAVGB(%%mm4, %%mm5) // 11 /2
526 PAVGB(%%mm5, %%mm0) // 11 11 /4
527 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
528 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
529 PAVGB(%%mm0, %%mm1) // 11224222 /16
530 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
531 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
532 PAVGB((%%ebx), %%mm2) // 112 4 /8
533 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
534 PAVGB(%%mm0, %%mm6) // 1 1 /2
535 PAVGB(%%mm7, %%mm6) // 1 12 /4
536 PAVGB(%%mm2, %%mm6) // 1122424 /4
537 "movq %%mm6, (%%ebx) \n\t" // X
538 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
539 PAVGB(%%mm7, %%mm5) // 11 2 /4
540 PAVGB(%%mm7, %%mm5) // 11 6 /8
542 PAVGB(%%mm3, %%mm0) // 112 /4
543 PAVGB(%%mm0, %%mm5) // 112246 /16
544 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
548 : "r" (src), "r" (stride)
// C path. l1..l9 are the offsets of lines 1..9 relative to src.
552 const int l1= stride;
553 const int l2= stride + l1;
554 const int l3= stride + l2;
555 const int l4= stride + l3;
556 const int l5= stride + l4;
557 const int l6= stride + l5;
558 const int l7= stride + l6;
559 const int l8= stride + l7;
560 const int l9= stride + l8;
563 for(x=0; x<BLOCK_SIZE; x++)
// Edge samples: the outer line (0 resp. 9) is used only if it differs from its
// inner neighbour (1 resp. 8) by less than QP; otherwise the neighbour is used.
565 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
566 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[i] holds the sum of two vertically adjacent samples, with first/last
// standing in at the two ends.
569 sums[0] = first + src[l1];
570 sums[1] = src[l1] + src[l2];
571 sums[2] = src[l2] + src[l3];
572 sums[3] = src[l3] + src[l4];
573 sums[4] = src[l4] + src[l5];
574 sums[5] = src[l5] + src[l6];
575 sums[6] = src[l6] + src[l7];
576 sums[7] = src[l7] + src[l8];
577 sums[8] = src[l8] + last;
// Each output is a weighted sum with total weight 16; +8 rounds before >>4.
579 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
580 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
581 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
582 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
583 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
584 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
585 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
586 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
595 * Experimental implementation of the filter (Algorithm 1) described in a paper by Ramkishor & Karandikar
596 * values are correctly clipped (MMX2)
597 * values wrap around (C)
598 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
// Vertical deblocking step across the boundary between line 4 and line 5:
// where |l4 - l5| is within about 1.25*QP, lines 4/5 are moved towards each
// other by half the difference and lines 3/6 by an eighth (per the asm
// comments below).
// Several lines of the asm and most of the C path are missing from this
// listing, so the description is limited to what is visible.
605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// asm path: %0 = src, %1 = stride; QP is taken from pQPb.
611 "pxor %%mm7, %%mm7 \n\t" // 0
612 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
613 "leal (%0, %1), %%eax \n\t"
614 "leal (%%eax, %1, 4), %%ebx \n\t"
615 // 0 1 2 3 4 5 6 7 8 9
616 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
617 "movq pQPb, %%mm0 \n\t" // QP,..., QP
618 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
619 "paddusb b02, %%mm0 \n\t"
620 "psrlw $2, %%mm0 \n\t"
621 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
622 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
623 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
624 "movq (%%ebx), %%mm3 \n\t" // line 5
625 "movq %%mm2, %%mm4 \n\t" // line 4
626 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
627 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
629 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
630 "psubusb %%mm3, %%mm4 \n\t"
631 "psubusb %%mm2, %%mm3 \n\t"
632 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
633 "psubusb %%mm0, %%mm4 \n\t"
634 "pcmpeqb %%mm7, %%mm4 \n\t"
635 "pand %%mm4, %%mm5 \n\t" // d/2
637 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
638 "paddb %%mm5, %%mm2 \n\t"
639 // "psubb %%mm6, %%mm2 \n\t"
640 "movq %%mm2, (%0,%1, 4) \n\t"
642 "movq (%%ebx), %%mm2 \n\t"
643 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
644 "psubb %%mm5, %%mm2 \n\t"
645 // "psubb %%mm6, %%mm2 \n\t"
646 "movq %%mm2, (%%ebx) \n\t"
648 "paddb %%mm6, %%mm5 \n\t"
649 "psrlw $2, %%mm5 \n\t"
650 "pand b3F, %%mm5 \n\t"
651 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
// Lines 3 and 6 are biased by 0x80 so the signed saturating add/sub clips in
// the unsigned pixel range, then biased back.
653 "movq (%%eax, %1, 2), %%mm2 \n\t"
654 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
655 "paddsb %%mm5, %%mm2 \n\t"
656 "psubb %%mm6, %%mm2 \n\t"
657 "movq %%mm2, (%%eax, %1, 2) \n\t"
659 "movq (%%ebx, %1), %%mm2 \n\t"
660 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
661 "psubsb %%mm5, %%mm2 \n\t"
662 "psubb %%mm6, %%mm2 \n\t"
663 "movq %%mm2, (%%ebx, %1) \n\t"
666 : "r" (src), "r" (stride)
// C path. l1..l6 are the offsets of lines 1..6 relative to src.
670 const int l1= stride;
671 const int l2= stride + l1;
672 const int l3= stride + l2;
673 const int l4= stride + l3;
674 const int l5= stride + l4;
675 const int l6= stride + l5;
676 // const int l7= stride + l6;
677 // const int l8= stride + l7;
678 // const int l9= stride + l8;
// Despite the name, QP15 is QP + QP/4, i.e. about 1.25*QP.
680 const int QP15= QP + (QP>>2);
682 for(x=0; x<BLOCK_SIZE; x++)
684 const int v = (src[x+l5] - src[x+l4]);
699 * Experimental Filter 1
700 * will not damage linear gradients
701 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
702 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
703 * MMX2 version does correct clipping, C version doesn't
// Vertical "X1" deblocking step around the boundary between line 4 and line 5.
// It computes d = |l4 - l5| - (|l3 - l4| + |l5 - l6|)/2. When d is within QP
// the asm path moves lines 4/5 by 3d/8, lines 3/6 by d/4 and lines 2/7 by d/8,
// with the direction chosen by the sign of (l4 - l5).
// The C path is only partly visible in this listing.
705 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// asm path: %0 = src, %1 = stride; QP is taken from pQPb.
711 "pxor %%mm7, %%mm7 \n\t" // 0
712 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
713 "leal (%0, %1), %%eax \n\t"
714 "leal (%%eax, %1, 4), %%ebx \n\t"
715 // 0 1 2 3 4 5 6 7 8 9
716 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
717 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
718 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
719 "movq %%mm1, %%mm2 \n\t" // line 4
720 "psubusb %%mm0, %%mm1 \n\t"
721 "psubusb %%mm2, %%mm0 \n\t"
722 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
723 "movq (%%ebx), %%mm3 \n\t" // line 5
724 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
725 "movq %%mm3, %%mm5 \n\t" // line 5
726 "psubusb %%mm4, %%mm3 \n\t"
727 "psubusb %%mm5, %%mm4 \n\t"
728 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
729 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
730 "movq %%mm2, %%mm1 \n\t" // line 4
731 "psubusb %%mm5, %%mm2 \n\t"
732 "movq %%mm2, %%mm4 \n\t"
733 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
734 "psubusb %%mm1, %%mm5 \n\t"
735 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
736 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
737 "movq %%mm4, %%mm3 \n\t" // d
738 "psubusb pQPb, %%mm4 \n\t"
739 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
740 "psubusb b01, %%mm3 \n\t"
741 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
743 PAVGB(%%mm7, %%mm3) // d/2
744 "movq %%mm3, %%mm1 \n\t" // d/2
745 PAVGB(%%mm7, %%mm3) // d/4
746 PAVGB(%%mm1, %%mm3) // 3*d/8
// mm2 is the sign mask: XOR-ing a line with it before and after a saturating
// add/sub flips the direction of the correction for bytes where l4 <= l5,
// while keeping the clipping.
748 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
749 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
750 "psubusb %%mm3, %%mm0 \n\t"
751 "pxor %%mm2, %%mm0 \n\t"
752 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
754 "movq (%%ebx), %%mm0 \n\t" // line 5
755 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
756 "paddusb %%mm3, %%mm0 \n\t"
757 "pxor %%mm2, %%mm0 \n\t"
758 "movq %%mm0, (%%ebx) \n\t" // line 5
760 PAVGB(%%mm7, %%mm1) // d/4
762 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
763 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
764 "psubusb %%mm1, %%mm0 \n\t"
765 "pxor %%mm2, %%mm0 \n\t"
766 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
768 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
769 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
770 "paddusb %%mm1, %%mm0 \n\t"
771 "pxor %%mm2, %%mm0 \n\t"
772 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
774 PAVGB(%%mm7, %%mm1) // d/8
776 "movq (%%eax, %1), %%mm0 \n\t" // line 2
777 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
778 "psubusb %%mm1, %%mm0 \n\t"
779 "pxor %%mm2, %%mm0 \n\t"
780 "movq %%mm0, (%%eax, %1) \n\t" // line 2
782 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
783 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
784 "paddusb %%mm1, %%mm0 \n\t"
785 "pxor %%mm2, %%mm0 \n\t"
786 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
789 : "r" (src), "r" (stride)
// C path. l1..l7 are the offsets of lines 1..7 relative to src.
794 const int l1= stride;
795 const int l2= stride + l1;
796 const int l3= stride + l2;
797 const int l4= stride + l3;
798 const int l5= stride + l4;
799 const int l6= stride + l5;
800 const int l7= stride + l6;
801 // const int l8= stride + l7;
802 // const int l9= stride + l8;
806 for(x=0; x<BLOCK_SIZE; x++)
// a, b, c: differences across the 3/4, 4/5 and 5/6 line boundaries.
808 int a= src[l3] - src[l4];
809 int b= src[l4] - src[l5];
810 int c= src[l5] - src[l6];
812 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
// v carries the magnitude d with the sign opposite to b.
817 int v = d * SIGN(-b);
// NOTE(review): the lines below redeclare l1..l9 and use a C99-style
// "for(int x...", which suggests they are an alternative implementation that
// is commented out in the full file; the comment delimiters are not visible in
// this listing. Confirm before relying on it.
830 const int l1= stride;
831 const int l2= stride + l1;
832 const int l3= stride + l2;
833 const int l4= stride + l3;
834 const int l5= stride + l4;
835 const int l6= stride + l5;
836 const int l7= stride + l6;
837 const int l8= stride + l7;
838 const int l9= stride + l8;
839 for(int x=0; x<BLOCK_SIZE; x++)
848 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
850 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
851 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
852 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
853 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
862 * Experimental Filter 1 (Horizontal)
863 * will not damage linear gradients
864 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
866 * MMX2 version does correct clipping, C version doesn't
867 * not identical to the vertical one
// Horizontal counterpart of vertX1Filter, C path only in this listing.
// Per row it forms the differences a, b, c between the pixel pairs 1/2, 3/4
// and 5/6, then d = max(|b| - (|a| + |c|)/2, 0), and v = d with the sign
// opposite to b.
// The lines that apply v to the pixels and advance src are not visible here.
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
872 //FIXME (has little in common with the mmx2 version)
873 for(y=0; y<BLOCK_SIZE; y++)
875 int a= src[1] - src[2];
876 int b= src[3] - src[4];
877 int c= src[5] - src[6];
879 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
883 int v = d * SIGN(-b);
898 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
900 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
903 const int l1= stride;
904 const int l2= stride + l1;
905 const int l3= stride + l2;
906 const int l4= (int)tmp - (int)src - stride*3;
907 const int l5= (int)tmp - (int)src - stride*3 + 8;
908 const int l6= stride*3 + l3;
909 const int l7= stride + l6;
910 const int l8= stride + l7;
912 memcpy(tmp, src+stride*7, 8);
913 memcpy(tmp+8, src+stride*8, 8);
918 #if 0 //sligtly more accurate and slightly slower
919 "pxor %%mm7, %%mm7 \n\t" // 0
920 "leal (%0, %1), %%eax \n\t"
921 "leal (%%eax, %1, 4), %%ebx \n\t"
923 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
924 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
927 "movq (%0, %1, 2), %%mm0 \n\t" // l2
928 "movq (%0), %%mm1 \n\t" // l0
929 "movq %%mm0, %%mm2 \n\t" // l2
930 PAVGB(%%mm7, %%mm0) // ~l2/2
931 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
932 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
934 "movq (%%eax), %%mm1 \n\t" // l1
935 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
936 "movq %%mm1, %%mm4 \n\t" // l1
937 PAVGB(%%mm7, %%mm1) // ~l1/2
938 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
939 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
941 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
942 "psubusb %%mm1, %%mm0 \n\t"
943 "psubusb %%mm4, %%mm1 \n\t"
944 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
945 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
947 "movq (%0, %1, 4), %%mm0 \n\t" // l4
948 "movq %%mm0, %%mm4 \n\t" // l4
949 PAVGB(%%mm7, %%mm0) // ~l4/2
950 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
951 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
953 "movq (%%ebx), %%mm2 \n\t" // l5
954 "movq %%mm3, %%mm5 \n\t" // l3
955 PAVGB(%%mm7, %%mm3) // ~l3/2
956 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
957 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
959 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
960 "psubusb %%mm3, %%mm0 \n\t"
961 "psubusb %%mm6, %%mm3 \n\t"
962 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
963 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
964 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
966 "movq (%%ebx, %1), %%mm6 \n\t" // l6
967 "movq %%mm6, %%mm5 \n\t" // l6
968 PAVGB(%%mm7, %%mm6) // ~l6/2
969 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
970 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
972 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
973 "movq %%mm2, %%mm4 \n\t" // l5
974 PAVGB(%%mm7, %%mm2) // ~l5/2
975 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
976 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
978 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
979 "psubusb %%mm2, %%mm6 \n\t"
980 "psubusb %%mm4, %%mm2 \n\t"
981 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
982 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
985 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
986 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
987 "paddusb b01, %%mm4 \n\t"
988 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
989 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
990 "pand %%mm4, %%mm3 \n\t"
992 "movq %%mm3, %%mm1 \n\t"
993 // "psubusb b01, %%mm3 \n\t"
996 "paddusb %%mm1, %%mm3 \n\t"
997 // "paddusb b01, %%mm3 \n\t"
999 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
1000 "movq (%0, %1, 4), %%mm5 \n\t" //l4
1001 "movq (%0, %1, 4), %%mm4 \n\t" //l4
1002 "psubusb %%mm6, %%mm5 \n\t"
1003 "psubusb %%mm4, %%mm6 \n\t"
1004 "por %%mm6, %%mm5 \n\t" // |l3-l4|
1005 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
1006 "pxor %%mm6, %%mm0 \n\t"
1007 "pand %%mm0, %%mm3 \n\t"
1008 PMINUB(%%mm5, %%mm3, %%mm0)
1010 "psubusb b01, %%mm3 \n\t"
1013 "movq (%%eax, %1, 2), %%mm0 \n\t"
1014 "movq (%0, %1, 4), %%mm2 \n\t"
1015 "pxor %%mm6, %%mm0 \n\t"
1016 "pxor %%mm6, %%mm2 \n\t"
1017 "psubb %%mm3, %%mm0 \n\t"
1018 "paddb %%mm3, %%mm2 \n\t"
1019 "pxor %%mm6, %%mm0 \n\t"
1020 "pxor %%mm6, %%mm2 \n\t"
1021 "movq %%mm0, (%%eax, %1, 2) \n\t"
1022 "movq %%mm2, (%0, %1, 4) \n\t"
1025 "leal (%0, %1), %%eax \n\t"
1026 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
1028 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1029 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1032 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
1033 "movq (%0, %1, 4), %%mm0 \n\t" // l4
1034 "pxor %%mm6, %%mm1 \n\t" // -l3-1
1035 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
1036 // mm1=-l3-1, mm0=128-q
1038 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
1039 "movq (%%eax, %1), %%mm3 \n\t" // l2
1040 "pxor %%mm6, %%mm2 \n\t" // -l5-1
1041 "movq %%mm2, %%mm5 \n\t" // -l5-1
1042 "movq b80, %%mm4 \n\t" // 128
1043 "leal (%%eax, %1, 4), %%ebx \n\t"
1044 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
1045 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
1046 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
1047 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1048 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1050 "movq (%%eax), %%mm2 \n\t" // l1
1051 "pxor %%mm6, %%mm2 \n\t" // -l1-1
1052 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
1053 PAVGB((%0), %%mm1) // (l0-l3+256)/2
1054 "movq b80, %%mm3 \n\t" // 128
1055 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
1056 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
1057 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1058 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1060 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
1061 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
1062 "pxor %%mm6, %%mm1 \n\t" // -l7-1
1063 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
1064 "movq b80, %%mm2 \n\t" // 128
1065 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
1066 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
1067 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1068 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1070 "movq b00, %%mm1 \n\t" // 0
1071 "movq b00, %%mm5 \n\t" // 0
1072 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
1073 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
1074 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
1075 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
1076 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
1078 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1080 "movq b00, %%mm7 \n\t" // 0
1081 "movq pQPb, %%mm2 \n\t" // QP
1082 PAVGB(%%mm6, %%mm2) // 128 + QP/2
1083 "psubb %%mm6, %%mm2 \n\t"
1085 "movq %%mm4, %%mm1 \n\t"
1086 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
1087 "pxor %%mm1, %%mm4 \n\t"
1088 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
1089 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
1090 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1091 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1093 "movq %%mm4, %%mm3 \n\t" // d
1094 "psubusb b01, %%mm4 \n\t"
1095 PAVGB(%%mm7, %%mm4) // d/32
1096 PAVGB(%%mm7, %%mm4) // (d + 32)/64
1097 "paddb %%mm3, %%mm4 \n\t" // 5d/64
1098 "pand %%mm2, %%mm4 \n\t"
1100 "movq b80, %%mm5 \n\t" // 128
1101 "psubb %%mm0, %%mm5 \n\t" // q
1102 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
1103 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
1104 "pxor %%mm7, %%mm5 \n\t"
1106 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
1107 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
1109 "pand %%mm7, %%mm4 \n\t"
1110 "movq (%%eax, %1, 2), %%mm0 \n\t"
1111 "movq (%0, %1, 4), %%mm2 \n\t"
1112 "pxor %%mm1, %%mm0 \n\t"
1113 "pxor %%mm1, %%mm2 \n\t"
1114 "paddb %%mm4, %%mm0 \n\t"
1115 "psubb %%mm4, %%mm2 \n\t"
1116 "pxor %%mm1, %%mm0 \n\t"
1117 "pxor %%mm1, %%mm2 \n\t"
1118 "movq %%mm0, (%%eax, %1, 2) \n\t"
1119 "movq %%mm2, (%0, %1, 4) \n\t"
1122 : "r" (src), "r" (stride)
1130 for(x=0; x<BLOCK_SIZE; x++)
1132 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1133 if(ABS(middleEnergy)< 8*QP)
1135 const int q=(src[l4] - src[l5])/2;
1136 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1137 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1139 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1143 d*= SIGN(-middleEnergy);
1167 int d= src[x+y*stride] - tmp[x+(y-4)*8];
1178 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1182 if(num%1000000 == 0)
1184 printf(" %d %d %d %d\n", num, sum, max, bias);
1190 #elif defined (HAVE_MMX)
1194 "pxor %%mm7, %%mm7 \n\t"
1195 "leal (%0, %1), %%eax \n\t"
1196 "leal (%%eax, %1, 4), %%ebx \n\t"
1198 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1199 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1201 "movq (%0), %%mm0 \n\t"
1202 "movq %%mm0, %%mm1 \n\t"
1203 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1204 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1206 "movq (%%eax), %%mm2 \n\t"
1207 "movq %%mm2, %%mm3 \n\t"
1208 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1209 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1211 "movq (%%eax, %1), %%mm4 \n\t"
1212 "movq %%mm4, %%mm5 \n\t"
1213 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1214 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1216 "paddw %%mm0, %%mm0 \n\t" // 2L0
1217 "paddw %%mm1, %%mm1 \n\t" // 2H0
1218 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1219 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1220 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1221 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1223 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1224 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1225 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1226 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1228 "movq (%%eax, %1, 2), %%mm2 \n\t"
1229 "movq %%mm2, %%mm3 \n\t"
1230 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1231 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1233 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1234 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1235 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1236 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1237 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1238 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1240 "movq (%0, %1, 4), %%mm0 \n\t"
1241 "movq %%mm0, %%mm1 \n\t"
1242 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1243 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1245 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1246 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1247 "movq %%mm2, temp2 \n\t" // L3 - L4
1248 "movq %%mm3, temp3 \n\t" // H3 - H4
1249 "paddw %%mm4, %%mm4 \n\t" // 2L2
1250 "paddw %%mm5, %%mm5 \n\t" // 2H2
1251 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1252 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1254 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1255 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1256 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1257 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1259 "movq (%%ebx), %%mm2 \n\t"
1260 "movq %%mm2, %%mm3 \n\t"
1261 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1262 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1263 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1264 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1265 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1266 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1268 "movq (%%ebx, %1), %%mm6 \n\t"
1269 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1270 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1271 "movq (%%ebx, %1), %%mm6 \n\t"
1272 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1273 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1275 "paddw %%mm0, %%mm0 \n\t" // 2L4
1276 "paddw %%mm1, %%mm1 \n\t" // 2H4
1277 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1278 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1280 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1281 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1282 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1283 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1285 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1286 "movq %%mm2, %%mm3 \n\t"
1287 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1288 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1290 "paddw %%mm2, %%mm2 \n\t" // 2L7
1291 "paddw %%mm3, %%mm3 \n\t" // 2H7
1292 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1293 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1295 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1296 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1299 "movq %%mm7, %%mm6 \n\t" // 0
1300 "psubw %%mm0, %%mm6 \n\t"
1301 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1302 "movq %%mm7, %%mm6 \n\t" // 0
1303 "psubw %%mm1, %%mm6 \n\t"
1304 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1305 "movq %%mm7, %%mm6 \n\t" // 0
1306 "psubw %%mm2, %%mm6 \n\t"
1307 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1308 "movq %%mm7, %%mm6 \n\t" // 0
1309 "psubw %%mm3, %%mm6 \n\t"
1310 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1312 "movq %%mm7, %%mm6 \n\t" // 0
1313 "pcmpgtw %%mm0, %%mm6 \n\t"
1314 "pxor %%mm6, %%mm0 \n\t"
1315 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1316 "movq %%mm7, %%mm6 \n\t" // 0
1317 "pcmpgtw %%mm1, %%mm6 \n\t"
1318 "pxor %%mm6, %%mm1 \n\t"
1319 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1320 "movq %%mm7, %%mm6 \n\t" // 0
1321 "pcmpgtw %%mm2, %%mm6 \n\t"
1322 "pxor %%mm6, %%mm2 \n\t"
1323 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1324 "movq %%mm7, %%mm6 \n\t" // 0
1325 "pcmpgtw %%mm3, %%mm6 \n\t"
1326 "pxor %%mm6, %%mm3 \n\t"
1327 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1331 "pminsw %%mm2, %%mm0 \n\t"
1332 "pminsw %%mm3, %%mm1 \n\t"
1334 "movq %%mm0, %%mm6 \n\t"
1335 "psubusw %%mm2, %%mm6 \n\t"
1336 "psubw %%mm6, %%mm0 \n\t"
1337 "movq %%mm1, %%mm6 \n\t"
1338 "psubusw %%mm3, %%mm6 \n\t"
1339 "psubw %%mm6, %%mm1 \n\t"
1342 "movq %%mm7, %%mm6 \n\t" // 0
1343 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1344 "pxor %%mm6, %%mm4 \n\t"
1345 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1346 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1347 "pxor %%mm7, %%mm5 \n\t"
1348 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1350 "movd %2, %%mm2 \n\t" // QP
1351 "punpcklwd %%mm2, %%mm2 \n\t"
1352 "punpcklwd %%mm2, %%mm2 \n\t"
1353 "psllw $3, %%mm2 \n\t" // 8QP
1354 "movq %%mm2, %%mm3 \n\t" // 8QP
1355 "pcmpgtw %%mm4, %%mm2 \n\t"
1356 "pcmpgtw %%mm5, %%mm3 \n\t"
1357 "pand %%mm2, %%mm4 \n\t"
1358 "pand %%mm3, %%mm5 \n\t"
1361 "psubusw %%mm0, %%mm4 \n\t" // hd
1362 "psubusw %%mm1, %%mm5 \n\t" // ld
1365 "movq w05, %%mm2 \n\t" // 5
1366 "pmullw %%mm2, %%mm4 \n\t"
1367 "pmullw %%mm2, %%mm5 \n\t"
1368 "movq w20, %%mm2 \n\t" // 32
1369 "paddw %%mm2, %%mm4 \n\t"
1370 "paddw %%mm2, %%mm5 \n\t"
1371 "psrlw $6, %%mm4 \n\t"
1372 "psrlw $6, %%mm5 \n\t"
1375 "movq w06, %%mm2 \n\t" // 6
1376 "paddw %%mm2, %%mm4 \n\t"
1377 "paddw %%mm2, %%mm5 \n\t"
1378 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1379 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1380 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1381 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1384 "movq temp2, %%mm0 \n\t" // L3 - L4
1385 "movq temp3, %%mm1 \n\t" // H3 - H4
1387 "pxor %%mm2, %%mm2 \n\t"
1388 "pxor %%mm3, %%mm3 \n\t"
1390 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1391 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1392 "pxor %%mm2, %%mm0 \n\t"
1393 "pxor %%mm3, %%mm1 \n\t"
1394 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1395 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1396 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1397 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1399 "pxor %%mm6, %%mm2 \n\t"
1400 "pxor %%mm7, %%mm3 \n\t"
1401 "pand %%mm2, %%mm4 \n\t"
1402 "pand %%mm3, %%mm5 \n\t"
1405 "pminsw %%mm0, %%mm4 \n\t"
1406 "pminsw %%mm1, %%mm5 \n\t"
1408 "movq %%mm4, %%mm2 \n\t"
1409 "psubusw %%mm0, %%mm2 \n\t"
1410 "psubw %%mm2, %%mm4 \n\t"
1411 "movq %%mm5, %%mm2 \n\t"
1412 "psubusw %%mm1, %%mm2 \n\t"
1413 "psubw %%mm2, %%mm5 \n\t"
1415 "pxor %%mm6, %%mm4 \n\t"
1416 "pxor %%mm7, %%mm5 \n\t"
1417 "psubw %%mm6, %%mm4 \n\t"
1418 "psubw %%mm7, %%mm5 \n\t"
1419 "packsswb %%mm5, %%mm4 \n\t"
1420 "movq (%%eax, %1, 2), %%mm0 \n\t"
1421 "paddb %%mm4, %%mm0 \n\t"
1422 "movq %%mm0, (%%eax, %1, 2) \n\t"
1423 "movq (%0, %1, 4), %%mm0 \n\t"
1424 "psubb %%mm4, %%mm0 \n\t"
1425 "movq %%mm0, (%0, %1, 4) \n\t"
1428 : "r" (src), "r" (stride), "r" (QP)
1432 const int l1= stride;
1433 const int l2= stride + l1;
1434 const int l3= stride + l2;
1435 const int l4= stride + l3;
1436 const int l5= stride + l4;
1437 const int l6= stride + l5;
1438 const int l7= stride + l6;
1439 const int l8= stride + l7;
1440 // const int l9= stride + l8;
1443 for(x=0; x<BLOCK_SIZE; x++)
1445 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1446 if(ABS(middleEnergy) < 8*QP)
1448 const int q=(src[l4] - src[l5])/2;
1449 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1450 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1452 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1456 d*= SIGN(-middleEnergy);
1478 * Check if the given 8x8 Block is mostly "flat"
/*
 * isHorizDC: flatness test on a block (C version).
 * Each loop iteration inspects the 7 adjacent pixel pairs of src[0..7] and
 * increments numEq for every pair whose two pixels differ by at most 1.
 * Returns nonzero when numEq exceeds the global hFlatnessThreshold.
 * NOTE(review): the declarations of numEq/y and the per-iteration advance of
 * src (presumably by stride) are not visible in this view - confirm.
 */
1480 static inline int isHorizDC(uint8_t src[], int stride)
1484 for(y=0; y<BLOCK_SIZE; y++)
// (a - b + 1) & 0xFFFF is 0, 1 or 2 only when a - b is -1, 0 or +1; any more
// negative difference becomes a large value under the mask and fails "< 3".
1486 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1487 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1488 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1489 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1490 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1491 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1492 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
// "flat" means strictly more near-equal pairs than the threshold
1495 return numEq > hFlatnessThreshold;
/*
 * isHorizMinMaxOk: returns 0 when the two end pixels of the row, src[0] and
 * src[7], differ by more than 2*QP.
 * NOTE(review): only the end pixels are compared here, not a real min/max
 * scan; the accepting return path is not visible in this view. stride is not
 * used by the visible code.
 */
1498 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1500 if(abs(src[0] - src[7]) > 2*QP) return 0;
/*
 * doHorizDefFilter: horizontal default-mode deblocking filter (C version).
 * Works on the 8 pixels dst[0..7] of a row; the block edge lies between
 * dst[3] and dst[4].
 *
 * Three "energies" are computed with the same 2,-5,5,-2 pattern, each on four
 * neighbouring pixels:
 *   leftEnergy   on dst[0..3]
 *   middleEnergy on dst[2..5]   (straddles the edge)
 *   rightEnergy  on dst[4..7]
 * The row is only corrected when |middleEnergy| < 8*QP. The correction d
 * starts as |middleEnergy| minus the smaller of the two side energies and is
 * given the opposite sign of middleEnergy; q is half the step across the edge.
 *
 * Fix: middleEnergy used (dst[4] - dst[5]) as its first difference. The
 * pattern applied to dst[2..5] is 2*dst[2] - 5*dst[3] + 5*dst[4] - 2*dst[5],
 * i.e. 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]), which is what the vertical
 * C and MMX versions of this filter compute.
 *
 * NOTE(review): the remaining statements of the body (clamping of d, the
 * write-back and the row advance) are not visible in this view.
 */
1505 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1508 for(y=0; y<BLOCK_SIZE; y++)
// 2*dst[2] - 5*dst[3] + 5*dst[4] - 2*dst[5]
1510 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
1512 if(ABS(middleEnergy) < 8*QP)
// half of the step across the block edge
1514 const int q=(dst[3] - dst[4])/2;
1515 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1516 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1518 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1522 d*= SIGN(-middleEnergy);
1543 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1544 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
/*
 * doHorizLowPass: in-place horizontal 9-tap low pass (1,1,2,2,4,2,2,1,1)/16
 * over the 8 pixels dst[0..7] of a row, reading dst[-1] and dst[8] as the
 * outer neighbours.
 * NOTE(review): the declarations of y/sums and the per-row advance of dst are
 * not visible in this view.
 */
1546 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1550 for(y=0; y<BLOCK_SIZE; y++)
// Padding values for taps that fall outside dst[0..7]: the outer neighbour is
// used only if it is within QP of the edge pixel, otherwise the edge pixel
// itself is repeated.
1552 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1553 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// sums[i] = sum of two horizontally adjacent samples (with padding at both
// ends). They are taken before any dst[] write, so the outputs below are
// built from the unfiltered pixels.
1556 sums[0] = first + dst[0];
1557 sums[1] = dst[0] + dst[1];
1558 sums[2] = dst[1] + dst[2];
1559 sums[3] = dst[2] + dst[3];
1560 sums[4] = dst[3] + dst[4];
1561 sums[5] = dst[4] + dst[5];
1562 sums[6] = dst[5] + dst[6];
1563 sums[7] = dst[6] + dst[7];
1564 sums[8] = dst[7] + last;
// Each output is the 9-tap weighted sum (weights total 16) plus 8 for
// rounding, shifted right by 4. Taps beyond the row use first / last.
1566 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1567 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1568 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1569 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1570 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1571 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1572 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1573 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
/*
 * dering: deringing filter for one block.
 *
 * MMX2/3DNow path (as far as visible here):
 *   1. pQPb2 = saturating 2*pQPb.
 *   2. Byte-wise min (mm7) and max (mm6) are gathered over the eight rows
 *      src+1*stride .. src+8*stride, then reduced horizontally.
 *   3. max-min is compared against deringThreshold and a=(max+min)/2 is
 *      broadcast and saved in temp0.
 *   4. DERING_CORE is applied to eight rows; it builds a filtered value from
 *      horizontally (1,2,1)/4-blurred rows, limits it to dst +- pQPb2 and
 *      selects per byte between the filtered value and the original.
 * C path: computes min/max/avg, returns when the range is below
 *   deringThreshold, builds per-row bitmasks of "pixel > avg", and replaces a
 *   pixel by a 3x3 weighted sum limited to +-2*QP around the original value.
 * NOTE(review): several lines (asm volatile(, #else/#endif, branch
 * instructions, loop headers) are not visible in this view; the notes below
 * only describe what the visible lines do.
 */
1580 static inline void dering(uint8_t src[], int stride, int QP)
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// pQPb2 = 2*pQPb with unsigned saturation
1584 "movq pQPb, %%mm0 \n\t"
1585 "paddusb %%mm0, %%mm0 \n\t"
1586 "movq %%mm0, pQPb2 \n\t"
1588 "leal (%0, %1), %%eax \n\t"
1589 "leal (%%eax, %1, 4), %%ebx \n\t"
1590 // 0 1 2 3 4 5 6 7 8 9
1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm7 = all ones (running minimum), mm6 = 0 (running maximum)
1593 "pcmpeqb %%mm7, %%mm7 \n\t"
1594 "pxor %%mm6, %%mm6 \n\t"
// Two definitions of FIND_MIN_MAX follow: the first uses pminub/pmaxub, the
// second gets the same result with psubusb/paddb/psubb. Presumably they are
// alternatives selected by a preprocessor conditional that is not visible here.
1596 #define FIND_MIN_MAX(addr)\
1597 "movq " #addr ", %%mm0 \n\t"\
1598 "pminub %%mm0, %%mm7 \n\t"\
1599 "pmaxub %%mm0, %%mm6 \n\t"
1601 #define FIND_MIN_MAX(addr)\
1602 "movq " #addr ", %%mm0 \n\t"\
1603 "movq %%mm7, %%mm1 \n\t"\
1604 "psubusb %%mm0, %%mm6 \n\t"\
1605 "paddb %%mm0, %%mm6 \n\t"\
1606 "psubusb %%mm0, %%mm1 \n\t"\
1607 "psubb %%mm1, %%mm7 \n\t"
// rows 1..8 of the block (see the address table above)
1610 FIND_MIN_MAX((%%eax))
1611 FIND_MIN_MAX((%%eax, %1))
1612 FIND_MIN_MAX((%%eax, %1, 2))
1613 FIND_MIN_MAX((%0, %1, 4))
1614 FIND_MIN_MAX((%%ebx))
1615 FIND_MIN_MAX((%%ebx, %1))
1616 FIND_MIN_MAX((%%ebx, %1, 2))
1617 FIND_MIN_MAX((%0, %1, 8))
// Horizontal reduction of the per-column minima in mm7. Both a
// pminub/pshufw form and a psubusb/psubb form appear below; presumably they
// are alternatives under a conditional that is not visible here.
1619 "movq %%mm7, %%mm4 \n\t"
1620 "psrlq $8, %%mm7 \n\t"
1622 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1623 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1624 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1625 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1626 "pminub %%mm4, %%mm7 \n\t"
1628 "movq %%mm7, %%mm1 \n\t"
1629 "psubusb %%mm4, %%mm1 \n\t"
1630 "psubb %%mm1, %%mm7 \n\t"
1631 "movq %%mm7, %%mm4 \n\t"
1632 "psrlq $16, %%mm7 \n\t"
1633 "movq %%mm7, %%mm1 \n\t"
1634 "psubusb %%mm4, %%mm1 \n\t"
1635 "psubb %%mm1, %%mm7 \n\t"
1636 "movq %%mm7, %%mm4 \n\t"
1637 "psrlq $32, %%mm7 \n\t"
1638 "movq %%mm7, %%mm1 \n\t"
1639 "psubusb %%mm4, %%mm1 \n\t"
1640 "psubb %%mm1, %%mm7 \n\t"
// Same horizontal reduction for the per-column maxima in mm6.
1644 "movq %%mm6, %%mm4 \n\t"
1645 "psrlq $8, %%mm6 \n\t"
1647 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1648 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1649 "pmaxub %%mm4, %%mm6 \n\t"
1650 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1651 "pmaxub %%mm4, %%mm6 \n\t"
1653 "psubusb %%mm4, %%mm6 \n\t"
1654 "paddb %%mm4, %%mm6 \n\t"
1655 "movq %%mm6, %%mm4 \n\t"
1656 "psrlq $16, %%mm6 \n\t"
1657 "psubusb %%mm4, %%mm6 \n\t"
1658 "paddb %%mm4, %%mm6 \n\t"
1659 "movq %%mm6, %%mm4 \n\t"
1660 "psrlq $32, %%mm6 \n\t"
1661 "psubusb %%mm4, %%mm6 \n\t"
1662 "paddb %%mm4, %%mm6 \n\t"
// The low byte of (max - min) is compared with deringThreshold.
// NOTE(review): the conditional jump that uses this comparison is not visible.
1664 "movq %%mm6, %%mm0 \n\t" // max
1665 "psubb %%mm7, %%mm6 \n\t" // max - min
1666 "movd %%mm6, %%ecx \n\t"
1667 "cmpb deringThreshold, %%cl \n\t"
// a is replicated from the low byte into all 8 bytes of mm7 and kept in temp0
1669 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1670 "punpcklbw %%mm7, %%mm7 \n\t"
1671 "punpcklbw %%mm7, %%mm7 \n\t"
1672 "punpcklbw %%mm7, %%mm7 \n\t"
1673 "movq %%mm7, temp0 \n\t"
// Row 0: build the left/right shifted copies (borrowing the bytes at -1 and +8),
// blur with (1,2,1)/4 into mm1, and sum the three "pixel <= a" masks into mm0.
1675 "movq (%0), %%mm0 \n\t" // L10
1676 "movq %%mm0, %%mm1 \n\t" // L10
1677 "movq %%mm0, %%mm2 \n\t" // L10
1678 "psllq $8, %%mm1 \n\t"
1679 "psrlq $8, %%mm2 \n\t"
1680 "movd -4(%0), %%mm3 \n\t"
1681 "movd 8(%0), %%mm4 \n\t"
1682 "psrlq $24, %%mm3 \n\t"
1683 "psllq $56, %%mm4 \n\t"
1684 "por %%mm3, %%mm1 \n\t" // L00
1685 "por %%mm4, %%mm2 \n\t" // L20
1686 "movq %%mm1, %%mm3 \n\t" // L00
1687 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1688 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1689 "psubusb %%mm7, %%mm0 \n\t"
1690 "psubusb %%mm7, %%mm2 \n\t"
1691 "psubusb %%mm7, %%mm3 \n\t"
1692 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
1693 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
1694 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
1695 "paddb %%mm2, %%mm0 \n\t"
1696 "paddb %%mm3, %%mm0 \n\t"
// Row 1: same preparation, blurred row in mm3 and mask sum in mm2.
1698 "movq (%%eax), %%mm2 \n\t" // L11
1699 "movq %%mm2, %%mm3 \n\t" // L11
1700 "movq %%mm2, %%mm4 \n\t" // L11
1701 "psllq $8, %%mm3 \n\t"
1702 "psrlq $8, %%mm4 \n\t"
1703 "movd -4(%%eax), %%mm5 \n\t"
1704 "movd 8(%%eax), %%mm6 \n\t"
1705 "psrlq $24, %%mm5 \n\t"
1706 "psllq $56, %%mm6 \n\t"
1707 "por %%mm5, %%mm3 \n\t" // L01
1708 "por %%mm6, %%mm4 \n\t" // L21
1709 "movq %%mm3, %%mm5 \n\t" // L01
1710 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1711 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1712 "psubusb %%mm7, %%mm2 \n\t"
1713 "psubusb %%mm7, %%mm4 \n\t"
1714 "psubusb %%mm7, %%mm5 \n\t"
1715 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
1716 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
1717 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
1718 "paddb %%mm4, %%mm2 \n\t"
1719 "paddb %%mm5, %%mm2 \n\t"
// DERING_CORE(dst, src, ppsx, psx, sx, pplx, plx, lx, t0, t1)
//   dst        row to rewrite
//   src        row below dst; it is loaded and prepared like rows 0/1 above
//   pplx/plx   blurred rows carried in from earlier steps, lx: blurred src row
//   ppsx/psx   mask sums carried in from earlier steps, sx: mask sum of src row
//   t0, t1     scratch registers
// temp1 temporarily holds the blurred src row while lx is reused for a and 0.
1721 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1722 "movq " #src ", " #sx " \n\t" /* src[0] */\
1723 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1724 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1725 "psllq $8, " #lx " \n\t"\
1726 "psrlq $8, " #t0 " \n\t"\
1727 "movd -4" #src ", " #t1 " \n\t"\
1728 "psrlq $24, " #t1 " \n\t"\
1729 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1730 "movd 8" #src ", " #t1 " \n\t"\
1731 "psllq $56, " #t1 " \n\t"\
1732 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1733 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1734 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1735 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1737 "movq " #lx ", temp1 \n\t"\
1738 "movq temp0, " #lx " \n\t"\
1739 "psubusb " #lx ", " #t1 " \n\t"\
1740 "psubusb " #lx ", " #t0 " \n\t"\
1741 "psubusb " #lx ", " #sx " \n\t"\
1742 "movq b00, " #lx " \n\t"\
1743 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1744 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1745 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1746 "paddb " #t1 ", " #t0 " \n\t"\
1747 "paddb " #t0 ", " #sx " \n\t"\
1749 PAVGB(plx, pplx) /* filtered */\
1750 "movq " #dst ", " #t0 " \n\t" /* dst */\
1751 "movq " #t0 ", " #t1 " \n\t" /* dst */\
1752 "psubusb pQPb2, " #t0 " \n\t"\
1753 "paddusb pQPb2, " #t1 " \n\t"\
1755 PMINUB(t1, pplx, t0)\
1756 "paddb " #sx ", " #ppsx " \n\t"\
1757 "paddb " #psx ", " #ppsx " \n\t"\
1758 "#paddb b02, " #ppsx " \n\t"\
1759 "pand b08, " #ppsx " \n\t"\
1760 "pcmpeqb " #lx ", " #ppsx " \n\t"\
1761 "pand " #ppsx ", " #pplx " \n\t"\
1762 "pandn " #dst ", " #ppsx " \n\t"\
1763 "por " #pplx ", " #ppsx " \n\t"\
1764 "movq " #ppsx ", " #dst " \n\t"\
1765 "movq temp1, " #lx " \n\t"
// Eight steps, dst = rows 1..8, src = the row below each; the three register
// triples rotate so that each step reuses what the previous two produced.
1782 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1783 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1784 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1785 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1786 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1787 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1788 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1789 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1790 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
// operands: %0 = src, %1 = stride, %2 = QP; eax, ebx and ecx are clobbered
1793 : : "r" (src), "r" (stride), "r" (QP)
1794 : "%eax", "%ebx", "%ecx"
// ---- C version ----
// running maximum / minimum of the scanned pixels
1811 if(*p > max) max= *p;
1812 if(*p < min) min= *p;
// rounded midpoint of the value range
1815 avg= (min + max + 1)/2;
// nothing to do when the range is below the threshold
1817 if(max - min <deringThreshold) return;
// bit x of t marks a pixel above the midpoint
1826 if(*p > avg) t |= (1<<x);
// keep only bits whose left and right neighbour bits are also set
1830 t &= (t<<1) & (t>>1);
// combine the masks of three consecutive rows
1837 int t = s[y-1] & s[y] & s[y+1];
// 3x3 weighted sum, weights (1 2 1 / 2 4 2 / 1 2 1), total 16
1846 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1847 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1848 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
// Statistics gathered only when DEBUG_DERING_THRESHOLD is defined.
1851 #ifdef DEBUG_DERING_THRESHOLD
1852 asm volatile("emms\n\t":);
1854 static long long numPixels=0;
1855 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1856 // if((max-min)<20 || (max-min)*QP<200)
1857 // if((max-min)*QP < 500)
1861 static int numSkiped=0;
1862 static int errorSum=0;
1863 static int worstQP=0;
1864 static int worstRange=0;
1865 static int worstDiff=0;
1867 int absDiff= ABS(diff);
1868 int error= diff*diff;
1870 if(x==1 || x==8 || y==1 || y==8) continue;
1873 if(absDiff > worstDiff)
1877 worstRange= max-min;
// NOTE(review): this modulo divides by numSkiped; confirm it cannot be 0 here
// (the code that updates numSkiped is not visible in this view).
1881 if(1024LL*1024LL*1024LL % numSkiped == 0)
1883 printf( "sum:%1.3f, skip:%d, wQP:%d, "
1884 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1885 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1886 worstDiff, (float)numSkiped/numPixels);
// move the pixel towards f, but by no more than 2*QP
1891 if (*p + 2*QP < f) *p= *p + 2*QP;
1892 else if(*p - 2*QP > f) *p= *p - 2*QP;
1897 #ifdef DEBUG_DERING_THRESHOLD
1908 *p = MIN(*p + 20, 255);
1911 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1918 * Deinterlaces the given block
1919 * will be called for every 8x8 block and can read & write from line 4-15
1920 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1921 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
/*
 * deInterlaceInterpolateLinear: rewrites every second row of the block
 * (rows 1, 3, 5 and 7 relative to src) from the rows directly above and below.
 * The C version stores (above + below) >> 1.
 * NOTE(review): in the asm path only the loads and stores are visible in this
 * view; the averaging step between them is not shown here - confirm against
 * the full file.
 */
1923 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1925 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1928 "leal (%0, %1), %%eax \n\t"
1929 "leal (%%eax, %1, 4), %%ebx \n\t"
1930 // 0 1 2 3 4 5 6 7 8 9
1931 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// loads come from rows 0, 2, 4, 6, 8; stores go to rows 1, 3, 5, 7
1933 "movq (%0), %%mm0 \n\t"
1934 "movq (%%eax, %1), %%mm1 \n\t"
1936 "movq %%mm0, (%%eax) \n\t"
1937 "movq (%0, %1, 4), %%mm0 \n\t"
1939 "movq %%mm1, (%%eax, %1, 2) \n\t"
1940 "movq (%%ebx, %1), %%mm1 \n\t"
1942 "movq %%mm0, (%%ebx) \n\t"
1943 "movq (%0, %1, 8), %%mm0 \n\t"
1945 "movq %%mm1, (%%ebx, %1, 2) \n\t"
// operands: %0 = src, %1 = stride
1947 : : "r" (src), "r" (stride)
// C version: odd row = truncating average of its two vertical neighbours
1955 src[stride] = (src[0] + src[stride*2])>>1;
1956 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1957 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1958 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1965 * Deinterlaces the given block
1966 * will be called for every 8x8 block and can read & write from line 4-15
1967 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1968 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1969 * this filter will read lines 3-15 and write 7-13
1970 * no clipping in C version
/*
 * deInterlaceInterpolateCubic: rewrites rows 3, 5, 7 and 9 (relative to src)
 * from the four rows at distance -3, -1, +1, +3 with weights (-1, 9, 9, -1)/16.
 * The asm path packs the result with unsigned saturation (packuswb); the C
 * path stores the shifted sum directly into the uint8_t pixel without
 * clamping.
 */
1972 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
1974 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1977 "leal (%0, %1), %%eax \n\t"
1978 "leal (%%eax, %1, 4), %%ebx \n\t"
1979 "leal (%%ebx, %1, 4), %%ecx \n\t"
1980 "addl %1, %%ecx \n\t"
1981 "pxor %%mm7, %%mm7 \n\t"
1982 // 0 1 2 3 4 5 6 7 8 9 10
1983 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
// DEINT_CUBIC(a,b,c,d,e): reads rows a, b, d, e and writes row c.
// mm7 must be zero (it is the unpack partner for the byte -> word widening).
1985 #define DEINT_CUBIC(a,b,c,d,e)\
1986 "movq " #a ", %%mm0 \n\t"\
1987 "movq " #b ", %%mm1 \n\t"\
1988 "movq " #d ", %%mm2 \n\t"\
1989 "movq " #e ", %%mm3 \n\t"\
1990 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1991 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
1992 "movq %%mm0, %%mm2 \n\t"\
1993 "punpcklbw %%mm7, %%mm0 \n\t"\
1994 "punpckhbw %%mm7, %%mm2 \n\t"\
1995 "movq %%mm1, %%mm3 \n\t"\
1996 "punpcklbw %%mm7, %%mm1 \n\t"\
1997 "punpckhbw %%mm7, %%mm3 \n\t"\
1998 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1999 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
2000 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
2001 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
2002 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
2003 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
2004 "packuswb %%mm3, %%mm1 \n\t"\
2005 "movq %%mm1, " #c " \n\t"
// rows (0,2,4,6)->3, (2,4,6,8)->5, (4,6,8,10)->7, (6,8,10,12)->9
2007 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
2008 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
2009 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
2010 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
// operands: %0 = src, %1 = stride; eax, ebx and ecx are clobbered
2012 : : "r" (src), "r" (stride)
2013 : "%eax", "%ebx", "ecx"
// C version: same (-1, 9, 9, -1)/16 weights. The sum can fall outside 0..255
// and is stored without clamping.
2020 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
2021 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
2022 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
2023 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2030 * Deinterlaces the given block
2031 * will be called for every 8x8 block and can read & write from line 4-15
2032 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2033 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2034 * will shift the image up by 1 line (FIXME if this is a problem)
2035 * this filter will read lines 4-13 and write 4-11
/*
 * deInterlaceBlendLinear: vertical (1,2,1)/4 blend written back in place.
 * Row n (n = 0..7 relative to src) becomes a blend of the original rows
 * n, n+1 and n+2, so rows 0..9 are read and rows 0..7 are written.
 * The asm path builds each result from two PAVGB steps; the C path uses
 * (a + 2*b + c) >> 2.
 */
2037 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
2039 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2042 "leal (%0, %1), %%eax \n\t"
2043 "leal (%%eax, %1, 4), %%ebx \n\t"
2044 // 0 1 2 3 4 5 6 7 8 9
2045 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm0/mm1/mm2 rotate through three consecutive source rows, so every row is
// loaded once and the rows already overwritten are not read again.
2047 "movq (%0), %%mm0 \n\t" // L0
2048 "movq (%%eax, %1), %%mm1 \n\t" // L2
2049 PAVGB(%%mm1, %%mm0) // L0+L2
2050 "movq (%%eax), %%mm2 \n\t" // L1
2052 "movq %%mm0, (%0) \n\t"
2053 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
2054 PAVGB(%%mm0, %%mm2) // L1+L3
2055 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
2056 "movq %%mm2, (%%eax) \n\t"
2057 "movq (%0, %1, 4), %%mm2 \n\t" // L4
2058 PAVGB(%%mm2, %%mm1) // L2+L4
2059 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2060 "movq %%mm1, (%%eax, %1) \n\t"
2061 "movq (%%ebx), %%mm1 \n\t" // L5
2062 PAVGB(%%mm1, %%mm0) // L3+L5
2063 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2064 "movq %%mm0, (%%eax, %1, 2) \n\t"
2065 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2066 PAVGB(%%mm0, %%mm2) // L4+L6
2067 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2068 "movq %%mm2, (%0, %1, 4) \n\t"
2069 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2070 PAVGB(%%mm2, %%mm1) // L5+L7
2071 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2072 "movq %%mm1, (%%ebx) \n\t"
2073 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2074 PAVGB(%%mm1, %%mm0) // L6+L8
2075 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2076 "movq %%mm0, (%%ebx, %1) \n\t"
2077 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2078 PAVGB(%%mm0, %%mm2) // L7+L9
2079 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2080 "movq %%mm2, (%%ebx, %1, 2) \n\t"
// operands: %0 = src, %1 = stride
2083 : : "r" (src), "r" (stride)
// C version: rows are processed top to bottom, so each statement only reads
// rows that have not been overwritten yet (apart from its own target row).
2091 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2092 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2093 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2094 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2095 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2096 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2097 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2098 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2105 * Deinterlaces the given block
2106 * will be called for every 8x8 block and can read & write from line 4-15,
2107 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2108 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
/*
 * deInterlaceMedian: replaces rows 1, 3, 5 and 7 (relative to src) by the
 * byte-wise median of the row itself and its neighbours above and below.
 *   - first asm path: median via pmaxub/pminub,
 *     min(max(a,b), max(c, min(a,b)))
 *   - second asm path ("MMX without MMX2"): MEDIAN macro built from
 *     psubusb/pcmpeqb comparison masks
 * NOTE(review): the plain C code at the end of this block is a (1,2,1)/4
 * vertical blend over rows 0..7, not a median - it does not match the two asm
 * paths. Confirm whether that is an intended placeholder.
 */
2110 static inline void deInterlaceMedian(uint8_t src[], int stride)
2116 "leal (%0, %1), %%eax \n\t"
2117 "leal (%%eax, %1, 4), %%ebx \n\t"
2118 // 0 1 2 3 4 5 6 7 8 9
2119 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// row 1 = median(row 0, row 1, row 2); row 2 stays in mm2 for the next step
2121 "movq (%0), %%mm0 \n\t" //
2122 "movq (%%eax, %1), %%mm2 \n\t" //
2123 "movq (%%eax), %%mm1 \n\t" //
2124 "movq %%mm0, %%mm3 \n\t"
2125 "pmaxub %%mm1, %%mm0 \n\t" //
2126 "pminub %%mm3, %%mm1 \n\t" //
2127 "pmaxub %%mm2, %%mm1 \n\t" //
2128 "pminub %%mm1, %%mm0 \n\t"
2129 "movq %%mm0, (%%eax) \n\t"
// row 3 = median(row 2, row 3, row 4); row 4 stays in mm0
2131 "movq (%0, %1, 4), %%mm0 \n\t" //
2132 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2133 "movq %%mm2, %%mm3 \n\t"
2134 "pmaxub %%mm1, %%mm2 \n\t" //
2135 "pminub %%mm3, %%mm1 \n\t" //
2136 "pmaxub %%mm0, %%mm1 \n\t" //
2137 "pminub %%mm1, %%mm2 \n\t"
2138 "movq %%mm2, (%%eax, %1, 2) \n\t"
// row 5 = median(row 4, row 5, row 6); row 6 stays in mm1
2140 "movq (%%ebx), %%mm2 \n\t" //
2141 "movq (%%ebx, %1), %%mm1 \n\t" //
2142 "movq %%mm2, %%mm3 \n\t"
2143 "pmaxub %%mm0, %%mm2 \n\t" //
2144 "pminub %%mm3, %%mm0 \n\t" //
2145 "pmaxub %%mm1, %%mm0 \n\t" //
2146 "pminub %%mm0, %%mm2 \n\t"
2147 "movq %%mm2, (%%ebx) \n\t"
// row 7 = median(row 6, row 7, row 8)
2149 "movq (%%ebx, %1, 2), %%mm2 \n\t" //
2150 "movq (%0, %1, 8), %%mm0 \n\t" //
2151 "movq %%mm2, %%mm3 \n\t"
2152 "pmaxub %%mm0, %%mm2 \n\t" //
2153 "pminub %%mm3, %%mm0 \n\t" //
2154 "pmaxub %%mm1, %%mm0 \n\t" //
2155 "pminub %%mm0, %%mm2 \n\t"
2156 "movq %%mm2, (%%ebx, %1, 2) \n\t"
// operands: %0 = src, %1 = stride
2159 : : "r" (src), "r" (stride)
2163 #else // MMX without MMX2
2165 "leal (%0, %1), %%eax \n\t"
2166 "leal (%%eax, %1, 4), %%ebx \n\t"
2167 // 0 1 2 3 4 5 6 7 8 9
2168 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2169 "pxor %%mm7, %%mm7 \n\t"
// MEDIAN(a,b,c): reads rows a, b, c and writes the result back to row b.
// mm7 must be zero; the psubusb/pcmpeqb pairs yield per-byte "<=" masks that
// are combined to select the middle value.
2171 #define MEDIAN(a,b,c)\
2172 "movq " #a ", %%mm0 \n\t"\
2173 "movq " #b ", %%mm2 \n\t"\
2174 "movq " #c ", %%mm1 \n\t"\
2175 "movq %%mm0, %%mm3 \n\t"\
2176 "movq %%mm1, %%mm4 \n\t"\
2177 "movq %%mm2, %%mm5 \n\t"\
2178 "psubusb %%mm1, %%mm3 \n\t"\
2179 "psubusb %%mm2, %%mm4 \n\t"\
2180 "psubusb %%mm0, %%mm5 \n\t"\
2181 "pcmpeqb %%mm7, %%mm3 \n\t"\
2182 "pcmpeqb %%mm7, %%mm4 \n\t"\
2183 "pcmpeqb %%mm7, %%mm5 \n\t"\
2184 "movq %%mm3, %%mm6 \n\t"\
2185 "pxor %%mm4, %%mm3 \n\t"\
2186 "pxor %%mm5, %%mm4 \n\t"\
2187 "pxor %%mm6, %%mm5 \n\t"\
2188 "por %%mm3, %%mm1 \n\t"\
2189 "por %%mm4, %%mm2 \n\t"\
2190 "por %%mm5, %%mm0 \n\t"\
2191 "pand %%mm2, %%mm0 \n\t"\
2192 "pand %%mm1, %%mm0 \n\t"\
2193 "movq %%mm0, " #b " \n\t"
// rows (0,1,2)->1, (2,3,4)->3, (4,5,6)->5, (6,7,8)->7
2195 MEDIAN((%0), (%%eax), (%%eax, %1))
2196 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2197 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2198 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
2200 : : "r" (src), "r" (stride)
// C fallback: (1,2,1)/4 vertical blend of rows n, n+1, n+2 into row n.
2210 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2211 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2212 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2213 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2214 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2215 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2216 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2217 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2225 * transposes and shifts the given 8x8 Block into dst1 and dst2
/*
 * transpose1: reads 8 rows of 8 bytes from src (row pitch srcStride),
 * transposes them and scatters the result into dst1 and dst2.
 * Each transposed line is written as two 4-byte halves: the first half comes
 * from source rows 0..3, the second (at +4 bytes) from source rows 4..7.
 * The store offsets step by 16 bytes per transposed line:
 *   dst1: offsets 128, 144, 160, 176, 192   (source columns 0..4)
 *   dst2: offsets  48,  64,  80,  96, 112   (source columns 3..7)
 * Source columns 3 and 4 are therefore written to both buffers.
 * NOTE(review): the 16-byte line pitch of dst1/dst2 is implied by these
 * offsets only; the buffers are declared elsewhere.
 */
2227 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2230 "leal (%0, %1), %%eax \n\t"
2231 "leal (%%eax, %1, 4), %%ebx \n\t"
2232 // 0 1 2 3 4 5 6 7 8 9
2233 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// ---- source rows 0..3: interleave bytes, then words, to get 4-byte columns
2234 "movq (%0), %%mm0 \n\t" // 12345678
2235 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2236 "movq %%mm0, %%mm2 \n\t" // 12345678
2237 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2238 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2240 "movq (%%eax, %1), %%mm1 \n\t"
2241 "movq (%%eax, %1, 2), %%mm3 \n\t"
2242 "movq %%mm1, %%mm4 \n\t"
2243 "punpcklbw %%mm3, %%mm1 \n\t"
2244 "punpckhbw %%mm3, %%mm4 \n\t"
2246 "movq %%mm0, %%mm3 \n\t"
2247 "punpcklwd %%mm1, %%mm0 \n\t"
2248 "punpckhwd %%mm1, %%mm3 \n\t"
2249 "movq %%mm2, %%mm1 \n\t"
2250 "punpcklwd %%mm4, %%mm2 \n\t"
2251 "punpckhwd %%mm4, %%mm1 \n\t"
// each register now holds two columns (low and high dword); %2 = dst1, %3 = dst2
2253 "movd %%mm0, 128(%2) \n\t"
2254 "psrlq $32, %%mm0 \n\t"
2255 "movd %%mm0, 144(%2) \n\t"
2256 "movd %%mm3, 160(%2) \n\t"
2257 "psrlq $32, %%mm3 \n\t"
2258 "movd %%mm3, 176(%2) \n\t"
2259 "movd %%mm3, 48(%3) \n\t"
2260 "movd %%mm2, 192(%2) \n\t"
2261 "movd %%mm2, 64(%3) \n\t"
2262 "psrlq $32, %%mm2 \n\t"
2263 "movd %%mm2, 80(%3) \n\t"
2264 "movd %%mm1, 96(%3) \n\t"
2265 "psrlq $32, %%mm1 \n\t"
2266 "movd %%mm1, 112(%3) \n\t"
// ---- source rows 4..7: same procedure, stored 4 bytes further right
2268 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2269 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2270 "movq %%mm0, %%mm2 \n\t" // 12345678
2271 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2272 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2274 "movq (%%ebx, %1), %%mm1 \n\t"
2275 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2276 "movq %%mm1, %%mm4 \n\t"
2277 "punpcklbw %%mm3, %%mm1 \n\t"
2278 "punpckhbw %%mm3, %%mm4 \n\t"
2280 "movq %%mm0, %%mm3 \n\t"
2281 "punpcklwd %%mm1, %%mm0 \n\t"
2282 "punpckhwd %%mm1, %%mm3 \n\t"
2283 "movq %%mm2, %%mm1 \n\t"
2284 "punpcklwd %%mm4, %%mm2 \n\t"
2285 "punpckhwd %%mm4, %%mm1 \n\t"
2287 "movd %%mm0, 132(%2) \n\t"
2288 "psrlq $32, %%mm0 \n\t"
2289 "movd %%mm0, 148(%2) \n\t"
2290 "movd %%mm3, 164(%2) \n\t"
2291 "psrlq $32, %%mm3 \n\t"
2292 "movd %%mm3, 180(%2) \n\t"
2293 "movd %%mm3, 52(%3) \n\t"
2294 "movd %%mm2, 196(%2) \n\t"
2295 "movd %%mm2, 68(%3) \n\t"
2296 "psrlq $32, %%mm2 \n\t"
2297 "movd %%mm2, 84(%3) \n\t"
2298 "movd %%mm1, 100(%3) \n\t"
2299 "psrlq $32, %%mm1 \n\t"
2300 "movd %%mm1, 116(%3) \n\t"
// operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2
2303 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2309 * transposes the given 8x8 block
/*
 * Transposes an 8x8 byte block read from src with a fixed 16-byte row pitch
 * (offsets 0, 16, ..., 112) and writes it to dst with row pitch dstStride.
 * Source rows 0..3 end up in bytes 0..3 of the eight destination rows,
 * source rows 4..7 in bytes 4..7.
 * Asm operands: %0 = dst, %1 = dstStride, %2 = src.
 * NOTE(review): this listing lacks the braces, the asm statement wrapper and
 * the clobber list around the instruction strings - confirm against the full file.
 */
2311 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
// eax = dst + dstStride (row 1), ebx = eax + 4*dstStride (row 5)
2314 "leal (%0, %1), %%eax \n\t"
2315 "leal (%%eax, %1, 4), %%ebx \n\t"
2316 // 0 1 2 3 4 5 6 7 8 9
2317 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// --- first pass: source rows 0..3 (offsets 0, 16, 32, 48) ---
// byte-interleave rows 0 and 1
2318 "movq (%2), %%mm0 \n\t" // 12345678
2319 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2320 "movq %%mm0, %%mm2 \n\t" // 12345678
2321 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2322 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
// byte-interleave rows 2 and 3
2324 "movq 32(%2), %%mm1 \n\t"
2325 "movq 48(%2), %%mm3 \n\t"
2326 "movq %%mm1, %%mm4 \n\t"
2327 "punpcklbw %%mm3, %%mm1 \n\t"
2328 "punpckhbw %%mm3, %%mm4 \n\t"
// word-interleave: each 32-bit half now holds one source column of rows 0..3
// (mm0 = columns 0|1, mm3 = columns 2|3, mm2 = columns 4|5, mm1 = columns 6|7)
2330 "movq %%mm0, %%mm3 \n\t"
2331 "punpcklwd %%mm1, %%mm0 \n\t"
2332 "punpckhwd %%mm1, %%mm3 \n\t"
2333 "movq %%mm2, %%mm1 \n\t"
2334 "punpcklwd %%mm4, %%mm2 \n\t"
2335 "punpckhwd %%mm4, %%mm1 \n\t"
// source column k becomes bytes 0..3 of destination row k
2337 "movd %%mm0, (%0) \n\t"
2338 "psrlq $32, %%mm0 \n\t"
2339 "movd %%mm0, (%%eax) \n\t"
2340 "movd %%mm3, (%%eax, %1) \n\t"
2341 "psrlq $32, %%mm3 \n\t"
2342 "movd %%mm3, (%%eax, %1, 2) \n\t"
2343 "movd %%mm2, (%0, %1, 4) \n\t"
2344 "psrlq $32, %%mm2 \n\t"
2345 "movd %%mm2, (%%ebx) \n\t"
2346 "movd %%mm1, (%%ebx, %1) \n\t"
2347 "psrlq $32, %%mm1 \n\t"
2348 "movd %%mm1, (%%ebx, %1, 2) \n\t"
// --- second pass: source rows 4..7 (offsets 64, 80, 96, 112) -> bytes 4..7 ---
2351 "movq 64(%2), %%mm0 \n\t" // 12345678
2352 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2353 "movq %%mm0, %%mm2 \n\t" // 12345678
2354 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2355 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2357 "movq 96(%2), %%mm1 \n\t"
2358 "movq 112(%2), %%mm3 \n\t"
2359 "movq %%mm1, %%mm4 \n\t"
2360 "punpcklbw %%mm3, %%mm1 \n\t"
2361 "punpckhbw %%mm3, %%mm4 \n\t"
2363 "movq %%mm0, %%mm3 \n\t"
2364 "punpcklwd %%mm1, %%mm0 \n\t"
2365 "punpckhwd %%mm1, %%mm3 \n\t"
2366 "movq %%mm2, %%mm1 \n\t"
2367 "punpcklwd %%mm4, %%mm2 \n\t"
2368 "punpckhwd %%mm4, %%mm1 \n\t"
2370 "movd %%mm0, 4(%0) \n\t"
2371 "psrlq $32, %%mm0 \n\t"
2372 "movd %%mm0, 4(%%eax) \n\t"
2373 "movd %%mm3, 4(%%eax, %1) \n\t"
2374 "psrlq $32, %%mm3 \n\t"
2375 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2376 "movd %%mm2, 4(%0, %1, 4) \n\t"
2377 "psrlq $32, %%mm2 \n\t"
2378 "movd %%mm2, 4(%%ebx) \n\t"
2379 "movd %%mm1, 4(%%ebx, %1) \n\t"
2380 "psrlq $32, %%mm1 \n\t"
2381 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
// input operands: %0 = dst, %1 = dstStride, %2 = src
2383 :: "r" (dst), "r" (dstStride), "r" (src)
2388 //static int test=0;
/*
 * Temporal noise reducer for one block of 8 rows x 8 bytes.
 * It measures how much src differs from the stored reference tempBlured,
 * smooths that measure with the four neighbouring entries of tempBluredPast,
 * stores the smoothed value back into *tempBluredPast, and then copies or
 * blends the block depending on how the value compares with the noise
 * thresholds. tempBlured is written; in the blending cases src is
 * overwritten with the same result.
 *
 * src            - current block, row pitch stride
 * tempBlured     - reference block, addressed with the same stride
 * tempBluredPast - entry holding the smoothed difference for this block;
 *                  entries at -1, +1, -256 and +256 are read as neighbours
 * maxNoise       - thresholds; NOTE(review): the asm path compares against
 *                  the symbol maxTmpNoise instead of this parameter
 *
 * NOTE(review): this listing lacks several structural lines (braces, the asm
 * statement wrapper, jump instructions and labels, some #else/#endif lines and
 * the loop headers of the C path), so the control flow between the groups
 * below cannot be read off directly - confirm against the full file.
 */
2390 static void inline tempNoiseReducer(uint8_t *src, int stride,
2391 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2393 #define FAST_L2_DIFF
2394 //#define L1_DIFF //u should change the thresholds too if u try that one
2395 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// asm operands: %0 = src, %1 = tempBlured, %2 = stride, %3 = tempBluredPast (memory)
// row offsets that cannot be expressed as a scale factor are kept in registers
2397 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
2398 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride
2399 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2400 // 0 1 2 3 4 5 6 7 8 9
2401 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2
// Variant 1: sum of absolute differences over the 8 rows (psadbw), accumulated in mm0
2403 #ifdef L1_DIFF //needs mmx2
2404 "movq (%0), %%mm0 \n\t" // L0
2405 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2406 "movq (%0, %2), %%mm1 \n\t" // L1
2407 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2408 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2409 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2410 "movq (%0, %%eax), %%mm3 \n\t" // L3
2411 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
2413 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2414 "paddw %%mm1, %%mm0 \n\t"
2415 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2416 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2417 "paddw %%mm2, %%mm0 \n\t"
2418 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5|
2419 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2420 "paddw %%mm3, %%mm0 \n\t"
2421 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
2422 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2423 "paddw %%mm4, %%mm0 \n\t"
2424 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
2425 "paddw %%mm5, %%mm6 \n\t"
2426 "paddw %%mm7, %%mm6 \n\t"
2427 "paddw %%mm6, %%mm0 \n\t"
// Variant 2: approximate squared difference. Each row of b is bitwise
// inverted (mm7 = all ones), averaged with a via PAVGB, offset by b80 and then
// squared with pmaddwd; the per-row results are shifted right by 14 and
// accumulated in mm0.
2428 #elif defined (FAST_L2_DIFF)
2429 "pcmpeqb %%mm7, %%mm7 \n\t"
2430 "movq b80, %%mm6 \n\t"
2431 "pxor %%mm0, %%mm0 \n\t"
2432 #define L2_DIFF_CORE(a, b)\
2433 "movq " #a ", %%mm5 \n\t"\
2434 "movq " #b ", %%mm2 \n\t"\
2435 "pxor %%mm7, %%mm2 \n\t"\
2436 PAVGB(%%mm2, %%mm5)\
2437 "paddb %%mm6, %%mm5 \n\t"\
2438 "movq %%mm5, %%mm2 \n\t"\
2439 "psllw $8, %%mm5 \n\t"\
2440 "pmaddwd %%mm5, %%mm5 \n\t"\
2441 "pmaddwd %%mm2, %%mm2 \n\t"\
2442 "paddd %%mm2, %%mm5 \n\t"\
2443 "psrld $14, %%mm5 \n\t"\
2444 "paddd %%mm5, %%mm0 \n\t"
// one invocation per row 0..7
2446 L2_DIFF_CORE((%0), (%1))
2447 L2_DIFF_CORE((%0, %2), (%1, %2))
2448 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2449 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2450 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2451 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2452 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2453 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
// Variant 3 (presumably the #else branch; the directive is not visible here):
// exact sum of squared differences. Bytes are widened to words against the
// zeroed mm7, subtracted, squared and summed with pmaddwd, accumulated in mm0.
2456 "pxor %%mm7, %%mm7 \n\t"
2457 "pxor %%mm0, %%mm0 \n\t"
2458 #define L2_DIFF_CORE(a, b)\
2459 "movq " #a ", %%mm5 \n\t"\
2460 "movq " #b ", %%mm2 \n\t"\
2461 "movq %%mm5, %%mm1 \n\t"\
2462 "movq %%mm2, %%mm3 \n\t"\
2463 "punpcklbw %%mm7, %%mm5 \n\t"\
2464 "punpckhbw %%mm7, %%mm1 \n\t"\
2465 "punpcklbw %%mm7, %%mm2 \n\t"\
2466 "punpckhbw %%mm7, %%mm3 \n\t"\
2467 "psubw %%mm2, %%mm5 \n\t"\
2468 "psubw %%mm3, %%mm1 \n\t"\
2469 "pmaddwd %%mm5, %%mm5 \n\t"\
2470 "pmaddwd %%mm1, %%mm1 \n\t"\
2471 "paddd %%mm1, %%mm5 \n\t"\
2472 "paddd %%mm5, %%mm0 \n\t"
// one invocation per row 0..7
2474 L2_DIFF_CORE((%0), (%1))
2475 L2_DIFF_CORE((%0, %2), (%1, %2))
2476 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2477 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2478 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2479 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2480 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2481 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
// add the two 32-bit halves of mm0 -> ecx = d (difference measure of this block)
2485 "movq %%mm0, %%mm4 \n\t"
2486 "psrlq $32, %%mm0 \n\t"
2487 "paddd %%mm0, %%mm4 \n\t"
2488 "movd %%mm4, %%ecx \n\t"
// ecx = (4*d + past[-1] + past[+1] + past[-256] + past[+256] + 4) >> 3,
// with past = tempBluredPast (byte offsets -4/+4/-1024/+1024 on 32-bit entries);
// the result is written back to *tempBluredPast
2489 "shll $2, %%ecx \n\t"
2490 "movl %3, %%ebx \n\t"
2491 "addl -4(%%ebx), %%ecx \n\t"
2492 "addl 4(%%ebx), %%ecx \n\t"
2493 "addl -1024(%%ebx), %%ecx \n\t"
2494 "addl $4, %%ecx \n\t"
2495 "addl 1024(%%ebx), %%ecx \n\t"
2496 "shrl $3, %%ecx \n\t"
2497 "movl %%ecx, (%%ebx) \n\t"
// ebx was used as a pointer above, so the 5*stride offset is rebuilt here
2498 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride
2500 // "movl %3, %%ecx \n\t"
2501 // "movl %%ecx, test \n\t"
// compare the smoothed difference with the values at maxTmpNoise+4 and
// maxTmpNoise+8 (presumably maxTmpNoise[1] and [2] of a 32-bit int array - confirm);
// the conditional jumps that follow each compare are not visible in this listing
2503 "cmpl 4+maxTmpNoise, %%ecx \n\t"
2505 "cmpl 8+maxTmpNoise, %%ecx \n\t"
// Group A: no filtering - the 8 rows of src are copied into tempBlured
// (ecx held the difference value, so 7*stride is rebuilt first)
2508 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2509 "movq (%0), %%mm0 \n\t" // L0
2510 "movq (%0, %2), %%mm1 \n\t" // L1
2511 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2512 "movq (%0, %%eax), %%mm3 \n\t" // L3
2513 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2514 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2515 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2516 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2517 "movq %%mm0, (%1) \n\t" // L0
2518 "movq %%mm1, (%1, %2) \n\t" // L1
2519 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2520 "movq %%mm3, (%1, %%eax) \n\t" // L3
2521 "movq %%mm4, (%1, %2, 4) \n\t" // L4
2522 "movq %%mm5, (%1, %%ebx) \n\t" // L5
2523 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
2524 "movq %%mm7, (%1, %%ecx) \n\t" // L7
// Group B: each row becomes the rounded average (pavgb) of src and tempBlured;
// the result is stored to both buffers
2528 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2529 "movq (%0), %%mm0 \n\t" // L0
2530 "pavgb (%1), %%mm0 \n\t" // L0
2531 "movq (%0, %2), %%mm1 \n\t" // L1
2532 "pavgb (%1, %2), %%mm1 \n\t" // L1
2533 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2534 "pavgb (%1, %2, 2), %%mm2 \n\t" // L2
2535 "movq (%0, %%eax), %%mm3 \n\t" // L3
2536 "pavgb (%1, %%eax), %%mm3 \n\t" // L3
2537 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2538 "pavgb (%1, %2, 4), %%mm4 \n\t" // L4
2539 "movq (%0, %%ebx), %%mm5 \n\t" // L5
2540 "pavgb (%1, %%ebx), %%mm5 \n\t" // L5
2541 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2542 "pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6
2543 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2544 "pavgb (%1, %%ecx), %%mm7 \n\t" // L7
2545 "movq %%mm0, (%1) \n\t" // R0
2546 "movq %%mm1, (%1, %2) \n\t" // R1
2547 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2548 "movq %%mm3, (%1, %%eax) \n\t" // R3
2549 "movq %%mm4, (%1, %2, 4) \n\t" // R4
2550 "movq %%mm5, (%1, %%ebx) \n\t" // R5
2551 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
2552 "movq %%mm7, (%1, %%ecx) \n\t" // R7
2553 "movq %%mm0, (%0) \n\t" // L0
2554 "movq %%mm1, (%0, %2) \n\t" // L1
2555 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2556 "movq %%mm3, (%0, %%eax) \n\t" // L3
2557 "movq %%mm4, (%0, %2, 4) \n\t" // L4
2558 "movq %%mm5, (%0, %%ebx) \n\t" // L5
2559 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
2560 "movq %%mm7, (%0, %%ecx) \n\t" // L7
// compare with the value at maxTmpNoise+0; the jump that follows is not visible here
2564 "cmpl maxTmpNoise, %%ecx \n\t"
// Group C: rows 0..3 of src (mm0..mm3) and tempBlured (mm4..mm7) are loaded,
// combined, and the result in mm0..mm3 is stored to both buffers; then the
// same for rows 4..7.
// NOTE(review): the instructions that combine the registers between the loads
// and the stores are not visible in this listing.
2567 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2568 "movq (%0), %%mm0 \n\t" // L0
2569 "movq (%0, %2), %%mm1 \n\t" // L1
2570 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2571 "movq (%0, %%eax), %%mm3 \n\t" // L3
2572 "movq (%1), %%mm4 \n\t" // R0
2573 "movq (%1, %2), %%mm5 \n\t" // R1
2574 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2575 "movq (%1, %%eax), %%mm7 \n\t" // R3
2584 "movq %%mm0, (%1) \n\t" // R0
2585 "movq %%mm1, (%1, %2) \n\t" // R1
2586 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2587 "movq %%mm3, (%1, %%eax) \n\t" // R3
2588 "movq %%mm0, (%0) \n\t" // L0
2589 "movq %%mm1, (%0, %2) \n\t" // L1
2590 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2591 "movq %%mm3, (%0, %%eax) \n\t" // L3
2593 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2594 "movq (%0, %%ebx), %%mm1 \n\t" // L5
2595 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2596 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2597 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2598 "movq (%1, %%ebx), %%mm5 \n\t" // R5
2599 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2600 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2609 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2610 "movq %%mm1, (%1, %%ebx) \n\t" // R5
2611 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2612 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2613 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2614 "movq %%mm1, (%0, %%ebx) \n\t" // L5
2615 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2616 "movq %%mm3, (%0, %%ecx) \n\t" // L7
// Group D: same load/store skeleton as group C; the combining instructions
// are likewise not visible in this listing
2620 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride
2621 "movq (%0), %%mm0 \n\t" // L0
2622 "movq (%0, %2), %%mm1 \n\t" // L1
2623 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2624 "movq (%0, %%eax), %%mm3 \n\t" // L3
2625 "movq (%1), %%mm4 \n\t" // R0
2626 "movq (%1, %2), %%mm5 \n\t" // R1
2627 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2628 "movq (%1, %%eax), %%mm7 \n\t" // R3
2641 "movq %%mm0, (%1) \n\t" // R0
2642 "movq %%mm1, (%1, %2) \n\t" // R1
2643 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2644 "movq %%mm3, (%1, %%eax) \n\t" // R3
2645 "movq %%mm0, (%0) \n\t" // L0
2646 "movq %%mm1, (%0, %2) \n\t" // L1
2647 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2648 "movq %%mm3, (%0, %%eax) \n\t" // L3
2650 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2651 "movq (%0, %%ebx), %%mm1 \n\t" // L5
2652 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2653 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2654 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2655 "movq (%1, %%ebx), %%mm5 \n\t" // R5
2656 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2657 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2670 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2671 "movq %%mm1, (%1, %%ebx) \n\t" // R5
2672 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2673 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2674 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2675 "movq %%mm1, (%0, %%ebx) \n\t" // L5
2676 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2677 "movq %%mm3, (%0, %%ecx) \n\t" // L7
// inputs as listed; eax/ebx/ecx are scratch registers and memory is modified
2681 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2682 : "%eax", "%ebx", "%ecx", "memory"
2684 //printf("%d\n", test);
// ---- plain C path (loop headers and conditions are not visible in this listing) ----
// difference measurement: reference and current pixel at (x, y)
2696 int ref= tempBlured[ x + y*stride ];
2697 int cur= src[ x + y*stride ];
2699 // if(x==0 || x==7) d1+= d1>>1;
2700 // if(y==0 || y==7) d1+= d1>>1;
// neighbouring entries of tempBluredPast that enter the smoothed difference
// (same -256/-1/+1/+256 neighbourhood as the asm path)
2709 +(*(tempBluredPast-256))
2710 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2711 +(*(tempBluredPast+256))
2714 // ((*tempBluredPast)*3 + d + 2)>>2;
2716 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2720 64 32 16 8 4 2 1 (1)
2721 64 48 36 27 20 15 11 (33) (approx)
2722 64 56 49 43 37 33 29 (200) (approx)
// blend whose expression is not visible here; the result goes to both buffers
2733 int ref= tempBlured[ x + y*stride ];
2734 int cur= src[ x + y*stride ];
2735 tempBlured[ x + y*stride ]=
2736 src[ x + y*stride ]=
// no filtering: the current pixel replaces the reference
2748 tempBlured[ x + y*stride ]= src[ x + y*stride ];
// 7:1 blend of reference and current pixel, rounded, stored to both buffers
2762 int ref= tempBlured[ x + y*stride ];
2763 int cur= src[ x + y*stride ];
2764 tempBlured[ x + y*stride ]=
2765 src[ x + y*stride ]=
2766 (ref*7 + cur + 4)>>3;
// 3:1 blend of reference and current pixel, rounded, stored to both buffers
2777 int ref= tempBlured[ x + y*stride ];
2778 int cur= src[ x + y*stride ];
2779 tempBlured[ x + y*stride ]=
2780 src[ x + y*stride ]=
2781 (ref*3 + cur + 2)>>2;
2789 #ifdef HAVE_ODIVX_POSTPROCESS
2790 #include "../opendivx/postprocess.h"
// Forward declaration: postProcess() is called by postprocess()/postprocess2()
// below and is defined further down in this file.
2794 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2795 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2797 /* -pp Command line Help
2798 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2800 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2803 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2805 -pp vb:a,hb:a,lb -pp de,-vb
2810 short long name short long option Description
2811 * * a autoq cpu power dependent enabler
2812 c chrom chrominance filtering enabled
2813 y nochrom chrominance filtering disabled
2814 hb hdeblock horizontal deblocking filter
2815 vb vdeblock vertical deblocking filter
2817 h1 x1hdeblock Experimental horizontal deblock filter 1
2818 v1 x1vdeblock Experimental vertical deblock filter 1
2819 dr dering not implemented yet
2820 al autolevels automatic brightness / contrast fixer
2821 f fullyrange stretch luminance range to (0..255)
2822 lb linblenddeint linear blend deinterlacer
2823 li linipoldeint linear interpolating deinterlacer
2824 ci cubicipoldeint cubic interpolating deinterlacer
2825 md mediandeint median deinterlacer
2826 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2827 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2828 tn tmpnoise (3 Thresholds) Temporal Noise Reducer
2832 * returns a PPMode struct which will have a non-zero error variable if an error occurred
2833 * name is the string after "-pp" on the command line
2834 * quality is a number from 0 to GET_PP_QUALITY_MAX
/*
 * Parses a "-pp" filter string of the form
 *   <filter>[:<option>...][,[-]<filter>[:<option>...]]...
 * into a PPMode. For every filter name found in filters[] the matching mask
 * bit is cleared and, unless the filter is disabled, set again in lumMode /
 * chromMode when the quality reaches the filter's minimum. Names found in
 * replaceTable[] are expanded in place and parsed again. Unknown filter names
 * and unrecognised options are counted in ppMode.error.
 *
 * name    - filter description string (copied into a local buffer first)
 * quality - quality used for filters that carry the "autoq"/"a" option;
 *           all other filters use a fixed, very high quality value
 *
 * NOTE(review): this listing lacks short lines (braces, several declarations
 * and assignments, the final return), so parts of the control flow are not
 * visible - confirm against the full file.
 */
2836 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2838 char temp[GET_MODE_BUFFER_SIZE];
2840 char *filterDelimiters= ",";
2841 char *optionDelimiters= ":";
2842 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
// NOTE(review): strncpy does not terminate temp when name has
// GET_MODE_BUFFER_SIZE or more characters; no explicit terminator is visible here.
2845 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
// debug output
2847 printf("%s\n", name);
// per-filter state (inside the loop over all filters)
2851 int q= 1000000; //GET_PP_QUALITY_MAX;
2854 char *options[OPTIONS_ARRAY_SIZE];
2857 int numOfUnknownOptions=0;
2858 int enable=1; //does the user want us to enabled or disabled the filter
// The outer tokenizer is restarted from p on every iteration because the
// strtok() calls on ':' below replace strtok's internal state; p is therefore
// advanced by hand past the token and its delimiter.
2860 filterToken= strtok(p, filterDelimiters);
2861 if(filterToken == NULL) break;
2862 p+= strlen(filterToken) + 1; // p points to next filterToken
// NOTE(review): strtok returns NULL when filterToken consists only of ':'
// characters; filterName is used below without a visible NULL check.
2863 filterName= strtok(filterToken, optionDelimiters);
2864 printf("%s::%s\n", filterToken, filterName);
// a leading '-' presumably marks the filter as disabled (the body of this
// branch is not visible here; see the enable flag above)
2866 if(*filterName == '-')
2872 for(;;){ //for all options
2873 option= strtok(NULL, optionDelimiters);
2874 if(option == NULL) break;
2876 printf("%s\n", option);
// generic options handled here; anything else is collected in options[]
// for the filter-specific handling further down
2877 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2878 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2879 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2882 options[numOfUnknownOptions] = option;
2883 numOfUnknownOptions++;
// stop early so that one slot stays free for the NULL terminator
2885 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2887 options[numOfUnknownOptions] = NULL;
2889 /* replace stuff from the replace Table */
// replaceTable holds (name, replacement) string pairs terminated by NULL
2890 for(i=0; replaceTable[2*i]!=NULL; i++)
2892 if(!strcmp(replaceTable[2*i], filterName))
2894 int newlen= strlen(replaceTable[2*i + 1]);
// NOTE(review): p is only ever advanced above, so the NULL test presumably
// never fires - confirm. Otherwise p steps back onto the delimiter slot and
// restores the ',' so the replacement can be inserted in front of the rest.
2898 if(p==NULL) p= temp, *p=0; //last filter
2899 else p--, *p=','; //not last filter
// Despite its name, spaceLeft is the number of bytes already in use.
// NOTE(review): the (int) casts truncate pointers wider than int.
2902 spaceLeft= (int)p - (int)temp + plen;
2903 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// shift the unparsed remainder (including its terminator) right and insert
// the replacement text at p, so the next loop iteration parses it
2908 memmove(p + newlen, p, plen+1);
2909 memcpy(p, replaceTable[2*i + 1], newlen);
// look the name up in the filter table (terminated by a NULL shortName)
2914 for(i=0; filters[i].shortName!=NULL; i++)
2916 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
2917 if( !strcmp(filters[i].longName, filterName)
2918 || !strcmp(filters[i].shortName, filterName))
// always clear the filter first; this alone implements "-filter"
2920 ppMode.lumMode &= ~filters[i].mask;
2921 ppMode.chromMode &= ~filters[i].mask;
2924 if(!enable) break; // user wants to disable it
// enable for luma / chroma only if the quality is high enough;
// chrom == -1 means "use the filter's default for chroma"
2926 if(q >= filters[i].minLumQuality)
2927 ppMode.lumMode|= filters[i].mask;
2928 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2929 if(q >= filters[i].minChromQuality)
2930 ppMode.chromMode|= filters[i].mask;
// autolevels: default output range 16..234, widened to 0..255 by "fullyrange"/"f"
2932 if(filters[i].mask == LEVEL_FIX)
2935 ppMode.minAllowedY= 16;
2936 ppMode.maxAllowedY= 234;
2937 for(o=0; options[o]!=NULL; o++)
2938 if( !strcmp(options[o],"fullyrange")
2939 ||!strcmp(options[o],"f"))
2941 ppMode.minAllowedY= 0;
2942 ppMode.maxAllowedY= 255;
// the option was recognised after all, so it no longer counts as unknown
2943 numOfUnknownOptions--;
// tmpnoise: up to three numeric thresholds override the defaults
2946 else if(filters[i].mask == TEMP_NOISE_FILTER)
2950 ppMode.maxTmpNoise[0]= 150;
2951 ppMode.maxTmpNoise[1]= 200;
2952 ppMode.maxTmpNoise[2]= 400;
2954 for(o=0; options[o]!=NULL; o++)
// NOTE(review): the slot is assigned before tail is checked, so an option
// that is not a number stores strtol's 0 into the current threshold.
2957 ppMode.maxTmpNoise[numOfNoises]=
2958 strtol(options[o], &tail, 0);
2959 if(tail!=options[o])
2962 numOfUnknownOptions--;
2963 if(numOfNoises >= 3) break;
// unknown filter names and leftover options are reported through error
2969 if(!filterNameOk) ppMode.error++;
2970 ppMode.error += numOfUnknownOptions;
// mirror the deblock/dering selection into the flag set of the old
// OpenDivX postprocessor
2973 #ifdef HAVE_ODIVX_POSTPROCESS
2974 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2975 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2976 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2977 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2978 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2979 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2986 * Obsolete, don't use it, use postprocess2() instead
/*
 * Legacy entry point: builds a PPMode from an integer mode value and runs
 * postProcess() on the three planes src[0..2] -> dst[0..2].
 * The luma mode is the mode value itself; the chroma mode takes bits 4..7 of
 * the low byte shifted down into bits 0..3 and keeps the upper 24 bits.
 * The chroma planes are processed with halved width and height.
 * NOTE(review): this listing lacks short lines (the last parameter, braces,
 * conditions, comment delimiters), so it is not visible which of the
 * statements below are conditional or commented out - confirm against the
 * full file.
 */
2988 void postprocess(unsigned char * src[], int src_stride,
2989 unsigned char * dst[], int dst_stride,
2990 int horizontal_size, int vertical_size,
2991 QP_STORE_T *QP_store, int QP_stride,
2994 struct PPMode ppMode;
// all-zero QP table, substituted below for QP_store (presumably when the
// caller passes none - the condition is not visible here)
2995 static QP_STORE_T zeroArray[2048/8];
// test/debug sequence: parse a fixed filter string, print the resulting mode
// and hand over to postprocess2()
2999 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
3003 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
3004 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
3005 postprocess2(src, src_stride, dst, dst_stride,
3006 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
3012 QP_store= zeroArray;
// luma uses mode as given; chroma uses the high nibble of the low byte
3016 ppMode.lumMode= mode;
3017 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
3018 ppMode.chromMode= mode;
// fixed temporal-noise thresholds for this legacy path
3019 ppMode.maxTmpNoise[0]= 700;
3020 ppMode.maxTmpNoise[1]= 1500;
3021 ppMode.maxTmpNoise[2]= 3000;
3023 #ifdef HAVE_ODIVX_POSTPROCESS
3024 // Note: I could make this shit outside of this file, but it would mean one
3025 // more function call...
3027 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
// luma plane (isColor = 0)
3032 postProcess(src[0], src_stride, dst[0], dst_stride,
3033 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
// chroma planes have half the luma dimensions
3035 horizontal_size >>= 1;
3036 vertical_size >>= 1;
// chroma planes (isColor = 1 and 2)
3042 postProcess(src[1], src_stride, dst[1], dst_stride,
3043 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
3044 postProcess(src[2], src_stride, dst[2], dst_stride,
3045 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
// alternative: fill both chroma planes with the constant 128
// (the condition selecting this path is not visible here)
3049 memset(dst[1], 128, dst_stride*vertical_size);
3050 memset(dst[2], 128, dst_stride*vertical_size);
3051 // memcpy(dst[1], src[1], src_stride*horizontal_size);
3052 // memcpy(dst[2], src[2], src_stride*horizontal_size);
/*
 * Runs postProcess() on the three planes src[0..2] -> dst[0..2] using the
 * caller-supplied PPMode. Plane 0 is processed at full size, planes 1 and 2
 * with halved width and height.
 * NOTE(review): this listing lacks short lines (braces, conditions, part of
 * the odivx_postprocess call), so it is not visible which statements are
 * conditional - confirm against the full file.
 */
3056 void postprocess2(unsigned char * src[], int src_stride,
3057 unsigned char * dst[], int dst_stride,
3058 int horizontal_size, int vertical_size,
3059 QP_STORE_T *QP_store, int QP_stride,
3060 struct PPMode *mode)
// all-zero QP table, substituted below for QP_store (presumably when the
// caller passes none - the condition is not visible here)
3063 static QP_STORE_T zeroArray[2048/8];
3066 QP_store= zeroArray;
3070 #ifdef HAVE_ODIVX_POSTPROCESS
3071 // Note: I could make this shit outside of this file, but it would mean one
3072 // more function call...
3074 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
// luma plane (isColor = 0)
3080 postProcess(src[0], src_stride, dst[0], dst_stride,
3081 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
// chroma planes have half the luma dimensions
3083 horizontal_size >>= 1;
3084 vertical_size >>= 1;
// chroma planes (isColor = 1 and 2)
3088 postProcess(src[1], src_stride, dst[1], dst_stride,
3089 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3090 postProcess(src[2], src_stride, dst[2], dst_stride,
3091 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
3096 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
/*
 * Maps a quality level to a set of mode flags by table lookup. The tables
 * have 1+GET_PP_QUALITY_MAX entries, and each visible entry adds one more
 * filter flag to the previous one. With HAVE_ODIVX_POSTPROCESS and use_old_pp
 * set, the table with the PP_* flags of the old postprocessor is used instead.
 * NOTE(review): quality is used as an array index without a range check in
 * the visible code. Some initializer entries and preprocessor lines are
 * missing from this listing - confirm against the full file.
 */
3099 int getPpModeForQuality(int quality){
3100 int modes[1+GET_PP_QUALITY_MAX]= {
// first alternative ordering of the table
3103 // horizontal filters first
3105 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3106 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3107 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3108 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3109 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
// second alternative ordering (presumably selected by a preprocessor switch
// that is not visible here)
3111 // vertical filters first
3113 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3114 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3115 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3116 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3117 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
// equivalent table expressed in the flags of the old OpenDivX postprocessor
3121 #ifdef HAVE_ODIVX_POSTPROCESS
3122 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3125 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3126 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3127 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3128 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3129 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3131 if(use_old_pp) return odivx_modes[quality];
3133 return modes[quality];
3137 * Copies a block from src to dst and fixes the black level
3138 * numLines must be a multiple of 4
3139 * levelFix == 0 -> don't touch the brightness & contrast
/*
 * Copies a block of rows from src to dst. Two MMX variants are visible, each
 * handling 8 rows of 8 bytes:
 *  - a scaled copy that widens the bytes to words, subtracts packedYOffset,
 *    shifts left by 6, takes the high half of the product with packedYScale
 *    and packs the result back to bytes with unsigned saturation;
 *  - a plain 8-byte-per-row copy.
 * Each variant is followed by a memcpy form that copies BLOCK_SIZE bytes per
 * row without scaling.
 * In the asm, %0/%2 address the source rows and %1/%3 the destination rows
 * (see the argument order of the SCALED_CPY / SIMPLE_CPY invocations).
 * NOTE(review): this listing lacks short lines (the last parameter, braces,
 * the asm statement wrappers and operand lists, #if/#else lines, loop
 * headers); presumably levelFix selects the scaled variant and HAVE_MMX the
 * asm forms - confirm against the full file.
 */
3141 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
// ---- scaled copy ----
// eax = second source row, ebx = second destination row;
// mm2 = offset, mm3 = scale, mm4 = 0 (used to widen bytes to words)
3151 "leal (%0,%2), %%eax \n\t"
3152 "leal (%1,%3), %%ebx \n\t"
3153 "movq packedYOffset, %%mm2 \n\t"
3154 "movq packedYScale, %%mm3 \n\t"
3155 "pxor %%mm4, %%mm4 \n\t"
// SCALED_CPY processes two rows: src1 -> dst1 (mm0/mm5 = low/high half) and
// src2 -> dst2 (mm1/mm6). The four invocations cover rows 0+1, 2+3, 4+5 and,
// after eax/ebx have been advanced by four rows, 6+7.
3157 #define SCALED_CPY(src1, src2, dst1, dst2) \
3158 "movq " #src1 ", %%mm0 \n\t"\
3159 "movq " #src1 ", %%mm5 \n\t"\
3160 "punpcklbw %%mm4, %%mm0 \n\t"\
3161 "punpckhbw %%mm4, %%mm5 \n\t"\
3162 "psubw %%mm2, %%mm0 \n\t"\
3163 "psubw %%mm2, %%mm5 \n\t"\
3164 "movq " #src2 ", %%mm1 \n\t"\
3165 "psllw $6, %%mm0 \n\t"\
3166 "psllw $6, %%mm5 \n\t"\
3167 "pmulhw %%mm3, %%mm0 \n\t"\
3168 "movq " #src2 ", %%mm6 \n\t"\
3169 "pmulhw %%mm3, %%mm5 \n\t"\
3170 "punpcklbw %%mm4, %%mm1 \n\t"\
3171 "punpckhbw %%mm4, %%mm6 \n\t"\
3172 "psubw %%mm2, %%mm1 \n\t"\
3173 "psubw %%mm2, %%mm6 \n\t"\
3174 "psllw $6, %%mm1 \n\t"\
3175 "psllw $6, %%mm6 \n\t"\
3176 "pmulhw %%mm3, %%mm1 \n\t"\
3177 "pmulhw %%mm3, %%mm6 \n\t"\
3178 "packuswb %%mm5, %%mm0 \n\t"\
3179 "packuswb %%mm6, %%mm1 \n\t"\
3180 "movq %%mm0, " #dst1 " \n\t"\
3181 "movq %%mm1, " #dst2 " \n\t"\
3183 SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3184 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3185 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3186 "leal (%%eax,%2,4), %%eax \n\t"
3187 "leal (%%ebx,%3,4), %%ebx \n\t"
3188 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
// memcpy form following the scaled asm: row i is copied without scaling
3199 memcpy( &(dst[dstStride*i]),
3200 &(src[srcStride*i]), BLOCK_SIZE);
// ---- plain copy ----
// SIMPLE_CPY moves two 8-byte rows per invocation; same row schedule as above
3207 "leal (%0,%2), %%eax \n\t"
3208 "leal (%1,%3), %%ebx \n\t"
3210 #define SIMPLE_CPY(src1, src2, dst1, dst2) \
3211 "movq " #src1 ", %%mm0 \n\t"\
3212 "movq " #src2 ", %%mm1 \n\t"\
3213 "movq %%mm0, " #dst1 " \n\t"\
3214 "movq %%mm1, " #dst2 " \n\t"\
3216 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3217 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3218 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3219 "leal (%%eax,%2,4), %%eax \n\t"
3220 "leal (%%ebx,%3,4), %%ebx \n\t"
3221 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
// memcpy form of the plain copy
3231 memcpy( &(dst[dstStride*i]),
3232 &(src[srcStride*i]), BLOCK_SIZE);
3239 * Filters array of bytes (Y or U or V values)
3241 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3242 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3245 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3247 /* we need 64 bits here, otherwise we'll have a problem
3248 after watching a black picture for 5 hours */
3249 static uint64_t *yHistogram= NULL;
3250 int black=0, white=255; // blackest black and whitest white in the picture
3251 int QPCorrecture= 256;
3253 /* Temporary buffers for handling the last row(s) */
3254 static uint8_t *tempDst= NULL;
3255 static uint8_t *tempSrc= NULL;
3257 /* Temporary buffers for handling the last block */
3258 static uint8_t *tempDstBlock= NULL;
3259 static uint8_t *tempSrcBlock= NULL;
3261 /* Temporal noise reducing buffers */
3262 static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
3263 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
3267 #ifdef PP_FUNNY_STRIDE
3268 uint8_t *dstBlockPtrBackup;
3269 uint8_t *srcBlockPtrBackup;
3273 long long T0, T1, diffTime=0;
3276 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3281 maxTmpNoise[0]= ppMode->maxTmpNoise[0];
3282 maxTmpNoise[1]= ppMode->maxTmpNoise[1];
3283 maxTmpNoise[2]= ppMode->maxTmpNoise[2];
3286 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3287 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
3288 else if( (mode & V_DEBLOCK)
3289 || (mode & LINEAR_IPOL_DEINT_FILTER)
3290 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
3291 else if(mode & V_X1_FILTER) copyAhead=11;
3292 else if(mode & V_RK1_FILTER) copyAhead=10;
3293 else if(mode & DERING) copyAhead=9;
3300 tempDst= (uint8_t*)memalign(8, 1024*24);
3301 tempSrc= (uint8_t*)memalign(8, 1024*24);
3302 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3303 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3306 if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
3308 // printf("%d %d %d\n", isColor, dstStride, height);
3309 //FIXME works only as long as the size doesn't increase
3310 //Note: the +17*1024 is just there so I don't have to worry about reads/writes past the end
3311 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
3312 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
3314 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
3315 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
3321 yHistogram= (uint64_t*)malloc(8*256);
3322 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3324 if(mode & FULL_Y_RANGE)
3335 static int framenum= -1;
3336 uint64_t maxClipped;
3341 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3343 for(i=0; i<256; i++)
3345 sum+= yHistogram[i];
3346 // printf("%d ", yHistogram[i]);
3350 /* we always get a completely black picture first */
3351 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3354 for(black=255; black>0; black--)
3356 if(clipped < maxClipped) break;
3357 clipped-= yHistogram[black];
3361 for(white=0; white<256; white++)
3363 if(clipped < maxClipped) break;
3364 clipped-= yHistogram[white];
3367 packedYOffset= (black - minAllowedY) & 0xFFFF;
3368 packedYOffset|= packedYOffset<<32;
3369 packedYOffset|= packedYOffset<<16;
3371 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3373 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3374 packedYScale|= packedYScale<<32;
3375 packedYScale|= packedYScale<<16;
3379 packedYScale= 0x0100010001000100LL;
3383 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
3384 else QPCorrecture= 256;
3386 /* copy & deinterlace first row of blocks */
3389 //1% speedup if these are here instead of the inner loop
3390 uint8_t *srcBlock= &(src[y*srcStride]);
3391 uint8_t *dstBlock= &(dst[y*dstStride]);
3393 dstBlock= tempDst + dstStride;
3395 // From this point on it is guaranteed that we can read and write 16 lines downward
3396 // finish 1 block before the next, otherwise we might have a problem
3397 // with the L1 cache of the P4 ... or only a few blocks at a time or something
3398 for(x=0; x<width; x+=BLOCK_SIZE)
3403 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3404 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3405 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3406 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3410 "movl %4, %%eax \n\t"
3411 "shrl $2, %%eax \n\t"
3412 "andl $6, %%eax \n\t"
3413 "addl %5, %%eax \n\t"
3414 "movl %%eax, %%ebx \n\t"
3415 "imul %1, %%eax \n\t"
3416 "imul %3, %%ebx \n\t"
3417 "prefetchnta 32(%%eax, %0) \n\t"
3418 "prefetcht0 32(%%ebx, %2) \n\t"
3419 "addl %1, %%eax \n\t"
3420 "addl %3, %%ebx \n\t"
3421 "prefetchnta 32(%%eax, %0) \n\t"
3422 "prefetcht0 32(%%ebx, %2) \n\t"
3423 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3424 "m" (x), "m" (copyAhead)
3428 #elif defined(HAVE_3DNOW)
3429 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3430 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3431 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3432 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3433 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3437 blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3438 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3440 if(mode & LINEAR_IPOL_DEINT_FILTER)
3441 deInterlaceInterpolateLinear(dstBlock, dstStride);
3442 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3443 deInterlaceBlendLinear(dstBlock, dstStride);
3444 else if(mode & MEDIAN_DEINT_FILTER)
3445 deInterlaceMedian(dstBlock, dstStride);
3446 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3447 deInterlaceInterpolateCubic(dstBlock, dstStride);
3448 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3449 deInterlaceBlendCubic(dstBlock, dstStride);
3454 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
3457 for(y=0; y<height; y+=BLOCK_SIZE)
3459 //1% speedup if these are here instead of the inner loop
3460 uint8_t *srcBlock= &(src[y*srcStride]);
3461 uint8_t *dstBlock= &(dst[y*dstStride]);
3463 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3464 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3465 int QPFrac= QPDelta;
3466 uint8_t *tempBlock1= tempBlocks;
3467 uint8_t *tempBlock2= tempBlocks + 8;
3470 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards?
3471 if not, then use a temporary buffer */
3475 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3476 blockcopy to dst later */
3477 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3478 srcStride*MAX(height-y-copyAhead, 0) );
3480 /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3481 for(i=MAX(height-y, 8); i<copyAhead+8; i++)
3482 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
3484 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3485 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
3487 /* duplicate last line of dst to fill the void up to line (copyAhead) */
3488 for(i=height-y+1; i<=copyAhead; i++)
3489 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
3491 dstBlock= tempDst + dstStride;
3495 // From this point on it is guaranteed that we can read and write 16 lines downward
3496 // finish 1 block before the next, otherwise we might have a problem
3497 // with the L1 cache of the P4 ... or only a few blocks at a time or something
3498 for(x=0; x<width; x+=BLOCK_SIZE)
3500 const int stride= dstStride;
3506 "sbbl %%eax, %%eax \n\t"
3507 "shll $2, %%eax \n\t"
3508 "subl %%eax, %0 \n\t"
3509 : "+r" (QPptr), "+m" (QPFrac)
3515 QPs[(y>>3)*QPStride + (x>>3)]:
3516 QPs[(y>>4)*QPStride + (x>>4)];
3520 QP= (QP* QPCorrecture)>>8;
3521 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3525 "movd %0, %%mm7 \n\t"
3526 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3527 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3528 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3529 "movq %%mm7, pQPb \n\t"
3540 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3541 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3542 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3543 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3547 "movl %4, %%eax \n\t"
3548 "shrl $2, %%eax \n\t"
3549 "andl $6, %%eax \n\t"
3550 "addl %5, %%eax \n\t"
3551 "movl %%eax, %%ebx \n\t"
3552 "imul %1, %%eax \n\t"
3553 "imul %3, %%ebx \n\t"
3554 "prefetchnta 32(%%eax, %0) \n\t"
3555 "prefetcht0 32(%%ebx, %2) \n\t"
3556 "addl %1, %%eax \n\t"
3557 "addl %3, %%ebx \n\t"
3558 "prefetchnta 32(%%eax, %0) \n\t"
3559 "prefetcht0 32(%%ebx, %2) \n\t"
3560 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3561 "m" (x), "m" (copyAhead)
3565 #elif defined(HAVE_3DNOW)
3566 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
3567 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3568 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3569 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3570 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3574 #ifdef PP_FUNNY_STRIDE
3575 //can we mess with a 8x16 block, if not use a temp buffer, yes again
3579 dstBlockPtrBackup= dstBlock;
3580 srcBlockPtrBackup= srcBlock;
3582 for(i=0;i<BLOCK_SIZE*2; i++)
3584 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3585 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3588 dstBlock= tempDstBlock;
3589 srcBlock= tempSrcBlock;
3593 blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3594 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3596 if(mode & LINEAR_IPOL_DEINT_FILTER)
3597 deInterlaceInterpolateLinear(dstBlock, dstStride);
3598 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3599 deInterlaceBlendLinear(dstBlock, dstStride);
3600 else if(mode & MEDIAN_DEINT_FILTER)
3601 deInterlaceMedian(dstBlock, dstStride);
3602 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3603 deInterlaceInterpolateCubic(dstBlock, dstStride);
3604 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3605 deInterlaceBlendCubic(dstBlock, dstStride);
3608 /* only deblock if we have 2 blocks */
3616 if(mode & V_RK1_FILTER)
3617 vertRK1Filter(dstBlock, stride, QP);
3618 else if(mode & V_X1_FILTER)
3619 vertX1Filter(dstBlock, stride, QP);
3620 else if(mode & V_DEBLOCK)
3622 if( isVertDC(dstBlock, stride))
3624 if(isVertMinMaxOk(dstBlock, stride, QP))
3625 doVertLowPass(dstBlock, stride, QP);
3628 doVertDefFilter(dstBlock, stride, QP);
3638 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3640 /* check if we have a previous block to deblock it with dstBlock */
3647 if(mode & H_RK1_FILTER)
3648 vertRK1Filter(tempBlock1, 16, QP);
3649 else if(mode & H_X1_FILTER)
3650 vertX1Filter(tempBlock1, 16, QP);
3651 else if(mode & H_DEBLOCK)
3653 if( isVertDC(tempBlock1, 16) )
3655 if(isVertMinMaxOk(tempBlock1, 16, QP))
3656 doVertLowPass(tempBlock1, 16, QP);
3659 doVertDefFilter(tempBlock1, 16, QP);
3662 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3665 if(mode & H_X1_FILTER)
3666 horizX1Filter(dstBlock-4, stride, QP);
3667 else if(mode & H_DEBLOCK)
3669 if( isHorizDC(dstBlock-4, stride))
3671 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3672 doHorizLowPass(dstBlock-4, stride, QP);
3675 doHorizDefFilter(dstBlock-4, stride, QP);
3685 //FIXME filter first line
3686 if(y>0) dering(dstBlock - stride - 8, stride, QP);
3689 if(mode & TEMP_NOISE_FILTER)
3691 tempNoiseReducer(dstBlock-8, stride,
3692 tempBlured[isColor] + y*dstStride + x,
3693 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3694 ppMode->maxTmpNoise);
3698 #ifdef PP_FUNNY_STRIDE
3699 /* did we use a tmp-block buffer */
3703 dstBlock= dstBlockPtrBackup;
3704 srcBlock= srcBlockPtrBackup;
3706 for(i=0;i<BLOCK_SIZE*2; i++)
3708 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3717 tmpXchg= tempBlock1;
3718 tempBlock1= tempBlock2;
3719 tempBlock2 = tmpXchg;
3725 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP);
3728 if((mode & TEMP_NOISE_FILTER))
3730 tempNoiseReducer(dstBlock-8, dstStride,
3731 tempBlured[isColor] + y*dstStride + x,
3732 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3733 ppMode->maxTmpNoise);
3736 /* did we use a tmp buffer for the last lines*/
3739 uint8_t *dstBlock= &(dst[y*dstStride]);
3740 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3743 for(x=0; x<width; x+=32)
3746 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3747 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3748 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3749 // + dstBlock[x +13*dstStride]
3750 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3754 asm volatile("femms");
3755 #elif defined (HAVE_MMX)
3756 asm volatile("emms");
3760 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3761 sumTime= rdtsc() - sumTime;
3763 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
3764 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3765 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3768 #ifdef DEBUG_BRIGHTNESS
3773 for(i=0; i<256; i++)
3774 if(yHistogram[i] > max) max=yHistogram[i];
3776 for(i=1; i<256; i++)
3779 int start=yHistogram[i-1]/(max/256+1);
3780 int end=yHistogram[i]/(max/256+1);
3781 int inc= end > start ? 1 : -1;
3782 for(x=start; x!=end+inc; x+=inc)
3783 dst[ i*dstStride + x]+=128;
3786 for(i=0; i<100; i+=2)
3788 dst[ (white)*dstStride + i]+=128;
3789 dst[ (black)*dstStride + i]+=128;