2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec Ec
28 doHorizDefFilter E ac ac
30 Vertical RKAlgo1 E a a
33 LinIpolDeinterlace e E E*
34 CubicIpolDeinterlace a e e*
35 LinBlendDeinterlace e E E*
36 MedianDeinterlace Ec Ec
39 * I don't have a 3DNow! CPU -> it's untested
40 E = Exact implementation
41 e = almost exact implementation (slightly different rounding,...)
42 a = alternative / approximate impl
43 c = checked against the other implementations (-vo md5)
48 verify that everything works as it should (how?)
49 reduce the time wasted on the mem transfer
51 implement everything in C at least (done at the moment but ...)
52 unroll stuff if instructions depend too much on the prior one
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54 move YScale thing to the end instead of fixing QP
55 write a faster and higher quality deblocking filter :)
56 do something about the speed of the horizontal filters
57 make the mainloop more flexible (variable number of blocks at once;
58 the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters
61 fix warnings (unused vars, ...)
62 noise reduction filters
63 write an exact implementation of the horizontal deblocking filter
70 //Changelog: use the CVS log
76 #include "../config.h"
80 #include "postprocess.h"
// Small arithmetic helpers. They are function-like macros, so an argument may be
// evaluated more than once: pass only expressions without side effects.
82 #define MIN(a,b) ((a) > (b) ? (b) : (a))
83 #define MAX(a,b) ((a) < (b) ? (b) : (a))
84 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// SIGN never yields 0: zero, like every non-positive value, maps to -1.
85 #define SIGN(a) ((a) > 0 ? 1 : -1)
// PAVGB(a,b) expands to a piece of assembler text (the packed byte-average
// instruction followed by "\n\t") that is pasted between the inline asm strings
// of the filters. The two variants differ only in the mnemonic; the condition
// guarding the first variant is not visible in this listing.
88 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
89 #elif defined (HAVE_3DNOW)
90 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// Size constants; their users are not visible in this part of the file
// (presumably buffers used while parsing the filter mode string -- TODO confirm).
93 #define GET_MODE_BUFFER_SIZE 500
94 #define OPTIONS_ARRAY_SIZE 10
// 64-bit constants and scratch slots for the MMX code; the inline asm in this
// file refers to them directly by symbol name (e.g. "movq b7E, %%mm7").
// Naming, as can be read off the values:
//   wNN(NN)    - one 16-bit value repeated in all four words
//   bNN        - the byte 0xNN repeated in all eight bytes
//   bmDDDDDDDD - byte mask, one digit per byte with the most significant byte
//                first: 1 -> 0xFF, 0 -> 0x00
// packedYOffset / packedYScale hold four 16-bit lanes each; their users are not
// visible in this listing (presumably the luminance level correction -- TODO confirm).
97 static uint64_t packedYOffset= 0x0000000000000000LL;
98 static uint64_t packedYScale= 0x0100010001000100LL;
99 static uint64_t w05= 0x0005000500050005LL;
100 static uint64_t w20= 0x0020002000200020LL;
101 static uint64_t w1400= 0x1400140014001400LL;
102 static uint64_t bm00000001= 0x00000000000000FFLL;
103 static uint64_t bm00010000= 0x000000FF00000000LL;
104 static uint64_t bm00001000= 0x00000000FF000000LL;
105 static uint64_t bm10000000= 0xFF00000000000000LL;
106 static uint64_t bm10000001= 0xFF000000000000FFLL;
107 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
108 static uint64_t bm00000011= 0x000000000000FFFFLL;
109 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
110 static uint64_t bm11000000= 0xFFFF000000000000LL;
111 static uint64_t bm00011000= 0x000000FFFF000000LL;
112 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
113 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
114 static uint64_t b00= 0x0000000000000000LL;
115 static uint64_t b01= 0x0101010101010101LL;
116 static uint64_t b02= 0x0202020202020202LL;
117 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
118 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
119 static uint64_t b20= 0x2020202020202020LL;
120 static uint64_t b80= 0x8080808080808080LL;
121 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
122 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
123 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
// Scratch slots the asm code spills intermediate values to
// (doVertDefFilter stores to and reloads from temp0..temp3).
124 static uint64_t temp0=0;
125 static uint64_t temp1=0;
126 static uint64_t temp2=0;
127 static uint64_t temp3=0;
128 static uint64_t temp4=0;
129 static uint64_t temp5=0;
// Loaded by the asm code as the quantizer replicated into every byte
// ("QP,..., QP"); it is written somewhere outside this listing.
130 static uint64_t pQPb=0;
131 static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data
// Flatness thresholds. isVertDC() reports a block as flat when its count of
// near-equal vertical neighbour pairs exceeds vFlatnessThreshold.
// hFlatnessThreshold is presumably the horizontal counterpart; its user is not
// visible in this listing.
133 int hFlatnessThreshold= 56 - 16;
134 int vFlatnessThreshold= 56 - 16;
136 //amount of "black" you are willing to lose to get a brightness-corrected picture
137 double maxClippedThreshold= 0.01;
// Table of the selectable filters. Each row holds a short name, a long name,
// three integers and a filter constant; the row with NULL names ends the table.
// The meaning of the three integers is fixed by struct PPFilter, which is
// declared outside this listing (the constants presumably are mode bits from
// postprocess.h -- TODO confirm).
// NOTE(review): "vr"/"rkvdeblock" names a vertical filter but is mapped to
// H_RK1_FILTER, whereas "vb" and "v1" use V_* constants -- confirm this is intended.
142 static struct PPFilter filters[]=
144 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
145 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
146 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
147 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
148 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
149 {"dr", "dering", 1, 5, 6, DERING},
150 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
151 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
152 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
153 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
154 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
155 {NULL, NULL,0,0,0,0} //End Marker
// Alias table stored as consecutive string pairs: a name followed by the filter
// list it stands for. "de" repeats the expansion of "default" and "fa" that of
// "fast". How the table is terminated and consumed is not visible in this listing.
158 static char *replaceTable[]=
160 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels",
161 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels",
162 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
163 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
// Wraps the x86 "rdtsc" instruction. The operand bindings and the return
// statement are not visible in this listing.
168 static inline long long rdtsc()
171 asm volatile( "rdtsc\n\t"
174 // printf("%d\n", int(l/1000));
// Issues a "prefetchnta" on the address held in asm operand %0. The operand
// constraint line is not visible in this listing (presumably it binds p).
180 static inline void prefetchnta(void *p)
182 asm volatile( "prefetchnta (%0)\n\t"
// Issues a "prefetcht0" on the address held in asm operand %0. The operand
// constraint line is not visible in this listing (presumably it binds p).
187 static inline void prefetcht0(void *p)
189 asm volatile( "prefetcht0 (%0)\n\t"
// Issues a "prefetcht1" on the address held in asm operand %0. The operand
// constraint line is not visible in this listing (presumably it binds p).
194 static inline void prefetcht1(void *p)
196 asm volatile( "prefetcht1 (%0)\n\t"
// Issues a "prefetcht2" on the address held in asm operand %0. The operand
// constraint line is not visible in this listing (presumably it binds p).
201 static inline void prefetcht2(void *p)
203 asm volatile( "prefetcht2 (%0)\n\t"
209 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
211 * Check if the middle 8x8 Block in the given 8x16 block is flat
// Flatness test on the 8 columns starting 4 lines below src: counts vertically
// adjacent pixel pairs that differ by at most 1 and returns 1 when the count
// exceeds vFlatnessThreshold, otherwise 0.
// An MMX path and a plain C loop are both present; the preprocessor lines that
// choose between them are not visible in this listing.
213 static inline int isVertDC(uint8_t src[], int stride){
216 src+= stride*4; // src points to begin of the 8x8 Block
219 "leal (%1, %2), %%eax \n\t"
220 "leal (%%eax, %2, 4), %%ebx \n\t"
221 // 0 1 2 3 4 5 6 7 8 9
222 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
// For each of the 7 pairs of consecutive lines: (upper - lower + 0x7E) > 0x7C,
// compared as signed bytes, is true exactly when the wrapped byte difference is
// -1, 0 or 1. pcmpgtb then yields 0xFF (= -1), so every byte of mm0 ends up
// holding minus the number of matches in its column.
223 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
224 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
225 "movq (%1), %%mm0 \n\t"
226 "movq (%%eax), %%mm1 \n\t"
227 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
228 "paddb %%mm7, %%mm0 \n\t"
229 "pcmpgtb %%mm6, %%mm0 \n\t"
231 "movq (%%eax,%2), %%mm2 \n\t"
232 "psubb %%mm2, %%mm1 \n\t"
233 "paddb %%mm7, %%mm1 \n\t"
234 "pcmpgtb %%mm6, %%mm1 \n\t"
235 "paddb %%mm1, %%mm0 \n\t"
237 "movq (%%eax, %2, 2), %%mm1 \n\t"
238 "psubb %%mm1, %%mm2 \n\t"
239 "paddb %%mm7, %%mm2 \n\t"
240 "pcmpgtb %%mm6, %%mm2 \n\t"
241 "paddb %%mm2, %%mm0 \n\t"
243 "movq (%1, %2, 4), %%mm2 \n\t"
244 "psubb %%mm2, %%mm1 \n\t"
245 "paddb %%mm7, %%mm1 \n\t"
246 "pcmpgtb %%mm6, %%mm1 \n\t"
247 "paddb %%mm1, %%mm0 \n\t"
249 "movq (%%ebx), %%mm1 \n\t"
250 "psubb %%mm1, %%mm2 \n\t"
251 "paddb %%mm7, %%mm2 \n\t"
252 "pcmpgtb %%mm6, %%mm2 \n\t"
253 "paddb %%mm2, %%mm0 \n\t"
255 "movq (%%ebx, %2), %%mm2 \n\t"
256 "psubb %%mm2, %%mm1 \n\t"
257 "paddb %%mm7, %%mm1 \n\t"
258 "pcmpgtb %%mm6, %%mm1 \n\t"
259 "paddb %%mm1, %%mm0 \n\t"
261 "movq (%%ebx, %2, 2), %%mm1 \n\t"
262 "psubb %%mm1, %%mm2 \n\t"
263 "paddb %%mm7, %%mm2 \n\t"
264 "pcmpgtb %%mm6, %%mm2 \n\t"
265 "paddb %%mm2, %%mm0 \n\t"
// Horizontal add: fold the eight per-column byte counters into the low byte.
268 "movq %%mm0, %%mm1 \n\t"
269 "psrlw $8, %%mm0 \n\t"
270 "paddb %%mm1, %%mm0 \n\t"
271 "movq %%mm0, %%mm1 \n\t"
272 "psrlq $16, %%mm0 \n\t"
273 "paddb %%mm1, %%mm0 \n\t"
274 "movq %%mm0, %%mm1 \n\t"
275 "psrlq $32, %%mm0 \n\t"
276 "paddb %%mm1, %%mm0 \n\t"
277 "movd %%mm0, %0 \n\t"
279 : "r" (src), "r" (stride)
// The low byte holds a negative count; negate it modulo 256 to get the number of matches.
282 numEq= (256 - numEq) &0xFF;
// C version: ((d + 1) & 0xFFFF) < 3 is true exactly for d in {-1, 0, 1}.
285 for(y=0; y<BLOCK_SIZE-1; y++)
287 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
288 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
289 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
290 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
291 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
292 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
293 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
294 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
298 /* if(abs(numEq - asmEq) > 0)
300 printf("\nasm:%d c:%d\n", asmEq, numEq);
301 for(int y=0; y<8; y++)
303 for(int x=0; x<8; x++)
305 printf("%d ", temp[x + y*stride]);
311 // for(int i=0; i<numEq/8; i++) src[i]=255;
312 return (numEq > vFlatnessThreshold) ? 1 : 0;
// Checks that in each of the 8 columns the pixels 1 line and 8 lines below src
// differ by at most 2*QP.
// The asm path takes the quantizer from pQPb (not from the QP argument) and
// leaves a non-zero value in its output operand only when all 8 columns pass;
// the C loop clears isOk2 as soon as one column fails.
// The return statement is not visible in this listing.
315 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
322 "movq (%1, %2), %%mm0 \n\t"
323 "movq (%1, %2, 8), %%mm1 \n\t"
324 "movq %%mm0, %%mm2 \n\t"
325 "psubusb %%mm1, %%mm0 \n\t"
326 "psubusb %%mm2, %%mm1 \n\t"
327 "por %%mm1, %%mm0 \n\t" // ABS Diff
329 "movq pQPb, %%mm7 \n\t" // QP,..., QP
330 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
331 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// Each 32-bit half becomes all ones if its four bytes are zero; after the
// 16-bit shift the low half is all ones only when both halves were.
332 "pcmpeqd b00, %%mm0 \n\t"
333 "psrlq $16, %%mm0 \n\t"
334 "pcmpeqd bFF, %%mm0 \n\t"
335 // "movd %%mm0, (%1, %2, 4)\n\t"
336 "movd %%mm0, %0 \n\t"
338 : "r" (src), "r" (stride)
346 for(x=0; x<BLOCK_SIZE; x++)
348 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
350 /* if(isOk && !isOk2 || !isOk && isOk2)
352 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
353 for(int y=0; y<9; y++)
355 for(int x=0; x<8; x++)
357 printf("%d ", src[x + y*stride]);
369 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
370 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
// Vertical low-pass over 8 columns: rewrites lines 1..8 below src with a
// (1,1,2,2,4,2,2,1,1)/16 weighted sum of their neighbours. Lines 0 and 9 are only
// read, and only serve as outer taps where they are close enough to line 1 or 8.
// The asm path is used when HAVE_MMX2 or HAVE_3DNOW is defined and takes the
// quantizer from pQPb; the C code further down uses the QP argument.
// NOTE(review): the asm path keeps an outer line when the difference is <= QP,
// the C path when it is < QP -- confirm whether the mismatch is intended.
372 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
374 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
376 asm volatile( //"movv %0 %1 %2\n\t"
378 "movq pQPb, %%mm0 \n\t" // QP,..., QP
// mm6 = per byte: line 0 where |line0 - line1| <= QP, otherwise line 1
380 "movq (%0), %%mm6 \n\t"
381 "movq (%0, %1), %%mm5 \n\t"
382 "movq %%mm5, %%mm1 \n\t"
383 "movq %%mm6, %%mm2 \n\t"
384 "psubusb %%mm6, %%mm5 \n\t"
385 "psubusb %%mm1, %%mm2 \n\t"
386 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
387 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
388 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
390 "pand %%mm2, %%mm6 \n\t"
391 "pandn %%mm1, %%mm2 \n\t"
392 "por %%mm2, %%mm6 \n\t"// First Line to Filter
// mm7 = per byte: line 9 where |line8 - line9| <= QP, otherwise line 8.
// eax and ebx are set to lines 4 and 7 before %0 is advanced to line 1.
// NOTE(review): the addl below writes to operand %0; if src is bound as an
// input-only operand GCC does not allow modifying it -- confirm the constraints.
394 "movq (%0, %1, 8), %%mm5 \n\t"
395 "leal (%0, %1, 4), %%eax \n\t"
396 "leal (%0, %1, 8), %%ebx \n\t"
397 "subl %1, %%ebx \n\t"
398 "addl %1, %0 \n\t" // %0 points to line 1 not 0
399 "movq (%0, %1, 8), %%mm7 \n\t"
400 "movq %%mm5, %%mm1 \n\t"
401 "movq %%mm7, %%mm2 \n\t"
402 "psubusb %%mm7, %%mm5 \n\t"
403 "psubusb %%mm1, %%mm2 \n\t"
404 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
405 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
406 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
408 "pand %%mm2, %%mm7 \n\t"
409 "pandn %%mm1, %%mm2 \n\t"
410 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
414 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
// The digit strings in the comments below are the tap weights accumulated so
// far; every output line is built from nested PAVGB averages.
419 "movq (%0, %1), %%mm0 \n\t" // 1
420 "movq %%mm0, %%mm1 \n\t" // 1
421 PAVGB(%%mm6, %%mm0) //1 1 /2
422 PAVGB(%%mm6, %%mm0) //3 1 /4
424 "movq (%0, %1, 4), %%mm2 \n\t" // 1
425 "movq %%mm2, %%mm5 \n\t" // 1
426 PAVGB((%%eax), %%mm2) // 11 /2
427 PAVGB((%0, %1, 2), %%mm2) // 211 /4
428 "movq %%mm2, %%mm3 \n\t" // 211 /4
429 "movq (%0), %%mm4 \n\t" // 1
430 PAVGB(%%mm4, %%mm3) // 4 211 /8
431 PAVGB(%%mm0, %%mm3) //642211 /16
432 "movq %%mm3, (%0) \n\t" // X
433 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
434 "movq %%mm1, %%mm0 \n\t" // 1
435 PAVGB(%%mm6, %%mm0) //1 1 /2
436 "movq %%mm4, %%mm3 \n\t" // 1
437 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
438 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
439 PAVGB((%%eax), %%mm5) // 211 /4
440 PAVGB(%%mm5, %%mm3) // 2 2211 /8
441 PAVGB(%%mm0, %%mm3) //4242211 /16
442 "movq %%mm3, (%0,%1) \n\t" // X
443 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
444 PAVGB(%%mm4, %%mm6) //11 /2
445 "movq (%%ebx), %%mm0 \n\t" // 1
446 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
447 "movq %%mm0, %%mm3 \n\t" // 11/2
448 PAVGB(%%mm1, %%mm0) // 2 11/4
449 PAVGB(%%mm6, %%mm0) //222 11/8
450 PAVGB(%%mm2, %%mm0) //22242211/16
451 "movq (%0, %1, 2), %%mm2 \n\t" // 1
452 "movq %%mm0, (%0, %1, 2) \n\t" // X
453 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
454 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
455 PAVGB((%%ebx), %%mm0) // 11 /2
456 PAVGB(%%mm0, %%mm6) //11 11 /4
457 PAVGB(%%mm1, %%mm4) // 11 /2
458 PAVGB(%%mm2, %%mm1) // 11 /2
459 PAVGB(%%mm1, %%mm6) //1122 11 /8
460 PAVGB(%%mm5, %%mm6) //112242211 /16
461 "movq (%%eax), %%mm5 \n\t" // 1
462 "movq %%mm6, (%%eax) \n\t" // X
463 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
464 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
465 PAVGB(%%mm7, %%mm6) // 11 /2
466 PAVGB(%%mm4, %%mm6) // 11 11 /4
467 PAVGB(%%mm3, %%mm6) // 11 2211 /8
468 PAVGB(%%mm5, %%mm2) // 11 /2
469 "movq (%0, %1, 4), %%mm4 \n\t" // 1
470 PAVGB(%%mm4, %%mm2) // 112 /4
471 PAVGB(%%mm2, %%mm6) // 112242211 /16
472 "movq %%mm6, (%0, %1, 4) \n\t" // X
473 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
474 PAVGB(%%mm7, %%mm1) // 11 2 /4
475 PAVGB(%%mm4, %%mm5) // 11 /2
476 PAVGB(%%mm5, %%mm0) // 11 11 /4
477 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
478 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
479 PAVGB(%%mm0, %%mm1) // 11224222 /16
480 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
481 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
482 PAVGB((%%ebx), %%mm2) // 112 4 /8
483 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
484 PAVGB(%%mm0, %%mm6) // 1 1 /2
485 PAVGB(%%mm7, %%mm6) // 1 12 /4
486 PAVGB(%%mm2, %%mm6) // 1122424 /16
487 "movq %%mm6, (%%ebx) \n\t" // X
488 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
489 PAVGB(%%mm7, %%mm5) // 11 2 /4
490 PAVGB(%%mm7, %%mm5) // 11 6 /8
492 PAVGB(%%mm3, %%mm0) // 112 /4
493 PAVGB(%%mm0, %%mm5) // 112246 /16
494 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
498 : "r" (src), "r" (stride)
// C version. l1..l9 are the byte offsets of lines 1..9 relative to src.
502 const int l1= stride;
503 const int l2= stride + l1;
504 const int l3= stride + l2;
505 const int l4= stride + l3;
506 const int l5= stride + l4;
507 const int l6= stride + l5;
508 const int l7= stride + l6;
509 const int l8= stride + l7;
510 const int l9= stride + l8;
513 for(x=0; x<BLOCK_SIZE; x++)
// first/last: the line just outside the filtered range if it is within QP of
// its neighbour, otherwise the neighbour itself is reused as the outer tap.
515 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
516 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[i] is the sum of two vertically adjacent taps (with first/last at the ends).
519 sums[0] = first + src[l1];
520 sums[1] = src[l1] + src[l2];
521 sums[2] = src[l2] + src[l3];
522 sums[3] = src[l3] + src[l4];
523 sums[4] = src[l4] + src[l5];
524 sums[5] = src[l5] + src[l6];
525 sums[6] = src[l6] + src[l7];
526 sums[7] = src[l7] + src[l8];
527 sums[8] = src[l8] + last;
// Note on precedence: + binds tighter than <<, so (a + b<<1) is (a + b)<<1.
// The +8 rounds before the division by 16 (>>4).
529 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
530 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
531 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
532 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
533 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
534 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
535 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
536 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
545 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
546 * values are correctly clipped (MMX2)
547 * values are wraparound (C)
548 conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
// Vertical RK "Algorithm 1" filter on 8 columns: where lines 4 and 5 differ by
// less than about 1.25*QP, part of that difference is moved into lines 3..6.
// The asm path (HAVE_MMX2 or HAVE_3DNOW) takes the quantizer from pQPb; the C
// path uses the QP argument. Several lines of both paths are not visible in
// this listing, so the comments below only cover what is shown.
555 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
557 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
561 "pxor %%mm7, %%mm7 \n\t" // 0
562 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
563 "leal (%0, %1), %%eax \n\t"
564 "leal (%%eax, %1, 4), %%ebx \n\t"
565 // 0 1 2 3 4 5 6 7 8 9
566 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// Threshold in mm0: QP + (QP + 2)/4 per byte. The word shift leaks bits across
// byte borders, which the mask with 0x3F removes again.
567 "movq pQPb, %%mm0 \n\t" // QP,..., QP
568 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
569 "paddusb b02, %%mm0 \n\t"
570 "psrlw $2, %%mm0 \n\t"
571 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
572 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
573 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
574 "movq (%%ebx), %%mm3 \n\t" // line 5
575 "movq %%mm2, %%mm4 \n\t" // line 4
576 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
577 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
579 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
// mm4 becomes a per-byte mask: 0xFF where |l4 - l5| does not exceed the threshold.
580 "psubusb %%mm3, %%mm4 \n\t"
581 "psubusb %%mm2, %%mm3 \n\t"
582 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
583 "psubusb %%mm0, %%mm4 \n\t"
584 "pcmpeqb %%mm7, %%mm4 \n\t"
585 "pand %%mm4, %%mm5 \n\t" // d/2
// Lines 4 and 5 are updated with wrapping byte add/subtract.
587 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
588 "paddb %%mm5, %%mm2 \n\t"
589 // "psubb %%mm6, %%mm2 \n\t"
590 "movq %%mm2, (%0,%1, 4) \n\t"
592 "movq (%%ebx), %%mm2 \n\t"
593 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
594 "psubb %%mm5, %%mm2 \n\t"
595 // "psubb %%mm6, %%mm2 \n\t"
596 "movq %%mm2, (%%ebx) \n\t"
598 "paddb %%mm6, %%mm5 \n\t"
599 "psrlw $2, %%mm5 \n\t"
600 "pand b3F, %%mm5 \n\t"
601 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
// Lines 3 and 6 are biased by 0x80, adjusted with signed saturating
// arithmetic, and then the bias is removed again.
603 "movq (%%eax, %1, 2), %%mm2 \n\t"
604 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
605 "paddsb %%mm5, %%mm2 \n\t"
606 "psubb %%mm6, %%mm2 \n\t"
607 "movq %%mm2, (%%eax, %1, 2) \n\t"
609 "movq (%%ebx, %1), %%mm2 \n\t"
610 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
611 "psubsb %%mm5, %%mm2 \n\t"
612 "psubb %%mm6, %%mm2 \n\t"
613 "movq %%mm2, (%%ebx, %1) \n\t"
616 : "r" (src), "r" (stride)
// C version. l1..l9 are the byte offsets of lines 1..9 relative to src.
620 const int l1= stride;
621 const int l2= stride + l1;
622 const int l3= stride + l2;
623 const int l4= stride + l3;
624 const int l5= stride + l4;
625 const int l6= stride + l5;
626 const int l7= stride + l6;
627 const int l8= stride + l7;
628 const int l9= stride + l8;
631 for(x=0; x<BLOCK_SIZE; x++)
633 if(ABS(src[l4]-src[l5]) < QP + QP/4)
635 int v = (src[l5] - src[l4]);
650 * Experimental Filter 1
651 * will not damage linear gradients
652 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
653 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
654 * MMX2 version does correct clipping, C version doesn't
// Vertical X1 filter on 8 columns. It measures how much the step between
// lines 4 and 5 exceeds the average of the neighbouring steps (lines 3/4 and
// 5/6) and, where that excess d does not exceed QP, moves lines 2..7 towards
// each other by amounts derived from d.
// The asm path (HAVE_MMX2 or HAVE_3DNOW) takes the quantizer from pQPb; the C
// path is only partly visible in this listing.
656 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
658 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
662 "pxor %%mm7, %%mm7 \n\t" // 0
663 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
664 "leal (%0, %1), %%eax \n\t"
665 "leal (%%eax, %1, 4), %%ebx \n\t"
666 // 0 1 2 3 4 5 6 7 8 9
667 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
668 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
669 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
670 "movq %%mm1, %%mm2 \n\t" // line 4
671 "psubusb %%mm0, %%mm1 \n\t"
672 "psubusb %%mm2, %%mm0 \n\t"
673 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
674 "movq (%%ebx), %%mm3 \n\t" // line 5
675 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
676 "movq %%mm3, %%mm5 \n\t" // line 5
677 "psubusb %%mm4, %%mm3 \n\t"
678 "psubusb %%mm5, %%mm4 \n\t"
679 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
680 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
681 "movq %%mm2, %%mm1 \n\t" // line 4
682 "psubusb %%mm5, %%mm2 \n\t"
683 "movq %%mm2, %%mm4 \n\t"
684 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
685 "psubusb %%mm1, %%mm5 \n\t"
686 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
687 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
688 "movq %%mm4, %%mm3 \n\t" // d
689 "psubusb pQPb, %%mm4 \n\t"
690 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
691 "psubusb b01, %%mm3 \n\t" // d - 1, saturating at 0
692 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
694 PAVGB(%%mm7, %%mm3) // d/2
695 "movq %%mm3, %%mm1 \n\t" // d/2
696 PAVGB(%%mm7, %%mm3) // d/4
697 PAVGB(%%mm1, %%mm3) // 3*d/8
// mm2 is the sign mask from above. XOR-ing a line with it before and after the
// saturating subtract/add flips the direction where l4 <= l5, so the two
// lines of each pair always move towards each other.
699 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
700 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
701 "psubusb %%mm3, %%mm0 \n\t"
702 "pxor %%mm2, %%mm0 \n\t"
703 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
705 "movq (%%ebx), %%mm0 \n\t" // line 5
706 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
707 "paddusb %%mm3, %%mm0 \n\t"
708 "pxor %%mm2, %%mm0 \n\t"
709 "movq %%mm0, (%%ebx) \n\t" // line 5
711 PAVGB(%%mm7, %%mm1) // d/4
713 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
715 "psubusb %%mm1, %%mm0 \n\t"
716 "pxor %%mm2, %%mm0 \n\t"
717 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
719 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
720 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
721 "paddusb %%mm1, %%mm0 \n\t"
722 "pxor %%mm2, %%mm0 \n\t"
723 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
725 PAVGB(%%mm7, %%mm1) // d/8
727 "movq (%%eax, %1), %%mm0 \n\t" // line 2
728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
729 "psubusb %%mm1, %%mm0 \n\t"
730 "pxor %%mm2, %%mm0 \n\t"
731 "movq %%mm0, (%%eax, %1) \n\t" // line 2
733 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
734 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
735 "paddusb %%mm1, %%mm0 \n\t"
736 "pxor %%mm2, %%mm0 \n\t"
737 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
740 : "r" (src), "r" (stride)
// C version. l1..l9 are the byte offsets of lines 1..9 relative to src.
745 const int l1= stride;
746 const int l2= stride + l1;
747 const int l3= stride + l2;
748 const int l4= stride + l3;
749 const int l5= stride + l4;
750 const int l6= stride + l5;
751 const int l7= stride + l6;
752 const int l8= stride + l7;
753 const int l9= stride + l8;
757 for(x=0; x<BLOCK_SIZE; x++)
// a, b, c: the steps between lines 3/4, 4/5 and 5/6.
759 int a= src[l3] - src[l4];
760 int b= src[l4] - src[l5];
761 int c= src[l5] - src[l6];
// d: how far the middle step exceeds the mean of the outer two, never negative.
763 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
// SIGN(0) is -1, so v = -d when b is zero or negative... and +d when b is positive is inverted:
// v = d * SIGN(-b) gives -d for b > 0 and +d... for b < 0; for b == 0 it gives -d.
767 int v = d * SIGN(-b);
780 const int l1= stride;
781 const int l2= stride + l1;
782 const int l3= stride + l2;
783 const int l4= stride + l3;
784 const int l5= stride + l4;
785 const int l6= stride + l5;
786 const int l7= stride + l6;
787 const int l8= stride + l7;
788 const int l9= stride + l8;
789 for(int x=0; x<BLOCK_SIZE; x++)
798 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
800 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
801 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
802 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
803 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
812 * Experimental Filter 1 (Horizontal)
813 * will not damage linear gradients
814 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
815 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
816 * MMX2 version does correct clipping, C version doesn't
817 * not identical with the vertical one
// Horizontal variant of the X1 filter. A 256-entry table of 8-byte correction
// patterns is built into the function-static pointer lut; the asm code looks up
// one entry per pixel row by a computed byte index and adds it to the row with
// signed saturation (paddsb). The guard around the table setup and its loop
// header are not visible in this listing.
819 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
822 static uint64_t *lut= NULL;
826 lut= (uint64_t*)memalign(8, 256*8);
// i is read as a signed byte and doubled: 0..127 -> 0..254, 128..255 -> -256..-2
829 int v= i < 128 ? 2*i : 2*(i-256);
// Two alternative sets of a..d follow; only one of them can be compiled at a
// time (the preprocessor lines selecting between them are not visible here).
831 //Simulate 112242211 9-Tap filter
832 uint64_t a= (v/16) & 0xFF;
833 uint64_t b= (v/8) & 0xFF;
834 uint64_t c= (v/4) & 0xFF;
835 uint64_t d= (3*v/8) & 0xFF;
837 //Simulate piecewise linear interpolation
838 uint64_t a= (v/16) & 0xFF;
839 uint64_t b= (v*3/16) & 0xFF;
840 uint64_t c= (v*5/16) & 0xFF;
841 uint64_t d= (7*v/16) & 0xFF;
// A..D are meant to be the byte-wise negations (two's complement) of a..d.
842 uint64_t A= (0x100 - a)&0xFF;
843 uint64_t B= (0x100 - b)&0xFF;
844 uint64_t C= (0x100 - c)&0xFF;
// NOTE(review): D is derived from c, while A, B and C negate a, b and c; this
// looks like a copy-paste slip for (0x100 - d) -- confirm before changing.
845 uint64_t D= (0x100 - c)&0xFF;
// Packed from the most significant byte down: a b c d D C B A
847 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
848 (D<<24) | (C<<16) | (B<<8) | (A);
849 //lut[i] = (v<<32) | (v<<24);
853 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
855 "pxor %%mm7, %%mm7 \n\t" // 0
856 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
857 "leal (%0, %1), %%eax \n\t"
858 "leal (%%eax, %1, 4), %%ebx \n\t"
860 "movq b80, %%mm6 \n\t"
861 "movd pQPb, %%mm5 \n\t" // QP
862 "movq %%mm5, %%mm4 \n\t"
863 "paddusb %%mm5, %%mm5 \n\t" // 2QP
864 "paddusb %%mm5, %%mm4 \n\t" // 3QP
865 "pxor %%mm5, %%mm5 \n\t" // 0
866 "psubb %%mm4, %%mm5 \n\t" // -3QP
867 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
868 "psllq $24, %%mm5 \n\t"
870 // 0 1 2 3 4 5 6 7 8 9
871 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
874 "movd " #a ", %%mm0 \n\t"\
875 "movd 4" #a ", %%mm1 \n\t"\
876 "punpckldq %%mm1, %%mm0 \n\t"\
877 "movq %%mm0, %%mm1 \n\t"\
878 "movq %%mm0, %%mm2 \n\t"\
879 "psrlq $8, %%mm1 \n\t"\
880 "psubusb %%mm1, %%mm2 \n\t"\
881 "psubusb %%mm0, %%mm1 \n\t"\
882 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
883 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
884 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
885 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
886 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
887 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
888 "paddb %%mm5, %%mm1 \n\t"\
889 "psubusb %%mm5, %%mm1 \n\t"\
891 "pxor %%mm2, %%mm1 \n\t"\
892 "psubb %%mm2, %%mm1 \n\t"\
893 "psrlq $24, %%mm1 \n\t"\
894 "movd %%mm1, %%ecx \n\t"\
895 "paddb %%mm6, %%mm0 \n\t"\
896 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
897 "paddb %%mm6, %%mm0 \n\t"\
898 "movq %%mm0, " #a " \n\t"\
904 HX1old((%%eax, %1, 2))
908 HX1old((%%ebx, %1, 2))
911 //FIXME add some comments, its unreadable ...
912 #define HX1b(a, c, b, d) \
913 "movd " #a ", %%mm0 \n\t"\
914 "movd 4" #a ", %%mm1 \n\t"\
915 "punpckldq %%mm1, %%mm0 \n\t"\
916 "movd " #b ", %%mm4 \n\t"\
917 "movq %%mm0, %%mm1 \n\t"\
918 "movq %%mm0, %%mm2 \n\t"\
919 "psrlq $8, %%mm1 \n\t"\
920 "movd 4" #b ", %%mm3 \n\t"\
921 "psubusb %%mm1, %%mm2 \n\t"\
922 "psubusb %%mm0, %%mm1 \n\t"\
923 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
924 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
925 "punpckldq %%mm3, %%mm4 \n\t"\
926 "movq %%mm1, %%mm3 \n\t"\
927 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
928 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
929 "paddb %%mm6, %%mm0 \n\t"\
930 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
931 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
932 "movq %%mm4, %%mm3 \n\t"\
933 "paddb %%mm5, %%mm1 \n\t"\
934 "psubusb %%mm5, %%mm1 \n\t"\
935 "psrlq $8, %%mm3 \n\t"\
937 "pxor %%mm2, %%mm1 \n\t"\
938 "psubb %%mm2, %%mm1 \n\t"\
939 "movq %%mm4, %%mm2 \n\t"\
940 "psrlq $24, %%mm1 \n\t"\
941 "psubusb %%mm3, %%mm2 \n\t"\
942 "movd %%mm1, %%ecx \n\t"\
943 "psubusb %%mm4, %%mm3 \n\t"\
944 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
945 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
946 "paddb %%mm6, %%mm0 \n\t"\
947 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
948 "movq %%mm3, %%mm1 \n\t"\
949 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
950 "movq %%mm0, " #a " \n\t"\
951 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
952 "paddb %%mm6, %%mm4 \n\t"\
953 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
954 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
955 "paddb %%mm5, %%mm3 \n\t"\
956 "psubusb %%mm5, %%mm3 \n\t"\
958 "pxor %%mm2, %%mm3 \n\t"\
959 "psubb %%mm2, %%mm3 \n\t"\
960 "psrlq $24, %%mm3 \n\t"\
961 "movd " #c ", %%mm0 \n\t"\
962 "movd 4" #c ", %%mm1 \n\t"\
963 "punpckldq %%mm1, %%mm0 \n\t"\
964 "paddb %%mm6, %%mm0 \n\t"\
965 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
966 "paddb %%mm6, %%mm0 \n\t"\
967 "movq %%mm0, " #c " \n\t"\
968 "movd %%mm3, %%ecx \n\t"\
969 "movd " #d ", %%mm0 \n\t"\
970 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
971 "movd 4" #d ", %%mm1 \n\t"\
972 "paddb %%mm6, %%mm4 \n\t"\
973 "punpckldq %%mm1, %%mm0 \n\t"\
974 "movq %%mm4, " #b " \n\t"\
975 "paddb %%mm6, %%mm0 \n\t"\
976 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
977 "paddb %%mm6, %%mm0 \n\t"\
978 "movq %%mm0, " #d " \n\t"\
980 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
981 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
985 : "r" (src), "r" (stride), "r" (lut)
986 : "%eax", "%ebx", "%ecx"
990 //FIXME (has little in common with the mmx2 version)
991 for(y=0; y<BLOCK_SIZE; y++)
993 int a= src[1] - src[2];
994 int b= src[3] - src[4];
995 int c= src[5] - src[6];
997 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1001 int v = d * SIGN(-b);
1017 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1021 //FIXME try pmul for *5 stuff
1024 "pxor %%mm7, %%mm7 \n\t"
1025 "leal (%0, %1), %%eax \n\t"
1026 "leal (%%eax, %1, 4), %%ebx \n\t"
1028 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1029 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1031 "movq (%0), %%mm0 \n\t"
1032 "movq %%mm0, %%mm1 \n\t"
1033 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1034 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1036 "movq (%%eax), %%mm2 \n\t"
1037 "movq %%mm2, %%mm3 \n\t"
1038 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1039 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1041 "movq (%%eax, %1), %%mm4 \n\t"
1042 "movq %%mm4, %%mm5 \n\t"
1043 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1044 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1046 "paddw %%mm0, %%mm0 \n\t" // 2L0
1047 "paddw %%mm1, %%mm1 \n\t" // 2H0
1048 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1049 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1050 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1051 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1053 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1054 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1055 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1056 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1058 "movq (%%eax, %1, 2), %%mm2 \n\t"
1059 "movq %%mm2, %%mm3 \n\t"
1060 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1061 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1063 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1064 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1065 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1066 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1067 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1068 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1070 "movq (%0, %1, 4), %%mm0 \n\t"
1071 "movq %%mm0, %%mm1 \n\t"
1072 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1073 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1075 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1076 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1077 "movq %%mm2, temp2 \n\t" // L3 - L4
1078 "movq %%mm3, temp3 \n\t" // H3 - H4
1079 "paddw %%mm4, %%mm4 \n\t" // 2L2
1080 "paddw %%mm5, %%mm5 \n\t" // 2H2
1081 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1082 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1084 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1085 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1086 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1087 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1089 "movq (%%ebx), %%mm2 \n\t"
1090 "movq %%mm2, %%mm3 \n\t"
1091 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1092 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1093 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1094 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1095 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1096 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1098 "movq (%%ebx, %1), %%mm6 \n\t"
1099 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1100 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1101 "movq (%%ebx, %1), %%mm6 \n\t"
1102 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1103 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1105 "paddw %%mm0, %%mm0 \n\t" // 2L4
1106 "paddw %%mm1, %%mm1 \n\t" // 2H4
1107 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1108 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1110 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1111 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1112 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1113 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1115 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1116 "movq %%mm2, %%mm3 \n\t"
1117 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1118 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1120 "paddw %%mm2, %%mm2 \n\t" // 2L7
1121 "paddw %%mm3, %%mm3 \n\t" // 2H7
1122 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1123 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1125 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1126 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1127 //FIXME pxor, psubw, pmax for abs
1128 "movq %%mm7, %%mm6 \n\t" // 0
1129 "pcmpgtw %%mm0, %%mm6 \n\t"
1130 "pxor %%mm6, %%mm0 \n\t"
1131 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1132 "movq %%mm7, %%mm6 \n\t" // 0
1133 "pcmpgtw %%mm1, %%mm6 \n\t"
1134 "pxor %%mm6, %%mm1 \n\t"
1135 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1137 "movq %%mm7, %%mm6 \n\t" // 0
1138 "pcmpgtw %%mm2, %%mm6 \n\t"
1139 "pxor %%mm6, %%mm2 \n\t"
1140 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1141 "movq %%mm7, %%mm6 \n\t" // 0
1142 "pcmpgtw %%mm3, %%mm6 \n\t"
1143 "pxor %%mm6, %%mm3 \n\t"
1144 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1147 "pminsw %%mm2, %%mm0 \n\t"
1148 "pminsw %%mm3, %%mm1 \n\t"
1150 "movq %%mm0, %%mm6 \n\t"
1151 "psubusw %%mm2, %%mm6 \n\t"
1152 "psubw %%mm6, %%mm0 \n\t"
1153 "movq %%mm1, %%mm6 \n\t"
1154 "psubusw %%mm3, %%mm6 \n\t"
1155 "psubw %%mm6, %%mm1 \n\t"
1158 "movq %%mm7, %%mm6 \n\t" // 0
1159 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1160 "pxor %%mm6, %%mm4 \n\t"
1161 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1162 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1163 "pxor %%mm7, %%mm5 \n\t"
1164 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1166 "movd %2, %%mm2 \n\t" // QP
1167 "punpcklwd %%mm2, %%mm2 \n\t"
1168 "punpcklwd %%mm2, %%mm2 \n\t"
1169 "psllw $3, %%mm2 \n\t" // 8QP
1170 "movq %%mm2, %%mm3 \n\t" // 8QP
1171 "pcmpgtw %%mm4, %%mm2 \n\t"
1172 "pcmpgtw %%mm5, %%mm3 \n\t"
1173 "pand %%mm2, %%mm4 \n\t"
1174 "pand %%mm3, %%mm5 \n\t"
1177 "psubusw %%mm0, %%mm4 \n\t" // hd
1178 "psubusw %%mm1, %%mm5 \n\t" // ld
1181 "movq w05, %%mm2 \n\t" // 5
1182 "pmullw %%mm2, %%mm4 \n\t"
1183 "pmullw %%mm2, %%mm5 \n\t"
1184 "movq w20, %%mm2 \n\t" // 32
1185 "paddw %%mm2, %%mm4 \n\t"
1186 "paddw %%mm2, %%mm5 \n\t"
1187 "psrlw $6, %%mm4 \n\t"
1188 "psrlw $6, %%mm5 \n\t"
1191 "movq w06, %%mm2 \n\t" // 6
1192 "paddw %%mm2, %%mm4 \n\t"
1193 "paddw %%mm2, %%mm5 \n\t"
1194 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1195 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1196 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1197 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1200 "movq temp2, %%mm0 \n\t" // L3 - L4
1201 "movq temp3, %%mm1 \n\t" // H3 - H4
1203 "pxor %%mm2, %%mm2 \n\t"
1204 "pxor %%mm3, %%mm3 \n\t"
1206 // FIXME rounding error
1207 "psraw $1, %%mm0 \n\t" // (L3 - L4)/2
1208 "psraw $1, %%mm1 \n\t" // (H3 - H4)/2
1209 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1210 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1211 "pxor %%mm2, %%mm0 \n\t"
1212 "pxor %%mm3, %%mm1 \n\t"
1213 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1214 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1215 // "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1216 // "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1218 "pxor %%mm6, %%mm2 \n\t"
1219 "pxor %%mm7, %%mm3 \n\t"
1220 "pand %%mm2, %%mm4 \n\t"
1221 "pand %%mm3, %%mm5 \n\t"
1224 "pminsw %%mm0, %%mm4 \n\t"
1225 "pminsw %%mm1, %%mm5 \n\t"
1227 "movq %%mm4, %%mm2 \n\t"
1228 "psubusw %%mm0, %%mm2 \n\t"
1229 "psubw %%mm2, %%mm4 \n\t"
1230 "movq %%mm5, %%mm2 \n\t"
1231 "psubusw %%mm1, %%mm2 \n\t"
1232 "psubw %%mm2, %%mm5 \n\t"
1234 "pxor %%mm6, %%mm4 \n\t"
1235 "pxor %%mm7, %%mm5 \n\t"
1236 "psubw %%mm6, %%mm4 \n\t"
1237 "psubw %%mm7, %%mm5 \n\t"
1238 "packsswb %%mm5, %%mm4 \n\t"
1239 "movq (%%eax, %1, 2), %%mm0 \n\t"
1240 "paddb %%mm4, %%mm0 \n\t"
1241 "movq %%mm0, (%%eax, %1, 2) \n\t"
1242 "movq (%0, %1, 4), %%mm0 \n\t"
1243 "psubb %%mm4, %%mm0 \n\t"
1244 "movq %%mm0, (%0, %1, 4) \n\t"
1247 : "r" (src), "r" (stride), "r" (QP)
1251 const int l1= stride;
1252 const int l2= stride + l1;
1253 const int l3= stride + l2;
1254 const int l4= stride + l3;
1255 const int l5= stride + l4;
1256 const int l6= stride + l5;
1257 const int l7= stride + l6;
1258 const int l8= stride + l7;
1259 // const int l9= stride + l8;
1262 for(x=0; x<BLOCK_SIZE; x++)
1264 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1265 if(ABS(middleEnergy) < 8*QP)
1267 const int q=(src[l4] - src[l5])/2;
1268 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1269 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1271 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1275 d*= SIGN(-middleEnergy);
1296 //FIXME? |255-0| = 1
1298 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
1300 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
1307 "leal (%1, %2), %%ecx \n\t"
1308 "leal (%%ecx, %2, 4), %%ebx \n\t"
1309 // 0 1 2 3 4 5 6 7 8 9
1310 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
1311 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1312 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
1313 "pxor %%mm0, %%mm0 \n\t"
1314 "movl %1, %%eax \n\t"
1315 "andl $0x1F, %%eax \n\t"
1316 "cmpl $24, %%eax \n\t"
1317 "leal tempBlock, %%eax \n\t"
/* Flatness test and copy for one 8-pixel row.
   The row is assembled from two 32-bit loads (src and src+4), the byte-wise
   difference between each pixel and its right neighbour is biased by mm7 and
   compared against mm6 (both are loaded from the b7E / b7C constants before the
   first use), and every lane that passes adds 0xFF (-1) to the per-lane counters
   in mm0. The unmodified row is then stored at offset dst from eax, which holds
   the address of tempBlock.
   NOTE(review): presumably this is the variant used when the row may straddle a
   32-byte line (see the andl/cmpl on the source address) - the branch itself is
   not visible here, confirm. */
1320 #define HDC_CHECK_AND_CPY(src, dst) \
1321 "movd " #src ", %%mm2 \n\t" /* pixels 0..3 */\
1322 "punpckldq 4" #src ", %%mm2 \n\t" /* pixels 4..7 from src+4 into the upper half */\
1323 "movq %%mm2, %%mm1 \n\t" /* keep an untouched copy of the row */\
1324 "psrlq $8, %%mm2 \n\t" /* lane x now holds p(x+1); the top lane gets 0 */\
1325 "psubb %%mm1, %%mm2 \n\t" /* p(x+1) - p(x), wrapping */\
1326 "paddb %%mm7, %%mm2 \n\t" /* bias the difference */\
1327 "pcmpgtb %%mm6, %%mm2 \n\t" /* signed compare: 0xFF where biased diff > mm6 */\
1328 "paddb %%mm2, %%mm0 \n\t" /* accumulate -1 per matching lane */\
1329 "movq %%mm1," #dst "(%%eax) \n\t" /* copy the row into the temp buffer */
1331 HDC_CHECK_AND_CPY((%1),0)
1332 HDC_CHECK_AND_CPY((%%ecx),8)
1333 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1334 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1335 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1336 HDC_CHECK_AND_CPY((%%ebx),40)
1337 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1338 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1341 // src does not cross a 32 byte cache line so don't waste time with alignment
/* Flatness test and copy for one 8-pixel row, reading the row with plain 8-byte
   loads. Neighbour differences are biased by mm7, compared against mm6, and each
   passing lane adds 0xFF (-1) to the counters in mm0; the unmodified row is
   stored at offset dst from eax. */
1342 #define HDC_CHECK_AND_CPY2(src, dst) \
1343 "movq " #src ", %%mm2 \n\t" /* row, will be shifted */\
1344 "movq " #src ", %%mm1 \n\t" /* row, kept untouched for the copy */\
1345 "psrlq $8, %%mm2 \n\t" /* lane x now holds p(x+1); the top lane gets 0 */\
1346 "psubb %%mm1, %%mm2 \n\t" /* p(x+1) - p(x), wrapping */\
1347 "paddb %%mm7, %%mm2 \n\t" /* bias the difference */\
1348 "pcmpgtb %%mm6, %%mm2 \n\t" /* signed compare: 0xFF where biased diff > mm6 */\
1349 "paddb %%mm2, %%mm0 \n\t" /* accumulate -1 per matching lane */\
1350 "movq %%mm1," #dst "(%%eax) \n\t" /* copy the row into the temp buffer */
1352 HDC_CHECK_AND_CPY2((%1),0)
1353 HDC_CHECK_AND_CPY2((%%ecx),8)
1354 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1355 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1356 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1357 HDC_CHECK_AND_CPY2((%%ebx),40)
1358 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1359 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1361 "psllq $8, %%mm0 \n\t" // remove dummy value
1362 "movq %%mm0, %%mm1 \n\t"
1363 "psrlw $8, %%mm0 \n\t"
1364 "paddb %%mm1, %%mm0 \n\t"
1365 "movq %%mm0, %%mm1 \n\t"
1366 "psrlq $16, %%mm0 \n\t"
1367 "paddb %%mm1, %%mm0 \n\t"
1368 "movq %%mm0, %%mm1 \n\t"
1369 "psrlq $32, %%mm0 \n\t"
1370 "paddb %%mm1, %%mm0 \n\t"
1371 "movd %%mm0, %0 \n\t"
1373 : "r" (src), "r" (stride)
1374 : "%eax", "%ebx", "%ecx"
1376 // printf("%d\n", numEq);
1377 numEq= (256 - numEq) &0xFF;
1380 for(y=0; y<BLOCK_SIZE; y++)
1382 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1383 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1384 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1385 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1386 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1387 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1388 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1389 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1390 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1391 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1392 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1393 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1394 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1395 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1396 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1400 /* if(abs(numEq - asmEq) > 0)
1402 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1403 for(int y=0; y<8; y++)
1405 for(int x=0; x<8; x++)
1407 printf("%d ", src[x + y*stride]);
1413 // printf("%d\n", numEq);
1414 return numEq > hFlatnessThreshold;
1417 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1424 "movq (%1, %2), %%mm0 \n\t"
1425 "movq (%1, %2, 8), %%mm1 \n\t"
1426 "movq %%mm0, %%mm2 \n\t"
1427 "psubusb %%mm1, %%mm0 \n\t"
1428 "psubusb %%mm2, %%mm1 \n\t"
1429 "por %%mm1, %%mm0 \n\t" // ABS Diff
1431 "movq pQPb, %%mm7 \n\t" // QP,..., QP
1432 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
1433 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
1434 "pcmpeqd b00, %%mm0 \n\t"
1435 "psrlq $16, %%mm0 \n\t"
1436 "pcmpeqd bFF, %%mm0 \n\t"
1437 // "movd %%mm0, (%1, %2, 4)\n\t"
1438 "movd %%mm0, %0 \n\t"
1440 : "r" (src), "r" (stride)
1444 if(abs(src[0] - src[7]) > 2*QP) return 0;
1450 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1454 "leal (%0, %1), %%ecx \n\t"
1455 "leal (%%ecx, %1, 4), %%ebx \n\t"
1456 // 0 1 2 3 4 5 6 7 8 9
1457 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1458 "pxor %%mm7, %%mm7 \n\t"
1459 "movq bm00001000, %%mm6 \n\t"
1460 "movd %2, %%mm5 \n\t" // QP
1461 "movq %%mm5, %%mm4 \n\t"
1462 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1463 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1464 "psllq $24, %%mm4 \n\t"
1465 "pxor %%mm5, %%mm5 \n\t" // 0
1466 "psubb %%mm4, %%mm5 \n\t" // -QP
1467 "leal tempBlock, %%eax \n\t"
1469 //FIXME? "unroll by 2" and mix
1471 #define HDF(src, dst) \
1472 "movq " #src "(%%eax), %%mm0 \n\t"\
1473 "movq " #src "(%%eax), %%mm1 \n\t"\
1474 "movq " #src "(%%eax), %%mm2 \n\t"\
1475 "psrlq $8, %%mm1 \n\t"\
1476 "psubusb %%mm1, %%mm2 \n\t"\
1477 "psubusb %%mm0, %%mm1 \n\t"\
1478 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1479 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1480 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
1481 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1482 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1483 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1484 "paddb %%mm5, %%mm1 \n\t"\
1485 "psubusb %%mm5, %%mm1 \n\t"\
1486 "psrlw $2, %%mm1 \n\t"\
1487 "pxor %%mm2, %%mm1 \n\t"\
1488 "psubb %%mm2, %%mm1 \n\t"\
1489 "pand %%mm6, %%mm1 \n\t"\
1490 "psubb %%mm1, %%mm0 \n\t"\
1491 "psllq $8, %%mm1 \n\t"\
1492 "paddb %%mm1, %%mm0 \n\t"\
1493 "movd %%mm0, " #dst" \n\t"\
1494 "psrlq $32, %%mm0 \n\t"\
1495 "movd %%mm0, 4" #dst" \n\t"
/* One row of the horizontal default deblocking filter: the row is read from
   offset src in the buffer addressed by eax and the result is written to dst
   with two 32-bit stores.
   This definition forms the byte-wise minimum from shifts and saturating
   subtracts (min(a,b) = a - max(a-b, 0)) where the other HDF definition uses
   pshufw/pminub.
   Expects mm5, mm6 and mm7 (= 0) to be set up by the surrounding asm;
   mm0-mm4 are used as scratch. */
1497 #define HDF(src, dst)\
1498 "movq " #src "(%%eax), %%mm0 \n\t"\
1499 "movq %%mm0, %%mm1 \n\t"\
1500 "movq %%mm0, %%mm2 \n\t"\
1501 "psrlq $8, %%mm1 \n\t" /* lane x holds p(x+1) */\
1502 "psubusb %%mm1, %%mm2 \n\t" /* max(px - p(x+1), 0) */\
1503 "psubusb %%mm0, %%mm1 \n\t" /* max(p(x+1) - px, 0) */\
1504 "por %%mm2, %%mm1 \n\t" /* mm1 = |px - p(x+1)| */\
1505 "pcmpeqb %%mm7, %%mm2 \n\t" /* mm2 = sign mask: 0xFF where px <= p(x+1) */\
1506 "movq %%mm1, %%mm3 \n\t"\
1507 "psllq $32, %%mm3 \n\t" /* move the differences up by 4 lanes */\
1508 "movq %%mm3, %%mm4 \n\t"\
1509 "psubusb %%mm1, %%mm4 \n\t" /* max(shifted - unshifted, 0) */\
1510 "psubb %%mm4, %%mm3 \n\t" /* min(shifted, unshifted) */\
1511 "psrlq $16, %%mm3 \n\t" /* lane 3 = min(|p1-p2|, |p5-p6|) */\
1512 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4| - min(|p1-p2|, |p5-p6|) */\
1513 "paddb %%mm5, %%mm1 \n\t"\
1514 "psubusb %%mm5, %%mm1 \n\t"\
1515 "psrlw $2, %%mm1 \n\t"\
1516 "pxor %%mm2, %%mm1 \n\t" /* with the psubb below: negate where the sign mask is set */\
1517 "psubb %%mm2, %%mm1 \n\t"\
1518 "pand %%mm6, %%mm1 \n\t" /* keep only the lane(s) selected by mm6 */\
1519 "psubb %%mm1, %%mm0 \n\t" /* px -= correction */\
1520 "psllq $8, %%mm1 \n\t"\
1521 "paddb %%mm1, %%mm0 \n\t" /* p(x+1) += correction */\
1522 "movd %%mm0, " #dst " \n\t"\
1523 "psrlq $32, %%mm0 \n\t"\
1524 "movd %%mm0, 4" #dst " \n\t"
1529 HDF(24,(%%ecx, %1, 2))
1533 HDF(56,(%%ebx, %1, 2))
1535 : "r" (dst), "r" (stride), "r" (QP)
1536 : "%eax", "%ebx", "%ecx"
1539 uint8_t *src= tempBlock;
1542 for(y=0; y<BLOCK_SIZE; y++)
1544 const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]);
1555 if(ABS(middleEnergy) < 8*QP)
1557 const int q=(src[3] - src[4])/2;
1558 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
1559 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
1561 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1565 d*= SIGN(-middleEnergy);
1588 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1589 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1590 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1592 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1595 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1597 "leal (%0, %1), %%ecx \n\t"
1598 "leal (%%ecx, %1, 4), %%ebx \n\t"
1599 // 0 1 2 3 4 5 6 7 8 9
1600 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1601 "pxor %%mm7, %%mm7 \n\t"
1602 "leal tempBlock, %%eax \n\t"
1604 #define HLP1 "movq (%0), %%mm0 \n\t"\
1605 "movq %%mm0, %%mm1 \n\t"\
1606 "psllq $8, %%mm0 \n\t"\
1607 PAVGB(%%mm1, %%mm0)\
1608 "psrlw $8, %%mm0 \n\t"\
1609 "pxor %%mm1, %%mm1 \n\t"\
1610 "packuswb %%mm1, %%mm0 \n\t"\
1611 "movq %%mm0, %%mm1 \n\t"\
1612 "movq %%mm0, %%mm2 \n\t"\
1613 "psllq $32, %%mm0 \n\t"\
1614 "paddb %%mm0, %%mm1 \n\t"\
1615 "psllq $16, %%mm2 \n\t"\
1616 PAVGB(%%mm2, %%mm0)\
1617 "movq %%mm0, %%mm3 \n\t"\
1618 "pand bm11001100, %%mm0 \n\t"\
1619 "paddusb %%mm0, %%mm3 \n\t"\
1620 "psrlq $8, %%mm3 \n\t"\
1621 PAVGB(%%mm1, %%mm4)\
1622 PAVGB(%%mm3, %%mm2)\
1623 "psrlq $16, %%mm2 \n\t"\
1624 "punpcklbw %%mm2, %%mm2 \n\t"\
1625 "movq %%mm2, (%0) \n\t"\
1627 #define HLP2 "movq (%0), %%mm0 \n\t"\
1628 "movq %%mm0, %%mm1 \n\t"\
1629 "psllq $8, %%mm0 \n\t"\
1630 PAVGB(%%mm1, %%mm0)\
1631 "psrlw $8, %%mm0 \n\t"\
1632 "pxor %%mm1, %%mm1 \n\t"\
1633 "packuswb %%mm1, %%mm0 \n\t"\
1634 "movq %%mm0, %%mm2 \n\t"\
1635 "psllq $32, %%mm0 \n\t"\
1636 "psllq $16, %%mm2 \n\t"\
1637 PAVGB(%%mm2, %%mm0)\
1638 "movq %%mm0, %%mm3 \n\t"\
1639 "pand bm11001100, %%mm0 \n\t"\
1640 "paddusb %%mm0, %%mm3 \n\t"\
1641 "psrlq $8, %%mm3 \n\t"\
1642 PAVGB(%%mm3, %%mm2)\
1643 "psrlq $16, %%mm2 \n\t"\
1644 "punpcklbw %%mm2, %%mm2 \n\t"\
1645 "movq %%mm2, (%0) \n\t"\
1647 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1649 Implemented Exact 7-Tap
1662 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1663 "movq %%mm0, %%mm1 \n\t"\
1664 "movq %%mm0, %%mm2 \n\t"\
1665 "movq %%mm0, %%mm3 \n\t"\
1666 "movq %%mm0, %%mm4 \n\t"\
1667 "psllq $8, %%mm1 \n\t"\
1668 "psrlq $8, %%mm2 \n\t"\
1669 "pand bm00000001, %%mm3 \n\t"\
1670 "pand bm10000000, %%mm4 \n\t"\
1671 "por %%mm3, %%mm1 \n\t"\
1672 "por %%mm4, %%mm2 \n\t"\
1673 PAVGB(%%mm2, %%mm1)\
1674 PAVGB(%%mm1, %%mm0)\
1676 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1677 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1678 PAVGB(%%mm3, %%mm4)\
1679 PAVGB(%%mm4, %%mm0)\
1680 "movd %%mm0, (%0) \n\t"\
1681 "psrlq $32, %%mm0 \n\t"\
1682 "movd %%mm0, 4(%0) \n\t"
1684 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1685 "movq %%mm0, %%mm1 \n\t"\
1686 "movq %%mm0, %%mm2 \n\t"\
1687 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1688 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1689 "psllq $8, %%mm1 \n\t"\
1690 "psrlq $8, %%mm2 \n\t"\
1691 "psrlq $24, %%mm3 \n\t"\
1692 "psllq $56, %%mm4 \n\t"\
1693 "por %%mm3, %%mm1 \n\t"\
1694 "por %%mm4, %%mm2 \n\t"\
1695 PAVGB(%%mm2, %%mm1)\
1696 PAVGB(%%mm1, %%mm0)\
1698 "movq %%mm0, %%mm3 \n\t"\
1699 "movq %%mm0, %%mm4 \n\t"\
1700 "movq %%mm0, %%mm5 \n\t"\
1701 "psrlq $16, %%mm3 \n\t"\
1702 "psllq $16, %%mm4 \n\t"\
1703 "pand bm11000000, %%mm5 \n\t"\
1704 "por %%mm5, %%mm3 \n\t"\
1705 "movq %%mm0, %%mm5 \n\t"\
1706 "pand bm00000011, %%mm5 \n\t"\
1707 "por %%mm5, %%mm4 \n\t"\
1708 PAVGB(%%mm3, %%mm4)\
1709 PAVGB(%%mm4, %%mm0)\
1710 "movd %%mm0, (%0) \n\t"\
1711 "psrlq $32, %%mm0 \n\t"\
1712 "movd %%mm0, 4(%0) \n\t"
1715 /* uses the 7-Tap Filter: 1112111 */
1716 #define NEW_HLP(src, dst)\
1717 "movq " #src "(%%eax), %%mm1 \n\t"\
1718 "movq " #src "(%%eax), %%mm2 \n\t"\
1719 "psllq $8, %%mm1 \n\t"\
1720 "psrlq $8, %%mm2 \n\t"\
1721 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
1722 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
1723 "psrlq $24, %%mm3 \n\t"\
1724 "psllq $56, %%mm4 \n\t"\
1725 "por %%mm3, %%mm1 \n\t"\
1726 "por %%mm4, %%mm2 \n\t"\
1727 "movq %%mm1, %%mm5 \n\t"\
1728 PAVGB(%%mm2, %%mm1)\
1729 "movq " #src "(%%eax), %%mm0 \n\t"\
1730 PAVGB(%%mm1, %%mm0)\
1731 "psllq $8, %%mm5 \n\t"\
1732 "psrlq $8, %%mm2 \n\t"\
1733 "por %%mm3, %%mm5 \n\t"\
1734 "por %%mm4, %%mm2 \n\t"\
1735 "movq %%mm5, %%mm1 \n\t"\
1736 PAVGB(%%mm2, %%mm5)\
1737 "psllq $8, %%mm1 \n\t"\
1738 "psrlq $8, %%mm2 \n\t"\
1739 "por %%mm3, %%mm1 \n\t"\
1740 "por %%mm4, %%mm2 \n\t"\
1741 PAVGB(%%mm2, %%mm1)\
1742 PAVGB(%%mm1, %%mm5)\
1743 PAVGB(%%mm5, %%mm0)\
1744 "movd %%mm0, " #dst " \n\t"\
1745 "psrlq $32, %%mm0 \n\t"\
1746 "movd %%mm0, 4" #dst " \n\t"
1748 /* uses the 9-Tap Filter: 112242211 */
1749 #define NEW_HLP2(i)\
1750 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
1751 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
1752 "movq %%mm0, %%mm2 \n\t" /*0001000*/\
1753 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1754 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1755 "psllq $8, %%mm1 \n\t"\
1756 "psrlq $8, %%mm2 \n\t"\
1757 "psrlq $24, %%mm3 \n\t"\
1758 "psllq $56, %%mm4 \n\t"\
1759 "por %%mm3, %%mm1 \n\t" /*0010000*/\
1760 "por %%mm4, %%mm2 \n\t" /*0000100*/\
1761 "movq %%mm1, %%mm5 \n\t" /*0010000*/\
1762 PAVGB(%%mm2, %%mm1) /*0010100*/\
1763 PAVGB(%%mm1, %%mm0) /*0012100*/\
1764 "psllq $8, %%mm5 \n\t"\
1765 "psrlq $8, %%mm2 \n\t"\
1766 "por %%mm3, %%mm5 \n\t" /*0100000*/\
1767 "por %%mm4, %%mm2 \n\t" /*0000010*/\
1768 "movq %%mm5, %%mm1 \n\t" /*0100000*/\
1769 PAVGB(%%mm2, %%mm5) /*0100010*/\
1770 "psllq $8, %%mm1 \n\t"\
1771 "psrlq $8, %%mm2 \n\t"\
1772 "por %%mm3, %%mm1 \n\t" /*1000000*/\
1773 "por %%mm4, %%mm2 \n\t" /*0000001*/\
1774 "movq %%mm1, %%mm6 \n\t" /*1000000*/\
1775 PAVGB(%%mm2, %%mm1) /*1000001*/\
1776 "psllq $8, %%mm6 \n\t"\
1777 "psrlq $8, %%mm2 \n\t"\
1778 "por %%mm3, %%mm6 \n\t"/*100000000*/\
1779 "por %%mm4, %%mm2 \n\t"/*000000001*/\
1780 PAVGB(%%mm2, %%mm6) /*100000001*/\
1781 PAVGB(%%mm6, %%mm1) /*110000011*/\
1782 PAVGB(%%mm1, %%mm5) /*112000211*/\
1783 PAVGB(%%mm5, %%mm0) /*112242211*/\
1784 "movd %%mm0, (%0) \n\t"\
1785 "psrlq $32, %%mm0 \n\t"\
1786 "movd %%mm0, 4(%0) \n\t"
1788 #define HLP(src, dst) NEW_HLP(src, dst)
1792 HLP(16, (%%ecx, %1))
1793 HLP(24, (%%ecx, %1, 2))
1794 HLP(32, (%0, %1, 4))
1796 HLP(48, (%%ebx, %1))
1797 HLP(56, (%%ebx, %1, 2))
1800 : "r" (dst), "r" (stride)
1801 : "%eax", "%ebx", "%ecx"
1805 uint8_t *temp= tempBlock;
1807 for(y=0; y<BLOCK_SIZE; y++)
1809 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1810 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1813 sums[0] = first + temp[0];
1814 sums[1] = temp[0] + temp[1];
1815 sums[2] = temp[1] + temp[2];
1816 sums[3] = temp[2] + temp[3];
1817 sums[4] = temp[3] + temp[4];
1818 sums[5] = temp[4] + temp[5];
1819 sums[6] = temp[5] + temp[6];
1820 sums[7] = temp[6] + temp[7];
1821 sums[8] = temp[7] + last;
1823 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1824 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1825 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1826 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1827 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
1828 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1829 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1830 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
1839 static inline void dering(uint8_t src[], int stride, int QP)
1845 "leal (%0, %1), %%eax \n\t"
1846 "leal (%%eax, %1, 4), %%ebx \n\t"
1847 // 0 1 2 3 4 5 6 7 8 9
1848 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1850 "pcmpeq %%mm6, %%mm6 \n\t"
1851 "pxor %%mm7, %%mm7 \n\t"
1853 #define FIND_MIN_MAX(addr)\
1854 "movq (" #addr "), %%mm0, \n\t"\
1855 "pminub %%mm0, %%mm6 \n\t"\
1856 "pmaxub %%mm0, %%mm7 \n\t"
1860 FIND_MIN_MAX(%%eax, %1)
1861 FIND_MIN_MAX(%%eax, %1, 2)
1862 FIND_MIN_MAX(%0, %1, 4)
1864 FIND_MIN_MAX(%%ebx, %1)
1865 FIND_MIN_MAX(%%ebx, %1, 2)
1866 FIND_MIN_MAX(%0, %1, 8)
1867 FIND_MIN_MAX(%%ebx, %1, 2)
1869 "movq %%mm6, %%mm4 \n\t"
1870 "psrlq $32, %%mm6 \n\t"
1871 "pminub %%mm4, %%mm6 \n\t"
1872 "movq %%mm6, %%mm4 \n\t"
1873 "psrlq $16, %%mm6 \n\t"
1874 "pminub %%mm4, %%mm6 \n\t"
1875 "movq %%mm6, %%mm4 \n\t"
1876 "psrlq $8, %%mm6 \n\t"
1877 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1879 "movq %%mm7, %%mm4 \n\t"
1880 "psrlq $32, %%mm7 \n\t"
1881 "pmaxub %%mm4, %%mm7 \n\t"
1882 "movq %%mm7, %%mm4 \n\t"
1883 "psrlq $16, %%mm7 \n\t"
1884 "pmaxub %%mm4, %%mm7 \n\t"
1885 "movq %%mm7, %%mm4 \n\t"
1886 "psrlq $8, %%mm7 \n\t"
1887 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1888 PAVGB(%%mm6, %%mm7) // (max + min)/2
1891 : : "r" (src), "r" (stride), "r" (QP)
1901 * Deinterlaces the given block
1902 * will be called for every 8x8 block, and can read & write into an 8x16 block
1904 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1906 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1908 "leal (%0, %1), %%eax \n\t"
1909 "leal (%%eax, %1, 4), %%ebx \n\t"
1910 // 0 1 2 3 4 5 6 7 8 9
1911 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1913 "movq (%0), %%mm0 \n\t"
1914 "movq (%%eax, %1), %%mm1 \n\t"
1916 "movq %%mm0, (%%eax) \n\t"
1917 "movq (%0, %1, 4), %%mm0 \n\t"
1919 "movq %%mm1, (%%eax, %1, 2) \n\t"
1920 "movq (%%ebx, %1), %%mm1 \n\t"
1922 "movq %%mm0, (%%ebx) \n\t"
1923 "movq (%0, %1, 8), %%mm0 \n\t"
1925 "movq %%mm1, (%%ebx, %1, 2) \n\t"
1927 : : "r" (src), "r" (stride)
1934 src[stride] = (src[0] + src[stride*2])>>1;
1935 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1936 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1937 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1944 * Deinterlaces the given block
1945 * will be called for every 8x8 block, and can read & write into an 8x16 block
1946 * no clipping in the C version
1948 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
1950 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1952 "leal (%0, %1), %%eax \n\t"
1953 "leal (%%eax, %1, 4), %%ebx \n\t"
1954 "leal (%%ebx, %1, 4), %%ecx \n\t"
1955 "addl %1, %%ecx \n\t"
1956 "pxor %%mm7, %%mm7 \n\t"
1957 // 0 1 2 3 4 5 6 7 8 9 10
1958 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
/* Cubic interpolation of one 8-pixel row: c = (9*b + 9*d - a - e)/16.
   a, b, d, e are the source memory operands and c is the destination operand.
   The two averages are widened to 16-bit words (mm7 must be 0 for the unpack),
   combined, and packed back with unsigned saturation, so the stored bytes are
   clipped to 0..255.
   PAVGB is defined elsewhere in this file; the notes below treat it as a
   byte-wise average of its two operands. */
1960 #define DEINT_CUBIC(a,b,c,d,e)\
1961 "movq " #a ", %%mm0 \n\t"\
1962 "movq " #b ", %%mm1 \n\t"\
1963 "movq " #d ", %%mm2 \n\t"\
1964 "movq " #e ", %%mm3 \n\t"\
1965 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1966 PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1967 "movq %%mm0, %%mm2 \n\t"\
1968 "punpcklbw %%mm7, %%mm0 \n\t" /* low 4 bytes of (a+e)/2 as words */\
1969 "punpckhbw %%mm7, %%mm2 \n\t" /* high 4 bytes of (a+e)/2 as words */\
1970 "movq %%mm1, %%mm3 \n\t"\
1971 "punpcklbw %%mm7, %%mm1 \n\t" /* low 4 bytes of (b+d)/2 as words */\
1972 "punpckhbw %%mm7, %%mm3 \n\t" /* high 4 bytes of (b+d)/2 as words */\
1973 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1974 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1975 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1976 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1977 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1978 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1979 "packuswb %%mm3, %%mm1 \n\t" /* back to bytes, saturating to 0..255 */\
1980 "movq %%mm1, " #c " \n\t"
1982 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1983 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1984 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1985 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1987 : : "r" (src), "r" (stride)
1988 : "%eax", "%ebx", "ecx"
1994 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1995 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1996 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1997 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2004 * Deinterlaces the given block
2005 * will be called for every 8x8 block, and can read & write into an 8x16 block
2006 * will shift the image up by 1 line (FIXME if this is a problem)
2008 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
2010 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2012 "leal (%0, %1), %%eax \n\t"
2013 "leal (%%eax, %1, 4), %%ebx \n\t"
2014 // 0 1 2 3 4 5 6 7 8 9
2015 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2017 "movq (%0), %%mm0 \n\t" // L0
2018 "movq (%%eax, %1), %%mm1 \n\t" // L2
2019 PAVGB(%%mm1, %%mm0) // L0+L2
2020 "movq (%%eax), %%mm2 \n\t" // L1
2022 "movq %%mm0, (%0) \n\t"
2023 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
2024 PAVGB(%%mm0, %%mm2) // L1+L3
2025 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
2026 "movq %%mm2, (%%eax) \n\t"
2027 "movq (%0, %1, 4), %%mm2 \n\t" // L4
2028 PAVGB(%%mm2, %%mm1) // L2+L4
2029 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2030 "movq %%mm1, (%%eax, %1) \n\t"
2031 "movq (%%ebx), %%mm1 \n\t" // L5
2032 PAVGB(%%mm1, %%mm0) // L3+L5
2033 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2034 "movq %%mm0, (%%eax, %1, 2) \n\t"
2035 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2036 PAVGB(%%mm0, %%mm2) // L4+L6
2037 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2038 "movq %%mm2, (%0, %1, 4) \n\t"
2039 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2040 PAVGB(%%mm2, %%mm1) // L5+L7
2041 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2042 "movq %%mm1, (%%ebx) \n\t"
2043 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2044 PAVGB(%%mm1, %%mm0) // L6+L8
2045 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2046 "movq %%mm0, (%%ebx, %1) \n\t"
2047 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2048 PAVGB(%%mm0, %%mm2) // L7+L9
2049 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2050 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2053 : : "r" (src), "r" (stride)
2060 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2061 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2062 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2063 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2064 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2065 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2066 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2067 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2074 * Deinterlaces the given block
2075 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
2077 static inline void deInterlaceMedian(uint8_t src[], int stride)
2082 "leal (%0, %1), %%eax \n\t"
2083 "leal (%%eax, %1, 4), %%ebx \n\t"
2084 // 0 1 2 3 4 5 6 7 8 9
2085 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2087 "movq (%0), %%mm0 \n\t" //
2088 "movq (%%eax, %1), %%mm2 \n\t" //
2089 "movq (%%eax), %%mm1 \n\t" //
2090 "movq %%mm0, %%mm3 \n\t"
2091 "pmaxub %%mm1, %%mm0 \n\t" //
2092 "pminub %%mm3, %%mm1 \n\t" //
2093 "pmaxub %%mm2, %%mm1 \n\t" //
2094 "pminub %%mm1, %%mm0 \n\t"
2095 "movq %%mm0, (%%eax) \n\t"
2097 "movq (%0, %1, 4), %%mm0 \n\t" //
2098 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2099 "movq %%mm2, %%mm3 \n\t"
2100 "pmaxub %%mm1, %%mm2 \n\t" //
2101 "pminub %%mm3, %%mm1 \n\t" //
2102 "pmaxub %%mm0, %%mm1 \n\t" //
2103 "pminub %%mm1, %%mm2 \n\t"
2104 "movq %%mm2, (%%eax, %1, 2) \n\t"
2106 "movq (%%ebx), %%mm2 \n\t" //
2107 "movq (%%ebx, %1), %%mm1 \n\t" //
2108 "movq %%mm2, %%mm3 \n\t"
2109 "pmaxub %%mm0, %%mm2 \n\t" //
2110 "pminub %%mm3, %%mm0 \n\t" //
2111 "pmaxub %%mm1, %%mm0 \n\t" //
2112 "pminub %%mm0, %%mm2 \n\t"
2113 "movq %%mm2, (%%ebx) \n\t"
2115 "movq (%%ebx, %1, 2), %%mm2 \n\t" //
2116 "movq (%0, %1, 8), %%mm0 \n\t" //
2117 "movq %%mm2, %%mm3 \n\t"
2118 "pmaxub %%mm0, %%mm2 \n\t" //
2119 "pminub %%mm3, %%mm0 \n\t" //
2120 "pmaxub %%mm1, %%mm0 \n\t" //
2121 "pminub %%mm0, %%mm2 \n\t"
2122 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2125 : : "r" (src), "r" (stride)
2129 #else // MMX without MMX2
2131 "leal (%0, %1), %%eax \n\t"
2132 "leal (%%eax, %1, 4), %%ebx \n\t"
2133 // 0 1 2 3 4 5 6 7 8 9
2134 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2135 "pxor %%mm7, %%mm7 \n\t"
/* Byte-wise median of three 8-pixel rows a, b, c; the result is stored to b.
   Uses only compare/logic operations: three "less-or-equal" masks are built with
   saturating subtracts, XOR-ed pairwise so that each operand gets a mask that is
   0x00 in the lanes where it is the median and 0xFF elsewhere, then every
   operand is OR-ed with its mask and the three are AND-ed together, which
   leaves the median in each lane.
   Expects mm7 = 0; uses mm0-mm6 as scratch. */
2137 #define MEDIAN(a,b,c)\
2138 "movq " #a ", %%mm0 \n\t"\
2139 "movq " #b ", %%mm2 \n\t"\
2140 "movq " #c ", %%mm1 \n\t"\
2141 "movq %%mm0, %%mm3 \n\t"\
2142 "movq %%mm1, %%mm4 \n\t"\
2143 "movq %%mm2, %%mm5 \n\t"\
2144 "psubusb %%mm1, %%mm3 \n\t" /* max(a - c, 0) */\
2145 "psubusb %%mm2, %%mm4 \n\t" /* max(c - b, 0) */\
2146 "psubusb %%mm0, %%mm5 \n\t" /* max(b - a, 0) */\
2147 "pcmpeqb %%mm7, %%mm3 \n\t" /* 0xFF where a <= c */\
2148 "pcmpeqb %%mm7, %%mm4 \n\t" /* 0xFF where c <= b */\
2149 "pcmpeqb %%mm7, %%mm5 \n\t" /* 0xFF where b <= a */\
2150 "movq %%mm3, %%mm6 \n\t"\
2151 "pxor %%mm4, %%mm3 \n\t" /* (a<=c) ^ (c<=b): 0 where c is the median */\
2152 "pxor %%mm5, %%mm4 \n\t" /* (c<=b) ^ (b<=a): 0 where b is the median */\
2153 "pxor %%mm6, %%mm5 \n\t" /* (b<=a) ^ (a<=c): 0 where a is the median */\
2154 "por %%mm3, %%mm1 \n\t" /* c, forced to 0xFF where it is not the median */\
2155 "por %%mm4, %%mm2 \n\t" /* b, likewise */\
2156 "por %%mm5, %%mm0 \n\t" /* a, likewise */\
2157 "pand %%mm2, %%mm0 \n\t"\
2158 "pand %%mm1, %%mm0 \n\t" /* only the median survives the ANDs */\
2159 "movq %%mm0, " #b " \n\t"
2161 MEDIAN((%0), (%%eax), (%%eax, %1))
2162 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2163 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2164 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
2166 : : "r" (src), "r" (stride)
// Each row becomes (row + 2*next + next-but-one) >> 2, processed top to bottom, so every
// statement reads rows that have not been overwritten yet.
// NOTE(review): these eight statements are identical to the C path of deInterlaceBlendLinear;
// they compute a 1-2-1 vertical blend, not a median like the asm paths of this function - confirm
// whether a real median fallback is intended here.
2175 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2176 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2177 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2178 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2179 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2180 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2181 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2182 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2188 #ifdef HAVE_ODIVX_POSTPROCESS
2189 #include "../opendivx/postprocess.h"
2193 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2194 QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2196 /* -pp Command line Help
2197 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2199 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2202 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2204 -pp vb:a,hb:a,lb -pp de,-vb
2207 short long name short long option Description
2208 * * a autoq CPU power dependent enabler
2209 c chrom chrominance filtering enabled
2210 y nochrom chrominance filtering disabled
2211 hb hdeblock horizontal deblocking filter
2212 vb vdeblock vertical deblocking filter
2214 h1 x1hdeblock Experimental horizontal deblock filter 1
2215 v1 x1vdeblock Experimental vertical deblock filter 1
2216 dr dering not implemented yet
2217 al autolevels automatic brightness / contrast fixer
2218 f fullyrange stretch luminance range to (0..255)
2219 lb linblenddeint linear blend deinterlacer
2220 li linipoldeint linear interpolating deinterlacer
2221 ci cubicipoldeint cubic interpolating deinterlacer
2222 md mediandeint median deinterlacer
2223 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2224 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2228 * returns a PPMode struct which will have a non-zero error variable if an error occurred
2229 * name is the string after "-pp" on the command line
2230 * quality is a number from 0 to GET_PP_QUALITY_MAX
// Builds a struct PPMode from a textual filter list.
//
// name:    filter list; it is copied into a local buffer, split on ',' into
//          filter tokens, and each token is split on ':' into a filter name
//          followed by options.
// quality: replaces the default GET_PP_QUALITY_MAX threshold of a filter
//          that carries the "autoq" / "a" option.
//
// For every filter whose long or short name matches an entry of filters[],
// the mask of that entry is first cleared from lumMode and chromMode and then
// set again, subject to the minimum quality values of the entry and to the
// chrom setting. ppMode.error is incremented when filterNameOk is zero and
// grows by the number of options that nobody consumed. Finally the H/V
// deblock and dering bits are mirrored into ppMode.oldMode as PP_* flags.
//
// NOTE(review): this listing omits lines of the function (braces and the
// declarations of p, filterToken, filterName, option, chrom, i, o, plen,
// spaceLeft and filterNameOk among them), so the comments below describe only
// the statements that are visible.
// NOTE(review): strtok keeps hidden static state, so this function is not
// reentrant.
2232 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2234 char temp[GET_MODE_BUFFER_SIZE];
2236 char *filterDelimiters= ",";
2237 char *optionDelimiters= ":";
2238 struct PPMode ppMode= {0,0,0,0,0,0};
// NOTE(review): strncpy leaves temp without a terminating NUL when
// strlen(name) >= GET_MODE_BUFFER_SIZE; no explicit terminator is visible.
2241 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2246 int q= GET_PP_QUALITY_MAX;
2249 char *options[OPTIONS_ARRAY_SIZE];
2252 int numOfUnknownOptions=0;
2253 int enable=1; //does the user want us to enable or disable the filter
// The strtok calls on ':' below reset the state of strtok, so the ',' scan is
// restarted from p on every iteration and p is advanced by hand.
2255 filterToken= strtok(p, filterDelimiters);
2256 if(filterToken == NULL) break;
2257 p+= strlen(filterToken) + 1;
2258 filterName= strtok(filterToken, optionDelimiters);
// debug output on stdout
2259 printf("%s::%s\n", filterToken, filterName);
// A leading minus sign on the filter name. The body of this if is not visible
// here; presumably it clears enable and skips the sign -- confirm.
2261 if(*filterName == '-')
2266 for(;;){ //for all options
2267 option= strtok(NULL, optionDelimiters);
2268 if(option == NULL) break;
// debug output on stdout
2270 printf("%s\n", option);
// autoq/a: use the quality argument as threshold; nochrom/y and chrom/c set
// chrom to 0 / 1.
2271 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2272 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2273 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
// Remember the option as unknown and keep the list NULL-terminated.
2276 options[numOfUnknownOptions] = option;
2277 numOfUnknownOptions++;
2278 options[numOfUnknownOptions] = NULL;
// Stop while there is still room for the NULL terminator.
2280 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2283 /* replace stuff from the replace Table */
// replaceTable holds pairs: entry 2*i is a name, entry 2*i+1 its replacement
// text, which is spliced into temp at p so that it is parsed next.
2284 for(i=0; replaceTable[2*i]!=NULL; i++)
2286 if(!strcmp(replaceTable[2*i], filterName))
2288 int newlen= strlen(replaceTable[2*i + 1]);
// NOTE(review): p is only changed by pointer arithmetic in the visible lines,
// so the p==NULL branch looks unreachable from here -- confirm.
2292 if(p==NULL) p= temp, *p=0; //last filter
2293 else p--, *p=','; //not last filter
// NOTE(review): despite its name, spaceLeft as computed is the offset of p in
// temp plus plen (presumably strlen(p)), i.e. bytes in use. The pointer to int
// casts truncate where pointers are wider than int; (p - temp) avoids that.
2296 spaceLeft= (int)p - (int)temp + plen;
2297 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// Shift the pending text (including its terminator) to the right, then copy
// the replacement in front of it.
2302 memmove(p + newlen, p, plen+1);
2303 memcpy(p, replaceTable[2*i + 1], newlen);
// Look the filter name up by long or short name.
2308 for(i=0; filters[i].shortName!=NULL; i++)
2310 if( !strcmp(filters[i].longName, filterName)
2311 || !strcmp(filters[i].shortName, filterName))
// Start from a clean state for this filter.
2313 ppMode.lumMode &= ~filters[i].mask;
2314 ppMode.chromMode &= ~filters[i].mask;
2317 if(!enable) break; // user wants to disable it
2319 if(q >= filters[i].minLumQuality)
2320 ppMode.lumMode|= filters[i].mask;
// chrom==1 forces chroma on; chrom==-1 falls back to the chromDefault of the
// filter entry.
2321 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2322 if(q >= filters[i].minChromQuality)
2323 ppMode.chromMode|= filters[i].mask;
// LEVEL_FIX: the allowed luma range defaults to 16..234; the fullyrange/f
// option widens it to 0..255 and is then no longer counted as unknown.
2325 if(filters[i].mask == LEVEL_FIX)
2328 ppMode.minAllowedY= 16;
2329 ppMode.maxAllowedY= 234;
// NOTE(review): options[] only receives its NULL terminator when an unknown
// option was stored above; no other initialization is visible, so with zero
// unknown options options[0] may be read uninitialized here -- confirm.
2330 for(o=0; options[o]!=NULL; o++)
2331 if( !strcmp(options[o],"fullyrange")
2332 ||!strcmp(options[o],"f"))
2334 ppMode.minAllowedY= 0;
2335 ppMode.maxAllowedY= 255;
2336 numOfUnknownOptions--;
2341 if(!filterNameOk) ppMode.error++;
2342 ppMode.error += numOfUnknownOptions;
// Mirror the new-style flags into the old PP_* flag set.
2345 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2346 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2347 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2348 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2349 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2350 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
// Post-processes a three-plane picture using one packed mode value.
//
// Visible behaviour: plane 0 is filtered with mode at full size; then the
// sizes are halved, bits 4..7 of mode are moved down to bits 0..3 (bits 8 and
// up are kept) and planes 1 and 2 are filtered with that value. memcpy calls
// for planes 1 and 2 are visible as well; the condition that selects between
// filtering and copying is not.
//
// NOTE(review): this listing omits lines of the function (the last parameter,
// braces and conditionals among them).
2358 void postprocess(unsigned char * src[], int src_stride,
2359 unsigned char * dst[], int dst_stride,
2360 int horizontal_size, int vertical_size,
2361 QP_STORE_T *QP_store, int QP_stride,
// NOTE(review): qual is not declared in the visible lines, and the code after
// the postprocess2 call would repeat the work; the next four lines may be
// disabled or debug-only code in the full file -- confirm.
2367 struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2370 printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2371 postprocess2(src, src_stride, dst, dst_stride,
2372 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2377 #ifdef HAVE_ODIVX_POSTPROCESS
2378 // Note: this could be done outside of this file, but it would mean one
2379 // more function call...
2381 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
// plane 0 at full size (isColor argument 0)
2386 postProcess(src[0], src_stride, dst[0], dst_stride,
2387 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
// planes 1 and 2 are handled at half width and half height
2389 horizontal_size >>= 1;
2390 vertical_size >>= 1;
// move bits 4..7 down to bits 0..3 and keep bits 8 and up; presumably this
// selects the chroma flags -- confirm against the flag definitions
2393 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2397 postProcess(src[1], src_stride, dst[1], dst_stride,
2398 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2399 postProcess(src[2], src_stride, dst[2], dst_stride,
2400 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
// NOTE(review): the copy length is src_stride*horizontal_size; a whole plane
// would normally be the stride times the number of rows (vertical_size), and
// dst is written with the source stride -- confirm.
2404 memcpy(dst[1], src[1], src_stride*horizontal_size);
2405 memcpy(dst[2], src[2], src_stride*horizontal_size);
// Post-processes a three-plane picture with the per-plane flags of *mode.
//
// src, dst:        arrays of plane pointers; planes 0, 1 and 2 are used.
// src_stride,
// dst_stride:      strides passed unchanged to postProcess for every plane.
// horizontal_size,
// vertical_size:   size of plane 0; both are halved for planes 1 and 2.
// QP_store,
// QP_stride:       passed through to postProcess unchanged.
// mode:            lumMode is used for plane 0, chromMode for planes 1 and 2.
//
// NOTE(review): this listing omits lines of the function (braces, the
// condition around the odivx call and its last argument among them).
2409 void postprocess2(unsigned char * src[], int src_stride,
2410 unsigned char * dst[], int dst_stride,
2411 int horizontal_size, int vertical_size,
2412 QP_STORE_T *QP_store, int QP_stride,
2413 struct PPMode *mode)
2416 #ifdef HAVE_ODIVX_POSTPROCESS
2417 // Note: this could be done outside of this file, but it would mean one
2418 // more function call...
2420 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
// plane 0 at full size with the luma flags
2426 postProcess(src[0], src_stride, dst[0], dst_stride,
2427 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
// planes 1 and 2 at half width and half height with the chroma flags;
// NOTE(review): presumably 4:2:0 chroma subsampling -- confirm
2429 horizontal_size >>= 1;
2430 vertical_size >>= 1;
2434 postProcess(src[1], src_stride, dst[1], dst_stride,
2435 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2436 postProcess(src[2], src_stride, dst[2], dst_stride,
2437 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2442 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
// Returns the mode flag set for a quality level by indexing a table of
// 1+GET_PP_QUALITY_MAX entries. In the HAVE_ODIVX_POSTPROCESS section a second
// table of old-style PP_* flags is returned instead when use_old_pp is set.
//
// NOTE(review): quality is used as an array index without a visible range
// check, so callers have to pass 0..GET_PP_QUALITY_MAX.
// NOTE(review): two orderings of the modes[] initializers are visible
// ("horizontal filters first" and "vertical filters first"); presumably a
// preprocessor conditional in the omitted lines selects one -- confirm. The
// first entries of each table are not visible in this listing either.
2445 int getPpModeForQuality(int quality){
2446 int modes[1+GET_PP_QUALITY_MAX]= {
2449 // horizontal filters first
2451 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2452 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2453 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2454 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2455 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2457 // vertical filters first
2459 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2460 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2461 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2462 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2463 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2467 #ifdef HAVE_ODIVX_POSTPROCESS
2468 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2471 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2472 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2473 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2474 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2475 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
// the old post-processor uses its own flag values
2477 if(use_old_pp) return odivx_modes[quality];
2479 return modes[quality];
2483 * Copies a block from src to dst and fixes the black level
2484 * numLines must be a multiple of 4
2485 * levelFix == 0 -> do not touch the brightness & contrast
// Copies numLines rows from src to dst; consecutive rows are srcStride and
// dstStride bytes apart, and the C loops copy BLOCK_SIZE bytes per row.
//
// Four variants are visible: an MMX sequence built from SCALED_CPY that
// rescales every pixel with packedYOffset / packedYScale, an MMX sequence
// built from SIMPLE_CPY that copies rows unchanged, and two plain memcpy
// loops.
//
// NOTE(review): the conditionals that choose between the variants (levelFix
// and the MMX build switch, presumably), the asm operand lists and the macro
// invocations are in lines this listing omits. %0 / %1 look like the src / dst
// pointers and %2 / %3 like their strides -- confirm against the operand list.
2487 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2488 int numLines, int levelFix)
// eax = 2*%2 and ebx = 2*%3 (two rows); mm2 = offset, mm3 = scale, mm4 = 0
2495 "leal (%2,%2), %%eax \n\t"
2496 "leal (%3,%3), %%ebx \n\t"
2497 "movq packedYOffset, %%mm2 \n\t"
2498 "movq packedYScale, %%mm3 \n\t"
2499 "pxor %%mm4, %%mm4 \n\t"
// SCALED_CPY handles two rows of 8 bytes: the bytes are widened to 16 bit
// words, the offset in mm2 is subtracted, the result is shifted left by 6 and
// multiplied by the scale in mm3 keeping the high 16 bits, then it is packed
// back to bytes with unsigned saturation and stored. The macro also advances
// %0 by eax; the addl lines after it advance %1 by ebx.
2501 #define SCALED_CPY \
2502 "movq (%0), %%mm0 \n\t"\
2503 "movq (%0), %%mm5 \n\t"\
2504 "punpcklbw %%mm4, %%mm0 \n\t"\
2505 "punpckhbw %%mm4, %%mm5 \n\t"\
2506 "psubw %%mm2, %%mm0 \n\t"\
2507 "psubw %%mm2, %%mm5 \n\t"\
2508 "movq (%0,%2), %%mm1 \n\t"\
2509 "psllw $6, %%mm0 \n\t"\
2510 "psllw $6, %%mm5 \n\t"\
2511 "pmulhw %%mm3, %%mm0 \n\t"\
2512 "movq (%0,%2), %%mm6 \n\t"\
2513 "pmulhw %%mm3, %%mm5 \n\t"\
2514 "punpcklbw %%mm4, %%mm1 \n\t"\
2515 "punpckhbw %%mm4, %%mm6 \n\t"\
2516 "psubw %%mm2, %%mm1 \n\t"\
2517 "psubw %%mm2, %%mm6 \n\t"\
2518 "psllw $6, %%mm1 \n\t"\
2519 "psllw $6, %%mm6 \n\t"\
2520 "pmulhw %%mm3, %%mm1 \n\t"\
2521 "pmulhw %%mm3, %%mm6 \n\t"\
2522 "addl %%eax, %0 \n\t"\
2523 "packuswb %%mm5, %%mm0 \n\t"\
2524 "packuswb %%mm6, %%mm1 \n\t"\
2525 "movq %%mm0, (%1) \n\t"\
2526 "movq %%mm1, (%1, %3) \n\t"\
2529 "addl %%ebx, %1 \n\t"
2531 "addl %%ebx, %1 \n\t"
2533 "addl %%ebx, %1 \n\t"
// Plain copy, one row per iteration.
// NOTE(review): both visible C loops are plain copies, so if this one belongs
// to the levelFix path no level correction happens in a build without the MMX
// code -- confirm.
2543 for(i=0; i<numLines; i++)
2544 memcpy( &(dst[dstStride*i]),
2545 &(src[srcStride*i]), BLOCK_SIZE);
// Stores operand %4 into the global temp0; how it is used afterwards is not
// visible in this listing.
2552 "movl %4, %%eax \n\t"
2553 "movl %%eax, temp0\n\t"
// NOTE(review): mm2 / mm3 are loaded here although SIMPLE_CPY does not use
// them.
2556 "leal (%2,%2), %%eax \n\t"
2557 "leal (%3,%3), %%ebx \n\t"
2558 "movq packedYOffset, %%mm2 \n\t"
2559 "movq packedYScale, %%mm3 \n\t"
// SIMPLE_CPY copies two rows of 8 bytes unchanged through mm0 / mm1; the addl
// pairs after it advance %0 and %1 by two rows each.
2561 #define SIMPLE_CPY \
2562 "movq (%0), %%mm0 \n\t"\
2563 "movq (%0,%2), %%mm1 \n\t"\
2564 "movq %%mm0, (%1) \n\t"\
2565 "movq %%mm1, (%1, %3) \n\t"\
2569 "addl %%eax, %0 \n\t"
2570 "addl %%ebx, %1 \n\t"
2572 "addl %%eax, %0 \n\t"
2573 "addl %%ebx, %1 \n\t"
// Plain copy, one row per iteration.
2587 for(i=0; i<numLines; i++)
2588 memcpy( &(dst[dstStride*i]),
2589 &(src[srcStride*i]), BLOCK_SIZE);
2596 * Filters array of bytes (Y or U or V values)
2598 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2599 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2602 /* we need 64bit here, otherwise we will have a problem
2603 after watching a black picture for 5 hours */
2604 static uint64_t *yHistogram= NULL;
2605 int black=0, white=255; // blackest black and whitest white in the picture
2607 /* Temporary buffers for handling the last row(s) */
2608 static uint8_t *tempDst= NULL;
2609 static uint8_t *tempSrc= NULL;
2611 /* Temporary buffers for handling the last block */
2612 static uint8_t *tempDstBlock= NULL;
2613 static uint8_t *tempSrcBlock= NULL;
2615 uint8_t *dstBlockPtrBackup;
2616 uint8_t *srcBlockPtrBackup;
2619 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2625 tempDst= (uint8_t*)memalign(8, 1024*24);
2626 tempSrc= (uint8_t*)memalign(8, 1024*24);
2627 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2628 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2634 yHistogram= (uint64_t*)malloc(8*256);
2635 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2637 if(mode & FULL_Y_RANGE)
2648 static int framenum= -1;
2649 uint64_t maxClipped;
2654 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2656 for(i=0; i<256; i++)
2658 sum+= yHistogram[i];
2659 // printf("%d ", yHistogram[i]);
2663 /* we always get a completely black picture first */
2664 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2667 for(black=255; black>0; black--)
2669 if(clipped < maxClipped) break;
2670 clipped-= yHistogram[black];
2674 for(white=0; white<256; white++)
2676 if(clipped < maxClipped) break;
2677 clipped-= yHistogram[white];
2680 packedYOffset= (black - minAllowedY) & 0xFFFF;
2681 packedYOffset|= packedYOffset<<32;
2682 packedYOffset|= packedYOffset<<16;
2684 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2686 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2687 packedYScale|= packedYScale<<32;
2688 packedYScale|= packedYScale<<16;
2692 packedYScale= 0x0100010001000100LL;
2696 /* copy first row of 8x8 blocks */
2697 for(x=0; x<width; x+=BLOCK_SIZE)
2698 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2700 for(y=0; y<height; y+=BLOCK_SIZE)
2702 //1% speedup if these are here instead of the inner loop
2703 uint8_t *srcBlock= &(src[y*srcStride]);
2704 uint8_t *dstBlock= &(dst[y*dstStride]);
2706 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards? If not,
2707 then use a temporary buffer */
2710 /* copy from line 5 to 12 of src, these will be copied with
2711 blockCopy to dst later */
2712 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2713 srcStride*MAX(height-y-5, 0) );
2715 /* duplicate last line to fill the void upto line 12 */
2719 for(i=height-y; i<=12; i++)
2720 memcpy(tempSrc + srcStride*i,
2721 src + srcStride*(height-1), srcStride);
2725 /* copy up to 5 lines of dst */
2726 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
2731 // From this point on it is guaranteed that we can read and write 16 lines downward
2732 // finish 1 block before the next, otherwise we might have a problem
2733 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2734 for(x=0; x<width; x+=BLOCK_SIZE)
2736 const int stride= dstStride;
2740 QP=QPs[(y>>3)*QPStride + (x>>3)];
2744 QP= QPs[(y>>4)*QPStride + (x>>4)];
2745 if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
2746 yHistogram[ srcBlock[srcStride*5] ]++;
2750 "movd %0, %%mm7 \n\t"
2751 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2752 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2753 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
2754 "movq %%mm7, pQPb \n\t"
2764 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2765 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2766 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2767 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2768 #elif defined(HAVE_3DNOW)
2769 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2770 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2771 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2772 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2773 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2777 #ifdef PP_FUNNY_STRIDE
2778 //can we mess with a 8x16 block, if not use a temp buffer, yes again
2782 dstBlockPtrBackup= dstBlock;
2783 srcBlockPtrBackup= srcBlock;
2785 for(i=0;i<BLOCK_SIZE*2; i++)
2787 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2788 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2791 dstBlock= tempDstBlock;
2792 srcBlock= tempSrcBlock;
2796 blockCopy(dstBlock + dstStride*5, dstStride,
2797 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
2799 if(mode & LINEAR_IPOL_DEINT_FILTER)
2800 deInterlaceInterpolateLinear(dstBlock, dstStride);
2801 else if(mode & LINEAR_BLEND_DEINT_FILTER)
2802 deInterlaceBlendLinear(dstBlock, dstStride);
2803 else if(mode & MEDIAN_DEINT_FILTER)
2804 deInterlaceMedian(dstBlock, dstStride);
2805 else if(mode & CUBIC_IPOL_DEINT_FILTER)
2806 deInterlaceInterpolateCubic(dstBlock, dstStride);
2807 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
2808 deInterlaceBlendCubic(dstBlock, dstStride);
2811 /* only deblock if we have 2 blocks */
2819 if(mode & V_RK1_FILTER)
2820 vertRK1Filter(dstBlock, stride, QP);
2821 else if(mode & V_X1_FILTER)
2822 vertX1Filter(dstBlock, stride, QP);
2823 else if(mode & V_DEBLOCK)
2825 if( isVertDC(dstBlock, stride))
2827 if(isVertMinMaxOk(dstBlock, stride, QP))
2828 doVertLowPass(dstBlock, stride, QP);
2831 doVertDefFilter(dstBlock, stride, QP);
2840 /* check if we have a previous block to deblock it with dstBlock */
2846 if(mode & H_X1_FILTER)
2847 horizX1Filter(dstBlock-4, stride, QP);
2848 else if(mode & H_DEBLOCK)
2850 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
2852 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2853 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2856 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
2863 dering(dstBlock - 9 - stride, stride, QP);
2866 dering(dstBlock - stride*9 + width-9, stride, QP);
2867 //FIXME dering filter will not be applied to last block (bottom right)
2869 #ifdef PP_FUNNY_STRIDE
2870 /* did we use a tmp-block buffer */
2874 dstBlock= dstBlockPtrBackup;
2875 srcBlock= srcBlockPtrBackup;
2877 for(i=0;i<BLOCK_SIZE*2; i++)
2879 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
2888 /* did we use a tmp buffer */
2891 uint8_t *dstBlock= &(dst[y*dstStride]);
2892 memcpy(dstBlock, tempDst, dstStride*(height-y) );
2896 asm volatile("femms");
2897 #elif defined (HAVE_MMX)
2898 asm volatile("emms");
2902 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
2903 sumTime= rdtsc() - sumTime;
2905 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
2906 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2907 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)