2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec Ec
28 doHorizDefFilter E ac ac
30 Vertical RKAlgo1 E a a
33 LinIpolDeinterlace e E E*
34 CubicIpolDeinterlace a e e*
35 LinBlendDeinterlace e E E*
36 MedianDeinterlace Ec Ec
39 * I don't have a 3DNow! CPU -> it's untested
40 E = Exact implementation
41 e = almost exact implementation (slightly different rounding,...)
42 a = alternative / approximate impl
43 c = checked against the other implementations (-vo md5)
48 verify that everything works as it should (how?)
49 reduce the time wasted on the mem transfer
51 implement everything in C at least (done at the moment but ...)
52 unroll stuff if instructions depend too much on the prior one
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54 move YScale thing to the end instead of fixing QP
55 write a faster and higher quality deblocking filter :)
56 do something about the speed of the horizontal filters
57 make the mainloop more flexible (variable number of blocks at once
58 (the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters
61 fix warnings (unused vars, ...)
62 noise reduction filters
69 //Changelog: use the CVS log
74 #include "../config.h"
78 #include "postprocess.h"
// Small arithmetic helper macros.
// NOTE: every macro below expands its argument more than once, so arguments
// with side effects (i++, function calls) must not be passed to them.
// MIN(a,b): the smaller of a and b (yields a when they compare equal).
80 #define MIN(a,b) ((a) > (b) ? (b) : (a))
// MAX(a,b): the larger of a and b (yields a when they compare equal).
81 #define MAX(a,b) ((a) < (b) ? (b) : (a))
// ABS(a): absolute value of a.
82 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// SIGN(a): 1 for a > 0, otherwise -1 -- note that SIGN(0) is -1, not 0.
83 #define SIGN(a) ((a) > 0 ? 1 : -1)
86 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
87 #elif defined (HAVE_3DNOW)
88 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// 64-bit packed constants and scratch slots. The inline assembly in this file
// refers to several of them directly by symbol name (e.g. "movq b7E, %%mm7").

// Luma offset/scale values; their use is not visible in this part of the file.
// NOTE(review): presumably four 16-bit words each, with 0x0100 meaning a scale of 1.0 -- confirm.
91 static uint64_t packedYOffset= 0x0000000000000000LL;
92 static uint64_t packedYScale= 0x0100010001000100LL;
// wXX: the 16-bit value 0xXX replicated into all four words of the quadword
// (w20 is 0x0020 = 32, w1400 is 0x1400 = 5120).
93 static uint64_t w05= 0x0005000500050005LL;
94 static uint64_t w20= 0x0020002000200020LL;
95 static uint64_t w1400= 0x1400140014001400LL;
// bmDDDDDDDD: byte masks. Each digit of the name stands for one byte of the
// quadword, most significant byte first; a 1 digit means that byte is 0xFF,
// a 0 digit means it is 0x00.
96 static uint64_t bm00000001= 0x00000000000000FFLL;
97 static uint64_t bm00010000= 0x000000FF00000000LL;
98 static uint64_t bm00001000= 0x00000000FF000000LL;
99 static uint64_t bm10000000= 0xFF00000000000000LL;
100 static uint64_t bm10000001= 0xFF000000000000FFLL;
101 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
102 static uint64_t bm00000011= 0x000000000000FFFFLL;
103 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
104 static uint64_t bm11000000= 0xFFFF000000000000LL;
105 static uint64_t bm00011000= 0x000000FFFF000000LL;
106 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
107 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
// bXX: the byte value 0xXX replicated into all eight bytes of the quadword.
108 static uint64_t b00= 0x0000000000000000LL;
109 static uint64_t b01= 0x0101010101010101LL;
110 static uint64_t b02= 0x0202020202020202LL;
111 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
112 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
113 static uint64_t b20= 0x2020202020202020LL;
114 static uint64_t b80= 0x8080808080808080LL;
115 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
116 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
117 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
// Scratch quadwords. temp0..temp3 are used by the doVertDefFilter assembly to
// park intermediate MMX register contents; no use of temp4/temp5 is visible here.
118 static uint64_t temp0=0;
119 static uint64_t temp1=0;
120 static uint64_t temp2=0;
121 static uint64_t temp3=0;
122 static uint64_t temp4=0;
123 static uint64_t temp5=0;
// Loaded by the filters' assembly as "QP,..., QP" (the quantizer in every byte).
// It is initialised to 0 here; the code that fills it in is not visible in this part of the file.
124 static uint64_t pQPb=0;
125 static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data
// Flatness thresholds (both evaluate to 40).
// isVertDC() reports a block as flat when its count of near-equal vertically
// adjacent pixel pairs exceeds vFlatnessThreshold.
// NOTE(review): 56 presumably is the maximum count (7 row pairs x 8 columns) -- confirm against BLOCK_SIZE.
// The use of hFlatnessThreshold is not visible in this part of the file.
127 int hFlatnessThreshold= 56 - 16;
128 int vFlatnessThreshold= 56 - 16;
130 //amount of "black" you are willing to lose to get a brightness corrected picture
131 double maxClippedThreshold= 0.01;
134 //FIXME can never make a movie's black brighter (anyone needs that?)
138 static inline long long rdtsc()
141 asm volatile( "rdtsc\n\t"
144 // printf("%d\n", int(l/1000));
150 static inline void prefetchnta(void *p)
152 asm volatile( "prefetchnta (%0)\n\t"
157 static inline void prefetcht0(void *p)
159 asm volatile( "prefetcht0 (%0)\n\t"
164 static inline void prefetcht1(void *p)
166 asm volatile( "prefetcht1 (%0)\n\t"
171 static inline void prefetcht2(void *p)
173 asm volatile( "prefetcht2 (%0)\n\t"
179 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
181 * Check if the middle 8x8 Block in the given 8x16 block is flat
183 static inline int isVertDC(uint8_t src[], int stride){
186 src+= stride*4; // src points to begin of the 8x8 Block
190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
192 "movq (%1), %%mm0 \n\t"
194 "movq (%1), %%mm1 \n\t"
195 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
196 "paddb %%mm7, %%mm0 \n\t"
197 "pcmpgtb %%mm6, %%mm0 \n\t"
200 "movq (%1), %%mm2 \n\t"
201 "psubb %%mm2, %%mm1 \n\t"
202 "paddb %%mm7, %%mm1 \n\t"
203 "pcmpgtb %%mm6, %%mm1 \n\t"
204 "paddb %%mm1, %%mm0 \n\t"
207 "movq (%1), %%mm1 \n\t"
208 "psubb %%mm1, %%mm2 \n\t"
209 "paddb %%mm7, %%mm2 \n\t"
210 "pcmpgtb %%mm6, %%mm2 \n\t"
211 "paddb %%mm2, %%mm0 \n\t"
214 "movq (%1), %%mm2 \n\t"
215 "psubb %%mm2, %%mm1 \n\t"
216 "paddb %%mm7, %%mm1 \n\t"
217 "pcmpgtb %%mm6, %%mm1 \n\t"
218 "paddb %%mm1, %%mm0 \n\t"
221 "movq (%1), %%mm1 \n\t"
222 "psubb %%mm1, %%mm2 \n\t"
223 "paddb %%mm7, %%mm2 \n\t"
224 "pcmpgtb %%mm6, %%mm2 \n\t"
225 "paddb %%mm2, %%mm0 \n\t"
228 "movq (%1), %%mm2 \n\t"
229 "psubb %%mm2, %%mm1 \n\t"
230 "paddb %%mm7, %%mm1 \n\t"
231 "pcmpgtb %%mm6, %%mm1 \n\t"
232 "paddb %%mm1, %%mm0 \n\t"
235 "movq (%1), %%mm1 \n\t"
236 "psubb %%mm1, %%mm2 \n\t"
237 "paddb %%mm7, %%mm2 \n\t"
238 "pcmpgtb %%mm6, %%mm2 \n\t"
239 "paddb %%mm2, %%mm0 \n\t"
242 "movq %%mm0, %%mm1 \n\t"
243 "psrlw $8, %%mm0 \n\t"
244 "paddb %%mm1, %%mm0 \n\t"
245 "movq %%mm0, %%mm1 \n\t"
246 "psrlq $16, %%mm0 \n\t"
247 "paddb %%mm1, %%mm0 \n\t"
248 "movq %%mm0, %%mm1 \n\t"
249 "psrlq $32, %%mm0 \n\t"
250 "paddb %%mm1, %%mm0 \n\t"
252 "movd %%mm0, %0 \n\t"
254 : "r" (src), "r" (stride)
256 // printf("%d\n", numEq);
257 numEq= (256 - (numEq & 0xFF)) &0xFF;
261 // uint8_t *temp= src;
264 for(y=0; y<BLOCK_SIZE-1; y++)
266 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
267 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
268 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
269 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
270 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
271 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
272 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
273 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
277 /* if(abs(numEq - asmEq) > 0)
279 printf("\nasm:%d c:%d\n", asmEq, numEq);
280 for(int y=0; y<8; y++)
282 for(int x=0; x<8; x++)
284 printf("%d ", temp[x + y*stride]);
290 // for(int i=0; i<numEq/8; i++) src[i]=255;
291 return (numEq > vFlatnessThreshold) ? 1 : 0;
294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
301 "movq (%1, %2), %%mm0 \n\t"
302 "movq (%1, %2, 8), %%mm1 \n\t"
303 "movq %%mm0, %%mm2 \n\t"
304 "psubusb %%mm1, %%mm0 \n\t"
305 "psubusb %%mm2, %%mm1 \n\t"
306 "por %%mm1, %%mm0 \n\t" // ABS Diff
308 "movq pQPb, %%mm7 \n\t" // QP,..., QP
309 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
310 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
311 "pcmpeqd b00, %%mm0 \n\t"
312 "psrlq $16, %%mm0 \n\t"
313 "pcmpeqd bFF, %%mm0 \n\t"
314 // "movd %%mm0, (%1, %2, 4)\n\t"
315 "movd %%mm0, %0 \n\t"
317 : "r" (src), "r" (stride)
325 for(x=0; x<BLOCK_SIZE; x++)
327 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
329 /* if(isOk && !isOk2 || !isOk && isOk2)
331 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
332 for(int y=0; y<9; y++)
334 for(int x=0; x<8; x++)
336 printf("%d ", src[x + y*stride]);
348 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
349 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
351 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
355 asm volatile( //"movv %0 %1 %2\n\t"
357 "movq pQPb, %%mm0 \n\t" // QP,..., QP
359 "movq (%0), %%mm6 \n\t"
360 "movq (%0, %1), %%mm5 \n\t"
361 "movq %%mm5, %%mm1 \n\t"
362 "movq %%mm6, %%mm2 \n\t"
363 "psubusb %%mm6, %%mm5 \n\t"
364 "psubusb %%mm1, %%mm2 \n\t"
365 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
366 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
367 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
369 "pand %%mm2, %%mm6 \n\t"
370 "pandn %%mm1, %%mm2 \n\t"
371 "por %%mm2, %%mm6 \n\t"// First Line to Filter
373 "movq (%0, %1, 8), %%mm5 \n\t"
374 "leal (%0, %1, 4), %%eax \n\t"
375 "leal (%0, %1, 8), %%ebx \n\t"
376 "subl %1, %%ebx \n\t"
377 "addl %1, %0 \n\t" // %0 points to line 1 not 0
378 "movq (%0, %1, 8), %%mm7 \n\t"
379 "movq %%mm5, %%mm1 \n\t"
380 "movq %%mm7, %%mm2 \n\t"
381 "psubusb %%mm7, %%mm5 \n\t"
382 "psubusb %%mm1, %%mm2 \n\t"
383 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
384 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
385 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
387 "pand %%mm2, %%mm7 \n\t"
388 "pandn %%mm1, %%mm2 \n\t"
389 "por %%mm2, %%mm7 \n\t" // First Line to Filter
393 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
398 "movq (%0, %1), %%mm0 \n\t" // 1
399 "movq %%mm0, %%mm1 \n\t" // 1
400 PAVGB(%%mm6, %%mm0) //1 1 /2
401 PAVGB(%%mm6, %%mm0) //3 1 /4
403 "movq (%0, %1, 4), %%mm2 \n\t" // 1
404 "movq %%mm2, %%mm5 \n\t" // 1
405 PAVGB((%%eax), %%mm2) // 11 /2
406 PAVGB((%0, %1, 2), %%mm2) // 211 /4
407 "movq %%mm2, %%mm3 \n\t" // 211 /4
408 "movq (%0), %%mm4 \n\t" // 1
409 PAVGB(%%mm4, %%mm3) // 4 211 /8
410 PAVGB(%%mm0, %%mm3) //642211 /16
411 "movq %%mm3, (%0) \n\t" // X
412 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
413 "movq %%mm1, %%mm0 \n\t" // 1
414 PAVGB(%%mm6, %%mm0) //1 1 /2
415 "movq %%mm4, %%mm3 \n\t" // 1
416 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
417 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
418 PAVGB((%%eax), %%mm5) // 211 /4
419 PAVGB(%%mm5, %%mm3) // 2 2211 /8
420 PAVGB(%%mm0, %%mm3) //4242211 /16
421 "movq %%mm3, (%0,%1) \n\t" // X
422 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
423 PAVGB(%%mm4, %%mm6) //11 /2
424 "movq (%%ebx), %%mm0 \n\t" // 1
425 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
426 "movq %%mm0, %%mm3 \n\t" // 11/2
427 PAVGB(%%mm1, %%mm0) // 2 11/4
428 PAVGB(%%mm6, %%mm0) //222 11/8
429 PAVGB(%%mm2, %%mm0) //22242211/16
430 "movq (%0, %1, 2), %%mm2 \n\t" // 1
431 "movq %%mm0, (%0, %1, 2) \n\t" // X
432 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
433 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
434 PAVGB((%%ebx), %%mm0) // 11 /2
435 PAVGB(%%mm0, %%mm6) //11 11 /4
436 PAVGB(%%mm1, %%mm4) // 11 /2
437 PAVGB(%%mm2, %%mm1) // 11 /2
438 PAVGB(%%mm1, %%mm6) //1122 11 /8
439 PAVGB(%%mm5, %%mm6) //112242211 /16
440 "movq (%%eax), %%mm5 \n\t" // 1
441 "movq %%mm6, (%%eax) \n\t" // X
442 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
443 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
444 PAVGB(%%mm7, %%mm6) // 11 /2
445 PAVGB(%%mm4, %%mm6) // 11 11 /4
446 PAVGB(%%mm3, %%mm6) // 11 2211 /8
447 PAVGB(%%mm5, %%mm2) // 11 /2
448 "movq (%0, %1, 4), %%mm4 \n\t" // 1
449 PAVGB(%%mm4, %%mm2) // 112 /4
450 PAVGB(%%mm2, %%mm6) // 112242211 /16
451 "movq %%mm6, (%0, %1, 4) \n\t" // X
452 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
453 PAVGB(%%mm7, %%mm1) // 11 2 /4
454 PAVGB(%%mm4, %%mm5) // 11 /2
455 PAVGB(%%mm5, %%mm0) // 11 11 /4
456 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
457 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
458 PAVGB(%%mm0, %%mm1) // 11224222 /16
459 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
460 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
461 PAVGB((%%ebx), %%mm2) // 112 4 /8
462 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
463 PAVGB(%%mm0, %%mm6) // 1 1 /2
464 PAVGB(%%mm7, %%mm6) // 1 12 /4
465 PAVGB(%%mm2, %%mm6) // 1122424 /4
466 "movq %%mm6, (%%ebx) \n\t" // X
467 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
468 PAVGB(%%mm7, %%mm5) // 11 2 /4
469 PAVGB(%%mm7, %%mm5) // 11 6 /8
471 PAVGB(%%mm3, %%mm0) // 112 /4
472 PAVGB(%%mm0, %%mm5) // 112246 /16
473 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
477 : "r" (src), "r" (stride)
481 const int l1= stride;
482 const int l2= stride + l1;
483 const int l3= stride + l2;
484 const int l4= stride + l3;
485 const int l5= stride + l4;
486 const int l6= stride + l5;
487 const int l7= stride + l6;
488 const int l8= stride + l7;
489 const int l9= stride + l8;
492 for(x=0; x<BLOCK_SIZE; x++)
494 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
495 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
498 sums[0] = first + src[l1];
499 sums[1] = src[l1] + src[l2];
500 sums[2] = src[l2] + src[l3];
501 sums[3] = src[l3] + src[l4];
502 sums[4] = src[l4] + src[l5];
503 sums[5] = src[l5] + src[l6];
504 sums[6] = src[l6] + src[l7];
505 sums[7] = src[l7] + src[l8];
506 sums[8] = src[l8] + last;
508 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
509 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
510 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
511 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
512 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
513 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
514 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
515 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
524 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
525 * values are correctly clipped (MMX2)
526 * values are wraparound (C)
527 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
534 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
536 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
540 "pxor %%mm7, %%mm7 \n\t" // 0
541 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
542 "leal (%0, %1), %%eax \n\t"
543 "leal (%%eax, %1, 4), %%ebx \n\t"
544 // 0 1 2 3 4 5 6 7 8 9
545 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
546 "movq pQPb, %%mm0 \n\t" // QP,..., QP
547 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
548 "paddusb b02, %%mm0 \n\t"
549 "psrlw $2, %%mm0 \n\t"
550 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
551 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
552 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
553 "movq (%%ebx), %%mm3 \n\t" // line 5
554 "movq %%mm2, %%mm4 \n\t" // line 4
555 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
556 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
558 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
559 "psubusb %%mm3, %%mm4 \n\t"
560 "psubusb %%mm2, %%mm3 \n\t"
561 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
562 "psubusb %%mm0, %%mm4 \n\t"
563 "pcmpeqb %%mm7, %%mm4 \n\t"
564 "pand %%mm4, %%mm5 \n\t" // d/2
566 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
567 "paddb %%mm5, %%mm2 \n\t"
568 // "psubb %%mm6, %%mm2 \n\t"
569 "movq %%mm2, (%0,%1, 4) \n\t"
571 "movq (%%ebx), %%mm2 \n\t"
572 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
573 "psubb %%mm5, %%mm2 \n\t"
574 // "psubb %%mm6, %%mm2 \n\t"
575 "movq %%mm2, (%%ebx) \n\t"
577 "paddb %%mm6, %%mm5 \n\t"
578 "psrlw $2, %%mm5 \n\t"
579 "pand b3F, %%mm5 \n\t"
580 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
582 "movq (%%eax, %1, 2), %%mm2 \n\t"
583 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
584 "paddsb %%mm5, %%mm2 \n\t"
585 "psubb %%mm6, %%mm2 \n\t"
586 "movq %%mm2, (%%eax, %1, 2) \n\t"
588 "movq (%%ebx, %1), %%mm2 \n\t"
589 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
590 "psubsb %%mm5, %%mm2 \n\t"
591 "psubb %%mm6, %%mm2 \n\t"
592 "movq %%mm2, (%%ebx, %1) \n\t"
595 : "r" (src), "r" (stride)
599 const int l1= stride;
600 const int l2= stride + l1;
601 const int l3= stride + l2;
602 const int l4= stride + l3;
603 const int l5= stride + l4;
604 const int l6= stride + l5;
605 const int l7= stride + l6;
606 const int l8= stride + l7;
607 const int l9= stride + l8;
610 for(x=0; x<BLOCK_SIZE; x++)
612 if(ABS(src[l4]-src[l5]) < QP + QP/4)
614 int v = (src[l5] - src[l4]);
629 * Experimental Filter 1
630 * will not damage linear gradients
631 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
632 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
633 * MMX2 version does correct clipping, C version doesn't
635 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
637 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
641 "pxor %%mm7, %%mm7 \n\t" // 0
642 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
643 "leal (%0, %1), %%eax \n\t"
644 "leal (%%eax, %1, 4), %%ebx \n\t"
645 // 0 1 2 3 4 5 6 7 8 9
646 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
647 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
648 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
649 "movq %%mm1, %%mm2 \n\t" // line 4
650 "psubusb %%mm0, %%mm1 \n\t"
651 "psubusb %%mm2, %%mm0 \n\t"
652 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
653 "movq (%%ebx), %%mm3 \n\t" // line 5
654 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
655 "movq %%mm3, %%mm5 \n\t" // line 5
656 "psubusb %%mm4, %%mm3 \n\t"
657 "psubusb %%mm5, %%mm4 \n\t"
658 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
659 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
660 "movq %%mm2, %%mm1 \n\t" // line 4
661 "psubusb %%mm5, %%mm2 \n\t"
662 "movq %%mm2, %%mm4 \n\t"
663 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
664 "psubusb %%mm1, %%mm5 \n\t"
665 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
666 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
667 "movq %%mm4, %%mm3 \n\t" // d
668 "psubusb pQPb, %%mm4 \n\t"
669 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
670 "psubusb b01, %%mm3 \n\t"
671 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
673 PAVGB(%%mm7, %%mm3) // d/2
674 "movq %%mm3, %%mm1 \n\t" // d/2
675 PAVGB(%%mm7, %%mm3) // d/4
676 PAVGB(%%mm1, %%mm3) // 3*d/8
678 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
679 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
680 "psubusb %%mm3, %%mm0 \n\t"
681 "pxor %%mm2, %%mm0 \n\t"
682 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
684 "movq (%%ebx), %%mm0 \n\t" // line 5
685 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
686 "paddusb %%mm3, %%mm0 \n\t"
687 "pxor %%mm2, %%mm0 \n\t"
688 "movq %%mm0, (%%ebx) \n\t" // line 5
690 PAVGB(%%mm7, %%mm1) // d/4
692 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
693 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
694 "psubusb %%mm1, %%mm0 \n\t"
695 "pxor %%mm2, %%mm0 \n\t"
696 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
698 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
699 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
700 "paddusb %%mm1, %%mm0 \n\t"
701 "pxor %%mm2, %%mm0 \n\t"
702 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
704 PAVGB(%%mm7, %%mm1) // d/8
706 "movq (%%eax, %1), %%mm0 \n\t" // line 2
707 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
708 "psubusb %%mm1, %%mm0 \n\t"
709 "pxor %%mm2, %%mm0 \n\t"
710 "movq %%mm0, (%%eax, %1) \n\t" // line 2
712 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
713 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
714 "paddusb %%mm1, %%mm0 \n\t"
715 "pxor %%mm2, %%mm0 \n\t"
716 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
719 : "r" (src), "r" (stride)
724 const int l1= stride;
725 const int l2= stride + l1;
726 const int l3= stride + l2;
727 const int l4= stride + l3;
728 const int l5= stride + l4;
729 const int l6= stride + l5;
730 const int l7= stride + l6;
731 const int l8= stride + l7;
732 const int l9= stride + l8;
736 for(x=0; x<BLOCK_SIZE; x++)
738 int a= src[l3] - src[l4];
739 int b= src[l4] - src[l5];
740 int c= src[l5] - src[l6];
742 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
746 int v = d * SIGN(-b);
759 const int l1= stride;
760 const int l2= stride + l1;
761 const int l3= stride + l2;
762 const int l4= stride + l3;
763 const int l5= stride + l4;
764 const int l6= stride + l5;
765 const int l7= stride + l6;
766 const int l8= stride + l7;
767 const int l9= stride + l8;
768 for(int x=0; x<BLOCK_SIZE; x++)
777 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
779 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
780 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
781 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
782 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
791 * Experimental Filter 1 (Horizontal)
792 * will not damage linear gradients
793 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
794 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
795 * MMX2 version does correct clipping, C version doesn't
796 * not identical with the vertical one
798 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
// Lazily built lookup table of 256 quadwords. The assembly further down indexes
// it with a byte value scaled by 8 and adds the selected entry to eight
// horizontally adjacent pixels with signed saturation ("paddsb (%2, %%ecx, 8), %%mm0").
// NOTE(review): the lines that guard this initialisation, the loop over i, and
// the preprocessor lines that select one of the two a..d variants below are not
// visible in this listing; they are left untouched.
801 static uint64_t *lut= NULL;
// 256 entries of 8 bytes each, 8-byte aligned so each entry can be used as an MMX memory operand.
805 lut= (uint64_t*)memalign(8, 256*8);
// Treat the index i as a signed byte and double it.
808 int v= i < 128 ? 2*i : 2*(i-256);
810 //Simulate 112242211 9-Tap filter
811 uint64_t a= (v/16) & 0xFF;
812 uint64_t b= (v/8) & 0xFF;
813 uint64_t c= (v/4) & 0xFF;
814 uint64_t d= (3*v/8) & 0xFF;
816 //Simulate piecewise linear interpolation
817 uint64_t a= (v/16) & 0xFF;
818 uint64_t b= (v*3/16) & 0xFF;
819 uint64_t c= (v*5/16) & 0xFF;
820 uint64_t d= (7*v/16) & 0xFF;
// A..D are the byte-wise negations (mod 256) of a..d.
821 uint64_t A= (0x100 - a)&0xFF;
822 uint64_t B= (0x100 - b)&0xFF;
823 uint64_t C= (0x100 - c)&0xFF;
// Fix: D must negate d. It was computed from c, which made D a duplicate of C
// and left the ramp packed below (a b c d | D C B A) asymmetric.
824 uint64_t D= (0x100 - d)&0xFF;
// Pack the corrections: a,b,c,d in the four high bytes, their negations
// D,C,B,A in the four low bytes (mirror image).
826 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
827 (D<<24) | (C<<16) | (B<<8) | (A);
828 //lut[i] = (v<<32) | (v<<24);
832 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
834 "pxor %%mm7, %%mm7 \n\t" // 0
835 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
836 "leal (%0, %1), %%eax \n\t"
837 "leal (%%eax, %1, 4), %%ebx \n\t"
839 "movq b80, %%mm6 \n\t"
840 "movd pQPb, %%mm5 \n\t" // QP
841 "movq %%mm5, %%mm4 \n\t"
842 "paddusb %%mm5, %%mm5 \n\t" // 2QP
843 "paddusb %%mm5, %%mm4 \n\t" // 3QP
844 "pxor %%mm5, %%mm5 \n\t" // 0
845 "psubb %%mm4, %%mm5 \n\t" // -3QP
846 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
847 "psllq $24, %%mm5 \n\t"
849 // 0 1 2 3 4 5 6 7 8 9
850 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
853 "movd " #a ", %%mm0 \n\t"\
854 "movd 4" #a ", %%mm1 \n\t"\
855 "punpckldq %%mm1, %%mm0 \n\t"\
856 "movq %%mm0, %%mm1 \n\t"\
857 "movq %%mm0, %%mm2 \n\t"\
858 "psrlq $8, %%mm1 \n\t"\
859 "psubusb %%mm1, %%mm2 \n\t"\
860 "psubusb %%mm0, %%mm1 \n\t"\
861 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
862 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
863 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
864 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
865 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
866 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
867 "paddb %%mm5, %%mm1 \n\t"\
868 "psubusb %%mm5, %%mm1 \n\t"\
870 "pxor %%mm2, %%mm1 \n\t"\
871 "psubb %%mm2, %%mm1 \n\t"\
872 "psrlq $24, %%mm1 \n\t"\
873 "movd %%mm1, %%ecx \n\t"\
874 "paddb %%mm6, %%mm0 \n\t"\
875 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
876 "paddb %%mm6, %%mm0 \n\t"\
877 "movq %%mm0, " #a " \n\t"\
883 HX1old((%%eax, %1, 2))
887 HX1old((%%ebx, %1, 2))
890 //FIXME add some comments, its unreadable ...
891 #define HX1b(a, c, b, d) \
892 "movd " #a ", %%mm0 \n\t"\
893 "movd 4" #a ", %%mm1 \n\t"\
894 "punpckldq %%mm1, %%mm0 \n\t"\
895 "movd " #b ", %%mm4 \n\t"\
896 "movq %%mm0, %%mm1 \n\t"\
897 "movq %%mm0, %%mm2 \n\t"\
898 "psrlq $8, %%mm1 \n\t"\
899 "movd 4" #b ", %%mm3 \n\t"\
900 "psubusb %%mm1, %%mm2 \n\t"\
901 "psubusb %%mm0, %%mm1 \n\t"\
902 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
903 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
904 "punpckldq %%mm3, %%mm4 \n\t"\
905 "movq %%mm1, %%mm3 \n\t"\
906 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
907 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
908 "paddb %%mm6, %%mm0 \n\t"\
909 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
910 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
911 "movq %%mm4, %%mm3 \n\t"\
912 "paddb %%mm5, %%mm1 \n\t"\
913 "psubusb %%mm5, %%mm1 \n\t"\
914 "psrlq $8, %%mm3 \n\t"\
916 "pxor %%mm2, %%mm1 \n\t"\
917 "psubb %%mm2, %%mm1 \n\t"\
918 "movq %%mm4, %%mm2 \n\t"\
919 "psrlq $24, %%mm1 \n\t"\
920 "psubusb %%mm3, %%mm2 \n\t"\
921 "movd %%mm1, %%ecx \n\t"\
922 "psubusb %%mm4, %%mm3 \n\t"\
923 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
924 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
925 "paddb %%mm6, %%mm0 \n\t"\
926 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
927 "movq %%mm3, %%mm1 \n\t"\
928 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
929 "movq %%mm0, " #a " \n\t"\
930 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
931 "paddb %%mm6, %%mm4 \n\t"\
932 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
933 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
934 "paddb %%mm5, %%mm3 \n\t"\
935 "psubusb %%mm5, %%mm3 \n\t"\
937 "pxor %%mm2, %%mm3 \n\t"\
938 "psubb %%mm2, %%mm3 \n\t"\
939 "psrlq $24, %%mm3 \n\t"\
940 "movd " #c ", %%mm0 \n\t"\
941 "movd 4" #c ", %%mm1 \n\t"\
942 "punpckldq %%mm1, %%mm0 \n\t"\
943 "paddb %%mm6, %%mm0 \n\t"\
944 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
945 "paddb %%mm6, %%mm0 \n\t"\
946 "movq %%mm0, " #c " \n\t"\
947 "movd %%mm3, %%ecx \n\t"\
948 "movd " #d ", %%mm0 \n\t"\
949 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
950 "movd 4" #d ", %%mm1 \n\t"\
951 "paddb %%mm6, %%mm4 \n\t"\
952 "punpckldq %%mm1, %%mm0 \n\t"\
953 "movq %%mm4, " #b " \n\t"\
954 "paddb %%mm6, %%mm0 \n\t"\
955 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
956 "paddb %%mm6, %%mm0 \n\t"\
957 "movq %%mm0, " #d " \n\t"\
959 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
960 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
964 : "r" (src), "r" (stride), "r" (lut)
965 : "%eax", "%ebx", "%ecx"
969 //FIXME (has little in common with the mmx2 version)
970 for(y=0; y<BLOCK_SIZE; y++)
972 int a= src[1] - src[2];
973 int b= src[3] - src[4];
974 int c= src[5] - src[6];
976 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
980 int v = d * SIGN(-b);
996 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1000 //FIXME try pmul for *5 stuff
1003 "pxor %%mm7, %%mm7 \n\t"
1004 "leal (%0, %1), %%eax \n\t"
1005 "leal (%%eax, %1, 4), %%ebx \n\t"
1007 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1008 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1010 "movq (%0), %%mm0 \n\t"
1011 "movq %%mm0, %%mm1 \n\t"
1012 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1013 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1015 "movq (%%eax), %%mm2 \n\t"
1016 "movq %%mm2, %%mm3 \n\t"
1017 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1018 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1020 "movq (%%eax, %1), %%mm4 \n\t"
1021 "movq %%mm4, %%mm5 \n\t"
1022 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1023 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1025 "paddw %%mm0, %%mm0 \n\t" // 2L0
1026 "paddw %%mm1, %%mm1 \n\t" // 2H0
1027 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1028 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1029 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1030 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1032 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1033 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1034 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1035 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1037 "movq (%%eax, %1, 2), %%mm2 \n\t"
1038 "movq %%mm2, %%mm3 \n\t"
1039 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1040 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1042 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1043 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1044 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1045 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1046 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1047 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1049 "movq (%0, %1, 4), %%mm0 \n\t"
1050 "movq %%mm0, %%mm1 \n\t"
1051 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1052 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1054 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1055 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1056 "movq %%mm2, temp2 \n\t" // L3 - L4
1057 "movq %%mm3, temp3 \n\t" // H3 - H4
1058 "paddw %%mm4, %%mm4 \n\t" // 2L2
1059 "paddw %%mm5, %%mm5 \n\t" // 2H2
1060 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1061 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1063 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1064 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1065 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1066 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1068 "movq (%%ebx), %%mm2 \n\t"
1069 "movq %%mm2, %%mm3 \n\t"
1070 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1071 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1072 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1073 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1074 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1075 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1077 "movq (%%ebx, %1), %%mm6 \n\t"
1078 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1079 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1080 "movq (%%ebx, %1), %%mm6 \n\t"
1081 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1082 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1084 "paddw %%mm0, %%mm0 \n\t" // 2L4
1085 "paddw %%mm1, %%mm1 \n\t" // 2H4
1086 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1087 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1089 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1090 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1091 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1092 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1094 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1095 "movq %%mm2, %%mm3 \n\t"
1096 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1097 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1099 "paddw %%mm2, %%mm2 \n\t" // 2L7
1100 "paddw %%mm3, %%mm3 \n\t" // 2H7
1101 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1102 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1104 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1105 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1106 //FIXME pxor, psubw, pmax for abs
1107 "movq %%mm7, %%mm6 \n\t" // 0
1108 "pcmpgtw %%mm0, %%mm6 \n\t"
1109 "pxor %%mm6, %%mm0 \n\t"
1110 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1111 "movq %%mm7, %%mm6 \n\t" // 0
1112 "pcmpgtw %%mm1, %%mm6 \n\t"
1113 "pxor %%mm6, %%mm1 \n\t"
1114 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1116 "movq %%mm7, %%mm6 \n\t" // 0
1117 "pcmpgtw %%mm2, %%mm6 \n\t"
1118 "pxor %%mm6, %%mm2 \n\t"
1119 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1120 "movq %%mm7, %%mm6 \n\t" // 0
1121 "pcmpgtw %%mm3, %%mm6 \n\t"
1122 "pxor %%mm6, %%mm3 \n\t"
1123 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1126 "pminsw %%mm2, %%mm0 \n\t"
1127 "pminsw %%mm3, %%mm1 \n\t"
1129 "movq %%mm0, %%mm6 \n\t"
1130 "psubusw %%mm2, %%mm6 \n\t"
1131 "psubw %%mm6, %%mm0 \n\t"
1132 "movq %%mm1, %%mm6 \n\t"
1133 "psubusw %%mm3, %%mm6 \n\t"
1134 "psubw %%mm6, %%mm1 \n\t"
1137 "movq %%mm7, %%mm6 \n\t" // 0
1138 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1139 "pxor %%mm6, %%mm4 \n\t"
1140 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1141 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1142 "pxor %%mm7, %%mm5 \n\t"
1143 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1145 "movd %2, %%mm2 \n\t" // QP
1146 "punpcklwd %%mm2, %%mm2 \n\t"
1147 "punpcklwd %%mm2, %%mm2 \n\t"
1148 "psllw $3, %%mm2 \n\t" // 8QP
1149 "movq %%mm2, %%mm3 \n\t" // 8QP
1150 "pcmpgtw %%mm4, %%mm2 \n\t"
1151 "pcmpgtw %%mm5, %%mm3 \n\t"
1152 "pand %%mm2, %%mm4 \n\t"
1153 "pand %%mm3, %%mm5 \n\t"
1156 "psubusw %%mm0, %%mm4 \n\t" // hd
1157 "psubusw %%mm1, %%mm5 \n\t" // ld
1160 "movq w05, %%mm2 \n\t" // 5
1161 "pmullw %%mm2, %%mm4 \n\t"
1162 "pmullw %%mm2, %%mm5 \n\t"
1163 "movq w20, %%mm2 \n\t" // 32
1164 "paddw %%mm2, %%mm4 \n\t"
1165 "paddw %%mm2, %%mm5 \n\t"
1166 "psrlw $6, %%mm4 \n\t"
1167 "psrlw $6, %%mm5 \n\t"
1170 "movq w06, %%mm2 \n\t" // 6
1171 "paddw %%mm2, %%mm4 \n\t"
1172 "paddw %%mm2, %%mm5 \n\t"
1173 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1174 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1175 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1176 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1179 "movq temp2, %%mm0 \n\t" // L3 - L4
1180 "movq temp3, %%mm1 \n\t" // H3 - H4
1182 "pxor %%mm2, %%mm2 \n\t"
1183 "pxor %%mm3, %%mm3 \n\t"
1185 // FIXME rounding error
1186 "psraw $1, %%mm0 \n\t" // (L3 - L4)/2
1187 "psraw $1, %%mm1 \n\t" // (H3 - H4)/2
1188 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1189 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1190 "pxor %%mm2, %%mm0 \n\t"
1191 "pxor %%mm3, %%mm1 \n\t"
1192 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1193 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1194 // "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1195 // "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1197 "pxor %%mm6, %%mm2 \n\t"
1198 "pxor %%mm7, %%mm3 \n\t"
1199 "pand %%mm2, %%mm4 \n\t"
1200 "pand %%mm3, %%mm5 \n\t"
1203 "pminsw %%mm0, %%mm4 \n\t"
1204 "pminsw %%mm1, %%mm5 \n\t"
1206 "movq %%mm4, %%mm2 \n\t"
1207 "psubusw %%mm0, %%mm2 \n\t"
1208 "psubw %%mm2, %%mm4 \n\t"
1209 "movq %%mm5, %%mm2 \n\t"
1210 "psubusw %%mm1, %%mm2 \n\t"
1211 "psubw %%mm2, %%mm5 \n\t"
1213 "pxor %%mm6, %%mm4 \n\t"
1214 "pxor %%mm7, %%mm5 \n\t"
1215 "psubw %%mm6, %%mm4 \n\t"
1216 "psubw %%mm7, %%mm5 \n\t"
1217 "packsswb %%mm5, %%mm4 \n\t"
1218 "movq (%%eax, %1, 2), %%mm0 \n\t"
1219 "paddb %%mm4, %%mm0 \n\t"
1220 "movq %%mm0, (%%eax, %1, 2) \n\t"
1221 "movq (%0, %1, 4), %%mm0 \n\t"
1222 "psubb %%mm4, %%mm0 \n\t"
1223 "movq %%mm0, (%0, %1, 4) \n\t"
1226 : "r" (src), "r" (stride), "r" (QP)
1230 const int l1= stride;
1231 const int l2= stride + l1;
1232 const int l3= stride + l2;
1233 const int l4= stride + l3;
1234 const int l5= stride + l4;
1235 const int l6= stride + l5;
1236 const int l7= stride + l6;
1237 const int l8= stride + l7;
1238 // const int l9= stride + l8;
1241 for(x=0; x<BLOCK_SIZE; x++)
1243 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1244 if(ABS(middleEnergy) < 8*QP)
1246 const int q=(src[l4] - src[l5])/2;
1247 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1248 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1250 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1254 d*= SIGN(-middleEnergy);
1275 //FIXME? |255-0| = 1
1277 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
// Copies the block at src into tempBlock and tests whether it is mostly flat
// in the horizontal direction.
//   src    - first pixel of the block
//   stride - handed to the asm as an input operand; the row-advance code is
//            not visible in this listing
// Returns non-zero when the flatness count numEq exceeds hFlatnessThreshold.
1279 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
// NOTE(review): the constants are named b7E and b7C while the comments on the
// two loads below say 0x7F and 0x7D - confirm which values are really loaded.
1287 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1288 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
1289 "leal tempBlock, %%eax \n\t"
1290 "pxor %%mm0, %%mm0 \n\t"
1292 #define HDC_CHECK_AND_CPY(i) \
1293 "movq -4(%1), %%mm2 \n\t"\
1294 "psrlq $32, %%mm2 \n\t"\
1295 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
1296 "movq %%mm2, %%mm1 \n\t"\
1297 "psrlq $8, %%mm2 \n\t"\
1298 "psubb %%mm1, %%mm2 \n\t"\
1299 "paddb %%mm7, %%mm2 \n\t"\
1300 "pcmpgtb %%mm6, %%mm2 \n\t"\
1301 "paddb %%mm2, %%mm0 \n\t"\
1302 "movq %%mm1," #i "(%%eax) \n\t"
1304 HDC_CHECK_AND_CPY(0)
1306 HDC_CHECK_AND_CPY(8)
1308 HDC_CHECK_AND_CPY(16)
1310 HDC_CHECK_AND_CPY(24)
1312 HDC_CHECK_AND_CPY(32)
1314 HDC_CHECK_AND_CPY(40)
1316 HDC_CHECK_AND_CPY(48)
1318 HDC_CHECK_AND_CPY(56)
1320 "psllq $8, %%mm0 \n\t" // remove dummy value
1321 "movq %%mm0, %%mm1 \n\t"
1322 "psrlw $8, %%mm0 \n\t"
1323 "paddb %%mm1, %%mm0 \n\t"
1324 "movq %%mm0, %%mm1 \n\t"
1325 "psrlq $16, %%mm0 \n\t"
1326 "paddb %%mm1, %%mm0 \n\t"
1327 "movq %%mm0, %%mm1 \n\t"
1328 "psrlq $32, %%mm0 \n\t"
1329 "paddb %%mm1, %%mm0 \n\t"
1331 "movd %%mm0, %0 \n\t"
1333 : "r" (src), "r" (stride)
1336 // printf("%d\n", numEq);
// The asm adds up pcmpgtb masks (0xFF, i.e. -1, per hit) with paddb, so the low
// byte holds minus the hit count; negate it to get a positive count.
1337 numEq= (256 - (numEq & 0xFF)) &0xFF;
// C path: for each of the BLOCK_SIZE rows, count the neighbouring pixel pairs
// whose difference is -1, 0 or +1, then copy the 8 pixels of the row into
// tempBlock (row pitch TEMP_STRIDE).
1340 for(y=0; y<BLOCK_SIZE; y++)
1342 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1343 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1344 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1345 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1346 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1347 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1348 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1349 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1350 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1351 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1352 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1353 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1354 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1355 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1356 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1360 /* if(abs(numEq - asmEq) > 0)
1362 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1363 for(int y=0; y<8; y++)
1365 for(int x=0; x<8; x++)
1367 printf("%d ", src[x + y*stride]);
1373 // printf("%d\n", numEq);
1374 return numEq > hFlatnessThreshold;
// Range check for a block row: the C path returns 0 when the first and the
// last pixel of the row differ by more than 2*QP (the accepting return is not
// visible in this listing).
// NOTE(review): the asm path loads 8 bytes at src+stride and at src+8*stride
// (assuming %1 is src and %2 is stride) and compares their bytewise absolute
// difference against 2*pQPb. That is not the same test as the C path - confirm
// whether the asm branch is actually compiled in.
1377 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1384 "movq (%1, %2), %%mm0 \n\t"
1385 "movq (%1, %2, 8), %%mm1 \n\t"
1386 "movq %%mm0, %%mm2 \n\t"
1387 "psubusb %%mm1, %%mm0 \n\t"
1388 "psubusb %%mm2, %%mm1 \n\t"
1389 "por %%mm1, %%mm0 \n\t" // ABS Diff
1391 "movq pQPb, %%mm7 \n\t" // QP,..., QP
1392 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
1393 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
1394 "pcmpeqd b00, %%mm0 \n\t"
1395 "psrlq $16, %%mm0 \n\t"
1396 "pcmpeqd bFF, %%mm0 \n\t"
1397 // "movd %%mm0, (%1, %2, 4)\n\t"
1398 "movd %%mm0, %0 \n\t"
1400 : "r" (src), "r" (stride)
// C path: reject the row when |src[0] - src[7]| exceeds 2*QP.
1404 if(abs(src[0] - src[7]) > 2*QP) return 0;
// Default horizontal deblocking filter: works on the rows stored in tempBlock
// and writes the result to dst (the asm bodies store 8 bytes per row through
// operand %0, which is dst).
//   dst, stride - destination block and its line pitch
//   QP          - quantiser; the C path only filters where
//                 |middleEnergy| < 8*QP
// Two asm row bodies are visible: the first uses pshufw/pminub, the second
// avoids those two instructions and uses psllq/psubusb/psubb instead.
1410 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1415 "pxor %%mm7, %%mm7 \n\t"
1416 "movq bm00001000, %%mm6 \n\t"
1417 "movd %2, %%mm5 \n\t" // QP
1418 "movq %%mm5, %%mm4 \n\t"
1419 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1420 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1421 "psllq $24, %%mm4 \n\t"
1422 "pxor %%mm5, %%mm5 \n\t" // 0
1423 "psubb %%mm4, %%mm5 \n\t" // 0 - (3QP << 24), bytewise, i.e. -3QP in byte 3
1424 "leal tempBlock, %%eax \n\t"
1426 //FIXME? "unroll by 2" and mix
1429 "movq " #i "(%%eax), %%mm0 \n\t"\
1430 "movq %%mm0, %%mm1 \n\t"\
1431 "movq %%mm0, %%mm2 \n\t"\
1432 "psrlq $8, %%mm1 \n\t"\
1433 "psubusb %%mm1, %%mm2 \n\t"\
1434 "psubusb %%mm0, %%mm1 \n\t"\
1435 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1436 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1437 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
1438 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1439 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1440 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1441 "paddb %%mm5, %%mm1 \n\t"\
1442 "psubusb %%mm5, %%mm1 \n\t"\
1443 "psrlw $2, %%mm1 \n\t"\
1444 "pxor %%mm2, %%mm1 \n\t"\
1445 "psubb %%mm2, %%mm1 \n\t"\
1446 "pand %%mm6, %%mm1 \n\t"\
1447 "psubb %%mm1, %%mm0 \n\t"\
1448 "psllq $8, %%mm1 \n\t"\
1449 "paddb %%mm1, %%mm0 \n\t"\
1450 "movd %%mm0, (%0) \n\t"\
1451 "psrlq $32, %%mm0 \n\t"\
1452 "movd %%mm0, 4(%0) \n\t"
1455 "movq " #i "(%%eax), %%mm0 \n\t"\
1456 "movq %%mm0, %%mm1 \n\t"\
1457 "movq %%mm0, %%mm2 \n\t"\
1458 "psrlq $8, %%mm1 \n\t"\
1459 "psubusb %%mm1, %%mm2 \n\t"\
1460 "psubusb %%mm0, %%mm1 \n\t"\
1461 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1462 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1463 "movq %%mm1, %%mm3 \n\t"\
1464 "psllq $32, %%mm3 \n\t"\
1465 "movq %%mm3, %%mm4 \n\t"\
1466 "psubusb %%mm1, %%mm4 \n\t"\
1467 "psubb %%mm4, %%mm3 \n\t"\
1468 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1469 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1470 "paddb %%mm5, %%mm1 \n\t"\
1471 "psubusb %%mm5, %%mm1 \n\t"\
1472 "psrlw $2, %%mm1 \n\t"\
1473 "pxor %%mm2, %%mm1 \n\t"\
1474 "psubb %%mm2, %%mm1 \n\t"\
1475 "pand %%mm6, %%mm1 \n\t"\
1476 "psubb %%mm1, %%mm0 \n\t"\
1477 "psllq $8, %%mm1 \n\t"\
1478 "paddb %%mm1, %%mm0 \n\t"\
1479 "movd %%mm0, (%0) \n\t"\
1480 "psrlq $32, %%mm0 \n\t"\
1481 "movd %%mm0, 4(%0) \n\t"
1500 : "r" (dst), "r" (stride), "r" (QP)
// C path: filter each row of tempBlock.
1504 uint8_t *src= tempBlock;
1507 for(y=0; y<BLOCK_SIZE; y++)
// NOTE(review): leftEnergy and rightEnergy below follow the pattern
// 5*(s[k+2] - s[k+1]) + 2*(s[k] - s[k+3]) with k = 0 and k = 4. For k = 2 that
// gives 5*(src[4] - src[3]) + 2*(src[2] - src[5]); the "src[4] - src[5]" here
// looks like a typo for "src[4] - src[3]" - verify before relying on this path.
1509 const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]);
// Only filter when the step across the block boundary is small relative to QP.
1520 if(ABS(middleEnergy) < 8*QP)
1522 const int q=(src[3] - src[4])/2;
1523 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
1524 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
// d starts as the amount by which the middle energy exceeds the smaller of the
// two side energies, then takes the sign opposite to middleEnergy.
1526 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1530 d*= SIGN(-middleEnergy);
1553 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1554 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1555 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
// Horizontal low-pass filter: filters block rows and stores the result in dst.
//   dst    - destination block; the C path also reads dst[-1] and dst[8] as
//            outer neighbours
//   stride - line pitch of dst
//   QP     - C path: an outer neighbour is only used when it differs from the
//            adjacent border pixel by less than QP
// The asm part defines several alternative per-row macros (HLP1, HLP2, two
// HLP3 variants, an unnamed 1112111 body and NEW_HLP2); HLP(i) is mapped to
// NEW_HLP(i). The C path takes its neighbour sums from tempBlock.
1557 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1560 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1561 asm volatile( //"movv %0 %1 %2\n\t"
1563 "pxor %%mm7, %%mm7 \n\t"
1564 "leal tempBlock, %%eax \n\t"
1566 #define HLP1 "movq (%0), %%mm0 \n\t"\
1567 "movq %%mm0, %%mm1 \n\t"\
1568 "psllq $8, %%mm0 \n\t"\
1569 PAVGB(%%mm1, %%mm0)\
1570 "psrlw $8, %%mm0 \n\t"\
1571 "pxor %%mm1, %%mm1 \n\t"\
1572 "packuswb %%mm1, %%mm0 \n\t"\
1573 "movq %%mm0, %%mm1 \n\t"\
1574 "movq %%mm0, %%mm2 \n\t"\
1575 "psllq $32, %%mm0 \n\t"\
1576 "paddb %%mm0, %%mm1 \n\t"\
1577 "psllq $16, %%mm2 \n\t"\
1578 PAVGB(%%mm2, %%mm0)\
1579 "movq %%mm0, %%mm3 \n\t"\
1580 "pand bm11001100, %%mm0 \n\t"\
1581 "paddusb %%mm0, %%mm3 \n\t"\
1582 "psrlq $8, %%mm3 \n\t"\
1583 PAVGB(%%mm1, %%mm4)\
1584 PAVGB(%%mm3, %%mm2)\
1585 "psrlq $16, %%mm2 \n\t"\
1586 "punpcklbw %%mm2, %%mm2 \n\t"\
1587 "movq %%mm2, (%0) \n\t"\
1589 #define HLP2 "movq (%0), %%mm0 \n\t"\
1590 "movq %%mm0, %%mm1 \n\t"\
1591 "psllq $8, %%mm0 \n\t"\
1592 PAVGB(%%mm1, %%mm0)\
1593 "psrlw $8, %%mm0 \n\t"\
1594 "pxor %%mm1, %%mm1 \n\t"\
1595 "packuswb %%mm1, %%mm0 \n\t"\
1596 "movq %%mm0, %%mm2 \n\t"\
1597 "psllq $32, %%mm0 \n\t"\
1598 "psllq $16, %%mm2 \n\t"\
1599 PAVGB(%%mm2, %%mm0)\
1600 "movq %%mm0, %%mm3 \n\t"\
1601 "pand bm11001100, %%mm0 \n\t"\
1602 "paddusb %%mm0, %%mm3 \n\t"\
1603 "psrlq $8, %%mm3 \n\t"\
1604 PAVGB(%%mm3, %%mm2)\
1605 "psrlq $16, %%mm2 \n\t"\
1606 "punpcklbw %%mm2, %%mm2 \n\t"\
1607 "movq %%mm2, (%0) \n\t"\
1609 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1611 Implemented Exact 7-Tap
1624 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1625 "movq %%mm0, %%mm1 \n\t"\
1626 "movq %%mm0, %%mm2 \n\t"\
1627 "movq %%mm0, %%mm3 \n\t"\
1628 "movq %%mm0, %%mm4 \n\t"\
1629 "psllq $8, %%mm1 \n\t"\
1630 "psrlq $8, %%mm2 \n\t"\
1631 "pand bm00000001, %%mm3 \n\t"\
1632 "pand bm10000000, %%mm4 \n\t"\
1633 "por %%mm3, %%mm1 \n\t"\
1634 "por %%mm4, %%mm2 \n\t"\
1635 PAVGB(%%mm2, %%mm1)\
1636 PAVGB(%%mm1, %%mm0)\
1638 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1639 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1640 PAVGB(%%mm3, %%mm4)\
1641 PAVGB(%%mm4, %%mm0)\
1642 "movd %%mm0, (%0) \n\t"\
1643 "psrlq $32, %%mm0 \n\t"\
1644 "movd %%mm0, 4(%0) \n\t"
1646 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1647 "movq %%mm0, %%mm1 \n\t"\
1648 "movq %%mm0, %%mm2 \n\t"\
1649 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1650 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1651 "psllq $8, %%mm1 \n\t"\
1652 "psrlq $8, %%mm2 \n\t"\
1653 "psrlq $24, %%mm3 \n\t"\
1654 "psllq $56, %%mm4 \n\t"\
1655 "por %%mm3, %%mm1 \n\t"\
1656 "por %%mm4, %%mm2 \n\t"\
1657 PAVGB(%%mm2, %%mm1)\
1658 PAVGB(%%mm1, %%mm0)\
1660 "movq %%mm0, %%mm3 \n\t"\
1661 "movq %%mm0, %%mm4 \n\t"\
1662 "movq %%mm0, %%mm5 \n\t"\
1663 "psrlq $16, %%mm3 \n\t"\
1664 "psllq $16, %%mm4 \n\t"\
1665 "pand bm11000000, %%mm5 \n\t"\
1666 "por %%mm5, %%mm3 \n\t"\
1667 "movq %%mm0, %%mm5 \n\t"\
1668 "pand bm00000011, %%mm5 \n\t"\
1669 "por %%mm5, %%mm4 \n\t"\
1670 PAVGB(%%mm3, %%mm4)\
1671 PAVGB(%%mm4, %%mm0)\
1672 "movd %%mm0, (%0) \n\t"\
1673 "psrlq $32, %%mm0 \n\t"\
1674 "movd %%mm0, 4(%0) \n\t"
1677 /* uses the 7-Tap Filter: 1112111 */
1679 "movq " #i "(%%eax), %%mm0 \n\t"\
1680 "movq %%mm0, %%mm1 \n\t"\
1681 "movq %%mm0, %%mm2 \n\t"\
1682 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1683 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1684 "psllq $8, %%mm1 \n\t"\
1685 "psrlq $8, %%mm2 \n\t"\
1686 "psrlq $24, %%mm3 \n\t"\
1687 "psllq $56, %%mm4 \n\t"\
1688 "por %%mm3, %%mm1 \n\t"\
1689 "por %%mm4, %%mm2 \n\t"\
1690 "movq %%mm1, %%mm5 \n\t"\
1691 PAVGB(%%mm2, %%mm1)\
1692 PAVGB(%%mm1, %%mm0)\
1693 "psllq $8, %%mm5 \n\t"\
1694 "psrlq $8, %%mm2 \n\t"\
1695 "por %%mm3, %%mm5 \n\t"\
1696 "por %%mm4, %%mm2 \n\t"\
1697 "movq %%mm5, %%mm1 \n\t"\
1698 PAVGB(%%mm2, %%mm5)\
1699 "psllq $8, %%mm1 \n\t"\
1700 "psrlq $8, %%mm2 \n\t"\
1701 "por %%mm3, %%mm1 \n\t"\
1702 "por %%mm4, %%mm2 \n\t"\
1703 PAVGB(%%mm2, %%mm1)\
1704 PAVGB(%%mm1, %%mm5)\
1705 PAVGB(%%mm5, %%mm0)\
1706 "movd %%mm0, (%0) \n\t"\
1707 "psrlq $32, %%mm0 \n\t"\
1708 "movd %%mm0, 4(%0) \n\t"
1710 /* uses the 9-Tap Filter: 112242211 */
1711 #define NEW_HLP2(i)\
1712 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
1713 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
1714 "movq %%mm0, %%mm2 \n\t" /*0001000*/\
1715 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1716 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1717 "psllq $8, %%mm1 \n\t"\
1718 "psrlq $8, %%mm2 \n\t"\
1719 "psrlq $24, %%mm3 \n\t"\
1720 "psllq $56, %%mm4 \n\t"\
1721 "por %%mm3, %%mm1 \n\t" /*0010000*/\
1722 "por %%mm4, %%mm2 \n\t" /*0000100*/\
1723 "movq %%mm1, %%mm5 \n\t" /*0010000*/\
1724 PAVGB(%%mm2, %%mm1) /*0010100*/\
1725 PAVGB(%%mm1, %%mm0) /*0012100*/\
1726 "psllq $8, %%mm5 \n\t"\
1727 "psrlq $8, %%mm2 \n\t"\
1728 "por %%mm3, %%mm5 \n\t" /*0100000*/\
1729 "por %%mm4, %%mm2 \n\t" /*0000010*/\
1730 "movq %%mm5, %%mm1 \n\t" /*0100000*/\
1731 PAVGB(%%mm2, %%mm5) /*0100010*/\
1732 "psllq $8, %%mm1 \n\t"\
1733 "psrlq $8, %%mm2 \n\t"\
1734 "por %%mm3, %%mm1 \n\t" /*1000000*/\
1735 "por %%mm4, %%mm2 \n\t" /*0000001*/\
1736 "movq %%mm1, %%mm6 \n\t" /*1000000*/\
1737 PAVGB(%%mm2, %%mm1) /*1000001*/\
1738 "psllq $8, %%mm6 \n\t"\
1739 "psrlq $8, %%mm2 \n\t"\
1740 "por %%mm3, %%mm6 \n\t"/*100000000*/\
1741 "por %%mm4, %%mm2 \n\t"/*000000001*/\
1742 PAVGB(%%mm2, %%mm6) /*100000001*/\
1743 PAVGB(%%mm6, %%mm1) /*110000011*/\
1744 PAVGB(%%mm1, %%mm5) /*112000211*/\
1745 PAVGB(%%mm5, %%mm0) /*112242211*/\
1746 "movd %%mm0, (%0) \n\t"\
1747 "psrlq $32, %%mm0 \n\t"\
1748 "movd %%mm0, 4(%0) \n\t"
1750 #define HLP(i) NEW_HLP(i)
1770 : "r" (dst), "r" (stride)
// C path: 9-tap (1,1,2,2,4,2,2,1,1)/16 filter built from pair sums.
1775 uint8_t *temp= tempBlock;
1777 for(y=0; y<BLOCK_SIZE; y++)
// first/last: the pixel just outside the block (dst[-1] / dst[8]) when it is
// within QP of the border pixel, otherwise the border pixel itself.
1779 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1780 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// sums[k] is the sum of two horizontally adjacent samples, with first/last
// standing in at the two ends.
1783 sums[0] = first + temp[0];
1784 sums[1] = temp[0] + temp[1];
1785 sums[2] = temp[1] + temp[2];
1786 sums[3] = temp[2] + temp[3];
1787 sums[4] = temp[3] + temp[4];
1788 sums[5] = temp[4] + temp[5];
1789 sums[6] = temp[5] + temp[6];
1790 sums[7] = temp[6] + temp[7];
1791 sums[8] = temp[7] + last;
// Note on precedence: "+" binds tighter than "<<", so "a + b<<1" below means
// "(a + b)<<1". The "+ 8" rounds before the final ">>4".
1793 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1794 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1795 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1796 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1797 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
1798 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1799 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1800 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
// Deringing filter. Only the beginning of an asm implementation is visible in
// this listing: it gathers the bytewise minimum (mm6) and maximum (mm7) over
// the block rows, folds each register down to a single byte and averages the
// two with PAVGB, giving (max + min)/2.
// NOTE(review): as written this would not build - "pcmpeq" has no element-size
// suffix, the movq inside FIND_MIN_MAX has a stray comma after %%mm0, and
// FIND_MIN_MAX takes one parameter but is invoked with comma-separated
// operands. The last invocation also repeats (%%ebx, %1, 2); according to the
// row table below, row 9 would be (%%ebx, %1, 4). Presumably this code is
// compiled out - confirm before enabling it.
1809 static inline void dering(uint8_t src[], int stride, int QP)
1815 "leal (%0, %1), %%eax \n\t"
1816 "leal (%%eax, %1, 4), %%ebx \n\t"
1817 // 0 1 2 3 4 5 6 7 8 9
1818 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1820 "pcmpeq %%mm6, %%mm6 \n\t"
1821 "pxor %%mm7, %%mm7 \n\t"
1823 #define FIND_MIN_MAX(addr)\
1824 "movq (" #addr "), %%mm0, \n\t"\
1825 "pminub %%mm0, %%mm6 \n\t"\
1826 "pmaxub %%mm0, %%mm7 \n\t"
1830 FIND_MIN_MAX(%%eax, %1)
1831 FIND_MIN_MAX(%%eax, %1, 2)
1832 FIND_MIN_MAX(%0, %1, 4)
1834 FIND_MIN_MAX(%%ebx, %1)
1835 FIND_MIN_MAX(%%ebx, %1, 2)
1836 FIND_MIN_MAX(%0, %1, 8)
1837 FIND_MIN_MAX(%%ebx, %1, 2)
1839 "movq %%mm6, %%mm4 \n\t"
1840 "psrlq $32, %%mm6 \n\t"
1841 "pminub %%mm4, %%mm6 \n\t"
1842 "movq %%mm6, %%mm4 \n\t"
1843 "psrlq $16, %%mm6 \n\t"
1844 "pminub %%mm4, %%mm6 \n\t"
1845 "movq %%mm6, %%mm4 \n\t"
1846 "psrlq $8, %%mm6 \n\t"
1847 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1849 "movq %%mm7, %%mm4 \n\t"
1850 "psrlq $32, %%mm7 \n\t"
1851 "pmaxub %%mm4, %%mm7 \n\t"
1852 "movq %%mm7, %%mm4 \n\t"
1853 "psrlq $16, %%mm7 \n\t"
1854 "pmaxub %%mm4, %%mm7 \n\t"
1855 "movq %%mm7, %%mm4 \n\t"
1856 "psrlq $8, %%mm7 \n\t"
1857 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1858 PAVGB(%%mm6, %%mm7) // (max + min)/2
1861 : : "r" (src), "r" (stride), "r" (QP)
1871 * Deinterlaces the given block
1872 * will be called for every 8x8 block, and can read & write into an 8x16 block
// Linear-interpolating deinterlacer: rows 1, 3, 5 and 7 (relative to src) are
// replaced by the average of the rows directly above and below them. The
// visible code does not write the even rows. Reads rows 0..8.
//   src    - top-left pixel of the block
//   stride - distance in bytes between two rows
1874 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1876 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1878 "leal (%0, %1), %%eax \n\t"
1879 "leal (%%eax, %1, 4), %%ebx \n\t"
1880 // 0 1 2 3 4 5 6 7 8 9
1881 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1883 "movq (%0), %%mm0 \n\t"
1884 "movq (%%eax, %1), %%mm1 \n\t"
1886 "movq %%mm0, (%%eax) \n\t"
1887 "movq (%0, %1, 4), %%mm0 \n\t"
1889 "movq %%mm1, (%%eax, %1, 2) \n\t"
1890 "movq (%%ebx, %1), %%mm1 \n\t"
1892 "movq %%mm0, (%%ebx) \n\t"
1893 "movq (%0, %1, 8), %%mm0 \n\t"
1895 "movq %%mm1, (%%ebx, %1, 2) \n\t"
1897 : : "r" (src), "r" (stride)
// C path: odd row = (row above + row below) / 2, truncating.
1904 src[stride] = (src[0] + src[stride*2])>>1;
1905 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1906 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1907 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1914 * Deinterlaces the given block
1915 * will be called for every 8x8 block, and can read & write into an 8x16 block
1916 * no clipping in C version
// Cubic-interpolating deinterlacer: rows 3, 5, 7 and 9 (relative to src) are
// rebuilt from the rows at offsets -3, -1, +1 and +3 with the kernel
// (-1, 9, 9, -1)/16. Reads rows 0..12.
// The asm path saturates the result to a byte with packuswb; the C path stores
// the shifted sum without clamping.
//   src    - top-left pixel of the block
//   stride - distance in bytes between two rows
1918 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
1920 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1922 "leal (%0, %1), %%eax \n\t"
1923 "leal (%%eax, %1, 4), %%ebx \n\t"
1924 "leal (%%ebx, %1, 4), %%ecx \n\t"
1925 "addl %1, %%ecx \n\t"
1926 "pxor %%mm7, %%mm7 \n\t"
1927 // 0 1 2 3 4 5 6 7 8 9 10
1928 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
1930 #define DEINT_CUBIC(a,b,c,d,e)\
1931 "movq " #a ", %%mm0 \n\t"\
1932 "movq " #b ", %%mm1 \n\t"\
1933 "movq " #d ", %%mm2 \n\t"\
1934 "movq " #e ", %%mm3 \n\t"\
1935 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1936 PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1937 "movq %%mm0, %%mm2 \n\t"\
1938 "punpcklbw %%mm7, %%mm0 \n\t"\
1939 "punpckhbw %%mm7, %%mm2 \n\t"\
1940 "movq %%mm1, %%mm3 \n\t"\
1941 "punpcklbw %%mm7, %%mm1 \n\t"\
1942 "punpckhbw %%mm7, %%mm3 \n\t"\
1943 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1944 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1945 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1946 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1947 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1948 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1949 "packuswb %%mm3, %%mm1 \n\t"\
1950 "movq %%mm1, " #c " \n\t"
1952 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1953 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1954 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1955 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1957 : : "r" (src), "r" (stride)
1958 : "%eax", "%ebx", "ecx"
// C path: same kernel, one output row per statement.
1964 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1965 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1966 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1967 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1974 * Deinterlaces the given block
1975 * will be called for every 8x8 block, and can read & write into an 8x16 block
1976 * will shift the image up by 1 line (FIXME if this is a problem)
// Blending deinterlacer: row k becomes (row k + 2*row k+1 + row k+2)/4 for
// k = 0..7. Rows are processed top to bottom, so every output row is computed
// from rows that have not been overwritten yet. Reads rows 0..9.
//   src    - top-left pixel of the block
//   stride - distance in bytes between two rows
1978 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
1980 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1982 "leal (%0, %1), %%eax \n\t"
1983 "leal (%%eax, %1, 4), %%ebx \n\t"
1984 // 0 1 2 3 4 5 6 7 8 9
1985 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1987 "movq (%0), %%mm0 \n\t" // L0
1988 "movq (%%eax, %1), %%mm1 \n\t" // L2
1989 PAVGB(%%mm1, %%mm0) // L0+L2
1990 "movq (%%eax), %%mm2 \n\t" // L1
1992 "movq %%mm0, (%0) \n\t"
1993 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
1994 PAVGB(%%mm0, %%mm2) // L1+L3
1995 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1996 "movq %%mm2, (%%eax) \n\t"
1997 "movq (%0, %1, 4), %%mm2 \n\t" // L4
1998 PAVGB(%%mm2, %%mm1) // L2+L4
1999 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2000 "movq %%mm1, (%%eax, %1) \n\t"
2001 "movq (%%ebx), %%mm1 \n\t" // L5
2002 PAVGB(%%mm1, %%mm0) // L3+L5
2003 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2004 "movq %%mm0, (%%eax, %1, 2) \n\t"
2005 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2006 PAVGB(%%mm0, %%mm2) // L4+L6
2007 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2008 "movq %%mm2, (%0, %1, 4) \n\t"
2009 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2010 PAVGB(%%mm2, %%mm1) // L5+L7
2011 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2012 "movq %%mm1, (%%ebx) \n\t"
2013 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2014 PAVGB(%%mm1, %%mm0) // L6+L8
2015 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2016 "movq %%mm0, (%%ebx, %1) \n\t"
2017 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2018 PAVGB(%%mm0, %%mm2) // L7+L9
2019 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2020 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2023 : : "r" (src), "r" (stride)
// C path: (1,2,1)/4 vertical blend, truncating.
2030 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2031 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2032 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2033 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2034 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2035 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2036 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2037 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2044 * Deinterlaces the given block
2045 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
// Median deinterlacer: in the asm paths rows 1, 3, 5 and 7 (relative to src)
// are replaced by the bytewise median of the row itself and the rows directly
// above and below it.
//  - first asm body: median as min(max(a,b), max(min(a,b), c)) using
//    pmaxub/pminub
//  - second asm body ("MMX without MMX2"): MEDIAN(a,b,c) builds comparison
//    masks with psubusb/pcmpeqb, ORs out the two non-median inputs and ANDs
//    the three registers together; the result is stored to b
// NOTE(review): the C fallback at the end is the same (1,2,1)/4 blend as
// deInterlaceBlendLinear, not a median - confirm whether that is intended.
//   src    - top-left pixel of the block
//   stride - distance in bytes between two rows
2047 static inline void deInterlaceMedian(uint8_t src[], int stride)
2052 "leal (%0, %1), %%eax \n\t"
2053 "leal (%%eax, %1, 4), %%ebx \n\t"
2054 // 0 1 2 3 4 5 6 7 8 9
2055 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2057 "movq (%0), %%mm0 \n\t" //
2058 "movq (%%eax, %1), %%mm2 \n\t" //
2059 "movq (%%eax), %%mm1 \n\t" //
2060 "movq %%mm0, %%mm3 \n\t"
2061 "pmaxub %%mm1, %%mm0 \n\t" //
2062 "pminub %%mm3, %%mm1 \n\t" //
2063 "pmaxub %%mm2, %%mm1 \n\t" //
2064 "pminub %%mm1, %%mm0 \n\t"
2065 "movq %%mm0, (%%eax) \n\t"
2067 "movq (%0, %1, 4), %%mm0 \n\t" //
2068 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2069 "movq %%mm2, %%mm3 \n\t"
2070 "pmaxub %%mm1, %%mm2 \n\t" //
2071 "pminub %%mm3, %%mm1 \n\t" //
2072 "pmaxub %%mm0, %%mm1 \n\t" //
2073 "pminub %%mm1, %%mm2 \n\t"
2074 "movq %%mm2, (%%eax, %1, 2) \n\t"
2076 "movq (%%ebx), %%mm2 \n\t" //
2077 "movq (%%ebx, %1), %%mm1 \n\t" //
2078 "movq %%mm2, %%mm3 \n\t"
2079 "pmaxub %%mm0, %%mm2 \n\t" //
2080 "pminub %%mm3, %%mm0 \n\t" //
2081 "pmaxub %%mm1, %%mm0 \n\t" //
2082 "pminub %%mm0, %%mm2 \n\t"
2083 "movq %%mm2, (%%ebx) \n\t"
2085 "movq (%%ebx, %1, 2), %%mm2 \n\t" //
2086 "movq (%0, %1, 8), %%mm0 \n\t" //
2087 "movq %%mm2, %%mm3 \n\t"
2088 "pmaxub %%mm0, %%mm2 \n\t" //
2089 "pminub %%mm3, %%mm0 \n\t" //
2090 "pmaxub %%mm1, %%mm0 \n\t" //
2091 "pminub %%mm0, %%mm2 \n\t"
2092 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2095 : : "r" (src), "r" (stride)
2099 #else // MMX without MMX2
2101 "leal (%0, %1), %%eax \n\t"
2102 "leal (%%eax, %1, 4), %%ebx \n\t"
2103 // 0 1 2 3 4 5 6 7 8 9
2104 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2105 "pxor %%mm7, %%mm7 \n\t"
2107 #define MEDIAN(a,b,c)\
2108 "movq " #a ", %%mm0 \n\t"\
2109 "movq " #b ", %%mm2 \n\t"\
2110 "movq " #c ", %%mm1 \n\t"\
2111 "movq %%mm0, %%mm3 \n\t"\
2112 "movq %%mm1, %%mm4 \n\t"\
2113 "movq %%mm2, %%mm5 \n\t"\
2114 "psubusb %%mm1, %%mm3 \n\t"\
2115 "psubusb %%mm2, %%mm4 \n\t"\
2116 "psubusb %%mm0, %%mm5 \n\t"\
2117 "pcmpeqb %%mm7, %%mm3 \n\t"\
2118 "pcmpeqb %%mm7, %%mm4 \n\t"\
2119 "pcmpeqb %%mm7, %%mm5 \n\t"\
2120 "movq %%mm3, %%mm6 \n\t"\
2121 "pxor %%mm4, %%mm3 \n\t"\
2122 "pxor %%mm5, %%mm4 \n\t"\
2123 "pxor %%mm6, %%mm5 \n\t"\
2124 "por %%mm3, %%mm1 \n\t"\
2125 "por %%mm4, %%mm2 \n\t"\
2126 "por %%mm5, %%mm0 \n\t"\
2127 "pand %%mm2, %%mm0 \n\t"\
2128 "pand %%mm1, %%mm0 \n\t"\
2129 "movq %%mm0, " #b " \n\t"
2131 MEDIAN((%0), (%%eax), (%%eax, %1))
2132 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2133 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2134 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
2136 : : "r" (src), "r" (stride)
// C fallback: (1,2,1)/4 vertical blend of rows 0..9 into rows 0..7.
2145 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2146 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2147 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2148 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2149 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2150 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2151 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2152 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2158 #ifdef HAVE_ODIVX_POSTPROCESS
2159 #include "../opendivx/postprocess.h"
2163 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2164 QP_STORE_T QPs[], int QPStride, int isColor, int mode);
// Public entry point. Runs postProcess() on plane 0 with isColor = 0, halves
// horizontal_size and vertical_size, moves bits 4-7 of mode down to bits 0-3,
// and then either runs postProcess() on planes 1 and 2 with isColor = 1 or
// copies them with memcpy (the condition choosing between the two is not
// visible in this listing).
// With HAVE_ODIVX_POSTPROCESS the call can instead be forwarded to
// odivx_postprocess().
//   src, dst            - arrays of plane pointers (index 0, 1, 2)
//   src_stride          - line pitch of the source planes
//   dst_stride          - line pitch of the destination planes
//   horizontal_size,
//   vertical_size       - size of plane 0
//   QP_store, QP_stride - quantiser table and its pitch, passed through
2169 void postprocess(unsigned char * src[], int src_stride,
2170 unsigned char * dst[], int dst_stride,
2171 int horizontal_size, int vertical_size,
2172 QP_STORE_T *QP_store, int QP_stride,
2176 #ifdef HAVE_ODIVX_POSTPROCESS
2177 // Note: I could make this shit outside of this file, but it would mean one
2178 // more function call...
2180 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
// The two rdtsc snippets below look like timing experiments (a plain plane copy
// and a busy-wait); the preprocessor guards around them are not visible here.
// NOTE(review): "%4d" is used with a long long expression (T is long long).
2186 long long T= rdtsc();
2187 for(int y=vertical_size-1; y>=0 ; y--)
2188 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
2189 // memcpy(dst[0], src[0],src_stride*vertical_size);
2190 printf("%4dk\r", (rdtsc()-T)/1000);
2195 long long T= rdtsc();
2196 while( (rdtsc() - T)/1000 < 4000);
2200 postProcess(src[0], src_stride, dst[0], dst_stride,
2201 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
2203 horizontal_size >>= 1;
2204 vertical_size >>= 1;
// Keep bits 8 and up, replace the low byte by its upper nibble.
2207 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2211 postProcess(src[1], src_stride, dst[1], dst_stride,
2212 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2213 postProcess(src[2], src_stride, dst[2], dst_stride,
2214 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
// NOTE(review): the byte count is src_stride*horizontal_size; a whole-plane
// copy would normally be stride * number of rows (vertical_size), and dst is
// written using the source pitch - verify.
2218 memcpy(dst[1], src[1], src_stride*horizontal_size);
2219 memcpy(dst[2], src[2], src_stride*horizontal_size);
2224 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
// Maps a quality level to a set of post-processing mode flags by indexing a
// table of 1+GET_PP_QUALITY_MAX entries.
// Two alternative table contents are visible (horizontal filters first /
// vertical filters first); presumably one is selected by preprocessor lines
// that are missing from this listing. When HAVE_ODIVX_POSTPROCESS is defined
// and use_old_pp is set, the value comes from odivx_modes instead.
// NOTE(review): quality is used as an array index with no visible range check;
// callers presumably pass 0..GET_PP_QUALITY_MAX - confirm.
2227 int getPpModeForQuality(int quality){
2228 int modes[1+GET_PP_QUALITY_MAX]= {
2231 // horizontal filters first
2233 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2234 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2235 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2236 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2237 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2239 // vertical filters first
2241 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2242 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2243 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2244 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2245 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2249 #ifdef HAVE_ODIVX_POSTPROCESS
2250 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2253 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2254 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2255 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2256 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2257 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2259 if(use_old_pp) return odivx_modes[quality];
2261 return modes[quality];
2267 * Copies a block from src to dst and fixes the blacklevel
2268 * numLines must be a multiple of 4
2269 * levelFix == 0 -> don't touch the brightness & contrast
// Copies numLines rows of one block from src to dst, optionally rescaling the
// pixel values on the way.
//   dst, dstStride - destination and its line pitch
//   src, srcStride - source and its line pitch
//   numLines       - number of rows to copy
//   levelFix       - selects between the scaling and the plain copy variant
//                    (the branch itself is not visible in this listing)
// Both variants have an asm body and a C fallback; both C fallbacks are a
// plain memcpy of BLOCK_SIZE bytes per row, i.e. without any level scaling.
// In the asm, %0/%2 are used as read pointer and read pitch and %1/%3 as write
// pointer and write pitch; eax/ebx hold twice the pitch because every copy
// macro handles two rows.
2271 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2272 int numLines, int levelFix)
2279 "movl %4, %%eax \n\t"
2280 "movl %%eax, temp0\n\t"
2283 "leal (%2,%2), %%eax \n\t"
2284 "leal (%3,%3), %%ebx \n\t"
2285 "movq packedYOffset, %%mm2 \n\t"
2286 "movq packedYScale, %%mm3 \n\t"
2287 "pxor %%mm4, %%mm4 \n\t"
// SCALED_CPY, per row: subtract packedYOffset with unsigned saturation, widen
// the bytes to words, shift left by 7, take the high word of the product with
// packedYScale (pmulhw) and pack back to bytes with unsigned saturation.
2289 #define SCALED_CPY \
2290 "movq (%0), %%mm0 \n\t"\
2291 "movq (%0,%2), %%mm1 \n\t"\
2292 "psubusb %%mm2, %%mm0 \n\t"\
2293 "psubusb %%mm2, %%mm1 \n\t"\
2294 "movq %%mm0, %%mm5 \n\t"\
2295 "punpcklbw %%mm4, %%mm0 \n\t"\
2296 "punpckhbw %%mm4, %%mm5 \n\t"\
2297 "psllw $7, %%mm0 \n\t"\
2298 "psllw $7, %%mm5 \n\t"\
2299 "pmulhw %%mm3, %%mm0 \n\t"\
2300 "pmulhw %%mm3, %%mm5 \n\t"\
2301 "packuswb %%mm5, %%mm0 \n\t"\
2302 "movq %%mm0, (%1) \n\t"\
2303 "movq %%mm1, %%mm5 \n\t"\
2304 "punpcklbw %%mm4, %%mm1 \n\t"\
2305 "punpckhbw %%mm4, %%mm5 \n\t"\
2306 "psllw $7, %%mm1 \n\t"\
2307 "psllw $7, %%mm5 \n\t"\
2308 "pmulhw %%mm3, %%mm1 \n\t"\
2309 "pmulhw %%mm3, %%mm5 \n\t"\
2310 "packuswb %%mm5, %%mm1 \n\t"\
2311 "movq %%mm1, (%1, %3) \n\t"\
2315 "addl %%eax, %0 \n\t"
2316 "addl %%ebx, %1 \n\t"
2318 "addl %%eax, %0 \n\t"
2319 "addl %%ebx, %1 \n\t"
2333 for(i=0; i<numLines; i++)
2334 memcpy( &(dst[dstStride*i]),
2335 &(src[srcStride*i]), BLOCK_SIZE);
// Plain-copy variant. packedYOffset/packedYScale are still loaded here although
// the visible SIMPLE_CPY body does not use mm2/mm3.
2342 "movl %4, %%eax \n\t"
2343 "movl %%eax, temp0\n\t"
2346 "leal (%2,%2), %%eax \n\t"
2347 "leal (%3,%3), %%ebx \n\t"
2348 "movq packedYOffset, %%mm2 \n\t"
2349 "movq packedYScale, %%mm3 \n\t"
2351 #define SIMPLE_CPY \
2352 "movq (%0), %%mm0 \n\t"\
2353 "movq (%0,%2), %%mm1 \n\t"\
2354 "movq %%mm0, (%1) \n\t"\
2355 "movq %%mm1, (%1, %3) \n\t"\
2359 "addl %%eax, %0 \n\t"
2360 "addl %%ebx, %1 \n\t"
2362 "addl %%eax, %0 \n\t"
2363 "addl %%ebx, %1 \n\t"
2377 for(i=0; i<numLines; i++)
2378 memcpy( &(dst[dstStride*i]),
2379 &(src[srcStride*i]), BLOCK_SIZE);
2386 * Filters array of bytes (Y or U or V values)
2388 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2389 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2392 /* we need 64bit here, otherwise we are going to have a problem
2393 after watching a black picture for 5 hours */
2394 static uint64_t *yHistogram= NULL;
2395 int black=0, white=255; // blackest black and whitest white in the picture
2397 /* Temporary buffers for handling the last row(s) */
2398 static uint8_t *tempDst= NULL;
2399 static uint8_t *tempSrc= NULL;
2401 /* Temporary buffers for handling the last block */
2402 static uint8_t *tempDstBlock= NULL;
2403 static uint8_t *tempSrcBlock= NULL;
2405 uint8_t *dstBlockPtrBackup;
2406 uint8_t *srcBlockPtrBackup;
2409 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2415 tempDst= (uint8_t*)memalign(8, 1024*24);
2416 tempSrc= (uint8_t*)memalign(8, 1024*24);
2417 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2418 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2424 yHistogram= (uint64_t*)malloc(8*256);
2425 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2427 if(mode & FULL_Y_RANGE)
2438 static int framenum= -1;
2439 uint64_t maxClipped;
2444 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2446 for(i=0; i<256; i++)
2448 sum+= yHistogram[i];
2449 // printf("%d ", yHistogram[i]);
2453 /* we always get a completely black picture first */
2454 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2457 for(black=255; black>0; black--)
2459 if(clipped < maxClipped) break;
2460 clipped-= yHistogram[black];
2464 for(white=0; white<256; white++)
2466 if(clipped < maxClipped) break;
2467 clipped-= yHistogram[white];
2470 // we can't handle negative corrections
2471 packedYOffset= MAX(black - minAllowedY, 0);
2472 packedYOffset|= packedYOffset<<32;
2473 packedYOffset|= packedYOffset<<16;
2474 packedYOffset|= packedYOffset<<8;
2476 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2478 packedYScale= (uint16_t)(scale*512.0 + 0.5);
2479 packedYScale|= packedYScale<<32;
2480 packedYScale|= packedYScale<<16;
2484 packedYScale= 0x0100010001000100LL;
2488 /* copy first row of 8x8 blocks */
2489 for(x=0; x<width; x+=BLOCK_SIZE)
2490 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2492 for(y=0; y<height; y+=BLOCK_SIZE)
2494 //1% speedup if these are here instead of the inner loop
2495 uint8_t *srcBlock= &(src[y*srcStride]);
2496 uint8_t *dstBlock= &(dst[y*dstStride]);
2498 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards? if not,
2499 then use a temporary buffer */
2502 /* copy from line 5 to 12 of src, these will be copied with
2503 blockCopy to dst later */
2504 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2505 srcStride*MAX(height-y-5, 0) );
2507 /* duplicate last line to fill the void up to line 12 */
2511 for(i=height-y; i<=12; i++)
2512 memcpy(tempSrc + srcStride*i,
2513 src + srcStride*(height-1), srcStride);
2517 /* copy up to 5 lines of dst */
2518 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
2523 // From this point on it is guaranteed that we can read and write 16 lines downward
2524 // finish 1 block before the next, otherwise we might have a problem
2525 // with the L1 cache of the P4 ... or only a few blocks at a time or something
2526 for(x=0; x<width; x+=BLOCK_SIZE)
2528 const int stride= dstStride;
2530 QPs[(y>>3)*QPStride + (x>>3)]:
2531 QPs[(y>>4)*QPStride + (x>>4)];
2532 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
2535 "movd %0, %%mm7 \n\t"
2536 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2537 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2538 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
2539 "movq %%mm7, pQPb \n\t"
2549 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2550 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2551 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2552 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2553 #elif defined(HAVE_3DNOW)
2554 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
2555 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2556 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2557 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2558 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2562 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
2564 //can we mess with a 8x16 block, if not use a temp buffer, yes again
2568 dstBlockPtrBackup= dstBlock;
2569 srcBlockPtrBackup= srcBlock;
2571 for(i=0;i<BLOCK_SIZE*2; i++)
2573 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2574 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2577 dstBlock= tempDstBlock;
2578 srcBlock= tempSrcBlock;
2581 blockCopy(dstBlock + dstStride*5, dstStride,
2582 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
2584 if(mode & LINEAR_IPOL_DEINT_FILTER)
2585 deInterlaceInterpolateLinear(dstBlock, dstStride);
2586 else if(mode & LINEAR_BLEND_DEINT_FILTER)
2587 deInterlaceBlendLinear(dstBlock, dstStride);
2588 else if(mode & MEDIAN_DEINT_FILTER)
2589 deInterlaceMedian(dstBlock, dstStride);
2590 else if(mode & CUBIC_IPOL_DEINT_FILTER)
2591 deInterlaceInterpolateCubic(dstBlock, dstStride);
2592 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
2593 deInterlaceBlendCubic(dstBlock, dstStride);
2596 /* only deblock if we have 2 blocks */
2604 if(mode & V_DEBLOCK)
2606 if(mode & V_RK1_FILTER)
2607 vertRK1Filter(dstBlock, stride, QP);
2608 else if(mode & V_X1_FILTER)
2609 vertX1Filter(dstBlock, stride, QP);
2612 if( isVertDC(dstBlock, stride))
2614 if(isVertMinMaxOk(dstBlock, stride, QP))
2615 doVertLowPass(dstBlock, stride, QP);
2618 doVertDefFilter(dstBlock, stride, QP);
2628 /* check if we have a previous block to deblock it with dstBlock */
2634 if(mode & H_DEBLOCK)
2636 if(mode & H_X1_FILTER)
2637 horizX1Filter(dstBlock-4, stride, QP);
2640 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
2642 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2643 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2646 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
2654 dering(dstBlock - 9 - stride, stride, QP);
2657 dering(dstBlock - stride*9 + width-9, stride, QP);
2658 //FIXME dering filter will not be applied to last block (bottom right)
2660 /* did we use a tmp-block buffer */
2664 dstBlock= dstBlockPtrBackup;
2665 srcBlock= srcBlockPtrBackup;
2667 for(i=0;i<BLOCK_SIZE*2; i++)
2669 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
2677 /* did we use a tmp buffer */
2680 uint8_t *dstBlock= &(dst[y*dstStride]);
2681 memcpy(dstBlock, tempDst, dstStride*(height-y) );
2685 asm volatile("femms");
2686 #elif defined (HAVE_MMX)
2687 asm volatile("emms");
2691 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
2692 sumTime= rdtsc() - sumTime;
2694 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
2695 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2696 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)