2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec Ec
28 doHorizDefFilter E ac ac
30 Vertical RKAlgo1 E a a*
35 * I don't have a 3dnow CPU -> it's untested
36 E = Exact implementation
37 e = almost exact implementation
38 a = alternative / approximate impl
39 c = checked against the other implementations (-vo md5)
44 verify that everything works as it should (how?)
45 reduce the time wasted on the mem transfer
47 implement everything in C at least (done at the moment but ...)
48 unroll stuff if instructions depend too much on the prior one
49 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
50 move YScale thing to the end instead of fixing QP
51 write a faster and higher quality deblocking filter :)
52 do something about the speed of the horizontal filters
53 make the mainloop more flexible (variable number of blocks at once
54 (the if/else stuff per block is slowing things down)
55 compare the quality & speed of all filters
56 implement a few simple deinterlacing filters
65 Changelog: use the CVS log
67 bugfixes: last 3 lines not brightness/contrast corrected
68 brightness statistics messed up with initial black pic
69 changed initial values of the brightness statistics
71 QP range question solved (very likely 1<=QP<=32 according to arpi)
72 new experimental vertical deblocking filter
73 RK filter has 3dNow support now (untested)
75 fixed a bug in the horizontal default filter
76 3dnow version of the Horizontal & Vertical Lowpass filters
77 mmx version of the Horizontal Default filter
78 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
79 added mode flags & quality2mode function
87 #include "../config.h"
91 #include "postprocess.h"
// 64-bit constants and scratch slots; several are referenced by symbol name from the inline asm in this file.
// packedYOffset/packedYScale: presumably the luma offset/scale of the brightness correction - usage not visible in this excerpt.
94 static uint64_t packedYOffset= 0x0000000000000000LL;
95 static uint64_t packedYScale= 0x0100010001000100LL;
// wNN: the 16-bit hex value NN replicated into all four words
96 static uint64_t w05= 0x0005000500050005LL;
97 static uint64_t w20= 0x0020002000200020LL;
98 static uint64_t w1400= 0x1400140014001400LL;
// bmXXXXXXXX: byte masks; each digit of the name stands for one byte (most significant first), 1 meaning 0xFF
99 static uint64_t bm00000001= 0x00000000000000FFLL;
100 static uint64_t bm00010000= 0x000000FF00000000LL;
101 static uint64_t bm00001000= 0x00000000FF000000LL;
102 static uint64_t bm10000000= 0xFF00000000000000LL;
103 static uint64_t bm10000001= 0xFF000000000000FFLL;
104 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
105 static uint64_t bm00000011= 0x000000000000FFFFLL;
106 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
107 static uint64_t bm11000000= 0xFFFF000000000000LL;
108 static uint64_t bm00011000= 0x000000FFFF000000LL;
109 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
110 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
// bNN: the hex byte NN replicated into all eight bytes
111 static uint64_t b00= 0x0000000000000000LL;
112 static uint64_t b01= 0x0101010101010101LL;
113 static uint64_t b02= 0x0202020202020202LL;
114 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
115 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
116 static uint64_t b20= 0x2020202020202020LL;
117 static uint64_t b80= 0x8080808080808080LL;
118 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
119 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
120 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
// temp0..temp5: scratch storage (doVertDefFilter spills intermediate words to temp0..temp3)
121 static uint64_t temp0=0;
122 static uint64_t temp1=0;
123 static uint64_t temp2=0;
124 static uint64_t temp3=0;
125 static uint64_t temp4=0;
126 static uint64_t temp5=0;
// pQPb: read by the asm filters, whose comments describe it as QP replicated per byte; where it is written is not visible here
127 static uint64_t pQPb=0;
// 16x16 byte scratch block (usage not visible in this excerpt)
128 static uint8_t tempBlock[16*16];
// Flatness thresholds (not static, so visible to other translation units).
// isVertDC reports a block as flat when its count of nearly equal vertical neighbours exceeds vFlatnessThreshold.
130 int hFlatnessThreshold= 56 - 16;
131 int vFlatnessThreshold= 56 - 16;
133 //amount of "black" you are willing to lose to get a brightness corrected picture
134 double maxClippedThreshold= 0.01;
137 //FIXME can never make a movie's black brighter (does anyone need that?)
// Executes the x86 rdtsc instruction (read time-stamp counter).
// The operand constraints and the return statement are not visible in this excerpt.
141 static inline long long rdtsc()
144 asm volatile( "rdtsc\n\t"
147 // printf("%d\n", int(l/1000));
// Issues a prefetchnta (non-temporal prefetch) hint for the address p.
151 static inline void prefetchnta(void *p)
153 asm volatile( "prefetchnta (%0)\n\t"
// Issues a prefetcht0 hint for the address p.
158 static inline void prefetcht0(void *p)
160 asm volatile( "prefetcht0 (%0)\n\t"
// Issues a prefetcht1 hint for the address p.
165 static inline void prefetcht1(void *p)
167 asm volatile( "prefetcht1 (%0)\n\t"
// Issues a prefetcht2 hint for the address p.
172 static inline void prefetcht2(void *p)
174 asm volatile( "prefetcht2 (%0)\n\t"
179 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
181 * Check if the middle 8x8 Block in the given 8x10 block is flat
// Counts, over the 8 columns and the 7 vertical neighbour pairs of the 8x8 block starting one line
// below src, how many pairs differ by at most 1, and returns 1 if that count is greater than
// vFlatnessThreshold, otherwise 0.
// Asm path: for each pair, diff + 0x7E compared (signed) against 0x7C is true exactly for
// diff in {-1, 0, 1}; pcmpgtb yields 0xFF (-1) for those bytes, the -1 values are accumulated per
// byte and then summed horizontally, and the C statement after the asm turns the negative byte
// sum back into a positive count.
// C path: ((a - b + 1) & 0xFFFF) < 3 is the same diff in {-1, 0, 1} test.
183 static inline int isVertDC(uint8_t src[], int stride){
187 src+= stride; // src points to begin of the 8x8 Block
192 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
193 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
194 "movq (%1), %%mm0 \n\t"
196 "movq (%1), %%mm1 \n\t"
197 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
198 "paddb %%mm7, %%mm0 \n\t"
199 "pcmpgtb %%mm6, %%mm0 \n\t"
202 "movq (%1), %%mm2 \n\t"
203 "psubb %%mm2, %%mm1 \n\t"
204 "paddb %%mm7, %%mm1 \n\t"
205 "pcmpgtb %%mm6, %%mm1 \n\t"
206 "paddb %%mm1, %%mm0 \n\t"
209 "movq (%1), %%mm1 \n\t"
210 "psubb %%mm1, %%mm2 \n\t"
211 "paddb %%mm7, %%mm2 \n\t"
212 "pcmpgtb %%mm6, %%mm2 \n\t"
213 "paddb %%mm2, %%mm0 \n\t"
216 "movq (%1), %%mm2 \n\t"
217 "psubb %%mm2, %%mm1 \n\t"
218 "paddb %%mm7, %%mm1 \n\t"
219 "pcmpgtb %%mm6, %%mm1 \n\t"
220 "paddb %%mm1, %%mm0 \n\t"
223 "movq (%1), %%mm1 \n\t"
224 "psubb %%mm1, %%mm2 \n\t"
225 "paddb %%mm7, %%mm2 \n\t"
226 "pcmpgtb %%mm6, %%mm2 \n\t"
227 "paddb %%mm2, %%mm0 \n\t"
230 "movq (%1), %%mm2 \n\t"
231 "psubb %%mm2, %%mm1 \n\t"
232 "paddb %%mm7, %%mm1 \n\t"
233 "pcmpgtb %%mm6, %%mm1 \n\t"
234 "paddb %%mm1, %%mm0 \n\t"
237 "movq (%1), %%mm1 \n\t"
238 "psubb %%mm1, %%mm2 \n\t"
239 "paddb %%mm7, %%mm2 \n\t"
240 "pcmpgtb %%mm6, %%mm2 \n\t"
241 "paddb %%mm2, %%mm0 \n\t"
// horizontal add: fold the eight per-column byte counters into the lowest byte
244 "movq %%mm0, %%mm1 \n\t"
245 "psrlw $8, %%mm0 \n\t"
246 "paddb %%mm1, %%mm0 \n\t"
247 "movq %%mm0, %%mm1 \n\t"
248 "psrlq $16, %%mm0 \n\t"
249 "paddb %%mm1, %%mm0 \n\t"
250 "movq %%mm0, %%mm1 \n\t"
251 "psrlq $32, %%mm0 \n\t"
252 "paddb %%mm1, %%mm0 \n\t"
254 "movd %%mm0, %0 \n\t"
256 : "r" (src), "r" (stride)
258 // printf("%d\n", numEq);
// the low byte holds minus the number of matches (mod 256); negate it into a positive count
259 numEq= (256 - (numEq & 0xFF)) &0xFF;
263 // uint8_t *temp= src;
266 for(y=0; y<BLOCK_SIZE-1; y++)
268 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
269 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
270 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
271 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
272 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
273 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
274 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
275 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
279 /* if(abs(numEq - asmEq) > 0)
281 printf("\nasm:%d c:%d\n", asmEq, numEq);
282 for(int y=0; y<8; y++)
284 for(int x=0; x<8; x++)
286 printf("%d ", temp[x + y*stride]);
292 // for(int i=0; i<numEq/8; i++) src[i]=255;
293 return (numEq > vFlatnessThreshold) ? 1 : 0;
// Checks that, in each of the 8 columns, the pixel on line 1 (src + stride) and the pixel on
// line 8 (src + 8*stride) differ by at most 2*QP.
// The asm path takes QP from pQPb, the C path from the QP argument.
// The return statement is not visible in this excerpt.
296 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
302 "movq (%1, %2), %%mm0 \n\t" // line 1
303 "movq (%1, %2, 8), %%mm1 \n\t" // line 8
304 "movq %%mm0, %%mm2 \n\t"
305 "psubusb %%mm1, %%mm0 \n\t"
306 "psubusb %%mm2, %%mm1 \n\t"
307 "por %%mm1, %%mm0 \n\t" // ABS Diff
309 "movq pQPb, %%mm7 \n\t" // QP,..., QP
310 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
311 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
312 "pcmpeqd b00, %%mm0 \n\t" // each dword: all ones if its four bytes are all 0
313 "psrlq $16, %%mm0 \n\t" // low dword now holds one half of each dword result
314 "pcmpeqd bFF, %%mm0 \n\t" // low dword all ones only if both dwords passed
315 // "movd %%mm0, (%1, %2, 4)\n\t"
316 "movd %%mm0, %0 \n\t"
318 : "r" (src), "r" (stride)
325 for(x=0; x<BLOCK_SIZE; x++)
327 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
329 /* if(isOk && !isOk2 || !isOk && isOk2)
331 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
332 for(int y=0; y<9; y++)
334 for(int x=0; x<8; x++)
336 printf("%d ", src[x + y*stride]);
348 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
349 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
// Vertical low pass over one 8 pixel wide block: lines 1..8 are rewritten, lines 0 and 9 are only read.
// first/last: the outer line (0 resp. 9) is used as the edge sample only where it is close to its
// neighbour (asm: |diff| <= QP with QP taken from pQPb; C: |diff| < QP); otherwise line 1 resp.
// line 8 is used in its place.
// NOTE(review): the asm and C paths disagree on <= versus < for that closeness test.
// NOTE(review): the asm adds the stride to %0, while the visible constraint line lists src as a plain
// r operand; if that is an input-only operand, modifying it breaks the GCC extended asm rules -
// confirm against the full operand/clobber list, which is not visible in this excerpt.
351 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
355 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
357 asm volatile( //"movv %0 %1 %2\n\t"
359 "movq pQPb, %%mm0 \n\t" // QP,..., QP
360 // "movq bFF , %%mm0 \n\t" // QP,..., QP
362 "movq (%0), %%mm6 \n\t"
363 "movq (%0, %1), %%mm5 \n\t"
364 "movq %%mm5, %%mm1 \n\t"
365 "movq %%mm6, %%mm2 \n\t"
366 "psubusb %%mm6, %%mm5 \n\t"
367 "psubusb %%mm1, %%mm2 \n\t"
368 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
369 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
370 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
372 "pand %%mm2, %%mm6 \n\t"
373 "pandn %%mm1, %%mm2 \n\t"
374 "por %%mm2, %%mm6 \n\t"// First Line to Filter
376 "movq (%0, %1, 8), %%mm5 \n\t"
377 "leal (%0, %1, 4), %%eax \n\t"
378 "leal (%0, %1, 8), %%ebx \n\t"
379 "subl %1, %%ebx \n\t"
380 "addl %1, %0 \n\t" // %0 points to line 1 not 0
381 "movq (%0, %1, 8), %%mm7 \n\t"
382 "movq %%mm5, %%mm1 \n\t"
383 "movq %%mm7, %%mm2 \n\t"
384 "psubusb %%mm7, %%mm5 \n\t"
385 "psubusb %%mm1, %%mm2 \n\t"
386 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
387 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
388 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
390 "pand %%mm2, %%mm7 \n\t"
391 "pandn %%mm1, %%mm2 \n\t"
392 "por %%mm2, %%mm7 \n\t" // Last Line to Filter (line 9 where close to line 8, else line 8)
396 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
// NOTE(review): two alternative sequences for the first blend follow (manual /4 arithmetic, then PAVGB);
// presumably preprocessor lines that select one of them are not visible in this excerpt.
401 "movq %%mm6, %%mm2 \n\t" //1
402 "movq %%mm6, %%mm3 \n\t" //1
403 "paddusb b02, %%mm3 \n\t"
404 "psrlw $2, %%mm3 \n\t" //1 /4
405 "pand b3F, %%mm3 \n\t"
406 "psubb %%mm3, %%mm2 \n\t"
407 "movq (%0, %1), %%mm0 \n\t" // 1
408 "movq %%mm0, %%mm1 \n\t" // 1
409 "paddusb b02, %%mm0 \n\t"
410 "psrlw $2, %%mm0 \n\t" // 1 /4
411 "pand b3F, %%mm0 \n\t"
412 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4
414 "movq (%0, %1), %%mm0 \n\t" // 1
415 "movq %%mm0, %%mm1 \n\t" // 1
416 PAVGB(%%mm6, %%mm0) //1 1 /2
417 PAVGB(%%mm6, %%mm0) //3 1 /4
419 "movq (%0, %1, 4), %%mm2 \n\t" // 1
420 "movq %%mm2, %%mm5 \n\t" // 1
421 PAVGB((%%eax), %%mm2) // 11 /2
422 PAVGB((%0, %1, 2), %%mm2) // 211 /4
423 "movq %%mm2, %%mm3 \n\t" // 211 /4
424 "movq (%0), %%mm4 \n\t" // 1
425 PAVGB(%%mm4, %%mm3) // 4 211 /8
426 PAVGB(%%mm0, %%mm3) //642211 /16
427 "movq %%mm3, (%0) \n\t" // X
428 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
429 "movq %%mm1, %%mm0 \n\t" // 1
430 PAVGB(%%mm6, %%mm0) //1 1 /2
431 "movq %%mm4, %%mm3 \n\t" // 1
432 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
433 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
434 PAVGB((%%eax), %%mm5) // 211 /4
435 PAVGB(%%mm5, %%mm3) // 2 2211 /8
436 PAVGB(%%mm0, %%mm3) //4242211 /16
437 "movq %%mm3, (%0,%1) \n\t" // X
438 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
439 PAVGB(%%mm4, %%mm6) //11 /2
440 "movq (%%ebx), %%mm0 \n\t" // 1
441 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
442 "movq %%mm0, %%mm3 \n\t" // 11/2
443 PAVGB(%%mm1, %%mm0) // 2 11/4
444 PAVGB(%%mm6, %%mm0) //222 11/8
445 PAVGB(%%mm2, %%mm0) //22242211/16
446 "movq (%0, %1, 2), %%mm2 \n\t" // 1
447 "movq %%mm0, (%0, %1, 2) \n\t" // X
448 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
449 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
450 PAVGB((%%ebx), %%mm0) // 11 /2
451 PAVGB(%%mm0, %%mm6) //11 11 /4
452 PAVGB(%%mm1, %%mm4) // 11 /2
453 PAVGB(%%mm2, %%mm1) // 11 /2
454 PAVGB(%%mm1, %%mm6) //1122 11 /8
455 PAVGB(%%mm5, %%mm6) //112242211 /16
456 "movq (%%eax), %%mm5 \n\t" // 1
457 "movq %%mm6, (%%eax) \n\t" // X
458 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
459 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
460 PAVGB(%%mm7, %%mm6) // 11 /2
461 PAVGB(%%mm4, %%mm6) // 11 11 /4
462 PAVGB(%%mm3, %%mm6) // 11 2211 /8
463 PAVGB(%%mm5, %%mm2) // 11 /2
464 "movq (%0, %1, 4), %%mm4 \n\t" // 1
465 PAVGB(%%mm4, %%mm2) // 112 /4
466 PAVGB(%%mm2, %%mm6) // 112242211 /16
467 "movq %%mm6, (%0, %1, 4) \n\t" // X
468 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
469 PAVGB(%%mm7, %%mm1) // 11 2 /4
470 PAVGB(%%mm4, %%mm5) // 11 /2
471 PAVGB(%%mm5, %%mm0) // 11 11 /4
472 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
473 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
474 PAVGB(%%mm0, %%mm1) // 11224222 /16
475 // "pxor %%mm1, %%mm1 \n\t"
476 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
477 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
478 PAVGB((%%ebx), %%mm2) // 112 4 /8
479 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
480 PAVGB(%%mm0, %%mm6) // 1 1 /2
481 PAVGB(%%mm7, %%mm6) // 1 12 /4
482 PAVGB(%%mm2, %%mm6) // 1122424 /4
483 // "pxor %%mm6, %%mm6 \n\t"
484 "movq %%mm6, (%%ebx) \n\t" // X
485 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
486 PAVGB(%%mm7, %%mm5) // 11 2 /4
487 PAVGB(%%mm7, %%mm5) // 11 6 /8
489 PAVGB(%%mm3, %%mm0) // 112 /4
490 PAVGB(%%mm0, %%mm5) // 112246 /16
491 // "pxor %%mm5, %%mm5 \n\t"
492 // "movq pQPb, %%mm5 \n\t"
493 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
497 : "r" (src), "r" (stride)
// C path: l1..l9 are the byte offsets of lines 1..9 relative to src
501 const int l1= stride;
502 const int l2= stride + l1;
503 const int l3= stride + l2;
504 const int l4= stride + l3;
505 const int l5= stride + l4;
506 const int l6= stride + l5;
507 const int l7= stride + l6;
508 const int l8= stride + l7;
509 const int l9= stride + l8;
511 for(x=0; x<BLOCK_SIZE; x++)
513 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
514 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[k]: sum of two vertically adjacent samples, with first/last standing in beyond the block edges
517 sums[0] = first + src[l1];
518 sums[1] = src[l1] + src[l2];
519 sums[2] = src[l2] + src[l3];
520 sums[3] = src[l3] + src[l4];
521 sums[4] = src[l4] + src[l5];
522 sums[5] = src[l5] + src[l6];
523 sums[6] = src[l6] + src[l7];
524 sums[7] = src[l7] + src[l8];
525 sums[8] = src[l8] + last;
// + binds tighter than <<, so (x + y<<1) below means (x + y)<<1; the weights of each line add up to 16,
// +8 rounds and >>4 divides by 16
527 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
528 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
529 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
530 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
531 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
532 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
533 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
534 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
543 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
544 * values are correctly clipped (MMX2)
545 * values are wraparound (C)
546 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
// Deblocks across the boundary between line 4 and line 5 of the block at src.
// Asm path: builds a threshold of QP + (QP+2)/4 from pQPb; where |line4 - line5| does not exceed
// it, a correction derived from the line 4/5 difference is added to line 4 and subtracted from
// line 5, and an eighth-scale correction is applied with signed saturation to lines 3 and 6
// (scaling as stated by the instruction comments).
// C path: only the test |l4 - l5| < QP + QP/4 and the difference v are visible in this excerpt.
553 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
558 "pxor %%mm7, %%mm7 \n\t" // 0
559 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
560 "leal (%0, %1), %%eax \n\t"
561 "leal (%%eax, %1, 4), %%ebx \n\t"
562 // 0 1 2 3 4 5 6 7 8 9
563 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
564 "movq pQPb, %%mm0 \n\t" // QP,..., QP
565 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
566 "paddusb b02, %%mm0 \n\t"
567 "psrlw $2, %%mm0 \n\t"
568 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
569 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
570 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
571 "movq (%%ebx), %%mm3 \n\t" // line 5
572 "movq %%mm2, %%mm4 \n\t" // line 4
573 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
574 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
576 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
577 "psubusb %%mm3, %%mm4 \n\t"
578 "psubusb %%mm2, %%mm3 \n\t"
579 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
580 "psubusb %%mm0, %%mm4 \n\t"
581 "pcmpeqb %%mm7, %%mm4 \n\t" // FF where |l4 - l5| <= threshold
582 "pand %%mm4, %%mm5 \n\t" // d/2
584 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
585 "paddb %%mm5, %%mm2 \n\t"
586 // "psubb %%mm6, %%mm2 \n\t"
587 "movq %%mm2, (%0,%1, 4) \n\t"
589 "movq (%%ebx), %%mm2 \n\t"
590 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
591 "psubb %%mm5, %%mm2 \n\t"
592 // "psubb %%mm6, %%mm2 \n\t"
593 "movq %%mm2, (%%ebx) \n\t"
595 "paddb %%mm6, %%mm5 \n\t"
596 "psrlw $2, %%mm5 \n\t"
597 "pand b3F, %%mm5 \n\t"
598 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
600 "movq (%%eax, %1, 2), %%mm2 \n\t"
601 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
602 "paddsb %%mm5, %%mm2 \n\t"
603 "psubb %%mm6, %%mm2 \n\t"
604 "movq %%mm2, (%%eax, %1, 2) \n\t"
606 "movq (%%ebx, %1), %%mm2 \n\t"
607 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
608 "psubsb %%mm5, %%mm2 \n\t"
609 "psubb %%mm6, %%mm2 \n\t"
610 "movq %%mm2, (%%ebx, %1) \n\t"
613 : "r" (src), "r" (stride)
// C path: l1..l9 are the byte offsets of lines 1..9 relative to src
617 const int l1= stride;
618 const int l2= stride + l1;
619 const int l3= stride + l2;
620 const int l4= stride + l3;
621 const int l5= stride + l4;
622 const int l6= stride + l5;
623 const int l7= stride + l6;
624 const int l8= stride + l7;
625 const int l9= stride + l8;
627 for(x=0; x<BLOCK_SIZE; x++)
629 if(ABS(src[l4]-src[l5]) < QP + QP/4)
631 int v = (src[l5] - src[l4]);
646 * Experimental Filter 1
647 * will not damage linear gradients
648 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
649 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
650 * MMX2 version does correct clipping, C version doesn't
// Deblocks across the boundary between line 4 and line 5 of the block at src.
// d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) measures how much the step at the boundary exceeds
// the average of the neighbouring steps.
// Asm path: where d <= QP (QP taken from pQPb), lines 4/5, 3/6 and 2/7 are pulled towards each
// other with saturating byte arithmetic, by amounts the instruction comments give as 3*d/8, d/4
// and d/8. mm2 is the per-byte mask (l4 <= l5); xoring with it before and after the saturating
// add/sub flips the direction of the correction.
// C path: only the computation of a, b, c, d and v is visible in this excerpt.
652 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
654 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
656 "pxor %%mm7, %%mm7 \n\t" // 0
657 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
658 "leal (%0, %1), %%eax \n\t"
659 "leal (%%eax, %1, 4), %%ebx \n\t"
660 // 0 1 2 3 4 5 6 7 8 9
661 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
662 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
663 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
664 "movq %%mm1, %%mm2 \n\t" // line 4
665 "psubusb %%mm0, %%mm1 \n\t"
666 "psubusb %%mm2, %%mm0 \n\t"
667 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
668 "movq (%%ebx), %%mm3 \n\t" // line 5
669 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
670 "movq %%mm3, %%mm5 \n\t" // line 5
671 "psubusb %%mm4, %%mm3 \n\t"
672 "psubusb %%mm5, %%mm4 \n\t"
673 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
674 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
675 "movq %%mm2, %%mm1 \n\t" // line 4
676 "psubusb %%mm5, %%mm2 \n\t"
677 "movq %%mm2, %%mm4 \n\t"
678 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
679 "psubusb %%mm1, %%mm5 \n\t"
680 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
681 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
682 "movq %%mm4, %%mm3 \n\t" // d
683 "psubusb pQPb, %%mm4 \n\t"
684 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
685 "psubusb b01, %%mm3 \n\t"
686 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
688 PAVGB(%%mm7, %%mm3) // d/2
689 "movq %%mm3, %%mm1 \n\t" // d/2
690 PAVGB(%%mm7, %%mm3) // d/4
691 PAVGB(%%mm1, %%mm3) // 3*d/8
693 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
694 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
695 "psubusb %%mm3, %%mm0 \n\t"
696 "pxor %%mm2, %%mm0 \n\t"
697 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
699 "movq (%%ebx), %%mm0 \n\t" // line 5
700 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
701 "paddusb %%mm3, %%mm0 \n\t"
702 "pxor %%mm2, %%mm0 \n\t"
703 "movq %%mm0, (%%ebx) \n\t" // line 5
705 PAVGB(%%mm7, %%mm1) // d/4
707 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
708 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
709 "psubusb %%mm1, %%mm0 \n\t"
710 "pxor %%mm2, %%mm0 \n\t"
711 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
713 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
715 "paddusb %%mm1, %%mm0 \n\t"
716 "pxor %%mm2, %%mm0 \n\t"
717 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
719 PAVGB(%%mm7, %%mm1) // d/8
721 "movq (%%eax, %1), %%mm0 \n\t" // line 2
722 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
723 "psubusb %%mm1, %%mm0 \n\t"
724 "pxor %%mm2, %%mm0 \n\t"
725 "movq %%mm0, (%%eax, %1) \n\t" // line 2
727 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
729 "paddusb %%mm1, %%mm0 \n\t"
730 "pxor %%mm2, %%mm0 \n\t"
731 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
734 : "r" (src), "r" (stride)
// C path: l1..l9 are the byte offsets of lines 1..9 relative to src
739 const int l1= stride;
740 const int l2= stride + l1;
741 const int l3= stride + l2;
742 const int l4= stride + l3;
743 const int l5= stride + l4;
744 const int l6= stride + l5;
745 const int l7= stride + l6;
746 const int l8= stride + l7;
747 const int l9= stride + l8;
749 for(x=0; x<BLOCK_SIZE; x++)
751 int a= src[l3] - src[l4];
752 int b= src[l4] - src[l5];
753 int c= src[l5] - src[l6];
755 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
759 int v = d * SIGN(-b);
// NOTE(review): the lines below look like a second, alternative variant (weighted sums over v2..v7);
// whether it is compiled or commented out cannot be told from this excerpt - confirm in the full file.
772 const int l1= stride;
773 const int l2= stride + l1;
774 const int l3= stride + l2;
775 const int l4= stride + l3;
776 const int l5= stride + l4;
777 const int l6= stride + l5;
778 const int l7= stride + l6;
779 const int l8= stride + l7;
780 const int l9= stride + l8;
781 for(int x=0; x<BLOCK_SIZE; x++)
790 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
792 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
793 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
794 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
795 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
804 * Experimental Filter 1 (Horizontal)
805 * will not damage linear gradients
806 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
807 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
808 * MMX2 version does correct clipping, C version doesn't
809 * not identical with the vertical one
// Horizontal counterpart of the X1 deblocking filter, applied to the rows of the block at src.
// Asm path: for each row, a signed correction value is derived from the pixel differences around
// the block edge and used as an index into lut; the selected 8-byte lut entry is added to the row
// with signed saturation (the row is biased by 0x80 before and after).
// lut: 256 entries of 8 bytes, allocated with memalign and kept in a function-static pointer.
// For index i, v is twice i read as a signed byte; the upper four bytes of the entry hold the
// fractions a, b, c, d of v and the lower four bytes hold their negations in mirrored order
// (D, C, B, A), so that the two sides of the edge are corrected in opposite directions.
// Fix: D used to be computed from c instead of d, which broke that mirror symmetry for the pixel
// next to the edge.
// C path: computes d = MAX(|b| - (|a| + |c|)/2, 0) per row; the rest is not visible in this excerpt.
811 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
814 static uint64_t *lut= NULL;
818 lut= (uint64_t*)memalign(8, 256*8);
821 int v= i < 128 ? 2*i : 2*(i-256);
// NOTE(review): two alternative sets of a, b, c, d follow; presumably preprocessor lines that select
// one of them are not visible in this excerpt.
823 //Simulate 112242211 9-Tap filter
824 uint64_t a= (v/16) & 0xFF;
825 uint64_t b= (v/8) & 0xFF;
826 uint64_t c= (v/4) & 0xFF;
827 uint64_t d= (3*v/8) & 0xFF;
829 //Simulate piecewise linear interpolation
830 uint64_t a= (v/16) & 0xFF;
831 uint64_t b= (v*3/16) & 0xFF;
832 uint64_t c= (v*5/16) & 0xFF;
833 uint64_t d= (7*v/16) & 0xFF;
// two's complement negations of a..d as bytes
834 uint64_t A= (0x100 - a)&0xFF;
835 uint64_t B= (0x100 - b)&0xFF;
836 uint64_t C= (0x100 - c)&0xFF;
837 uint64_t D= (0x100 - d)&0xFF;
839 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
840 (D<<24) | (C<<16) | (B<<8) | (A);
841 //lut[i] = (v<<32) | (v<<24);
845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
847 "pxor %%mm7, %%mm7 \n\t" // 0
848 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
849 "leal (%0, %1), %%eax \n\t"
850 "leal (%%eax, %1, 4), %%ebx \n\t"
852 "movq b80, %%mm6 \n\t"
853 "movd %2, %%mm5 \n\t" // QP
854 "movq %%mm5, %%mm4 \n\t"
855 "paddusb %%mm5, %%mm5 \n\t" // 2QP
856 "paddusb %%mm5, %%mm4 \n\t" // 3QP
857 "pxor %%mm5, %%mm5 \n\t" // 0
858 "psubb %%mm4, %%mm5 \n\t" // -3QP
859 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
860 "psllq $24, %%mm5 \n\t"
862 // 0 1 2 3 4 5 6 7 8 9
863 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
866 "movd " #a ", %%mm0 \n\t"\
867 "movd 4" #a ", %%mm1 \n\t"\
868 "punpckldq %%mm1, %%mm0 \n\t"\
869 "movq %%mm0, %%mm1 \n\t"\
870 "movq %%mm0, %%mm2 \n\t"\
871 "psrlq $8, %%mm1 \n\t"\
872 "psubusb %%mm1, %%mm2 \n\t"\
873 "psubusb %%mm0, %%mm1 \n\t"\
874 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
875 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
876 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
877 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
878 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
879 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
880 "paddb %%mm5, %%mm1 \n\t"\
881 "psubusb %%mm5, %%mm1 \n\t"\
883 "pxor %%mm2, %%mm1 \n\t"\
884 "psubb %%mm2, %%mm1 \n\t"\
885 "psrlq $24, %%mm1 \n\t"\
886 "movd %%mm1, %%ecx \n\t"\
887 "paddb %%mm6, %%mm0 \n\t"\
888 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
889 "paddb %%mm6, %%mm0 \n\t"\
890 "movq %%mm0, " #a " \n\t"\
896 HX1old((%%eax, %1, 2))
900 HX1old((%%ebx, %1, 2))
903 //FIXME add some comments, it's unreadable ...
904 #define HX1b(a, c, b, d) \
905 "movd " #a ", %%mm0 \n\t"\
906 "movd 4" #a ", %%mm1 \n\t"\
907 "punpckldq %%mm1, %%mm0 \n\t"\
908 "movd " #b ", %%mm4 \n\t"\
909 "movq %%mm0, %%mm1 \n\t"\
910 "movq %%mm0, %%mm2 \n\t"\
911 "psrlq $8, %%mm1 \n\t"\
912 "movd 4" #b ", %%mm3 \n\t"\
913 "psubusb %%mm1, %%mm2 \n\t"\
914 "psubusb %%mm0, %%mm1 \n\t"\
915 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
916 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
917 "punpckldq %%mm3, %%mm4 \n\t"\
918 "movq %%mm1, %%mm3 \n\t"\
919 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
920 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
921 "paddb %%mm6, %%mm0 \n\t"\
922 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
923 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
924 "movq %%mm4, %%mm3 \n\t"\
925 "paddb %%mm5, %%mm1 \n\t"\
926 "psubusb %%mm5, %%mm1 \n\t"\
927 "psrlq $8, %%mm3 \n\t"\
929 "pxor %%mm2, %%mm1 \n\t"\
930 "psubb %%mm2, %%mm1 \n\t"\
931 "movq %%mm4, %%mm2 \n\t"\
932 "psrlq $24, %%mm1 \n\t"\
933 "psubusb %%mm3, %%mm2 \n\t"\
934 "movd %%mm1, %%ecx \n\t"\
935 "psubusb %%mm4, %%mm3 \n\t"\
936 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
937 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
938 "paddb %%mm6, %%mm0 \n\t"\
939 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
940 "movq %%mm3, %%mm1 \n\t"\
941 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
942 "movq %%mm0, " #a " \n\t"\
943 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
944 "paddb %%mm6, %%mm4 \n\t"\
945 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
946 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
947 "paddb %%mm5, %%mm3 \n\t"\
948 "psubusb %%mm5, %%mm3 \n\t"\
950 "pxor %%mm2, %%mm3 \n\t"\
951 "psubb %%mm2, %%mm3 \n\t"\
952 "psrlq $24, %%mm3 \n\t"\
953 "movd " #c ", %%mm0 \n\t"\
954 "movd 4" #c ", %%mm1 \n\t"\
955 "punpckldq %%mm1, %%mm0 \n\t"\
956 "paddb %%mm6, %%mm0 \n\t"\
957 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
958 "paddb %%mm6, %%mm0 \n\t"\
959 "movq %%mm0, " #c " \n\t"\
960 "movd %%mm3, %%ecx \n\t"\
961 "movd " #d ", %%mm0 \n\t"\
962 "paddsb (%3, %%ecx, 8), %%mm4 \n\t"\
963 "movd 4" #d ", %%mm1 \n\t"\
964 "paddb %%mm6, %%mm4 \n\t"\
965 "punpckldq %%mm1, %%mm0 \n\t"\
966 "movq %%mm4, " #b " \n\t"\
967 "paddb %%mm6, %%mm0 \n\t"\
968 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
969 "paddb %%mm6, %%mm0 \n\t"\
970 "movq %%mm0, " #d " \n\t"\
972 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
973 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
977 : "r" (src), "r" (stride), "r" (QP), "r" (lut)
978 : "%eax", "%ebx", "%ecx"
982 //FIXME (has little in common with the mmx2 version)
983 for(y=0; y<BLOCK_SIZE; y++)
985 int a= src[1] - src[2];
986 int b= src[3] - src[4];
987 int c= src[5] - src[6];
989 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
993 int v = d * SIGN(-b);
1009 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1013 //FIXME try pmul for *5 stuff
1016 "pxor %%mm7, %%mm7 \n\t"
1017 "leal (%0, %1), %%eax \n\t"
1018 "leal (%%eax, %1, 4), %%ebx \n\t"
1020 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1021 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1023 "movq (%0), %%mm0 \n\t"
1024 "movq %%mm0, %%mm1 \n\t"
1025 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1026 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1028 "movq (%%eax), %%mm2 \n\t"
1029 "movq %%mm2, %%mm3 \n\t"
1030 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1031 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1033 "movq (%%eax, %1), %%mm4 \n\t"
1034 "movq %%mm4, %%mm5 \n\t"
1035 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1036 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1038 "paddw %%mm0, %%mm0 \n\t" // 2L0
1039 "paddw %%mm1, %%mm1 \n\t" // 2H0
1040 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1041 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1042 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1043 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1045 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1046 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1047 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1048 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1050 "movq (%%eax, %1, 2), %%mm2 \n\t"
1051 "movq %%mm2, %%mm3 \n\t"
1052 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1053 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1055 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1056 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1057 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1058 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1059 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1060 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1062 "movq (%0, %1, 4), %%mm0 \n\t"
1063 "movq %%mm0, %%mm1 \n\t"
1064 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1065 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1067 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1068 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1069 "movq %%mm2, temp2 \n\t" // L3 - L4
1070 "movq %%mm3, temp3 \n\t" // H3 - H4
1071 "paddw %%mm4, %%mm4 \n\t" // 2L2
1072 "paddw %%mm5, %%mm5 \n\t" // 2H2
1073 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1074 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1076 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1077 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1078 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1079 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1081 "movq (%%ebx), %%mm2 \n\t"
1082 "movq %%mm2, %%mm3 \n\t"
1083 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1084 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1085 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1086 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1087 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1088 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1090 "movq (%%ebx, %1), %%mm6 \n\t"
1091 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1092 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1093 "movq (%%ebx, %1), %%mm6 \n\t"
1094 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1095 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1097 "paddw %%mm0, %%mm0 \n\t" // 2L4
1098 "paddw %%mm1, %%mm1 \n\t" // 2H4
1099 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1100 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1102 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1103 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1104 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1105 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1107 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1108 "movq %%mm2, %%mm3 \n\t"
1109 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1110 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1112 "paddw %%mm2, %%mm2 \n\t" // 2L7
1113 "paddw %%mm3, %%mm3 \n\t" // 2H7
1114 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1115 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1117 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1118 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1119 //FIXME pxor, psubw, pmax for abs
1120 "movq %%mm7, %%mm6 \n\t" // 0
1121 "pcmpgtw %%mm0, %%mm6 \n\t"
1122 "pxor %%mm6, %%mm0 \n\t"
1123 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1124 "movq %%mm7, %%mm6 \n\t" // 0
1125 "pcmpgtw %%mm1, %%mm6 \n\t"
1126 "pxor %%mm6, %%mm1 \n\t"
1127 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1129 "movq %%mm7, %%mm6 \n\t" // 0
1130 "pcmpgtw %%mm2, %%mm6 \n\t"
1131 "pxor %%mm6, %%mm2 \n\t"
1132 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1133 "movq %%mm7, %%mm6 \n\t" // 0
1134 "pcmpgtw %%mm3, %%mm6 \n\t"
1135 "pxor %%mm6, %%mm3 \n\t"
1136 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1139 "pminsw %%mm2, %%mm0 \n\t"
1140 "pminsw %%mm3, %%mm1 \n\t"
1142 "movq %%mm0, %%mm6 \n\t"
1143 "psubusw %%mm2, %%mm6 \n\t"
1144 "psubw %%mm6, %%mm0 \n\t"
1145 "movq %%mm1, %%mm6 \n\t"
1146 "psubusw %%mm3, %%mm6 \n\t"
1147 "psubw %%mm6, %%mm1 \n\t"
1150 "movq %%mm7, %%mm6 \n\t" // 0
1151 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1152 "pxor %%mm6, %%mm4 \n\t"
1153 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1154 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1155 "pxor %%mm7, %%mm5 \n\t"
1156 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1158 "movd %2, %%mm2 \n\t" // QP
1159 //"pcmpeqb %%mm2, %%mm2\n\t"
1160 "punpcklwd %%mm2, %%mm2 \n\t"
1161 "punpcklwd %%mm2, %%mm2 \n\t"
1162 "psllw $3, %%mm2 \n\t" // 8QP
1163 "movq %%mm2, %%mm3 \n\t" // 8QP
1164 "pcmpgtw %%mm4, %%mm2 \n\t"
1165 "pcmpgtw %%mm5, %%mm3 \n\t"
1166 "pand %%mm2, %%mm4 \n\t"
1167 "pand %%mm3, %%mm5 \n\t"
1170 "psubusw %%mm0, %%mm4 \n\t" // hd
1171 "psubusw %%mm1, %%mm5 \n\t" // ld
1174 "movq w05, %%mm2 \n\t" // 5
1175 "pmullw %%mm2, %%mm4 \n\t"
1176 "pmullw %%mm2, %%mm5 \n\t"
1177 "movq w20, %%mm2 \n\t" // 32
1178 "paddw %%mm2, %%mm4 \n\t"
1179 "paddw %%mm2, %%mm5 \n\t"
1180 "psrlw $6, %%mm4 \n\t"
1181 "psrlw $6, %%mm5 \n\t"
1184 "movq w06, %%mm2 \n\t" // 6
1185 "paddw %%mm2, %%mm4 \n\t"
1186 "paddw %%mm2, %%mm5 \n\t"
1187 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1188 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1189 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1190 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1193 "movq temp2, %%mm0 \n\t" // L3 - L4
1194 "movq temp3, %%mm1 \n\t" // H3 - H4
1196 "pxor %%mm2, %%mm2 \n\t"
1197 "pxor %%mm3, %%mm3 \n\t"
1199 // FIXME rounding error
1200 "psraw $1, %%mm0 \n\t" // (L3 - L4)/2
1201 "psraw $1, %%mm1 \n\t" // (H3 - H4)/2
1202 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1203 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1204 "pxor %%mm2, %%mm0 \n\t"
1205 "pxor %%mm3, %%mm1 \n\t"
1206 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1207 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1208 // "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1209 // "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1211 "pxor %%mm6, %%mm2 \n\t"
1212 "pxor %%mm7, %%mm3 \n\t"
1213 "pand %%mm2, %%mm4 \n\t"
1214 "pand %%mm3, %%mm5 \n\t"
1217 "pminsw %%mm0, %%mm4 \n\t"
1218 "pminsw %%mm1, %%mm5 \n\t"
1220 "movq %%mm4, %%mm2 \n\t"
1221 "psubusw %%mm0, %%mm2 \n\t"
1222 "psubw %%mm2, %%mm4 \n\t"
1223 "movq %%mm5, %%mm2 \n\t"
1224 "psubusw %%mm1, %%mm2 \n\t"
1225 "psubw %%mm2, %%mm5 \n\t"
1227 "pxor %%mm6, %%mm4 \n\t"
1228 "pxor %%mm7, %%mm5 \n\t"
1229 "psubw %%mm6, %%mm4 \n\t"
1230 "psubw %%mm7, %%mm5 \n\t"
1231 "packsswb %%mm5, %%mm4 \n\t"
1232 "movq (%%eax, %1, 2), %%mm0 \n\t"
1233 "paddb %%mm4, %%mm0 \n\t"
1234 "movq %%mm0, (%%eax, %1, 2) \n\t"
1235 "movq (%0, %1, 4), %%mm0 \n\t"
1236 "psubb %%mm4, %%mm0 \n\t"
1237 // "pxor %%mm0, %%mm0 \n\t"
1238 "movq %%mm0, (%0, %1, 4) \n\t"
1241 : "r" (src), "r" (stride), "r" (QP)
1245 const int l1= stride;
1246 const int l2= stride + l1;
1247 const int l3= stride + l2;
1248 const int l4= stride + l3;
1249 const int l5= stride + l4;
1250 const int l6= stride + l5;
1251 const int l7= stride + l6;
1252 const int l8= stride + l7;
1253 // const int l9= stride + l8;
1255 for(x=0; x<BLOCK_SIZE; x++)
1257 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1258 if(ABS(middleEnergy) < 8*QP)
1260 const int q=(src[l4] - src[l5])/2;
1261 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1262 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1264 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1268 d*= SIGN(-middleEnergy);
1289 //FIXME? |255-0| = 1
1291 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
// Checks whether the 8x8 block at src is mostly flat horizontally and, as a side
// effect, copies the block into tempBlock.
//   src    - first pixel of the block
//   stride - offset between two consecutive picture rows
// Returns nonzero when the number of nearly equal horizontal neighbour pairs
// exceeds hFlatnessThreshold.
1293 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
// NOTE(review): the comments on the next two lines say 0x7F / 0x7D but the constants
// are named b7E / b7C; confirm which is right against their definitions.
1301 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1302 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
// eax = address of tempBlock, mm0 = per-byte match accumulator (starts at 0)
1303 "leal tempBlock, %%eax \n\t"
1304 "pxor %%mm0, %%mm0 \n\t"
// HDC_CHECK_AND_CPY(i) handles one row:
//  - assembles the 8 pixels at (%1) from the loads at -4(%1) and 4(%1)
//  - subtracts every pixel from its right neighbour, biases the difference by mm7 and
//    compares it against mm6; each lane that passes adds 0xFF (= -1) to mm0
//  - stores the unmodified row at byte offset i of tempBlock
1306 #define HDC_CHECK_AND_CPY(i) \
1307 "movq -4(%1), %%mm2 \n\t"\
1308 "psrlq $32, %%mm2 \n\t"\
1309 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
1310 "movq %%mm2, %%mm1 \n\t"\
1311 "psrlq $8, %%mm2 \n\t"\
1312 "psubb %%mm1, %%mm2 \n\t"\
1313 "paddb %%mm7, %%mm2 \n\t"\
1314 "pcmpgtb %%mm6, %%mm2 \n\t"\
1315 "paddb %%mm2, %%mm0 \n\t"\
1316 "movq %%mm1," #i "(%%eax) \n\t"
1318 HDC_CHECK_AND_CPY(0)
1320 HDC_CHECK_AND_CPY(8)
1322 HDC_CHECK_AND_CPY(16)
1324 HDC_CHECK_AND_CPY(24)
1326 HDC_CHECK_AND_CPY(32)
1328 HDC_CHECK_AND_CPY(40)
1330 HDC_CHECK_AND_CPY(48)
1332 HDC_CHECK_AND_CPY(56)
// The top byte lane was compared against the zero shifted in by psrlq $8, so it is
// shifted out here; the remaining byte lanes are then summed into the lowest byte.
1334 "psllq $8, %%mm0 \n\t" // remove dummy value
1335 "movq %%mm0, %%mm1 \n\t"
1336 "psrlw $8, %%mm0 \n\t"
1337 "paddb %%mm1, %%mm0 \n\t"
1338 "movq %%mm0, %%mm1 \n\t"
1339 "psrlq $16, %%mm0 \n\t"
1340 "paddb %%mm1, %%mm0 \n\t"
1341 "movq %%mm0, %%mm1 \n\t"
1342 "psrlq $32, %%mm0 \n\t"
1343 "paddb %%mm1, %%mm0 \n\t"
1345 "movd %%mm0, %0 \n\t"
1347 : "r" (src), "r" (stride)
1350 // printf("%d\n", numEq);
// Every match added -1 to the accumulator, so negating the low byte yields the match count.
1351 numEq= (256 - (numEq & 0xFF)) &0xFF;
// Plain C path: per row, count neighbour pairs that differ by at most 1
// (difference + 1 lies in 0..2) and copy the 8 pixels of the row into tempBlock.
1354 for(y=0; y<BLOCK_SIZE; y++)
1356 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1357 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1358 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1359 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1360 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1361 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1362 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1363 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1364 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1365 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1366 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1367 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1368 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1369 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1370 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1374 /* if(abs(numEq - asmEq) > 0)
1376 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1377 for(int y=0; y<8; y++)
1379 for(int x=0; x<8; x++)
1381 printf("%d ", src[x + y*stride]);
1387 // printf("%d\n", numEq);
1388 return numEq > hFlatnessThreshold;
// Range check that guards the horizontal low pass filter: the plain C path rejects
// (returns 0) a row whose first and last pixel differ by more than 2*QP.
//   src    - first pixel of the block
//   stride - offset between two consecutive rows of src
//   QP     - quantiser of the block
// NOTE(review): the asm path loads the rows at (%1 + %2) and (%1 + 8*%2) and compares
// them bytewise, whereas the C path compares columns 0 and 7 of one row; the two do
// not look equivalent -- confirm which path is actually compiled.
1391 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1398 "movq (%1, %2), %%mm0 \n\t"
1399 "movq (%1, %2, 8), %%mm1 \n\t"
1400 "movq %%mm0, %%mm2 \n\t"
// |a - b| per byte, built from the two saturating differences
1401 "psubusb %%mm1, %%mm0 \n\t"
1402 "psubusb %%mm2, %%mm1 \n\t"
1403 "por %%mm1, %%mm0 \n\t" // ABS Diff
1405 "movq pQPb, %%mm7 \n\t" // QP,..., QP
1406 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
1407 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// dword compares against the constants b00 and bFF turn the byte result into the value
// that is returned through operand %0
1408 "pcmpeqd b00, %%mm0 \n\t"
1409 "psrlq $16, %%mm0 \n\t"
1410 "pcmpeqd bFF, %%mm0 \n\t"
1411 // "movd %%mm0, (%1, %2, 4)\n\t"
1412 "movd %%mm0, %0 \n\t"
1414 : "r" (src), "r" (stride)
// Plain C path
1418 if(abs(src[0] - src[7]) > 2*QP) return 0;
// Horizontal default deblocking filter. Reads the rows previously copied into tempBlock,
// corrects the two pixels on either side of the block edge (columns 3 and 4 of each
// 8 pixel row) and writes the result to dst.
//   dst    - destination of the filtered rows
//   stride - offset between two consecutive rows of dst
//   QP     - quantiser of the block; the C path filters only if |middleEnergy| < 8*QP
1424 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1429 "pxor %%mm7, %%mm7 \n\t"
1430 "movq bm00001000, %%mm6 \n\t"
1431 "movd %2, %%mm5 \n\t" // QP
1432 "movq %%mm5, %%mm4 \n\t"
1433 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1434 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1435 "psllq $24, %%mm4 \n\t"
1436 "pxor %%mm5, %%mm5 \n\t" // 0
1437 "psubb %%mm4, %%mm5 \n\t" // -3QP (in byte 3)
1438 "leal tempBlock, %%eax \n\t"
1440 //FIXME? "unroll by 2" and mix
// Two alternative per-row macro bodies follow (their #define / conditional lines are
// not part of this excerpt). The first one uses pshufw and pminub, the second one
// builds the same minimum from shifts and saturating subtractions.
1443 "movq " #i "(%%eax), %%mm0 \n\t"\
1444 "movq %%mm0, %%mm1 \n\t"\
1445 "movq %%mm0, %%mm2 \n\t"\
1446 "psrlq $8, %%mm1 \n\t"\
1447 "psubusb %%mm1, %%mm2 \n\t"\
1448 "psubusb %%mm0, %%mm1 \n\t"\
1449 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1450 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1451 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
1452 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1453 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1454 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1455 "paddb %%mm5, %%mm1 \n\t"\
1456 "psubusb %%mm5, %%mm1 \n\t"\
1457 "psrlw $2, %%mm1 \n\t"\
1458 "pxor %%mm2, %%mm1 \n\t"\
1459 "psubb %%mm2, %%mm1 \n\t"\
1460 "pand %%mm6, %%mm1 \n\t"\
1461 "psubb %%mm1, %%mm0 \n\t"\
1462 "psllq $8, %%mm1 \n\t"\
1463 "paddb %%mm1, %%mm0 \n\t"\
1464 "movd %%mm0, (%0) \n\t"\
1465 "psrlq $32, %%mm0 \n\t"\
1466 "movd %%mm0, 4(%0) \n\t"
1469 "movq " #i "(%%eax), %%mm0 \n\t"\
1470 "movq %%mm0, %%mm1 \n\t"\
1471 "movq %%mm0, %%mm2 \n\t"\
1472 "psrlq $8, %%mm1 \n\t"\
1473 "psubusb %%mm1, %%mm2 \n\t"\
1474 "psubusb %%mm0, %%mm1 \n\t"\
1475 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1476 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1477 "movq %%mm1, %%mm3 \n\t"\
1478 "psllq $32, %%mm3 \n\t"\
1479 "movq %%mm3, %%mm4 \n\t"\
1480 "psubusb %%mm1, %%mm4 \n\t"\
1481 "psubb %%mm4, %%mm3 \n\t"\
1482 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1483 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1484 "paddb %%mm5, %%mm1 \n\t"\
1485 "psubusb %%mm5, %%mm1 \n\t"\
1486 "psrlw $2, %%mm1 \n\t"\
1487 "pxor %%mm2, %%mm1 \n\t"\
1488 "psubb %%mm2, %%mm1 \n\t"\
1489 "pand %%mm6, %%mm1 \n\t"\
1490 "psubb %%mm1, %%mm0 \n\t"\
1491 "psllq $8, %%mm1 \n\t"\
1492 "paddb %%mm1, %%mm0 \n\t"\
1493 "movd %%mm0, (%0) \n\t"\
1494 "psrlq $32, %%mm0 \n\t"\
1495 "movd %%mm0, 4(%0) \n\t"
1514 : "r" (dst), "r" (stride), "r" (QP)
// Plain C path: works row by row on the copy held in tempBlock.
1518 uint8_t *src= tempBlock;
1521 for(y=0; y<BLOCK_SIZE; y++)
// The block edge lies between columns 3 and 4. The energy across the edge uses the same
// 5/2 weighting as leftEnergy and rightEnergy below, i.e. 5*(p4 - p3) + 2*(p2 - p5).
// (The first term used to read src[4] - src[5], which does not straddle the edge.)
1532 const int middleEnergy= 5*(src[4] - src[3]) + 2*(src[2] - src[5]);
1533 if(ABS(middleEnergy) < 8*QP)
// q = half the step across the edge
1535 const int q=(src[3] - src[4])/2;
1536 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
1537 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
// correction magnitude: edge energy minus the smaller energy of the two neighbourhoods
1539 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
// the correction is given the opposite sign of middleEnergy
1543 d*= SIGN(-middleEnergy);
1566 * Do a horizontal low pass filter on the 8x8 block
1567 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1568 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version)
// Horizontal low pass filter for one 8x8 block. The rows are read from tempBlock and
// the filtered rows are written to dst.
//   dst    - destination rows; the C path also reads dst[-1] and dst[8] as outside neighbours
//   stride - offset between two consecutive rows of dst
//   QP     - quantiser; in the C path it decides whether the outside neighbours are used
1570 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1573 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1574 asm volatile( //"movv %0 %1 %2\n\t"
1576 "pxor %%mm7, %%mm7 \n\t"
1577 "leal tempBlock, %%eax \n\t"
// HLP1 and HLP2 are older per-row variants that filter the row at (%0) in place.
// HLP(i) is mapped to HLP3(i) further down; no use of HLP1 / HLP2 is visible in this excerpt.
// NOTE(review): HLP1 averages into mm4, which HLP1 itself never loads -- confirm it is unused.
1579 #define HLP1 "movq (%0), %%mm0 \n\t"\
1580 "movq %%mm0, %%mm1 \n\t"\
1581 "psllq $8, %%mm0 \n\t"\
1582 PAVGB(%%mm1, %%mm0)\
1583 "psrlw $8, %%mm0 \n\t"\
1584 "pxor %%mm1, %%mm1 \n\t"\
1585 "packuswb %%mm1, %%mm0 \n\t"\
1586 "movq %%mm0, %%mm1 \n\t"\
1587 "movq %%mm0, %%mm2 \n\t"\
1588 "psllq $32, %%mm0 \n\t"\
1589 "paddb %%mm0, %%mm1 \n\t"\
1590 "psllq $16, %%mm2 \n\t"\
1591 PAVGB(%%mm2, %%mm0)\
1592 "movq %%mm0, %%mm3 \n\t"\
1593 "pand bm11001100, %%mm0 \n\t"\
1594 "paddusb %%mm0, %%mm3 \n\t"\
1595 "psrlq $8, %%mm3 \n\t"\
1596 PAVGB(%%mm1, %%mm4)\
1597 PAVGB(%%mm3, %%mm2)\
1598 "psrlq $16, %%mm2 \n\t"\
1599 "punpcklbw %%mm2, %%mm2 \n\t"\
1600 "movq %%mm2, (%0) \n\t"\
1602 #define HLP2 "movq (%0), %%mm0 \n\t"\
1603 "movq %%mm0, %%mm1 \n\t"\
1604 "psllq $8, %%mm0 \n\t"\
1605 PAVGB(%%mm1, %%mm0)\
1606 "psrlw $8, %%mm0 \n\t"\
1607 "pxor %%mm1, %%mm1 \n\t"\
1608 "packuswb %%mm1, %%mm0 \n\t"\
1609 "movq %%mm0, %%mm2 \n\t"\
1610 "psllq $32, %%mm0 \n\t"\
1611 "psllq $16, %%mm2 \n\t"\
1612 PAVGB(%%mm2, %%mm0)\
1613 "movq %%mm0, %%mm3 \n\t"\
1614 "pand bm11001100, %%mm0 \n\t"\
1615 "paddusb %%mm0, %%mm3 \n\t"\
1616 "psrlq $8, %%mm3 \n\t"\
1617 PAVGB(%%mm3, %%mm2)\
1618 "psrlq $16, %%mm2 \n\t"\
1619 "punpcklbw %%mm2, %%mm2 \n\t"\
1620 "movq %%mm2, (%0) \n\t"\
1622 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1632 Implemented Exact 7-Tap
// HLP3(i) filters the row stored at byte offset i of tempBlock and writes it to (%0).
// Step 1 averages each pixel with the average of its left and right neighbour (the edge
// pixels are duplicated through the bm00000001 / bm10000000 masks); step 2 does the same
// with the neighbours two pixels away.
// Two definitions follow; the conditional selecting between them is not part of this
// excerpt. The first builds the two-pixel shifts with pshufw, the second with
// psrlq / psllq plus the masks bm11000000 / bm00000011.
1644 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1645 "movq %%mm0, %%mm1 \n\t"\
1646 "movq %%mm0, %%mm2 \n\t"\
1647 "movq %%mm0, %%mm3 \n\t"\
1648 "movq %%mm0, %%mm4 \n\t"\
1649 "psllq $8, %%mm1 \n\t"\
1650 "psrlq $8, %%mm2 \n\t"\
1651 "pand bm00000001, %%mm3 \n\t"\
1652 "pand bm10000000, %%mm4 \n\t"\
1653 "por %%mm3, %%mm1 \n\t"\
1654 "por %%mm4, %%mm2 \n\t"\
1655 PAVGB(%%mm2, %%mm1)\
1656 PAVGB(%%mm1, %%mm0)\
1658 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1659 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1660 PAVGB(%%mm3, %%mm4)\
1661 PAVGB(%%mm4, %%mm0)\
1662 "movd %%mm0, (%0) \n\t"\
1663 "psrlq $32, %%mm0 \n\t"\
1664 "movd %%mm0, 4(%0) \n\t"
1666 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1667 "movq %%mm0, %%mm1 \n\t"\
1668 "movq %%mm0, %%mm2 \n\t"\
1669 "movq %%mm0, %%mm3 \n\t"\
1670 "movq %%mm0, %%mm4 \n\t"\
1671 "psllq $8, %%mm1 \n\t"\
1672 "psrlq $8, %%mm2 \n\t"\
1673 "pand bm00000001, %%mm3 \n\t"\
1674 "pand bm10000000, %%mm4 \n\t"\
1675 "por %%mm3, %%mm1 \n\t"\
1676 "por %%mm4, %%mm2 \n\t"\
1677 PAVGB(%%mm2, %%mm1)\
1678 PAVGB(%%mm1, %%mm0)\
1680 "movq %%mm0, %%mm3 \n\t"\
1681 "movq %%mm0, %%mm4 \n\t"\
1682 "movq %%mm0, %%mm5 \n\t"\
1683 "psrlq $16, %%mm3 \n\t"\
1684 "psllq $16, %%mm4 \n\t"\
1685 "pand bm11000000, %%mm5 \n\t"\
1686 "por %%mm5, %%mm3 \n\t"\
1687 "movq %%mm0, %%mm5 \n\t"\
1688 "pand bm00000011, %%mm5 \n\t"\
1689 "por %%mm5, %%mm4 \n\t"\
1690 PAVGB(%%mm3, %%mm4)\
1691 PAVGB(%%mm4, %%mm0)\
1692 "movd %%mm0, (%0) \n\t"\
1693 "psrlq $32, %%mm0 \n\t"\
1694 "movd %%mm0, 4(%0) \n\t"
1697 #define HLP(i) HLP3(i)
1717 : "r" (dst), "r" (stride)
// Plain C path: 9-tap filter, one row per iteration, source samples taken from tempBlock.
1722 uint8_t *temp= tempBlock;
1724 for(y=0; y<BLOCK_SIZE; y++)
// first / last stand in for the samples beyond the block: the outside neighbour is used
// when it differs from the edge pixel by less than QP, otherwise the edge pixel itself.
1726 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1727 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// sums[i] = sum of two horizontally adjacent samples, with first / last at the ends
1730 sums[0] = first + temp[0];
1731 sums[1] = temp[0] + temp[1];
1732 sums[2] = temp[1] + temp[2];
1733 sums[3] = temp[2] + temp[3];
1734 sums[4] = temp[3] + temp[4];
1735 sums[5] = temp[4] + temp[5];
1736 sums[6] = temp[5] + temp[6];
1737 sums[7] = temp[6] + temp[7];
1738 sums[8] = temp[7] + last;
// Weighted sums, rounded (+8) and divided by 16 (>>4).
// Note: + binds tighter than <<, so (a + b + c<<1) below means ((a + b + c)<<1).
// Each dst[k] on a right hand side is read before that element is overwritten.
1740 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1741 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1742 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1743 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1744 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
1745 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1746 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1747 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
// Deringing filter. The part visible here only determines the smallest and the largest
// pixel value of the rows it loads and averages the two with PAVGB.
//   src    - first pixel of the area to examine
//   stride - offset between two consecutive rows
//   QP     - quantiser, passed to the asm as operand %2
// NOTE(review): this routine looks unfinished in the visible excerpt:
//  - "pcmpeq" below carries no element size suffix (b/w/d)
//  - the movq inside FIND_MIN_MAX has a comma after %%mm0
//  - FIND_MIN_MAX takes one parameter but is invoked with comma separated address parts
//  - (%%ebx, %1, 2) is used twice, while the address table lists ebx+4%1 for row 9
// Confirm whether this code is compiled at all before relying on it.
1756 static inline void dering(uint8_t src[], int stride, int QP)
1762 "leal (%0, %1), %%eax \n\t"
1763 "leal (%%eax, %1, 4), %%ebx \n\t"
1764 // 0 1 2 3 4 5 6 7 8 9
1765 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm6 = running minimum (meant to start at all ones), mm7 = running maximum (starts at 0)
1767 "pcmpeq %%mm6, %%mm6 \n\t"
1768 "pxor %%mm7, %%mm7 \n\t"
// Folds the 8 pixels at addr into the per-column minimum (mm6) and maximum (mm7)
1770 #define FIND_MIN_MAX(addr)\
1771 "movq (" #addr "), %%mm0, \n\t"\
1772 "pminub %%mm0, %%mm6 \n\t"\
1773 "pmaxub %%mm0, %%mm7 \n\t"
1777 FIND_MIN_MAX(%%eax, %1)
1778 FIND_MIN_MAX(%%eax, %1, 2)
1779 FIND_MIN_MAX(%0, %1, 4)
1781 FIND_MIN_MAX(%%ebx, %1)
1782 FIND_MIN_MAX(%%ebx, %1, 2)
1783 FIND_MIN_MAX(%0, %1, 8)
1784 FIND_MIN_MAX(%%ebx, %1, 2)
// Reduce the 8 column minima to one value in the lowest byte (shift by 32, 16, 8 bits)
1786 "movq %%mm6, %%mm4 \n\t"
1787 "psrlq $32, %%mm6 \n\t"
1788 "pminub %%mm4, %%mm6 \n\t"
1789 "movq %%mm6, %%mm4 \n\t"
1790 "psrlq $16, %%mm6 \n\t"
1791 "pminub %%mm4, %%mm6 \n\t"
1792 "movq %%mm6, %%mm4 \n\t"
1793 "psrlq $8, %%mm6 \n\t"
1794 "pminub %%mm4, %%mm6 \n\t" // min of pixels
// Same reduction for the column maxima
1796 "movq %%mm7, %%mm4 \n\t"
1797 "psrlq $32, %%mm7 \n\t"
1798 "pmaxub %%mm4, %%mm7 \n\t"
1799 "movq %%mm7, %%mm4 \n\t"
1800 "psrlq $16, %%mm7 \n\t"
1801 "pmaxub %%mm4, %%mm7 \n\t"
1802 "movq %%mm7, %%mm4 \n\t"
1803 "psrlq $8, %%mm7 \n\t"
1804 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1805 PAVGB(%%mm6, %%mm7) // (max + min)/2
1808 : : "r" (src), "r" (stride), "r" (QP)
1822 * the mode value is interpreted as a quality value if it is negative; its range is then (-1 ... -63)
1823 * -63 is the best quality, -1 is the worst
// Public entry point: post-processes the three planes src[0..2] into dst[0..2].
// A negative mode is treated as a quality level and translated by getModeForQuality(-mode).
// Plane 0 is filtered at full size; both sizes are then halved for planes 1 and 2.
1826 void postprocess(unsigned char * src[], int src_stride,
1827 unsigned char * dst[], int dst_stride,
1828 int horizontal_size, int vertical_size,
1829 QP_STORE_T *QP_store, int QP_stride,
1833 if(mode<0) mode= getModeForQuality(-mode);
1836 long long T= rdtsc();
1837 for(int y=vertical_size-1; y>=0 ; y--)
1838 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
1839 // memcpy(dst[0], src[0],src_stride*vertical_size);
1840 printf("%4dk\r", (rdtsc()-T)/1000);
1845 long long T= rdtsc();
1846 while( (rdtsc() - T)/1000 < 4000);
// first plane
1850 postProcess(src[0], src_stride, dst[0], dst_stride,
1851 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
1853 horizontal_size >>= 1;
1854 vertical_size >>= 1;
// move bits 4..7 of the mode into bits 0..3 for the remaining planes; bits 8 and up are kept
1857 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
1861 postProcess(src[1], src_stride, dst[1], dst_stride,
1862 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1863 postProcess(src[2], src_stride, dst[2], dst_stride,
1864 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
// Unfiltered copy of planes 1 and 2: a plane occupies stride * number_of_rows bytes,
// so the row count (vertical_size) is the multiplier, not the width.
1868 memcpy(dst[1], src[1], src_stride*vertical_size);
1869 memcpy(dst[2], src[2], src_stride*vertical_size);
1873 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
// Maps a quality level to a set of filter flags taken from the modes table.
// The table index is (quality*6)>>6, i.e. quality*6/64 rounded down.
// NOTE(review): no range check on quality is visible here; callers presumably pass
// 1..63 -- confirm, since larger or negative values would index outside the table.
1876 int getModeForQuality(int quality){
// each entry enables one more filter than the entry before it
1879 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
1880 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
1881 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
1882 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
1883 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
1886 return modes[ (quality*6) >>6 ];
1892 * Copies a block from src to dst and fixes the black level
1893 * numLines must be a multiple of 4
1894 * levelFix == 0 -> do not touch the brightness & contrast
// Copies numLines rows of one block from src to dst. Two variants are visible: one
// that rescales every pixel with packedYOffset / packedYScale and one that copies the
// pixels unchanged (the condition selecting between them is not part of this excerpt;
// presumably it is levelFix -- confirm).
//   dst, dstStride - destination and its row offset
//   src, srcStride - source and its row offset
//   numLines       - number of rows to copy
//   levelFix       - nonzero requests the brightness / contrast correction
1896 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
1897 int numLines, int levelFix)
// NOTE(review): operand %4 is stored in temp0; presumably it is numLines -- confirm
// against the operand list, which is not part of this excerpt.
1904 "movl %4, %%eax \n\t"
1905 "movl %%eax, temp0\n\t"
// eax = 2 * operand %2, ebx = 2 * operand %3: the step for two rows at a time
1908 "leal (%2,%2), %%eax \n\t"
1909 "leal (%3,%3), %%ebx \n\t"
1910 "movq packedYOffset, %%mm2 \n\t"
1911 "movq packedYScale, %%mm3 \n\t"
// SCALED_CPY processes two rows: subtract the offset (mm2) with unsigned saturation,
// unpack each byte into the high byte of a 16 bit word, keep the high 16 bits of the
// product with the scale (mm3), pack back to bytes with unsigned saturation and store.
1913 #define SCALED_CPY \
1914 "movq (%0), %%mm0 \n\t"\
1915 "movq (%0,%2), %%mm1 \n\t"\
1916 "psubusb %%mm2, %%mm0 \n\t"\
1917 "psubusb %%mm2, %%mm1 \n\t"\
1918 "pxor %%mm4, %%mm4 \n\t"\
1919 "pxor %%mm5, %%mm5 \n\t"\
1920 "punpcklbw %%mm0, %%mm4 \n\t"\
1921 "punpckhbw %%mm0, %%mm5 \n\t"\
1922 "pmulhuw %%mm3, %%mm4 \n\t"\
1923 "pmulhuw %%mm3, %%mm5 \n\t"\
1924 "packuswb %%mm5, %%mm4 \n\t"\
1925 "movq %%mm4, (%1) \n\t"\
1926 "pxor %%mm4, %%mm4 \n\t"\
1927 "pxor %%mm5, %%mm5 \n\t"\
1928 "punpcklbw %%mm1, %%mm4 \n\t"\
1929 "punpckhbw %%mm1, %%mm5 \n\t"\
1930 "pmulhuw %%mm3, %%mm4 \n\t"\
1931 "pmulhuw %%mm3, %%mm5 \n\t"\
1932 "packuswb %%mm5, %%mm4 \n\t"\
1933 "movq %%mm4, (%1, %3) \n\t"\
// advance both pointers by two rows
1937 "addl %%eax, %0 \n\t"
1938 "addl %%ebx, %1 \n\t"
1940 "addl %%eax, %0 \n\t"
1941 "addl %%ebx, %1 \n\t"
// Plain C path: copy BLOCK_SIZE bytes per row
1955 for(i=0; i<numLines; i++)
1956 memcpy( &(dst[dstStride*i]),
1957 &(src[srcStride*i]), BLOCK_SIZE);
// Second variant: unscaled copy.
// NOTE(review): mm2 / mm3 are loaded below but SIMPLE_CPY never references them, and
// temp0 is written again; these look like leftovers of the scaled variant -- confirm.
1964 "movl %4, %%eax \n\t"
1965 "movl %%eax, temp0\n\t"
1968 "leal (%2,%2), %%eax \n\t"
1969 "leal (%3,%3), %%ebx \n\t"
1970 "movq packedYOffset, %%mm2 \n\t"
1971 "movq packedYScale, %%mm3 \n\t"
// SIMPLE_CPY moves two rows of 8 bytes each without modification
1973 #define SIMPLE_CPY \
1974 "movq (%0), %%mm0 \n\t"\
1975 "movq (%0,%2), %%mm1 \n\t"\
1976 "movq %%mm0, (%1) \n\t"\
1977 "movq %%mm1, (%1, %3) \n\t"\
1981 "addl %%eax, %0 \n\t"
1982 "addl %%ebx, %1 \n\t"
1984 "addl %%eax, %0 \n\t"
1985 "addl %%ebx, %1 \n\t"
// Plain C path: copy BLOCK_SIZE bytes per row
1999 for(i=0; i<numLines; i++)
2000 memcpy( &(dst[dstStride*i]),
2001 &(src[srcStride*i]), BLOCK_SIZE);
2008 * Filters array of bytes (Y or U or V values)
2010 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2011 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2014 /* we need 64bit here otherwise we are going to have a problem
2015 after watching a black picture for 5 hours*/
2016 static uint64_t *yHistogram= NULL;
2017 int black=0, white=255; // blackest black and whitest white in the picture
2020 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2027 yHistogram= (uint64_t*)malloc(8*256);
2028 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2035 static int framenum= -1;
2036 uint64_t maxClipped;
2041 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2043 for(i=0; i<256; i++)
2045 sum+= yHistogram[i];
2046 // printf("%d ", yHistogram[i]);
2050 /* we always get a completely black picture first */
2052 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2055 for(black=255; black>0; black--)
2057 if(clipped < maxClipped) break;
2058 clipped-= yHistogram[black];
2062 for(white=0; white<256; white++)
2064 if(clipped < maxClipped) break;
2065 clipped-= yHistogram[white];
2068 // we cannot handle negative corrections
2069 packedYOffset= MAX(black - minAllowedY, 0);
2070 packedYOffset|= packedYOffset<<32;
2071 packedYOffset|= packedYOffset<<16;
2072 packedYOffset|= packedYOffset<<8;
2074 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2076 packedYScale= (uint16_t)(scale*256.0 + 0.5);
2077 packedYScale|= packedYScale<<32;
2078 packedYScale|= packedYScale<<16;
2082 packedYScale= 0x0100010001000100LL;
2086 for(x=0; x<width; x+=BLOCK_SIZE)
2087 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2089 for(y=0; y<height; y+=BLOCK_SIZE)
2091 //1% speedup if these are here instead of the inner loop
2092 uint8_t *srcBlock= &(src[y*srcStride]);
2093 uint8_t *dstBlock= &(dst[y*dstStride]);
2094 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
2095 uint8_t *vertBlock= &(dstBlock[dstStride*3]);
2097 // finish 1 block before the next, otherwise we might have a problem
2098 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2099 for(x=0; x<width; x+=BLOCK_SIZE)
2101 const int stride= dstStride;
2103 QPs[(y>>3)*QPStride + (x>>3)]:
2104 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
2107 "movd %0, %%mm7 \n\t"
2108 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2109 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2110 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
2111 "movq %%mm7, pQPb \n\t"
2124 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
2125 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
2126 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
2127 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
2128 #elif defined(HAVE_3DNOW)
2129 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2130 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
2131 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
2132 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
2133 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
2136 if(!isColor) yHistogram[ srcBlock[0] ]++;
2138 blockCopy(vertBlock + dstStride*2, dstStride,
2139 vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
2147 if(mode & V_DEBLOCK)
2149 if(mode & V_RK1_FILTER)
2150 vertRK1Filter(vertBlock, stride, QP);
2151 else if(mode & V_X1_FILTER)
2152 vertX1Filter(vertBlock, stride, QP);
2155 if( isVertDC(vertBlock, stride))
2157 if(isVertMinMaxOk(vertBlock, stride, QP))
2158 doVertLowPass(vertBlock, stride, QP);
2161 doVertDefFilter(vertBlock, stride, QP);
2171 blockCopy(vertBlock + dstStride*1, dstStride,
2172 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
2175 if(x - 8 >= 0 && x<width)
2180 if(mode & H_DEBLOCK)
2182 if(mode & H_X1_FILTER)
2183 horizX1Filter(dstBlock-4, stride, QP);
2186 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
2188 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2189 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2192 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
2200 dering(dstBlock - 9 - stride, stride, QP);
2203 dering(dstBlock - stride*9 + width-9, stride, QP);
2204 //FIXME dering filter will not be applied to last block (bottom right)
2214 asm volatile("femms");
2215 #elif defined (HAVE_MMX)
2216 asm volatile("emms");
2220 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
2221 sumTime= rdtsc() - sumTime;
2223 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
2224 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2225 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)