2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec Ec
28 doHorizDefFilter E ac ac
30 Vertical RKAlgo1 E a a
33 LinIpolDeinterlace e E E*
34 CubicIpolDeinterlace a e e*
35 LinBlendDeinterlace e E E*
36 MedianDeinterlace Ec Ec
39 * I don't have a 3DNow CPU -> it's untested
40 E = Exact implementation
41 e = almost exact implementation (slightly different rounding,...)
42 a = alternative / approximate impl
43 c = checked against the other implementations (-vo md5)
48 verify that everything works as it should (how?)
49 reduce the time wasted on the mem transfer
51 implement everything in C at least (done at the moment but ...)
52 unroll stuff if instructions depend too much on the prior one
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54 move YScale thing to the end instead of fixing QP
55 write a faster and higher quality deblocking filter :)
56 do something about the speed of the horizontal filters
57 make the mainloop more flexible (variable number of blocks at once
58 (the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters
61 fix warnings (unused vars, ...)
62 noise reduction filters
69 //Changelog: use the CVS log
75 #include "../config.h"
79 #include "postprocess.h"
// Generic helper macros. Each one expands its argument more than once, so
// arguments with side effects (e.g. i++) must not be passed to them.
81 #define MIN(a,b) ((a) > (b) ? (b) : (a))
82 #define MAX(a,b) ((a) < (b) ? (b) : (a))
83 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// Note: SIGN(0) evaluates to -1; only strictly positive values yield 1.
84 #define SIGN(a) ((a) > 0 ? 1 : -1)
// PAVGB(a,b) pastes its two operands into the text of a packed byte-average
// instruction for use inside the asm strings below: pavgb in the first
// variant, pavgusb when HAVE_3DNOW is defined. The conditional that selects
// the first variant is not visible in this listing.
87 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
88 #elif defined (HAVE_3DNOW)
89 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// Size constants; their users are not visible in this chunk.
92 #define GET_MODE_BUFFER_SIZE 500
93 #define OPTIONS_ARRAY_SIZE 10
// 64-bit constants and scratch variables that the inline asm refers to by name.
// Packed offset/scale, four 16-bit words each; presumably the parameters of the
// brightness correction - their users are not visible in this chunk, confirm.
96 static uint64_t packedYOffset= 0x0000000000000000LL;
97 static uint64_t packedYScale= 0x0100010001000100LL;
// wNN: the hexadecimal 16-bit value NN repeated in all four words.
98 static uint64_t w05= 0x0005000500050005LL;
99 static uint64_t w20= 0x0020002000200020LL;
100 static uint64_t w1400= 0x1400140014001400LL;
// bmXXXXXXXX: byte masks. Each digit of the name stands for one byte, most
// significant byte first; a 1 means that byte is 0xFF, a 0 means it is 0x00.
101 static uint64_t bm00000001= 0x00000000000000FFLL;
102 static uint64_t bm00010000= 0x000000FF00000000LL;
103 static uint64_t bm00001000= 0x00000000FF000000LL;
104 static uint64_t bm10000000= 0xFF00000000000000LL;
105 static uint64_t bm10000001= 0xFF000000000000FFLL;
106 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
107 static uint64_t bm00000011= 0x000000000000FFFFLL;
108 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
109 static uint64_t bm11000000= 0xFFFF000000000000LL;
110 static uint64_t bm00011000= 0x000000FFFF000000LL;
111 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
112 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
// bNN: the hexadecimal byte value NN repeated in all eight bytes.
113 static uint64_t b00= 0x0000000000000000LL;
114 static uint64_t b01= 0x0101010101010101LL;
115 static uint64_t b02= 0x0202020202020202LL;
116 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
117 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
118 static uint64_t b20= 0x2020202020202020LL;
119 static uint64_t b80= 0x8080808080808080LL;
120 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
121 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
122 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
// Scratch qwords used as spill space by the asm (temp0..temp3 are written and
// read back by name in doVertDefFilter).
// NOTE(review): as file-level statics they make any filter that writes them
// non-reentrant - confirm that the filters are never run concurrently.
123 static uint64_t temp0=0;
124 static uint64_t temp1=0;
125 static uint64_t temp2=0;
126 static uint64_t temp3=0;
127 static uint64_t temp4=0;
128 static uint64_t temp5=0;
// Read by the asm as the quantizer replicated per byte (see the QP,..., QP
// comments at its uses); the code that sets it is not visible in this chunk.
129 static uint64_t pQPb=0;
130 static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data
// Flatness thresholds; isVertDC compares its count of near-equal neighbouring
// pixels against vFlatnessThreshold.
132 int hFlatnessThreshold= 56 - 16;
133 int vFlatnessThreshold= 56 - 16;
135 //amount of "black" you are willing to lose to get a brightness corrected picture
136 double maxClippedThreshold= 0.01;
141 static struct PPFilter filters[]=
// Table of the known filters, one entry per filter: short name, long name,
// three integer fields and a mode flag. The meaning of the integer fields is
// defined by struct PPFilter, which is declared outside this chunk.
// The list is terminated by an entry whose names are NULL.
// NOTE(review): the vr / rkvdeblock entry is mapped to H_RK1_FILTER although the
// other vertical entries (vb, v1) use V_ flags - confirm that this is intended.
143 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
144 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
145 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
146 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
147 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
148 {"dr", "dering", 1, 5, 6, DERING},
149 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
150 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
151 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
152 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
153 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
154 {NULL, NULL,0,0,0,0} //End Marker
157 static char *replaceTable[]=
// Flat list of string pairs: an alias followed by the filter string it stands
// for. default and de share one expansion, fast and fa share the other.
// The code that consumes the table and its terminator are not visible here.
159 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels",
160 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels",
161 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
162 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
167 static inline long long rdtsc()
// Executes the x86 rdtsc instruction (reads the time stamp counter).
// The asm operand list and the return statement are not visible in this listing.
170 asm volatile( "rdtsc\n\t"
173 // printf("%d\n", int(l/1000));
179 static inline void prefetchnta(void *p)
// Issues a prefetchnta instruction with a memory operand addressed through
// asm operand %0; presumably that operand is p (operand list not visible).
181 asm volatile( "prefetchnta (%0)\n\t"
186 static inline void prefetcht0(void *p)
// Issues a prefetcht0 instruction with a memory operand addressed through
// asm operand %0; presumably that operand is p (operand list not visible).
188 asm volatile( "prefetcht0 (%0)\n\t"
193 static inline void prefetcht1(void *p)
// Issues a prefetcht1 instruction with a memory operand addressed through
// asm operand %0; presumably that operand is p (operand list not visible).
195 asm volatile( "prefetcht1 (%0)\n\t"
200 static inline void prefetcht2(void *p)
// Issues a prefetcht2 instruction with a memory operand addressed through
// asm operand %0; presumably that operand is p (operand list not visible).
202 asm volatile( "prefetcht2 (%0)\n\t"
208 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
210 * Check if the middle 8x8 Block in the given 8x16 block is flat
212 static inline int isVertDC(uint8_t src[], int stride){
// Counts the vertically adjacent pixel pairs in the middle 8x8 block whose
// values differ by at most 1 and returns 1 when that count is greater than
// vFlatnessThreshold, otherwise 0.
// src: top-left pixel of the 8x16 block; stride: byte distance between rows.
215 src+= stride*4; // src points to begin of the 8x8 Block
// MMX path: per byte, (row - next row) + 0x7E is compared (signed) with 0x7C,
// which holds only for wrapped differences of -1, 0 or +1 (this is why 255
// versus 0 counts as a difference of 1, see the FIXME above). Every hit adds
// 0xFF, i.e. -1, to the byte counters in mm0. Seven row pairs are compared.
// The lines advancing the row pointer between loads are not visible here.
219 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
220 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
221 "movq (%1), %%mm0 \n\t"
223 "movq (%1), %%mm1 \n\t"
224 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
225 "paddb %%mm7, %%mm0 \n\t"
226 "pcmpgtb %%mm6, %%mm0 \n\t"
229 "movq (%1), %%mm2 \n\t"
230 "psubb %%mm2, %%mm1 \n\t"
231 "paddb %%mm7, %%mm1 \n\t"
232 "pcmpgtb %%mm6, %%mm1 \n\t"
233 "paddb %%mm1, %%mm0 \n\t"
236 "movq (%1), %%mm1 \n\t"
237 "psubb %%mm1, %%mm2 \n\t"
238 "paddb %%mm7, %%mm2 \n\t"
239 "pcmpgtb %%mm6, %%mm2 \n\t"
240 "paddb %%mm2, %%mm0 \n\t"
243 "movq (%1), %%mm2 \n\t"
244 "psubb %%mm2, %%mm1 \n\t"
245 "paddb %%mm7, %%mm1 \n\t"
246 "pcmpgtb %%mm6, %%mm1 \n\t"
247 "paddb %%mm1, %%mm0 \n\t"
250 "movq (%1), %%mm1 \n\t"
251 "psubb %%mm1, %%mm2 \n\t"
252 "paddb %%mm7, %%mm2 \n\t"
253 "pcmpgtb %%mm6, %%mm2 \n\t"
254 "paddb %%mm2, %%mm0 \n\t"
257 "movq (%1), %%mm2 \n\t"
258 "psubb %%mm2, %%mm1 \n\t"
259 "paddb %%mm7, %%mm1 \n\t"
260 "pcmpgtb %%mm6, %%mm1 \n\t"
261 "paddb %%mm1, %%mm0 \n\t"
264 "movq (%1), %%mm1 \n\t"
265 "psubb %%mm1, %%mm2 \n\t"
266 "paddb %%mm7, %%mm2 \n\t"
267 "pcmpgtb %%mm6, %%mm2 \n\t"
268 "paddb %%mm2, %%mm0 \n\t"
// Horizontal reduction: fold the eight byte counters into the lowest byte.
271 "movq %%mm0, %%mm1 \n\t"
272 "psrlw $8, %%mm0 \n\t"
273 "paddb %%mm1, %%mm0 \n\t"
274 "movq %%mm0, %%mm1 \n\t"
275 "psrlq $16, %%mm0 \n\t"
276 "paddb %%mm1, %%mm0 \n\t"
277 "movq %%mm0, %%mm1 \n\t"
278 "psrlq $32, %%mm0 \n\t"
279 "paddb %%mm1, %%mm0 \n\t"
281 "movd %%mm0, %0 \n\t"
283 : "r" (src), "r" (stride)
285 // printf("%d\n", numEq);
// The low byte holds the negated hit count (modulo 256); turn it into a
// positive count.
286 numEq= (256 - (numEq & 0xFF)) &0xFF;
290 // uint8_t *temp= src;
// C path: the same test, difference in {-1, 0, +1}, for the 8 columns of
// each of the BLOCK_SIZE-1 row pairs.
293 for(y=0; y<BLOCK_SIZE-1; y++)
295 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
296 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
297 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
298 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
299 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
300 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
301 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
302 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
306 /* if(abs(numEq - asmEq) > 0)
308 printf("\nasm:%d c:%d\n", asmEq, numEq);
309 for(int y=0; y<8; y++)
311 for(int x=0; x<8; x++)
313 printf("%d ", temp[x + y*stride]);
319 // for(int i=0; i<numEq/8; i++) src[i]=255;
320 return (numEq > vFlatnessThreshold) ? 1 : 0;
323 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
// Tests whether, in each of the 8 columns, the pixel in row 1 and the pixel in
// row 8 differ by no more than 2*QP (see the C loop below, which clears isOk2
// as soon as one column exceeds 2*QP).
// src: top-left pixel of the block; stride: byte distance between rows.
// The asm path takes QP from the global pQPb, not from the QP parameter.
// The return statement is not visible in this listing.
330 "movq (%1, %2), %%mm0 \n\t"
331 "movq (%1, %2, 8), %%mm1 \n\t"
332 "movq %%mm0, %%mm2 \n\t"
333 "psubusb %%mm1, %%mm0 \n\t"
334 "psubusb %%mm2, %%mm1 \n\t"
335 "por %%mm1, %%mm0 \n\t" // ABS Diff
337 "movq pQPb, %%mm7 \n\t" // QP,..., QP
338 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
339 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// Collapse the eight per-byte results into one value in the low dword.
340 "pcmpeqd b00, %%mm0 \n\t"
341 "psrlq $16, %%mm0 \n\t"
342 "pcmpeqd bFF, %%mm0 \n\t"
343 // "movd %%mm0, (%1, %2, 4)\n\t"
344 "movd %%mm0, %0 \n\t"
346 : "r" (src), "r" (stride)
354 for(x=0; x<BLOCK_SIZE; x++)
356 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
358 /* if(isOk && !isOk2 || !isOk && isOk2)
360 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
361 for(int y=0; y<9; y++)
363 for(int x=0; x<8; x++)
365 printf("%d ", src[x + y*stride]);
377 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
378 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
380 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
// src: top-left pixel of the block; stride: byte distance between rows.
// The asm path takes QP from the global pQPb, the C path uses the QP parameter.
// Rows 1..8 (relative to src) are rewritten in place. The row before and the
// row after serve as the first and last filter inputs only when they are
// close enough to their neighbour; otherwise the neighbour is used instead.
// NOTE(review): the asm selects the outer row when diff <= QP, the C fallback
// when diff < QP - confirm which of the two is intended.
382 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
384 asm volatile( //"movv %0 %1 %2\n\t"
386 "movq pQPb, %%mm0 \n\t" // QP,..., QP
388 "movq (%0), %%mm6 \n\t"
389 "movq (%0, %1), %%mm5 \n\t"
390 "movq %%mm5, %%mm1 \n\t"
391 "movq %%mm6, %%mm2 \n\t"
392 "psubusb %%mm6, %%mm5 \n\t"
393 "psubusb %%mm1, %%mm2 \n\t"
394 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
395 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
396 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
398 "pand %%mm2, %%mm6 \n\t"
399 "pandn %%mm1, %%mm2 \n\t"
400 "por %%mm2, %%mm6 \n\t"// First Line to Filter
402 "movq (%0, %1, 8), %%mm5 \n\t"
403 "leal (%0, %1, 4), %%eax \n\t"
404 "leal (%0, %1, 8), %%ebx \n\t"
405 "subl %1, %%ebx \n\t"
// NOTE(review): the next instruction changes %0, which the operand line at the
// end of this asm statement lists as an r input (src). GCC requires that
// input-only operands are left unmodified - confirm the constraint list.
406 "addl %1, %0 \n\t" // %0 points to line 1 not 0
407 "movq (%0, %1, 8), %%mm7 \n\t"
408 "movq %%mm5, %%mm1 \n\t"
409 "movq %%mm7, %%mm2 \n\t"
410 "psubusb %%mm7, %%mm5 \n\t"
411 "psubusb %%mm1, %%mm2 \n\t"
412 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
413 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
414 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
416 "pand %%mm2, %%mm7 \n\t"
417 "pandn %%mm1, %%mm2 \n\t"
418 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
// From here on mm6 holds the first and mm7 the last filter input. Each output
// row is built from nested PAVGB averages; the digit strings in the comments
// give the accumulated tap weights and the divisor reached so far.
422 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
427 "movq (%0, %1), %%mm0 \n\t" // 1
428 "movq %%mm0, %%mm1 \n\t" // 1
429 PAVGB(%%mm6, %%mm0) //1 1 /2
430 PAVGB(%%mm6, %%mm0) //3 1 /4
432 "movq (%0, %1, 4), %%mm2 \n\t" // 1
433 "movq %%mm2, %%mm5 \n\t" // 1
434 PAVGB((%%eax), %%mm2) // 11 /2
435 PAVGB((%0, %1, 2), %%mm2) // 211 /4
436 "movq %%mm2, %%mm3 \n\t" // 211 /4
437 "movq (%0), %%mm4 \n\t" // 1
438 PAVGB(%%mm4, %%mm3) // 4 211 /8
439 PAVGB(%%mm0, %%mm3) //642211 /16
440 "movq %%mm3, (%0) \n\t" // X
441 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
442 "movq %%mm1, %%mm0 \n\t" // 1
443 PAVGB(%%mm6, %%mm0) //1 1 /2
444 "movq %%mm4, %%mm3 \n\t" // 1
445 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
446 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
447 PAVGB((%%eax), %%mm5) // 211 /4
448 PAVGB(%%mm5, %%mm3) // 2 2211 /8
449 PAVGB(%%mm0, %%mm3) //4242211 /16
450 "movq %%mm3, (%0,%1) \n\t" // X
451 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
452 PAVGB(%%mm4, %%mm6) //11 /2
453 "movq (%%ebx), %%mm0 \n\t" // 1
454 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
455 "movq %%mm0, %%mm3 \n\t" // 11/2
456 PAVGB(%%mm1, %%mm0) // 2 11/4
457 PAVGB(%%mm6, %%mm0) //222 11/8
458 PAVGB(%%mm2, %%mm0) //22242211/16
459 "movq (%0, %1, 2), %%mm2 \n\t" // 1
460 "movq %%mm0, (%0, %1, 2) \n\t" // X
461 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
462 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
463 PAVGB((%%ebx), %%mm0) // 11 /2
464 PAVGB(%%mm0, %%mm6) //11 11 /4
465 PAVGB(%%mm1, %%mm4) // 11 /2
466 PAVGB(%%mm2, %%mm1) // 11 /2
467 PAVGB(%%mm1, %%mm6) //1122 11 /8
468 PAVGB(%%mm5, %%mm6) //112242211 /16
469 "movq (%%eax), %%mm5 \n\t" // 1
470 "movq %%mm6, (%%eax) \n\t" // X
471 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
472 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
473 PAVGB(%%mm7, %%mm6) // 11 /2
474 PAVGB(%%mm4, %%mm6) // 11 11 /4
475 PAVGB(%%mm3, %%mm6) // 11 2211 /8
476 PAVGB(%%mm5, %%mm2) // 11 /2
477 "movq (%0, %1, 4), %%mm4 \n\t" // 1
478 PAVGB(%%mm4, %%mm2) // 112 /4
479 PAVGB(%%mm2, %%mm6) // 112242211 /16
480 "movq %%mm6, (%0, %1, 4) \n\t" // X
481 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
482 PAVGB(%%mm7, %%mm1) // 11 2 /4
483 PAVGB(%%mm4, %%mm5) // 11 /2
484 PAVGB(%%mm5, %%mm0) // 11 11 /4
485 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
486 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
487 PAVGB(%%mm0, %%mm1) // 11224222 /16
488 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
489 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
490 PAVGB((%%ebx), %%mm2) // 112 4 /8
491 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
492 PAVGB(%%mm0, %%mm6) // 1 1 /2
493 PAVGB(%%mm7, %%mm6) // 1 12 /4
494 PAVGB(%%mm2, %%mm6) // 1122424 /4
495 "movq %%mm6, (%%ebx) \n\t" // X
496 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
497 PAVGB(%%mm7, %%mm5) // 11 2 /4
498 PAVGB(%%mm7, %%mm5) // 11 6 /8
500 PAVGB(%%mm3, %%mm0) // 112 /4
501 PAVGB(%%mm0, %%mm5) // 112246 /16
502 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
506 : "r" (src), "r" (stride)
// C fallback. l1..l9 are the byte offsets of rows 1..9 relative to src.
510 const int l1= stride;
511 const int l2= stride + l1;
512 const int l3= stride + l2;
513 const int l4= stride + l3;
514 const int l5= stride + l4;
515 const int l6= stride + l5;
516 const int l7= stride + l6;
517 const int l8= stride + l7;
518 const int l9= stride + l8;
521 for(x=0; x<BLOCK_SIZE; x++)
// first/last: use the row outside the filtered range only if it is within QP
// of its inner neighbour, otherwise repeat the inner neighbour.
523 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
524 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[k] is the sum of two vertically adjacent inputs; the nine taps of every
// output below are assembled from these pair sums.
527 sums[0] = first + src[l1];
528 sums[1] = src[l1] + src[l2];
529 sums[2] = src[l2] + src[l3];
530 sums[3] = src[l3] + src[l4];
531 sums[4] = src[l4] + src[l5];
532 sums[5] = src[l5] + src[l6];
533 sums[6] = src[l6] + src[l7];
534 sums[7] = src[l7] + src[l8];
535 sums[8] = src[l8] + last;
// Note on precedence: + binds tighter than <<, so (a + b<<1) means (a + b)<<1.
// The + 8 before >>4 rounds the division by 16 to nearest.
537 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
538 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
539 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
540 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
541 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
542 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
543 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
544 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
553 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
554 * values are correctly clipped (MMX2)
555 * values are wraparound (C)
556 * conclusion: it is fast, but introduces ugly horizontal patterns if there is a continuous gradient
563 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
// src: top-left pixel of the block; stride: byte distance between rows.
// The asm path takes QP from the global pQPb, the C path uses the QP parameter.
// The asm writes rows 4 and 5 (the two rows on either side of the block edge)
// and, with a smaller correction, rows 3 and 6.
565 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
569 "pxor %%mm7, %%mm7 \n\t" // 0
570 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
571 "leal (%0, %1), %%eax \n\t"
572 "leal (%%eax, %1, 4), %%ebx \n\t"
573 // 0 1 2 3 4 5 6 7 8 9
574 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// Build the threshold QP + QP/4 per byte in mm0.
575 "movq pQPb, %%mm0 \n\t" // QP,..., QP
576 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
577 "paddusb b02, %%mm0 \n\t"
578 "psrlw $2, %%mm0 \n\t"
579 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
580 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
581 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
582 "movq (%%ebx), %%mm3 \n\t" // line 5
583 "movq %%mm2, %%mm4 \n\t" // line 4
584 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
585 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
587 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
// mm4 becomes an all-ones mask in the bytes where |l4 - l5| is within the
// threshold; the correction in mm5 is zeroed everywhere else.
588 "psubusb %%mm3, %%mm4 \n\t"
589 "psubusb %%mm2, %%mm3 \n\t"
590 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
591 "psubusb %%mm0, %%mm4 \n\t"
592 "pcmpeqb %%mm7, %%mm4 \n\t"
593 "pand %%mm4, %%mm5 \n\t" // d/2
595 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
596 "paddb %%mm5, %%mm2 \n\t"
597 // "psubb %%mm6, %%mm2 \n\t"
598 "movq %%mm2, (%0,%1, 4) \n\t"
600 "movq (%%ebx), %%mm2 \n\t"
601 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
602 "psubb %%mm5, %%mm2 \n\t"
603 // "psubb %%mm6, %%mm2 \n\t"
604 "movq %%mm2, (%%ebx) \n\t"
606 "paddb %%mm6, %%mm5 \n\t"
607 "psrlw $2, %%mm5 \n\t"
608 "pand b3F, %%mm5 \n\t"
609 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
// Rows 3 and 6 are biased by 0x80 so that the signed saturating add and
// subtract can be applied to unsigned pixels, then the bias is removed again.
611 "movq (%%eax, %1, 2), %%mm2 \n\t"
612 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
613 "paddsb %%mm5, %%mm2 \n\t"
614 "psubb %%mm6, %%mm2 \n\t"
615 "movq %%mm2, (%%eax, %1, 2) \n\t"
617 "movq (%%ebx, %1), %%mm2 \n\t"
618 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
619 "psubsb %%mm5, %%mm2 \n\t"
620 "psubb %%mm6, %%mm2 \n\t"
621 "movq %%mm2, (%%ebx, %1) \n\t"
624 : "r" (src), "r" (stride)
// C fallback. l1..l9 are the byte offsets of rows 1..9 relative to src.
628 const int l1= stride;
629 const int l2= stride + l1;
630 const int l3= stride + l2;
631 const int l4= stride + l3;
632 const int l5= stride + l4;
633 const int l6= stride + l5;
634 const int l7= stride + l6;
635 const int l8= stride + l7;
636 const int l9= stride + l8;
639 for(x=0; x<BLOCK_SIZE; x++)
// Only steps across the block edge smaller than QP + QP/4 are corrected.
// The statements that apply v to the rows are not visible in this listing.
641 if(ABS(src[l4]-src[l5]) < QP + QP/4)
643 int v = (src[l5] - src[l4]);
658 * Experimental Filter 1
659 * will not damage linear gradients
660 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
661 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
662 * MMX2 version does correct clipping, C version does not
664 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
// src: top-left pixel of the block; stride: byte distance between rows.
// The asm path takes QP from the global pQPb.
// d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) measures how much larger the step
// across the block edge is than the steps next to it. The asm moves rows 4/5
// by 3*d/8, rows 3/6 by d/4 and rows 2/7 by d/8 towards each other, with
// saturation, and only in the bytes where d <= QP.
666 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
670 "pxor %%mm7, %%mm7 \n\t" // 0
671 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
672 "leal (%0, %1), %%eax \n\t"
673 "leal (%%eax, %1, 4), %%ebx \n\t"
674 // 0 1 2 3 4 5 6 7 8 9
675 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
676 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
677 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
678 "movq %%mm1, %%mm2 \n\t" // line 4
679 "psubusb %%mm0, %%mm1 \n\t"
680 "psubusb %%mm2, %%mm0 \n\t"
681 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
682 "movq (%%ebx), %%mm3 \n\t" // line 5
683 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
684 "movq %%mm3, %%mm5 \n\t" // line 5
685 "psubusb %%mm4, %%mm3 \n\t"
686 "psubusb %%mm5, %%mm4 \n\t"
687 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
688 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
689 "movq %%mm2, %%mm1 \n\t" // line 4
690 "psubusb %%mm5, %%mm2 \n\t"
691 "movq %%mm2, %%mm4 \n\t"
692 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
693 "psubusb %%mm1, %%mm5 \n\t"
694 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
695 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
696 "movq %%mm4, %%mm3 \n\t" // d
697 "psubusb pQPb, %%mm4 \n\t"
698 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
699 "psubusb b01, %%mm3 \n\t"
700 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
702 PAVGB(%%mm7, %%mm3) // d/2
703 "movq %%mm3, %%mm1 \n\t" // d/2
704 PAVGB(%%mm7, %%mm3) // d/4
705 PAVGB(%%mm1, %%mm3) // 3*d/8
// mm2 is the sign mask of (l4 - l5). XOR-ing a row with it before and after
// the saturating add or subtract flips the direction of the correction in
// the bytes where the mask is set.
707 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
708 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
709 "psubusb %%mm3, %%mm0 \n\t"
710 "pxor %%mm2, %%mm0 \n\t"
711 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
713 "movq (%%ebx), %%mm0 \n\t" // line 5
714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
715 "paddusb %%mm3, %%mm0 \n\t"
716 "pxor %%mm2, %%mm0 \n\t"
717 "movq %%mm0, (%%ebx) \n\t" // line 5
719 PAVGB(%%mm7, %%mm1) // d/4
721 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
722 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
723 "psubusb %%mm1, %%mm0 \n\t"
724 "pxor %%mm2, %%mm0 \n\t"
725 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
727 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
729 "paddusb %%mm1, %%mm0 \n\t"
730 "pxor %%mm2, %%mm0 \n\t"
731 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
733 PAVGB(%%mm7, %%mm1) // d/8
735 "movq (%%eax, %1), %%mm0 \n\t" // line 2
736 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
737 "psubusb %%mm1, %%mm0 \n\t"
738 "pxor %%mm2, %%mm0 \n\t"
739 "movq %%mm0, (%%eax, %1) \n\t" // line 2
741 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
742 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
743 "paddusb %%mm1, %%mm0 \n\t"
744 "pxor %%mm2, %%mm0 \n\t"
745 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
748 : "r" (src), "r" (stride)
// C fallback. l1..l9 are the byte offsets of rows 1..9 relative to src.
753 const int l1= stride;
754 const int l2= stride + l1;
755 const int l3= stride + l2;
756 const int l4= stride + l3;
757 const int l5= stride + l4;
758 const int l6= stride + l5;
759 const int l7= stride + l6;
760 const int l8= stride + l7;
761 const int l9= stride + l8;
765 for(x=0; x<BLOCK_SIZE; x++)
// a, b, c: steps between rows 3/4, 4/5 (the block edge) and 5/6.
767 int a= src[l3] - src[l4];
768 int b= src[l4] - src[l5];
769 int c= src[l5] - src[l6];
771 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
// v is d with the sign opposite to b. The statements that apply v to the rows
// are not visible in this listing.
775 int v = d * SIGN(-b);
// The remaining lines look like an older, disabled variant of the filter
// (their enclosing comment markers are not visible in this listing) - confirm.
788 const int l1= stride;
789 const int l2= stride + l1;
790 const int l3= stride + l2;
791 const int l4= stride + l3;
792 const int l5= stride + l4;
793 const int l6= stride + l5;
794 const int l7= stride + l6;
795 const int l8= stride + l7;
796 const int l9= stride + l8;
797 for(int x=0; x<BLOCK_SIZE; x++)
806 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
808 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
809 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
810 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
811 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
820 * Experimental Filter 1 (Horizontal)
821 * will not damage linear gradients
822 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
823 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
824 * MMX2 version does correct clipping, C version does not
825 * not identical with the vertical one
827 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
// src: top-left pixel of the block; stride: byte distance between rows.
// The asm path takes QP from the global pQPb.
// The asm path corrects each 8-pixel row by adding one 8-byte entry of lut,
// a table of per-pixel corrections indexed by a signed edge-step byte.
// NOTE(review): HX1old refers to asm operand %3 while the operand line of the
// asm statement lists only three inputs (%0..%2); it looks like disabled code
// whose preprocessor guard is not visible in this listing - confirm.
830 static uint64_t *lut= NULL;
// The table has 256 entries of 8 bytes and is 8-byte aligned. The guard that
// presumably restricts the initialisation to the first call and the loop
// header over i are not visible in this listing.
// NOTE(review): no check of the memalign result is visible here - confirm.
834 lut= (uint64_t*)memalign(8, 256*8);
// v is the index i read as a signed byte, multiplied by 2.
837 int v= i < 128 ? 2*i : 2*(i-256);
// Two alternative coefficient sets follow; presumably a preprocessor
// conditional (not visible in this listing) selects one of them.
839 //Simulate 112242211 9-Tap filter
840 uint64_t a= (v/16) & 0xFF;
841 uint64_t b= (v/8) & 0xFF;
842 uint64_t c= (v/4) & 0xFF;
843 uint64_t d= (3*v/8) & 0xFF;
845 //Simulate piecewise linear interpolation
846 uint64_t a= (v/16) & 0xFF;
847 uint64_t b= (v*3/16) & 0xFF;
848 uint64_t c= (v*5/16) & 0xFF;
849 uint64_t d= (7*v/16) & 0xFF;
// A..D are the byte-wise negations of a..d, used for the four pixels on the
// other side of the block edge.
850 uint64_t A= (0x100 - a)&0xFF;
851 uint64_t B= (0x100 - b)&0xFF;
852 uint64_t C= (0x100 - c)&0xFF;
// Fix: D has to negate d. It used to be derived from c, which made the
// correction of the pixel left of the edge differ from the one right of it.
853 uint64_t D= (0x100 - d)&0xFF;
// Byte layout of an entry, lowest byte first: A B C D d c b a.
855 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
856 (D<<24) | (C<<16) | (B<<8) | (A);
857 //lut[i] = (v<<32) | (v<<24);
861 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
863 "pxor %%mm7, %%mm7 \n\t" // 0
864 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
865 "leal (%0, %1), %%eax \n\t"
866 "leal (%%eax, %1, 4), %%ebx \n\t"
868 "movq b80, %%mm6 \n\t"
869 "movd pQPb, %%mm5 \n\t" // QP
870 "movq %%mm5, %%mm4 \n\t"
871 "paddusb %%mm5, %%mm5 \n\t" // 2QP
872 "paddusb %%mm5, %%mm4 \n\t" // 3QP
873 "pxor %%mm5, %%mm5 \n\t" // 0
874 "psubb %%mm4, %%mm5 \n\t" // -3QP
875 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
876 "psllq $24, %%mm5 \n\t"
878 // 0 1 2 3 4 5 6 7 8 9
879 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
882 "movd " #a ", %%mm0 \n\t"\
883 "movd 4" #a ", %%mm1 \n\t"\
884 "punpckldq %%mm1, %%mm0 \n\t"\
885 "movq %%mm0, %%mm1 \n\t"\
886 "movq %%mm0, %%mm2 \n\t"\
887 "psrlq $8, %%mm1 \n\t"\
888 "psubusb %%mm1, %%mm2 \n\t"\
889 "psubusb %%mm0, %%mm1 \n\t"\
890 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
891 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
892 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
893 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
894 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
895 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
896 "paddb %%mm5, %%mm1 \n\t"\
897 "psubusb %%mm5, %%mm1 \n\t"\
899 "pxor %%mm2, %%mm1 \n\t"\
900 "psubb %%mm2, %%mm1 \n\t"\
901 "psrlq $24, %%mm1 \n\t"\
902 "movd %%mm1, %%ecx \n\t"\
903 "paddb %%mm6, %%mm0 \n\t"\
904 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
905 "paddb %%mm6, %%mm0 \n\t"\
906 "movq %%mm0, " #a " \n\t"\
912 HX1old((%%eax, %1, 2))
916 HX1old((%%ebx, %1, 2))
// HX1b(a, c, b, d) handles four rows per expansion. A lut index is computed in
// ecx from row a and from row b only; row c reuses the index of row a and
// row d reuses the index of row b. Every row is biased by 0x80 (mm6) around
// the signed saturating add of the lut entry, so the add clips correctly on
// unsigned pixels.
919 //FIXME add some comments, it is unreadable ...
920 #define HX1b(a, c, b, d) \
921 "movd " #a ", %%mm0 \n\t"\
922 "movd 4" #a ", %%mm1 \n\t"\
923 "punpckldq %%mm1, %%mm0 \n\t"\
924 "movd " #b ", %%mm4 \n\t"\
925 "movq %%mm0, %%mm1 \n\t"\
926 "movq %%mm0, %%mm2 \n\t"\
927 "psrlq $8, %%mm1 \n\t"\
928 "movd 4" #b ", %%mm3 \n\t"\
929 "psubusb %%mm1, %%mm2 \n\t"\
930 "psubusb %%mm0, %%mm1 \n\t"\
931 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
932 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
933 "punpckldq %%mm3, %%mm4 \n\t"\
934 "movq %%mm1, %%mm3 \n\t"\
935 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
936 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
937 "paddb %%mm6, %%mm0 \n\t"\
938 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
939 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
940 "movq %%mm4, %%mm3 \n\t"\
941 "paddb %%mm5, %%mm1 \n\t"\
942 "psubusb %%mm5, %%mm1 \n\t"\
943 "psrlq $8, %%mm3 \n\t"\
945 "pxor %%mm2, %%mm1 \n\t"\
946 "psubb %%mm2, %%mm1 \n\t"\
947 "movq %%mm4, %%mm2 \n\t"\
948 "psrlq $24, %%mm1 \n\t"\
949 "psubusb %%mm3, %%mm2 \n\t"\
950 "movd %%mm1, %%ecx \n\t"\
951 "psubusb %%mm4, %%mm3 \n\t"\
952 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
953 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
954 "paddb %%mm6, %%mm0 \n\t"\
955 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
956 "movq %%mm3, %%mm1 \n\t"\
957 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
958 "movq %%mm0, " #a " \n\t"\
959 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
960 "paddb %%mm6, %%mm4 \n\t"\
961 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
962 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
963 "paddb %%mm5, %%mm3 \n\t"\
964 "psubusb %%mm5, %%mm3 \n\t"\
966 "pxor %%mm2, %%mm3 \n\t"\
967 "psubb %%mm2, %%mm3 \n\t"\
968 "psrlq $24, %%mm3 \n\t"\
969 "movd " #c ", %%mm0 \n\t"\
970 "movd 4" #c ", %%mm1 \n\t"\
971 "punpckldq %%mm1, %%mm0 \n\t"\
972 "paddb %%mm6, %%mm0 \n\t"\
973 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
974 "paddb %%mm6, %%mm0 \n\t"\
975 "movq %%mm0, " #c " \n\t"\
976 "movd %%mm3, %%ecx \n\t"\
977 "movd " #d ", %%mm0 \n\t"\
978 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
979 "movd 4" #d ", %%mm1 \n\t"\
980 "paddb %%mm6, %%mm4 \n\t"\
981 "punpckldq %%mm1, %%mm0 \n\t"\
982 "movq %%mm4, " #b " \n\t"\
983 "paddb %%mm6, %%mm0 \n\t"\
984 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
985 "paddb %%mm6, %%mm0 \n\t"\
986 "movq %%mm0, " #d " \n\t"\
988 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
989 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
// Inputs: %0 = src, %1 = stride, %2 = lut; eax, ebx and ecx are clobbered.
993 : "r" (src), "r" (stride), "r" (lut)
994 : "%eax", "%ebx", "%ecx"
998 //FIXME (has little in common with the mmx2 version)
// C fallback: one row per iteration. a, b and c are pixel steps within the
// row, with b the step across the block edge between pixels 3 and 4.
999 for(y=0; y<BLOCK_SIZE; y++)
1001 int a= src[1] - src[2];
1002 int b= src[3] - src[4];
1003 int c= src[5] - src[6];
1005 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
// v is d with the sign opposite to b. The statements that apply v to the
// pixels are not visible in this listing.
1009 int v = d * SIGN(-b);
1025 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1029 //FIXME try pmul for *5 stuff
1032 "pxor %%mm7, %%mm7 \n\t"
1033 "leal (%0, %1), %%eax \n\t"
1034 "leal (%%eax, %1, 4), %%ebx \n\t"
1036 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1037 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1039 "movq (%0), %%mm0 \n\t"
1040 "movq %%mm0, %%mm1 \n\t"
1041 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1042 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1044 "movq (%%eax), %%mm2 \n\t"
1045 "movq %%mm2, %%mm3 \n\t"
1046 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1047 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1049 "movq (%%eax, %1), %%mm4 \n\t"
1050 "movq %%mm4, %%mm5 \n\t"
1051 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1052 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1054 "paddw %%mm0, %%mm0 \n\t" // 2L0
1055 "paddw %%mm1, %%mm1 \n\t" // 2H0
1056 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1057 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1058 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1059 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1061 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1062 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1063 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1064 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1066 "movq (%%eax, %1, 2), %%mm2 \n\t"
1067 "movq %%mm2, %%mm3 \n\t"
1068 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1069 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1071 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1072 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1073 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1074 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1075 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1076 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1078 "movq (%0, %1, 4), %%mm0 \n\t"
1079 "movq %%mm0, %%mm1 \n\t"
1080 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1081 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1083 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1084 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1085 "movq %%mm2, temp2 \n\t" // L3 - L4
1086 "movq %%mm3, temp3 \n\t" // H3 - H4
1087 "paddw %%mm4, %%mm4 \n\t" // 2L2
1088 "paddw %%mm5, %%mm5 \n\t" // 2H2
1089 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1090 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1092 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1093 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1094 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1095 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1097 "movq (%%ebx), %%mm2 \n\t"
1098 "movq %%mm2, %%mm3 \n\t"
1099 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1100 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1101 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1102 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1103 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1104 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1106 "movq (%%ebx, %1), %%mm6 \n\t"
1107 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1108 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1109 "movq (%%ebx, %1), %%mm6 \n\t"
1110 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1111 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1113 "paddw %%mm0, %%mm0 \n\t" // 2L4
1114 "paddw %%mm1, %%mm1 \n\t" // 2H4
1115 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1116 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1118 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1119 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1120 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1121 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1123 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1124 "movq %%mm2, %%mm3 \n\t"
1125 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1126 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1128 "paddw %%mm2, %%mm2 \n\t" // 2L7
1129 "paddw %%mm3, %%mm3 \n\t" // 2H7
1130 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1131 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1133 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1134 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1135 //FIXME pxor, psubw, pmax for abs
1136 "movq %%mm7, %%mm6 \n\t" // 0
1137 "pcmpgtw %%mm0, %%mm6 \n\t"
1138 "pxor %%mm6, %%mm0 \n\t"
1139 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1140 "movq %%mm7, %%mm6 \n\t" // 0
1141 "pcmpgtw %%mm1, %%mm6 \n\t"
1142 "pxor %%mm6, %%mm1 \n\t"
1143 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1145 "movq %%mm7, %%mm6 \n\t" // 0
1146 "pcmpgtw %%mm2, %%mm6 \n\t"
1147 "pxor %%mm6, %%mm2 \n\t"
1148 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1149 "movq %%mm7, %%mm6 \n\t" // 0
1150 "pcmpgtw %%mm3, %%mm6 \n\t"
1151 "pxor %%mm6, %%mm3 \n\t"
1152 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1155 "pminsw %%mm2, %%mm0 \n\t"
1156 "pminsw %%mm3, %%mm1 \n\t"
1158 "movq %%mm0, %%mm6 \n\t"
1159 "psubusw %%mm2, %%mm6 \n\t"
1160 "psubw %%mm6, %%mm0 \n\t"
1161 "movq %%mm1, %%mm6 \n\t"
1162 "psubusw %%mm3, %%mm6 \n\t"
1163 "psubw %%mm6, %%mm1 \n\t"
1166 "movq %%mm7, %%mm6 \n\t" // 0
1167 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1168 "pxor %%mm6, %%mm4 \n\t"
1169 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1170 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1171 "pxor %%mm7, %%mm5 \n\t"
1172 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1174 "movd %2, %%mm2 \n\t" // QP
1175 "punpcklwd %%mm2, %%mm2 \n\t"
1176 "punpcklwd %%mm2, %%mm2 \n\t"
1177 "psllw $3, %%mm2 \n\t" // 8QP
1178 "movq %%mm2, %%mm3 \n\t" // 8QP
1179 "pcmpgtw %%mm4, %%mm2 \n\t"
1180 "pcmpgtw %%mm5, %%mm3 \n\t"
1181 "pand %%mm2, %%mm4 \n\t"
1182 "pand %%mm3, %%mm5 \n\t"
1185 "psubusw %%mm0, %%mm4 \n\t" // hd
1186 "psubusw %%mm1, %%mm5 \n\t" // ld
1189 "movq w05, %%mm2 \n\t" // 5
1190 "pmullw %%mm2, %%mm4 \n\t"
1191 "pmullw %%mm2, %%mm5 \n\t"
1192 "movq w20, %%mm2 \n\t" // 32
1193 "paddw %%mm2, %%mm4 \n\t"
1194 "paddw %%mm2, %%mm5 \n\t"
1195 "psrlw $6, %%mm4 \n\t"
1196 "psrlw $6, %%mm5 \n\t"
1199 "movq w06, %%mm2 \n\t" // 6
1200 "paddw %%mm2, %%mm4 \n\t"
1201 "paddw %%mm2, %%mm5 \n\t"
1202 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1203 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1204 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1205 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1208 "movq temp2, %%mm0 \n\t" // L3 - L4
1209 "movq temp3, %%mm1 \n\t" // H3 - H4
1211 "pxor %%mm2, %%mm2 \n\t"
1212 "pxor %%mm3, %%mm3 \n\t"
1214 // FIXME rounding error
1215 "psraw $1, %%mm0 \n\t" // (L3 - L4)/2
1216 "psraw $1, %%mm1 \n\t" // (H3 - H4)/2
1217 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1218 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1219 "pxor %%mm2, %%mm0 \n\t"
1220 "pxor %%mm3, %%mm1 \n\t"
1221 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1222 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1223 // "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1224 // "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1226 "pxor %%mm6, %%mm2 \n\t"
1227 "pxor %%mm7, %%mm3 \n\t"
1228 "pand %%mm2, %%mm4 \n\t"
1229 "pand %%mm3, %%mm5 \n\t"
1232 "pminsw %%mm0, %%mm4 \n\t"
1233 "pminsw %%mm1, %%mm5 \n\t"
1235 "movq %%mm4, %%mm2 \n\t"
1236 "psubusw %%mm0, %%mm2 \n\t"
1237 "psubw %%mm2, %%mm4 \n\t"
1238 "movq %%mm5, %%mm2 \n\t"
1239 "psubusw %%mm1, %%mm2 \n\t"
1240 "psubw %%mm2, %%mm5 \n\t"
1242 "pxor %%mm6, %%mm4 \n\t"
1243 "pxor %%mm7, %%mm5 \n\t"
1244 "psubw %%mm6, %%mm4 \n\t"
1245 "psubw %%mm7, %%mm5 \n\t"
1246 "packsswb %%mm5, %%mm4 \n\t"
1247 "movq (%%eax, %1, 2), %%mm0 \n\t"
1248 "paddb %%mm4, %%mm0 \n\t"
1249 "movq %%mm0, (%%eax, %1, 2) \n\t"
1250 "movq (%0, %1, 4), %%mm0 \n\t"
1251 "psubb %%mm4, %%mm0 \n\t"
1252 "movq %%mm0, (%0, %1, 4) \n\t"
1255 : "r" (src), "r" (stride), "r" (QP)
1259 const int l1= stride;
1260 const int l2= stride + l1;
1261 const int l3= stride + l2;
1262 const int l4= stride + l3;
1263 const int l5= stride + l4;
1264 const int l6= stride + l5;
1265 const int l7= stride + l6;
1266 const int l8= stride + l7;
1267 // const int l9= stride + l8;
1270 for(x=0; x<BLOCK_SIZE; x++)
1272 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1273 if(ABS(middleEnergy) < 8*QP)
1275 const int q=(src[l4] - src[l5])/2;
1276 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1277 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1279 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1283 d*= SIGN(-middleEnergy);
1304 //FIXME? |255-0| = 1
1306 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
// Flatness test used by the horizontal filters. Per the C path below it counts
// horizontally adjacent pixel pairs that differ by at most 1, copies each row
// of 8 pixels into tempBlock, and returns nonzero when the count exceeds
// hFlatnessThreshold.
1308 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
// MMX path: eax = address of tempBlock, mm0 = running sum of compare masks.
// NOTE(review): the operands are named b7E and b7C while the trailing comments
// say 0x7F and 0x7D; the constant definitions are not visible here - confirm.
1316 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1317 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
1318 "leal tempBlock, %%eax \n\t"
1319 "pxor %%mm0, %%mm0 \n\t"
// One row: assemble the 8 bytes at (%1) from the loads at -4(%1) and 4(%1),
// form per-byte differences with the right-hand neighbour (psrlq 8 / psubb),
// bias by mm7 and compare against mm6, add the resulting 0x00/0xFF masks to
// mm0, then store the unmodified row at offset i from eax.
1321 #define HDC_CHECK_AND_CPY(i) \
1322 "movq -4(%1), %%mm2 \n\t"\
1323 "psrlq $32, %%mm2 \n\t"\
1324 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
1325 "movq %%mm2, %%mm1 \n\t"\
1326 "psrlq $8, %%mm2 \n\t"\
1327 "psubb %%mm1, %%mm2 \n\t"\
1328 "paddb %%mm7, %%mm2 \n\t"\
1329 "pcmpgtb %%mm6, %%mm2 \n\t"\
1330 "paddb %%mm2, %%mm0 \n\t"\
1331 "movq %%mm1," #i "(%%eax) \n\t"
1333 HDC_CHECK_AND_CPY(0)
1335 HDC_CHECK_AND_CPY(8)
1337 HDC_CHECK_AND_CPY(16)
1339 HDC_CHECK_AND_CPY(24)
1341 HDC_CHECK_AND_CPY(32)
1343 HDC_CHECK_AND_CPY(40)
1345 HDC_CHECK_AND_CPY(48)
1347 HDC_CHECK_AND_CPY(56)
// Drop the top byte (it compared the last pixel with shifted-in zero bits),
// then fold the remaining bytes into the low byte with shift-and-add steps.
1349 "psllq $8, %%mm0 \n\t" // remove dummy value
1350 "movq %%mm0, %%mm1 \n\t"
1351 "psrlw $8, %%mm0 \n\t"
1352 "paddb %%mm1, %%mm0 \n\t"
1353 "movq %%mm0, %%mm1 \n\t"
1354 "psrlq $16, %%mm0 \n\t"
1355 "paddb %%mm1, %%mm0 \n\t"
1356 "movq %%mm0, %%mm1 \n\t"
1357 "psrlq $32, %%mm0 \n\t"
1358 "paddb %%mm1, %%mm0 \n\t"
1360 "movd %%mm0, %0 \n\t"
1362 : "r" (src), "r" (stride)
1365 // printf("%d\n", numEq);
// Every matching pair added 0xFF (-1) to the byte sum, so negating it modulo
// 256 yields the number of matches.
1366 numEq= (256 - (numEq & 0xFF)) &0xFF;
// C path: (a - b + 1) & 0xFFFF is below 3 exactly when a - b is -1, 0 or 1,
// so each test counts a neighbouring pair that differs by at most 1.
1369 for(y=0; y<BLOCK_SIZE; y++)
1371 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1372 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1373 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1374 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1375 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1376 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1377 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
// Copy the row into the scratch block, which uses its own row pitch TEMP_STRIDE.
1378 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1379 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1380 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1381 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1382 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1383 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1384 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1385 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1389 /* if(abs(numEq - asmEq) > 0)
1391 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1392 for(int y=0; y<8; y++)
1394 for(int x=0; x<8; x++)
1396 printf("%d ", src[x + y*stride]);
1402 // printf("%d\n", numEq);
1403 return numEq > hFlatnessThreshold;
// Range check for the horizontal filters: the visible C path rejects
// (returns 0) when src[0] and src[7], the first and last pixel of the row,
// differ by more than 2*QP.
1406 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
// MMX path: absolute byte difference of two 8-byte loads (two saturating
// subtractions ORed together), followed by a saturating subtract of 2*QP
// taken from pQPb so that bytes within the limit become zero.
// NOTE(review): the loads address (%1, %2) and (%1, %2, 8), i.e. a base plus
// one and eight times an index register, whereas the C path compares src[0]
// with src[7] inside one row - confirm both paths are meant to agree.
1413 "movq (%1, %2), %%mm0 \n\t"
1414 "movq (%1, %2, 8), %%mm1 \n\t"
1415 "movq %%mm0, %%mm2 \n\t"
1416 "psubusb %%mm1, %%mm0 \n\t"
1417 "psubusb %%mm2, %%mm1 \n\t"
1418 "por %%mm1, %%mm0 \n\t" // ABS Diff
1420 "movq pQPb, %%mm7 \n\t" // QP,..., QP
1421 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
1422 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
1423 "pcmpeqd b00, %%mm0 \n\t"
1424 "psrlq $16, %%mm0 \n\t"
1425 "pcmpeqd bFF, %%mm0 \n\t"
1426 // "movd %%mm0, (%1, %2, 4)\n\t"
1427 "movd %%mm0, %0 \n\t"
1429 : "r" (src), "r" (stride)
// C path: reject when the two ends of the row are further apart than 2*QP.
1433 if(abs(src[0] - src[7]) > 2*QP) return 0;
// Horizontal default deblocking filter. Both paths read their rows from
// tempBlock; the MMX path stores the 8 filtered bytes of a row to (%0) and
// 4(%0), where operand 0 is dst.
1439 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
// Setup: mm7 = 0, mm6 = byte mask bm00001000, eax = address of tempBlock.
// NOTE(review): mm4 is built as QP + 2QP = 3QP and shifted left by 24 bits, so
// mm5 ends up as the negation of that value rather than the -QP named in the
// trailing comment - confirm which is intended.
1444 "pxor %%mm7, %%mm7 \n\t"
1445 "movq bm00001000, %%mm6 \n\t"
1446 "movd %2, %%mm5 \n\t" // QP
1447 "movq %%mm5, %%mm4 \n\t"
1448 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1449 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1450 "psllq $24, %%mm4 \n\t"
1451 "pxor %%mm5, %%mm5 \n\t" // 0
1452 "psubb %%mm4, %%mm5 \n\t" // -QP
1453 "leal tempBlock, %%eax \n\t"
1455 //FIXME? "unroll by 2" and mix
// Row filter, variant using pshufw and pminub for the minimum.
1458 "movq " #i "(%%eax), %%mm0 \n\t"\
1459 "movq %%mm0, %%mm1 \n\t"\
1460 "movq %%mm0, %%mm2 \n\t"\
1461 "psrlq $8, %%mm1 \n\t"\
1462 "psubusb %%mm1, %%mm2 \n\t"\
1463 "psubusb %%mm0, %%mm1 \n\t"\
1464 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1465 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1466 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
1467 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1468 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1469 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1470 "paddb %%mm5, %%mm1 \n\t"\
1471 "psubusb %%mm5, %%mm1 \n\t"\
1472 "psrlw $2, %%mm1 \n\t"\
1473 "pxor %%mm2, %%mm1 \n\t"\
1474 "psubb %%mm2, %%mm1 \n\t"\
1475 "pand %%mm6, %%mm1 \n\t"\
1476 "psubb %%mm1, %%mm0 \n\t"\
1477 "psllq $8, %%mm1 \n\t"\
1478 "paddb %%mm1, %%mm0 \n\t"\
1479 "movd %%mm0, (%0) \n\t"\
1480 "psrlq $32, %%mm0 \n\t"\
1481 "movd %%mm0, 4(%0) \n\t"
// Row filter, variant restricted to plain MMX: the byte minimum is built from
// psllq, psubusb and psubb instead of pshufw and pminub.
1484 "movq " #i "(%%eax), %%mm0 \n\t"\
1485 "movq %%mm0, %%mm1 \n\t"\
1486 "movq %%mm0, %%mm2 \n\t"\
1487 "psrlq $8, %%mm1 \n\t"\
1488 "psubusb %%mm1, %%mm2 \n\t"\
1489 "psubusb %%mm0, %%mm1 \n\t"\
1490 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1491 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1492 "movq %%mm1, %%mm3 \n\t"\
1493 "psllq $32, %%mm3 \n\t"\
1494 "movq %%mm3, %%mm4 \n\t"\
1495 "psubusb %%mm1, %%mm4 \n\t"\
1496 "psubb %%mm4, %%mm3 \n\t"\
1497 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1498 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1499 "paddb %%mm5, %%mm1 \n\t"\
1500 "psubusb %%mm5, %%mm1 \n\t"\
1501 "psrlw $2, %%mm1 \n\t"\
1502 "pxor %%mm2, %%mm1 \n\t"\
1503 "psubb %%mm2, %%mm1 \n\t"\
1504 "pand %%mm6, %%mm1 \n\t"\
1505 "psubb %%mm1, %%mm0 \n\t"\
1506 "psllq $8, %%mm1 \n\t"\
1507 "paddb %%mm1, %%mm0 \n\t"\
1508 "movd %%mm0, (%0) \n\t"\
1509 "psrlq $32, %%mm0 \n\t"\
1510 "movd %%mm0, 4(%0) \n\t"
1529 : "r" (dst), "r" (stride), "r" (QP)
// C path: works row by row on the copy held in tempBlock.
1533 uint8_t *src= tempBlock;
1536 for(y=0; y<BLOCK_SIZE; y++)
// NOTE(review): leftEnergy and rightEnergy below follow the pattern
// 5*(p[k+2] - p[k+1]) + 2*(p[k] - p[k+3]); by that pattern the first term here
// would be 5*(src[4] - src[3]) rather than 5*(src[4] - src[5]) - confirm.
1538 const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]);
// Only edges whose middle energy is below 8*QP are filtered.
1549 if(ABS(middleEnergy) < 8*QP)
1551 const int q=(src[3] - src[4])/2;
1552 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
1553 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
// d = how much the middle energy exceeds the weaker neighbouring energy,
// then given the sign opposite to middleEnergy.
1555 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1559 d*= SIGN(-middleEnergy);
1582 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1583 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1584 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
// Horizontal low pass filter. The C path reads the unfiltered row from
// tempBlock, takes the two outer neighbours from dst[-1] and dst[8], and
// writes the 8 filtered pixels to dst. The MMX2/3DNow path loads rows from
// tempBlock through eax and stores to (%0) and 4(%0), operand 0 being dst.
1586 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1589 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1590 asm volatile( //"movv %0 %1 %2\n\t"
1592 "pxor %%mm7, %%mm7 \n\t"
1593 "leal tempBlock, %%eax \n\t"
1595 #define HLP1 "movq (%0), %%mm0 \n\t"\
1596 "movq %%mm0, %%mm1 \n\t"\
1597 "psllq $8, %%mm0 \n\t"\
1598 PAVGB(%%mm1, %%mm0)\
1599 "psrlw $8, %%mm0 \n\t"\
1600 "pxor %%mm1, %%mm1 \n\t"\
1601 "packuswb %%mm1, %%mm0 \n\t"\
1602 "movq %%mm0, %%mm1 \n\t"\
1603 "movq %%mm0, %%mm2 \n\t"\
1604 "psllq $32, %%mm0 \n\t"\
1605 "paddb %%mm0, %%mm1 \n\t"\
1606 "psllq $16, %%mm2 \n\t"\
1607 PAVGB(%%mm2, %%mm0)\
1608 "movq %%mm0, %%mm3 \n\t"\
1609 "pand bm11001100, %%mm0 \n\t"\
1610 "paddusb %%mm0, %%mm3 \n\t"\
1611 "psrlq $8, %%mm3 \n\t"\
1612 PAVGB(%%mm1, %%mm4)\
1613 PAVGB(%%mm3, %%mm2)\
1614 "psrlq $16, %%mm2 \n\t"\
1615 "punpcklbw %%mm2, %%mm2 \n\t"\
1616 "movq %%mm2, (%0) \n\t"\
1618 #define HLP2 "movq (%0), %%mm0 \n\t"\
1619 "movq %%mm0, %%mm1 \n\t"\
1620 "psllq $8, %%mm0 \n\t"\
1621 PAVGB(%%mm1, %%mm0)\
1622 "psrlw $8, %%mm0 \n\t"\
1623 "pxor %%mm1, %%mm1 \n\t"\
1624 "packuswb %%mm1, %%mm0 \n\t"\
1625 "movq %%mm0, %%mm2 \n\t"\
1626 "psllq $32, %%mm0 \n\t"\
1627 "psllq $16, %%mm2 \n\t"\
1628 PAVGB(%%mm2, %%mm0)\
1629 "movq %%mm0, %%mm3 \n\t"\
1630 "pand bm11001100, %%mm0 \n\t"\
1631 "paddusb %%mm0, %%mm3 \n\t"\
1632 "psrlq $8, %%mm3 \n\t"\
1633 PAVGB(%%mm3, %%mm2)\
1634 "psrlq $16, %%mm2 \n\t"\
1635 "punpcklbw %%mm2, %%mm2 \n\t"\
1636 "movq %%mm2, (%0) \n\t"\
1638 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1640 Implemented Exact 7-Tap
1653 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1654 "movq %%mm0, %%mm1 \n\t"\
1655 "movq %%mm0, %%mm2 \n\t"\
1656 "movq %%mm0, %%mm3 \n\t"\
1657 "movq %%mm0, %%mm4 \n\t"\
1658 "psllq $8, %%mm1 \n\t"\
1659 "psrlq $8, %%mm2 \n\t"\
1660 "pand bm00000001, %%mm3 \n\t"\
1661 "pand bm10000000, %%mm4 \n\t"\
1662 "por %%mm3, %%mm1 \n\t"\
1663 "por %%mm4, %%mm2 \n\t"\
1664 PAVGB(%%mm2, %%mm1)\
1665 PAVGB(%%mm1, %%mm0)\
1667 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1668 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1669 PAVGB(%%mm3, %%mm4)\
1670 PAVGB(%%mm4, %%mm0)\
1671 "movd %%mm0, (%0) \n\t"\
1672 "psrlq $32, %%mm0 \n\t"\
1673 "movd %%mm0, 4(%0) \n\t"
1675 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1676 "movq %%mm0, %%mm1 \n\t"\
1677 "movq %%mm0, %%mm2 \n\t"\
1678 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1679 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1680 "psllq $8, %%mm1 \n\t"\
1681 "psrlq $8, %%mm2 \n\t"\
1682 "psrlq $24, %%mm3 \n\t"\
1683 "psllq $56, %%mm4 \n\t"\
1684 "por %%mm3, %%mm1 \n\t"\
1685 "por %%mm4, %%mm2 \n\t"\
1686 PAVGB(%%mm2, %%mm1)\
1687 PAVGB(%%mm1, %%mm0)\
1689 "movq %%mm0, %%mm3 \n\t"\
1690 "movq %%mm0, %%mm4 \n\t"\
1691 "movq %%mm0, %%mm5 \n\t"\
1692 "psrlq $16, %%mm3 \n\t"\
1693 "psllq $16, %%mm4 \n\t"\
1694 "pand bm11000000, %%mm5 \n\t"\
1695 "por %%mm5, %%mm3 \n\t"\
1696 "movq %%mm0, %%mm5 \n\t"\
1697 "pand bm00000011, %%mm5 \n\t"\
1698 "por %%mm5, %%mm4 \n\t"\
1699 PAVGB(%%mm3, %%mm4)\
1700 PAVGB(%%mm4, %%mm0)\
1701 "movd %%mm0, (%0) \n\t"\
1702 "psrlq $32, %%mm0 \n\t"\
1703 "movd %%mm0, 4(%0) \n\t"
1706 /* uses the 7-Tap Filter: 1112111 */
1708 "movq " #i "(%%eax), %%mm0 \n\t"\
1709 "movq %%mm0, %%mm1 \n\t"\
1710 "movq %%mm0, %%mm2 \n\t"\
1711 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1712 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1713 "psllq $8, %%mm1 \n\t"\
1714 "psrlq $8, %%mm2 \n\t"\
1715 "psrlq $24, %%mm3 \n\t"\
1716 "psllq $56, %%mm4 \n\t"\
1717 "por %%mm3, %%mm1 \n\t"\
1718 "por %%mm4, %%mm2 \n\t"\
1719 "movq %%mm1, %%mm5 \n\t"\
1720 PAVGB(%%mm2, %%mm1)\
1721 PAVGB(%%mm1, %%mm0)\
1722 "psllq $8, %%mm5 \n\t"\
1723 "psrlq $8, %%mm2 \n\t"\
1724 "por %%mm3, %%mm5 \n\t"\
1725 "por %%mm4, %%mm2 \n\t"\
1726 "movq %%mm5, %%mm1 \n\t"\
1727 PAVGB(%%mm2, %%mm5)\
1728 "psllq $8, %%mm1 \n\t"\
1729 "psrlq $8, %%mm2 \n\t"\
1730 "por %%mm3, %%mm1 \n\t"\
1731 "por %%mm4, %%mm2 \n\t"\
1732 PAVGB(%%mm2, %%mm1)\
1733 PAVGB(%%mm1, %%mm5)\
1734 PAVGB(%%mm5, %%mm0)\
1735 "movd %%mm0, (%0) \n\t"\
1736 "psrlq $32, %%mm0 \n\t"\
1737 "movd %%mm0, 4(%0) \n\t"
1739 /* uses the 9-Tap Filter: 112242211 */
1740 #define NEW_HLP2(i)\
1741 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
1742 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
1743 "movq %%mm0, %%mm2 \n\t" /*0001000*/\
1744 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1745 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1746 "psllq $8, %%mm1 \n\t"\
1747 "psrlq $8, %%mm2 \n\t"\
1748 "psrlq $24, %%mm3 \n\t"\
1749 "psllq $56, %%mm4 \n\t"\
1750 "por %%mm3, %%mm1 \n\t" /*0010000*/\
1751 "por %%mm4, %%mm2 \n\t" /*0000100*/\
1752 "movq %%mm1, %%mm5 \n\t" /*0010000*/\
1753 PAVGB(%%mm2, %%mm1) /*0010100*/\
1754 PAVGB(%%mm1, %%mm0) /*0012100*/\
1755 "psllq $8, %%mm5 \n\t"\
1756 "psrlq $8, %%mm2 \n\t"\
1757 "por %%mm3, %%mm5 \n\t" /*0100000*/\
1758 "por %%mm4, %%mm2 \n\t" /*0000010*/\
1759 "movq %%mm5, %%mm1 \n\t" /*0100000*/\
1760 PAVGB(%%mm2, %%mm5) /*0100010*/\
1761 "psllq $8, %%mm1 \n\t"\
1762 "psrlq $8, %%mm2 \n\t"\
1763 "por %%mm3, %%mm1 \n\t" /*1000000*/\
1764 "por %%mm4, %%mm2 \n\t" /*0000001*/\
1765 "movq %%mm1, %%mm6 \n\t" /*1000000*/\
1766 PAVGB(%%mm2, %%mm1) /*1000001*/\
1767 "psllq $8, %%mm6 \n\t"\
1768 "psrlq $8, %%mm2 \n\t"\
1769 "por %%mm3, %%mm6 \n\t"/*100000000*/\
1770 "por %%mm4, %%mm2 \n\t"/*000000001*/\
1771 PAVGB(%%mm2, %%mm6) /*100000001*/\
1772 PAVGB(%%mm6, %%mm1) /*110000011*/\
1773 PAVGB(%%mm1, %%mm5) /*112000211*/\
1774 PAVGB(%%mm5, %%mm0) /*112242211*/\
1775 "movd %%mm0, (%0) \n\t"\
1776 "psrlq $32, %%mm0 \n\t"\
1777 "movd %%mm0, 4(%0) \n\t"
// HLP is the per-row macro actually used; it forwards to NEW_HLP.
// NOTE(review): presumably NEW_HLP is the 7-tap body above - confirm.
1779 #define HLP(i) NEW_HLP(i)
1799 : "r" (dst), "r" (stride)
// C path: temp is the unfiltered copy of the row; dst receives the result.
1804 uint8_t *temp= tempBlock;
1806 for(y=0; y<BLOCK_SIZE; y++)
// Use the pixel just outside the block as the edge value only when it is
// within QP of the block's own edge pixel; otherwise repeat the edge pixel.
1808 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1809 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// sums[k] holds the sum of two horizontally adjacent pixels, with first and
// last standing in beyond the block edges.
1812 sums[0] = first + temp[0];
1813 sums[1] = temp[0] + temp[1];
1814 sums[2] = temp[1] + temp[2];
1815 sums[3] = temp[2] + temp[3];
1816 sums[4] = temp[3] + temp[4];
1817 sums[5] = temp[4] + temp[5];
1818 sums[6] = temp[5] + temp[6];
1819 sums[7] = temp[6] + temp[7];
1820 sums[8] = temp[7] + last;
// Note on precedence: + binds tighter than <<, so an unparenthesised
// a + b<<1 below is (a + b)<<1. Each result adds 8 before >>4 for rounding.
1822 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1823 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1824 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1825 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1826 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
1827 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1828 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1829 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
// Deringing filter, apparently unfinished: the visible assembly only reduces
// the block to its minimum (mm6) and maximum (mm7) pixel value and averages
// the two with PAVGB.
1838 static inline void dering(uint8_t src[], int stride, int QP)
1844 "leal (%0, %1), %%eax \n\t"
1845 "leal (%%eax, %1, 4), %%ebx \n\t"
1846 // 0 1 2 3 4 5 6 7 8 9
1847 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// NOTE(review): pcmpeq carries no operand-size suffix; the MMX mnemonics are
// pcmpeqb, pcmpeqw and pcmpeqd - confirm this assembles.
1849 "pcmpeq %%mm6, %%mm6 \n\t"
1850 "pxor %%mm7, %%mm7 \n\t"
// NOTE(review): the macro takes a single parameter but is invoked below with
// comma-separated address operands, and its movq line has a stray comma after
// the destination register - confirm.
1852 #define FIND_MIN_MAX(addr)\
1853 "movq (" #addr "), %%mm0, \n\t"\
1854 "pminub %%mm0, %%mm6 \n\t"\
1855 "pmaxub %%mm0, %%mm7 \n\t"
1859 FIND_MIN_MAX(%%eax, %1)
1860 FIND_MIN_MAX(%%eax, %1, 2)
1861 FIND_MIN_MAX(%0, %1, 4)
1863 FIND_MIN_MAX(%%ebx, %1)
1864 FIND_MIN_MAX(%%ebx, %1, 2)
1865 FIND_MIN_MAX(%0, %1, 8)
// NOTE(review): this repeats the (ebx, %1, 2) address used two lines above; the
// address table lists ebx+4%1 for row 9 - confirm which row is intended.
1866 FIND_MIN_MAX(%%ebx, %1, 2)
// Horizontal reduction: fold the 8 byte lanes of mm6 down to one minimum.
1868 "movq %%mm6, %%mm4 \n\t"
1869 "psrlq $32, %%mm6 \n\t"
1870 "pminub %%mm4, %%mm6 \n\t"
1871 "movq %%mm6, %%mm4 \n\t"
1872 "psrlq $16, %%mm6 \n\t"
1873 "pminub %%mm4, %%mm6 \n\t"
1874 "movq %%mm6, %%mm4 \n\t"
1875 "psrlq $8, %%mm6 \n\t"
1876 "pminub %%mm4, %%mm6 \n\t" // min of pixels
// Same reduction for the maximum held in mm7.
1878 "movq %%mm7, %%mm4 \n\t"
1879 "psrlq $32, %%mm7 \n\t"
1880 "pmaxub %%mm4, %%mm7 \n\t"
1881 "movq %%mm7, %%mm4 \n\t"
1882 "psrlq $16, %%mm7 \n\t"
1883 "pmaxub %%mm4, %%mm7 \n\t"
1884 "movq %%mm7, %%mm4 \n\t"
1885 "psrlq $8, %%mm7 \n\t"
1886 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1887 PAVGB(%%mm6, %%mm7) // (max + min)/2
1890 : : "r" (src), "r" (stride), "r" (QP)
1900 * Deinterlaces the given block
1901 * will be called for every 8x8 block, and can read & write into an 8x16 block
// Linear-interpolating deinterlacer. Per the C path, rows 1, 3, 5 and 7 are
// each replaced by the truncated average of the rows directly above and
// below; rows 0, 2, 4, 6 and 8 are only read.
1903 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1905 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// eax = src + stride (row 1), ebx = eax + 4*stride (row 5).
1907 "leal (%0, %1), %%eax \n\t"
1908 "leal (%%eax, %1, 4), %%ebx \n\t"
1909 // 0 1 2 3 4 5 6 7 8 9
1910 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// Loads touch the even rows 0, 2, 4, 6, 8; stores go to the odd rows 1, 3, 5, 7.
1912 "movq (%0), %%mm0 \n\t"
1913 "movq (%%eax, %1), %%mm1 \n\t"
1915 "movq %%mm0, (%%eax) \n\t"
1916 "movq (%0, %1, 4), %%mm0 \n\t"
1918 "movq %%mm1, (%%eax, %1, 2) \n\t"
1919 "movq (%%ebx, %1), %%mm1 \n\t"
1921 "movq %%mm0, (%%ebx) \n\t"
1922 "movq (%0, %1, 8), %%mm0 \n\t"
1924 "movq %%mm1, (%%ebx, %1, 2) \n\t"
1926 : : "r" (src), "r" (stride)
// C path: one pixel column; >>1 truncates.
1933 src[stride] = (src[0] + src[stride*2])>>1;
1934 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1935 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1936 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1943 * Deinterlaces the given block
1944 * will be called for every 8x8 block, and can read & write into an 8x16 block
1945 * no clipping in C version
// Cubic-interpolating deinterlacer. Rows 3, 5, 7 and 9 are rebuilt from the
// four nearest even rows with weights (-1, 9, 9, -1)/16. The MMX path packs
// its result with packuswb (unsigned saturation); the C path stores the
// shifted sum directly, without clamping.
1947 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
1949 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// eax = row 1, ebx = row 5, ecx = ebx + 4*stride + stride = row 10; mm7 = 0
// serves as the zero operand for the byte to word unpacks.
1951 "leal (%0, %1), %%eax \n\t"
1952 "leal (%%eax, %1, 4), %%ebx \n\t"
1953 "leal (%%ebx, %1, 4), %%ecx \n\t"
1954 "addl %1, %%ecx \n\t"
1955 "pxor %%mm7, %%mm7 \n\t"
1956 // 0 1 2 3 4 5 6 7 8 9 10
1957 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
// DEINT_CUBIC(a,b,c,d,e): c = (9b + 9d - a - e)/16, computed as
// (b+d)/2 - ((a+e)/2 - (b+d)/2)/8 on 16-bit lanes.
1959 #define DEINT_CUBIC(a,b,c,d,e)\
1960 "movq " #a ", %%mm0 \n\t"\
1961 "movq " #b ", %%mm1 \n\t"\
1962 "movq " #d ", %%mm2 \n\t"\
1963 "movq " #e ", %%mm3 \n\t"\
1964 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1965 PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1966 "movq %%mm0, %%mm2 \n\t"\
1967 "punpcklbw %%mm7, %%mm0 \n\t"\
1968 "punpckhbw %%mm7, %%mm2 \n\t"\
1969 "movq %%mm1, %%mm3 \n\t"\
1970 "punpcklbw %%mm7, %%mm1 \n\t"\
1971 "punpckhbw %%mm7, %%mm3 \n\t"\
1972 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1973 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1974 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1975 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1976 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1977 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1978 "packuswb %%mm3, %%mm1 \n\t"\
1979 "movq %%mm1, " #c " \n\t"
// The four invocations write rows 3, 5, 7 and 9 in that order.
1981 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1982 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1983 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1984 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1986 : : "r" (src), "r" (stride)
1987 : "%eax", "%ebx", "ecx"
// C path: same weights for one pixel column; reads as far as row 12.
1993 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1994 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1995 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1996 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2003 * Deinterlaces the given block
2004 * will be called for every 8x8 block, and can read & write into an 8x16 block
2005 * will shift the image up by 1 line (FIXME if this is a problem)
// Linear-blend deinterlacer. Per the C path, each of rows 0..7 is replaced in
// place by (row + 2*next + next-but-one)/4, reading as far as row 9. Since the
// weight of 2 falls on the following row, the picture moves up by one line.
2007 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
2009 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// eax = src + stride (row 1), ebx = eax + 4*stride (row 5).
2011 "leal (%0, %1), %%eax \n\t"
2012 "leal (%%eax, %1, 4), %%ebx \n\t"
2013 // 0 1 2 3 4 5 6 7 8 9
2014 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// Rolling scheme: mm0, mm1 and mm2 rotate through the three rows in flight;
// the outer two are averaged first and the result is then averaged with the
// middle row, which gives the 1-2-1 weighting using two PAVGB steps.
2016 "movq (%0), %%mm0 \n\t" // L0
2017 "movq (%%eax, %1), %%mm1 \n\t" // L2
2018 PAVGB(%%mm1, %%mm0) // L0+L2
2019 "movq (%%eax), %%mm2 \n\t" // L1
2021 "movq %%mm0, (%0) \n\t"
2022 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
2023 PAVGB(%%mm0, %%mm2) // L1+L3
2024 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
2025 "movq %%mm2, (%%eax) \n\t"
2026 "movq (%0, %1, 4), %%mm2 \n\t" // L4
2027 PAVGB(%%mm2, %%mm1) // L2+L4
2028 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2029 "movq %%mm1, (%%eax, %1) \n\t"
2030 "movq (%%ebx), %%mm1 \n\t" // L5
2031 PAVGB(%%mm1, %%mm0) // L3+L5
2032 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2033 "movq %%mm0, (%%eax, %1, 2) \n\t"
2034 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2035 PAVGB(%%mm0, %%mm2) // L4+L6
2036 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2037 "movq %%mm2, (%0, %1, 4) \n\t"
2038 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2039 PAVGB(%%mm2, %%mm1) // L5+L7
2040 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2041 "movq %%mm1, (%%ebx) \n\t"
2042 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2043 PAVGB(%%mm1, %%mm0) // L6+L8
2044 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2045 "movq %%mm0, (%%ebx, %1) \n\t"
2046 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2047 PAVGB(%%mm0, %%mm2) // L7+L9
2048 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2049 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2052 : : "r" (src), "r" (stride)
// C path: one pixel column. Every row is overwritten only after the rows
// above it have consumed it, so the update can run in place top to bottom.
2059 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2060 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2061 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2062 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2063 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2064 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2065 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2066 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2073 * Deinterlaces the given block
2074 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
// Median deinterlacer. The assembly paths replace rows 1, 3, 5 and 7 by the
// per-byte median of the row and its two vertical neighbours.
2076 static inline void deInterlaceMedian(uint8_t src[], int stride)
2081 "leal (%0, %1), %%eax \n\t"
2082 "leal (%%eax, %1, 4), %%ebx \n\t"
2083 // 0 1 2 3 4 5 6 7 8 9
2084 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// pmaxub/pminub path: median(a,b,c) = min(max(a,b), max(min(a,b), c)).
// Rows 0, 1, 2 -> row 1.
2086 "movq (%0), %%mm0 \n\t" //
2087 "movq (%%eax, %1), %%mm2 \n\t" //
2088 "movq (%%eax), %%mm1 \n\t" //
2089 "movq %%mm0, %%mm3 \n\t"
2090 "pmaxub %%mm1, %%mm0 \n\t" //
2091 "pminub %%mm3, %%mm1 \n\t" //
2092 "pmaxub %%mm2, %%mm1 \n\t" //
2093 "pminub %%mm1, %%mm0 \n\t"
2094 "movq %%mm0, (%%eax) \n\t"
// Rows 2, 3, 4 -> row 3 (row 2 is still held in mm2).
2096 "movq (%0, %1, 4), %%mm0 \n\t" //
2097 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2098 "movq %%mm2, %%mm3 \n\t"
2099 "pmaxub %%mm1, %%mm2 \n\t" //
2100 "pminub %%mm3, %%mm1 \n\t" //
2101 "pmaxub %%mm0, %%mm1 \n\t" //
2102 "pminub %%mm1, %%mm2 \n\t"
2103 "movq %%mm2, (%%eax, %1, 2) \n\t"
// Rows 4, 5, 6 -> row 5 (row 4 is still held in mm0).
2105 "movq (%%ebx), %%mm2 \n\t" //
2106 "movq (%%ebx, %1), %%mm1 \n\t" //
2107 "movq %%mm2, %%mm3 \n\t"
2108 "pmaxub %%mm0, %%mm2 \n\t" //
2109 "pminub %%mm3, %%mm0 \n\t" //
2110 "pmaxub %%mm1, %%mm0 \n\t" //
2111 "pminub %%mm0, %%mm2 \n\t"
2112 "movq %%mm2, (%%ebx) \n\t"
// Rows 6, 7, 8 -> row 7 (row 6 is still held in mm1).
2114 "movq (%%ebx, %1, 2), %%mm2 \n\t" //
2115 "movq (%0, %1, 8), %%mm0 \n\t" //
2116 "movq %%mm2, %%mm3 \n\t"
2117 "pmaxub %%mm0, %%mm2 \n\t" //
2118 "pminub %%mm3, %%mm0 \n\t" //
2119 "pmaxub %%mm1, %%mm0 \n\t" //
2120 "pminub %%mm0, %%mm2 \n\t"
2121 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2124 : : "r" (src), "r" (stride)
2128 #else // MMX without MMX2
2130 "leal (%0, %1), %%eax \n\t"
2131 "leal (%%eax, %1, 4), %%ebx \n\t"
2132 // 0 1 2 3 4 5 6 7 8 9
2133 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2134 "pxor %%mm7, %%mm7 \n\t"
// MEDIAN(a,b,c) for plain MMX: builds per-byte comparison masks from
// saturating subtractions compared against zero (mm7), combines them with
// pxor/por/pand, and stores the result into the middle operand b.
2136 #define MEDIAN(a,b,c)\
2137 "movq " #a ", %%mm0 \n\t"\
2138 "movq " #b ", %%mm2 \n\t"\
2139 "movq " #c ", %%mm1 \n\t"\
2140 "movq %%mm0, %%mm3 \n\t"\
2141 "movq %%mm1, %%mm4 \n\t"\
2142 "movq %%mm2, %%mm5 \n\t"\
2143 "psubusb %%mm1, %%mm3 \n\t"\
2144 "psubusb %%mm2, %%mm4 \n\t"\
2145 "psubusb %%mm0, %%mm5 \n\t"\
2146 "pcmpeqb %%mm7, %%mm3 \n\t"\
2147 "pcmpeqb %%mm7, %%mm4 \n\t"\
2148 "pcmpeqb %%mm7, %%mm5 \n\t"\
2149 "movq %%mm3, %%mm6 \n\t"\
2150 "pxor %%mm4, %%mm3 \n\t"\
2151 "pxor %%mm5, %%mm4 \n\t"\
2152 "pxor %%mm6, %%mm5 \n\t"\
2153 "por %%mm3, %%mm1 \n\t"\
2154 "por %%mm4, %%mm2 \n\t"\
2155 "por %%mm5, %%mm0 \n\t"\
2156 "pand %%mm2, %%mm0 \n\t"\
2157 "pand %%mm1, %%mm0 \n\t"\
2158 "movq %%mm0, " #b " \n\t"
2160 MEDIAN((%0), (%%eax), (%%eax, %1))
2161 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2162 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2163 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
2165 : : "r" (src), "r" (stride)
// NOTE(review): the C fallback below computes the 1-2-1 linear blend
// (row + 2*next + next-but-one)/4 for rows 0..7, not a median - confirm that
// this stand-in is intended.
2174 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2175 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2176 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2177 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2178 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2179 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2180 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2181 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2187 #ifdef HAVE_ODIVX_POSTPROCESS
2188 #include "../opendivx/postprocess.h"
2192 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2193 QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2195 /* -pp Command line Help
2196 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2198 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2201 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2203 -pp vb:a,hb:a,lb -pp de,-vb
2206 short long name short long option Description
2207 * * a autoq cpu power dependent enabler
2208 c chrom chrominance filtering enabled
2209 y nochrom chrominance filtering disabled
2210 hb hdeblock horizontal deblocking filter
2211 vb vdeblock vertical deblocking filter
2213 h1 x1hdeblock Experimental horizontal deblock filter 1
2214 v1 x1vdeblock Experimental vertical deblock filter 1
2215 dr dering not implemented yet
2216 al autolevels automatic brightness / contrast fixer
2217 f fullyrange stretch luminance range to (0..255)
2218 lb linblenddeint linear blend deinterlacer
2219 li linipoldeint linear interpolating deinterlacer
2220 ci cubicipoldeint cubic interpolating deinterlacer
2221 md mediandeint median deinterlacer
2222 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2223 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2227 * returns a PPMode struct which will have a non 0 error variable if an error occurred
2228 * name is the string after "-pp" on the command line
2229 * quality is a number from 0 to GET_PP_QUALITY_MAX
// Parses a comma-separated list of filter names, each optionally followed by
// colon-separated options, into a PPMode. Every unknown filter name or
// unrecognised option increments ppMode.error.
2231 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2233 char temp[GET_MODE_BUFFER_SIZE];
2235 char *filterDelimiters= ",";
2236 char *optionDelimiters= ":";
2237 struct PPMode ppMode= {0,0,0,0,0,0};
// Work on a private copy because strtok writes into the string it scans.
// NOTE(review): strncpy leaves temp without a terminating NUL when name has
// GET_MODE_BUFFER_SIZE or more characters - confirm callers bound the length.
2240 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2245 int q= GET_PP_QUALITY_MAX;
2248 char *options[OPTIONS_ARRAY_SIZE];
2251 int numOfUnknownOptions=0;
2252 int enable=1; //does the user want us to enable or disable the filter
// strtok keeps a single internal scan position, and the option tokenising
// below restarts it on filterToken, so p is advanced by hand past the filter
// token just returned.
2254 filterToken= strtok(p, filterDelimiters);
2255 if(filterToken == NULL) break;
2256 p+= strlen(filterToken) + 1;
2257 filterName= strtok(filterToken, optionDelimiters);
// NOTE(review): this and the option printf below look like leftover debug output.
2258 printf("%s::%s\n", filterToken, filterName);
2260 if(*filterName == '-')
2265 for(;;){ //for all options
2266 option= strtok(NULL, optionDelimiters);
2267 if(option == NULL) break;
2269 printf("%s\n", option);
2270 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2271 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2272 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
// Anything else is collected in options[], kept NULL-terminated, for the
// filter-specific handling further down.
2275 options[numOfUnknownOptions] = option;
2276 numOfUnknownOptions++;
2277 options[numOfUnknownOptions] = NULL;
2279 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2282 /* replace stuff from the replace Table */
2283 for(i=0; replaceTable[2*i]!=NULL; i++)
2285 if(!strcmp(replaceTable[2*i], filterName))
2287 int newlen= strlen(replaceTable[2*i + 1]);
2291 if(p==NULL) p= temp, *p=0; //last filter
2292 else p--, *p=','; //not last filter
// NOTE(review): casting pointers to int truncates on targets where pointers
// are wider than int; a plain pointer difference (p - temp) avoids the casts.
2295 spaceLeft= (int)p - (int)temp + plen;
2296 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// Shift the unparsed remainder right and splice the replacement text in before it.
2301 memmove(p + newlen, p, plen+1);
2302 memcpy(p, replaceTable[2*i + 1], newlen);
2307 for(i=0; filters[i].shortName!=NULL; i++)
2309 if( !strcmp(filters[i].longName, filterName)
2310 || !strcmp(filters[i].shortName, filterName))
// Clear the filter bit first, so that a later mention of the same filter
// overrides an earlier one.
2312 ppMode.lumMode &= ~filters[i].mask;
2313 ppMode.chromMode &= ~filters[i].mask;
2316 if(!enable) break; // user wants to disable it
2318 if(q >= filters[i].minLumQuality)
2319 ppMode.lumMode|= filters[i].mask;
// chrom == -1 means the user gave neither chrom nor nochrom, so the filter default applies.
2320 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2321 if(q >= filters[i].minChromQuality)
2322 ppMode.chromMode|= filters[i].mask;
2324 if(filters[i].mask == LEVEL_FIX)
// NOTE(review): 234 as the upper luma bound - the limited video range is
// commonly 16..235; confirm the value.
2327 ppMode.minAllowedY= 16;
2328 ppMode.maxAllowedY= 234;
2329 for(o=0; options[o]!=NULL; o++)
2330 if( !strcmp(options[o],"fullyrange")
2331 ||!strcmp(options[o],"f"))
2333 ppMode.minAllowedY= 0;
2334 ppMode.maxAllowedY= 255;
// The option was recognised after all, so take it out of the unknown count.
2335 numOfUnknownOptions--;
2340 if(!filterNameOk) ppMode.error++;
2341 ppMode.error += numOfUnknownOptions;
// Mirror the new per-plane masks into the legacy oldMode flag word.
2344 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2345 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2346 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2347 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2348 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2349 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
// Legacy entry point taking one integer mode word. src and dst are arrays of
// plane pointers: index 0 is processed as luma (isColor 0), indices 1 and 2
// as the colour planes (isColor 1 and 2).
2357 void postprocess(unsigned char * src[], int src_stride,
2358 unsigned char * dst[], int dst_stride,
2359 int horizontal_size, int vertical_size,
2360 QP_STORE_T *QP_store, int QP_stride,
// NOTE(review): the filter string is hard-coded here and the result is dumped
// with printf before being handed to postprocess2; this looks like test code -
// confirm.
2366 struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2369 printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2370 postprocess2(src, src_stride, dst, dst_stride,
2371 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2376 #ifdef HAVE_ODIVX_POSTPROCESS
2377 // Note: I could make this shit outside of this file, but it would mean one
2378 // more function call...
2380 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2385 postProcess(src[0], src_stride, dst[0], dst_stride,
2386 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
// The colour planes are processed at half the luma width and height.
2388 horizontal_size >>= 1;
2389 vertical_size >>= 1;
// Move bits 4..7 of the mode word down into bits 0..3 and keep everything
// above bit 7 unchanged.
2392 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2396 postProcess(src[1], src_stride, dst[1], dst_stride,
2397 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2398 postProcess(src[2], src_stride, dst[2], dst_stride,
2399 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
// NOTE(review): the copy length is src_stride*horizontal_size, a stride times
// a width; a whole-plane copy would normally be stride times the row count
// (vertical_size) - confirm.
2403 memcpy(dst[1], src[1], src_stride*horizontal_size);
2404 memcpy(dst[2], src[2], src_stride*horizontal_size);
// postprocess2(): like postprocess(), but the filter flags come from a PPMode:
// plane 0 is filtered with mode->lumMode, planes 1 and 2 with mode->chromMode
// at half the width and height.
// NOTE(review): this listing skips lines (gaps in the embedded line numbers);
// braces, the condition guarding the odivx path and the end of its argument
// list are not visible - confirm against the complete file.
2408 void postprocess2(unsigned char * src[], int src_stride,
2409 unsigned char * dst[], int dst_stride,
2410 int horizontal_size, int vertical_size,
2411 QP_STORE_T *QP_store, int QP_stride,
2412 struct PPMode *mode)
2415 #ifdef HAVE_ODIVX_POSTPROCESS
2416 // Note: I could make this shit outside of this file, but it would mean one
2417 // more function call...
// NOTE(review): the argument list of this call continues on a line that is
// missing from this listing.
2419 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
// Plane 0 at the full size (isColor argument 0) with the luma flags.
2425 postProcess(src[0], src_stride, dst[0], dst_stride,
2426 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
// The remaining planes are handled at half the width and half the height.
2428 horizontal_size >>= 1;
2429 vertical_size >>= 1;
// Planes 1 and 2 (isColor argument 1 and 2) with the chroma flags.
2433 postProcess(src[1], src_stride, dst[1], dst_stride,
2434 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2435 postProcess(src[2], src_stride, dst[2], dst_stride,
2436 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2441 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
2444 int getPpModeForQuality(int quality){
// Returns the entry at index `quality` of a table of filter-flag combinations.
// The table has 1+GET_PP_QUALITY_MAX slots; each visible entry adds one more
// flag to the previous one.
// NOTE(review): `quality` is used as an array index with no range check in the
// visible lines, so callers presumably must pass 0..GET_PP_QUALITY_MAX - confirm.
// NOTE(review): this listing skips lines (gaps in the embedded line numbers):
// some table entries, the closing braces and the preprocessor lines that
// presumably select one of the two orderings below are not visible.
2445 int modes[1+GET_PP_QUALITY_MAX]= {
2448 // horizontal filters first
2450 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2451 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2452 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2453 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2454 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2456 // vertical filters first
2458 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2459 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2460 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2461 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2462 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2466 #ifdef HAVE_ODIVX_POSTPROCESS
// Second table of the same size built from the PP_* flags; it is the one
// consulted when use_old_pp is set.
2467 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2470 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2471 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2472 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2473 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2474 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2476 if(use_old_pp) return odivx_modes[quality];
2478 return modes[quality];
2482 * Copies a block from src to dst and fixes the black level
2483 * numLines must be a multiple of 4
2484 * levelFix == 0 -> don't touch the brightness & contrast
2486 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2487 int numLines, int levelFix)
// Two copy variants are visible below: a "scaled" one (SCALED_CPY) that
// subtracts packedYOffset from every byte and multiplies by packedYScale, and a
// plain one (SIMPLE_CPY). Each is followed by a C loop that copies the rows
// with memcpy.
// NOTE(review): this listing skips lines (gaps in the embedded line numbers):
// the condition choosing a variant (presumably levelFix), the #ifdef/#else
// lines, the macro invocations and the asm operand lists are not visible -
// confirm against the complete file.
//
// Setup for the scaled copy: eax = 2 * operand 2 and ebx = 2 * operand 3 (used
// to step two rows at a time), mm2 = packedYOffset, mm3 = packedYScale,
// mm4 = 0 (zero register for widening bytes to words).
2496 "leal (%2,%2), %%eax \n\t"
2497 "leal (%3,%3), %%ebx \n\t"
2498 "movq packedYOffset, %%mm2 \n\t"
2499 "movq packedYScale, %%mm3 \n\t"
2500 "pxor %%mm4, %%mm4 \n\t"
// SCALED_CPY handles two rows of 8 bytes: load from (%0) and (%0,%2), widen the
// bytes to 16-bit words, subtract mm2, shift left by 6, multiply by mm3 keeping
// the high 16 bits (pmulhw), pack back to bytes with unsigned saturation
// (packuswb) and store to (%1) and (%1,%3). The macro advances %0 by eax
// itself; %1 is advanced by the separate "addl %%ebx, %1" lines after it.
// presumably %0/%2 are the source pointer/stride and %1/%3 the destination
// pointer/stride - TODO confirm against the operand list.
2502 #define SCALED_CPY \
2503 "movq (%0), %%mm0 \n\t"\
2504 "movq (%0), %%mm5 \n\t"\
2505 "punpcklbw %%mm4, %%mm0 \n\t"\
2506 "punpckhbw %%mm4, %%mm5 \n\t"\
2507 "psubw %%mm2, %%mm0 \n\t"\
2508 "psubw %%mm2, %%mm5 \n\t"\
2509 "movq (%0,%2), %%mm1 \n\t"\
2510 "psllw $6, %%mm0 \n\t"\
2511 "psllw $6, %%mm5 \n\t"\
2512 "pmulhw %%mm3, %%mm0 \n\t"\
2513 "movq (%0,%2), %%mm6 \n\t"\
2514 "pmulhw %%mm3, %%mm5 \n\t"\
2515 "punpcklbw %%mm4, %%mm1 \n\t"\
2516 "punpckhbw %%mm4, %%mm6 \n\t"\
2517 "psubw %%mm2, %%mm1 \n\t"\
2518 "psubw %%mm2, %%mm6 \n\t"\
2519 "psllw $6, %%mm1 \n\t"\
2520 "psllw $6, %%mm6 \n\t"\
2521 "pmulhw %%mm3, %%mm1 \n\t"\
2522 "pmulhw %%mm3, %%mm6 \n\t"\
2523 "addl %%eax, %0 \n\t"\
2524 "packuswb %%mm5, %%mm0 \n\t"\
2525 "packuswb %%mm6, %%mm1 \n\t"\
2526 "movq %%mm0, (%1) \n\t"\
2527 "movq %%mm1, (%1, %3) \n\t"\
2530 "addl %%ebx, %1 \n\t"
2532 "addl %%ebx, %1 \n\t"
2534 "addl %%ebx, %1 \n\t"
// Plain C copy: BLOCK_SIZE bytes per row for numLines rows; no offset or scale
// is applied here.
// presumably the non-MMX fallback of the scaled path - TODO confirm.
2546 for(i=0; i<numLines; i++)
2547 memcpy( &(dst[dstStride*i]),
2548 &(src[srcStride*i]), BLOCK_SIZE);
// Stores operand 4 into the symbol temp0 by way of eax.
// NOTE(review): what operand 4 is and how temp0 is used afterwards is not
// visible in this listing.
2555 "movl %4, %%eax \n\t"
2556 "movl %%eax, temp0\n\t"
// Same register setup as for the scaled copy; mm2/mm3 are loaded although the
// SIMPLE_CPY macro below does not reference them.
2559 "leal (%2,%2), %%eax \n\t"
2560 "leal (%3,%3), %%ebx \n\t"
2561 "movq packedYOffset, %%mm2 \n\t"
2562 "movq packedYScale, %%mm3 \n\t"
// SIMPLE_CPY copies two rows of 8 bytes unchanged: (%0) -> (%1) and
// (%0,%2) -> (%1,%3). Unlike SCALED_CPY it does not advance any pointer; the
// addl pairs after it step %0 by eax and %1 by ebx.
2564 #define SIMPLE_CPY \
2565 "movq (%0), %%mm0 \n\t"\
2566 "movq (%0,%2), %%mm1 \n\t"\
2567 "movq %%mm0, (%1) \n\t"\
2568 "movq %%mm1, (%1, %3) \n\t"\
2572 "addl %%eax, %0 \n\t"
2573 "addl %%ebx, %1 \n\t"
2575 "addl %%eax, %0 \n\t"
2576 "addl %%ebx, %1 \n\t"
// Plain C copy for this path: BLOCK_SIZE bytes per row for numLines rows.
2590 for(i=0; i<numLines; i++)
2591 memcpy( &(dst[dstStride*i]),
2592 &(src[srcStride*i]), BLOCK_SIZE);
2599 * Filters array of bytes (Y or U or V values)
2601 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2602 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2605 /* we need 64 bits here, otherwise we are going to have a problem
2606 after watching a black picture for 5 hours */
2607 static uint64_t *yHistogram= NULL;
2608 int black=0, white=255; // blackest black and whitest white in the picture
2610 /* Temporary buffers for handling the last row(s) */
2611 static uint8_t *tempDst= NULL;
2612 static uint8_t *tempSrc= NULL;
2614 /* Temporary buffers for handling the last block */
2615 static uint8_t *tempDstBlock= NULL;
2616 static uint8_t *tempSrcBlock= NULL;
2618 uint8_t *dstBlockPtrBackup;
2619 uint8_t *srcBlockPtrBackup;
2622 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2628 tempDst= (uint8_t*)memalign(8, 1024*24);
2629 tempSrc= (uint8_t*)memalign(8, 1024*24);
2630 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2631 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2637 yHistogram= (uint64_t*)malloc(8*256);
2638 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2640 if(mode & FULL_Y_RANGE)
2651 static int framenum= -1;
2652 uint64_t maxClipped;
2657 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2659 for(i=0; i<256; i++)
2661 sum+= yHistogram[i];
2662 // printf("%d ", yHistogram[i]);
2666 /* we always get a completely black picture first */
2667 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2670 for(black=255; black>0; black--)
2672 if(clipped < maxClipped) break;
2673 clipped-= yHistogram[black];
2677 for(white=0; white<256; white++)
2679 if(clipped < maxClipped) break;
2680 clipped-= yHistogram[white];
2683 packedYOffset= (black - minAllowedY) & 0xFFFF;
2684 packedYOffset|= packedYOffset<<32;
2685 packedYOffset|= packedYOffset<<16;
2687 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2689 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2690 packedYScale|= packedYScale<<32;
2691 packedYScale|= packedYScale<<16;
2695 packedYScale= 0x0100010001000100LL;
2699 /* copy first row of 8x8 blocks */
2700 for(x=0; x<width; x+=BLOCK_SIZE)
2701 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2703 for(y=0; y<height; y+=BLOCK_SIZE)
2705 //1% speedup if these are here instead of the inner loop
2706 uint8_t *srcBlock= &(src[y*srcStride]);
2707 uint8_t *dstBlock= &(dst[y*dstStride]);
2709 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards? if not,
2710 then use a temporary buffer */
2713 /* copy from line 5 to 12 of src, these will be copied with
2714 blockCopy to dst later */
2715 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2716 srcStride*MAX(height-y-5, 0) );
2718 /* duplicate last line to fill the void up to line 12 */
2722 for(i=height-y; i<=12; i++)
2723 memcpy(tempSrc + srcStride*i,
2724 src + srcStride*(height-1), srcStride);
2728 /* copy up to 5 lines of dst */
2729 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
2734 // From this point on it is guaranteed that we can read and write 16 lines downward
2735 // finish 1 block before the next, otherwise we might have a problem
2736 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2737 for(x=0; x<width; x+=BLOCK_SIZE)
2739 const int stride= dstStride;
2741 QPs[(y>>3)*QPStride + (x>>3)]:
2742 QPs[(y>>4)*QPStride + (x>>4)];
2743 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
2746 "movd %0, %%mm7 \n\t"
2747 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2748 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2749 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
2750 "movq %%mm7, pQPb \n\t"
2760 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2761 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2762 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2763 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2764 #elif defined(HAVE_3DNOW)
2765 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
2766 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2767 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2768 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2769 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2773 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
2775 #ifdef PP_FUNNY_STRIDE
2776 //can we mess with a 8x16 block, if not use a temp buffer, yes again
2780 dstBlockPtrBackup= dstBlock;
2781 srcBlockPtrBackup= srcBlock;
2783 for(i=0;i<BLOCK_SIZE*2; i++)
2785 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2786 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2789 dstBlock= tempDstBlock;
2790 srcBlock= tempSrcBlock;
2794 blockCopy(dstBlock + dstStride*5, dstStride,
2795 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
2797 if(mode & LINEAR_IPOL_DEINT_FILTER)
2798 deInterlaceInterpolateLinear(dstBlock, dstStride);
2799 else if(mode & LINEAR_BLEND_DEINT_FILTER)
2800 deInterlaceBlendLinear(dstBlock, dstStride);
2801 else if(mode & MEDIAN_DEINT_FILTER)
2802 deInterlaceMedian(dstBlock, dstStride);
2803 else if(mode & CUBIC_IPOL_DEINT_FILTER)
2804 deInterlaceInterpolateCubic(dstBlock, dstStride);
2805 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
2806 deInterlaceBlendCubic(dstBlock, dstStride);
2809 /* only deblock if we have 2 blocks */
2817 if(mode & V_RK1_FILTER)
2818 vertRK1Filter(dstBlock, stride, QP);
2819 else if(mode & V_X1_FILTER)
2820 vertX1Filter(dstBlock, stride, QP);
2821 else if(mode & V_DEBLOCK)
2823 if( isVertDC(dstBlock, stride))
2825 if(isVertMinMaxOk(dstBlock, stride, QP))
2826 doVertLowPass(dstBlock, stride, QP);
2829 doVertDefFilter(dstBlock, stride, QP);
2838 /* check if we have a previous block to deblock it with dstBlock */
2844 if(mode & H_X1_FILTER)
2845 horizX1Filter(dstBlock-4, stride, QP);
2846 else if(mode & H_DEBLOCK)
2848 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
2850 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2851 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2854 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
2861 dering(dstBlock - 9 - stride, stride, QP);
2864 dering(dstBlock - stride*9 + width-9, stride, QP);
2865 //FIXME dering filter will not be applied to last block (bottom right)
2867 #ifdef PP_FUNNY_STRIDE
2868 /* did we use a tmp-block buffer */
2872 dstBlock= dstBlockPtrBackup;
2873 srcBlock= srcBlockPtrBackup;
2875 for(i=0;i<BLOCK_SIZE*2; i++)
2877 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
2886 /* did we use a tmp buffer */
2889 uint8_t *dstBlock= &(dst[y*dstStride]);
2890 memcpy(dstBlock, tempDst, dstStride*(height-y) );
2894 asm volatile("femms");
2895 #elif defined (HAVE_MMX)
2896 asm volatile("emms");
2900 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
2901 sumTime= rdtsc() - sumTime;
2903 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
2904 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2905 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)