2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 doVertDefFilter Ec Ec Ec
28 doHorizDefFilter Ec Ec Ec
30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a
34 LinIpolDeinterlace e E E*
35 CubicIpolDeinterlace a e e*
36 LinBlendDeinterlace e E E*
37 MedianDeinterlace Ec Ec
40 * I don't have a 3DNow! CPU -> it's untested
41 E = Exact implementation
42 e = almost exact implementation (slightly different rounding,...)
43 a = alternative / approximate impl
44 c = checked against the other implementations (-vo md5)
49 verify that everything works as it should (how?)
50 reduce the time wasted on the mem transfer
52 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP
56 write a faster and higher quality deblocking filter :)
57 do something about the speed of the horizontal filters
58 make the mainloop more flexible (variable number of blocks at once
59 (the if/else stuff per block is slowing things down)
60 compare the quality & speed of all filters
62 fix warnings (unused vars, ...)
63 noise reduction filters
71 //Changelog: use the CVS log
73 #include "../config.h"
84 #include "postprocess.h"
// Helper macros; every argument is expanded more than once, so avoid passing expressions with side effects.
86 #define MIN(a,b) ((a) > (b) ? (b) : (a))
87 #define MAX(a,b) ((a) < (b) ? (b) : (a))
88 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
89 #define SIGN(a) ((a) > 0 ? 1 : -1) // evaluates to -1 for a == 0 as well as for negative a
// Asm string fragment for a byte-wise average of a into b, using the "pavgb" instruction.
92 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
93 #elif defined (HAVE_3DNOW)
// Same macro interface, emitting "pavgusb" instead.
94 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
// Unsigned byte minimum via "pminub"; the result lands in the second argument, t is not used by this variant.
98 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
99 #elif defined (HAVE_MMX)
// Plain-MMX emulation: t = a; t = saturate(t - b); a = a - t, i.e. a = min(a, b).
// The first two parameter names are swapped relative to the variant above, so the result
// still lands in the second macro argument; t is clobbered as scratch.
100 #define PMINUB(b,a,t) \
101 "movq " #a ", " #t " \n\t"\
102 "psubusb " #b ", " #t " \n\t"\
103 "psubb " #t ", " #a " \n\t"
// Unsigned byte maximum via "pmaxub"; the result lands in the second argument.
107 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
108 #elif defined (HAVE_MMX)
// Plain-MMX emulation: b = saturate(b - a); b = b + a, i.e. b = max(a, b).
109 #define PMAXUB(a,b) \
110 "psubusb " #a ", " #b " \n\t"\
111 "paddb " #a ", " #b " \n\t"
// Size constants; their users are outside this view (presumably the mode-string parsing code — confirm).
115 #define GET_MODE_BUFFER_SIZE 500
116 #define OPTIONS_ARRAY_SIZE 10
// 64-bit constants and scratch slots referenced by name from the inline asm in this file (8-byte aligned).
// Naming scheme, as can be read off the initializers:
//   wNN        - the 16-bit hex value NN repeated in all four words
//   bmDDDDDDDD - byte mask; each digit D (most significant byte first) is 1 for 0xFF, 0 for 0x00
//   bNN        - the byte 0xNN repeated eight times
119 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
120 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
121 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
122 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
123 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
124 static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
125 static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
126 static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
127 static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
128 static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
129 static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
130 static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
131 static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
132 static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
133 static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
134 static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
135 static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
136 static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
137 static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
138 static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
139 static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
140 static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
141 static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
142 static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
143 static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
144 static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
145 static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
146 static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
147 static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
// temp0..temp5: memory scratch slots the asm spills MMX registers into (e.g. "movq %%mm0, temp0").
148 static uint64_t __attribute__((aligned(8))) temp0=0;
149 static uint64_t __attribute__((aligned(8))) temp1=0;
150 static uint64_t __attribute__((aligned(8))) temp2=0;
151 static uint64_t __attribute__((aligned(8))) temp3=0;
152 static uint64_t __attribute__((aligned(8))) temp4=0;
153 static uint64_t __attribute__((aligned(8))) temp5=0;
// pQPb is read by the asm as "QP,..., QP" (QP in every byte); where pQPb/pQPb2 are written is outside this view.
154 static uint64_t __attribute__((aligned(8))) pQPb=0;
155 static uint64_t __attribute__((aligned(8))) pQPb2=0;
156 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
// NOTE(review): the three definitions below repeat names defined above without the alignment attribute;
// presumably they are the alternative branch of a preprocessor conditional whose directives are not visible here.
158 static uint64_t packedYOffset= 0x0000000000000000LL;
159 static uint64_t packedYScale= 0x0100010001000100LL;
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
// vFlatnessThreshold is the value isVertDC compares its near-equal pixel count against;
// hFlatnessThreshold presumably plays the same role for a horizontal check not visible here.
163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16;
166 // amount of "black" you are willing to lose to get a brightness-corrected picture
167 double maxClippedThreshold= 0.01;
// Table of the known filters, one row per filter, terminated by an all-NULL/zero row.
// The first two strings are a short and a long name for the same filter; the meaning of the three
// integers and the final mask comes from struct PPFilter (postprocess.h), which is not visible here.
172 static struct PPFilter filters[]=
174 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
175 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
// NOTE(review): the row below pairs "v" names with an H_ mask, unlike every other h/v row — confirm this is intended.
176 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
177 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
178 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
179 {"dr", "dering", 1, 5, 6, DERING},
180 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
181 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
185 {NULL, NULL,0,0,0,0} //End Marker
// Flat list of string pairs: a name followed by a comma-separated filter list written with the
// long filter names. Presumably the name is replaced by the list when a mode string is parsed
// (the code using this table is not visible here). "de" and "fa" carry the same lists as
// "default" and "fast".
188 static char *replaceTable[]=
190 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels",
191 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels",
192 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
193 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
// Mentions every file-scope constant in one expression so that none of them is reported as unused
// (purpose per the function name). The b00=0 store only happens when the whole sum equals zero.
198 static inline void unusedVariableWarningFixer()
201 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
202 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
203 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
204 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
205 + temp5 + pQPb== 0) b00=0;
// Executes the x86 "rdtsc" instruction; the asm operand list and the return statement are not part of this view.
210 static inline long long rdtsc()
213 asm volatile( "rdtsc\n\t"
216 // printf("%d\n", int(l/1000));
// Issues a "prefetchnta" instruction for the address p.
222 static inline void prefetchnta(void *p)
224 asm volatile( "prefetchnta (%0)\n\t"
// Issues a "prefetcht0" instruction for the address p.
229 static inline void prefetcht0(void *p)
231 asm volatile( "prefetcht0 (%0)\n\t"
// Issues a "prefetcht1" instruction for the address p.
236 static inline void prefetcht1(void *p)
238 asm volatile( "prefetcht1 (%0)\n\t"
// Issues a "prefetcht2" instruction for the address p.
243 static inline void prefetcht2(void *p)
245 asm volatile( "prefetcht2 (%0)\n\t"
251 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
253 * Check if the middle 8x8 Block in the given 8x16 block is flat
// Counts how many vertically adjacent pixel pairs in the block differ by at most 1 and
// returns 1 if that count is greater than vFlatnessThreshold, otherwise 0.
// src: top of the 8x16 block (advanced by 4 lines below); stride: byte distance between lines.
// Differences are taken modulo 256, so 255 vs 0 also counts as "differs by 1".
255 static inline int isVertDC(uint8_t src[], int stride){
260 src+= stride*4; // src points to begin of the 8x8 Block
263 "leal (%1, %2), %%eax \n\t"
264 "leal (%%eax, %2, 4), %%ebx \n\t"
265 // 0 1 2 3 4 5 6 7 8 9
266 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
267 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte
268 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte
// Per line pair: (a - b) + 0x7E is greater than 0x7C as a signed byte only for a - b in {-1, 0, 1},
// so pcmpgtb yields 0xFF (-1) for "nearly equal" bytes; the -1 values are accumulated in mm0.
269 "movq (%1), %%mm0 \n\t"
270 "movq (%%eax), %%mm1 \n\t"
271 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
272 "paddb %%mm7, %%mm0 \n\t"
273 "pcmpgtb %%mm6, %%mm0 \n\t"
275 "movq (%%eax,%2), %%mm2 \n\t"
276 "psubb %%mm2, %%mm1 \n\t"
277 "paddb %%mm7, %%mm1 \n\t"
278 "pcmpgtb %%mm6, %%mm1 \n\t"
279 "paddb %%mm1, %%mm0 \n\t"
281 "movq (%%eax, %2, 2), %%mm1 \n\t"
282 "psubb %%mm1, %%mm2 \n\t"
283 "paddb %%mm7, %%mm2 \n\t"
284 "pcmpgtb %%mm6, %%mm2 \n\t"
285 "paddb %%mm2, %%mm0 \n\t"
287 "movq (%1, %2, 4), %%mm2 \n\t"
288 "psubb %%mm2, %%mm1 \n\t"
289 "paddb %%mm7, %%mm1 \n\t"
290 "pcmpgtb %%mm6, %%mm1 \n\t"
291 "paddb %%mm1, %%mm0 \n\t"
293 "movq (%%ebx), %%mm1 \n\t"
294 "psubb %%mm1, %%mm2 \n\t"
295 "paddb %%mm7, %%mm2 \n\t"
296 "pcmpgtb %%mm6, %%mm2 \n\t"
297 "paddb %%mm2, %%mm0 \n\t"
299 "movq (%%ebx, %2), %%mm2 \n\t"
300 "psubb %%mm2, %%mm1 \n\t"
301 "paddb %%mm7, %%mm1 \n\t"
302 "pcmpgtb %%mm6, %%mm1 \n\t"
303 "paddb %%mm1, %%mm0 \n\t"
305 "movq (%%ebx, %2, 2), %%mm1 \n\t"
306 "psubb %%mm1, %%mm2 \n\t"
307 "paddb %%mm7, %%mm2 \n\t"
308 "pcmpgtb %%mm6, %%mm2 \n\t"
309 "paddb %%mm2, %%mm0 \n\t"
// Horizontal add of the eight per-column byte sums; two alternative reductions appear below
// (pshufw based and shift based) — the preprocessor lines selecting between them are not visible here.
312 "movq %%mm0, %%mm1 \n\t"
313 "psrlw $8, %%mm0 \n\t"
314 "paddb %%mm1, %%mm0 \n\t"
316 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
317 "paddb %%mm1, %%mm0 \n\t"
318 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
320 "movq %%mm0, %%mm1 \n\t"
321 "psrlq $16, %%mm0 \n\t"
322 "paddb %%mm1, %%mm0 \n\t"
323 "movq %%mm0, %%mm1 \n\t"
324 "psrlq $32, %%mm0 \n\t"
326 "paddb %%mm1, %%mm0 \n\t"
327 "movd %%mm0, %0 \n\t"
329 : "r" (src), "r" (stride)
// The asm accumulated -1 per match in a byte; convert that into a positive count.
333 numEq= (256 - numEq) &0xFF;
// C path: same test, (a - b + 1) masked to 16 bits is below 3 exactly when a - b is -1, 0 or 1.
336 for(y=0; y<BLOCK_SIZE-1; y++)
338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
339 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
340 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
341 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
342 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
343 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
344 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
345 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
349 /* if(abs(numEq - asmEq) > 0)
351 printf("\nasm:%d c:%d\n", asmEq, numEq);
352 for(int y=0; y<8; y++)
354 for(int x=0; x<8; x++)
356 printf("%d ", temp[x + y*stride]);
362 // for(int i=0; i<numEq/8; i++) src[i]=255;
363 return (numEq > vFlatnessThreshold) ? 1 : 0;
// Tests whether the line one stride below src and the line eight strides below src differ by
// no more than 2*QP in the columns examined. The return statement is not part of this view.
// src: top of the block; stride: byte distance between lines; QP: quantiser value (the asm path
// reads it from pQPb instead of the parameter).
366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
373 "movq (%1, %2), %%mm0 \n\t"
374 "movq (%1, %2, 8), %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "psubusb %%mm1, %%mm0 \n\t"
377 "psubusb %%mm2, %%mm1 \n\t"
378 "por %%mm1, %%mm0 \n\t" // ABS Diff
380 "movq pQPb, %%mm7 \n\t" // QP,..., QP
381 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
382 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
// Each dword becomes all ones if its four bytes were all zero; after the 16-bit shift the low
// dword equals all ones only when both dwords passed, which the second compare checks.
383 "pcmpeqd b00, %%mm0 \n\t"
384 "psrlq $16, %%mm0 \n\t"
385 "pcmpeqd bFF, %%mm0 \n\t"
386 // "movd %%mm0, (%1, %2, 4)\n\t"
387 "movd %%mm0, %0 \n\t"
389 : "r" (src), "r" (stride)
// C path: clear isOk2 as soon as one column exceeds the 2*QP limit.
397 for(x=0; x<BLOCK_SIZE; x++)
399 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
401 /* if(isOk && !isOk2 || !isOk && isOk2)
403 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
404 for(int y=0; y<9; y++)
406 for(int x=0; x<8; x++)
408 printf("%d ", src[x + y*stride]);
420 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
421 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
// Vertical low pass over lines 1..8 of the block starting at src (stride = byte distance between lines).
// The outermost taps use line 0 / line 9 only when they are close (QP based) to line 1 / line 8;
// otherwise line 1 / line 8 is reused. The asm path reads QP from pQPb and approximates the
// weights with chained PAVGB averages; the C path computes the /16 sums exactly with rounding (+8).
423 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
427 asm volatile( //"movv %0 %1 %2\n\t"
428 "movq pQPb, %%mm0 \n\t" // QP,..., QP
430 "movq (%0), %%mm6 \n\t"
431 "movq (%0, %1), %%mm5 \n\t"
432 "movq %%mm5, %%mm1 \n\t"
433 "movq %%mm6, %%mm2 \n\t"
434 "psubusb %%mm6, %%mm5 \n\t"
435 "psubusb %%mm1, %%mm2 \n\t"
436 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
437 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
438 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
440 "pand %%mm2, %%mm6 \n\t"
441 "pandn %%mm1, %%mm2 \n\t"
442 "por %%mm2, %%mm6 \n\t"// First Line to Filter
444 "movq (%0, %1, 8), %%mm5 \n\t"
445 "leal (%0, %1, 4), %%eax \n\t"
446 "leal (%0, %1, 8), %%ebx \n\t"
447 "subl %1, %%ebx \n\t"
// NOTE(review): the next instruction writes to %0, which the operand list at the end declares as a
// plain "r" operand (no "=" or "+") — confirm the compiler cannot rely on src being unchanged afterwards.
448 "addl %1, %0 \n\t" // %0 points to line 1 not 0
449 "movq (%0, %1, 8), %%mm7 \n\t"
450 "movq %%mm5, %%mm1 \n\t"
451 "movq %%mm7, %%mm2 \n\t"
452 "psubusb %%mm7, %%mm5 \n\t"
453 "psubusb %%mm1, %%mm2 \n\t"
454 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
455 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
456 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
458 "pand %%mm2, %%mm7 \n\t"
459 "pandn %%mm1, %%mm2 \n\t"
460 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
464 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
469 "movq (%0, %1), %%mm0 \n\t" // 1
470 "movq %%mm0, %%mm1 \n\t" // 1
471 PAVGB(%%mm6, %%mm0) //1 1 /2
472 PAVGB(%%mm6, %%mm0) //3 1 /4
474 "movq (%0, %1, 4), %%mm2 \n\t" // 1
475 "movq %%mm2, %%mm5 \n\t" // 1
476 PAVGB((%%eax), %%mm2) // 11 /2
477 PAVGB((%0, %1, 2), %%mm2) // 211 /4
478 "movq %%mm2, %%mm3 \n\t" // 211 /4
479 "movq (%0), %%mm4 \n\t" // 1
480 PAVGB(%%mm4, %%mm3) // 4 211 /8
481 PAVGB(%%mm0, %%mm3) //642211 /16
482 "movq %%mm3, (%0) \n\t" // X
483 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
484 "movq %%mm1, %%mm0 \n\t" // 1
485 PAVGB(%%mm6, %%mm0) //1 1 /2
486 "movq %%mm4, %%mm3 \n\t" // 1
487 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
488 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
489 PAVGB((%%eax), %%mm5) // 211 /4
490 PAVGB(%%mm5, %%mm3) // 2 2211 /8
491 PAVGB(%%mm0, %%mm3) //4242211 /16
492 "movq %%mm3, (%0,%1) \n\t" // X
493 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
494 PAVGB(%%mm4, %%mm6) //11 /2
495 "movq (%%ebx), %%mm0 \n\t" // 1
496 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
497 "movq %%mm0, %%mm3 \n\t" // 11/2
498 PAVGB(%%mm1, %%mm0) // 2 11/4
499 PAVGB(%%mm6, %%mm0) //222 11/8
500 PAVGB(%%mm2, %%mm0) //22242211/16
501 "movq (%0, %1, 2), %%mm2 \n\t" // 1
502 "movq %%mm0, (%0, %1, 2) \n\t" // X
503 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
504 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
505 PAVGB((%%ebx), %%mm0) // 11 /2
506 PAVGB(%%mm0, %%mm6) //11 11 /4
507 PAVGB(%%mm1, %%mm4) // 11 /2
508 PAVGB(%%mm2, %%mm1) // 11 /2
509 PAVGB(%%mm1, %%mm6) //1122 11 /8
510 PAVGB(%%mm5, %%mm6) //112242211 /16
511 "movq (%%eax), %%mm5 \n\t" // 1
512 "movq %%mm6, (%%eax) \n\t" // X
513 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
514 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
515 PAVGB(%%mm7, %%mm6) // 11 /2
516 PAVGB(%%mm4, %%mm6) // 11 11 /4
517 PAVGB(%%mm3, %%mm6) // 11 2211 /8
518 PAVGB(%%mm5, %%mm2) // 11 /2
519 "movq (%0, %1, 4), %%mm4 \n\t" // 1
520 PAVGB(%%mm4, %%mm2) // 112 /4
521 PAVGB(%%mm2, %%mm6) // 112242211 /16
522 "movq %%mm6, (%0, %1, 4) \n\t" // X
523 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
524 PAVGB(%%mm7, %%mm1) // 11 2 /4
525 PAVGB(%%mm4, %%mm5) // 11 /2
526 PAVGB(%%mm5, %%mm0) // 11 11 /4
527 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
528 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
529 PAVGB(%%mm0, %%mm1) // 11224222 /16
530 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
531 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
532 PAVGB((%%ebx), %%mm2) // 112 4 /8
533 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
534 PAVGB(%%mm0, %%mm6) // 1 1 /2
535 PAVGB(%%mm7, %%mm6) // 1 12 /4
536 PAVGB(%%mm2, %%mm6) // 1122424 /16
537 "movq %%mm6, (%%ebx) \n\t" // X
538 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
539 PAVGB(%%mm7, %%mm5) // 11 2 /4
540 PAVGB(%%mm7, %%mm5) // 11 6 /8
542 PAVGB(%%mm3, %%mm0) // 112 /4
543 PAVGB(%%mm0, %%mm5) // 112246 /16
544 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
548 : "r" (src), "r" (stride)
// C path: lN is the byte offset of line N relative to src.
552 const int l1= stride;
553 const int l2= stride + l1;
554 const int l3= stride + l2;
555 const int l4= stride + l3;
556 const int l5= stride + l4;
557 const int l6= stride + l5;
558 const int l7= stride + l6;
559 const int l8= stride + l7;
560 const int l9= stride + l8;
563 for(x=0; x<BLOCK_SIZE; x++)
// NOTE(review): the asm path above selects the outer line on diff <= QP, this path on diff < QP.
565 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
566 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
// sums[k] = sum of two vertically adjacent taps, with first/last substituted at the ends.
569 sums[0] = first + src[l1];
570 sums[1] = src[l1] + src[l2];
571 sums[2] = src[l2] + src[l3];
572 sums[3] = src[l3] + src[l4];
573 sums[4] = src[l4] + src[l5];
574 sums[5] = src[l5] + src[l6];
575 sums[6] = src[l6] + src[l7];
576 sums[7] = src[l7] + src[l8];
577 sums[8] = src[l8] + last;
// Each output is a weighted sum of taps with total weight 16, rounded (+8) and divided by 16.
// The pair sums were captured before the stores below, so already-written lines do not feed back
// through them; lines read directly (e.g. src[l2] on its own row) are still unmodified at that point.
579 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
580 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
581 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
582 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
583 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
584 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
585 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
586 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
595 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
596 * values are correctly clipped (MMX2)
597 * values are wraparound (C)
598 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
// Adjusts lines 3..6 of the block at src around the edge between line 4 and line 5.
// Asm path: where |l4 - l5| does not exceed the QP-derived threshold, a half-difference term is
// added to line 4 and subtracted from line 5, and an eighth-difference term is applied to lines 3 and 6.
// src: top of the block; stride: byte distance between lines; QP: quantiser (asm reads pQPb instead).
// The body of the C loop is not part of this view.
605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
611 "pxor %%mm7, %%mm7 \n\t" // 0
612 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
613 "leal (%0, %1), %%eax \n\t"
614 "leal (%%eax, %1, 4), %%ebx \n\t"
615 // 0 1 2 3 4 5 6 7 8 9
616 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
617 "movq pQPb, %%mm0 \n\t" // QP,..., QP
618 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
619 "paddusb b02, %%mm0 \n\t"
620 "psrlw $2, %%mm0 \n\t"
621 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
622 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
623 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
624 "movq (%%ebx), %%mm3 \n\t" // line 5
625 "movq %%mm2, %%mm4 \n\t" // line 4
626 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
627 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
629 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
630 "psubusb %%mm3, %%mm4 \n\t"
631 "psubusb %%mm2, %%mm3 \n\t"
632 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
// mm4 becomes 0xFF in bytes where |l4 - l5| does not exceed the threshold in mm0, else 0.
633 "psubusb %%mm0, %%mm4 \n\t"
634 "pcmpeqb %%mm7, %%mm4 \n\t"
635 "pand %%mm4, %%mm5 \n\t" // d/2
637 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
638 "paddb %%mm5, %%mm2 \n\t"
639 // "psubb %%mm6, %%mm2 \n\t"
640 "movq %%mm2, (%0,%1, 4) \n\t"
642 "movq (%%ebx), %%mm2 \n\t"
643 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
644 "psubb %%mm5, %%mm2 \n\t"
645 // "psubb %%mm6, %%mm2 \n\t"
646 "movq %%mm2, (%%ebx) \n\t"
648 "paddb %%mm6, %%mm5 \n\t"
649 "psrlw $2, %%mm5 \n\t"
650 "pand b3F, %%mm5 \n\t"
651 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
653 "movq (%%eax, %1, 2), %%mm2 \n\t"
654 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
655 "paddsb %%mm5, %%mm2 \n\t"
656 "psubb %%mm6, %%mm2 \n\t"
657 "movq %%mm2, (%%eax, %1, 2) \n\t"
659 "movq (%%ebx, %1), %%mm2 \n\t"
660 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
661 "psubsb %%mm5, %%mm2 \n\t"
662 "psubb %%mm6, %%mm2 \n\t"
663 "movq %%mm2, (%%ebx, %1) \n\t"
666 : "r" (src), "r" (stride)
// C path: lN is the byte offset of line N relative to src.
670 const int l1= stride;
671 const int l2= stride + l1;
672 const int l3= stride + l2;
673 const int l4= stride + l3;
674 const int l5= stride + l4;
675 const int l6= stride + l5;
676 // const int l7= stride + l6;
677 // const int l8= stride + l7;
678 // const int l9= stride + l8;
680 const int QP15= QP + (QP>>2); // QP * 1.25 in integer arithmetic, as in the asm path
682 for(x=0; x<BLOCK_SIZE; x++)
684 const int v = (src[x+l5] - src[x+l4]);
699 * Experimental Filter 1
700 * will not damage linear gradients
701 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
702 * can only smooth blocks at the expected locations (it can't smooth them if they have moved)
703 * MMX2 version does correct clipping, C version doesn't
// Smooths the edge between line 4 and line 5 of the block at src.
// d = max(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) measures how much the edge step exceeds the local
// gradient; fractions of d (3/8, 1/4, 1/8 in the asm path) are then moved across the edge on
// lines 4/5, 3/6 and 2/7 respectively, in the direction that reduces the step.
// src: top of the block; stride: byte distance between lines; QP: quantiser (asm reads pQPb instead).
705 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
711 "pxor %%mm7, %%mm7 \n\t" // 0
712 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
713 "leal (%0, %1), %%eax \n\t"
714 "leal (%%eax, %1, 4), %%ebx \n\t"
715 // 0 1 2 3 4 5 6 7 8 9
716 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
717 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
718 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
719 "movq %%mm1, %%mm2 \n\t" // line 4
720 "psubusb %%mm0, %%mm1 \n\t"
721 "psubusb %%mm2, %%mm0 \n\t"
722 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
723 "movq (%%ebx), %%mm3 \n\t" // line 5
724 "movq (%%ebx, %1), %%mm4 \n\t" // line 6
725 "movq %%mm3, %%mm5 \n\t" // line 5
726 "psubusb %%mm4, %%mm3 \n\t"
727 "psubusb %%mm5, %%mm4 \n\t"
728 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
729 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
730 "movq %%mm2, %%mm1 \n\t" // line 4
731 "psubusb %%mm5, %%mm2 \n\t"
732 "movq %%mm2, %%mm4 \n\t"
733 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
734 "psubusb %%mm1, %%mm5 \n\t"
735 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
736 "psubusb %%mm0, %%mm4 \n\t" // d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
737 "movq %%mm4, %%mm3 \n\t" // d
738 "psubusb pQPb, %%mm4 \n\t"
739 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
740 "psubusb b01, %%mm3 \n\t"
741 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
743 PAVGB(%%mm7, %%mm3) // d/2
744 "movq %%mm3, %%mm1 \n\t" // d/2
745 PAVGB(%%mm7, %%mm3) // d/4
746 PAVGB(%%mm1, %%mm3) // 3*d/8
// The pxor with the sign mask in mm2 complements the bytes where l4 <= l5, so that one saturating
// subtract/add moves each line in the correct direction; the second pxor undoes the complement.
748 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
749 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
750 "psubusb %%mm3, %%mm0 \n\t"
751 "pxor %%mm2, %%mm0 \n\t"
752 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
754 "movq (%%ebx), %%mm0 \n\t" // line 5
755 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
756 "paddusb %%mm3, %%mm0 \n\t"
757 "pxor %%mm2, %%mm0 \n\t"
758 "movq %%mm0, (%%ebx) \n\t" // line 5
760 PAVGB(%%mm7, %%mm1) // d/4
762 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
763 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
764 "psubusb %%mm1, %%mm0 \n\t"
765 "pxor %%mm2, %%mm0 \n\t"
766 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
768 "movq (%%ebx, %1), %%mm0 \n\t" // line 6
769 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
770 "paddusb %%mm1, %%mm0 \n\t"
771 "pxor %%mm2, %%mm0 \n\t"
772 "movq %%mm0, (%%ebx, %1) \n\t" // line 6
774 PAVGB(%%mm7, %%mm1) // d/8
776 "movq (%%eax, %1), %%mm0 \n\t" // line 2
777 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
778 "psubusb %%mm1, %%mm0 \n\t"
779 "pxor %%mm2, %%mm0 \n\t"
780 "movq %%mm0, (%%eax, %1) \n\t" // line 2
782 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
783 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
784 "paddusb %%mm1, %%mm0 \n\t"
785 "pxor %%mm2, %%mm0 \n\t"
786 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
789 : "r" (src), "r" (stride)
// C path: lN is the byte offset of line N relative to src.
794 const int l1= stride;
795 const int l2= stride + l1;
796 const int l3= stride + l2;
797 const int l4= stride + l3;
798 const int l5= stride + l4;
799 const int l6= stride + l5;
800 const int l7= stride + l6;
801 // const int l8= stride + l7;
802 // const int l9= stride + l8;
806 for(x=0; x<BLOCK_SIZE; x++)
// a, b, c are the steps l3-l4, l4-l5, l5-l6; d is how much the edge step exceeds the mean neighbour step.
808 int a= src[l3] - src[l4];
809 int b= src[l4] - src[l5];
810 int c= src[l5] - src[l6];
812 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
817 int v = d * SIGN(-b);
// NOTE(review): a second set of l1..l9 declarations and a different filter follow; presumably an
// alternative implementation disabled by a guard whose lines are not visible here — confirm.
830 const int l1= stride;
831 const int l2= stride + l1;
832 const int l3= stride + l2;
833 const int l4= stride + l3;
834 const int l5= stride + l4;
835 const int l6= stride + l5;
836 const int l7= stride + l6;
837 const int l8= stride + l7;
838 const int l9= stride + l8;
839 for(int x=0; x<BLOCK_SIZE; x++)
848 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
// Each row of weights below sums to 16.
850 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
851 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
852 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
853 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
862 * Experimental Filter 1 (Horizontal)
863 * will not damage linear gradients
864 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865 * can only smooth blocks at the expected locations (it can't smooth them if they have moved)
866 * MMX2 version does correct clipping, C version doesn't
867 * not identical with the vertical one
// Horizontal counterpart of vertX1Filter: smooths the vertical block edge inside each 8-pixel row.
// src: top-left of the block; stride: byte distance between lines; QP: quantiser (the asm path
// reads pQPb instead of the parameter).
// Asm path: a 256-entry table (lut), indexed by the signed, clipped edge step, holds the eight
// per-pixel corrections packed into one 64-bit word; that word is added with signed saturation
// to the row. The table is allocated and filled once and then kept in the static pointer.
// NOTE(review): the result of memalign is not checked in the lines visible here.
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
872 static uint64_t *lut= NULL;
876 lut= (uint64_t*)memalign(8, 256*8);
// i is treated as a signed byte (0..127 positive, 128..255 negative) and doubled.
879 int v= i < 128 ? 2*i : 2*(i-256);
881 //Simulate 112242211 9-Tap filter
882 uint64_t a= (v/16) & 0xFF;
883 uint64_t b= (v/8) & 0xFF;
884 uint64_t c= (v/4) & 0xFF;
885 uint64_t d= (3*v/8) & 0xFF;
887 //Simulate piecewise linear interpolation
888 uint64_t a= (v/16) & 0xFF;
889 uint64_t b= (v*3/16) & 0xFF;
890 uint64_t c= (v*5/16) & 0xFF;
891 uint64_t d= (7*v/16) & 0xFF;
// A..D are the byte-wise negations of a..d, used for the pixels on the other side of the edge.
892 uint64_t A= (0x100 - a)&0xFF;
893 uint64_t B= (0x100 - b)&0xFF;
894 uint64_t C= (0x100 - c)&0xFF;
// Fixed: D was computed from c, which made the two bytes next to the edge receive unequal
// corrections (-c and +d); it must negate d so the table is antisymmetric around the edge.
895 uint64_t D= (0x100 - d)&0xFF;
// Byte layout, most significant first: a b c d D C B A.
897 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
898 (D<<24) | (C<<16) | (B<<8) | (A);
899 //lut[i] = (v<<32) | (v<<24);
905 "pxor %%mm7, %%mm7 \n\t" // 0
906 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
907 "leal (%0, %1), %%eax \n\t"
908 "leal (%%eax, %1, 4), %%ebx \n\t"
910 "movq b80, %%mm6 \n\t"
911 "movd pQPb, %%mm5 \n\t" // QP
912 "movq %%mm5, %%mm4 \n\t"
913 "paddusb %%mm5, %%mm5 \n\t" // 2QP
914 "paddusb %%mm5, %%mm4 \n\t" // 3QP
915 "pxor %%mm5, %%mm5 \n\t" // 0
916 "psubb %%mm4, %%mm5 \n\t" // -3QP
917 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
918 "psllq $24, %%mm5 \n\t"
920 // 0 1 2 3 4 5 6 7 8 9
921 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
924 "movd " #a ", %%mm0 \n\t"\
925 "movd 4" #a ", %%mm1 \n\t"\
926 "punpckldq %%mm1, %%mm0 \n\t"\
927 "movq %%mm0, %%mm1 \n\t"\
928 "movq %%mm0, %%mm2 \n\t"\
929 "psrlq $8, %%mm1 \n\t"\
930 "psubusb %%mm1, %%mm2 \n\t"\
931 "psubusb %%mm0, %%mm1 \n\t"\
932 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
933 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
934 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
935 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
936 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
937 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
938 "paddb %%mm5, %%mm1 \n\t"\
939 "psubusb %%mm5, %%mm1 \n\t"\
941 "pxor %%mm2, %%mm1 \n\t"\
942 "psubb %%mm2, %%mm1 \n\t"\
943 "psrlq $24, %%mm1 \n\t"\
944 "movd %%mm1, %%ecx \n\t"\
945 "paddb %%mm6, %%mm0 \n\t"\
946 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
947 "paddb %%mm6, %%mm0 \n\t"\
948 "movq %%mm0, " #a " \n\t"\
954 HX1old((%%eax, %1, 2))
958 HX1old((%%ebx, %1, 2))
961 //FIXME add some comments, its unreadable ...
962 #define HX1b(a, c, b, d) \
963 "movd " #a ", %%mm0 \n\t"\
964 "movd 4" #a ", %%mm1 \n\t"\
965 "punpckldq %%mm1, %%mm0 \n\t"\
966 "movd " #b ", %%mm4 \n\t"\
967 "movq %%mm0, %%mm1 \n\t"\
968 "movq %%mm0, %%mm2 \n\t"\
969 "psrlq $8, %%mm1 \n\t"\
970 "movd 4" #b ", %%mm3 \n\t"\
971 "psubusb %%mm1, %%mm2 \n\t"\
972 "psubusb %%mm0, %%mm1 \n\t"\
973 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
974 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
975 "punpckldq %%mm3, %%mm4 \n\t"\
976 "movq %%mm1, %%mm3 \n\t"\
977 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
978 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
979 "paddb %%mm6, %%mm0 \n\t"\
980 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
981 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
982 "movq %%mm4, %%mm3 \n\t"\
983 "paddb %%mm5, %%mm1 \n\t"\
984 "psubusb %%mm5, %%mm1 \n\t"\
985 "psrlq $8, %%mm3 \n\t"\
987 "pxor %%mm2, %%mm1 \n\t"\
988 "psubb %%mm2, %%mm1 \n\t"\
989 "movq %%mm4, %%mm2 \n\t"\
990 "psrlq $24, %%mm1 \n\t"\
991 "psubusb %%mm3, %%mm2 \n\t"\
992 "movd %%mm1, %%ecx \n\t"\
993 "psubusb %%mm4, %%mm3 \n\t"\
994 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
995 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
996 "paddb %%mm6, %%mm0 \n\t"\
997 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
998 "movq %%mm3, %%mm1 \n\t"\
999 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
1000 "movq %%mm0, " #a " \n\t"\
1001 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
1002 "paddb %%mm6, %%mm4 \n\t"\
1003 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
1004 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
1005 "paddb %%mm5, %%mm3 \n\t"\
1006 "psubusb %%mm5, %%mm3 \n\t"\
1007 PAVGB(%%mm7, %%mm3)\
1008 "pxor %%mm2, %%mm3 \n\t"\
1009 "psubb %%mm2, %%mm3 \n\t"\
1010 "psrlq $24, %%mm3 \n\t"\
1011 "movd " #c ", %%mm0 \n\t"\
1012 "movd 4" #c ", %%mm1 \n\t"\
1013 "punpckldq %%mm1, %%mm0 \n\t"\
1014 "paddb %%mm6, %%mm0 \n\t"\
1015 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
1016 "paddb %%mm6, %%mm0 \n\t"\
1017 "movq %%mm0, " #c " \n\t"\
1018 "movd %%mm3, %%ecx \n\t"\
1019 "movd " #d ", %%mm0 \n\t"\
1020 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
1021 "movd 4" #d ", %%mm1 \n\t"\
1022 "paddb %%mm6, %%mm4 \n\t"\
1023 "punpckldq %%mm1, %%mm0 \n\t"\
1024 "movq %%mm4, " #b " \n\t"\
1025 "paddb %%mm6, %%mm0 \n\t"\
1026 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
1027 "paddb %%mm6, %%mm0 \n\t"\
1028 "movq %%mm0, " #d " \n\t"\
1030 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1031 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1035 : "r" (src), "r" (stride), "r" (lut)
1036 : "%eax", "%ebx", "%ecx"
1040 //FIXME (has little in common with the mmx2 version)
1041 for(y=0; y<BLOCK_SIZE; y++)
// C path: a, b, c are the steps p1-p2, p3-p4 (the block edge) and p5-p6 within one row.
1043 int a= src[1] - src[2];
1044 int b= src[3] - src[4];
1045 int c= src[5] - src[6];
1047 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1051 int v = d * SIGN(-b);
1067 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1071 //FIXME try pmul for *5 stuff
1074 "pxor %%mm7, %%mm7 \n\t"
1075 "leal (%0, %1), %%eax \n\t"
1076 "leal (%%eax, %1, 4), %%ebx \n\t"
1078 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1079 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1081 "movq (%0), %%mm0 \n\t"
1082 "movq %%mm0, %%mm1 \n\t"
1083 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
1084 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
1086 "movq (%%eax), %%mm2 \n\t"
1087 "movq %%mm2, %%mm3 \n\t"
1088 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
1089 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
1091 "movq (%%eax, %1), %%mm4 \n\t"
1092 "movq %%mm4, %%mm5 \n\t"
1093 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
1094 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
1096 "paddw %%mm0, %%mm0 \n\t" // 2L0
1097 "paddw %%mm1, %%mm1 \n\t" // 2H0
1098 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
1099 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
1100 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
1101 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
1103 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
1104 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
1105 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
1106 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
1108 "movq (%%eax, %1, 2), %%mm2 \n\t"
1109 "movq %%mm2, %%mm3 \n\t"
1110 "punpcklbw %%mm7, %%mm2 \n\t" // L3
1111 "punpckhbw %%mm7, %%mm3 \n\t" // H3
1113 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
1114 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
1115 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1116 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1117 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1118 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1120 "movq (%0, %1, 4), %%mm0 \n\t"
1121 "movq %%mm0, %%mm1 \n\t"
1122 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1123 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1125 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1126 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1127 "movq %%mm2, temp2 \n\t" // L3 - L4
1128 "movq %%mm3, temp3 \n\t" // H3 - H4
1129 "paddw %%mm4, %%mm4 \n\t" // 2L2
1130 "paddw %%mm5, %%mm5 \n\t" // 2H2
1131 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1132 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1134 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1135 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1136 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1137 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1139 "movq (%%ebx), %%mm2 \n\t"
1140 "movq %%mm2, %%mm3 \n\t"
1141 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1142 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1143 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1144 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1145 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1146 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1148 "movq (%%ebx, %1), %%mm6 \n\t"
1149 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1150 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1151 "movq (%%ebx, %1), %%mm6 \n\t"
1152 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1153 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1155 "paddw %%mm0, %%mm0 \n\t" // 2L4
1156 "paddw %%mm1, %%mm1 \n\t" // 2H4
1157 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
1158 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
1160 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1161 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1162 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1163 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1165 "movq (%%ebx, %1, 2), %%mm2 \n\t"
1166 "movq %%mm2, %%mm3 \n\t"
1167 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1168 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1170 "paddw %%mm2, %%mm2 \n\t" // 2L7
1171 "paddw %%mm3, %%mm3 \n\t" // 2H7
1172 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1173 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1175 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1176 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1179 "movq %%mm7, %%mm6 \n\t" // 0
1180 "psubw %%mm0, %%mm6 \n\t"
1181 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1182 "movq %%mm7, %%mm6 \n\t" // 0
1183 "psubw %%mm1, %%mm6 \n\t"
1184 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1185 "movq %%mm7, %%mm6 \n\t" // 0
1186 "psubw %%mm2, %%mm6 \n\t"
1187 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1188 "movq %%mm7, %%mm6 \n\t" // 0
1189 "psubw %%mm3, %%mm6 \n\t"
1190 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1192 "movq %%mm7, %%mm6 \n\t" // 0
1193 "pcmpgtw %%mm0, %%mm6 \n\t"
1194 "pxor %%mm6, %%mm0 \n\t"
1195 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1196 "movq %%mm7, %%mm6 \n\t" // 0
1197 "pcmpgtw %%mm1, %%mm6 \n\t"
1198 "pxor %%mm6, %%mm1 \n\t"
1199 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1200 "movq %%mm7, %%mm6 \n\t" // 0
1201 "pcmpgtw %%mm2, %%mm6 \n\t"
1202 "pxor %%mm6, %%mm2 \n\t"
1203 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1204 "movq %%mm7, %%mm6 \n\t" // 0
1205 "pcmpgtw %%mm3, %%mm6 \n\t"
1206 "pxor %%mm6, %%mm3 \n\t"
1207 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1211 "pminsw %%mm2, %%mm0 \n\t"
1212 "pminsw %%mm3, %%mm1 \n\t"
1214 "movq %%mm0, %%mm6 \n\t"
1215 "psubusw %%mm2, %%mm6 \n\t"
1216 "psubw %%mm6, %%mm0 \n\t"
1217 "movq %%mm1, %%mm6 \n\t"
1218 "psubusw %%mm3, %%mm6 \n\t"
1219 "psubw %%mm6, %%mm1 \n\t"
1222 "movq %%mm7, %%mm6 \n\t" // 0
1223 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1224 "pxor %%mm6, %%mm4 \n\t"
1225 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1226 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1227 "pxor %%mm7, %%mm5 \n\t"
1228 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1230 "movd %2, %%mm2 \n\t" // QP
1231 "punpcklwd %%mm2, %%mm2 \n\t"
1232 "punpcklwd %%mm2, %%mm2 \n\t"
1233 "psllw $3, %%mm2 \n\t" // 8QP
1234 "movq %%mm2, %%mm3 \n\t" // 8QP
1235 "pcmpgtw %%mm4, %%mm2 \n\t"
1236 "pcmpgtw %%mm5, %%mm3 \n\t"
1237 "pand %%mm2, %%mm4 \n\t"
1238 "pand %%mm3, %%mm5 \n\t"
1241 "psubusw %%mm0, %%mm4 \n\t" // hd
1242 "psubusw %%mm1, %%mm5 \n\t" // ld
1245 "movq w05, %%mm2 \n\t" // 5
1246 "pmullw %%mm2, %%mm4 \n\t"
1247 "pmullw %%mm2, %%mm5 \n\t"
1248 "movq w20, %%mm2 \n\t" // 32
1249 "paddw %%mm2, %%mm4 \n\t"
1250 "paddw %%mm2, %%mm5 \n\t"
1251 "psrlw $6, %%mm4 \n\t"
1252 "psrlw $6, %%mm5 \n\t"
1255 "movq w06, %%mm2 \n\t" // 6
1256 "paddw %%mm2, %%mm4 \n\t"
1257 "paddw %%mm2, %%mm5 \n\t"
1258 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1259 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1260 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1261 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1264 "movq temp2, %%mm0 \n\t" // L3 - L4
1265 "movq temp3, %%mm1 \n\t" // H3 - H4
1267 "pxor %%mm2, %%mm2 \n\t"
1268 "pxor %%mm3, %%mm3 \n\t"
1270 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1271 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1272 "pxor %%mm2, %%mm0 \n\t"
1273 "pxor %%mm3, %%mm1 \n\t"
1274 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1275 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1276 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1277 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1279 "pxor %%mm6, %%mm2 \n\t"
1280 "pxor %%mm7, %%mm3 \n\t"
1281 "pand %%mm2, %%mm4 \n\t"
1282 "pand %%mm3, %%mm5 \n\t"
1285 "pminsw %%mm0, %%mm4 \n\t"
1286 "pminsw %%mm1, %%mm5 \n\t"
1288 "movq %%mm4, %%mm2 \n\t"
1289 "psubusw %%mm0, %%mm2 \n\t"
1290 "psubw %%mm2, %%mm4 \n\t"
1291 "movq %%mm5, %%mm2 \n\t"
1292 "psubusw %%mm1, %%mm2 \n\t"
1293 "psubw %%mm2, %%mm5 \n\t"
1295 "pxor %%mm6, %%mm4 \n\t"
1296 "pxor %%mm7, %%mm5 \n\t"
1297 "psubw %%mm6, %%mm4 \n\t"
1298 "psubw %%mm7, %%mm5 \n\t"
1299 "packsswb %%mm5, %%mm4 \n\t"
1300 "movq (%%eax, %1, 2), %%mm0 \n\t"
1301 "paddb %%mm4, %%mm0 \n\t"
1302 "movq %%mm0, (%%eax, %1, 2) \n\t"
1303 "movq (%0, %1, 4), %%mm0 \n\t"
1304 "psubb %%mm4, %%mm0 \n\t"
1305 "movq %%mm0, (%0, %1, 4) \n\t"
1308 : "r" (src), "r" (stride), "r" (QP)
1312 const int l1= stride;
1313 const int l2= stride + l1;
1314 const int l3= stride + l2;
1315 const int l4= stride + l3;
1316 const int l5= stride + l4;
1317 const int l6= stride + l5;
1318 const int l7= stride + l6;
1319 const int l8= stride + l7;
1320 // const int l9= stride + l8;
1323 for(x=0; x<BLOCK_SIZE; x++)
1325 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1326 if(ABS(middleEnergy) < 8*QP)
1328 const int q=(src[l4] - src[l5])/2;
1329 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1330 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1332 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1336 d*= SIGN(-middleEnergy);
1357 //FIXME? |255-0| = 1
1359 * Check if the given 8x8 Block is mostly "flat"
// Flatness test for the 8x8 block at src, looking at horizontally adjacent pixels.
// C path: on every iteration of the y loop it counts the pairs (src[i], src[i+1]),
// i = 0..6, whose difference is -1, 0 or +1.
// Returns nonzero when the resulting count (numEq) exceeds hFlatnessThreshold.
// MMX path: performs a comparable count with wrapping byte arithmetic and, as a
// side effect, copies each 8-byte row of the block to tempBlock at byte offsets
// 0, 8, ..., 56.
// NOTE(review): braces, preprocessor conditionals, the asm statement wrapper, the
// output operand line and the branch instructions of this function are not
// present in this excerpt.
1361 static inline int isHorizDC(uint8_t src[], int stride)
// Operands used below: %1 = src, %2 = stride; %0 receives the reduced count.
1368 "leal (%1, %2), %%ecx \n\t"
1369 "leal (%%ecx, %2, 4), %%ebx \n\t"
1370 // 0 1 2 3 4 5 6 7 8 9
1371 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2
1372 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
1373 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
1374 "pxor %%mm0, %%mm0 \n\t"
// eax = src & 0x1F is the offset of src inside a 32-byte line; it is compared with 24
// (an 8-byte read starting above 24 would cross the line). The conditional jump that
// consumes this comparison is not visible here.
1375 "movl %1, %%eax \n\t"
1376 "andl $0x1F, %%eax \n\t"
1377 "cmpl $24, %%eax \n\t"
// leal does not modify the flags, so the comparison result above stays available
1378 "leal tempBlock, %%eax \n\t"
// Variant that loads the row as two 4-byte halves
1381 #define HDC_CHECK_AND_CPY(src, dst) \
1382 "movd " #src ", %%mm2 \n\t"\
1383 "punpckldq 4" #src ", %%mm2 \n\t" /* (%1) */\
1384 "movq %%mm2, %%mm1 \n\t"\
1385 "psrlq $8, %%mm2 \n\t" /* byte i now holds pixel i+1, top byte is 0 */\
1386 "psubb %%mm1, %%mm2 \n\t" /* p[i+1] - p[i], wrapping byte arithmetic */\
1387 "paddb %%mm7, %%mm2 \n\t" /* add the bias held in mm7 */\
1388 "pcmpgtb %%mm6, %%mm2 \n\t" /* 0xFF (-1) where biased difference > mm6 (signed) */\
1389 "paddb %%mm2, %%mm0 \n\t" /* accumulate -1 per matching pair */\
1390 "movq %%mm1," #dst "(%%eax) \n\t" /* copy the row to tempBlock + dst */
1392 HDC_CHECK_AND_CPY((%1),0)
1393 HDC_CHECK_AND_CPY((%%ecx),8)
1394 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1395 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1396 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1397 HDC_CHECK_AND_CPY((%%ebx),40)
1398 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1399 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1402 // src does not cross a 32 byte cache line so dont waste time with alignment
// Same computation as HDC_CHECK_AND_CPY, but the row is read with full 8-byte loads
1403 #define HDC_CHECK_AND_CPY2(src, dst) \
1404 "movq " #src ", %%mm2 \n\t"\
1405 "movq " #src ", %%mm1 \n\t"\
1406 "psrlq $8, %%mm2 \n\t"\
1407 "psubb %%mm1, %%mm2 \n\t"\
1408 "paddb %%mm7, %%mm2 \n\t"\
1409 "pcmpgtb %%mm6, %%mm2 \n\t"\
1410 "paddb %%mm2, %%mm0 \n\t"\
1411 "movq %%mm1," #dst "(%%eax) \n\t"
1413 HDC_CHECK_AND_CPY2((%1),0)
1414 HDC_CHECK_AND_CPY2((%%ecx),8)
1415 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1416 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1417 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1418 HDC_CHECK_AND_CPY2((%%ebx),40)
1419 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1420 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
// The top byte of mm0 compared pixel 7 against the zero shifted in by psrlq, so it is
// discarded; the remaining per-byte counters are then summed into the low byte.
1422 "psllq $8, %%mm0 \n\t" // remove dummy value
1423 "movq %%mm0, %%mm1 \n\t"
1424 "psrlw $8, %%mm0 \n\t"
1425 "paddb %%mm1, %%mm0 \n\t"
1426 "movq %%mm0, %%mm1 \n\t"
1427 "psrlq $16, %%mm0 \n\t"
1428 "paddb %%mm1, %%mm0 \n\t"
1429 "movq %%mm0, %%mm1 \n\t"
1430 "psrlq $32, %%mm0 \n\t"
1431 "paddb %%mm1, %%mm0 \n\t"
1432 "movd %%mm0, %0 \n\t"
1434 : "r" (src), "r" (stride)
1435 : "%eax", "%ebx", "%ecx"
1437 // printf("%d\n", numEq);
// The asm accumulated -1 per match in a byte counter; negate it modulo 256 and
// mask off the upper bytes, which hold partial sums.
1438 numEq= (256 - numEq) &0xFF;
// C path: (a - b + 1) lies in {0, 1, 2} exactly when a - b is -1, 0 or +1
1441 for(y=0; y<BLOCK_SIZE; y++)
1443 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1444 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1445 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1446 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1447 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1448 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1449 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1453 /* if(abs(numEq - asmEq) > 0)
1455 // printf("\nasm:%d c:%d\n", asmEq, numEq);
1456 for(int y=0; y<8; y++)
1458 for(int x=0; x<8; x++)
1460 printf("%d ", src[x + y*stride]);
1466 // printf("%d\n", numEq);
1467 return numEq > hFlatnessThreshold;
/**
 * Check that the first and the last pixel of the 8-pixel row at src differ
 * by at most 2*QP.
 * @param src    row to inspect; only src[0] and src[7] are read
 * @param stride not used by this function
 * @param QP     quantiser value; the allowed difference is 2*QP
 * @return 1 if |src[0] - src[7]| <= 2*QP, otherwise 0
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	(void)stride; /* part of the interface, intentionally unused */

	if(abs(src[0] - src[7]) > 2*QP) return 0;

	return 1;
}
// Horizontal "default mode" deblocking of the 8x8 block at dst.
// C path, per iteration of the y loop: the edge lies between dst[3] and dst[4].
// The (2,-5,5,-2) kernel is evaluated at the edge (middleEnergy) and on both
// sides of it (leftEnergy, rightEnergy); a correction d is derived only when
// |middleEnergy| < 8*QP. The statements that clamp and apply d are not present
// in this excerpt.
// MMX path: reads the rows from tempBlock (byte offsets 0, 8, ..., 56) and writes
// the filtered rows to dst; operands are %0 = dst, %1 = stride, %2 = QP.
// NOTE(review): braces, preprocessor conditionals, the asm statement wrapper and
// most HDF invocations are not present in this excerpt.
1477 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1481 "leal (%0, %1), %%ecx \n\t"
1482 "leal (%%ecx, %1, 4), %%ebx \n\t"
1483 // 0 1 2 3 4 5 6 7 8 9
1484 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1485 "pxor %%mm7, %%mm7 \n\t"
1486 "movq bm00001000, %%mm6 \n\t"
1487 "movd %2, %%mm5 \n\t" // QP
1488 "movq %%mm5, %%mm4 \n\t"
1489 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1490 "paddusb %%mm5, %%mm4 \n\t" // 3QP
1491 "psllq $24, %%mm4 \n\t"
1492 "pxor %%mm5, %%mm5 \n\t" // 0
1493 "psubb %%mm4, %%mm5 \n\t" // -3QP (in byte 3)
1494 "leal tempBlock, %%eax \n\t"
1496 //FIXME? "unroll by 2" and mix
// Two alternative definitions of HDF follow; this one relies on pshufw / pminub.
// The preprocessor conditional choosing between them is not visible here.
1498 #define HDF(src, dst) \
1499 "movq " #src "(%%eax), %%mm0 \n\t"\
1500 "movq " #src "(%%eax), %%mm1 \n\t"\
1501 "movq " #src "(%%eax), %%mm2 \n\t"\
1502 "psrlq $8, %%mm1 \n\t"\
1503 "psubusb %%mm1, %%mm2 \n\t"\
1504 "psubusb %%mm0, %%mm1 \n\t"\
1505 "por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
1506 "pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
1507 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p'5 = |p1 - p2| */\
1508 "pminub %%mm1, %%mm3 \n\t" /* p'5 = min(|p2-p1|, |p6-p5|)*/\
1509 "psrlq $16, %%mm3 \n\t" /* p'3 = min(|p2-p1|, |p6-p5|)*/\
1510 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1511 "paddb %%mm5, %%mm1 \n\t"\
1512 "psubusb %%mm5, %%mm1 \n\t"\
1513 "psrlw $2, %%mm1 \n\t"\
1514 "pxor %%mm2, %%mm1 \n\t"\
1515 "psubb %%mm2, %%mm1 \n\t"\
1516 "pand %%mm6, %%mm1 \n\t"\
1517 "psubb %%mm1, %%mm0 \n\t"\
1518 "psllq $8, %%mm1 \n\t"\
1519 "paddb %%mm1, %%mm0 \n\t"\
1520 "movd %%mm0, " #dst" \n\t"\
1521 "psrlq $32, %%mm0 \n\t"\
1522 "movd %%mm0, 4" #dst" \n\t"
// Alternative HDF using only shifts and saturating subtraction for the min step
// (note that it overwrites mm4)
1524 #define HDF(src, dst)\
1525 "movq " #src "(%%eax), %%mm0 \n\t"\
1526 "movq %%mm0, %%mm1 \n\t"\
1527 "movq %%mm0, %%mm2 \n\t"\
1528 "psrlq $8, %%mm1 \n\t"\
1529 "psubusb %%mm1, %%mm2 \n\t"\
1530 "psubusb %%mm0, %%mm1 \n\t"\
1531 "por %%mm2, %%mm1 \n\t" /* p'x = |px - p(x+1)| */\
1532 "pcmpeqb %%mm7, %%mm2 \n\t" /* p'x = sgn[px - p(x+1)] */\
1533 "movq %%mm1, %%mm3 \n\t"\
1534 "psllq $32, %%mm3 \n\t"\
1535 "movq %%mm3, %%mm4 \n\t"\
1536 "psubusb %%mm1, %%mm4 \n\t"\
1537 "psubb %%mm4, %%mm3 \n\t"\
1538 "psrlq $16, %%mm3 \n\t" /* p'3 = min(|p2-p1|, |p6-p5|)*/\
1539 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1540 "paddb %%mm5, %%mm1 \n\t"\
1541 "psubusb %%mm5, %%mm1 \n\t"\
1542 "psrlw $2, %%mm1 \n\t"\
1543 "pxor %%mm2, %%mm1 \n\t"\
1544 "psubb %%mm2, %%mm1 \n\t"\
1545 "pand %%mm6, %%mm1 \n\t"\
1546 "psubb %%mm1, %%mm0 \n\t"\
1547 "psllq $8, %%mm1 \n\t"\
1548 "paddb %%mm1, %%mm0 \n\t"\
1549 "movd %%mm0, " #dst " \n\t"\
1550 "psrlq $32, %%mm0 \n\t"\
1551 "movd %%mm0, 4" #dst " \n\t"
1556 HDF(24,(%%ecx, %1, 2))
1560 HDF(56,(%%ebx, %1, 2))
1562 : "r" (dst), "r" (stride), "r" (QP)
1563 : "%eax", "%ebx", "%ecx"
1567 for(y=0; y<BLOCK_SIZE; y++)
// middleEnergy = 2*dst[2] - 5*dst[3] + 5*dst[4] - 2*dst[5]: the same (2,-5,5,-2)
// kernel as leftEnergy / rightEnergy, centred on the dst[3]|dst[4] edge.
// (Fixed: the first term previously used dst[5] in place of dst[3].)
1569 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
1571 if(ABS(middleEnergy) < 8*QP)
// q: half of the step across the edge
1573 const int q=(dst[3] - dst[4])/2;
1574 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1575 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
// d: how much the edge activity exceeds the weaker of the two neighbouring activities
1577 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1581 d*= SIGN(-middleEnergy);
1603 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1604 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1605 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
// Horizontal low pass filter for the 8x8 block at dst.
// C path, per iteration of the y loop: filters dst[0..7] in place with the
// weights (1,1,2,2,4,2,2,1,1)/16. Beyond the left/right end it uses dst[-1] /
// dst[8] when they differ from the edge pixel by less than QP, otherwise the
// edge pixel itself (see first / last).
// MMX path: reads each row from tempBlock (byte offsets relative to eax),
// fetches the neighbouring bytes from dst and stores the result to dst;
// operands are %0 = dst, %1 = stride.
// NOTE(review): braces, preprocessor conditionals, the asm statement wrapper and
// some HLP invocations are not present in this excerpt.
1607 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1612 "leal (%0, %1), %%ecx \n\t"
1613 "leal (%%ecx, %1, 4), %%ebx \n\t"
1614 // 0 1 2 3 4 5 6 7 8 9
1615 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1616 "pxor %%mm7, %%mm7 \n\t"
1617 "leal tempBlock, %%eax \n\t"
// HLP1 / HLP2 work in place on (%0); they are not referenced by the HLP
// invocations visible further down.
1619 #define HLP1 "movq (%0), %%mm0 \n\t"\
1620 "movq %%mm0, %%mm1 \n\t"\
1621 "psllq $8, %%mm0 \n\t"\
1622 PAVGB(%%mm1, %%mm0)\
1623 "psrlw $8, %%mm0 \n\t"\
1624 "pxor %%mm1, %%mm1 \n\t"\
1625 "packuswb %%mm1, %%mm0 \n\t"\
1626 "movq %%mm0, %%mm1 \n\t"\
1627 "movq %%mm0, %%mm2 \n\t"\
1628 "psllq $32, %%mm0 \n\t"\
1629 "paddb %%mm0, %%mm1 \n\t"\
1630 "psllq $16, %%mm2 \n\t"\
1631 PAVGB(%%mm2, %%mm0)\
1632 "movq %%mm0, %%mm3 \n\t"\
1633 "pand bm11001100, %%mm0 \n\t"\
1634 "paddusb %%mm0, %%mm3 \n\t"\
1635 "psrlq $8, %%mm3 \n\t"\
1636 PAVGB(%%mm1, %%mm4)\
1637 PAVGB(%%mm3, %%mm2)\
1638 "psrlq $16, %%mm2 \n\t"\
1639 "punpcklbw %%mm2, %%mm2 \n\t"\
1640 "movq %%mm2, (%0) \n\t"\
1642 #define HLP2 "movq (%0), %%mm0 \n\t"\
1643 "movq %%mm0, %%mm1 \n\t"\
1644 "psllq $8, %%mm0 \n\t"\
1645 PAVGB(%%mm1, %%mm0)\
1646 "psrlw $8, %%mm0 \n\t"\
1647 "pxor %%mm1, %%mm1 \n\t"\
1648 "packuswb %%mm1, %%mm0 \n\t"\
1649 "movq %%mm0, %%mm2 \n\t"\
1650 "psllq $32, %%mm0 \n\t"\
1651 "psllq $16, %%mm2 \n\t"\
1652 PAVGB(%%mm2, %%mm0)\
1653 "movq %%mm0, %%mm3 \n\t"\
1654 "pand bm11001100, %%mm0 \n\t"\
1655 "paddusb %%mm0, %%mm3 \n\t"\
1656 "psrlq $8, %%mm3 \n\t"\
1657 PAVGB(%%mm3, %%mm2)\
1658 "psrlq $16, %%mm2 \n\t"\
1659 "punpcklbw %%mm2, %%mm2 \n\t"\
1660 "movq %%mm2, (%0) \n\t"\
1662 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1664 Implemented Exact 7-Tap
1677 #define HLP3(i) /* variant using pshufw; replicates the row's own edge bytes */ "movq " #i "(%%eax), %%mm0 \n\t"\
1678 "movq %%mm0, %%mm1 \n\t"\
1679 "movq %%mm0, %%mm2 \n\t"\
1680 "movq %%mm0, %%mm3 \n\t"\
1681 "movq %%mm0, %%mm4 \n\t"\
1682 "psllq $8, %%mm1 \n\t"\
1683 "psrlq $8, %%mm2 \n\t"\
1684 "pand bm00000001, %%mm3 \n\t"\
1685 "pand bm10000000, %%mm4 \n\t"\
1686 "por %%mm3, %%mm1 \n\t"\
1687 "por %%mm4, %%mm2 \n\t"\
1688 PAVGB(%%mm2, %%mm1)\
1689 PAVGB(%%mm1, %%mm0)\
1691 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1692 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1693 PAVGB(%%mm3, %%mm4)\
1694 PAVGB(%%mm4, %%mm0)\
1695 "movd %%mm0, (%0) \n\t"\
1696 "psrlq $32, %%mm0 \n\t"\
1697 "movd %%mm0, 4(%0) \n\t"
1699 #define HLP3(i) /* variant without pshufw; takes the outer bytes from -4(%0) and 8(%0) */ "movq " #i "(%%eax), %%mm0 \n\t"\
1700 "movq %%mm0, %%mm1 \n\t"\
1701 "movq %%mm0, %%mm2 \n\t"\
1702 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1703 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1704 "psllq $8, %%mm1 \n\t"\
1705 "psrlq $8, %%mm2 \n\t"\
1706 "psrlq $24, %%mm3 \n\t"\
1707 "psllq $56, %%mm4 \n\t"\
1708 "por %%mm3, %%mm1 \n\t"\
1709 "por %%mm4, %%mm2 \n\t"\
1710 PAVGB(%%mm2, %%mm1)\
1711 PAVGB(%%mm1, %%mm0)\
1713 "movq %%mm0, %%mm3 \n\t"\
1714 "movq %%mm0, %%mm4 \n\t"\
1715 "movq %%mm0, %%mm5 \n\t"\
1716 "psrlq $16, %%mm3 \n\t"\
1717 "psllq $16, %%mm4 \n\t"\
1718 "pand bm11000000, %%mm5 \n\t"\
1719 "por %%mm5, %%mm3 \n\t"\
1720 "movq %%mm0, %%mm5 \n\t"\
1721 "pand bm00000011, %%mm5 \n\t"\
1722 "por %%mm5, %%mm4 \n\t"\
1723 PAVGB(%%mm3, %%mm4)\
1724 PAVGB(%%mm4, %%mm0)\
1725 "movd %%mm0, (%0) \n\t"\
1726 "psrlq $32, %%mm0 \n\t"\
1727 "movd %%mm0, 4(%0) \n\t"
1730 /* uses the 7-Tap Filter: 1112111 */
1731 #define NEW_HLP(src, dst)\
1732 "movq " #src "(%%eax), %%mm1 \n\t"\
1733 "movq " #src "(%%eax), %%mm2 \n\t"\
1734 "psllq $8, %%mm1 \n\t"\
1735 "psrlq $8, %%mm2 \n\t"\
1736 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
1737 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
1738 "psrlq $24, %%mm3 \n\t" /* byte 0 = pixel left of the row (dst[-1]) */\
1739 "psllq $56, %%mm4 \n\t" /* byte 7 = pixel right of the row (dst[8]) */\
1740 "por %%mm3, %%mm1 \n\t" /* row shifted by one pixel: p[x-1] */\
1741 "por %%mm4, %%mm2 \n\t" /* row shifted the other way: p[x+1] */\
1742 "movq %%mm1, %%mm5 \n\t"\
1743 PAVGB(%%mm2, %%mm1) /* (p[x-1] + p[x+1])/2 */\
1744 "movq " #src "(%%eax), %%mm0 \n\t"\
1745 PAVGB(%%mm1, %%mm0) /* (p[x-1] + 2p[x] + p[x+1])/4 */\
1746 "psllq $8, %%mm5 \n\t"\
1747 "psrlq $8, %%mm2 \n\t"\
1748 "por %%mm3, %%mm5 \n\t" /* p[x-2], outer pixel replicated */\
1749 "por %%mm4, %%mm2 \n\t" /* p[x+2], outer pixel replicated */\
1750 "movq %%mm5, %%mm1 \n\t"\
1751 PAVGB(%%mm2, %%mm5) /* (p[x-2] + p[x+2])/2 */\
1752 "psllq $8, %%mm1 \n\t"\
1753 "psrlq $8, %%mm2 \n\t"\
1754 "por %%mm3, %%mm1 \n\t"\
1755 "por %%mm4, %%mm2 \n\t"\
1756 PAVGB(%%mm2, %%mm1) /* (p[x-3] + p[x+3])/2 */\
1757 PAVGB(%%mm1, %%mm5) /* average of the two outer pairs */\
1758 PAVGB(%%mm5, %%mm0) /* combine with the inner (1,2,1)/4 result */\
1759 "movd %%mm0, " #dst " \n\t"\
1760 "psrlq $32, %%mm0 \n\t"\
1761 "movd %%mm0, 4" #dst " \n\t"
1763 /* uses the 9-Tap Filter: 112242211 */
1764 #define NEW_HLP2(i)\
1765 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
1766 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
1767 "movq %%mm0, %%mm2 \n\t" /*0001000*/\
1768 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1769 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1770 "psllq $8, %%mm1 \n\t"\
1771 "psrlq $8, %%mm2 \n\t"\
1772 "psrlq $24, %%mm3 \n\t"\
1773 "psllq $56, %%mm4 \n\t"\
1774 "por %%mm3, %%mm1 \n\t" /*0010000*/\
1775 "por %%mm4, %%mm2 \n\t" /*0000100*/\
1776 "movq %%mm1, %%mm5 \n\t" /*0010000*/\
1777 PAVGB(%%mm2, %%mm1) /*0010100*/\
1778 PAVGB(%%mm1, %%mm0) /*0012100*/\
1779 "psllq $8, %%mm5 \n\t"\
1780 "psrlq $8, %%mm2 \n\t"\
1781 "por %%mm3, %%mm5 \n\t" /*0100000*/\
1782 "por %%mm4, %%mm2 \n\t" /*0000010*/\
1783 "movq %%mm5, %%mm1 \n\t" /*0100000*/\
1784 PAVGB(%%mm2, %%mm5) /*0100010*/\
1785 "psllq $8, %%mm1 \n\t"\
1786 "psrlq $8, %%mm2 \n\t"\
1787 "por %%mm3, %%mm1 \n\t" /*1000000*/\
1788 "por %%mm4, %%mm2 \n\t" /*0000001*/\
1789 "movq %%mm1, %%mm6 \n\t" /*1000000*/\
1790 PAVGB(%%mm2, %%mm1) /*1000001*/\
1791 "psllq $8, %%mm6 \n\t"\
1792 "psrlq $8, %%mm2 \n\t"\
1793 "por %%mm3, %%mm6 \n\t"/*100000000*/\
1794 "por %%mm4, %%mm2 \n\t"/*000000001*/\
1795 PAVGB(%%mm2, %%mm6) /*100000001*/\
1796 PAVGB(%%mm6, %%mm1) /*110000011*/\
1797 PAVGB(%%mm1, %%mm5) /*112000211*/\
1798 PAVGB(%%mm5, %%mm0) /*112242211*/\
1799 "movd %%mm0, (%0) \n\t"\
1800 "psrlq $32, %%mm0 \n\t"\
1801 "movd %%mm0, 4(%0) \n\t"
// The filter actually applied by the HLP invocations below is NEW_HLP
1803 #define HLP(src, dst) NEW_HLP(src, dst)
1807 HLP(16, (%%ecx, %1))
1808 HLP(24, (%%ecx, %1, 2))
1809 HLP(32, (%0, %1, 4))
1811 HLP(48, (%%ebx, %1))
1812 HLP(56, (%%ebx, %1, 2))
1815 : "r" (dst), "r" (stride)
1816 : "%eax", "%ebx", "%ecx"
1821 for(y=0; y<BLOCK_SIZE; y++)
// Values standing in for all pixels left of dst[0] / right of dst[7]
1823 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1824 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
// sums[k] = sum of two horizontally adjacent samples, taken before dst is modified
1827 sums[0] = first + dst[0];
1828 sums[1] = dst[0] + dst[1];
1829 sums[2] = dst[1] + dst[2];
1830 sums[3] = dst[2] + dst[3];
1831 sums[4] = dst[3] + dst[4];
1832 sums[5] = dst[4] + dst[5];
1833 sums[6] = dst[5] + dst[6];
1834 sums[7] = dst[6] + dst[7];
1835 sums[8] = dst[7] + last;
// Each output is the (1,1,2,2,4,2,2,1,1)/16 weighted sum around its position, rounded
// by the +8. Every dst[k] read on a right-hand side is still unmodified at that point.
1837 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1838 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1839 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1840 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1841 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1842 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1843 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1844 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
// Deringing filter for the block at src.
// MMX2/3DNow path (operands %0 = src, %1 = stride, %2 = QP):
//  1. find the minimum and maximum pixel over the rows eax .. %0+8*%1,
//  2. a = (max + min)/2 is broadcast to all 8 bytes and stored in temp0,
//  3. DERING_CORE is run once per row; it conditionally replaces the row at its
//     dst argument by a low-passed version limited to dst + pQPb2.
// C path: only fragments of it are present in this excerpt (see the end).
// NOTE(review): braces, preprocessor conditionals, the asm statement wrapper and a
// number of instructions are not present in this excerpt, so both alternatives of
// conditionally compiled sequences appear back to back.
1851 static inline void dering(uint8_t src[], int stride, int QP)
1853 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// pQPb2 = 2 * pQPb (saturating, per byte)
1855 "movq pQPb, %%mm0 \n\t"
1856 "paddusb %%mm0, %%mm0 \n\t"
1857 "movq %%mm0, pQPb2 \n\t"
1859 "leal (%0, %1), %%eax \n\t"
1860 "leal (%%eax, %1, 4), %%ebx \n\t"
1861 // 0 1 2 3 4 5 6 7 8 9
1862 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm6 = running minimum (starts at all ones), mm7 = running maximum (starts at 0)
1864 "pcmpeqb %%mm6, %%mm6 \n\t"
1865 "pxor %%mm7, %%mm7 \n\t"
1867 #define FIND_MIN_MAX(addr) /* variant using pminub / pmaxub */\
1868 "movq " #addr ", %%mm0 \n\t"\
1869 "pminub %%mm0, %%mm6 \n\t"\
1870 "pmaxub %%mm0, %%mm7 \n\t"
1872 #define FIND_MIN_MAX(addr) /* variant building min / max from saturating subtraction */\
1873 "movq " #addr ", %%mm0 \n\t"\
1874 "movq %%mm6, %%mm1 \n\t"\
1875 "psubusb %%mm0, %%mm7 \n\t"\
1876 "paddb %%mm0, %%mm7 \n\t" /* mm7 = max(mm7, mm0) */\
1877 "psubusb %%mm0, %%mm1 \n\t"\
1878 "psubb %%mm1, %%mm6 \n\t" /* mm6 = min(mm6, mm0) */
1881 FIND_MIN_MAX((%%eax))
1882 FIND_MIN_MAX((%%eax, %1))
1883 FIND_MIN_MAX((%%eax, %1, 2))
1884 FIND_MIN_MAX((%0, %1, 4))
1885 FIND_MIN_MAX((%%ebx))
1886 FIND_MIN_MAX((%%ebx, %1))
1887 FIND_MIN_MAX((%%ebx, %1, 2))
1888 FIND_MIN_MAX((%0, %1, 8))
// Reduce the 8 per-column minima in mm6 to a single minimum in the low byte
1890 "movq %%mm6, %%mm4 \n\t"
1891 "psrlq $8, %%mm6 \n\t"
1893 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1894 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1895 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1896 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1897 "pminub %%mm4, %%mm6 \n\t"
// Same reduction written with saturating subtraction in place of pminub / pshufw
1899 "movq %%mm6, %%mm1 \n\t"
1900 "psubusb %%mm4, %%mm1 \n\t"
1901 "psubb %%mm1, %%mm6 \n\t"
1902 "movq %%mm6, %%mm4 \n\t"
1903 "psrlq $16, %%mm6 \n\t"
1904 "movq %%mm6, %%mm1 \n\t"
1905 "psubusb %%mm4, %%mm1 \n\t"
1906 "psubb %%mm1, %%mm6 \n\t"
1907 "movq %%mm6, %%mm4 \n\t"
1908 "psrlq $32, %%mm6 \n\t"
1909 "movq %%mm6, %%mm1 \n\t"
1910 "psubusb %%mm4, %%mm1 \n\t"
1911 "psubb %%mm1, %%mm6 \n\t"
// Reduce the 8 per-column maxima in mm7 to a single maximum in the low byte
1915 "movq %%mm7, %%mm4 \n\t"
1916 "psrlq $8, %%mm7 \n\t"
1918 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1919 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1920 "pmaxub %%mm4, %%mm7 \n\t"
1921 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1922 "pmaxub %%mm4, %%mm7 \n\t"
// Same reduction written with saturating subtraction in place of pmaxub / pshufw
1924 "psubusb %%mm4, %%mm7 \n\t"
1925 "paddb %%mm4, %%mm7 \n\t"
1926 "movq %%mm7, %%mm4 \n\t"
1927 "psrlq $16, %%mm7 \n\t"
1928 "psubusb %%mm4, %%mm7 \n\t"
1929 "paddb %%mm4, %%mm7 \n\t"
1930 "movq %%mm7, %%mm4 \n\t"
1931 "psrlq $32, %%mm7 \n\t"
1932 "psubusb %%mm4, %%mm7 \n\t"
1933 "paddb %%mm4, %%mm7 \n\t"
1935 PAVGB(%%mm6, %%mm7) // a=(max + min)/2
// Broadcast the low byte of mm7 (a) to all 8 bytes and keep it in temp0
1936 "punpcklbw %%mm7, %%mm7 \n\t"
1937 "punpcklbw %%mm7, %%mm7 \n\t"
1938 "punpcklbw %%mm7, %%mm7 \n\t"
1939 "movq %%mm7, temp0 \n\t"
// Prologue for row 0: mm1 = horizontally low-passed row, mm0 = per-byte sum of the
// three "pixel <= a" masks (each mask byte is 0 or -1)
1941 "movq (%0), %%mm0 \n\t" // L10
1942 "movq %%mm0, %%mm1 \n\t" // L10
1943 "movq %%mm0, %%mm2 \n\t" // L10
1944 "psllq $8, %%mm1 \n\t"
1945 "psrlq $8, %%mm2 \n\t"
1946 "movd -4(%0), %%mm3 \n\t"
1947 "movd 8(%0), %%mm4 \n\t"
1948 "psrlq $24, %%mm3 \n\t"
1949 "psllq $56, %%mm4 \n\t"
1950 "por %%mm3, %%mm1 \n\t" // L00
1951 "por %%mm4, %%mm2 \n\t" // L20
1952 "movq %%mm1, %%mm3 \n\t" // L00
1953 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1954 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1955 "psubusb %%mm7, %%mm0 \n\t"
1956 "psubusb %%mm7, %%mm2 \n\t"
1957 "psubusb %%mm7, %%mm3 \n\t"
1958 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1
1959 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1
1960 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1
1961 "paddb %%mm2, %%mm0 \n\t"
1962 "paddb %%mm3, %%mm0 \n\t"
// Prologue for row 1 (at eax): mm3 = low-passed row, mm2 = mask sum
1964 "movq (%%eax), %%mm2 \n\t" // L11
1965 "movq %%mm2, %%mm3 \n\t" // L11
1966 "movq %%mm2, %%mm4 \n\t" // L11
1967 "psllq $8, %%mm3 \n\t"
1968 "psrlq $8, %%mm4 \n\t"
1969 "movd -4(%%eax), %%mm5 \n\t"
1970 "movd 8(%%eax), %%mm6 \n\t"
1971 "psrlq $24, %%mm5 \n\t"
1972 "psllq $56, %%mm6 \n\t"
1973 "por %%mm5, %%mm3 \n\t" // L01
1974 "por %%mm6, %%mm4 \n\t" // L21
1975 "movq %%mm3, %%mm5 \n\t" // L01
1976 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1977 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1978 "psubusb %%mm7, %%mm2 \n\t"
1979 "psubusb %%mm7, %%mm4 \n\t"
1980 "psubusb %%mm7, %%mm5 \n\t"
1981 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1
1982 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1
1983 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1
1984 "paddb %%mm4, %%mm2 \n\t"
1985 "paddb %%mm5, %%mm2 \n\t"
// DERING_CORE arguments:
//   dst            row that is rewritten
//   src            next row below dst, loaded and analysed in this step
//   ppsx, psx, sx  mask sums of the row two steps back, one step back, and of src
//   pplx, plx, lx  low-passed rows belonging to the same three rows
//   t0, t1         scratch registers
1987 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1988 "movq " #src ", " #sx " \n\t" /* src[0] */\
1989 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1990 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1991 "psllq $8, " #lx " \n\t"\
1992 "psrlq $8, " #t0 " \n\t"\
1993 "movd -4" #src ", " #t1 " \n\t"\
1994 "psrlq $24, " #t1 " \n\t"\
1995 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1996 "movd 8" #src ", " #t1 " \n\t"\
1997 "psllq $56, " #t1 " \n\t"\
1998 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1999 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
2000 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
2001 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
2003 "movq " #lx ", temp1 \n\t" /* save the low-passed row, lx is reused below */\
2004 "movq temp0, " #lx " \n\t" /* lx = a in every byte */\
2005 "psubusb " #lx ", " #t1 " \n\t"\
2006 "psubusb " #lx ", " #t0 " \n\t"\
2007 "psubusb " #lx ", " #sx " \n\t"\
2008 "movq b00, " #lx " \n\t"\
2009 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
2010 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
2011 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
2012 "paddb " #t1 ", " #t0 " \n\t"\
2013 "paddb " #t0 ", " #sx " \n\t" /* sx = sum of the three masks */\
2015 PAVGB(plx, pplx) /* filtered */\
2016 "movq " #dst ", " #t0 " \n\t" /* dst */\
2017 "movq " #t0 ", " #t1 " \n\t" /* dst */\
2018 "psubusb pQPb2, " #t0 " \n\t" /* dst - pQPb2, saturating */\
2019 "paddusb pQPb2, " #t1 " \n\t" /* dst + pQPb2, saturating */\
2021 PMINUB(t1, pplx, t0)\
2022 "paddb " #sx ", " #ppsx " \n\t"\
2023 "paddb " #psx ", " #ppsx " \n\t" /* mask sums of all three rows */\
2024 "#paddb b02, " #ppsx " \n\t"\
2025 "pand b08, " #ppsx " \n\t"\
2026 "pcmpeqb " #lx ", " #ppsx " \n\t" /* select mask; lx still holds b00 */\
2027 "pand " #ppsx ", " #pplx " \n\t" /* filtered value where selected */\
2028 "pandn " #dst ", " #ppsx " \n\t" /* original dst elsewhere */\
2029 "por " #pplx ", " #ppsx " \n\t"\
2030 "movq " #ppsx ", " #dst " \n\t"\
2031 "movq temp1, " #lx " \n\t" /* restore the low-passed row for the next step */
// The register roles rotate from one invocation to the next so that no data has to be moved
2048 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
2049 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
2050 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2051 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
2052 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
2053 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2054 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
2055 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
2056 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2059 : : "r" (src), "r" (stride), "r" (QP)
// C path (fragments only): track the extreme pixel values ...
2077 if(*p > max) max= *p;
2078 if(*p < min) min= *p;
// ... take their rounded mean as threshold ...
2081 avg= (min + max + 1)/2;
// ... set bit x for each pixel above the threshold ...
2090 if(*p > avg) t |= (1<<x);
// ... keep only the bits whose two neighbouring bits are set too ...
2094 t &= (t<<1) & (t>>1);
// ... and combine the masks of three consecutive rows.
2101 int t = s[y-1] & s[y] & s[y+1];
// f: 3x3 weighted sum around p with weights 1 2 1 / 2 4 2 / 1 2 1
2110 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
2111 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
2112 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
// Move *p towards f by at most 2*QP; the branch taken when f lies within that range
// is not visible here
2115 if (*p + 2*QP < f) *p= *p + 2*QP;
2116 else if(*p - 2*QP > f) *p= *p - 2*QP;
2126 * Deinterlaces the given block
2127 * will be called for every 8x8 block and can read & write from line 4-15
2128 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2129 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
// Deinterlace by linear interpolation.
// C path: rows 1, 3, 5 and 7 (relative to src) are each replaced by the truncated
// average of the row above and the row below; the even rows are left untouched.
// MMX2/3DNow path (operands %0 = src, %1 = stride): loads the even rows 0, 2, 4,
// 6, 8 and stores to the odd rows 1, 3, 5, 7.
// NOTE(review): braces, the asm statement wrapper, the averaging instructions
// between the loads and stores, and the C loop header are not present in this excerpt.
2131 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
2133 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2136 "leal (%0, %1), %%eax \n\t"
2137 "leal (%%eax, %1, 4), %%ebx \n\t"
2138 // 0 1 2 3 4 5 6 7 8 9
2139 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2141 "movq (%0), %%mm0 \n\t" // row 0
2142 "movq (%%eax, %1), %%mm1 \n\t" // row 2
2144 "movq %%mm0, (%%eax) \n\t" // -> row 1
2145 "movq (%0, %1, 4), %%mm0 \n\t" // row 4
2147 "movq %%mm1, (%%eax, %1, 2) \n\t" // -> row 3
2148 "movq (%%ebx, %1), %%mm1 \n\t" // row 6
2150 "movq %%mm0, (%%ebx) \n\t" // -> row 5
2151 "movq (%0, %1, 8), %%mm0 \n\t" // row 8
2153 "movq %%mm1, (%%ebx, %1, 2) \n\t" // -> row 7
2155 : : "r" (src), "r" (stride)
// C path: odd row = (row above + row below) >> 1
2163 src[stride] = (src[0] + src[stride*2])>>1;
2164 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
2165 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
2166 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
2173 * Deinterlaces the given block
2174 * will be called for every 8x8 block and can read & write from line 4-15
2175 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2176 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2177 * this filter will read lines 3-15 and write 7-13
2178 * no clipping in C version
// Deinterlace by cubic interpolation.
// Rows 3, 5, 7 and 9 (relative to src) are each rebuilt from the four nearest even
// rows a, b, d, e as (-a + 9b + 9d - e)/16.
// The C path stores the shifted result directly into the uint8_t destination; the
// MMX2/3DNow path saturates it to 0..255 through packuswb.
// asm operands: %0 = src, %1 = stride.
// NOTE(review): braces, the asm statement wrapper and the C loop header are not
// present in this excerpt.
2180 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
2182 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2185 "leal (%0, %1), %%eax \n\t"
2186 "leal (%%eax, %1, 4), %%ebx \n\t"
2187 "leal (%%ebx, %1, 4), %%ecx \n\t"
2188 "addl %1, %%ecx \n\t"
2189 "pxor %%mm7, %%mm7 \n\t"
2190 // 0 1 2 3 4 5 6 7 8 9 10
2191 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
// DEINT_CUBIC(a,b,c,d,e): writes row c from the rows a, b, d, e; mm7 must be zero
2193 #define DEINT_CUBIC(a,b,c,d,e)\
2194 "movq " #a ", %%mm0 \n\t"\
2195 "movq " #b ", %%mm1 \n\t"\
2196 "movq " #d ", %%mm2 \n\t"\
2197 "movq " #e ", %%mm3 \n\t"\
2198 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
2199 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
2200 "movq %%mm0, %%mm2 \n\t"\
2201 "punpcklbw %%mm7, %%mm0 \n\t"\
2202 "punpckhbw %%mm7, %%mm2 \n\t"\
2203 "movq %%mm1, %%mm3 \n\t"\
2204 "punpcklbw %%mm7, %%mm1 \n\t"\
2205 "punpckhbw %%mm7, %%mm3 \n\t"\
2206 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
2207 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
2208 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
2209 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
2210 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
2211 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
2212 "packuswb %%mm3, %%mm1 \n\t" /* back to bytes with unsigned saturation */\
2213 "movq %%mm1, " #c " \n\t"
// rows (a, b, c, d, e) = (0,2,3,4,6), (2,4,5,6,8), (4,6,7,8,10), (6,8,9,10,12)
2215 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
2216 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
2217 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
2218 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
2220 : : "r" (src), "r" (stride)
2221 : "%eax", "%ebx", "ecx"
// C path: same kernel; the result is not clamped before it is stored
2228 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
2229 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
2230 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
2231 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2238 * Deinterlaces the given block
2239 * will be called for every 8x8 block and can read & write from line 4-15
2240 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2241 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2242 * will shift the image up by 1 line (FIXME if this is a problem)
2243 * this filter will read lines 4-13 and write 4-11
// Deinterlace by vertical blending.
// C path: each of the rows 0..7 (relative to src) is replaced by
// (row[i] + 2*row[i+1] + row[i+2]) >> 2, working from the top down, so that
// every input row is read before it is overwritten.
// MMX2/3DNow path (operands %0 = src, %1 = stride): builds the same three-row
// blend from pairs of PAVGB operations while rotating mm0, mm1 and mm2.
// NOTE(review): braces, the asm statement wrapper, at least one instruction of
// the asm sequence and the C loop header are not present in this excerpt.
2245 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
2247 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2250 "leal (%0, %1), %%eax \n\t"
2251 "leal (%%eax, %1, 4), %%ebx \n\t"
2252 // 0 1 2 3 4 5 6 7 8 9
2253 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2255 "movq (%0), %%mm0 \n\t" // L0
2256 "movq (%%eax, %1), %%mm1 \n\t" // L2
2257 PAVGB(%%mm1, %%mm0) // L0+L2
2258 "movq (%%eax), %%mm2 \n\t" // L1
2260 "movq %%mm0, (%0) \n\t"
2261 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
2262 PAVGB(%%mm0, %%mm2) // L1+L3
2263 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
2264 "movq %%mm2, (%%eax) \n\t"
2265 "movq (%0, %1, 4), %%mm2 \n\t" // L4
2266 PAVGB(%%mm2, %%mm1) // L2+L4
2267 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2268 "movq %%mm1, (%%eax, %1) \n\t"
2269 "movq (%%ebx), %%mm1 \n\t" // L5
2270 PAVGB(%%mm1, %%mm0) // L3+L5
2271 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2272 "movq %%mm0, (%%eax, %1, 2) \n\t"
2273 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2274 PAVGB(%%mm0, %%mm2) // L4+L6
2275 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2276 "movq %%mm2, (%0, %1, 4) \n\t"
2277 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2278 PAVGB(%%mm2, %%mm1) // L5+L7
2279 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2280 "movq %%mm1, (%%ebx) \n\t"
2281 "movq (%0, %1, 8), %%mm1 \n\t" // L8
2282 PAVGB(%%mm1, %%mm0) // L6+L8
2283 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
2284 "movq %%mm0, (%%ebx, %1) \n\t"
2285 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
2286 PAVGB(%%mm0, %%mm2) // L7+L9
2287 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
2288 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2291 : : "r" (src), "r" (stride)
// C path: (1,2,1)/4 vertical blend; row i takes its centre weight from row i+1
2299 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2300 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2301 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2302 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2303 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2304 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2305 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2306 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2313 * Deinterlaces the given block
2314 * will be called for every 8x8 block and can read & write from line 4-15,
2315 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2316 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
// Median deinterlacer, applied in place to one block.
// src    - address of line 0 of the block
// stride - byte distance between two consecutive lines
// In both assembly paths lines 1, 3, 5 and 7 are replaced by the per-byte
// median of the line and its two vertical neighbours; lines 0..8 are read and
// the even lines are left untouched.
2318 static inline void deInterlaceMedian(uint8_t src[], int stride)
// MMX2 path. eax = line 1, ebx = line 5 (see the address table below).
2324 "leal (%0, %1), %%eax \n\t"
2325 "leal (%%eax, %1, 4), %%ebx \n\t"
2326 // 0 1 2 3 4 5 6 7 8 9
2327 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// Median of three via unsigned byte min/max:
//   median(a, b, c) = min(max(a, b), max(min(a, b), c))
// Lines 0, 1, 2 -> stored to line 1.
2329 "movq (%0), %%mm0 \n\t" //
2330 "movq (%%eax, %1), %%mm2 \n\t" //
2331 "movq (%%eax), %%mm1 \n\t" //
2332 "movq %%mm0, %%mm3 \n\t"
2333 "pmaxub %%mm1, %%mm0 \n\t" //
2334 "pminub %%mm3, %%mm1 \n\t" //
2335 "pmaxub %%mm2, %%mm1 \n\t" //
2336 "pminub %%mm1, %%mm0 \n\t"
2337 "movq %%mm0, (%%eax) \n\t"
// Lines 2, 3, 4 -> stored to line 3 (line 2 is still held in mm2).
2339 "movq (%0, %1, 4), %%mm0 \n\t" //
2340 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2341 "movq %%mm2, %%mm3 \n\t"
2342 "pmaxub %%mm1, %%mm2 \n\t" //
2343 "pminub %%mm3, %%mm1 \n\t" //
2344 "pmaxub %%mm0, %%mm1 \n\t" //
2345 "pminub %%mm1, %%mm2 \n\t"
2346 "movq %%mm2, (%%eax, %1, 2) \n\t"
// Lines 4, 5, 6 -> stored to line 5 (line 4 is still held in mm0).
2348 "movq (%%ebx), %%mm2 \n\t" //
2349 "movq (%%ebx, %1), %%mm1 \n\t" //
2350 "movq %%mm2, %%mm3 \n\t"
2351 "pmaxub %%mm0, %%mm2 \n\t" //
2352 "pminub %%mm3, %%mm0 \n\t" //
2353 "pmaxub %%mm1, %%mm0 \n\t" //
2354 "pminub %%mm0, %%mm2 \n\t"
2355 "movq %%mm2, (%%ebx) \n\t"
// Lines 6, 7, 8 -> stored to line 7 (line 6 is still held in mm1).
2357 "movq (%%ebx, %1, 2), %%mm2 \n\t" //
2358 "movq (%0, %1, 8), %%mm0 \n\t" //
2359 "movq %%mm2, %%mm3 \n\t"
2360 "pmaxub %%mm0, %%mm2 \n\t" //
2361 "pminub %%mm3, %%mm0 \n\t" //
2362 "pmaxub %%mm1, %%mm0 \n\t" //
2363 "pminub %%mm0, %%mm2 \n\t"
2364 "movq %%mm2, (%%ebx, %1, 2) \n\t"
2367 : : "r" (src), "r" (stride)
2371 #else // MMX without MMX2
2373 "leal (%0, %1), %%eax \n\t"
2374 "leal (%%eax, %1, 4), %%ebx \n\t"
2375 // 0 1 2 3 4 5 6 7 8 9
2376 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// mm7 = 0, the comparison value used by pcmpeqb inside MEDIAN.
2377 "pxor %%mm7, %%mm7 \n\t"
// MEDIAN(a,b,c): plain MMX has no byte min/max, so the median is selected
// with masks. psubusb followed by pcmpeqb against zero yields the three
// "less or equal" masks (a<=c, c<=b, b<=a); their pairwise XORs are ORed
// into c, b and a, forcing the two non-median inputs to 0xFF, and the final
// two pand operations keep the remaining value. The result is stored to b.
2379 #define MEDIAN(a,b,c)\
2380 "movq " #a ", %%mm0 \n\t"\
2381 "movq " #b ", %%mm2 \n\t"\
2382 "movq " #c ", %%mm1 \n\t"\
2383 "movq %%mm0, %%mm3 \n\t"\
2384 "movq %%mm1, %%mm4 \n\t"\
2385 "movq %%mm2, %%mm5 \n\t"\
2386 "psubusb %%mm1, %%mm3 \n\t"\
2387 "psubusb %%mm2, %%mm4 \n\t"\
2388 "psubusb %%mm0, %%mm5 \n\t"\
2389 "pcmpeqb %%mm7, %%mm3 \n\t"\
2390 "pcmpeqb %%mm7, %%mm4 \n\t"\
2391 "pcmpeqb %%mm7, %%mm5 \n\t"\
2392 "movq %%mm3, %%mm6 \n\t"\
2393 "pxor %%mm4, %%mm3 \n\t"\
2394 "pxor %%mm5, %%mm4 \n\t"\
2395 "pxor %%mm6, %%mm5 \n\t"\
2396 "por %%mm3, %%mm1 \n\t"\
2397 "por %%mm4, %%mm2 \n\t"\
2398 "por %%mm5, %%mm0 \n\t"\
2399 "pand %%mm2, %%mm0 \n\t"\
2400 "pand %%mm1, %%mm0 \n\t"\
2401 "movq %%mm0, " #b " \n\t"
// Four invocations: (0,1,2), (2,3,4), (4,5,6), (6,7,8); the middle line of
// each triple is the one that gets overwritten.
2403 MEDIAN((%0), (%%eax), (%%eax, %1))
2404 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2405 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2406 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
2408 : : "r" (src), "r" (stride)
// NOTE(review): the C path below is not a median. It is the same
// (L[i] + 2*L[i+1] + L[i+2]) >> 2 blend used by the C path of
// deInterlaceBlendLinear, it writes lines 0..7 and it reads up to line 9,
// so builds without MMX get a different filter from this function.
2418 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2419 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2420 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2421 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2422 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2423 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2424 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2425 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2433 * transposes and shifts the given 8x8 block into dst1 and dst2
// Transposes the 8x8 byte block at src and scatters the result over two
// destination buffers whose rows are 16 bytes apart.
// src       - line 0 of the source block
// srcStride - byte distance between source lines
// dst1      - receives source columns 0..4 at byte offsets 128, 144, ... 192
// dst2      - receives source columns 3..7 at byte offsets 48, 64, ... 112
// Columns 3 and 4 are therefore written to both destinations.
// Asm operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2.
2435 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2438 "leal (%0, %1), %%eax \n\t"
2439 "leal (%%eax, %1, 4), %%ebx \n\t"
2440 // 0 1 2 3 4 5 6 7 8 9
2441 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// First half: source lines 0..3. Byte interleave of line pairs, then word
// interleave of the pairs, leaves one source column (4 bytes, one per line)
// in every 32-bit half of mm0, mm3, mm2 and mm1.
2442 "movq (%0), %%mm0 \n\t" // 12345678
2443 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2444 "movq %%mm0, %%mm2 \n\t" // 12345678
2445 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2446 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2448 "movq (%%eax, %1), %%mm1 \n\t"
2449 "movq (%%eax, %1, 2), %%mm3 \n\t"
2450 "movq %%mm1, %%mm4 \n\t"
2451 "punpcklbw %%mm3, %%mm1 \n\t"
2452 "punpckhbw %%mm3, %%mm4 \n\t"
2454 "movq %%mm0, %%mm3 \n\t"
2455 "punpcklwd %%mm1, %%mm0 \n\t"
2456 "punpckhwd %%mm1, %%mm3 \n\t"
2457 "movq %%mm2, %%mm1 \n\t"
2458 "punpcklwd %%mm4, %%mm2 \n\t"
2459 "punpckhwd %%mm4, %%mm1 \n\t"
// Each movd stores one column as bytes 0..3 of a destination row; psrlq $32
// brings the next column down into the low half of the register.
2461 "movd %%mm0, 128(%2) \n\t"
2462 "psrlq $32, %%mm0 \n\t"
2463 "movd %%mm0, 144(%2) \n\t"
2464 "movd %%mm3, 160(%2) \n\t"
2465 "psrlq $32, %%mm3 \n\t"
2466 "movd %%mm3, 176(%2) \n\t"
2467 "movd %%mm3, 48(%3) \n\t"
2468 "movd %%mm2, 192(%2) \n\t"
2469 "movd %%mm2, 64(%3) \n\t"
2470 "psrlq $32, %%mm2 \n\t"
2471 "movd %%mm2, 80(%3) \n\t"
2472 "movd %%mm1, 96(%3) \n\t"
2473 "psrlq $32, %%mm1 \n\t"
2474 "movd %%mm1, 112(%3) \n\t"
// Second half: source lines 4..7, same shuffle; the stores go to the same
// rows 4 bytes further right (bytes 4..7 of each destination row).
2476 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2477 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2478 "movq %%mm0, %%mm2 \n\t" // 12345678
2479 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2480 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2482 "movq (%%ebx, %1), %%mm1 \n\t"
2483 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2484 "movq %%mm1, %%mm4 \n\t"
2485 "punpcklbw %%mm3, %%mm1 \n\t"
2486 "punpckhbw %%mm3, %%mm4 \n\t"
2488 "movq %%mm0, %%mm3 \n\t"
2489 "punpcklwd %%mm1, %%mm0 \n\t"
2490 "punpckhwd %%mm1, %%mm3 \n\t"
2491 "movq %%mm2, %%mm1 \n\t"
2492 "punpcklwd %%mm4, %%mm2 \n\t"
2493 "punpckhwd %%mm4, %%mm1 \n\t"
2495 "movd %%mm0, 132(%2) \n\t"
2496 "psrlq $32, %%mm0 \n\t"
2497 "movd %%mm0, 148(%2) \n\t"
2498 "movd %%mm3, 164(%2) \n\t"
2499 "psrlq $32, %%mm3 \n\t"
2500 "movd %%mm3, 180(%2) \n\t"
2501 "movd %%mm3, 52(%3) \n\t"
2502 "movd %%mm2, 196(%2) \n\t"
2503 "movd %%mm2, 68(%3) \n\t"
2504 "psrlq $32, %%mm2 \n\t"
2505 "movd %%mm2, 84(%3) \n\t"
2506 "movd %%mm1, 100(%3) \n\t"
2507 "psrlq $32, %%mm1 \n\t"
2508 "movd %%mm1, 116(%3) \n\t"
2511 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2517 * transposes the given 8x8 block
// Transposes an 8x8 byte block back into an image buffer.
// dst       - line 0 of the destination block
// dstStride - byte distance between destination lines
// src       - source block whose 8 rows are read 16 bytes apart
//             (offsets 0, 16, ... 112), 8 bytes each
// Destination line j receives source column j.
// Asm operands: %0 = dst, %1 = dstStride, %2 = src.
2519 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
// eax = dst line 1, ebx = dst line 5; the table gives the address of each
// destination line.
2522 "leal (%0, %1), %%eax \n\t"
2523 "leal (%%eax, %1, 4), %%ebx \n\t"
2524 // 0 1 2 3 4 5 6 7 8 9
2525 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
// First half: source rows 0..3. After the byte and word interleaves every
// 32-bit half register holds one source column restricted to these 4 rows.
2526 "movq (%2), %%mm0 \n\t" // 12345678
2527 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2528 "movq %%mm0, %%mm2 \n\t" // 12345678
2529 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2530 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2532 "movq 32(%2), %%mm1 \n\t"
2533 "movq 48(%2), %%mm3 \n\t"
2534 "movq %%mm1, %%mm4 \n\t"
2535 "punpcklbw %%mm3, %%mm1 \n\t"
2536 "punpckhbw %%mm3, %%mm4 \n\t"
2538 "movq %%mm0, %%mm3 \n\t"
2539 "punpcklwd %%mm1, %%mm0 \n\t"
2540 "punpckhwd %%mm1, %%mm3 \n\t"
2541 "movq %%mm2, %%mm1 \n\t"
2542 "punpcklwd %%mm4, %%mm2 \n\t"
2543 "punpckhwd %%mm4, %%mm1 \n\t"
// Columns 0..7 become bytes 0..3 of destination lines 0..7.
2545 "movd %%mm0, (%0) \n\t"
2546 "psrlq $32, %%mm0 \n\t"
2547 "movd %%mm0, (%%eax) \n\t"
2548 "movd %%mm3, (%%eax, %1) \n\t"
2549 "psrlq $32, %%mm3 \n\t"
2550 "movd %%mm3, (%%eax, %1, 2) \n\t"
2551 "movd %%mm2, (%0, %1, 4) \n\t"
2552 "psrlq $32, %%mm2 \n\t"
2553 "movd %%mm2, (%%ebx) \n\t"
2554 "movd %%mm1, (%%ebx, %1) \n\t"
2555 "psrlq $32, %%mm1 \n\t"
2556 "movd %%mm1, (%%ebx, %1, 2) \n\t"
// Second half: source rows 4..7 (offsets 64..112) fill bytes 4..7 of the
// same eight destination lines.
2559 "movq 64(%2), %%mm0 \n\t" // 12345678
2560 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2561 "movq %%mm0, %%mm2 \n\t" // 12345678
2562 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2563 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2565 "movq 96(%2), %%mm1 \n\t"
2566 "movq 112(%2), %%mm3 \n\t"
2567 "movq %%mm1, %%mm4 \n\t"
2568 "punpcklbw %%mm3, %%mm1 \n\t"
2569 "punpckhbw %%mm3, %%mm4 \n\t"
2571 "movq %%mm0, %%mm3 \n\t"
2572 "punpcklwd %%mm1, %%mm0 \n\t"
2573 "punpckhwd %%mm1, %%mm3 \n\t"
2574 "movq %%mm2, %%mm1 \n\t"
2575 "punpcklwd %%mm4, %%mm2 \n\t"
2576 "punpckhwd %%mm4, %%mm1 \n\t"
2578 "movd %%mm0, 4(%0) \n\t"
2579 "psrlq $32, %%mm0 \n\t"
2580 "movd %%mm0, 4(%%eax) \n\t"
2581 "movd %%mm3, 4(%%eax, %1) \n\t"
2582 "psrlq $32, %%mm3 \n\t"
2583 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2584 "movd %%mm2, 4(%0, %1, 4) \n\t"
2585 "psrlq $32, %%mm2 \n\t"
2586 "movd %%mm2, 4(%%ebx) \n\t"
2587 "movd %%mm1, 4(%%ebx, %1) \n\t"
2588 "psrlq $32, %%mm1 \n\t"
2589 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
2591 :: "r" (dst), "r" (dstStride), "r" (src)
2597 #ifdef HAVE_ODIVX_POSTPROCESS
2598 #include "../opendivx/postprocess.h"
// Forward declaration of the per-plane filter routine; its definition appears
// further down in this file. The callers in this file pass isColor = 0 for
// plane 0 and 1 / 2 for planes 1 and 2.
2602 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2603 QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2605 /* -pp Command line Help
2606 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2608 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2611 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2613 -pp vb:a,hb:a,lb -pp de,-vb
2616 short long name short long option Description
2617 * * a autoq cpu power dependent enabler
2618 c chrom chrominance filtering enabled
2619 y nochrom chrominance filtering disabled
2620 hb hdeblock horizontal deblocking filter
2621 vb vdeblock vertical deblocking filter
2623 h1 x1hdeblock Experimental horizontal deblock filter 1
2624 v1 x1vdeblock Experimental vertical deblock filter 1
2625 dr dering not implemented yet
2626 al autolevels automatic brightness / contrast fixer
2627 f fullyrange stretch luminance range to (0..255)
2628 lb linblenddeint linear blend deinterlacer
2629 li linipoldeint linear interpolating deinterlacer
2630 ci cubicipoldeint cubic interpolating deinterlacer
2631 md mediandeint median deinterlacer
2632 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2633 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2637 * returns a PPMode struct which will have a nonzero error variable if an error occurred
2638 * name is the string after "-pp" on the command line
2639 * quality is a number from 0 to GET_PP_QUALITY_MAX
// Parses a "-pp" filter string into a PPMode.
// name    - comma-separated list of filters; every entry may carry
//           ':'-separated options and its filter name may start with '-'
// quality - value q takes when an entry carries the "autoq" / "a" option;
//           q otherwise starts at GET_PP_QUALITY_MAX
// Returns the PPMode by value. ppMode.error is incremented once for each
// filter name that was not recognised and by the number of options that
// nothing consumed.
2641 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2643 char temp[GET_MODE_BUFFER_SIZE];
2645 char *filterDelimiters= ",";
2646 char *optionDelimiters= ":";
2647 struct PPMode ppMode= {0,0,0,0,0,0};
// Work on a private copy: strtok() writes terminators into the string it
// scans. NOTE(review): strncpy() leaves temp unterminated when name holds
// GET_MODE_BUFFER_SIZE or more characters - confirm temp is terminated
// before it is tokenised.
2650 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2654 int q= GET_PP_QUALITY_MAX;
2657 char *options[OPTIONS_ARRAY_SIZE];
2660 int numOfUnknownOptions=0;
2661 int enable=1; //does the user want us to enabled or disabled the filter
// strtok() keeps a single hidden cursor and that cursor is reused for the
// option scan below, so the filter scan is restarted from p each time and p
// is advanced by hand past the token and its delimiter.
2663 filterToken= strtok(p, filterDelimiters);
2664 if(filterToken == NULL) break;
2665 p+= strlen(filterToken) + 1;
2666 filterName= strtok(filterToken, optionDelimiters);
// NOTE(review): unconditional trace output on stdout (also inside the option
// loop below).
2667 printf("%s::%s\n", filterToken, filterName);
// A leading '-' marks the filter as one to switch off - presumably this
// clears enable; the body of the branch is not shown next to this test.
2669 if(*filterName == '-')
2674 for(;;){ //for all options
2675 option= strtok(NULL, optionDelimiters);
2676 if(option == NULL) break;
2678 printf("%s\n", option);
// Options understood here: autoq/a ties q to the caller's quality,
// nochrom/y and chrom/c force chroma filtering off / on.
2679 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2680 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2681 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
// Anything else is remembered in options[], which is kept NULL-terminated;
// the size test below stops collecting before the array can overflow.
2684 options[numOfUnknownOptions] = option;
2685 numOfUnknownOptions++;
2686 options[numOfUnknownOptions] = NULL;
2688 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
// replaceTable stores (name, replacement) string pairs and ends with a NULL
// name. On a match the replacement text is spliced into temp at p, so that
// the following passes of the filter loop parse it like user input.
2691 /* replace stuff from the replace Table */
2692 for(i=0; replaceTable[2*i]!=NULL; i++)
2694 if(!strcmp(replaceTable[2*i], filterName))
2696 int newlen= strlen(replaceTable[2*i + 1]);
2700 if(p==NULL) p= temp, *p=0; //last filter
2701 else p--, *p=','; //not last filter
// NOTE(review): both pointers are cast to int before the subtraction; that
// truncates on targets where pointers are wider than int. The value is
// compared against GET_MODE_BUFFER_SIZE below, before the splice is done.
2704 spaceLeft= (int)p - (int)temp + plen;
2705 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
// memmove() because source and destination overlap inside temp; plen+1
// carries the terminating NUL along.
2710 memmove(p + newlen, p, plen+1);
2711 memcpy(p, replaceTable[2*i + 1], newlen);
// Look the name up in filters[] (matched against both longName and
// shortName). The filter's mask is first cleared from both modes, then set
// again only if the filter is enabled and q reaches the filter's minimum
// quality for luma respectively chroma.
2716 for(i=0; filters[i].shortName!=NULL; i++)
2718 if( !strcmp(filters[i].longName, filterName)
2719 || !strcmp(filters[i].shortName, filterName))
2721 ppMode.lumMode &= ~filters[i].mask;
2722 ppMode.chromMode &= ~filters[i].mask;
2725 if(!enable) break; // user wants to disable it
2727 if(q >= filters[i].minLumQuality)
2728 ppMode.lumMode|= filters[i].mask;
// chrom == -1 means "not specified": fall back to the filter's chromDefault.
2729 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2730 if(q >= filters[i].minChromQuality)
2731 ppMode.chromMode|= filters[i].mask;
// LEVEL_FIX: the allowed luma range defaults to 16..234; the option
// "fullyrange" / "f" widens it to 0..255 and, being consumed here, is taken
// off the unknown-option count again.
2733 if(filters[i].mask == LEVEL_FIX)
2736 ppMode.minAllowedY= 16;
2737 ppMode.maxAllowedY= 234;
2738 for(o=0; options[o]!=NULL; o++)
2739 if( !strcmp(options[o],"fullyrange")
2740 ||!strcmp(options[o],"f"))
2742 ppMode.minAllowedY= 0;
2743 ppMode.maxAllowedY= 255;
2744 numOfUnknownOptions--;
2749 if(!filterNameOk) ppMode.error++;
2750 ppMode.error += numOfUnknownOptions;
// Mirror the deblock / dering selections into oldMode using the PP_* flags
// of the OpenDivX post-processor.
2753 #ifdef HAVE_ODIVX_POSTPROCESS
2754 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2755 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2756 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2757 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2758 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2759 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
// Post-processes a three-plane picture with a single mode flag word.
// src / dst                   - arrays of plane pointers; planes 0, 1, 2 are used
// src_stride / dst_stride     - line strides passed on to postProcess()
// horizontal_size / vertical_size - size of plane 0; halved for planes 1 and 2
// QP_store / QP_stride        - quantiser table handed through to postProcess()
2768 void postprocess(unsigned char * src[], int src_stride,
2769 unsigned char * dst[], int dst_stride,
2770 int horizontal_size, int vertical_size,
2771 QP_STORE_T *QP_store, int QP_stride,
// NOTE(review): the next four lines parse a fixed filter string, print the
// result and forward to postprocess2(); this reads like a debugging aid -
// confirm whether it is live code or sits inside a disabled region.
2777 struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2780 printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2781 postprocess2(src, src_stride, dst, dst_stride,
2782 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
// Zero-initialised (static) stand-in quantiser table; presumably substituted
// when the caller supplies no QP_store - the guarding condition is not shown
// next to the assignment.
2786 static QP_STORE_T zeroArray[2048/8];
2789 QP_store= zeroArray;
2793 #ifdef HAVE_ODIVX_POSTPROCESS
2794 // Note: I could make this shit outside of this file, but it would mean one
2795 // more function call...
2797 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
// Plane 0 at full size.
2802 postProcess(src[0], src_stride, dst[0], dst_stride,
2803 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
// Planes 1 and 2 are processed at half width and half height.
2805 horizontal_size >>= 1;
2806 vertical_size >>= 1;
// Moves bits 4..7 of mode down to bits 0..3 and keeps bits 8 and above, so
// the second and third plane are filtered with the flags stored in the upper
// nibble of the low byte.
2809 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2810 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
2811 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
2815 postProcess(src[1], src_stride, dst[1], dst_stride,
2816 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2817 postProcess(src[2], src_stride, dst[2], dst_stride,
2818 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
// NOTE(review): the byte count is src_stride * horizontal_size; copying a
// whole plane would normally need stride * number of lines (vertical_size),
// and dst_stride is not taken into account - confirm before relying on this
// copy path.
2822 memcpy(dst[1], src[1], src_stride*horizontal_size);
2823 memcpy(dst[2], src[2], src_stride*horizontal_size);
// Same job as postprocess(), but driven by a parsed PPMode instead of a raw
// flag word: plane 0 is filtered with mode->lumMode, planes 1 and 2 with
// mode->chromMode at half width and half height.
2827 void postprocess2(unsigned char * src[], int src_stride,
2828 unsigned char * dst[], int dst_stride,
2829 int horizontal_size, int vertical_size,
2830 QP_STORE_T *QP_store, int QP_stride,
2831 struct PPMode *mode)
// Zero-initialised (static) stand-in quantiser table; presumably substituted
// when the caller supplies no QP_store - the guarding condition is not shown
// next to the assignment.
2834 static QP_STORE_T zeroArray[2048/8];
2837 QP_store= zeroArray;
2841 #ifdef HAVE_ODIVX_POSTPROCESS
2842 // Note: I could make this shit outside of this file, but it would mean one
2843 // more function call...
2845 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
// Plane 0 at full size.
2851 postProcess(src[0], src_stride, dst[0], dst_stride,
2852 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
// Planes 1 and 2 at half size in both directions.
2854 horizontal_size >>= 1;
2855 vertical_size >>= 1;
2859 postProcess(src[1], src_stride, dst[1], dst_stride,
2860 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2861 postProcess(src[2], src_stride, dst[2], dst_stride,
2862 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2867 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
// Maps a quality level to a set of filter flags by table lookup.
// quality - index into a table of 1+GET_PP_QUALITY_MAX entries
// Returns modes[quality], or odivx_modes[quality] when use_old_pp is set in a
// HAVE_ODIVX_POSTPROCESS build.
// NOTE(review): quality is used directly as an array index and no range check
// is visible - callers presumably pass 0..GET_PP_QUALITY_MAX; confirm.
2870 int getPpModeForQuality(int quality){
2871 int modes[1+GET_PP_QUALITY_MAX]= {
// Two alternative orderings follow; each later entry adds one more filter
// flag to the previous one. Presumably only one of the two lists is compiled
// in - the selecting directive is not shown here.
2874 // horizontal filters first
2876 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2877 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2878 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2879 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2880 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2882 // vertical filters first
2884 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2885 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2886 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2887 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2888 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
// Equivalent table expressed in the PP_* flags of the OpenDivX post-processor.
2892 #ifdef HAVE_ODIVX_POSTPROCESS
2893 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2896 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2897 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2898 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2899 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2900 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2902 if(use_old_pp) return odivx_modes[quality];
2904 return modes[quality];
2908 * Copies a block from src to dst and fixes the blacklevel
2909 * numLines must be a multiple of 4
2910 * levelFix == 0 -> don't touch the brightness & contrast
// Copies numLines lines of a block from src to dst, optionally rescaling the
// sample values on the way.
// dst / dstStride - destination and its line stride
// src / srcStride - source and its line stride
// numLines        - number of lines to copy
// levelFix        - selects the scaled copy (SCALED_CPY) over the plain one
// In the assembly, judging by how the operands are used, %0 is the address
// read from, %1 the address written to, and %2 / %3 their line strides.
2912 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2913 int numLines, int levelFix)
// Scaled copy setup: eax = 2*%2 and ebx = 2*%3 (the copy advances two lines
// at a time), mm2 = packedYOffset, mm3 = packedYScale, mm4 = 0 for unpacking
// bytes to words.
2922 "leal (%2,%2), %%eax \n\t"
2923 "leal (%3,%3), %%ebx \n\t"
2924 "movq packedYOffset, %%mm2 \n\t"
2925 "movq packedYScale, %%mm3 \n\t"
2926 "pxor %%mm4, %%mm4 \n\t"
// SCALED_CPY handles two lines of 8 bytes: each byte is widened to 16 bits,
// the offset is subtracted, the value is shifted left by 6 and multiplied
// with the scale keeping the high 16 bits of the product (pmulhw), and the
// results are packed back to bytes with unsigned saturation (packuswb).
// The source pointer %0 is advanced inside the macro.
2928 #define SCALED_CPY \
2929 "movq (%0), %%mm0 \n\t"\
2930 "movq (%0), %%mm5 \n\t"\
2931 "punpcklbw %%mm4, %%mm0 \n\t"\
2932 "punpckhbw %%mm4, %%mm5 \n\t"\
2933 "psubw %%mm2, %%mm0 \n\t"\
2934 "psubw %%mm2, %%mm5 \n\t"\
2935 "movq (%0,%2), %%mm1 \n\t"\
2936 "psllw $6, %%mm0 \n\t"\
2937 "psllw $6, %%mm5 \n\t"\
2938 "pmulhw %%mm3, %%mm0 \n\t"\
2939 "movq (%0,%2), %%mm6 \n\t"\
2940 "pmulhw %%mm3, %%mm5 \n\t"\
2941 "punpcklbw %%mm4, %%mm1 \n\t"\
2942 "punpckhbw %%mm4, %%mm6 \n\t"\
2943 "psubw %%mm2, %%mm1 \n\t"\
2944 "psubw %%mm2, %%mm6 \n\t"\
2945 "psllw $6, %%mm1 \n\t"\
2946 "psllw $6, %%mm6 \n\t"\
2947 "pmulhw %%mm3, %%mm1 \n\t"\
2948 "pmulhw %%mm3, %%mm6 \n\t"\
2949 "addl %%eax, %0 \n\t"\
2950 "packuswb %%mm5, %%mm0 \n\t"\
2951 "packuswb %%mm6, %%mm1 \n\t"\
2952 "movq %%mm0, (%1) \n\t"\
2953 "movq %%mm1, (%1, %3) \n\t"\
2956 "addl %%ebx, %1 \n\t"
2958 "addl %%ebx, %1 \n\t"
2960 "addl %%ebx, %1 \n\t"
// Non-assembly path: straight copy of BLOCK_SIZE bytes per line; no level
// correction is applied by this loop.
2970 for(i=0; i<numLines; i++)
2971 memcpy( &(dst[dstStride*i]),
2972 &(src[srcStride*i]), BLOCK_SIZE);
// Plain copy, assembly version. Operand %4 is first stored into temp0.
2979 "movl %4, %%eax \n\t"
2980 "movl %%eax, temp0\n\t"
2983 "leal (%2,%2), %%eax \n\t"
2984 "leal (%3,%3), %%ebx \n\t"
2985 "movq packedYOffset, %%mm2 \n\t"
2986 "movq packedYScale, %%mm3 \n\t"
// SIMPLE_CPY moves two lines of 8 bytes unchanged; unlike SCALED_CPY it does
// not advance %0 itself, which is why both pointers are stepped explicitly
// after it below.
2988 #define SIMPLE_CPY \
2989 "movq (%0), %%mm0 \n\t"\
2990 "movq (%0,%2), %%mm1 \n\t"\
2991 "movq %%mm0, (%1) \n\t"\
2992 "movq %%mm1, (%1, %3) \n\t"\
2996 "addl %%eax, %0 \n\t"
2997 "addl %%ebx, %1 \n\t"
2999 "addl %%eax, %0 \n\t"
3000 "addl %%ebx, %1 \n\t"
// Non-assembly path of the plain copy: BLOCK_SIZE bytes per line.
3014 for(i=0; i<numLines; i++)
3015 memcpy( &(dst[dstStride*i]),
3016 &(src[srcStride*i]), BLOCK_SIZE);
3023 * Filters array of bytes (Y or U or V values)
3025 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3026 QP_STORE_T QPs[], int QPStride, int isColor, int mode)
3029 /* we need 64bit here otherwise we´ll going to have a problem
3030 after watching a black picture for 5 hours*/
3031 static uint64_t *yHistogram= NULL;
3032 int black=0, white=255; // blackest black and whitest white in the picture
3033 int QPCorrecture= 256;
3035 /* Temporary buffers for handling the last row(s) */
3036 static uint8_t *tempDst= NULL;
3037 static uint8_t *tempSrc= NULL;
3039 /* Temporary buffers for handling the last block */
3040 static uint8_t *tempDstBlock= NULL;
3041 static uint8_t *tempSrcBlock= NULL;
3043 #ifdef PP_FUNNY_STRIDE
3044 uint8_t *dstBlockPtrBackup;
3045 uint8_t *srcBlockPtrBackup;
3049 long long T0, T1, diffTime=0;
3052 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3059 tempDst= (uint8_t*)memalign(8, 1024*24);
3060 tempSrc= (uint8_t*)memalign(8, 1024*24);
3061 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3062 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3068 yHistogram= (uint64_t*)malloc(8*256);
3069 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3071 if(mode & FULL_Y_RANGE)
3082 static int framenum= -1;
3083 uint64_t maxClipped;
3088 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3090 for(i=0; i<256; i++)
3092 sum+= yHistogram[i];
3093 // printf("%d ", yHistogram[i]);
3097 /* we allways get a completly black picture first */
3098 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3101 for(black=255; black>0; black--)
3103 if(clipped < maxClipped) break;
3104 clipped-= yHistogram[black];
3108 for(white=0; white<256; white++)
3110 if(clipped < maxClipped) break;
3111 clipped-= yHistogram[white];
3114 packedYOffset= (black - minAllowedY) & 0xFFFF;
3115 packedYOffset|= packedYOffset<<32;
3116 packedYOffset|= packedYOffset<<16;
3118 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3120 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3121 packedYScale|= packedYScale<<32;
3122 packedYScale|= packedYScale<<16;
3126 packedYScale= 0x0100010001000100LL;
3130 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
3131 else QPCorrecture= 256;
3133 /* copy & deinterlace first row of blocks */
3136 //1% speedup if these are here instead of the inner loop
3137 uint8_t *srcBlock= &(src[y*srcStride]);
3138 uint8_t *dstBlock= &(dst[y*dstStride]);
3140 dstBlock= tempDst + dstStride;
3142 // From this point on it is guranteed that we can read and write 16 lines downward
3143 // finish 1 block before the next otherwise we´ll might have a problem
3144 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3145 for(x=0; x<width; x+=BLOCK_SIZE)
3150 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3151 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3152 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3153 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3156 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3157 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3158 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3159 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3163 "movl %4, %%eax \n\t"
3164 "shrl $2, %%eax \n\t"
3165 "andl $6, %%eax \n\t"
3166 "addl $8, %%eax \n\t"
3167 "movl %%eax, %%ebx \n\t"
3168 "imul %1, %%eax \n\t"
3169 "imul %3, %%ebx \n\t"
3170 "prefetchnta 32(%%eax, %0) \n\t"
3171 "prefetcht0 32(%%ebx, %2) \n\t"
3172 "addl %1, %%eax \n\t"
3173 "addl %3, %%ebx \n\t"
3174 "prefetchnta 32(%%eax, %0) \n\t"
3175 "prefetcht0 32(%%ebx, %2) \n\t"
3176 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3181 #elif defined(HAVE_3DNOW)
3182 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3183 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3184 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3185 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3186 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3190 blockCopy(dstBlock + dstStride*8, dstStride,
3191 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3193 if(mode & LINEAR_IPOL_DEINT_FILTER)
3194 deInterlaceInterpolateLinear(dstBlock, dstStride);
3195 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3196 deInterlaceBlendLinear(dstBlock, dstStride);
3197 else if(mode & MEDIAN_DEINT_FILTER)
3198 deInterlaceMedian(dstBlock, dstStride);
3199 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3200 deInterlaceInterpolateCubic(dstBlock, dstStride);
3201 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3202 deInterlaceBlendCubic(dstBlock, dstStride);
3207 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride );
3210 for(y=0; y<height; y+=BLOCK_SIZE)
3212 //1% speedup if these are here instead of the inner loop
3213 uint8_t *srcBlock= &(src[y*srcStride]);
3214 uint8_t *dstBlock= &(dst[y*dstStride]);
3216 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3217 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3218 int QPFrac= QPDelta;
3219 uint8_t *tempBlock1= tempBlocks;
3220 uint8_t *tempBlock2= tempBlocks + 8;
3222 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3223 if not than use a temporary buffer */
3226 /* copy from line 8 to 15 of src, these will be copied with
3227 blockcopy to dst later */
3228 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8,
3229 srcStride*MAX(height-y-8, 0) );
3231 /* duplicate last line to fill the void upto line 15 */
3235 for(i=height-y; i<=15; i++)
3236 memcpy(tempSrc + srcStride*i,
3237 src + srcStride*(height-1), srcStride);
3240 /* copy up to 9 lines of dst */
3241 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) );
3242 dstBlock= tempDst + dstStride;
3246 // From this point on it is guranteed that we can read and write 16 lines downward
3247 // finish 1 block before the next otherwise we´ll might have a problem
3248 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3249 for(x=0; x<width; x+=BLOCK_SIZE)
3251 const int stride= dstStride;
3257 "sbbl %%eax, %%eax \n\t"
3258 "shll $2, %%eax \n\t"
3259 "subl %%eax, %0 \n\t"
3260 : "+r" (QPptr), "+m" (QPFrac)
3266 QPs[(y>>3)*QPStride + (x>>3)]:
3267 QPs[(y>>4)*QPStride + (x>>4)];
3271 QP= (QP* QPCorrecture)>>8;
3272 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3276 "movd %0, %%mm7 \n\t"
3277 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3278 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3279 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3280 "movq %%mm7, pQPb \n\t"
3291 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3292 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3293 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3294 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3297 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3298 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3299 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3300 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3304 "movl %4, %%eax \n\t"
3305 "shrl $2, %%eax \n\t"
3306 "andl $6, %%eax \n\t"
3307 "addl $8, %%eax \n\t"
3308 "movl %%eax, %%ebx \n\t"
3309 "imul %1, %%eax \n\t"
3310 "imul %3, %%ebx \n\t"
3311 "prefetchnta 32(%%eax, %0) \n\t"
3312 "prefetcht0 32(%%ebx, %2) \n\t"
3313 "addl %1, %%eax \n\t"
3314 "addl %3, %%ebx \n\t"
3315 "prefetchnta 32(%%eax, %0) \n\t"
3316 "prefetcht0 32(%%ebx, %2) \n\t"
3317 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3322 #elif defined(HAVE_3DNOW)
3323 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3324 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3325 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3326 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3327 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3331 #ifdef PP_FUNNY_STRIDE
3332 //can we mess with a 8x16 block, if not use a temp buffer, yes again
3336 dstBlockPtrBackup= dstBlock;
3337 srcBlockPtrBackup= srcBlock;
3339 for(i=0;i<BLOCK_SIZE*2; i++)
3341 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3342 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3345 dstBlock= tempDstBlock;
3346 srcBlock= tempSrcBlock;
3350 blockCopy(dstBlock + dstStride*8, dstStride,
3351 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3353 if(mode & LINEAR_IPOL_DEINT_FILTER)
3354 deInterlaceInterpolateLinear(dstBlock, dstStride);
3355 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3356 deInterlaceBlendLinear(dstBlock, dstStride);
3357 else if(mode & MEDIAN_DEINT_FILTER)
3358 deInterlaceMedian(dstBlock, dstStride);
3359 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3360 deInterlaceInterpolateCubic(dstBlock, dstStride);
3361 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3362 deInterlaceBlendCubic(dstBlock, dstStride);
3365 /* only deblock if we have 2 blocks */
3373 if(mode & V_RK1_FILTER)
3374 vertRK1Filter(dstBlock, stride, QP);
3375 else if(mode & V_X1_FILTER)
3376 vertX1Filter(dstBlock, stride, QP);
3377 else if(mode & V_DEBLOCK)
3379 if( isVertDC(dstBlock, stride))
3381 if(isVertMinMaxOk(dstBlock, stride, QP))
3382 doVertLowPass(dstBlock, stride, QP);
3385 doVertDefFilter(dstBlock, stride, QP);
3395 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3397 /* check if we have a previous block to deblock it with dstBlock */
3404 if(mode & H_RK1_FILTER)
3405 vertRK1Filter(tempBlock1, 16, QP);
3406 else if(mode & H_X1_FILTER)
3407 vertX1Filter(tempBlock1, 16, QP);
3408 else if(mode & H_DEBLOCK)
3410 if( isVertDC(tempBlock1, 16))
3412 if(isVertMinMaxOk(tempBlock1, 16, QP))
3413 doVertLowPass(tempBlock1, 16, QP);
3416 doVertDefFilter(tempBlock1, 16, QP);
3419 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3422 if(mode & H_X1_FILTER)
3423 horizX1Filter(dstBlock-4, stride, QP);
3424 else if(mode & H_DEBLOCK)
3426 if( isHorizDC(dstBlock-4, stride))
3428 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3429 doHorizLowPass(dstBlock-4, stride, QP);
3432 doHorizDefFilter(dstBlock-4, stride, QP);
3442 //FIXME filter first line
3443 if(y>0) dering(dstBlock - stride - 8, stride, QP);
3446 else if(mode & DERING)
3448 //FIXME y+15 is required cuz of the tempBuffer thing -> bottom right block isnt filtered
3449 if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3453 #ifdef PP_FUNNY_STRIDE
3454 /* did we use a tmp-block buffer */
3458 dstBlock= dstBlockPtrBackup;
3459 srcBlock= srcBlockPtrBackup;
3461 for(i=0;i<BLOCK_SIZE*2; i++)
3463 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3472 tmpXchg= tempBlock1;
3473 tempBlock1= tempBlock2;
3474 tempBlock2 = tmpXchg;
3478 /* did we use a tmp buffer for the last lines*/
3481 uint8_t *dstBlock= &(dst[y*dstStride]);
3482 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3486 asm volatile("femms");
3487 #elif defined (HAVE_MMX)
3488 asm volatile("emms");
3492 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3493 sumTime= rdtsc() - sumTime;
3495 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
3496 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3497 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)