]> git.sesse.net Git - ffmpeg/blob - postproc/postprocess_template.c
faster dering
[ffmpeg] / postproc / postprocess_template.c
1 /*
2     Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20                         C       MMX     MMX2    3DNow
21 isVertDC                Ec      Ec
22 isVertMinMaxOk          Ec      Ec
23 doVertLowPass           E               e       e
24 doVertDefFilter         Ec      Ec      e       e
25 isHorizDC               Ec      Ec
26 isHorizMinMaxOk         a       E
27 doHorizLowPass          E               e       e
28 doHorizDefFilter        Ec      Ec      e       e
29 deRing                  E               e       e*
30 Vertical RKAlgo1        E               a       a
31 Horizontal RKAlgo1                      a       a
32 Vertical X1#            a               E       E
33 Horizontal X1#          a               E       E
34 LinIpolDeinterlace      e               E       E*
35 CubicIpolDeinterlace    a               e       e*
36 LinBlendDeinterlace     e               E       E*
37 MedianDeinterlace#              Ec      Ec
38 TempDeNoiser#           E               e       e
39
* I don't have a 3DNow CPU -> it's untested, but no one has said it doesn't work, so it seems to work
# more or less self-invented filters, so the exactness isn't too meaningful
E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
44 a = alternative / approximate impl
45 c = checked against the other implementations (-vo md5)
46 */
47
48 /*
49 TODO:
50 reduce the time wasted on the mem transfer
51 implement everything in C at least (done at the moment but ...)
52 unroll stuff if instructions depend too much on the prior one
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54 move YScale thing to the end instead of fixing QP
55 write a faster and higher quality deblocking filter :)
56 make the mainloop more flexible (variable number of blocks at once
57         (the if/else stuff per block is slowing things down)
58 compare the quality & speed of all filters
59 split this huge file
60 border remover
61 optimize c versions
62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
63 smart blur
64 commandline option for   the deblock / dering thresholds
65 memcpy chrominance if no chroma filtering is done
66 ...
67 */
68
69 //Changelog: use the CVS log
70
71 #include "../config.h"
72 #include <inttypes.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 #ifdef HAVE_MALLOC_H
77 #include <malloc.h>
78 #endif
79 //#undef HAVE_MMX2
80 //#define HAVE_3DNOW
81 //#undef HAVE_MMX
82 //#define DEBUG_BRIGHTNESS
83 #include "postprocess.h"
84
/* Generic arithmetic helpers. Each argument can be evaluated more than once,
   so never pass an expression with side effects. SIGN() yields -1 for 0. */
#define MIN(a,b) ((b) < (a) ? (b) : (a))
#define MAX(a,b) ((b) > (a) ? (b) : (a))
#define ABS(a) (0 < (a) ? (a) : (-(a)))
#define SIGN(a) (0 < (a) ? 1 : -1)
89
// The macros below expand to string fragments that are pasted into inline asm
// (AT&T syntax, i.e. the last operand is the destination).

// PAVGB(a,b): packed unsigned byte average of a and b, result in b.
// MMX2 provides pavgb, 3DNow provides pavgusb; the macro stays undefined
// when neither is available.
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

// PMINUB(x,y,t): packed unsigned byte minimum of x and y, result in y.
// MMX2: a single pminub, t is unused.
// plain MMX: emulated as y -= saturate(y - x), with t clobbered as scratch register.
// Note that the parameter names of the MMX variant are swapped (b,a,t) so that
// the second call argument is the destination in both variants.
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
        "movq " #a ", " #t " \n\t"\
        "psubusb " #b ", " #t " \n\t"\
        "psubb " #t ", " #a " \n\t"
#endif

// PMAXUB(a,b): packed unsigned byte maximum of a and b, result in b.
// plain MMX: emulated as b = saturate(b - a) + a.
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
        "psubusb " #a ", " #b " \n\t"\
        "paddb " #a ", " #b " \n\t"
#endif
112
113
// NOTE(review): both limits are used outside the part of the file visible here;
// presumably the scratch buffer size for parsing a mode string and the maximum
// number of options per filter -- confirm against the users.
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10
116
/*
 * Constants and scratch variables that the inline asm references by symbol name.
 * Naming scheme (visible from the values):
 *   wXX         16 bit word 0x00XX (w1400: 0x1400) replicated 4 times
 *   bmXXXXXXXX  byte mask, one digit per byte, most significant byte first
 *               (1 = 0xFF, 0 = 0x00)
 *   bXX         byte 0xXX replicated 8 times
 * pQPb is read by the asm as "QP,..., QP"; it is filled in outside this part
 * of the file.
 * Without HAVE_MMX only packedYOffset, packedYScale and tempBlocks exist.
 */
#ifdef HAVE_MMX
static volatile uint64_t __attribute__((aligned(8))) packedYOffset=     0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale=      0x0100010001000100LL;
static uint64_t __attribute__((aligned(8))) w05=                0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) w20=                0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) w1400=              0x1400140014001400LL;
static uint64_t __attribute__((aligned(8))) bm00000001=         0x00000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm00010000=         0x000000FF00000000LL;
static uint64_t __attribute__((aligned(8))) bm00001000=         0x00000000FF000000LL;
static uint64_t __attribute__((aligned(8))) bm10000000=         0xFF00000000000000LL;
static uint64_t __attribute__((aligned(8))) bm10000001=         0xFF000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm11000011=         0xFFFF00000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000011=         0x000000000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111110=         0xFFFFFFFFFFFFFF00LL;
static uint64_t __attribute__((aligned(8))) bm11000000=         0xFFFF000000000000LL;
static uint64_t __attribute__((aligned(8))) bm00011000=         0x000000FFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm00110011=         0x0000FFFF0000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11001100=         0xFFFF0000FFFF0000LL;
static uint64_t __attribute__((aligned(8))) b00=                0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) b01=                0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) b02=                0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) b0F=                0x0F0F0F0F0F0F0F0FLL;
static uint64_t __attribute__((aligned(8))) b04=                0x0404040404040404LL;
static uint64_t __attribute__((aligned(8))) b08=                0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) bFF=                0xFFFFFFFFFFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) b20=                0x2020202020202020LL;
static uint64_t __attribute__((aligned(8))) b80=                0x8080808080808080LL;
static uint64_t __attribute__((aligned(8))) b7E=                0x7E7E7E7E7E7E7E7ELL;
static uint64_t __attribute__((aligned(8))) b7C=                0x7C7C7C7C7C7C7C7CLL;
static uint64_t __attribute__((aligned(8))) b3F=                0x3F3F3F3F3F3F3F3FLL;
// 8 byte scratch slots (NOTE(review): presumably spill space for the asm -- users are outside this chunk)
static uint64_t __attribute__((aligned(8))) temp0=0;
static uint64_t __attribute__((aligned(8))) temp1=0;
static uint64_t __attribute__((aligned(8))) temp2=0;
static uint64_t __attribute__((aligned(8))) temp3=0;
static uint64_t __attribute__((aligned(8))) temp4=0;
static uint64_t __attribute__((aligned(8))) temp5=0;
static uint64_t __attribute__((aligned(8))) pQPb=0;
static uint64_t __attribute__((aligned(8))) pQPb2=0;
static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
#else
static uint64_t packedYOffset=  0x0000000000000000LL;
static uint64_t packedYScale=   0x0100010001000100LL;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif
162
// Externally visible tuning knobs.
// vFlatnessThreshold: isVertDC() reports a block as flat when more than this
// many of its neighbouring pixel pairs are (nearly) equal.
// NOTE(review): hFlatnessThreshold is presumably the horizontal counterpart -- its user is outside this chunk.
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;
int deringThreshold= 20;

//amount of "black" you are willing to lose to get a brightness corrected picture
double maxClippedThreshold= 0.01;

// NOTE(review): presumably the luma range targeted by the brightness correction -- confirm against the users
int maxAllowedY=234;
int minAllowedY=16;
172
// Table of the available filters, terminated by an all-NULL/0 entry.
// Each entry starts with a short and a long name and ends with the filter's
// mask constant; struct PPFilter is declared elsewhere.
// NOTE(review): the three integers in between presumably are a chroma default
// flag and minimum luma / chroma quality levels -- confirm against struct PPFilter.
// NOTE(review): "vr"/"rkvdeblock" is named like a vertical filter but is mapped
// to H_RK1_FILTER -- verify whether a V_RK1_FILTER constant was intended.
static struct PPFilter filters[]=
{
        {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
        {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
        {"vr", "rkvdeblock",            1, 2, 4, H_RK1_FILTER},
        {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
        {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
        {"dr", "dering",                1, 5, 6, DERING},
        {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
        {"lb", "linblenddeint",         0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
        {"li", "linipoldeint",          0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
        {"ci", "cubicipoldeint",        0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
        {"md", "mediandeint",           0, 1, 6, MEDIAN_DEINT_FILTER},
        {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
        {NULL, NULL,0,0,0,0} //End Marker
};
189
// Flat list of (alias, expansion) string pairs, terminated by a single NULL.
// "default"/"de" and "fast"/"fa" are long/short spellings of the same preset.
// NOTE(review): presumably applied to the user supplied mode string before it is
// parsed -- the consumer is outside this chunk.
static char *replaceTable[]=
{
        "default",      "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
        "de",           "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
        "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
        "fa",           "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
        NULL //End Marker
};
198
#ifdef HAVE_MMX
/**
 * Dummy function whose only purpose is to reference every constant that is
 * otherwise only named inside asm strings, so the compiler does not warn
 * about (or discard) them as unused. The result of the sum is irrelevant.
 */
static inline void unusedVariableWarningFixer()
{
	uint64_t total= packedYOffset;

	total+= packedYScale;
	total+= w05;
	total+= w20;
	total+= w1400;

	total+= bm00000001;
	total+= bm00010000;
	total+= bm00001000;
	total+= bm10000000;
	total+= bm10000001;
	total+= bm11000011;
	total+= bm00000011;
	total+= bm11111110;
	total+= bm11000000;
	total+= bm00011000;
	total+= bm00110011;
	total+= bm11001100;

	total+= b00;
	total+= b01;
	total+= b02;
	total+= b0F;
	total+= bFF;
	total+= b20;
	total+= b04;
	total+= b08;
	total+= pQPb2;
	total+= b80;
	total+= b7E;
	total+= b7C;
	total+= b3F;

	total+= temp0;
	total+= temp1;
	total+= temp2;
	total+= temp3;
	total+= temp4;
	total+= temp5;
	total+= pQPb;

	if(total == 0) b00=0;
}
#endif
210
#ifdef TIMING
/**
 * Read the CPU time stamp counter (only compiled when TIMING is defined).
 *
 * The two 32 bit halves are fetched through explicit "=a"/"=d" constraints:
 * the "=A" constraint only denotes the edx:eax pair on 32 bit x86, on x86-64
 * it would return a single register and lose half of the counter.
 *
 * @return the 64 bit counter value
 */
static inline long long rdtsc()
{
	uint32_t lo, hi;
	asm volatile(	"rdtsc\n\t"
		: "=a" (lo), "=d" (hi)
	);
	return (long long)(((uint64_t)hi << 32) | lo);
}
#endif
222
#ifdef HAVE_MMX2
/*
 * Thin wrappers around the prefetch instructions; each one issues a single
 * prefetch for the address p and has no other effect.
 */

/** Issue "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
	asm volatile("prefetchnta (%0)\n\t" : : "r" (p));
}

/** Issue "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
	asm volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}

/** Issue "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
	asm volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}

/** Issue "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
	asm volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
#endif
252
253 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
254 /**
255  * Check if the middle 8x8 Block in the given 8x16 block is flat
256  */
257 static inline int isVertDC(uint8_t src[], int stride){
258         int numEq= 0;
259 #ifndef HAVE_MMX
260         int y;
261 #endif
262         src+= stride*4; // src points to begin of the 8x8 Block
263 #ifdef HAVE_MMX
264 asm volatile(
265                 "leal (%1, %2), %%eax                           \n\t"
266                 "leal (%%eax, %2, 4), %%ebx                     \n\t"
267 //      0       1       2       3       4       5       6       7       8       9
268 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ebx     ebx+%2  ebx+2%2 %1+8%2  ebx+4%2
269                 "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7F
270                 "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7D
271                 "movq (%1), %%mm0                               \n\t"
272                 "movq (%%eax), %%mm1                            \n\t"
273                 "psubb %%mm1, %%mm0                             \n\t" // mm0 = differnece
274                 "paddb %%mm7, %%mm0                             \n\t"
275                 "pcmpgtb %%mm6, %%mm0                           \n\t"
276
277                 "movq (%%eax,%2), %%mm2                         \n\t"
278                 "psubb %%mm2, %%mm1                             \n\t"
279                 "paddb %%mm7, %%mm1                             \n\t"
280                 "pcmpgtb %%mm6, %%mm1                           \n\t"
281                 "paddb %%mm1, %%mm0                             \n\t"
282
283                 "movq (%%eax, %2, 2), %%mm1                     \n\t"
284                 "psubb %%mm1, %%mm2                             \n\t"
285                 "paddb %%mm7, %%mm2                             \n\t"
286                 "pcmpgtb %%mm6, %%mm2                           \n\t"
287                 "paddb %%mm2, %%mm0                             \n\t"
288
289                 "movq (%1, %2, 4), %%mm2                        \n\t"
290                 "psubb %%mm2, %%mm1                             \n\t"
291                 "paddb %%mm7, %%mm1                             \n\t"
292                 "pcmpgtb %%mm6, %%mm1                           \n\t"
293                 "paddb %%mm1, %%mm0                             \n\t"
294
295                 "movq (%%ebx), %%mm1                            \n\t"
296                 "psubb %%mm1, %%mm2                             \n\t"
297                 "paddb %%mm7, %%mm2                             \n\t"
298                 "pcmpgtb %%mm6, %%mm2                           \n\t"
299                 "paddb %%mm2, %%mm0                             \n\t"
300
301                 "movq (%%ebx, %2), %%mm2                        \n\t"
302                 "psubb %%mm2, %%mm1                             \n\t"
303                 "paddb %%mm7, %%mm1                             \n\t"
304                 "pcmpgtb %%mm6, %%mm1                           \n\t"
305                 "paddb %%mm1, %%mm0                             \n\t"
306
307                 "movq (%%ebx, %2, 2), %%mm1                     \n\t"
308                 "psubb %%mm1, %%mm2                             \n\t"
309                 "paddb %%mm7, %%mm2                             \n\t"
310                 "pcmpgtb %%mm6, %%mm2                           \n\t"
311                 "paddb %%mm2, %%mm0                             \n\t"
312
313                 "                                               \n\t"
314 #ifdef HAVE_MMX2
315                 "pxor %%mm7, %%mm7                              \n\t"
316                 "psadbw %%mm7, %%mm0                            \n\t"
317 #else
318                 "movq %%mm0, %%mm1                              \n\t"
319                 "psrlw $8, %%mm0                                \n\t"
320                 "paddb %%mm1, %%mm0                             \n\t"
321                 "movq %%mm0, %%mm1                              \n\t"
322                 "psrlq $16, %%mm0                               \n\t"
323                 "paddb %%mm1, %%mm0                             \n\t"
324                 "movq %%mm0, %%mm1                              \n\t"
325                 "psrlq $32, %%mm0                               \n\t"
326                 "paddb %%mm1, %%mm0                             \n\t"
327 #endif
328                 "movd %%mm0, %0                                 \n\t"
329                 : "=r" (numEq)
330                 : "r" (src), "r" (stride)
331                 : "%ebx"
332                 );
333         numEq= (-numEq) &0xFF;
334
335 #else
336         for(y=0; y<BLOCK_SIZE-1; y++)
337         {
338                 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
339                 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
340                 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
341                 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
342                 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
343                 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
344                 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
345                 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
346                 src+= stride;
347         }
348 #endif
349 /*      if(abs(numEq - asmEq) > 0)
350         {
351                 printf("\nasm:%d  c:%d\n", asmEq, numEq);
352                 for(int y=0; y<8; y++)
353                 {
354                         for(int x=0; x<8; x++)
355                         {
356                                 printf("%d ", temp[x + y*stride]);
357                         }
358                         printf("\n");
359                 }
360         }
361 */
362 //      for(int i=0; i<numEq/8; i++) src[i]=255;
363         return (numEq > vFlatnessThreshold) ? 1 : 0;
364 }
365
366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
367 {
368 #ifdef HAVE_MMX
369         int isOk;
370         src+= stride*3;
371         asm volatile(
372 //              "int $3 \n\t"
373                 "movq (%1, %2), %%mm0                           \n\t"
374                 "movq (%1, %2, 8), %%mm1                        \n\t"
375                 "movq %%mm0, %%mm2                              \n\t"
376                 "psubusb %%mm1, %%mm0                           \n\t"
377                 "psubusb %%mm2, %%mm1                           \n\t"
378                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
379
380                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
381                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
382                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
383                 "pcmpeqd b00, %%mm0                             \n\t"
384                 "psrlq $16, %%mm0                               \n\t"
385                 "pcmpeqd bFF, %%mm0                             \n\t"
386 //              "movd %%mm0, (%1, %2, 4)\n\t"
387                 "movd %%mm0, %0                                 \n\t"
388                 : "=r" (isOk)
389                 : "r" (src), "r" (stride)
390                 );
391         return isOk;
392 #else
393
394         int isOk2= 1;
395         int x;
396         src+= stride*3;
397         for(x=0; x<BLOCK_SIZE; x++)
398         {
399                 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
400         }
401 /*      if(isOk && !isOk2 || !isOk && isOk2)
402         {
403                 printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
404                 for(int y=0; y<9; y++)
405                 {
406                         for(int x=0; x<8; x++)
407                         {
408                                 printf("%d ", src[x + y*stride]);
409                         }
410                         printf("\n");
411                 }
412         } */
413
414         return isOk2;
415 #endif
416
417 }
418
419 /**
420  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
421  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
422  */
423 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
424 {
425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
426         src+= stride*3;
427         asm volatile(   //"movv %0 %1 %2\n\t"
428                 "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
429
430                 "movq (%0), %%mm6                               \n\t"
431                 "movq (%0, %1), %%mm5                           \n\t"
432                 "movq %%mm5, %%mm1                              \n\t"
433                 "movq %%mm6, %%mm2                              \n\t"
434                 "psubusb %%mm6, %%mm5                           \n\t"
435                 "psubusb %%mm1, %%mm2                           \n\t"
436                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
437                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
438                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
439
440                 "pand %%mm2, %%mm6                              \n\t"
441                 "pandn %%mm1, %%mm2                             \n\t"
442                 "por %%mm2, %%mm6                               \n\t"// First Line to Filter
443
444                 "movq (%0, %1, 8), %%mm5                        \n\t"
445                 "leal (%0, %1, 4), %%eax                        \n\t"
446                 "leal (%0, %1, 8), %%ebx                        \n\t"
447                 "subl %1, %%ebx                                 \n\t"
448                 "addl %1, %0                                    \n\t" // %0 points to line 1 not 0
449                 "movq (%0, %1, 8), %%mm7                        \n\t"
450                 "movq %%mm5, %%mm1                              \n\t"
451                 "movq %%mm7, %%mm2                              \n\t"
452                 "psubusb %%mm7, %%mm5                           \n\t"
453                 "psubusb %%mm1, %%mm2                           \n\t"
454                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
455                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
456                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
457
458                 "pand %%mm2, %%mm7                              \n\t"
459                 "pandn %%mm1, %%mm2                             \n\t"
460                 "por %%mm2, %%mm7                               \n\t" // First Line to Filter
461
462
463                 //      1       2       3       4       5       6       7       8
464                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ebx     eax+4%1
465                 // 6 4 2 2 1 1
466                 // 6 4 4 2
467                 // 6 8 2
468
469                 "movq (%0, %1), %%mm0                           \n\t" //  1
470                 "movq %%mm0, %%mm1                              \n\t" //  1
471                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
472                 PAVGB(%%mm6, %%mm0)                                   //3 1     /4
473
474                 "movq (%0, %1, 4), %%mm2                        \n\t" //     1
475                 "movq %%mm2, %%mm5                              \n\t" //     1
476                 PAVGB((%%eax), %%mm2)                                 //    11  /2
477                 PAVGB((%0, %1, 2), %%mm2)                             //   211  /4
478                 "movq %%mm2, %%mm3                              \n\t" //   211  /4
479                 "movq (%0), %%mm4                               \n\t" // 1
480                 PAVGB(%%mm4, %%mm3)                                   // 4 211  /8
481                 PAVGB(%%mm0, %%mm3)                                   //642211  /16
482                 "movq %%mm3, (%0)                               \n\t" // X
483                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
484                 "movq %%mm1, %%mm0                              \n\t" //  1
485                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
486                 "movq %%mm4, %%mm3                              \n\t" // 1
487                 PAVGB((%0,%1,2), %%mm3)                               // 1 1    /2
488                 PAVGB((%%eax,%1,2), %%mm5)                            //     11 /2
489                 PAVGB((%%eax), %%mm5)                                 //    211 /4
490                 PAVGB(%%mm5, %%mm3)                                   // 2 2211 /8
491                 PAVGB(%%mm0, %%mm3)                                   //4242211 /16
492                 "movq %%mm3, (%0,%1)                            \n\t" //  X
493                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
494                 PAVGB(%%mm4, %%mm6)                                   //11      /2
495                 "movq (%%ebx), %%mm0                            \n\t" //       1
496                 PAVGB((%%eax, %1, 2), %%mm0)                          //      11/2
497                 "movq %%mm0, %%mm3                              \n\t" //      11/2
498                 PAVGB(%%mm1, %%mm0)                                   //  2   11/4
499                 PAVGB(%%mm6, %%mm0)                                   //222   11/8
500                 PAVGB(%%mm2, %%mm0)                                   //22242211/16
501                 "movq (%0, %1, 2), %%mm2                        \n\t" //   1
502                 "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
503                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
504                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
505                 PAVGB((%%ebx), %%mm0)                                 //       11       /2
506                 PAVGB(%%mm0, %%mm6)                                   //11     11       /4
507                 PAVGB(%%mm1, %%mm4)                                   // 11             /2
508                 PAVGB(%%mm2, %%mm1)                                   //  11            /2
509                 PAVGB(%%mm1, %%mm6)                                   //1122   11       /8
510                 PAVGB(%%mm5, %%mm6)                                   //112242211       /16
511                 "movq (%%eax), %%mm5                            \n\t" //    1
512                 "movq %%mm6, (%%eax)                            \n\t" //    X
513                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
514                 "movq (%%eax, %1, 4), %%mm6                     \n\t" //        1
515                 PAVGB(%%mm7, %%mm6)                                   //        11      /2
516                 PAVGB(%%mm4, %%mm6)                                   // 11     11      /4
517                 PAVGB(%%mm3, %%mm6)                                   // 11   2211      /8
518                 PAVGB(%%mm5, %%mm2)                                   //   11           /2
519                 "movq (%0, %1, 4), %%mm4                        \n\t" //     1
520                 PAVGB(%%mm4, %%mm2)                                   //   112          /4
521                 PAVGB(%%mm2, %%mm6)                                   // 112242211      /16
522                 "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
523                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
524                 PAVGB(%%mm7, %%mm1)                                   //  11     2      /4
525                 PAVGB(%%mm4, %%mm5)                                   //    11          /2
526                 PAVGB(%%mm5, %%mm0)                                   //    11 11       /4
527                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
528                 PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
529                 PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
530                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
531                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
532                 PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
533                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
534                 PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
535                 PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
536                 PAVGB(%%mm2, %%mm6)                                   //   1122424      /4
537                 "movq %%mm6, (%%ebx)                            \n\t" //       X
538                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
539                 PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
540                 PAVGB(%%mm7, %%mm5)                                   //    11   6      /8
541
542                 PAVGB(%%mm3, %%mm0)                                   //      112       /4
543                 PAVGB(%%mm0, %%mm5)                                   //    112246      /16
544                 "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
545                 "subl %1, %0                                    \n\t"
546
547                 :
548                 : "r" (src), "r" (stride)
549                 : "%eax", "%ebx"
550         );
551 #else
552         const int l1= stride;
553         const int l2= stride + l1;
554         const int l3= stride + l2;
555         const int l4= stride + l3;
556         const int l5= stride + l4;
557         const int l6= stride + l5;
558         const int l7= stride + l6;
559         const int l8= stride + l7;
560         const int l9= stride + l8;
561         int x;
562         src+= stride*3;
563         for(x=0; x<BLOCK_SIZE; x++)
564         {
565                 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
566                 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
567
568                 int sums[9];
569                 sums[0] = first + src[l1];
570                 sums[1] = src[l1] + src[l2];
571                 sums[2] = src[l2] + src[l3];
572                 sums[3] = src[l3] + src[l4];
573                 sums[4] = src[l4] + src[l5];
574                 sums[5] = src[l5] + src[l6];
575                 sums[6] = src[l6] + src[l7];
576                 sums[7] = src[l7] + src[l8];
577                 sums[8] = src[l8] + last;
578
579                 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
580                 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
581                 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
582                 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
583                 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
584                 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
585                 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
586                 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
587
588                 src++;
589         }
590
591 #endif
592 }
593
594 /**
595  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
596  * values are correctly clipped (MMX2)
597  * values are wraparound (C)
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
599         0 8 16 24
600         x = 8
601         x/2 = 4
602         x/8 = 1
603         1 12 12 23
604  */
605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
606 {
607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
608         src+= stride*3;
609 // FIXME rounding
610         asm volatile(
611                 "pxor %%mm7, %%mm7                              \n\t" // 0
612                 "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
613                 "leal (%0, %1), %%eax                           \n\t"
614                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
615 //      0       1       2       3       4       5       6       7       8       9
616 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
617                 "movq pQPb, %%mm0                               \n\t" // QP,..., QP
618                 "movq %%mm0, %%mm1                              \n\t" // QP,..., QP
619                 "paddusb b02, %%mm0                             \n\t"
620                 "psrlw $2, %%mm0                                \n\t"
621                 "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
622                 "paddusb %%mm1, %%mm0                           \n\t" // QP*1.25 ...
623                 "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
624                 "movq (%%ebx), %%mm3                            \n\t" // line 5
625                 "movq %%mm2, %%mm4                              \n\t" // line 4
626                 "pcmpeqb %%mm5, %%mm5                           \n\t" // -1
627                 "pxor %%mm2, %%mm5                              \n\t" // -line 4 - 1
628                 PAVGB(%%mm3, %%mm5)
629                 "paddb %%mm6, %%mm5                             \n\t" // (l5-l4)/2
630                 "psubusb %%mm3, %%mm4                           \n\t"
631                 "psubusb %%mm2, %%mm3                           \n\t"
632                 "por %%mm3, %%mm4                               \n\t" // |l4 - l5|
633                 "psubusb %%mm0, %%mm4                           \n\t"
634                 "pcmpeqb %%mm7, %%mm4                           \n\t"
635                 "pand %%mm4, %%mm5                              \n\t" // d/2
636
637 //              "paddb %%mm6, %%mm2                             \n\t" // line 4 + 0x80
638                 "paddb %%mm5, %%mm2                             \n\t"
639 //              "psubb %%mm6, %%mm2                             \n\t"
640                 "movq %%mm2, (%0,%1, 4)                         \n\t"
641
642                 "movq (%%ebx), %%mm2                            \n\t"
643 //              "paddb %%mm6, %%mm2                             \n\t" // line 5 + 0x80
644                 "psubb %%mm5, %%mm2                             \n\t"
645 //              "psubb %%mm6, %%mm2                             \n\t"
646                 "movq %%mm2, (%%ebx)                            \n\t"
647
648                 "paddb %%mm6, %%mm5                             \n\t"
649                 "psrlw $2, %%mm5                                \n\t"
650                 "pand b3F, %%mm5                                \n\t"
651                 "psubb b20, %%mm5                               \n\t" // (l5-l4)/8
652
653                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
654                 "paddb %%mm6, %%mm2                             \n\t" // line 3 + 0x80
655                 "paddsb %%mm5, %%mm2                            \n\t"
656                 "psubb %%mm6, %%mm2                             \n\t"
657                 "movq %%mm2, (%%eax, %1, 2)                     \n\t"
658
659                 "movq (%%ebx, %1), %%mm2                        \n\t"
660                 "paddb %%mm6, %%mm2                             \n\t" // line 6 + 0x80
661                 "psubsb %%mm5, %%mm2                            \n\t"
662                 "psubb %%mm6, %%mm2                             \n\t"
663                 "movq %%mm2, (%%ebx, %1)                        \n\t"
664
665                 :
666                 : "r" (src), "r" (stride)
667                 : "%eax", "%ebx"
668         );
669 #else
670         const int l1= stride;
671         const int l2= stride + l1;
672         const int l3= stride + l2;
673         const int l4= stride + l3;
674         const int l5= stride + l4;
675         const int l6= stride + l5;
676 //      const int l7= stride + l6;
677 //      const int l8= stride + l7;
678 //      const int l9= stride + l8;
679         int x;
680         const int QP15= QP + (QP>>2);
681         src+= stride*3;
682         for(x=0; x<BLOCK_SIZE; x++)
683         {
684                 const int v = (src[x+l5] - src[x+l4]);
685                 if(ABS(v) < QP15)
686                 {
687                         src[x+l3] +=v>>3;
688                         src[x+l4] +=v>>1;
689                         src[x+l5] -=v>>1;
690                         src[x+l6] -=v>>3;
691
692                 }
693         }
694
695 #endif
696 }
697
698 /**
699  * Experimental Filter 1
700  * will not damage linear gradients
701  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
702  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
703  * MMX2 version does correct clipping C version doesnt
704  */
705 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
706 {
707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
708         src+= stride*3;
709
710         asm volatile(
711                 "pxor %%mm7, %%mm7                              \n\t" // 0
712 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
713                 "leal (%0, %1), %%eax                           \n\t"
714                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
715 //      0       1       2       3       4       5       6       7       8       9
716 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
717                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
718                 "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
719                 "movq %%mm1, %%mm2                              \n\t" // line 4
720                 "psubusb %%mm0, %%mm1                           \n\t"
721                 "psubusb %%mm2, %%mm0                           \n\t"
722                 "por %%mm1, %%mm0                               \n\t" // |l2 - l3|
723                 "movq (%%ebx), %%mm3                            \n\t" // line 5
724                 "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
725                 "movq %%mm3, %%mm5                              \n\t" // line 5
726                 "psubusb %%mm4, %%mm3                           \n\t"
727                 "psubusb %%mm5, %%mm4                           \n\t"
728                 "por %%mm4, %%mm3                               \n\t" // |l5 - l6|
729                 PAVGB(%%mm3, %%mm0)                                   // (|l2 - l3| + |l5 - l6|)/2
730                 "movq %%mm2, %%mm1                              \n\t" // line 4
731                 "psubusb %%mm5, %%mm2                           \n\t"
732                 "movq %%mm2, %%mm4                              \n\t"
733                 "pcmpeqb %%mm7, %%mm2                           \n\t" // (l4 - l5) <= 0 ? -1 : 0
734                 "psubusb %%mm1, %%mm5                           \n\t"
735                 "por %%mm5, %%mm4                               \n\t" // |l4 - l5|
736                 "psubusb %%mm0, %%mm4           \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
737                 "movq %%mm4, %%mm3                              \n\t" // d
738                 "psubusb pQPb, %%mm4                            \n\t"
739                 "pcmpeqb %%mm7, %%mm4                           \n\t" // d <= QP ? -1 : 0
740                 "psubusb b01, %%mm3                             \n\t"
741                 "pand %%mm4, %%mm3                              \n\t" // d <= QP ? d : 0
742
743                 PAVGB(%%mm7, %%mm3)                                   // d/2
744                 "movq %%mm3, %%mm1                              \n\t" // d/2
745                 PAVGB(%%mm7, %%mm3)                                   // d/4
746                 PAVGB(%%mm1, %%mm3)                                   // 3*d/8
747
748                 "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
749                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
750                 "psubusb %%mm3, %%mm0                           \n\t"
751                 "pxor %%mm2, %%mm0                              \n\t"
752                 "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
753
754                 "movq (%%ebx), %%mm0                            \n\t" // line 5
755                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
756                 "paddusb %%mm3, %%mm0                           \n\t"
757                 "pxor %%mm2, %%mm0                              \n\t"
758                 "movq %%mm0, (%%ebx)                            \n\t" // line 5
759
760                 PAVGB(%%mm7, %%mm1)                                   // d/4
761
762                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
763                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
764                 "psubusb %%mm1, %%mm0                           \n\t"
765                 "pxor %%mm2, %%mm0                              \n\t"
766                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // line 3
767
768                 "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
769                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
770                 "paddusb %%mm1, %%mm0                           \n\t"
771                 "pxor %%mm2, %%mm0                              \n\t"
772                 "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
773
774                 PAVGB(%%mm7, %%mm1)                                   // d/8
775
776                 "movq (%%eax, %1), %%mm0                        \n\t" // line 2
777                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
778                 "psubusb %%mm1, %%mm0                           \n\t"
779                 "pxor %%mm2, %%mm0                              \n\t"
780                 "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
781
782                 "movq (%%ebx, %1, 2), %%mm0                     \n\t" // line 7
783                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
784                 "paddusb %%mm1, %%mm0                           \n\t"
785                 "pxor %%mm2, %%mm0                              \n\t"
786                 "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // line 7
787
788                 :
789                 : "r" (src), "r" (stride)
790                 : "%eax", "%ebx"
791         );
792 #else
793
794         const int l1= stride;
795         const int l2= stride + l1;
796         const int l3= stride + l2;
797         const int l4= stride + l3;
798         const int l5= stride + l4;
799         const int l6= stride + l5;
800         const int l7= stride + l6;
801 //      const int l8= stride + l7;
802 //      const int l9= stride + l8;
803         int x;
804
805         src+= stride*3;
806         for(x=0; x<BLOCK_SIZE; x++)
807         {
808                 int a= src[l3] - src[l4];
809                 int b= src[l4] - src[l5];
810                 int c= src[l5] - src[l6];
811
812                 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
813                 d= MAX(d, 0);
814
815                 if(d < QP)
816                 {
817                         int v = d * SIGN(-b);
818
819                         src[l2] +=v>>3;
820                         src[l3] +=v>>2;
821                         src[l4] +=(3*v)>>3;
822                         src[l5] -=(3*v)>>3;
823                         src[l6] -=v>>2;
824                         src[l7] -=v>>3;
825
826                 }
827                 src++;
828         }
829         /*
830         const int l1= stride;
831         const int l2= stride + l1;
832         const int l3= stride + l2;
833         const int l4= stride + l3;
834         const int l5= stride + l4;
835         const int l6= stride + l5;
836         const int l7= stride + l6;
837         const int l8= stride + l7;
838         const int l9= stride + l8;
839         for(int x=0; x<BLOCK_SIZE; x++)
840         {
841                 int v2= src[l2];
842                 int v3= src[l3];
843                 int v4= src[l4];
844                 int v5= src[l5];
845                 int v6= src[l6];
846                 int v7= src[l7];
847
848                 if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
849                 {
850                         src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
851                         src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
852                         src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
853                         src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
854                 }
855                 src++;
856         }
857 */
858 #endif
859 }
860
861 /**
862  * Experimental Filter 1 (Horizontal)
863  * will not damage linear gradients
864  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
866  * MMX2 version does correct clipping C version doesnt
867  * not identical with the vertical one
868  */
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
870 {
871         int y;
872 //FIXME (has little in common with the mmx2 version)
873         for(y=0; y<BLOCK_SIZE; y++)
874         {
875                 int a= src[1] - src[2];
876                 int b= src[3] - src[4];
877                 int c= src[5] - src[6];
878
879                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
880
881                 if(d < QP)
882                 {
883                         int v = d * SIGN(-b);
884
885                         src[1] +=v/8;
886                         src[2] +=v/4;
887                         src[3] +=3*v/8;
888                         src[4] -=3*v/8;
889                         src[5] -=v/4;
890                         src[6] -=v/8;
891
892                 }
893                 src+=stride;
894         }
895 }
896
897
898 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
899 {
900 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
901 /*
902         uint8_t tmp[16];
903         const int l1= stride;
904         const int l2= stride + l1;
905         const int l3= stride + l2;
906         const int l4= (int)tmp - (int)src - stride*3;
907         const int l5= (int)tmp - (int)src - stride*3 + 8;
908         const int l6= stride*3 + l3;
909         const int l7= stride + l6;
910         const int l8= stride + l7;
911
912         memcpy(tmp, src+stride*7, 8);
913         memcpy(tmp+8, src+stride*8, 8);
914 */
915         src+= stride*4;
916         asm volatile(
917
918 #if 0 //sligtly more accurate and slightly slower
919                 "pxor %%mm7, %%mm7                              \n\t" // 0
920                 "leal (%0, %1), %%eax                           \n\t"
921                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
922 //      0       1       2       3       4       5       6       7
923 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
924 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
925
926
927                 "movq (%0, %1, 2), %%mm0                        \n\t" // l2
928                 "movq (%0), %%mm1                               \n\t" // l0
929                 "movq %%mm0, %%mm2                              \n\t" // l2
930                 PAVGB(%%mm7, %%mm0)                                   // ~l2/2
931                 PAVGB(%%mm1, %%mm0)                                   // ~(l2 + 2l0)/4
932                 PAVGB(%%mm2, %%mm0)                                   // ~(5l2 + 2l0)/8
933
934                 "movq (%%eax), %%mm1                            \n\t" // l1
935                 "movq (%%eax, %1, 2), %%mm3                     \n\t" // l3
936                 "movq %%mm1, %%mm4                              \n\t" // l1
937                 PAVGB(%%mm7, %%mm1)                                   // ~l1/2
938                 PAVGB(%%mm3, %%mm1)                                   // ~(l1 + 2l3)/4
939                 PAVGB(%%mm4, %%mm1)                                   // ~(5l1 + 2l3)/8
940
941                 "movq %%mm0, %%mm4                              \n\t" // ~(5l2 + 2l0)/8
942                 "psubusb %%mm1, %%mm0                           \n\t"
943                 "psubusb %%mm4, %%mm1                           \n\t"
944                 "por %%mm0, %%mm1                               \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
945 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
946
947                 "movq (%0, %1, 4), %%mm0                        \n\t" // l4
948                 "movq %%mm0, %%mm4                              \n\t" // l4
949                 PAVGB(%%mm7, %%mm0)                                   // ~l4/2
950                 PAVGB(%%mm2, %%mm0)                                   // ~(l4 + 2l2)/4
951                 PAVGB(%%mm4, %%mm0)                                   // ~(5l4 + 2l2)/8
952
953                 "movq (%%ebx), %%mm2                            \n\t" // l5
954                 "movq %%mm3, %%mm5                              \n\t" // l3
955                 PAVGB(%%mm7, %%mm3)                                   // ~l3/2
956                 PAVGB(%%mm2, %%mm3)                                   // ~(l3 + 2l5)/4
957                 PAVGB(%%mm5, %%mm3)                                   // ~(5l3 + 2l5)/8
958
959                 "movq %%mm0, %%mm6                              \n\t" // ~(5l4 + 2l2)/8
960                 "psubusb %%mm3, %%mm0                           \n\t"
961                 "psubusb %%mm6, %%mm3                           \n\t"
962                 "por %%mm0, %%mm3                               \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
963                 "pcmpeqb %%mm7, %%mm0                           \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
964 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
965
966                 "movq (%%ebx, %1), %%mm6                        \n\t" // l6
967                 "movq %%mm6, %%mm5                              \n\t" // l6
968                 PAVGB(%%mm7, %%mm6)                                   // ~l6/2
969                 PAVGB(%%mm4, %%mm6)                                   // ~(l6 + 2l4)/4
970                 PAVGB(%%mm5, %%mm6)                                   // ~(5l6 + 2l4)/8
971
972                 "movq (%%ebx, %1, 2), %%mm5                     \n\t" // l7
973                 "movq %%mm2, %%mm4                              \n\t" // l5
974                 PAVGB(%%mm7, %%mm2)                                   // ~l5/2
975                 PAVGB(%%mm5, %%mm2)                                   // ~(l5 + 2l7)/4
976                 PAVGB(%%mm4, %%mm2)                                   // ~(5l5 + 2l7)/8
977
978                 "movq %%mm6, %%mm4                              \n\t" // ~(5l6 + 2l4)/8
979                 "psubusb %%mm2, %%mm6                           \n\t"
980                 "psubusb %%mm4, %%mm2                           \n\t"
981                 "por %%mm6, %%mm2                               \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
982 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
983
984
985                 PMINUB(%%mm2, %%mm1, %%mm4)                           // MIN(|lenergy|,|renergy|)/8
986                 "movq pQPb, %%mm4                               \n\t" // QP //FIXME QP+1 ?
987                 "paddusb b01, %%mm4                             \n\t"
988                 "pcmpgtb %%mm3, %%mm4                           \n\t" // |menergy|/8 < QP
989                 "psubusb %%mm1, %%mm3                           \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
990                 "pand %%mm4, %%mm3                              \n\t"
991
992                 "movq %%mm3, %%mm1                              \n\t"
993 //              "psubusb b01, %%mm3                             \n\t"
994                 PAVGB(%%mm7, %%mm3)
995                 PAVGB(%%mm7, %%mm3)
996                 "paddusb %%mm1, %%mm3                           \n\t"
997 //              "paddusb b01, %%mm3                             \n\t"
998
999                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //l3
1000                 "movq (%0, %1, 4), %%mm5                        \n\t" //l4
1001                 "movq (%0, %1, 4), %%mm4                        \n\t" //l4
1002                 "psubusb %%mm6, %%mm5                           \n\t"
1003                 "psubusb %%mm4, %%mm6                           \n\t"
1004                 "por %%mm6, %%mm5                               \n\t" // |l3-l4|
1005                 "pcmpeqb %%mm7, %%mm6                           \n\t" // SIGN(l3-l4)
1006                 "pxor %%mm6, %%mm0                              \n\t"
1007                 "pand %%mm0, %%mm3                              \n\t"
1008                 PMINUB(%%mm5, %%mm3, %%mm0)
1009
1010                 "psubusb b01, %%mm3                             \n\t"
1011                 PAVGB(%%mm7, %%mm3)
1012
1013                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1014                 "movq (%0, %1, 4), %%mm2                        \n\t"
1015                 "pxor %%mm6, %%mm0                              \n\t"
1016                 "pxor %%mm6, %%mm2                              \n\t"
1017                 "psubb %%mm3, %%mm0                             \n\t"
1018                 "paddb %%mm3, %%mm2                             \n\t"
1019                 "pxor %%mm6, %%mm0                              \n\t"
1020                 "pxor %%mm6, %%mm2                              \n\t"
1021                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1022                 "movq %%mm2, (%0, %1, 4)                        \n\t"
1023 #endif
1024
1025                 "leal (%0, %1), %%eax                           \n\t"
1026                 "pcmpeqb %%mm6, %%mm6                           \n\t" // -1
1027 //      0       1       2       3       4       5       6       7
1028 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1029 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1030
1031
1032                 "movq (%%eax, %1, 2), %%mm1                     \n\t" // l3
1033                 "movq (%0, %1, 4), %%mm0                        \n\t" // l4
1034                 "pxor %%mm6, %%mm1                              \n\t" // -l3-1
1035                 PAVGB(%%mm1, %%mm0)                                   // -q+128 = (l4-l3+256)/2
1036 // mm1=-l3-1, mm0=128-q
1037
1038                 "movq (%%eax, %1, 4), %%mm2                     \n\t" // l5
1039                 "movq (%%eax, %1), %%mm3                        \n\t" // l2
1040                 "pxor %%mm6, %%mm2                              \n\t" // -l5-1
1041                 "movq %%mm2, %%mm5                              \n\t" // -l5-1
1042                 "movq b80, %%mm4                                \n\t" // 128
1043                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1044                 PAVGB(%%mm3, %%mm2)                                   // (l2-l5+256)/2
1045                 PAVGB(%%mm0, %%mm4)                                   // ~(l4-l3)/4 + 128
1046                 PAVGB(%%mm2, %%mm4)                                   // ~(l2-l5)/4 +(l4-l3)/8 + 128
1047                 PAVGB(%%mm0, %%mm4)                                   // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1048 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1049
1050                 "movq (%%eax), %%mm2                            \n\t" // l1
1051                 "pxor %%mm6, %%mm2                              \n\t" // -l1-1
1052                 PAVGB(%%mm3, %%mm2)                                   // (l2-l1+256)/2
1053                 PAVGB((%0), %%mm1)                                    // (l0-l3+256)/2
1054                 "movq b80, %%mm3                                \n\t" // 128
1055                 PAVGB(%%mm2, %%mm3)                                   // ~(l2-l1)/4 + 128
1056                 PAVGB(%%mm1, %%mm3)                                   // ~(l0-l3)/4 +(l2-l1)/8 + 128
1057                 PAVGB(%%mm2, %%mm3)                                   // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1058 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1059
1060                 PAVGB((%%ebx, %1), %%mm5)                             // (l6-l5+256)/2
1061                 "movq (%%ebx, %1, 2), %%mm1                     \n\t" // l7
1062                 "pxor %%mm6, %%mm1                              \n\t" // -l7-1
1063                 PAVGB((%0, %1, 4), %%mm1)                             // (l4-l7+256)/2
1064                 "movq b80, %%mm2                                \n\t" // 128
1065                 PAVGB(%%mm5, %%mm2)                                   // ~(l6-l5)/4 + 128
1066                 PAVGB(%%mm1, %%mm2)                                   // ~(l4-l7)/4 +(l6-l5)/8 + 128
1067                 PAVGB(%%mm5, %%mm2)                                   // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1068 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1069
1070                 "movq b00, %%mm1                                \n\t" // 0
1071                 "movq b00, %%mm5                                \n\t" // 0
1072                 "psubb %%mm2, %%mm1                             \n\t" // 128 - renergy/16
1073                 "psubb %%mm3, %%mm5                             \n\t" // 128 - lenergy/16
1074                 PMAXUB(%%mm1, %%mm2)                                  // 128 + |renergy/16|
1075                 PMAXUB(%%mm5, %%mm3)                                  // 128 + |lenergy/16|
1076                 PMINUB(%%mm2, %%mm3, %%mm1)                           // 128 + MIN(|lenergy|,|renergy|)/16
1077
1078 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1079
1080                 "movq b00, %%mm7                                \n\t" // 0
1081                 "movq pQPb, %%mm2                               \n\t" // QP
1082                 PAVGB(%%mm6, %%mm2)                                   // 128 + QP/2
1083                 "psubb %%mm6, %%mm2                             \n\t"
1084
1085                 "movq %%mm4, %%mm1                              \n\t"
1086                 "pcmpgtb %%mm7, %%mm1                           \n\t" // SIGN(menergy)
1087                 "pxor %%mm1, %%mm4                              \n\t"
1088                 "psubb %%mm1, %%mm4                             \n\t" // 128 + |menergy|/16
1089                 "pcmpgtb %%mm4, %%mm2                           \n\t" // |menergy|/16 < QP/2
1090                 "psubusb %%mm3, %%mm4                           \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1091 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1092
1093                 "movq %%mm4, %%mm3                              \n\t" // d
1094                 "psubusb b01, %%mm4                             \n\t"
1095                 PAVGB(%%mm7, %%mm4)                                   // d/32
1096                 PAVGB(%%mm7, %%mm4)                                   // (d + 32)/64
1097                 "paddb %%mm3, %%mm4                             \n\t" // 5d/64
1098                 "pand %%mm2, %%mm4                              \n\t"
1099
1100                 "movq b80, %%mm5                                \n\t" // 128
1101                 "psubb %%mm0, %%mm5                             \n\t" // q
1102                 "paddsb %%mm6, %%mm5                            \n\t" // fix bad rounding
1103                 "pcmpgtb %%mm5, %%mm7                           \n\t" // SIGN(q)
1104                 "pxor %%mm7, %%mm5                              \n\t"
1105
1106                 PMINUB(%%mm5, %%mm4, %%mm3)                           // MIN(|q|, 5d/64)
1107                 "pxor %%mm1, %%mm7                              \n\t" // SIGN(d*q)
1108
1109                 "pand %%mm7, %%mm4                              \n\t"
1110                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1111                 "movq (%0, %1, 4), %%mm2                        \n\t"
1112                 "pxor %%mm1, %%mm0                              \n\t"
1113                 "pxor %%mm1, %%mm2                              \n\t"
1114                 "paddb %%mm4, %%mm0                             \n\t"
1115                 "psubb %%mm4, %%mm2                             \n\t"
1116                 "pxor %%mm1, %%mm0                              \n\t"
1117                 "pxor %%mm1, %%mm2                              \n\t"
1118                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1119                 "movq %%mm2, (%0, %1, 4)                        \n\t"
1120
1121                 :
1122                 : "r" (src), "r" (stride)
1123                 : "%eax", "%ebx"
1124         );
1125
1126 /*
1127         {
1128         int x;
1129         src-= stride;
1130         for(x=0; x<BLOCK_SIZE; x++)
1131         {
1132                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1133                 if(ABS(middleEnergy)< 8*QP)
1134                 {
1135                         const int q=(src[l4] - src[l5])/2;
1136                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1137                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1138
1139                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1140                         d= MAX(d, 0);
1141
1142                         d= (5*d + 32) >> 6;
1143                         d*= SIGN(-middleEnergy);
1144
1145                         if(q>0)
1146                         {
1147                                 d= d<0 ? 0 : d;
1148                                 d= d>q ? q : d;
1149                         }
1150                         else
1151                         {
1152                                 d= d>0 ? 0 : d;
1153                                 d= d<q ? q : d;
1154                         }
1155
1156                         src[l4]-= d;
1157                         src[l5]+= d;
1158                 }
1159                 src++;
1160         }
1161 src-=8;
1162         for(x=0; x<8; x++)
1163         {
1164                 int y;
1165                 for(y=4; y<6; y++)
1166                 {
1167                         int d= src[x+y*stride] - tmp[x+(y-4)*8];
1168                         int ad= ABS(d);
1169                         static int max=0;
1170                         static int sum=0;
1171                         static int num=0;
1172                         static int bias=0;
1173
1174                         if(max<ad) max=ad;
1175                         sum+= ad>3 ? 1 : 0;
1176                         if(ad>3)
1177                         {
1178                                 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1179                         }
1180                         if(y==4) bias+=d;
1181                         num++;
1182                         if(num%1000000 == 0)
1183                         {
1184                                 printf(" %d %d %d %d\n", num, sum, max, bias);
1185                         }
1186                 }
1187         }
1188 }
1189 */
1190 #elif defined (HAVE_MMX)
1191         src+= stride*4;
1192
1193         asm volatile(
1194                 "pxor %%mm7, %%mm7                              \n\t"
1195                 "leal (%0, %1), %%eax                           \n\t"
1196                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1197 //      0       1       2       3       4       5       6       7
1198 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1199 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1200
1201                 "movq (%0), %%mm0                               \n\t"
1202                 "movq %%mm0, %%mm1                              \n\t"
1203                 "punpcklbw %%mm7, %%mm0                         \n\t" // low part of line 0
1204                 "punpckhbw %%mm7, %%mm1                         \n\t" // high part of line 0
1205
1206                 "movq (%%eax), %%mm2                            \n\t"
1207                 "movq %%mm2, %%mm3                              \n\t"
1208                 "punpcklbw %%mm7, %%mm2                         \n\t" // low part of line 1
1209                 "punpckhbw %%mm7, %%mm3                         \n\t" // high part of line 1
1210
1211                 "movq (%%eax, %1), %%mm4                        \n\t"
1212                 "movq %%mm4, %%mm5                              \n\t"
1213                 "punpcklbw %%mm7, %%mm4                         \n\t" // low part of line 2
1214                 "punpckhbw %%mm7, %%mm5                         \n\t" // high part of line 2
1215
1216                 "paddw %%mm0, %%mm0                             \n\t" // 2L0
1217                 "paddw %%mm1, %%mm1                             \n\t" // 2H0
1218                 "psubw %%mm4, %%mm2                             \n\t" // L1 - L2
1219                 "psubw %%mm5, %%mm3                             \n\t" // H1 - H2
1220                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - L1 + L2
1221                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - H1 + H2
1222
1223                 "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1224                 "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1225                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2
1226                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2
1227
1228                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
1229                 "movq %%mm2, %%mm3                              \n\t"
1230                 "punpcklbw %%mm7, %%mm2                         \n\t" // L3
1231                 "punpckhbw %%mm7, %%mm3                         \n\t" // H3
1232
1233                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - L3
1234                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - H3
1235                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1236                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1237                 "movq %%mm0, temp0                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1238                 "movq %%mm1, temp1                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1239
1240                 "movq (%0, %1, 4), %%mm0                        \n\t"
1241                 "movq %%mm0, %%mm1                              \n\t"
1242                 "punpcklbw %%mm7, %%mm0                         \n\t" // L4
1243                 "punpckhbw %%mm7, %%mm1                         \n\t" // H4
1244
1245                 "psubw %%mm0, %%mm2                             \n\t" // L3 - L4
1246                 "psubw %%mm1, %%mm3                             \n\t" // H3 - H4
1247                 "movq %%mm2, temp2                              \n\t" // L3 - L4
1248                 "movq %%mm3, temp3                              \n\t" // H3 - H4
1249                 "paddw %%mm4, %%mm4                             \n\t" // 2L2
1250                 "paddw %%mm5, %%mm5                             \n\t" // 2H2
1251                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - L3 + L4
1252                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - H3 + H4
1253
1254                 "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1255                 "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1256                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4
1257                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4
1258 //50 opcodes so far
1259                 "movq (%%ebx), %%mm2                            \n\t"
1260                 "movq %%mm2, %%mm3                              \n\t"
1261                 "punpcklbw %%mm7, %%mm2                         \n\t" // L5
1262                 "punpckhbw %%mm7, %%mm3                         \n\t" // H5
1263                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - L5
1264                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - H5
1265                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1266                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1267
1268                 "movq (%%ebx, %1), %%mm6                        \n\t"
1269                 "punpcklbw %%mm7, %%mm6                         \n\t" // L6
1270                 "psubw %%mm6, %%mm2                             \n\t" // L5 - L6
1271                 "movq (%%ebx, %1), %%mm6                        \n\t"
1272                 "punpckhbw %%mm7, %%mm6                         \n\t" // H6
1273                 "psubw %%mm6, %%mm3                             \n\t" // H5 - H6
1274
1275                 "paddw %%mm0, %%mm0                             \n\t" // 2L4
1276                 "paddw %%mm1, %%mm1                             \n\t" // 2H4
1277                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - L5 + L6
1278                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - H5 + H6
1279
1280                 "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1281                 "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1282                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6
1283                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6
1284
1285                 "movq (%%ebx, %1, 2), %%mm2                     \n\t"
1286                 "movq %%mm2, %%mm3                              \n\t"
1287                 "punpcklbw %%mm7, %%mm2                         \n\t" // L7
1288                 "punpckhbw %%mm7, %%mm3                         \n\t" // H7
1289
1290                 "paddw %%mm2, %%mm2                             \n\t" // 2L7
1291                 "paddw %%mm3, %%mm3                             \n\t" // 2H7
1292                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1293                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1294
1295                 "movq temp0, %%mm2                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1296                 "movq temp1, %%mm3                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1297
1298 #ifdef HAVE_MMX2
1299                 "movq %%mm7, %%mm6                              \n\t" // 0
1300                 "psubw %%mm0, %%mm6                             \n\t"
1301                 "pmaxsw %%mm6, %%mm0                            \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1302                 "movq %%mm7, %%mm6                              \n\t" // 0
1303                 "psubw %%mm1, %%mm6                             \n\t"
1304                 "pmaxsw %%mm6, %%mm1                            \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1305                 "movq %%mm7, %%mm6                              \n\t" // 0
1306                 "psubw %%mm2, %%mm6                             \n\t"
1307                 "pmaxsw %%mm6, %%mm2                            \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1308                 "movq %%mm7, %%mm6                              \n\t" // 0
1309                 "psubw %%mm3, %%mm6                             \n\t"
1310                 "pmaxsw %%mm6, %%mm3                            \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1311 #else
1312                 "movq %%mm7, %%mm6                              \n\t" // 0
1313                 "pcmpgtw %%mm0, %%mm6                           \n\t"
1314                 "pxor %%mm6, %%mm0                              \n\t"
1315                 "psubw %%mm6, %%mm0                             \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1316                 "movq %%mm7, %%mm6                              \n\t" // 0
1317                 "pcmpgtw %%mm1, %%mm6                           \n\t"
1318                 "pxor %%mm6, %%mm1                              \n\t"
1319                 "psubw %%mm6, %%mm1                             \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1320                 "movq %%mm7, %%mm6                              \n\t" // 0
1321                 "pcmpgtw %%mm2, %%mm6                           \n\t"
1322                 "pxor %%mm6, %%mm2                              \n\t"
1323                 "psubw %%mm6, %%mm2                             \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1324                 "movq %%mm7, %%mm6                              \n\t" // 0
1325                 "pcmpgtw %%mm3, %%mm6                           \n\t"
1326                 "pxor %%mm6, %%mm3                              \n\t"
1327                 "psubw %%mm6, %%mm3                             \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1328 #endif
1329
1330 #ifdef HAVE_MMX2
1331                 "pminsw %%mm2, %%mm0                            \n\t"
1332                 "pminsw %%mm3, %%mm1                            \n\t"
1333 #else
1334                 "movq %%mm0, %%mm6                              \n\t"
1335                 "psubusw %%mm2, %%mm6                           \n\t"
1336                 "psubw %%mm6, %%mm0                             \n\t"
1337                 "movq %%mm1, %%mm6                              \n\t"
1338                 "psubusw %%mm3, %%mm6                           \n\t"
1339                 "psubw %%mm6, %%mm1                             \n\t"
1340 #endif
1341
1342                 "movq %%mm7, %%mm6                              \n\t" // 0
1343                 "pcmpgtw %%mm4, %%mm6                           \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1344                 "pxor %%mm6, %%mm4                              \n\t"
1345                 "psubw %%mm6, %%mm4                             \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1346                 "pcmpgtw %%mm5, %%mm7                           \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1347                 "pxor %%mm7, %%mm5                              \n\t"
1348                 "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1349 // 100 opcodes
1350                 "movd %2, %%mm2                                 \n\t" // QP
1351                 "punpcklwd %%mm2, %%mm2                         \n\t"
1352                 "punpcklwd %%mm2, %%mm2                         \n\t"
1353                 "psllw $3, %%mm2                                \n\t" // 8QP
1354                 "movq %%mm2, %%mm3                              \n\t" // 8QP
1355                 "pcmpgtw %%mm4, %%mm2                           \n\t"
1356                 "pcmpgtw %%mm5, %%mm3                           \n\t"
1357                 "pand %%mm2, %%mm4                              \n\t"
1358                 "pand %%mm3, %%mm5                              \n\t"
1359
1360
1361                 "psubusw %%mm0, %%mm4                           \n\t" // hd
1362                 "psubusw %%mm1, %%mm5                           \n\t" // ld
1363
1364
1365                 "movq w05, %%mm2                                \n\t" // 5
1366                 "pmullw %%mm2, %%mm4                            \n\t"
1367                 "pmullw %%mm2, %%mm5                            \n\t"
1368                 "movq w20, %%mm2                                \n\t" // 32
1369                 "paddw %%mm2, %%mm4                             \n\t"
1370                 "paddw %%mm2, %%mm5                             \n\t"
1371                 "psrlw $6, %%mm4                                \n\t"
1372                 "psrlw $6, %%mm5                                \n\t"
1373
1374 /*
1375                 "movq w06, %%mm2                                \n\t" // 6
1376                 "paddw %%mm2, %%mm4                             \n\t"
1377                 "paddw %%mm2, %%mm5                             \n\t"
1378                 "movq w1400, %%mm2                              \n\t" // 1400h = 5120 = 5/64*2^16
1379 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1380                 "pmulhw %%mm2, %%mm4                            \n\t" // hd/13
1381                 "pmulhw %%mm2, %%mm5                            \n\t" // ld/13
1382 */
1383
1384                 "movq temp2, %%mm0                              \n\t" // L3 - L4
1385                 "movq temp3, %%mm1                              \n\t" // H3 - H4
1386
1387                 "pxor %%mm2, %%mm2                              \n\t"
1388                 "pxor %%mm3, %%mm3                              \n\t"
1389
1390                 "pcmpgtw %%mm0, %%mm2                           \n\t" // sign (L3-L4)
1391                 "pcmpgtw %%mm1, %%mm3                           \n\t" // sign (H3-H4)
1392                 "pxor %%mm2, %%mm0                              \n\t"
1393                 "pxor %%mm3, %%mm1                              \n\t"
1394                 "psubw %%mm2, %%mm0                             \n\t" // |L3-L4|
1395                 "psubw %%mm3, %%mm1                             \n\t" // |H3-H4|
1396                 "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1397                 "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1398
1399                 "pxor %%mm6, %%mm2                              \n\t"
1400                 "pxor %%mm7, %%mm3                              \n\t"
1401                 "pand %%mm2, %%mm4                              \n\t"
1402                 "pand %%mm3, %%mm5                              \n\t"
1403
1404 #ifdef HAVE_MMX2
1405                 "pminsw %%mm0, %%mm4                            \n\t"
1406                 "pminsw %%mm1, %%mm5                            \n\t"
1407 #else
1408                 "movq %%mm4, %%mm2                              \n\t"
1409                 "psubusw %%mm0, %%mm2                           \n\t"
1410                 "psubw %%mm2, %%mm4                             \n\t"
1411                 "movq %%mm5, %%mm2                              \n\t"
1412                 "psubusw %%mm1, %%mm2                           \n\t"
1413                 "psubw %%mm2, %%mm5                             \n\t"
1414 #endif
1415                 "pxor %%mm6, %%mm4                              \n\t"
1416                 "pxor %%mm7, %%mm5                              \n\t"
1417                 "psubw %%mm6, %%mm4                             \n\t"
1418                 "psubw %%mm7, %%mm5                             \n\t"
1419                 "packsswb %%mm5, %%mm4                          \n\t"
1420                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1421                 "paddb   %%mm4, %%mm0                           \n\t"
1422                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1423                 "movq (%0, %1, 4), %%mm0                        \n\t"
1424                 "psubb %%mm4, %%mm0                             \n\t"
1425                 "movq %%mm0, (%0, %1, 4)                        \n\t"
1426
1427                 :
1428                 : "r" (src), "r" (stride), "r" (QP)
1429                 : "%eax", "%ebx"
1430         );
1431 #else
1432         const int l1= stride;
1433         const int l2= stride + l1;
1434         const int l3= stride + l2;
1435         const int l4= stride + l3;
1436         const int l5= stride + l4;
1437         const int l6= stride + l5;
1438         const int l7= stride + l6;
1439         const int l8= stride + l7;
1440 //      const int l9= stride + l8;
1441         int x;
1442         src+= stride*3;
1443         for(x=0; x<BLOCK_SIZE; x++)
1444         {
1445                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1446                 if(ABS(middleEnergy) < 8*QP)
1447                 {
1448                         const int q=(src[l4] - src[l5])/2;
1449                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1450                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1451
1452                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1453                         d= MAX(d, 0);
1454
1455                         d= (5*d + 32) >> 6;
1456                         d*= SIGN(-middleEnergy);
1457
1458                         if(q>0)
1459                         {
1460                                 d= d<0 ? 0 : d;
1461                                 d= d>q ? q : d;
1462                         }
1463                         else
1464                         {
1465                                 d= d>0 ? 0 : d;
1466                                 d= d<q ? q : d;
1467                         }
1468
1469                         src[l4]-= d;
1470                         src[l5]+= d;
1471                 }
1472                 src++;
1473         }
1474 #endif
1475 }
1476
1477 /**
1478  * Check if the given 8x8 Block is mostly "flat"
1479  */
1480 static inline int isHorizDC(uint8_t src[], int stride)
1481 {
1482         int numEq= 0;
1483         int y;
1484         for(y=0; y<BLOCK_SIZE; y++)
1485         {
1486                 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1487                 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1488                 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1489                 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1490                 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1491                 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1492                 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1493                 src+= stride;
1494         }
1495         return numEq > hFlatnessThreshold;
1496 }
1497
/**
 * Cheap range check for the horizontal filters: only the first and the last
 * pixel of the first row (src[0] and src[7]) are compared.
 * Returns 1 if they differ by at most 2*QP, 0 otherwise.
 * stride is accepted for interface symmetry but not used.
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
        const int edgeDiff= src[0] - src[7];
        (void)stride;

        return abs(edgeDiff) <= 2*QP;
}
1504
1505 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1506 {
1507         int y;
1508         for(y=0; y<BLOCK_SIZE; y++)
1509         {
1510                 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1511
1512                 if(ABS(middleEnergy) < 8*QP)
1513                 {
1514                         const int q=(dst[3] - dst[4])/2;
1515                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1516                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1517
1518                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1519                         d= MAX(d, 0);
1520
1521                         d= (5*d + 32) >> 6;
1522                         d*= SIGN(-middleEnergy);
1523
1524                         if(q>0)
1525                         {
1526                                 d= d<0 ? 0 : d;
1527                                 d= d>q ? q : d;
1528                         }
1529                         else
1530                         {
1531                                 d= d>0 ? 0 : d;
1532                                 d= d<q ? q : d;
1533                         }
1534
1535                         dst[3]-= d;
1536                         dst[4]+= d;
1537                 }
1538                 dst+= stride;
1539         }
1540 }
1541
1542 /**
1543  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1544  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1545  */
1546 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1547 {
1548
1549         int y;
1550         for(y=0; y<BLOCK_SIZE; y++)
1551         {
1552                 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1553                 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1554
1555                 int sums[9];
1556                 sums[0] = first + dst[0];
1557                 sums[1] = dst[0] + dst[1];
1558                 sums[2] = dst[1] + dst[2];
1559                 sums[3] = dst[2] + dst[3];
1560                 sums[4] = dst[3] + dst[4];
1561                 sums[5] = dst[4] + dst[5];
1562                 sums[6] = dst[5] + dst[6];
1563                 sums[7] = dst[6] + dst[7];
1564                 sums[8] = dst[7] + last;
1565
1566                 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1567                 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1568                 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1569                 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1570                 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1571                 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1572                 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1573                 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1574
1575                 dst+= stride;
1576         }
1577 }
1578
1579
1580 static inline void dering(uint8_t src[], int stride, int QP)
1581 {
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1583         asm volatile(
1584                 "movq pQPb, %%mm0                               \n\t"
1585                 "paddusb %%mm0, %%mm0                           \n\t"
1586                 "movq %%mm0, pQPb2                              \n\t"
1587
1588                 "leal (%0, %1), %%eax                           \n\t"
1589                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1590 //      0       1       2       3       4       5       6       7       8       9
1591 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1592
1593                 "pcmpeqb %%mm7, %%mm7                           \n\t"
1594                 "pxor %%mm6, %%mm6                              \n\t"
1595 #ifdef HAVE_MMX2
1596 #define FIND_MIN_MAX(addr)\
1597                 "movq " #addr ", %%mm0                          \n\t"\
1598                 "pminub %%mm0, %%mm7                            \n\t"\
1599                 "pmaxub %%mm0, %%mm6                            \n\t"
1600 #else
1601 #define FIND_MIN_MAX(addr)\
1602                 "movq " #addr ", %%mm0                          \n\t"\
1603                 "movq %%mm7, %%mm1                              \n\t"\
1604                 "psubusb %%mm0, %%mm6                           \n\t"\
1605                 "paddb %%mm0, %%mm6                             \n\t"\
1606                 "psubusb %%mm0, %%mm1                           \n\t"\
1607                 "psubb %%mm1, %%mm7                             \n\t"
1608 #endif
1609
1610 FIND_MIN_MAX((%%eax))
1611 FIND_MIN_MAX((%%eax, %1))
1612 FIND_MIN_MAX((%%eax, %1, 2))
1613 FIND_MIN_MAX((%0, %1, 4))
1614 FIND_MIN_MAX((%%ebx))
1615 FIND_MIN_MAX((%%ebx, %1))
1616 FIND_MIN_MAX((%%ebx, %1, 2))
1617 FIND_MIN_MAX((%0, %1, 8))
1618
1619                 "movq %%mm7, %%mm4                              \n\t"
1620                 "psrlq $8, %%mm7                                \n\t"
1621 #ifdef HAVE_MMX2
1622                 "pminub %%mm4, %%mm7                            \n\t" // min of pixels
1623                 "pshufw $0xF9, %%mm7, %%mm4                     \n\t"
1624                 "pminub %%mm4, %%mm7                            \n\t" // min of pixels
1625                 "pshufw $0xFE, %%mm7, %%mm4                     \n\t"
1626                 "pminub %%mm4, %%mm7                            \n\t"
1627 #else
1628                 "movq %%mm7, %%mm1                              \n\t"
1629                 "psubusb %%mm4, %%mm1                           \n\t"
1630                 "psubb %%mm1, %%mm7                             \n\t"
1631                 "movq %%mm7, %%mm4                              \n\t"
1632                 "psrlq $16, %%mm7                               \n\t"
1633                 "movq %%mm7, %%mm1                              \n\t"
1634                 "psubusb %%mm4, %%mm1                           \n\t"
1635                 "psubb %%mm1, %%mm7                             \n\t"
1636                 "movq %%mm7, %%mm4                              \n\t"
1637                 "psrlq $32, %%mm7                               \n\t"
1638                 "movq %%mm7, %%mm1                              \n\t"
1639                 "psubusb %%mm4, %%mm1                           \n\t"
1640                 "psubb %%mm1, %%mm7                             \n\t"
1641 #endif
1642
1643
1644                 "movq %%mm6, %%mm4                              \n\t"
1645                 "psrlq $8, %%mm6                                \n\t"
1646 #ifdef HAVE_MMX2
1647                 "pmaxub %%mm4, %%mm6                            \n\t" // max of pixels
1648                 "pshufw $0xF9, %%mm6, %%mm4                     \n\t"
1649                 "pmaxub %%mm4, %%mm6                            \n\t"
1650                 "pshufw $0xFE, %%mm6, %%mm4                     \n\t"
1651                 "pmaxub %%mm4, %%mm6                            \n\t"
1652 #else
1653                 "psubusb %%mm4, %%mm6                           \n\t"
1654                 "paddb %%mm4, %%mm6                             \n\t"
1655                 "movq %%mm6, %%mm4                              \n\t"
1656                 "psrlq $16, %%mm6                               \n\t"
1657                 "psubusb %%mm4, %%mm6                           \n\t"
1658                 "paddb %%mm4, %%mm6                             \n\t"
1659                 "movq %%mm6, %%mm4                              \n\t"
1660                 "psrlq $32, %%mm6                               \n\t"
1661                 "psubusb %%mm4, %%mm6                           \n\t"
1662                 "paddb %%mm4, %%mm6                             \n\t"
1663 #endif
1664                 "movq %%mm6, %%mm0                              \n\t" // max
1665                 "psubb %%mm7, %%mm6                             \n\t" // max - min
1666                 "movd %%mm6, %%ecx                              \n\t"
1667                 "cmpb deringThreshold, %%cl                     \n\t"
1668                 " jb 1f                                         \n\t"
1669                 PAVGB(%%mm0, %%mm7)                                   // a=(max + min)/2
1670                 "punpcklbw %%mm7, %%mm7                         \n\t"
1671                 "punpcklbw %%mm7, %%mm7                         \n\t"
1672                 "punpcklbw %%mm7, %%mm7                         \n\t"
1673                 "movq %%mm7, temp0                              \n\t"
1674
1675                 "movq (%0), %%mm0                               \n\t" // L10
1676                 "movq %%mm0, %%mm1                              \n\t" // L10
1677                 "movq %%mm0, %%mm2                              \n\t" // L10
1678                 "psllq $8, %%mm1                                \n\t"
1679                 "psrlq $8, %%mm2                                \n\t"
1680                 "movd -4(%0), %%mm3                             \n\t"
1681                 "movd 8(%0), %%mm4                              \n\t"
1682                 "psrlq $24, %%mm3                               \n\t"
1683                 "psllq $56, %%mm4                               \n\t"
1684                 "por %%mm3, %%mm1                               \n\t" // L00
1685                 "por %%mm4, %%mm2                               \n\t" // L20
1686                 "movq %%mm1, %%mm3                              \n\t" // L00
1687                 PAVGB(%%mm2, %%mm1)                                   // (L20 + L00)/2
1688                 PAVGB(%%mm0, %%mm1)                                   // (L20 + L00 + 2L10)/4
1689                 "psubusb %%mm7, %%mm0                           \n\t"
1690                 "psubusb %%mm7, %%mm2                           \n\t"
1691                 "psubusb %%mm7, %%mm3                           \n\t"
1692                 "pcmpeqb b00, %%mm0                             \n\t" // L10 > a ? 0 : -1
1693                 "pcmpeqb b00, %%mm2                             \n\t" // L20 > a ? 0 : -1
1694                 "pcmpeqb b00, %%mm3                             \n\t" // L00 > a ? 0 : -1
1695                 "paddb %%mm2, %%mm0                             \n\t"
1696                 "paddb %%mm3, %%mm0                             \n\t"
1697
1698                 "movq (%%eax), %%mm2                            \n\t" // L11
1699                 "movq %%mm2, %%mm3                              \n\t" // L11
1700                 "movq %%mm2, %%mm4                              \n\t" // L11
1701                 "psllq $8, %%mm3                                \n\t"
1702                 "psrlq $8, %%mm4                                \n\t"
1703                 "movd -4(%%eax), %%mm5                          \n\t"
1704                 "movd 8(%%eax), %%mm6                           \n\t"
1705                 "psrlq $24, %%mm5                               \n\t"
1706                 "psllq $56, %%mm6                               \n\t"
1707                 "por %%mm5, %%mm3                               \n\t" // L01
1708                 "por %%mm6, %%mm4                               \n\t" // L21
1709                 "movq %%mm3, %%mm5                              \n\t" // L01
1710                 PAVGB(%%mm4, %%mm3)                                   // (L21 + L01)/2
1711                 PAVGB(%%mm2, %%mm3)                                   // (L21 + L01 + 2L11)/4
1712                 "psubusb %%mm7, %%mm2                           \n\t"
1713                 "psubusb %%mm7, %%mm4                           \n\t"
1714                 "psubusb %%mm7, %%mm5                           \n\t"
1715                 "pcmpeqb b00, %%mm2                             \n\t" // L11 > a ? 0 : -1
1716                 "pcmpeqb b00, %%mm4                             \n\t" // L21 > a ? 0 : -1
1717                 "pcmpeqb b00, %%mm5                             \n\t" // L01 > a ? 0 : -1
1718                 "paddb %%mm4, %%mm2                             \n\t"
1719                 "paddb %%mm5, %%mm2                             \n\t"
1720 // 0, 2, 3, 1
1721 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1722                 "movq " #src ", " #sx "                         \n\t" /* src[0] */\
1723                 "movq " #sx ", " #lx "                          \n\t" /* src[0] */\
1724                 "movq " #sx ", " #t0 "                          \n\t" /* src[0] */\
1725                 "psllq $8, " #lx "                              \n\t"\
1726                 "psrlq $8, " #t0 "                              \n\t"\
1727                 "movd -4" #src ", " #t1 "                       \n\t"\
1728                 "psrlq $24, " #t1 "                             \n\t"\
1729                 "por " #t1 ", " #lx "                           \n\t" /* src[-1] */\
1730                 "movd 8" #src ", " #t1 "                        \n\t"\
1731                 "psllq $56, " #t1 "                             \n\t"\
1732                 "por " #t1 ", " #t0 "                           \n\t" /* src[+1] */\
1733                 "movq " #lx ", " #t1 "                          \n\t" /* src[-1] */\
1734                 PAVGB(t0, lx)                                         /* (src[-1] + src[+1])/2 */\
1735                 PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
1736                 PAVGB(lx, pplx)                                      \
1737                 "movq " #lx ", temp1                            \n\t"\
1738                 "movq temp0, " #lx "                            \n\t"\
1739                 "psubusb " #lx ", " #t1 "                       \n\t"\
1740                 "psubusb " #lx ", " #t0 "                       \n\t"\
1741                 "psubusb " #lx ", " #sx "                       \n\t"\
1742                 "movq b00, " #lx "                              \n\t"\
1743                 "pcmpeqb " #lx ", " #t1 "                       \n\t" /* src[-1] > a ? 0 : -1*/\
1744                 "pcmpeqb " #lx ", " #t0 "                       \n\t" /* src[+1] > a ? 0 : -1*/\
1745                 "pcmpeqb " #lx ", " #sx "                       \n\t" /* src[0]  > a ? 0 : -1*/\
1746                 "paddb " #t1 ", " #t0 "                         \n\t"\
1747                 "paddb " #t0 ", " #sx "                         \n\t"\
1748 \
1749                 PAVGB(plx, pplx)                                      /* filtered */\
1750                 "movq " #dst ", " #t0 "                         \n\t" /* dst */\
1751                 "movq " #t0 ", " #t1 "                          \n\t" /* dst */\
1752                 "psubusb pQPb2, " #t0 "                         \n\t"\
1753                 "paddusb pQPb2, " #t1 "                         \n\t"\
1754                 PMAXUB(t0, pplx)\
1755                 PMINUB(t1, pplx, t0)\
1756                 "paddb " #sx ", " #ppsx "                       \n\t"\
1757                 "paddb " #psx ", " #ppsx "                      \n\t"\
1758         "#paddb b02, " #ppsx "                          \n\t"\
1759                 "pand b08, " #ppsx "                            \n\t"\
1760                 "pcmpeqb " #lx ", " #ppsx "                     \n\t"\
1761                 "pand " #ppsx ", " #pplx "                      \n\t"\
1762                 "pandn " #dst ", " #ppsx "                      \n\t"\
1763                 "por " #pplx ", " #ppsx "                       \n\t"\
1764                 "movq " #ppsx ", " #dst "                       \n\t"\
1765                 "movq temp1, " #lx "                            \n\t"
1766
1767 /*
1768 0000000
1769 1111111
1770
1771 1111110
1772 1111101
1773 1111100
1774 1111011
1775 1111010
1776 1111001
1777
1778 1111000
1779 1110111
1780
1781 */
1782 //DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1783 DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1784 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1785 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1786 DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1787 DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1788 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1789 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1790 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1791
1792                 "1:                     \n\t"
1793                 : : "r" (src), "r" (stride), "r" (QP)
1794                 : "%eax", "%ebx", "%ecx"
1795         );
1796 #else
1797         int y;
1798         int min=255;
1799         int max=0;
1800         int avg;
1801         uint8_t *p;
1802         int s[10];
1803
1804         for(y=1; y<9; y++)
1805         {
1806                 int x;
1807                 p= src + stride*y;
1808                 for(x=1; x<9; x++)
1809                 {
1810                         p++;
1811                         if(*p > max) max= *p;
1812                         if(*p < min) min= *p;
1813                 }
1814         }
1815         avg= (min + max + 1)/2;
1816
1817         if(max - min <deringThreshold) return;
1818
1819         for(y=0; y<10; y++)
1820         {
1821                 int x;
1822                 int t = 0;
1823                 p= src + stride*y;
1824                 for(x=0; x<10; x++)
1825                 {
1826                         if(*p > avg) t |= (1<<x);
1827                         p++;
1828                 }
1829                 t |= (~t)<<16;
1830                 t &= (t<<1) & (t>>1);
1831                 s[y] = t;
1832         }
1833
1834         for(y=1; y<9; y++)
1835         {
1836                 int x;
1837                 int t = s[y-1] & s[y] & s[y+1];
1838                 t|= t>>16;
1839
1840                 p= src + stride*y;
1841                 for(x=1; x<9; x++)
1842                 {
1843                         p++;
1844                         if(t & (1<<x))
1845                         {
1846                                 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1847                                       +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1848                                       +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1849                                 f= (f + 8)>>4;
1850
1851 #ifdef DEBUG_DERING_THRESHOLD
1852                                 asm volatile("emms\n\t":);
1853                                 {
1854                                 static long long numPixels=0;
1855                                 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1856 //                              if((max-min)<20 || (max-min)*QP<200)
1857 //                              if((max-min)*QP < 500)
1858 //                              if(max-min<QP/2)
1859                                 if(max-min < 20)
1860                                 {
1861                                         static int numSkiped=0;
1862                                         static int errorSum=0;
1863                                         static int worstQP=0;
1864                                         static int worstRange=0;
1865                                         static int worstDiff=0;
1866                                         int diff= (f - *p);
1867                                         int absDiff= ABS(diff);
1868                                         int error= diff*diff;
1869
1870                                         if(x==1 || x==8 || y==1 || y==8) continue;
1871
1872                                         numSkiped++;
1873                                         if(absDiff > worstDiff)
1874                                         {
1875                                                 worstDiff= absDiff;
1876                                                 worstQP= QP;
1877                                                 worstRange= max-min;
1878                                         }
1879                                         errorSum+= error;
1880
1881                                         if(1024LL*1024LL*1024LL % numSkiped == 0)
1882                                         {
1883                                                 printf( "sum:%1.3f, skip:%d, wQP:%d, "
1884                                                         "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1885                                                         (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1886                                                         worstDiff, (float)numSkiped/numPixels);
1887                                         }
1888                                 }
1889                                 }
1890 #endif
1891                                 if     (*p + 2*QP < f) *p= *p + 2*QP;
1892                                 else if(*p - 2*QP > f) *p= *p - 2*QP;
1893                                 else *p=f;
1894                         }
1895                 }
1896         }
1897 #ifdef DEBUG_DERING_THRESHOLD
1898         if(max-min < 20)
1899         {
1900                 for(y=1; y<9; y++)
1901                 {
1902                         int x;
1903                         int t = 0;
1904                         p= src + stride*y;
1905                         for(x=1; x<9; x++)
1906                         {
1907                                 p++;
1908                                 *p = MIN(*p + 20, 255);
1909                         }
1910                 }
1911 //              src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1912         }
1913 #endif
1914 #endif
1915 }
1916
/**
 * Deinterlaces the given block by linear interpolation.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Lines 5, 7, 9 and 11 are each replaced by the average of the line directly
 * above and the line directly below; the even lines are left untouched.
 *
 * @param src    pointer to line 0 of the block (8 pixels wide)
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t"
                "movq (%%eax, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%ebx, %1, 2)                     \n\t"

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#else
        int col;
        src+= 4*stride;
        for(col=0; col<8; col++)
        {
                uint8_t *pix= src + col;
                int row;
                /* only odd rows are written and only even rows are read */
                for(row=1; row<8; row+=2)
                {
                        const int above= pix[stride*(row-1)];
                        const int below= pix[stride*(row+1)];
                        pix[stride*row]= (above + below)>>1;
                }
        }
#endif
}
1963
/**
 * Deinterlaces the given block by cubic interpolation.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * The filter reads the odd lines 3-15 and rewrites the even lines 6, 8, 10
 * and 12, each as (-a + 9b + 9c - d)/16 of the four nearest odd lines.
 * The result is clamped to 0..255 in both the SIMD and the C version.
 *
 * @param src    pointer to line 0 of the block (8 pixels wide)
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= stride*3;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
                "leal (%%ebx, %1, 4), %%ecx                     \n\t"
                "addl %1, %%ecx                                 \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx

#define DEINT_CUBIC(a,b,c,d,e)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm1                             \n\t"\
                "movq " #d ", %%mm2                             \n\t"\
                "movq " #e ", %%mm3                             \n\t"\
                PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
                PAVGB(%%mm3, %%mm0)                                     /* (a+e) /2 */\
                "movq %%mm0, %%mm2                              \n\t"\
                "punpcklbw %%mm7, %%mm0                         \n\t"\
                "punpckhbw %%mm7, %%mm2                         \n\t"\
                "movq %%mm1, %%mm3                              \n\t"\
                "punpcklbw %%mm7, %%mm1                         \n\t"\
                "punpckhbw %%mm7, %%mm3                         \n\t"\
                "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
                "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
                "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
                "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
                "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
                "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
                "packuswb %%mm3, %%mm1                          \n\t"\
                "movq %%mm1, " #c "                             \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx", "ecx"
        );
#else
        int x;
        src+= stride*3;
        for(x=0; x<8; x++)
        {
                int y;
                /* rows 3,5,7,9 (relative to src) are written; only even rows are read */
                for(y=3; y<=9; y+=2)
                {
                        const int sum= -src[stride*(y-3)] + 9*src[stride*(y-1)]
                                      + 9*src[stride*(y+1)] - src[stride*(y+3)];
                        int v;

                        /* Clamp to the 8-bit range; the unclipped value can reach
                           -32..286 and would wrap when stored into a uint8_t.
                           Negative sums are handled before the shift so that no
                           negative value is ever right-shifted. */
                        if(sum < 0)     v= 0;
                        else            v= sum>>4;
                        if(v > 255)     v= 255;

                        src[stride*y]= v;
                }
                src++;
        }
#endif
}
2028
/**
 * Deinterlaces the given block by vertically blending neighbouring lines.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Every output line n is built from lines n, n+1 and n+2 with weights 1:2:1,
 * so the image is shifted up by 1 line (FIXME if this is a problem).
 * This filter reads lines 4-13 and writes lines 4-11.
 *
 * @param src    pointer to line 0 of the block (8 pixels wide)
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                   // L0+L2
                "movq (%%eax), %%mm2                            \n\t" // L1
                PAVGB(%%mm2, %%mm0)
                "movq %%mm0, (%0)                               \n\t"
                "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
                PAVGB(%%mm0, %%mm2)                                   // L1+L3
                PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
                "movq %%mm2, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
                PAVGB(%%mm2, %%mm1)                                   // L2+L4
                PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
                "movq %%mm1, (%%eax, %1)                        \n\t"
                "movq (%%ebx), %%mm1                            \n\t" // L5
                PAVGB(%%mm1, %%mm0)                                   // L3+L5
                PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
                PAVGB(%%mm0, %%mm2)                                   // L4+L6
                PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
                "movq %%mm2, (%0, %1, 4)                        \n\t"
                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                PAVGB(%%mm2, %%mm1)                                   // L5+L7
                PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
                "movq %%mm1, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
                PAVGB(%%mm1, %%mm0)                                   // L6+L8
                PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
                "movq %%mm0, (%%ebx, %1)                        \n\t"
                "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
                PAVGB(%%mm0, %%mm2)                                   // L7+L9
                PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#else
        int col;
        src+= 4*stride;
        for(col=0; col<8; col++)
        {
                uint8_t *line= src + col;
                int row;
                /* walking top to bottom is safe in place: a row only reads itself
                   and the two rows below it, none of which has been rewritten yet */
                for(row=0; row<8; row++)
                {
                        line[0]= (line[0] + 2*line[stride] + line[stride*2])>>2;
                        line+= stride;
                }
        }
#endif
}
2103
/**
 * Deinterlaces the given block with a vertical 3-tap median filter.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Lines 5, 7, 9 and 11 are each replaced by the median of themselves and the
 * lines directly above and below; the even lines are left untouched.
 *
 * @param src    pointer to line 0 of the block (8 pixels wide)
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
        src+= 4*stride;
#ifdef HAVE_MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" //
                "movq (%%eax, %1), %%mm2                        \n\t" //
                "movq (%%eax), %%mm1                            \n\t" //
                "movq %%mm0, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm3, %%mm1                            \n\t" //
                "pmaxub %%mm2, %%mm1                            \n\t" //
                "pminub %%mm1, %%mm0                            \n\t"
                "movq %%mm0, (%%eax)                            \n\t"

                "movq (%0, %1, 4), %%mm0                        \n\t" //
                "movq (%%eax, %1, 2), %%mm1                     \n\t" //
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm1                            \n\t" //
                "pmaxub %%mm0, %%mm1                            \n\t" //
                "pminub %%mm1, %%mm2                            \n\t"
                "movq %%mm2, (%%eax, %1, 2)                     \n\t"

                "movq (%%ebx), %%mm2                            \n\t" //
                "movq (%%ebx, %1), %%mm1                        \n\t" //
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm0                            \n\t" //
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx)                            \n\t"

                "movq (%%ebx, %1, 2), %%mm2                     \n\t" //
                "movq (%0, %1, 8), %%mm0                        \n\t" //
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm0                            \n\t" //
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                "pxor %%mm7, %%mm7                              \n\t"

#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm2                             \n\t"\
                "movq " #c ", %%mm1                             \n\t"\
                "movq %%mm0, %%mm3                              \n\t"\
                "movq %%mm1, %%mm4                              \n\t"\
                "movq %%mm2, %%mm5                              \n\t"\
                "psubusb %%mm1, %%mm3                           \n\t"\
                "psubusb %%mm2, %%mm4                           \n\t"\
                "psubusb %%mm0, %%mm5                           \n\t"\
                "pcmpeqb %%mm7, %%mm3                           \n\t"\
                "pcmpeqb %%mm7, %%mm4                           \n\t"\
                "pcmpeqb %%mm7, %%mm5                           \n\t"\
                "movq %%mm3, %%mm6                              \n\t"\
                "pxor %%mm4, %%mm3                              \n\t"\
                "pxor %%mm5, %%mm4                              \n\t"\
                "pxor %%mm6, %%mm5                              \n\t"\
                "por %%mm3, %%mm1                               \n\t"\
                "por %%mm4, %%mm2                               \n\t"\
                "por %%mm5, %%mm0                               \n\t"\
                "pand %%mm2, %%mm0                              \n\t"\
                "pand %%mm1, %%mm0                              \n\t"\
                "movq %%mm0, " #b "                             \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#endif // HAVE_MMX2
#else
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                uint8_t *col= src + x;
                int y;
                /* only odd rows are written and only their even neighbours plus
                   the row itself are read, so filtering in place is safe */
                for(y=1; y<8; y+=2)
                {
                        const int above= col[stride*(y-1)];
                        const int cur  = col[stride*y];
                        const int below= col[stride*(y+1)];
                        int lo= above < cur ? above : cur;
                        const int hi= above < cur ? cur : above;

                        /* median(a,b,c) = min(max(a,b), max(min(a,b), c)),
                           the same min/max network the MMX2 path uses */
                        if(lo < below) lo= below;
                        col[stride*y]= hi < lo ? hi : lo;
                }
        }
#endif
}
2222
2223 #ifdef HAVE_MMX
/**
 * Transposes the given 8x8 block and stores shifted copies of the result
 * into dst1 and dst2.
 *
 * Eight rows of eight bytes are read from src. After transposition each
 * source column becomes one 8-byte line: its first four bytes come from
 * source rows 0-3 (first half of the asm) and the next four bytes, stored
 * 4 bytes further on, from source rows 4-7 (second half).
 *
 * Store offsets, in bytes from the destination pointers:
 *   dst1: columns 0..4 at 128, 144, 160, 176, 192
 *   dst2: columns 3..7 at  48,  64,  80,  96, 112
 * Columns 3 and 4 are therefore written to both buffers.
 * NOTE(review): the offsets advance in steps of 16, so the destination
 * buffers presumably use a 16-byte line pitch -- confirm against the caller.
 * NOTE(review): the asm lists only eax/ebx as clobbered (no "memory" clobber)
 * although it writes through dst1/dst2 -- verify this is safe with the
 * compilers in use.
 *
 * @param dst1      first destination buffer (operand %2)
 * @param dst2      second destination buffer (operand %3)
 * @param src       top-left pixel of the 8x8 source block (operand %0)
 * @param srcStride distance in bytes between two source rows (operand %1)
 */
static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
        asm(
                // eax = src + srcStride, ebx = src + 5*srcStride
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                // --- source rows 0-3: interleave bytes of rows 0/1 ...
                "movq (%0), %%mm0               \n\t" // 12345678
                "movq (%%eax), %%mm1            \n\t" // abcdefgh
                "movq %%mm0, %%mm2              \n\t" // 12345678
                "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
                "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h

                // ... and of rows 2/3
                "movq (%%eax, %1), %%mm1        \n\t"
                "movq (%%eax, %1, 2), %%mm3     \n\t"
                "movq %%mm1, %%mm4              \n\t"
                "punpcklbw %%mm3, %%mm1         \n\t"
                "punpckhbw %%mm3, %%mm4         \n\t"

                // interleave 16-bit pairs: mm0 = columns 0|1, mm3 = columns 2|3,
                // mm2 = columns 4|5, mm1 = columns 6|7 (4 bytes per column)
                "movq %%mm0, %%mm3              \n\t"
                "punpcklwd %%mm1, %%mm0         \n\t"
                "punpckhwd %%mm1, %%mm3         \n\t"
                "movq %%mm2, %%mm1              \n\t"
                "punpcklwd %%mm4, %%mm2         \n\t"
                "punpckhwd %%mm4, %%mm1         \n\t"

                // store one 4-byte column at a time; psrlq $32 exposes the
                // second column held in the upper half of each register
                "movd %%mm0, 128(%2)            \n\t"
                "psrlq $32, %%mm0               \n\t"
                "movd %%mm0, 144(%2)            \n\t"
                "movd %%mm3, 160(%2)            \n\t"
                "psrlq $32, %%mm3               \n\t"
                "movd %%mm3, 176(%2)            \n\t"
                "movd %%mm3, 48(%3)             \n\t"
                "movd %%mm2, 192(%2)            \n\t"
                "movd %%mm2, 64(%3)             \n\t"
                "psrlq $32, %%mm2               \n\t"
                "movd %%mm2, 80(%3)             \n\t"
                "movd %%mm1, 96(%3)             \n\t"
                "psrlq $32, %%mm1               \n\t"
                "movd %%mm1, 112(%3)            \n\t"

                // --- source rows 4-7: same sequence, results stored 4 bytes
                // after the ones written above
                "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
                "movq (%%ebx), %%mm1            \n\t" // abcdefgh
                "movq %%mm0, %%mm2              \n\t" // 12345678
                "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
                "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h

                "movq (%%ebx, %1), %%mm1        \n\t"
                "movq (%%ebx, %1, 2), %%mm3     \n\t"
                "movq %%mm1, %%mm4              \n\t"
                "punpcklbw %%mm3, %%mm1         \n\t"
                "punpckhbw %%mm3, %%mm4         \n\t"

                "movq %%mm0, %%mm3              \n\t"
                "punpcklwd %%mm1, %%mm0         \n\t"
                "punpckhwd %%mm1, %%mm3         \n\t"
                "movq %%mm2, %%mm1              \n\t"
                "punpcklwd %%mm4, %%mm2         \n\t"
                "punpckhwd %%mm4, %%mm1         \n\t"

                "movd %%mm0, 132(%2)            \n\t"
                "psrlq $32, %%mm0               \n\t"
                "movd %%mm0, 148(%2)            \n\t"
                "movd %%mm3, 164(%2)            \n\t"
                "psrlq $32, %%mm3               \n\t"
                "movd %%mm3, 180(%2)            \n\t"
                "movd %%mm3, 52(%3)             \n\t"
                "movd %%mm2, 196(%2)            \n\t"
                "movd %%mm2, 68(%3)             \n\t"
                "psrlq $32, %%mm2               \n\t"
                "movd %%mm2, 84(%3)             \n\t"
                "movd %%mm1, 100(%3)            \n\t"
                "psrlq $32, %%mm1               \n\t"
                "movd %%mm1, 116(%3)            \n\t"


        :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
        : "%eax", "%ebx"
        );
}
2307
2308 /**
2309  * transposes the given 8x8 block
      * src is read as 8 rows of 8 bytes whose rows start 16 bytes apart
      * (offsets 0, 16, ... 112); dst receives 8 rows of 8 bytes whose rows start
      * dstStride bytes apart, such that byte x of dst row y is byte y of src row x.
      * The asm stores through dst, which is only passed in as an input register,
      * so a "memory" clobber is declared to keep the compiler from caching or
      * reordering memory accesses across the asm statement.
      * Scratch registers: eax, ebx and mm0-mm4.
2310  */
2311 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2312 {
2313         asm(
2314                 "leal (%0, %1), %%eax                           \n\t" // eax = dst + dstStride (row 1)
2315                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = dst + 5*dstStride (row 5)
2316 //      0       1       2       3       4       5       6       7       8       9
2317 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                     // first half: source rows 0..3 become bytes 0..3 of every destination row
2318                 "movq (%2), %%mm0               \n\t" // 12345678
2319                 "movq 16(%2), %%mm1             \n\t" // abcdefgh
2320                 "movq %%mm0, %%mm2              \n\t" // 12345678
2321                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2322                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2323
2324                 "movq 32(%2), %%mm1             \n\t"
2325                 "movq 48(%2), %%mm3             \n\t"
2326                 "movq %%mm1, %%mm4              \n\t"
2327                 "punpcklbw %%mm3, %%mm1         \n\t"
2328                 "punpckhbw %%mm3, %%mm4         \n\t"
2329
                     // interleave 16-bit pairs: each 32-bit half now holds one source column of rows 0..3
2330                 "movq %%mm0, %%mm3              \n\t"
2331                 "punpcklwd %%mm1, %%mm0         \n\t"
2332                 "punpckhwd %%mm1, %%mm3         \n\t"
2333                 "movq %%mm2, %%mm1              \n\t"
2334                 "punpcklwd %%mm4, %%mm2         \n\t"
2335                 "punpckhwd %%mm4, %%mm1         \n\t"
2336
                     // store 4 bytes at the start of destination rows 0..7
2337                 "movd %%mm0, (%0)               \n\t"
2338                 "psrlq $32, %%mm0               \n\t"
2339                 "movd %%mm0, (%%eax)            \n\t"
2340                 "movd %%mm3, (%%eax, %1)        \n\t"
2341                 "psrlq $32, %%mm3               \n\t"
2342                 "movd %%mm3, (%%eax, %1, 2)     \n\t"
2343                 "movd %%mm2, (%0, %1, 4)        \n\t"
2344                 "psrlq $32, %%mm2               \n\t"
2345                 "movd %%mm2, (%%ebx)            \n\t"
2346                 "movd %%mm1, (%%ebx, %1)        \n\t"
2347                 "psrlq $32, %%mm1               \n\t"
2348                 "movd %%mm1, (%%ebx, %1, 2)     \n\t"
2349
2350
                     // second half: source rows 4..7 become bytes 4..7 of every destination row
2351                 "movq 64(%2), %%mm0             \n\t" // 12345678
2352                 "movq 80(%2), %%mm1             \n\t" // abcdefgh
2353                 "movq %%mm0, %%mm2              \n\t" // 12345678
2354                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2355                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2356
2357                 "movq 96(%2), %%mm1             \n\t"
2358                 "movq 112(%2), %%mm3            \n\t"
2359                 "movq %%mm1, %%mm4              \n\t"
2360                 "punpcklbw %%mm3, %%mm1         \n\t"
2361                 "punpckhbw %%mm3, %%mm4         \n\t"
2362
2363                 "movq %%mm0, %%mm3              \n\t"
2364                 "punpcklwd %%mm1, %%mm0         \n\t"
2365                 "punpckhwd %%mm1, %%mm3         \n\t"
2366                 "movq %%mm2, %%mm1              \n\t"
2367                 "punpcklwd %%mm4, %%mm2         \n\t"
2368                 "punpckhwd %%mm4, %%mm1         \n\t"
2369
2370                 "movd %%mm0, 4(%0)              \n\t"
2371                 "psrlq $32, %%mm0               \n\t"
2372                 "movd %%mm0, 4(%%eax)           \n\t"
2373                 "movd %%mm3, 4(%%eax, %1)       \n\t"
2374                 "psrlq $32, %%mm3               \n\t"
2375                 "movd %%mm3, 4(%%eax, %1, 2)    \n\t"
2376                 "movd %%mm2, 4(%0, %1, 4)       \n\t"
2377                 "psrlq $32, %%mm2               \n\t"
2378                 "movd %%mm2, 4(%%ebx)           \n\t"
2379                 "movd %%mm1, 4(%%ebx, %1)       \n\t"
2380                 "psrlq $32, %%mm1               \n\t"
2381                 "movd %%mm1, 4(%%ebx, %1, 2)    \n\t"
2382
2383         :: "r" (dst), "r" (dstStride), "r" (src)
2384         : "%eax", "%ebx", "memory"
2385         );
2386 }
2387 #endif
2388 //static int test=0;
2389
2390 static void inline tempNoiseReducer(uint8_t *src, int stride,
2391                                     uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2392 {
     /*
      * Temporal noise reducer for one 8x8 block (8 rows of 8 bytes, rows `stride` bytes apart).
      *
      * src            current block; overwritten with the blended result in every branch
      *                except the plain-copy one
      * tempBlured     reference block with the same stride; updated in every branch
      * tempBluredPast entry belonging to this block in a table of difference values; the
      *                entries at index -1, +1, -256 and +256 are read, the entry itself is written
      * maxNoise       three thresholds; read only by the C fallback, the asm path compares
      *                against the 32-bit words at maxTmpNoise, maxTmpNoise+4 and maxTmpNoise+8
      *
      * A difference measure d between src and tempBlured is computed, smoothed with the four
      * neighbouring table entries as (4*d + neighbours + 4)>>3, and the result selects how
      * strongly src is mixed with tempBlured.
      *
      * NOTE(review): the asm path stores the smoothed value in *tempBluredPast while the C
      * fallback stores the raw d - confirm which one is intended.
      * NOTE(review): the asm path ignores the maxNoise argument; presumably the caller keeps
      * maxTmpNoise in sync - confirm.
      * NOTE(review): the asm uses unsigned "below" compares, so a value equal to a threshold
      * can take a different branch than in the C fallback (which uses > and <) - confirm.
      */
2393 #define FAST_L2_DIFF
2394 //#define L1_DIFF //u should change the thresholds too if u try that one
2395 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2396         asm volatile(
2397                 "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
2398                 "leal (%2, %2, 4), %%ebx                        \n\t" // 5*stride
2399                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2400 //      0       1       2       3       4       5       6       7       8       9
2401 //      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+ebx  %x+2eax %x+ecx  %x+8%2
2402 //FIXME reorder?
2403 #ifdef L1_DIFF //needs mmx2
                     // sum of absolute differences of the 8 rows, accumulated in mm0
2404                 "movq (%0), %%mm0                               \n\t" // L0
2405                 "psadbw (%1), %%mm0                             \n\t" // |L0-R0|
2406                 "movq (%0, %2), %%mm1                           \n\t" // L1
2407                 "psadbw (%1, %2), %%mm1                         \n\t" // |L1-R1|
2408                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2409                 "psadbw (%1, %2, 2), %%mm2                      \n\t" // |L2-R2|
2410                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2411                 "psadbw (%1, %%eax), %%mm3                      \n\t" // |L3-R3|
2412
2413                 "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2414                 "paddw %%mm1, %%mm0                             \n\t"
2415                 "psadbw (%1, %2, 4), %%mm4                      \n\t" // |L4-R4|
2416                 "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2417                 "paddw %%mm2, %%mm0                             \n\t"
2418                 "psadbw (%1, %%ebx), %%mm5                      \n\t" // |L5-R5|
2419                 "movq (%0, %%eax, 2), %%mm6                     \n\t" // L6
2420                 "paddw %%mm3, %%mm0                             \n\t"
2421                 "psadbw (%1, %%eax, 2), %%mm6                   \n\t" // |L6-R6|
2422                 "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2423                 "paddw %%mm4, %%mm0                             \n\t"
2424                 "psadbw (%1, %%ecx), %%mm7                      \n\t" // |L7-R7|
2425                 "paddw %%mm5, %%mm6                             \n\t"
2426                 "paddw %%mm7, %%mm6                             \n\t"
2427                 "paddw %%mm6, %%mm0                             \n\t"
2428 #elif defined (FAST_L2_DIFF)
                     // NOTE(review): appears to approximate the squared difference from the halved
                     // byte difference (complement + PAVGB + b80 bias) - PAVGB and b80 are defined
                     // outside this chunk, confirm there
2429                 "pcmpeqb %%mm7, %%mm7                           \n\t" // mm7 = all ones
2430                 "movq b80, %%mm6                                \n\t"
2431                 "pxor %%mm0, %%mm0                              \n\t" // mm0 = 0, accumulator
2432 #define L2_DIFF_CORE(a, b)\
2433                 "movq " #a ", %%mm5                             \n\t"\
2434                 "movq " #b ", %%mm2                             \n\t"\
2435                 "pxor %%mm7, %%mm2                              \n\t"\
2436                 PAVGB(%%mm2, %%mm5)\
2437                 "paddb %%mm6, %%mm5                             \n\t"\
2438                 "movq %%mm5, %%mm2                              \n\t"\
2439                 "psllw $8, %%mm5                                \n\t"\
2440                 "pmaddwd %%mm5, %%mm5                           \n\t"\
2441                 "pmaddwd %%mm2, %%mm2                           \n\t"\
2442                 "paddd %%mm2, %%mm5                             \n\t"\
2443                 "psrld $14, %%mm5                               \n\t"\
2444                 "paddd %%mm5, %%mm0                             \n\t"
2445
2446 L2_DIFF_CORE((%0), (%1))
2447 L2_DIFF_CORE((%0, %2), (%1, %2))
2448 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2449 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2450 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2451 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2452 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2453 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2454
2455 #else
                     // exact variant: widen bytes to words, subtract, square and add up in mm0
2456                 "pxor %%mm7, %%mm7                              \n\t"
2457                 "pxor %%mm0, %%mm0                              \n\t"
2458 #define L2_DIFF_CORE(a, b)\
2459                 "movq " #a ", %%mm5                             \n\t"\
2460                 "movq " #b ", %%mm2                             \n\t"\
2461                 "movq %%mm5, %%mm1                              \n\t"\
2462                 "movq %%mm2, %%mm3                              \n\t"\
2463                 "punpcklbw %%mm7, %%mm5                         \n\t"\
2464                 "punpckhbw %%mm7, %%mm1                         \n\t"\
2465                 "punpcklbw %%mm7, %%mm2                         \n\t"\
2466                 "punpckhbw %%mm7, %%mm3                         \n\t"\
2467                 "psubw %%mm2, %%mm5                             \n\t"\
2468                 "psubw %%mm3, %%mm1                             \n\t"\
2469                 "pmaddwd %%mm5, %%mm5                           \n\t"\
2470                 "pmaddwd %%mm1, %%mm1                           \n\t"\
2471                 "paddd %%mm1, %%mm5                             \n\t"\
2472                 "paddd %%mm5, %%mm0                             \n\t"
2473
2474 L2_DIFF_CORE((%0), (%1))
2475 L2_DIFF_CORE((%0, %2), (%1, %2))
2476 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2477 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2478 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2479 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2480 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2481 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2482
2483 #endif
2484
                     // add the two 32-bit halves of mm0 and move the sum to ecx
2485                 "movq %%mm0, %%mm4                              \n\t"
2486                 "psrlq $32, %%mm0                               \n\t"
2487                 "paddd %%mm0, %%mm4                             \n\t"
2488                 "movd %%mm4, %%ecx                              \n\t"
2489                 "shll $2, %%ecx                                 \n\t" // 4*d
2490                 "movl %3, %%ebx                                 \n\t" // ebx = tempBluredPast
2491                 "addl -4(%%ebx), %%ecx                          \n\t" // + entry at index -1
2492                 "addl 4(%%ebx), %%ecx                           \n\t" // + entry at index +1
2493                 "addl -1024(%%ebx), %%ecx                       \n\t" // + entry at index -256
2494                 "addl $4, %%ecx                                 \n\t" // rounding for the >>3 below
2495                 "addl 1024(%%ebx), %%ecx                        \n\t" // + entry at index +256
2496                 "shrl $3, %%ecx                                 \n\t"
2497                 "movl %%ecx, (%%ebx)                            \n\t" // store the smoothed value in *tempBluredPast
2498                 "leal (%%eax, %2, 2), %%ebx                     \n\t" // 5*stride
2499
2500 //              "movl %3, %%ecx                         \n\t"
2501 //              "movl %%ecx, test                               \n\t"
2502 //              "jmp 4f \n\t"
2503                 "cmpl 4+maxTmpNoise, %%ecx                      \n\t" // compare with the word at maxTmpNoise+4
2504                 " jb 2f                                         \n\t" // below -> label 2
2505                 "cmpl 8+maxTmpNoise, %%ecx                      \n\t" // compare with the word at maxTmpNoise+8
2506                 " jb 1f                                         \n\t" // below -> label 1
2507
                     // not below the word at maxTmpNoise+8: copy the 8 src rows to tempBlured, src is left unchanged
2508                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2509                 "movq (%0), %%mm0                               \n\t" // L0
2510                 "movq (%0, %2), %%mm1                           \n\t" // L1
2511                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2512                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2513                 "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2514                 "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2515                 "movq (%0, %%eax, 2), %%mm6                     \n\t" // L6
2516                 "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2517                 "movq %%mm0, (%1)                               \n\t" // L0
2518                 "movq %%mm1, (%1, %2)                           \n\t" // L1
2519                 "movq %%mm2, (%1, %2, 2)                        \n\t" // L2
2520                 "movq %%mm3, (%1, %%eax)                        \n\t" // L3
2521                 "movq %%mm4, (%1, %2, 4)                        \n\t" // L4
2522                 "movq %%mm5, (%1, %%ebx)                        \n\t" // L5
2523                 "movq %%mm6, (%1, %%eax, 2)                     \n\t" // L6
2524                 "movq %%mm7, (%1, %%ecx)                        \n\t" // L7
2525                 "jmp 4f                                         \n\t"
2526
                     // label 1: byte-wise pavgb average of src and tempBlured, written to both
2527                 "1:                                             \n\t"
2528                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2529                 "movq (%0), %%mm0                               \n\t" // L0
2530                 "pavgb (%1), %%mm0                              \n\t" // L0
2531                 "movq (%0, %2), %%mm1                           \n\t" // L1
2532                 "pavgb (%1, %2), %%mm1                          \n\t" // L1
2533                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2534                 "pavgb (%1, %2, 2), %%mm2                       \n\t" // L2
2535                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2536                 "pavgb (%1, %%eax), %%mm3                       \n\t" // L3
2537                 "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2538                 "pavgb (%1, %2, 4), %%mm4                       \n\t" // L4
2539                 "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2540                 "pavgb (%1, %%ebx), %%mm5                       \n\t" // L5
2541                 "movq (%0, %%eax, 2), %%mm6                     \n\t" // L6
2542                 "pavgb (%1, %%eax, 2), %%mm6                    \n\t" // L6
2543                 "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2544                 "pavgb (%1, %%ecx), %%mm7                       \n\t" // L7
2545                 "movq %%mm0, (%1)                               \n\t" // R0
2546                 "movq %%mm1, (%1, %2)                           \n\t" // R1
2547                 "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2548                 "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2549                 "movq %%mm4, (%1, %2, 4)                        \n\t" // R4
2550                 "movq %%mm5, (%1, %%ebx)                        \n\t" // R5
2551                 "movq %%mm6, (%1, %%eax, 2)                     \n\t" // R6
2552                 "movq %%mm7, (%1, %%ecx)                        \n\t" // R7
2553                 "movq %%mm0, (%0)                               \n\t" // L0
2554                 "movq %%mm1, (%0, %2)                           \n\t" // L1
2555                 "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2556                 "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2557                 "movq %%mm4, (%0, %2, 4)                        \n\t" // L4
2558                 "movq %%mm5, (%0, %%ebx)                        \n\t" // L5
2559                 "movq %%mm6, (%0, %%eax, 2)                     \n\t" // L6
2560                 "movq %%mm7, (%0, %%ecx)                        \n\t" // L7
2561                 "jmp 4f                                         \n\t"
2562
                     // label 2: below the word at maxTmpNoise+4; pick between two and three PAVGB rounds
2563                 "2:                                             \n\t"
2564                 "cmpl maxTmpNoise, %%ecx                        \n\t" // compare with the word at maxTmpNoise
2565                 " jb 3f                                         \n\t" // below -> label 3
2566
                     // two PAVGB rounds of each src row with its tempBlured row, result written to both
                     // NOTE(review): presumably the counterpart of the C fallback's (ref*3 + cur + 2)>>2
                     // branch - PAVGB is defined outside this chunk, confirm its operand order
2567                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2568                 "movq (%0), %%mm0                               \n\t" // L0
2569                 "movq (%0, %2), %%mm1                           \n\t" // L1
2570                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2571                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2572                 "movq (%1), %%mm4                               \n\t" // R0
2573                 "movq (%1, %2), %%mm5                           \n\t" // R1
2574                 "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2575                 "movq (%1, %%eax), %%mm7                        \n\t" // R3
2576                 PAVGB(%%mm4, %%mm0)
2577                 PAVGB(%%mm5, %%mm1)
2578                 PAVGB(%%mm6, %%mm2)
2579                 PAVGB(%%mm7, %%mm3)
2580                 PAVGB(%%mm4, %%mm0)
2581                 PAVGB(%%mm5, %%mm1)
2582                 PAVGB(%%mm6, %%mm2)
2583                 PAVGB(%%mm7, %%mm3)
2584                 "movq %%mm0, (%1)                               \n\t" // R0
2585                 "movq %%mm1, (%1, %2)                           \n\t" // R1
2586                 "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2587                 "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2588                 "movq %%mm0, (%0)                               \n\t" // L0
2589                 "movq %%mm1, (%0, %2)                           \n\t" // L1
2590                 "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2591                 "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2592
2593                 "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2594                 "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2595                 "movq (%0, %%eax, 2), %%mm2                     \n\t" // L6
2596                 "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2597                 "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2598                 "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2599                 "movq (%1, %%eax, 2), %%mm6                     \n\t" // R6
2600                 "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2601                 PAVGB(%%mm4, %%mm0)
2602                 PAVGB(%%mm5, %%mm1)
2603                 PAVGB(%%mm6, %%mm2)
2604                 PAVGB(%%mm7, %%mm3)
2605                 PAVGB(%%mm4, %%mm0)
2606                 PAVGB(%%mm5, %%mm1)
2607                 PAVGB(%%mm6, %%mm2)
2608                 PAVGB(%%mm7, %%mm3)
2609                 "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2610                 "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2611                 "movq %%mm2, (%1, %%eax, 2)                     \n\t" // R6
2612                 "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2613                 "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2614                 "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2615                 "movq %%mm2, (%0, %%eax, 2)                     \n\t" // L6
2616                 "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2617                 "jmp 4f                                         \n\t"
2618
                     // label 3: below the word at maxTmpNoise; three PAVGB rounds per row, result written to both
                     // NOTE(review): presumably the counterpart of the C fallback's (ref*7 + cur + 4)>>3 branch - confirm
2619                 "3:                                             \n\t"
2620                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2621                 "movq (%0), %%mm0                               \n\t" // L0
2622                 "movq (%0, %2), %%mm1                           \n\t" // L1
2623                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2624                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2625                 "movq (%1), %%mm4                               \n\t" // R0
2626                 "movq (%1, %2), %%mm5                           \n\t" // R1
2627                 "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2628                 "movq (%1, %%eax), %%mm7                        \n\t" // R3
2629                 PAVGB(%%mm4, %%mm0)
2630                 PAVGB(%%mm5, %%mm1)
2631                 PAVGB(%%mm6, %%mm2)
2632                 PAVGB(%%mm7, %%mm3)
2633                 PAVGB(%%mm4, %%mm0)
2634                 PAVGB(%%mm5, %%mm1)
2635                 PAVGB(%%mm6, %%mm2)
2636                 PAVGB(%%mm7, %%mm3)
2637                 PAVGB(%%mm4, %%mm0)
2638                 PAVGB(%%mm5, %%mm1)
2639                 PAVGB(%%mm6, %%mm2)
2640                 PAVGB(%%mm7, %%mm3)
2641                 "movq %%mm0, (%1)                               \n\t" // R0
2642                 "movq %%mm1, (%1, %2)                           \n\t" // R1
2643                 "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2644                 "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2645                 "movq %%mm0, (%0)                               \n\t" // L0
2646                 "movq %%mm1, (%0, %2)                           \n\t" // L1
2647                 "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2648                 "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2649
2650                 "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2651                 "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2652                 "movq (%0, %%eax, 2), %%mm2                     \n\t" // L6
2653                 "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2654                 "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2655                 "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2656                 "movq (%1, %%eax, 2), %%mm6                     \n\t" // R6
2657                 "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2658                 PAVGB(%%mm4, %%mm0)
2659                 PAVGB(%%mm5, %%mm1)
2660                 PAVGB(%%mm6, %%mm2)
2661                 PAVGB(%%mm7, %%mm3)
2662                 PAVGB(%%mm4, %%mm0)
2663                 PAVGB(%%mm5, %%mm1)
2664                 PAVGB(%%mm6, %%mm2)
2665                 PAVGB(%%mm7, %%mm3)
2666                 PAVGB(%%mm4, %%mm0)
2667                 PAVGB(%%mm5, %%mm1)
2668                 PAVGB(%%mm6, %%mm2)
2669                 PAVGB(%%mm7, %%mm3)
2670                 "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2671                 "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2672                 "movq %%mm2, (%1, %%eax, 2)                     \n\t" // R6
2673                 "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2674                 "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2675                 "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2676                 "movq %%mm2, (%0, %%eax, 2)                     \n\t" // L6
2677                 "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2678
                     // label 4: common exit of all branches
2679                 "4:                                             \n\t"
2680
                     // %0 = src, %1 = tempBlured, %2 = stride, %3 = tempBluredPast (memory operand)
2681                 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2682                 : "%eax", "%ebx", "%ecx", "memory"
2683                 );
2684 //printf("%d\n", test);
2685 #else
             /* plain C fallback, used when neither HAVE_MMX2 nor HAVE_3DNOW is defined */
2686         int y;
2687         int d=0;
2688         int sysd=0; // NOTE(review): accumulated below but never read in this function
2689         int i;
2690
             /* d = sum of squared differences between tempBlured and src over the 8x8 block */
2691         for(y=0; y<8; y++)
2692         {
2693                 int x;
2694                 for(x=0; x<8; x++)
2695                 {
2696                         int ref= tempBlured[ x + y*stride ];
2697                         int cur= src[ x + y*stride ];
2698                         int d1=ref - cur;
2699 //                      if(x==0 || x==7) d1+= d1>>1;
2700 //                      if(y==0 || y==7) d1+= d1>>1;
2701 //                      d+= ABS(d1);
2702                         d+= d1*d1;
2703                         sysd+= d1;
2704                 }
2705         }
2706         i=d; // keep the raw sum; it is what gets stored in *tempBluredPast below
             /* smooth d with the table entries at index -256, -1, +1 and +256: (4*d + neighbours + 4)>>3 */
2707         d=      (
2708                 4*d
2709                 +(*(tempBluredPast-256))
2710                 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2711                 +(*(tempBluredPast+256))
2712                 +4)>>3;
2713         *tempBluredPast=i;
2714 //      ((*tempBluredPast)*3 + d + 2)>>2;
2715
2716 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2717 /*
2718 Switch between
2719  1  0  0  0  0  0  0  (0)
2720 64 32 16  8  4  2  1  (1)
2721 64 48 36 27 20 15 11 (33) (approx)
2722 64 56 49 43 37 33 29 (200) (approx)
2723 */
2724         if(d > maxNoise[1])
2725         {
2726                 if(d < maxNoise[2])
2727                 {
                             /* maxNoise[1] < d < maxNoise[2]: rounded 1:1 average, written to both blocks */
2728                         for(y=0; y<8; y++)
2729                         {
2730                                 int x;
2731                                 for(x=0; x<8; x++)
2732                                 {
2733                                         int ref= tempBlured[ x + y*stride ];
2734                                         int cur= src[ x + y*stride ];
2735                                         tempBlured[ x + y*stride ]=
2736                                         src[ x + y*stride ]=
2737                                                 (ref + cur + 1)>>1;
2738                                 }
2739                         }
2740                 }
2741                 else
2742                 {
                             /* d >= maxNoise[2]: copy src into tempBlured, src is left unchanged */
2743                         for(y=0; y<8; y++)
2744                         {
2745                                 int x;
2746                                 for(x=0; x<8; x++)
2747                                 {
2748                                         tempBlured[ x + y*stride ]= src[ x + y*stride ];
2749                                 }
2750                         }
2751                 }
2752         }
2753         else
2754         {
2755                 if(d < maxNoise[0])
2756                 {
                             /* d < maxNoise[0] (and d <= maxNoise[1]): 7 parts reference, 1 part current, written to both */
2757                         for(y=0; y<8; y++)
2758                         {
2759                                 int x;
2760                                 for(x=0; x<8; x++)
2761                                 {
2762                                         int ref= tempBlured[ x + y*stride ];
2763                                         int cur= src[ x + y*stride ];
2764                                         tempBlured[ x + y*stride ]=
2765                                         src[ x + y*stride ]=
2766                                                 (ref*7 + cur + 4)>>3;
2767                                 }
2768                         }
2769                 }
2770                 else
2771                 {
                             /* maxNoise[0] <= d <= maxNoise[1]: 3 parts reference, 1 part current, written to both */
2772                         for(y=0; y<8; y++)
2773                         {
2774                                 int x;
2775                                 for(x=0; x<8; x++)
2776                                 {
2777                                         int ref= tempBlured[ x + y*stride ];
2778                                         int cur= src[ x + y*stride ];
2779                                         tempBlured[ x + y*stride ]=
2780                                         src[ x + y*stride ]=
2781                                                 (ref*3 + cur + 2)>>2;
2782                                 }
2783                         }
2784                 }
2785         }
2786 #endif
2787 }
2788
2789 #ifdef HAVE_ODIVX_POSTPROCESS
     /* Only when HAVE_ODIVX_POSTPROCESS is defined: include the OpenDivX postprocess header
        and define the global flag use_old_pp, initialised to 0.
        NOTE(review): presumably selects the OpenDivX postprocessor at run time - confirm at the use site. */
2790 #include "../opendivx/postprocess.h"
2791 int use_old_pp=0;
2792 #endif
2793
     /* Forward declaration of postProcess; its definition is not part of this section of the file. */
2794 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2795         QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2796
2797 /* -pp Command line Help
2798 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2799
2800 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2801
2802 long form example:
2803 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint         -pp default,-vdeblock
2804 short form example:
2805 -pp vb:a,hb:a,lb                                        -pp de,-vb
2806 more examples:
2807 -pp tn:64:128:256
2808
2809 Filters                 Options
2810 short   long name       short   long option     Description
2811 *       *               a       autoq           cpu power dependent enabler
2812                         c       chrom           chrominance filtering enabled
2813                         y       nochrom         chrominance filtering disabled
2814 hb      hdeblock                                horizontal deblocking filter
2815 vb      vdeblock                                vertical deblocking filter
2816 vr      rkvdeblock
2817 h1      x1hdeblock                              Experimental horizontal deblock filter 1
2818 v1      x1vdeblock                              Experimental vertical deblock filter 1
2819 dr      dering                                  not implemented yet
2820 al      autolevels                              automatic brightness / contrast fixer
2821                         f       fullyrange      stretch luminance range to (0..255)
2822 lb      linblenddeint                           linear blend deinterlacer
2823 li      linipoldeint                            linear interpolating deinterlacer
2824 ci      cubicipoldeint                          cubic interpolating deinterlacer
2825 md      mediandeint                             median deinterlacer
2826 de      default                                 hdeblock:a,vdeblock:a,dering:a,autolevels
2827 fa      fast                                    x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2828 tn      tmpnoise        (3 Thresholds)          Temporal Noise Reducer
2829 */
2830
2831 /**
2832  * returns a PPMode struct which will have a non-zero error variable if an error occurred
2833  * name is the string after "-pp" on the command line
2834  * quality is a number from 0 to GET_PP_QUALITY_MAX
2835  */
2836 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2837 {
2838         char temp[GET_MODE_BUFFER_SIZE];
2839         char *p= temp;
2840         char *filterDelimiters= ",";
2841         char *optionDelimiters= ":";
2842         struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
2843         char *filterToken;
2844
2845         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2846
2847         printf("%s\n", name);
2848
2849         for(;;){
2850                 char *filterName;
2851                 int q= 1000000; //GET_PP_QUALITY_MAX;
2852                 int chrom=-1;
2853                 char *option;
2854                 char *options[OPTIONS_ARRAY_SIZE];
2855                 int i;
2856                 int filterNameOk=0;
2857                 int numOfUnknownOptions=0;
2858                 int enable=1; //does the user want us to enabled or disabled the filter
2859
2860                 filterToken= strtok(p, filterDelimiters);
2861                 if(filterToken == NULL) break;
2862                 p+= strlen(filterToken) + 1; // p points to next filterToken
2863                 filterName= strtok(filterToken, optionDelimiters);
2864                 printf("%s::%s\n", filterToken, filterName);
2865
2866                 if(*filterName == '-')
2867                 {
2868                         enable=0;
2869                         filterName++;
2870                 }
2871
2872                 for(;;){ //for all options
2873                         option= strtok(NULL, optionDelimiters);
2874                         if(option == NULL) break;
2875
2876                         printf("%s\n", option);
2877                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2878                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2879                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2880                         else
2881                         {
2882                                 options[numOfUnknownOptions] = option;
2883                                 numOfUnknownOptions++;
2884                         }
2885                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2886                 }
2887                 options[numOfUnknownOptions] = NULL;
2888
2889                 /* replace stuff from the replace Table */
2890                 for(i=0; replaceTable[2*i]!=NULL; i++)
2891                 {
2892                         if(!strcmp(replaceTable[2*i], filterName))
2893                         {
2894                                 int newlen= strlen(replaceTable[2*i + 1]);
2895                                 int plen;
2896                                 int spaceLeft;
2897
2898                                 if(p==NULL) p= temp, *p=0;      //last filter
2899                                 else p--, *p=',';               //not last filter
2900
2901                                 plen= strlen(p);
2902                                 spaceLeft= (int)p - (int)temp + plen;
2903                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
2904                                 {
2905                                         ppMode.error++;
2906                                         break;
2907                                 }
2908                                 memmove(p + newlen, p, plen+1);
2909                                 memcpy(p, replaceTable[2*i + 1], newlen);
2910                                 filterNameOk=1;
2911                         }
2912                 }
2913
2914                 for(i=0; filters[i].shortName!=NULL; i++)
2915                 {
2916 //                      printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
2917                         if(   !strcmp(filters[i].longName, filterName)
2918                            || !strcmp(filters[i].shortName, filterName))
2919                         {
2920                                 ppMode.lumMode &= ~filters[i].mask;
2921                                 ppMode.chromMode &= ~filters[i].mask;
2922
2923                                 filterNameOk=1;
2924                                 if(!enable) break; // user wants to disable it
2925
2926                                 if(q >= filters[i].minLumQuality)
2927                                         ppMode.lumMode|= filters[i].mask;
2928                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2929                                         if(q >= filters[i].minChromQuality)
2930                                                 ppMode.chromMode|= filters[i].mask;
2931
2932                                 if(filters[i].mask == LEVEL_FIX)
2933                                 {
2934                                         int o;
2935                                         ppMode.minAllowedY= 16;
2936                                         ppMode.maxAllowedY= 234;
2937                                         for(o=0; options[o]!=NULL; o++)
2938                                                 if(  !strcmp(options[o],"fullyrange")
2939                                                    ||!strcmp(options[o],"f"))
2940                                                 {
2941                                                         ppMode.minAllowedY= 0;
2942                                                         ppMode.maxAllowedY= 255;
2943                                                         numOfUnknownOptions--;
2944                                                 }
2945                                 }
2946                                 else if(filters[i].mask == TEMP_NOISE_FILTER)
2947                                 {
2948                                         int o;
2949                                         int numOfNoises=0;
2950                                         ppMode.maxTmpNoise[0]= 150;
2951                                         ppMode.maxTmpNoise[1]= 200;
2952                                         ppMode.maxTmpNoise[2]= 400;
2953
2954                                         for(o=0; options[o]!=NULL; o++)
2955                                         {
2956                                                 char *tail;
2957                                                 ppMode.maxTmpNoise[numOfNoises]=
2958                                                         strtol(options[o], &tail, 0);
2959                                                 if(tail!=options[o])
2960                                                 {
2961                                                         numOfNoises++;
2962                                                         numOfUnknownOptions--;
2963                                                         if(numOfNoises >= 3) break;
2964                                                 }
2965                                         }
2966                                 }
2967                         }
2968                 }
2969                 if(!filterNameOk) ppMode.error++;
2970                 ppMode.error += numOfUnknownOptions;
2971         }
2972
2973 #ifdef HAVE_ODIVX_POSTPROCESS
2974         if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2975         if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2976         if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2977         if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2978         if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2979         if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2980 #endif
2981
2982         return ppMode;
2983 }
2984
2985 /**
2986  * Obsolete, dont use it, use postprocess2() instead
2987  */
2988 void  postprocess(unsigned char * src[], int src_stride,
2989                  unsigned char * dst[], int dst_stride,
2990                  int horizontal_size,   int vertical_size,
2991                  QP_STORE_T *QP_store,  int QP_stride,
2992                                           int mode)
2993 {
2994         struct PPMode ppMode;
2995         static QP_STORE_T zeroArray[2048/8];
2996 /*
2997         static int qual=0;
2998
2999         ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
3000         printf("OK\n");
3001         qual++;
3002         qual%=7;
3003         printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
3004                 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
3005         postprocess2(src, src_stride, dst, dst_stride,
3006                  horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
3007
3008         return;
3009 */
3010         if(QP_store==NULL)
3011         {
3012                 QP_store= zeroArray;
3013                 QP_stride= 0;
3014         }
3015
3016         ppMode.lumMode= mode;
3017         mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
3018         ppMode.chromMode= mode;
3019         ppMode.maxTmpNoise[0]= 700;
3020         ppMode.maxTmpNoise[1]= 1500;
3021         ppMode.maxTmpNoise[2]= 3000;
3022
3023 #ifdef HAVE_ODIVX_POSTPROCESS
3024 // Note: I could make this shit outside of this file, but it would mean one
3025 // more function call...
3026         if(use_old_pp){
3027             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
3028             return;
3029         }
3030 #endif
3031
3032         postProcess(src[0], src_stride, dst[0], dst_stride,
3033                 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
3034
3035         horizontal_size >>= 1;
3036         vertical_size   >>= 1;
3037         src_stride      >>= 1;
3038         dst_stride      >>= 1;
3039
3040         if(1)
3041         {
3042                 postProcess(src[1], src_stride, dst[1], dst_stride,
3043                         horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
3044                 postProcess(src[2], src_stride, dst[2], dst_stride,
3045                         horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
3046         }
3047         else
3048         {
3049                 memset(dst[1], 128, dst_stride*vertical_size);
3050                 memset(dst[2], 128, dst_stride*vertical_size);
3051 //              memcpy(dst[1], src[1], src_stride*horizontal_size);
3052 //              memcpy(dst[2], src[2], src_stride*horizontal_size);
3053         }
3054 }
3055
3056 void  postprocess2(unsigned char * src[], int src_stride,
3057                  unsigned char * dst[], int dst_stride,
3058                  int horizontal_size,   int vertical_size,
3059                  QP_STORE_T *QP_store,  int QP_stride,
3060                  struct PPMode *mode)
3061 {
3062
3063         static QP_STORE_T zeroArray[2048/8];
3064         if(QP_store==NULL)
3065         {
3066                 QP_store= zeroArray;
3067                 QP_stride= 0;
3068         }
3069
3070 #ifdef HAVE_ODIVX_POSTPROCESS
3071 // Note: I could make this shit outside of this file, but it would mean one
3072 // more function call...
3073         if(use_old_pp){
3074             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
3075             mode->oldMode);
3076             return;
3077         }
3078 #endif
3079
3080         postProcess(src[0], src_stride, dst[0], dst_stride,
3081                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
3082
3083         horizontal_size >>= 1;
3084         vertical_size   >>= 1;
3085         src_stride      >>= 1;
3086         dst_stride      >>= 1;
3087
3088         postProcess(src[1], src_stride, dst[1], dst_stride,
3089                 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3090         postProcess(src[2], src_stride, dst[2], dst_stride,
3091                 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
3092 }
3093
3094
3095 /**
3096  * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
3097  * 0 <= quality <= 6
3098  */
3099 int getPpModeForQuality(int quality){
3100         int modes[1+GET_PP_QUALITY_MAX]= {
3101                 0,
3102 #if 1
3103                 // horizontal filters first
3104                 LUM_H_DEBLOCK,
3105                 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3106                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3107                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3108                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3109                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
3110 #else
3111                 // vertical filters first
3112                 LUM_V_DEBLOCK,
3113                 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3114                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3115                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3116                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3117                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
3118 #endif
3119         };
3120
3121 #ifdef HAVE_ODIVX_POSTPROCESS
3122         int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3123                 0,
3124                 PP_DEBLOCK_Y_H,
3125                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3126                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3127                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3128                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3129                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3130         };
3131         if(use_old_pp) return odivx_modes[quality];
3132 #endif
3133         return modes[quality];
3134 }
3135
3136 /**
3137  * Copies a block from src to dst and fixes the blacklevel
3138  * numLines must be a multiple of 4
3139  * levelFix == 0 -> dont touch the brighness & contrast
3140  */
3141 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
3142         int levelFix)
3143 {
3144 #ifndef HAVE_MMX
3145         int i;
3146 #endif
3147         if(levelFix)
3148         {
3149 #ifdef HAVE_MMX
3150                                         asm volatile(
3151                                                 "leal (%0,%2), %%eax    \n\t"
3152                                                 "leal (%1,%3), %%ebx    \n\t"
3153                                                 "movq packedYOffset, %%mm2      \n\t"
3154                                                 "movq packedYScale, %%mm3       \n\t"
3155                                                 "pxor %%mm4, %%mm4      \n\t"
3156
3157 #define SCALED_CPY(src1, src2, dst1, dst2)                                      \
3158                                                 "movq " #src1 ", %%mm0  \n\t"\
3159                                                 "movq " #src1 ", %%mm5  \n\t"\
3160                                                 "punpcklbw %%mm4, %%mm0 \n\t"\
3161                                                 "punpckhbw %%mm4, %%mm5 \n\t"\
3162                                                 "psubw %%mm2, %%mm0     \n\t"\
3163                                                 "psubw %%mm2, %%mm5     \n\t"\
3164                                                 "movq " #src2 ", %%mm1  \n\t"\
3165                                                 "psllw $6, %%mm0        \n\t"\
3166                                                 "psllw $6, %%mm5        \n\t"\
3167                                                 "pmulhw %%mm3, %%mm0    \n\t"\
3168                                                 "movq " #src2 ", %%mm6  \n\t"\
3169                                                 "pmulhw %%mm3, %%mm5    \n\t"\
3170                                                 "punpcklbw %%mm4, %%mm1 \n\t"\
3171                                                 "punpckhbw %%mm4, %%mm6 \n\t"\
3172                                                 "psubw %%mm2, %%mm1     \n\t"\
3173                                                 "psubw %%mm2, %%mm6     \n\t"\
3174                                                 "psllw $6, %%mm1        \n\t"\
3175                                                 "psllw $6, %%mm6        \n\t"\
3176                                                 "pmulhw %%mm3, %%mm1    \n\t"\
3177                                                 "pmulhw %%mm3, %%mm6    \n\t"\
3178                                                 "packuswb %%mm5, %%mm0  \n\t"\
3179                                                 "packuswb %%mm6, %%mm1  \n\t"\
3180                                                 "movq %%mm0, " #dst1 "  \n\t"\
3181                                                 "movq %%mm1, " #dst2 "  \n\t"\
3182
3183 SCALED_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
3184 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3185 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3186                                                 "leal (%%eax,%2,4), %%eax       \n\t"
3187                                                 "leal (%%ebx,%3,4), %%ebx       \n\t"
3188 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3189
3190
3191                                                 : : "r"(src),
3192                                                 "r"(dst),
3193                                                 "r" (srcStride),
3194                                                 "r" (dstStride)
3195                                                 : "%eax", "%ebx"
3196                                         );
3197 #else
3198                                 for(i=0; i<8; i++)
3199                                         memcpy( &(dst[dstStride*i]),
3200                                                 &(src[srcStride*i]), BLOCK_SIZE);
3201 #endif
3202         }
3203         else
3204         {
3205 #ifdef HAVE_MMX
3206                                         asm volatile(
3207                                                 "leal (%0,%2), %%eax    \n\t"
3208                                                 "leal (%1,%3), %%ebx    \n\t"
3209
3210 #define SIMPLE_CPY(src1, src2, dst1, dst2)                              \
3211                                                 "movq " #src1 ", %%mm0  \n\t"\
3212                                                 "movq " #src2 ", %%mm1  \n\t"\
3213                                                 "movq %%mm0, " #dst1 "  \n\t"\
3214                                                 "movq %%mm1, " #dst2 "  \n\t"\
3215
3216 SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
3217 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3218 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3219                                                 "leal (%%eax,%2,4), %%eax       \n\t"
3220                                                 "leal (%%ebx,%3,4), %%ebx       \n\t"
3221 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3222
3223                                                 : : "r" (src),
3224                                                 "r" (dst),
3225                                                 "r" (srcStride),
3226                                                 "r" (dstStride)
3227                                                 : "%eax", "%ebx"
3228                                         );
3229 #else
3230                                 for(i=0; i<8; i++)
3231                                         memcpy( &(dst[dstStride*i]),
3232                                                 &(src[srcStride*i]), BLOCK_SIZE);
3233 #endif
3234         }
3235 }
3236
3237
3238 /**
3239  * Filters array of bytes (Y or U or V values)
3240  */
3241 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3242         QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3243 {
3244         int x,y;
3245         const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3246
3247         /* we need 64bit here otherwise we´ll going to have a problem
3248            after watching a black picture for 5 hours*/
3249         static uint64_t *yHistogram= NULL;
3250         int black=0, white=255; // blackest black and whitest white in the picture
3251         int QPCorrecture= 256;
3252
3253         /* Temporary buffers for handling the last row(s) */
3254         static uint8_t *tempDst= NULL;
3255         static uint8_t *tempSrc= NULL;
3256
3257         /* Temporary buffers for handling the last block */
3258         static uint8_t *tempDstBlock= NULL;
3259         static uint8_t *tempSrcBlock= NULL;
3260
3261         /* Temporal noise reducing buffers */
3262         static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
3263         static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
3264
3265         int copyAhead;
3266
3267 #ifdef PP_FUNNY_STRIDE
3268         uint8_t *dstBlockPtrBackup;
3269         uint8_t *srcBlockPtrBackup;
3270 #endif
3271
3272 #ifdef MORE_TIMING
3273         long long T0, T1, diffTime=0;
3274 #endif
3275 #ifdef TIMING
3276         long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3277         sumTime= rdtsc();
3278 #endif
3279 //mode= 0x7F;
3280 #ifdef HAVE_MMX
3281         maxTmpNoise[0]= ppMode->maxTmpNoise[0];
3282         maxTmpNoise[1]= ppMode->maxTmpNoise[1];
3283         maxTmpNoise[2]= ppMode->maxTmpNoise[2];
3284 #endif
3285
3286         if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3287         else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
3288         else if(   (mode & V_DEBLOCK)
3289                 || (mode & LINEAR_IPOL_DEINT_FILTER)
3290                 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
3291         else if(mode & V_X1_FILTER) copyAhead=11;
3292         else if(mode & V_RK1_FILTER) copyAhead=10;
3293         else if(mode & DERING) copyAhead=9;
3294         else copyAhead=8;
3295
3296         copyAhead-= 8;
3297
3298         if(tempDst==NULL)
3299         {
3300                 tempDst= (uint8_t*)memalign(8, 1024*24);
3301                 tempSrc= (uint8_t*)memalign(8, 1024*24);
3302                 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3303                 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3304         }
3305
3306         if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
3307         {
3308 //              printf("%d %d %d\n", isColor, dstStride, height);
3309                 //FIXME works only as long as the size doesnt increase
3310                 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
3311                 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
3312                 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
3313
3314                 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
3315                 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
3316         }
3317
3318         if(!yHistogram)
3319         {
3320                 int i;
3321                 yHistogram= (uint64_t*)malloc(8*256);
3322                 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3323
3324                 if(mode & FULL_Y_RANGE)
3325                 {
3326                         maxAllowedY=255;
3327                         minAllowedY=0;
3328                 }
3329         }
3330
3331         if(!isColor)
3332         {
3333                 uint64_t sum= 0;
3334                 int i;
3335                 static int framenum= -1;
3336                 uint64_t maxClipped;
3337                 uint64_t clipped;
3338                 double scale;
3339
3340                 framenum++;
3341                 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3342
3343                 for(i=0; i<256; i++)
3344                 {
3345                         sum+= yHistogram[i];
3346 //                      printf("%d ", yHistogram[i]);
3347                 }
3348 //              printf("\n\n");
3349
3350                 /* we allways get a completly black picture first */
3351                 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3352
3353                 clipped= sum;
3354                 for(black=255; black>0; black--)
3355                 {
3356                         if(clipped < maxClipped) break;
3357                         clipped-= yHistogram[black];
3358                 }
3359
3360                 clipped= sum;
3361                 for(white=0; white<256; white++)
3362                 {
3363                         if(clipped < maxClipped) break;
3364                         clipped-= yHistogram[white];
3365                 }
3366
3367                 packedYOffset= (black - minAllowedY) & 0xFFFF;
3368                 packedYOffset|= packedYOffset<<32;
3369                 packedYOffset|= packedYOffset<<16;
3370
3371                 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3372
3373                 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3374                 packedYScale|= packedYScale<<32;
3375                 packedYScale|= packedYScale<<16;
3376         }
3377         else
3378         {
3379                 packedYScale= 0x0100010001000100LL;
3380                 packedYOffset= 0;
3381         }
3382
3383         if(mode & LEVEL_FIX)    QPCorrecture= packedYScale &0xFFFF;
3384         else                    QPCorrecture= 256;
3385
3386         /* copy & deinterlace first row of blocks */
3387         y=-BLOCK_SIZE;
3388         {
3389                 //1% speedup if these are here instead of the inner loop
3390                 uint8_t *srcBlock= &(src[y*srcStride]);
3391                 uint8_t *dstBlock= &(dst[y*dstStride]);
3392
3393                 dstBlock= tempDst + dstStride;
3394
3395                 // From this point on it is guranteed that we can read and write 16 lines downward
3396                 // finish 1 block before the next otherwise we´ll might have a problem
3397                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3398                 for(x=0; x<width; x+=BLOCK_SIZE)
3399                 {
3400
3401 #ifdef HAVE_MMX2
3402 /*
3403                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3404                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3405                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3406                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3407 */
3408
3409                         asm(
3410                                 "movl %4, %%eax                 \n\t"
3411                                 "shrl $2, %%eax                 \n\t"
3412                                 "andl $6, %%eax                 \n\t"
3413                                 "addl %5, %%eax                 \n\t"
3414                                 "movl %%eax, %%ebx              \n\t"
3415                                 "imul %1, %%eax                 \n\t"
3416                                 "imul %3, %%ebx                 \n\t"
3417                                 "prefetchnta 32(%%eax, %0)      \n\t"
3418                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3419                                 "addl %1, %%eax                 \n\t"
3420                                 "addl %3, %%ebx                 \n\t"
3421                                 "prefetchnta 32(%%eax, %0)      \n\t"
3422                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3423                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3424                         "m" (x), "m" (copyAhead)
3425                         : "%eax", "%ebx"
3426                         );
3427
3428 #elif defined(HAVE_3DNOW)
3429 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3430 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3431                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3432                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3433                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3434 */
3435 #endif
3436
3437                         blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3438                                 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3439
3440                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3441                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3442                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3443                                 deInterlaceBlendLinear(dstBlock, dstStride);
3444                         else if(mode & MEDIAN_DEINT_FILTER)
3445                                 deInterlaceMedian(dstBlock, dstStride);
3446                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3447                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3448 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3449                                 deInterlaceBlendCubic(dstBlock, dstStride);
3450 */
3451                         dstBlock+=8;
3452                         srcBlock+=8;
3453                 }
3454                 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
3455         }
3456
3457         for(y=0; y<height; y+=BLOCK_SIZE)
3458         {
3459                 //1% speedup if these are here instead of the inner loop
3460                 uint8_t *srcBlock= &(src[y*srcStride]);
3461                 uint8_t *dstBlock= &(dst[y*dstStride]);
3462 #ifdef ARCH_X86
3463                 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3464                 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3465                 int QPFrac= QPDelta;
3466                 uint8_t *tempBlock1= tempBlocks;
3467                 uint8_t *tempBlock2= tempBlocks + 8;
3468 #endif
3469                 int QP=0;
3470                 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3471                    if not than use a temporary buffer */
3472                 if(y+15 >= height)
3473                 {
3474                         int i;
3475                         /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3476                            blockcopy to dst later */
3477                         memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3478                                 srcStride*MAX(height-y-copyAhead, 0) );
3479
3480                         /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3481                         for(i=MAX(height-y, 8); i<copyAhead+8; i++)
3482                                 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
3483
3484                         /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3485                         memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
3486
3487                         /* duplicate last line of dst to fill the void upto line (copyAhead) */
3488                         for(i=height-y+1; i<=copyAhead; i++)
3489                                 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
3490
3491                         dstBlock= tempDst + dstStride;
3492                         srcBlock= tempSrc;
3493                 }
3494
3495                 // From this point on it is guranteed that we can read and write 16 lines downward
3496                 // finish 1 block before the next otherwise we´ll might have a problem
3497                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3498                 for(x=0; x<width; x+=BLOCK_SIZE)
3499                 {
3500                         const int stride= dstStride;
3501                         uint8_t *tmpXchg;
3502 #ifdef ARCH_X86
3503                         QP= *QPptr;
3504                         asm volatile(
3505                                 "addl %2, %1            \n\t"
3506                                 "sbbl %%eax, %%eax      \n\t"
3507                                 "shll $2, %%eax         \n\t"
3508                                 "subl %%eax, %0         \n\t"
3509                                 : "+r" (QPptr), "+m" (QPFrac)
3510                                 : "r" (QPDelta)
3511                                 : "%eax"
3512                         );
3513 #else
3514                         QP= isColor ?
3515                                 QPs[(y>>3)*QPStride + (x>>3)]:
3516                                 QPs[(y>>4)*QPStride + (x>>4)];
3517 #endif
3518                         if(!isColor)
3519                         {
3520                                 QP= (QP* QPCorrecture)>>8;
3521                                 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3522                         }
3523 #ifdef HAVE_MMX
3524                         asm volatile(
3525                                 "movd %0, %%mm7                                 \n\t"
3526                                 "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3527                                 "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3528                                 "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
3529                                 "movq %%mm7, pQPb                               \n\t"
3530                                 : : "r" (QP)
3531                         );
3532 #endif
3533
3534 #ifdef MORE_TIMING
3535                         T0= rdtsc();
3536 #endif
3537
3538 #ifdef HAVE_MMX2
3539 /*
3540                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3541                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3542                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3543                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3544 */
3545
3546                         asm(
3547                                 "movl %4, %%eax                 \n\t"
3548                                 "shrl $2, %%eax                 \n\t"
3549                                 "andl $6, %%eax                 \n\t"
3550                                 "addl %5, %%eax                 \n\t"
3551                                 "movl %%eax, %%ebx              \n\t"
3552                                 "imul %1, %%eax                 \n\t"
3553                                 "imul %3, %%ebx                 \n\t"
3554                                 "prefetchnta 32(%%eax, %0)      \n\t"
3555                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3556                                 "addl %1, %%eax                 \n\t"
3557                                 "addl %3, %%ebx                 \n\t"
3558                                 "prefetchnta 32(%%eax, %0)      \n\t"
3559                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3560                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3561                         "m" (x), "m" (copyAhead)
3562                         : "%eax", "%ebx"
3563                         );
3564
3565 #elif defined(HAVE_3DNOW)
3566 //FIXME check whether this is faster on a 3dnow chip or whether it is faster without the prefetch or ...
3567 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3568                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3569                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3570                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3571 */
3572 #endif
3573
3574 #ifdef PP_FUNNY_STRIDE
3575                         //can we mess with an 8x16 block? if not, use a temp buffer (yes, again)
3576                         if(x+7 >= width)
3577                         {
3578                                 int i;
3579                                 dstBlockPtrBackup= dstBlock;
3580                                 srcBlockPtrBackup= srcBlock;
3581
3582                                 for(i=0;i<BLOCK_SIZE*2; i++)
3583                                 {
3584                                         memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3585                                         memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3586                                 }
3587
3588                                 dstBlock= tempDstBlock;
3589                                 srcBlock= tempSrcBlock;
3590                         }
3591 #endif
3592
3593                         blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3594                                 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3595
3596                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3597                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3598                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3599                                 deInterlaceBlendLinear(dstBlock, dstStride);
3600                         else if(mode & MEDIAN_DEINT_FILTER)
3601                                 deInterlaceMedian(dstBlock, dstStride);
3602                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3603                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3604 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3605                                 deInterlaceBlendCubic(dstBlock, dstStride);
3606 */
3607
3608                         /* only deblock if we have 2 blocks */
3609                         if(y + 8 < height)
3610                         {
3611 #ifdef MORE_TIMING
3612                                 T1= rdtsc();
3613                                 memcpyTime+= T1-T0;
3614                                 T0=T1;
3615 #endif
3616                                 if(mode & V_RK1_FILTER)
3617                                         vertRK1Filter(dstBlock, stride, QP);
3618                                 else if(mode & V_X1_FILTER)
3619                                         vertX1Filter(dstBlock, stride, QP);
3620                                 else if(mode & V_DEBLOCK)
3621                                 {
3622                                         if( isVertDC(dstBlock, stride))
3623                                         {
3624                                                 if(isVertMinMaxOk(dstBlock, stride, QP))
3625                                                         doVertLowPass(dstBlock, stride, QP);
3626                                         }
3627                                         else
3628                                                 doVertDefFilter(dstBlock, stride, QP);
3629                                 }
3630 #ifdef MORE_TIMING
3631                                 T1= rdtsc();
3632                                 vertTime+= T1-T0;
3633                                 T0=T1;
3634 #endif
3635                         }
3636
3637 #ifdef HAVE_MMX
3638                         transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3639 #endif
3640                         /* check if we have a previous block to deblock it with dstBlock */
3641                         if(x - 8 >= 0)
3642                         {
3643 #ifdef MORE_TIMING
3644                                 T0= rdtsc();
3645 #endif
3646 #ifdef HAVE_MMX
3647                                 if(mode & H_RK1_FILTER)
3648                                         vertRK1Filter(tempBlock1, 16, QP);
3649                                 else if(mode & H_X1_FILTER)
3650                                         vertX1Filter(tempBlock1, 16, QP);
3651                                 else if(mode & H_DEBLOCK)
3652                                 {
3653                                         if( isVertDC(tempBlock1, 16) )
3654                                         {
3655                                                 if(isVertMinMaxOk(tempBlock1, 16, QP))
3656                                                         doVertLowPass(tempBlock1, 16, QP);
3657                                         }
3658                                         else
3659                                                 doVertDefFilter(tempBlock1, 16, QP);
3660                                 }
3661
3662                                 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3663
3664 #else
3665                                 if(mode & H_X1_FILTER)
3666                                         horizX1Filter(dstBlock-4, stride, QP);
3667                                 else if(mode & H_DEBLOCK)
3668                                 {
3669                                         if( isHorizDC(dstBlock-4, stride))
3670                                         {
3671                                                 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3672                                                         doHorizLowPass(dstBlock-4, stride, QP);
3673                                         }
3674                                         else
3675                                                 doHorizDefFilter(dstBlock-4, stride, QP);
3676                                 }
3677 #endif
3678 #ifdef MORE_TIMING
3679                                 T1= rdtsc();
3680                                 horizTime+= T1-T0;
3681                                 T0=T1;
3682 #endif
3683                                 if(mode & DERING)
3684                                 {
3685                                 //FIXME filter first line
3686                                         if(y>0) dering(dstBlock - stride - 8, stride, QP);
3687                                 }
3688
3689                                 if(mode & TEMP_NOISE_FILTER)
3690                                 {
3691                                         tempNoiseReducer(dstBlock-8, stride,
3692                                                 tempBlured[isColor] + y*dstStride + x,
3693                                                 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3694                                                 ppMode->maxTmpNoise);
3695                                 }
3696                         }
3697
3698 #ifdef PP_FUNNY_STRIDE
3699                         /* did we use a tmp-block buffer */
3700                         if(x+7 >= width)
3701                         {
3702                                 int i;
3703                                 dstBlock= dstBlockPtrBackup;
3704                                 srcBlock= srcBlockPtrBackup;
3705
3706                                 for(i=0;i<BLOCK_SIZE*2; i++)
3707                                 {
3708                                         memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3709                                 }
3710                         }
3711 #endif
3712
3713                         dstBlock+=8;
3714                         srcBlock+=8;
3715
3716 #ifdef HAVE_MMX
3717                         tmpXchg= tempBlock1;
3718                         tempBlock1= tempBlock2;
3719                         tempBlock2 = tmpXchg;
3720 #endif
3721                 }
3722
3723                 if(mode & DERING)
3724                 {
3725                                 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP);
3726                 }
3727
3728                 if((mode & TEMP_NOISE_FILTER))
3729                 {
3730                         tempNoiseReducer(dstBlock-8, dstStride,
3731                                 tempBlured[isColor] + y*dstStride + x,
3732                                 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3733                                 ppMode->maxTmpNoise);
3734                 }
3735
3736                 /* did we use a tmp buffer for the last lines*/
3737                 if(y+15 >= height)
3738                 {
3739                         uint8_t *dstBlock= &(dst[y*dstStride]);
3740                         memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3741                 }
3742 /*
3743                 for(x=0; x<width; x+=32)
3744                 {
3745                         volatile int i;
3746                         i+=     + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3747                                 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3748                                 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3749 //                              + dstBlock[x +13*dstStride]
3750 //                              + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3751                 }*/
3752         }
3753 #ifdef HAVE_3DNOW
3754         asm volatile("femms");
3755 #elif defined (HAVE_MMX)
3756         asm volatile("emms");
3757 #endif
3758
3759 #ifdef TIMING
3760         // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3761         sumTime= rdtsc() - sumTime;
3762         if(!isColor)
3763                 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3764                         (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3765                         (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3766                         , black, white);
3767 #endif
3768 #ifdef DEBUG_BRIGHTNESS
3769         if(!isColor)
3770         {
3771                 int max=1;
3772                 int i;
3773                 for(i=0; i<256; i++)
3774                         if(yHistogram[i] > max) max=yHistogram[i];
3775
3776                 for(i=1; i<256; i++)
3777                 {
3778                         int x;
3779                         int start=yHistogram[i-1]/(max/256+1);
3780                         int end=yHistogram[i]/(max/256+1);
3781                         int inc= end > start ? 1 : -1;
3782                         for(x=start; x!=end+inc; x+=inc)
3783                                 dst[ i*dstStride + x]+=128;
3784                 }
3785
3786                 for(i=0; i<100; i+=2)
3787                 {
3788                         dst[ (white)*dstStride + i]+=128;
3789                         dst[ (black)*dstStride + i]+=128;
3790                 }
3791
3792         }
3793 #endif
3794
3795 }