]> git.sesse.net Git - ffmpeg/blob - postproc/postprocess_template.c
1% speedup
[ffmpeg] / postproc / postprocess_template.c
1 /*
2     Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20                         C       MMX     MMX2    3DNow
21 isVertDC                Ec      Ec
22 isVertMinMaxOk          Ec      Ec
23 doVertLowPass           E               e       e
24 doVertDefFilter         Ec      Ec      e       e
25 isHorizDC               Ec      Ec
26 isHorizMinMaxOk         a       E
27 doHorizLowPass          E               e       e
28 doHorizDefFilter        Ec      Ec      e       e
29 deRing                  E               e       e*
30 Vertical RKAlgo1        E               a       a
31 Horizontal RKAlgo1                      a       a
32 Vertical X1#            a               E       E
33 Horizontal X1#          a               E       E
34 LinIpolDeinterlace      e               E       E*
35 CubicIpolDeinterlace    a               e       e*
36 LinBlendDeinterlace     e               E       E*
37 MedianDeinterlace#              Ec      Ec
38 TempDeNoiser#           E               e       e
39
40 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
41 # more or less self-invented filters, so the exactness isn't too meaningful
42 E = Exact implementation
43 e = almost exact implementation (slightly different rounding,...)
44 a = alternative / approximate impl
45 c = checked against the other implementations (-vo md5)
46 */
47
48 /*
49 TODO:
50 verify that everything works as it should (how?)
51 reduce the time wasted on the mem transfer
52 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP
56 write a faster and higher quality deblocking filter :)
57 make the mainloop more flexible (variable number of blocks at once;
58         the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters
60 split this huge file
61 border remover
62 optimize c versions
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
64 smart blur
65 command-line option for the deblock thresholds
66 ...
67 */
68
69 //Changelog: use the CVS log
70
71 #include "../config.h"
72 #include <inttypes.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 #ifdef HAVE_MALLOC_H
77 #include <malloc.h>
78 #endif
79 //#undef HAVE_MMX2
80 //#define HAVE_3DNOW
81 //#undef HAVE_MMX
82 //#define DEBUG_BRIGHTNESS
83 #include "postprocess.h"
84
// Generic arithmetic helper macros.
// Each macro may evaluate its argument(s) more than once, so callers must not
// pass expressions with side effects (for example MIN(i++, j)).
85 #define MIN(a,b) ((a) > (b) ? (b) : (a))
86 #define MAX(a,b) ((a) < (b) ? (b) : (a))
87 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// SIGN yields 1 for strictly positive values and -1 otherwise, so SIGN(0) is -1.
88 #define SIGN(a) ((a) > 0 ? 1 : -1)
89
// Assembler-text helper macros. Each one expands to a string fragment that is
// pasted into an asm() statement; the operands are stringified exactly as
// written, so callers pass AT&T-syntax operands such as %%mm0 or (%%eax).

// PAVGB(a,b): emits "pavgb a, b" (MMX2) or "pavgusb a, b" (3DNow), i.e. the
// packed unsigned byte average of a and b with the result left in b.
// The macro is not defined at all when neither HAVE_MMX2 nor HAVE_3DNOW is set.
90 #ifdef HAVE_MMX2
91 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
92 #elif defined (HAVE_3DNOW)
93 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
94 #endif
95
// PMINUB(x,y,t): packed unsigned byte minimum of x and y, result in y.
// MMX2 uses the native instruction and ignores t.
// The plain MMX fallback names its parameters (b,a,t) so that the second call
// argument is still the destination: t = max(a-b, 0) with unsigned saturation,
// then a -= t, which leaves min(a,b) in a. The scratch register t is clobbered.
96 #ifdef HAVE_MMX2
97 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
98 #elif defined (HAVE_MMX)
99 #define PMINUB(b,a,t) \
100         "movq " #a ", " #t " \n\t"\
101         "psubusb " #b ", " #t " \n\t"\
102         "psubb " #t ", " #a " \n\t"
103 #endif
104
// PMAXUB(a,b): packed unsigned byte maximum of a and b, result in b.
// The plain MMX fallback computes b = max(b-a, 0) with unsigned saturation and
// then adds a back, which yields max(a,b) without a scratch register.
105 #ifdef HAVE_MMX2
106 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
107 #elif defined (HAVE_MMX)
108 #define PMAXUB(a,b) \
109         "psubusb " #a ", " #b " \n\t"\
110         "paddb " #a ", " #b " \n\t"
111 #endif
112
113
// NOTE(review): neither constant is used in the part of the file visible here.
// Presumably they size the buffers of the mode-string parser further down;
// confirm at the use sites before changing them.
114 #define GET_MODE_BUFFER_SIZE 500
115 #define OPTIONS_ARRAY_SIZE 10
116
// File-scope constants and scratch variables.
//
// In the MMX build these are 8-byte aligned 64-bit values; several of them
// (for example b7E, b7C, b00, bFF, pQPb) are referenced by symbol name from
// the inline assembly in this file, so renaming one requires updating the asm
// strings as well.
//
// Naming scheme, as far as the initialisers show:
//   wNN         four 16-bit words, each holding the hex value NN
//   bNN         eight bytes, each holding the hex value NN
//   bmXXXXXXXX  byte mask; one digit per byte, most significant byte first,
//               a 1 digit stands for 0xFF and a 0 digit for 0x00
//   tempN       scratch storage
//   pQPb        the asm comments describe it as QP replicated into every byte;
//               where it is written is not visible here
//               NOTE(review): pQPb2 is presumably a derived variant - confirm
//
// packedYOffset / packedYScale are declared volatile only in the MMX build.
117 #ifdef HAVE_MMX
118 static volatile uint64_t __attribute__((aligned(8))) packedYOffset=     0x0000000000000000LL;
119 static volatile uint64_t __attribute__((aligned(8))) packedYScale=      0x0100010001000100LL;
120 static uint64_t __attribute__((aligned(8))) w05=                0x0005000500050005LL;
121 static uint64_t __attribute__((aligned(8))) w20=                0x0020002000200020LL;
122 static uint64_t __attribute__((aligned(8))) w1400=              0x1400140014001400LL;
123 static uint64_t __attribute__((aligned(8))) bm00000001=         0x00000000000000FFLL;
124 static uint64_t __attribute__((aligned(8))) bm00010000=         0x000000FF00000000LL;
125 static uint64_t __attribute__((aligned(8))) bm00001000=         0x00000000FF000000LL;
126 static uint64_t __attribute__((aligned(8))) bm10000000=         0xFF00000000000000LL;
127 static uint64_t __attribute__((aligned(8))) bm10000001=         0xFF000000000000FFLL;
128 static uint64_t __attribute__((aligned(8))) bm11000011=         0xFFFF00000000FFFFLL;
129 static uint64_t __attribute__((aligned(8))) bm00000011=         0x000000000000FFFFLL;
130 static uint64_t __attribute__((aligned(8))) bm11111110=         0xFFFFFFFFFFFFFF00LL;
131 static uint64_t __attribute__((aligned(8))) bm11000000=         0xFFFF000000000000LL;
132 static uint64_t __attribute__((aligned(8))) bm00011000=         0x000000FFFF000000LL;
133 static uint64_t __attribute__((aligned(8))) bm00110011=         0x0000FFFF0000FFFFLL;
134 static uint64_t __attribute__((aligned(8))) bm11001100=         0xFFFF0000FFFF0000LL;
135 static uint64_t __attribute__((aligned(8))) b00=                0x0000000000000000LL;
136 static uint64_t __attribute__((aligned(8))) b01=                0x0101010101010101LL;
137 static uint64_t __attribute__((aligned(8))) b02=                0x0202020202020202LL;
138 static uint64_t __attribute__((aligned(8))) b0F=                0x0F0F0F0F0F0F0F0FLL;
139 static uint64_t __attribute__((aligned(8))) b04=                0x0404040404040404LL;
140 static uint64_t __attribute__((aligned(8))) b08=                0x0808080808080808LL;
141 static uint64_t __attribute__((aligned(8))) bFF=                0xFFFFFFFFFFFFFFFFLL;
142 static uint64_t __attribute__((aligned(8))) b20=                0x2020202020202020LL;
143 static uint64_t __attribute__((aligned(8))) b80=                0x8080808080808080LL;
144 static uint64_t __attribute__((aligned(8))) b7E=                0x7E7E7E7E7E7E7E7ELL;
145 static uint64_t __attribute__((aligned(8))) b7C=                0x7C7C7C7C7C7C7C7CLL;
146 static uint64_t __attribute__((aligned(8))) b3F=                0x3F3F3F3F3F3F3F3FLL;
147 static uint64_t __attribute__((aligned(8))) temp0=0;
148 static uint64_t __attribute__((aligned(8))) temp1=0;
149 static uint64_t __attribute__((aligned(8))) temp2=0;
150 static uint64_t __attribute__((aligned(8))) temp3=0;
151 static uint64_t __attribute__((aligned(8))) temp4=0;
152 static uint64_t __attribute__((aligned(8))) temp5=0;
153 static uint64_t __attribute__((aligned(8))) pQPb=0;
154 static uint64_t __attribute__((aligned(8))) pQPb2=0;
155 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
156 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
157 #else
// Non-MMX build: only the luma offset/scale pair and the block buffer exist.
158 static uint64_t packedYOffset=  0x0000000000000000LL;
159 static uint64_t packedYScale=   0x0100010001000100LL;
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
161 #endif
162
// Flatness thresholds. isVertDC() reports a block as flat when the number of
// near-equal vertically adjacent pixel pairs is greater than
// vFlatnessThreshold; its MMX path examines 7 row pairs x 8 columns = 56 pairs.
// NOTE(review): hFlatnessThreshold is presumably the horizontal counterpart;
// its use is not visible here.
163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16;
165
166 //amount of "black" you are willing to lose to get a brightness corrected picture
167 double maxClippedThreshold= 0.01;
168
// NOTE(review): presumably the luma range targeted by the level fix; the code
// that reads these two values is not visible here.
169 int maxAllowedY=234;
170 int minAllowedY=16;
171
// Table of the filters known to this file, terminated by an all-NULL/zero
// entry. The first two fields are a two-letter short name and a long name; the
// last field is a filter flag constant. struct PPFilter and the flag constants
// are declared outside this view (postprocess.h is included above), so the
// meaning of the three integer fields is not documented here.
// NOTE(review): they look like a chroma default followed by two minimum
// quality levels - confirm against the struct declaration.
172 static struct PPFilter filters[]=
173 {
174         {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
175         {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
// NOTE(review): "vr"/"rkvdeblock" is named like a vertical filter and shares
// the numeric fields of the other vertical entries, yet it maps to
// H_RK1_FILTER. Check whether V_RK1_FILTER was intended.
176         {"vr", "rkvdeblock",            1, 2, 4, H_RK1_FILTER},
177         {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
178         {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
179         {"dr", "dering",                1, 5, 6, DERING},
180         {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
181         {"lb", "linblenddeint",         0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182         {"li", "linipoldeint",          0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183         {"ci", "cubicipoldeint",        0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184         {"md", "mediandeint",           0, 1, 6, MEDIAN_DEINT_FILTER},
185         {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
186         {NULL, NULL,0,0,0,0} //End Marker
187 };
188
// Flat list of string pairs: a preset name followed by the filter string it
// stands for, terminated by a single NULL. "default"/"de" share one expansion
// and "fast"/"fa" share another (the x1 deblockers instead of the default ones).
// NOTE(review): the code that walks this table is not visible here; presumably
// it steps two entries at a time - confirm before adding an odd entry.
189 static char *replaceTable[]=
190 {
191         "default",      "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
192         "de",           "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
193         "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
194         "fa",           "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
195         NULL //End Marker
196 };
197
198 #ifdef HAVE_MMX
/*
 * Mentions every MMX constant and scratch variable in one C expression.
 * Judging by the name, the purpose is to stop the compiler from warning about
 * statics that are otherwise referenced only from inside asm strings.
 * The only possible effect is storing 0 into b00, and only if the sum of all
 * the variables is 0.
 */
199 static inline void unusedVariableWarningFixer()
200 {
201 if(
202  packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
203  + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
204  + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
205  + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
206  + temp5 + pQPb== 0) b00=0;
207 }
208 #endif
209
210 #ifdef TIMING
/*
 * Executes the x86 rdtsc instruction and returns the value the asm statement
 * delivers through the "=A" output constraint. Compiled only when TIMING is
 * defined.
 * NOTE(review): per the GCC x86 constraint documentation "A" names the
 * edx:eax pair only on 32-bit targets; on x86-64 this form would not return
 * the full counter - confirm before reusing it there.
 */
211 static inline long long rdtsc()
212 {
213         long long l;
214         asm volatile(   "rdtsc\n\t"
215                 : "=A" (l)
216         );
217 //      printf("%d\n", int(l/1000));
218         return l;
219 }
220 #endif
221
// Thin wrappers around the four prefetch instructions, available only in the
// MMX2 build. Each one issues the instruction named by the function on the
// address p and has no other effect visible to C code.
222 #ifdef HAVE_MMX2
// Issue prefetchnta for the address p.
223 static inline void prefetchnta(void *p)
224 {
225         asm volatile(   "prefetchnta (%0)\n\t"
226                 : : "r" (p)
227         );
228 }
229
// Issue prefetcht0 for the address p.
230 static inline void prefetcht0(void *p)
231 {
232         asm volatile(   "prefetcht0 (%0)\n\t"
233                 : : "r" (p)
234         );
235 }
236
// Issue prefetcht1 for the address p.
237 static inline void prefetcht1(void *p)
238 {
239         asm volatile(   "prefetcht1 (%0)\n\t"
240                 : : "r" (p)
241         );
242 }
243
// Issue prefetcht2 for the address p.
244 static inline void prefetcht2(void *p)
245 {
246         asm volatile(   "prefetcht2 (%0)\n\t"
247                 : : "r" (p)
248         );
249 }
250 #endif
251
252 //FIXME? the MMX path treats |255-0| as 1 because psubb wraps modulo 256 (should not be a problem ...)
253 /**
254  * Check if the middle 8x8 Block in the given 8x16 block is flat
255  */
/*
 * src points at the top-left pixel of the 8x16 block and stride is the byte
 * distance between vertically adjacent pixels. The function advances src by
 * 4 rows and then counts the vertically adjacent pixel pairs whose difference
 * is -1, 0 or +1.
 * Returns 1 if that count is greater than vFlatnessThreshold, otherwise 0.
 * The MMX path compares 7 row pairs x 8 columns = 56 pairs; the C path
 * compares BLOCK_SIZE-1 row pairs (BLOCK_SIZE is defined outside this view,
 * presumably 8).
 */
256 static inline int isVertDC(uint8_t src[], int stride){
257         int numEq= 0;
258 #ifndef HAVE_MMX
259         int y;
260 #endif
261         src+= stride*4; // src points to begin of the 8x8 Block
262 #ifdef HAVE_MMX
263 asm volatile(
264                 "leal (%1, %2), %%eax                           \n\t"
265                 "leal (%%eax, %2, 4), %%ebx                     \n\t"
266 //      0       1       2       3       4       5       6       7       8       9
267 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ebx     ebx+%2  ebx+2%2 %1+8%2  ebx+4%2
// For each row pair: d = upper - lower (wrapping), then d + 0x7E is compared
// as a signed byte against 0x7C. Only d in {-1, 0, +1} passes, and a passing
// byte becomes 0xFF, so every match adds -1 to its column counter in mm0.
268                 "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7E in every byte
269                 "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7C in every byte
270                 "movq (%1), %%mm0                               \n\t"
271                 "movq (%%eax), %%mm1                            \n\t"
272                 "psubb %%mm1, %%mm0                             \n\t" // mm0 = difference
273                 "paddb %%mm7, %%mm0                             \n\t"
274                 "pcmpgtb %%mm6, %%mm0                           \n\t"
275
276                 "movq (%%eax,%2), %%mm2                         \n\t"
277                 "psubb %%mm2, %%mm1                             \n\t"
278                 "paddb %%mm7, %%mm1                             \n\t"
279                 "pcmpgtb %%mm6, %%mm1                           \n\t"
280                 "paddb %%mm1, %%mm0                             \n\t"
281
282                 "movq (%%eax, %2, 2), %%mm1                     \n\t"
283                 "psubb %%mm1, %%mm2                             \n\t"
284                 "paddb %%mm7, %%mm2                             \n\t"
285                 "pcmpgtb %%mm6, %%mm2                           \n\t"
286                 "paddb %%mm2, %%mm0                             \n\t"
287
288                 "movq (%1, %2, 4), %%mm2                        \n\t"
289                 "psubb %%mm2, %%mm1                             \n\t"
290                 "paddb %%mm7, %%mm1                             \n\t"
291                 "pcmpgtb %%mm6, %%mm1                           \n\t"
292                 "paddb %%mm1, %%mm0                             \n\t"
293
294                 "movq (%%ebx), %%mm1                            \n\t"
295                 "psubb %%mm1, %%mm2                             \n\t"
296                 "paddb %%mm7, %%mm2                             \n\t"
297                 "pcmpgtb %%mm6, %%mm2                           \n\t"
298                 "paddb %%mm2, %%mm0                             \n\t"
299
300                 "movq (%%ebx, %2), %%mm2                        \n\t"
301                 "psubb %%mm2, %%mm1                             \n\t"
302                 "paddb %%mm7, %%mm1                             \n\t"
303                 "pcmpgtb %%mm6, %%mm1                           \n\t"
304                 "paddb %%mm1, %%mm0                             \n\t"
305
306                 "movq (%%ebx, %2, 2), %%mm1                     \n\t"
307                 "psubb %%mm1, %%mm2                             \n\t"
308                 "paddb %%mm7, %%mm2                             \n\t"
309                 "pcmpgtb %%mm6, %%mm2                           \n\t"
310                 "paddb %%mm2, %%mm0                             \n\t"
311
// Horizontal reduction: fold the eight per-column byte counters of mm0 into
// its lowest byte (byte-wise adds, so the total is kept modulo 256).
312                 "                                               \n\t"
313                 "movq %%mm0, %%mm1                              \n\t"
314                 "psrlw $8, %%mm0                                \n\t"
315                 "paddb %%mm1, %%mm0                             \n\t"
316 #ifdef HAVE_MMX2
317                 "pshufw $0xF9, %%mm0, %%mm1                     \n\t"
318                 "paddb %%mm1, %%mm0                             \n\t"
319                 "pshufw $0xFE, %%mm0, %%mm1                     \n\t"
320 #else
321                 "movq %%mm0, %%mm1                              \n\t"
322                 "psrlq $16, %%mm0                               \n\t"
323                 "paddb %%mm1, %%mm0                             \n\t"
324                 "movq %%mm0, %%mm1                              \n\t"
325                 "psrlq $32, %%mm0                               \n\t"
326 #endif
327                 "paddb %%mm1, %%mm0                             \n\t"
328                 "movd %%mm0, %0                                 \n\t"
329                 : "=r" (numEq)
330                 : "r" (src), "r" (stride)
331                 : "%eax", "%ebx"
332                 );
333
// The low byte of numEq now holds minus the number of matches (each match
// added 0xFF); negate it and keep only that byte to get the positive count.
334         numEq= (256 - numEq) &0xFF;
335
336 #else
// Portable path: ((d + 1) & 0xFFFF) < 3 holds exactly when d is -1, 0 or +1,
// because a negative d + 1 turns into a large value after masking.
337         for(y=0; y<BLOCK_SIZE-1; y++)
338         {
339                 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
340                 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
341                 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
342                 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
343                 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
344                 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
345                 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
346                 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
347                 src+= stride;
348         }
349 #endif
350 /*      if(abs(numEq - asmEq) > 0)
351         {
352                 printf("\nasm:%d  c:%d\n", asmEq, numEq);
353                 for(int y=0; y<8; y++)
354                 {
355                         for(int x=0; x<8; x++)
356                         {
357                                 printf("%d ", temp[x + y*stride]);
358                         }
359                         printf("\n");
360                 }
361         }
362 */
363 //      for(int i=0; i<numEq/8; i++) src[i]=255;
364         return (numEq > vFlatnessThreshold) ? 1 : 0;
365 }
366
/*
 * Check whether the first and last row of the middle 8x8 block are close
 * enough to each other.
 *
 * After src is advanced by 3 rows, the rows at src+stride and src+8*stride
 * (rows 4 and 11 of the 8x16 block) are compared column by column. The result
 * is true when no column differs by more than 2*QP.
 *
 * Differences between the two build variants:
 *  - the MMX path ignores the QP argument and reads the global pQPb instead
 *  - the MMX path returns -1 (all bits set) for true, the C path returns 1;
 *    both return 0 for false, so callers should only test for non-zero
 */
367 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
368 {
369 #ifdef HAVE_MMX
370         int isOk;
371         src+= stride*3;
372         asm volatile(
373 //              "int $3 \n\t"
374                 "movq (%1, %2), %%mm0                           \n\t"
375                 "movq (%1, %2, 8), %%mm1                        \n\t"
376                 "movq %%mm0, %%mm2                              \n\t"
377                 "psubusb %%mm1, %%mm0                           \n\t"
378                 "psubusb %%mm2, %%mm1                           \n\t"
379                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
380
381                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
382                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
383                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
// Each 32-bit half becomes all ones if its four bytes are zero. Shifting by 16
// brings 16 bits of each half into the low dword, which is all ones only if
// both halves were; the second compare turns that into -1 or 0.
384                 "pcmpeqd b00, %%mm0                             \n\t"
385                 "psrlq $16, %%mm0                               \n\t"
386                 "pcmpeqd bFF, %%mm0                             \n\t"
387 //              "movd %%mm0, (%1, %2, 4)\n\t"
388                 "movd %%mm0, %0                                 \n\t"
389                 : "=r" (isOk)
390                 : "r" (src), "r" (stride)
391                 );
392         return isOk;
393 #else
394
395         int isOk2= 1;
396         int x;
397         src+= stride*3;
// Any single column whose rows differ by more than 2*QP clears the flag.
398         for(x=0; x<BLOCK_SIZE; x++)
399         {
400                 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
401         }
402 /*      if(isOk && !isOk2 || !isOk && isOk2)
403         {
404                 printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
405                 for(int y=0; y<9; y++)
406                 {
407                         for(int x=0; x<8; x++)
408                         {
409                                 printf("%d ", src[x + y*stride]);
410                         }
411                         printf("\n");
412                 }
413         } */
414
415         return isOk2;
416 #endif
417
418 }
419
420 /**
421  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
422  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
423  */
/*
 * src points at the top-left pixel of the 8x16 block and stride is the byte
 * distance between rows. After src is advanced by 3 rows, rows 1..8 relative
 * to the new src are rewritten in place; rows 0 and 9 are only read.
 *
 * Edge handling: the row just outside the block (row 0 / row 9) is used as
 * filter input only when it is close to the adjacent inner row (row 1 /
 * row 8); otherwise the inner row is used in its place.
 *
 * Differences between the two build variants:
 *  - the MMX2/3DNow path ignores the QP argument and reads the global pQPb
 *  - the asm accepts the outer row when |diff| <= QP, the C code when
 *    |diff| < QP
 *  - the asm builds the weights from chained PAVGB averages, so its rounding
 *    differs slightly from the C code, which adds 8 before shifting by 4
 *
 * NOTE(review): the asm statement stores to memory and uses mm0-mm7 but lists
 * only eax and ebx as clobbers, and it temporarily modifies the input operand
 * %0 (restored by the final subl). Keep this in mind when touching it.
 */
424 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
425 {
426 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
427         src+= stride*3;
428         asm volatile(   //"movv %0 %1 %2\n\t"
429                 "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
430
// mm6 = per column: row 0 if |row0 - row1| <= QP, else row 1
431                 "movq (%0), %%mm6                               \n\t"
432                 "movq (%0, %1), %%mm5                           \n\t"
433                 "movq %%mm5, %%mm1                              \n\t"
434                 "movq %%mm6, %%mm2                              \n\t"
435                 "psubusb %%mm6, %%mm5                           \n\t"
436                 "psubusb %%mm1, %%mm2                           \n\t"
437                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
438                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
439                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
440
441                 "pand %%mm2, %%mm6                              \n\t"
442                 "pandn %%mm1, %%mm2                             \n\t"
443                 "por %%mm2, %%mm6                               \n\t"// First Line to Filter
444
// mm7 = per column: row 9 if |row8 - row9| <= QP, else row 8.
// %0 is advanced by one row in between, so it points to row 1 from here on.
445                 "movq (%0, %1, 8), %%mm5                        \n\t"
446                 "leal (%0, %1, 4), %%eax                        \n\t"
447                 "leal (%0, %1, 8), %%ebx                        \n\t"
448                 "subl %1, %%ebx                                 \n\t"
449                 "addl %1, %0                                    \n\t" // %0 points to line 1 not 0
450                 "movq (%0, %1, 8), %%mm7                        \n\t"
451                 "movq %%mm5, %%mm1                              \n\t"
452                 "movq %%mm7, %%mm2                              \n\t"
453                 "psubusb %%mm7, %%mm5                           \n\t"
454                 "psubusb %%mm1, %%mm2                           \n\t"
455                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
456                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
457                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
458
459                 "pand %%mm2, %%mm7                              \n\t"
460                 "pandn %%mm1, %%mm2                             \n\t"
461                 "por %%mm2, %%mm7                               \n\t" // Last Line to Filter
462
463
// In the comments below each digit column stands for one input row and the
// digit is that row's accumulated weight; "/N" is the divisor reached so far
// and "X" marks the row being written.
464                 //      1       2       3       4       5       6       7       8
465                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ebx     eax+4%1
466                 // 6 4 2 2 1 1
467                 // 6 4 4 2
468                 // 6 8 2
469
470                 "movq (%0, %1), %%mm0                           \n\t" //  1
471                 "movq %%mm0, %%mm1                              \n\t" //  1
472                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
473                 PAVGB(%%mm6, %%mm0)                                   //3 1     /4
474
475                 "movq (%0, %1, 4), %%mm2                        \n\t" //     1
476                 "movq %%mm2, %%mm5                              \n\t" //     1
477                 PAVGB((%%eax), %%mm2)                                 //    11  /2
478                 PAVGB((%0, %1, 2), %%mm2)                             //   211  /4
479                 "movq %%mm2, %%mm3                              \n\t" //   211  /4
480                 "movq (%0), %%mm4                               \n\t" // 1
481                 PAVGB(%%mm4, %%mm3)                                   // 4 211  /8
482                 PAVGB(%%mm0, %%mm3)                                   //642211  /16
483                 "movq %%mm3, (%0)                               \n\t" // X
484                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
485                 "movq %%mm1, %%mm0                              \n\t" //  1
486                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
487                 "movq %%mm4, %%mm3                              \n\t" // 1
488                 PAVGB((%0,%1,2), %%mm3)                               // 1 1    /2
489                 PAVGB((%%eax,%1,2), %%mm5)                            //     11 /2
490                 PAVGB((%%eax), %%mm5)                                 //    211 /4
491                 PAVGB(%%mm5, %%mm3)                                   // 2 2211 /8
492                 PAVGB(%%mm0, %%mm3)                                   //4242211 /16
493                 "movq %%mm3, (%0,%1)                            \n\t" //  X
494                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
495                 PAVGB(%%mm4, %%mm6)                                   //11      /2
496                 "movq (%%ebx), %%mm0                            \n\t" //       1
497                 PAVGB((%%eax, %1, 2), %%mm0)                          //      11/2
498                 "movq %%mm0, %%mm3                              \n\t" //      11/2
499                 PAVGB(%%mm1, %%mm0)                                   //  2   11/4
500                 PAVGB(%%mm6, %%mm0)                                   //222   11/8
501                 PAVGB(%%mm2, %%mm0)                                   //22242211/16
502                 "movq (%0, %1, 2), %%mm2                        \n\t" //   1
503                 "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
504                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
505                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
506                 PAVGB((%%ebx), %%mm0)                                 //       11       /2
507                 PAVGB(%%mm0, %%mm6)                                   //11     11       /4
508                 PAVGB(%%mm1, %%mm4)                                   // 11             /2
509                 PAVGB(%%mm2, %%mm1)                                   //  11            /2
510                 PAVGB(%%mm1, %%mm6)                                   //1122   11       /8
511                 PAVGB(%%mm5, %%mm6)                                   //112242211       /16
512                 "movq (%%eax), %%mm5                            \n\t" //    1
513                 "movq %%mm6, (%%eax)                            \n\t" //    X
514                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
515                 "movq (%%eax, %1, 4), %%mm6                     \n\t" //        1
516                 PAVGB(%%mm7, %%mm6)                                   //        11      /2
517                 PAVGB(%%mm4, %%mm6)                                   // 11     11      /4
518                 PAVGB(%%mm3, %%mm6)                                   // 11   2211      /8
519                 PAVGB(%%mm5, %%mm2)                                   //   11           /2
520                 "movq (%0, %1, 4), %%mm4                        \n\t" //     1
521                 PAVGB(%%mm4, %%mm2)                                   //   112          /4
522                 PAVGB(%%mm2, %%mm6)                                   // 112242211      /16
523                 "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
524                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
525                 PAVGB(%%mm7, %%mm1)                                   //  11     2      /4
526                 PAVGB(%%mm4, %%mm5)                                   //    11          /2
527                 PAVGB(%%mm5, %%mm0)                                   //    11 11       /4
528                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
529                 PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
530                 PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
531                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
532                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
533                 PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
534                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
535                 PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
536                 PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
537                 PAVGB(%%mm2, %%mm6)                                   //   1122424      /16
538                 "movq %%mm6, (%%ebx)                            \n\t" //       X
539                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
540                 PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
541                 PAVGB(%%mm7, %%mm5)                                   //    11   6      /8
542
543                 PAVGB(%%mm3, %%mm0)                                   //      112       /4
544                 PAVGB(%%mm0, %%mm5)                                   //    112246      /16
545                 "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
// undo the earlier addl so that %0 holds its original value again
546                 "subl %1, %0                                    \n\t"
547
548                 :
549                 : "r" (src), "r" (stride)
550                 : "%eax", "%ebx"
551         );
552 #else
// lN is the byte offset of row N relative to src (after the +3 row advance).
553         const int l1= stride;
554         const int l2= stride + l1;
555         const int l3= stride + l2;
556         const int l4= stride + l3;
557         const int l5= stride + l4;
558         const int l6= stride + l5;
559         const int l7= stride + l6;
560         const int l8= stride + l7;
561         const int l9= stride + l8;
562         int x;
563         src+= stride*3;
// One column per iteration; src is advanced by one pixel at the end.
564         for(x=0; x<BLOCK_SIZE; x++)
565         {
// Use the outer row only if it is within QP of the adjacent inner row.
566                 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
567                 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
568
// sums[i] is the sum of two vertically adjacent inputs, with first/last
// standing in for the rows outside the block. All sums are taken before any
// row is overwritten, and the assignments below run top to bottom so every
// direct src[] read still sees an unfiltered value.
569                 int sums[9];
570                 sums[0] = first + src[l1];
571                 sums[1] = src[l1] + src[l2];
572                 sums[2] = src[l2] + src[l3];
573                 sums[3] = src[l3] + src[l4];
574                 sums[4] = src[l4] + src[l5];
575                 sums[5] = src[l5] + src[l6];
576                 sums[6] = src[l6] + src[l7];
577                 sums[7] = src[l7] + src[l8];
578                 sums[8] = src[l8] + last;
579
// Each output is a weighted sum with total weight 16; +8 rounds to nearest
// before the shift by 4.
580                 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
581                 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
582                 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
583                 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
584                 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
585                 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
586                 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
587                 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
588
589                 src++;
590         }
591
592 #endif
593 }
594
595 /**
596  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
597  * values are correctly clipped (MMX2)
598  * values are wraparound (C)
599  * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
600         0 8 16 24
601         x = 8
602         x/2 = 4
603         x/8 = 1
604         1 12 12 23
605  */
606 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
607 {
608 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
609         src+= stride*3;
610 // FIXME rounding
611         asm volatile(
612                 "pxor %%mm7, %%mm7                              \n\t" // 0
613                 "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
614                 "leal (%0, %1), %%eax                           \n\t"
615                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
616 //      0       1       2       3       4       5       6       7       8       9
617 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
618                 "movq pQPb, %%mm0                               \n\t" // QP,..., QP
619                 "movq %%mm0, %%mm1                              \n\t" // QP,..., QP
620                 "paddusb b02, %%mm0                             \n\t"
621                 "psrlw $2, %%mm0                                \n\t"
622                 "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
623                 "paddusb %%mm1, %%mm0                           \n\t" // QP*1.25 ...
624                 "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
625                 "movq (%%ebx), %%mm3                            \n\t" // line 5
626                 "movq %%mm2, %%mm4                              \n\t" // line 4
627                 "pcmpeqb %%mm5, %%mm5                           \n\t" // -1
628                 "pxor %%mm2, %%mm5                              \n\t" // -line 4 - 1
629                 PAVGB(%%mm3, %%mm5)
630                 "paddb %%mm6, %%mm5                             \n\t" // (l5-l4)/2
631                 "psubusb %%mm3, %%mm4                           \n\t"
632                 "psubusb %%mm2, %%mm3                           \n\t"
633                 "por %%mm3, %%mm4                               \n\t" // |l4 - l5|
634                 "psubusb %%mm0, %%mm4                           \n\t"
635                 "pcmpeqb %%mm7, %%mm4                           \n\t"
636                 "pand %%mm4, %%mm5                              \n\t" // d/2
637
638 //              "paddb %%mm6, %%mm2                             \n\t" // line 4 + 0x80
639                 "paddb %%mm5, %%mm2                             \n\t"
640 //              "psubb %%mm6, %%mm2                             \n\t"
641                 "movq %%mm2, (%0,%1, 4)                         \n\t"
642
643                 "movq (%%ebx), %%mm2                            \n\t"
644 //              "paddb %%mm6, %%mm2                             \n\t" // line 5 + 0x80
645                 "psubb %%mm5, %%mm2                             \n\t"
646 //              "psubb %%mm6, %%mm2                             \n\t"
647                 "movq %%mm2, (%%ebx)                            \n\t"
648
649                 "paddb %%mm6, %%mm5                             \n\t"
650                 "psrlw $2, %%mm5                                \n\t"
651                 "pand b3F, %%mm5                                \n\t"
652                 "psubb b20, %%mm5                               \n\t" // (l5-l4)/8
653
654                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
655                 "paddb %%mm6, %%mm2                             \n\t" // line 3 + 0x80
656                 "paddsb %%mm5, %%mm2                            \n\t"
657                 "psubb %%mm6, %%mm2                             \n\t"
658                 "movq %%mm2, (%%eax, %1, 2)                     \n\t"
659
660                 "movq (%%ebx, %1), %%mm2                        \n\t"
661                 "paddb %%mm6, %%mm2                             \n\t" // line 6 + 0x80
662                 "psubsb %%mm5, %%mm2                            \n\t"
663                 "psubb %%mm6, %%mm2                             \n\t"
664                 "movq %%mm2, (%%ebx, %1)                        \n\t"
665
666                 :
667                 : "r" (src), "r" (stride)
668                 : "%eax", "%ebx"
669         );
670 #else
671         const int l1= stride;
672         const int l2= stride + l1;
673         const int l3= stride + l2;
674         const int l4= stride + l3;
675         const int l5= stride + l4;
676         const int l6= stride + l5;
677 //      const int l7= stride + l6;
678 //      const int l8= stride + l7;
679 //      const int l9= stride + l8;
680         int x;
681         const int QP15= QP + (QP>>2);
682         src+= stride*3;
683         for(x=0; x<BLOCK_SIZE; x++)
684         {
685                 const int v = (src[x+l5] - src[x+l4]);
686                 if(ABS(v) < QP15)
687                 {
688                         src[x+l3] +=v>>3;
689                         src[x+l4] +=v>>1;
690                         src[x+l5] -=v>>1;
691                         src[x+l6] -=v>>3;
692
693                 }
694         }
695
696 #endif
697 }
698
699 /**
700  * Experimental Filter 1
701  * will not damage linear gradients
702  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
703  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
704  * MMX2 version does correct clipping C version doesnt
705  */
706 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
707 {
708 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
709         src+= stride*3;
710
711         asm volatile(
712                 "pxor %%mm7, %%mm7                              \n\t" // 0
713 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
714                 "leal (%0, %1), %%eax                           \n\t"
715                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
716 //      0       1       2       3       4       5       6       7       8       9
717 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
718                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
719                 "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
720                 "movq %%mm1, %%mm2                              \n\t" // line 4
721                 "psubusb %%mm0, %%mm1                           \n\t"
722                 "psubusb %%mm2, %%mm0                           \n\t"
723                 "por %%mm1, %%mm0                               \n\t" // |l2 - l3|
724                 "movq (%%ebx), %%mm3                            \n\t" // line 5
725                 "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
726                 "movq %%mm3, %%mm5                              \n\t" // line 5
727                 "psubusb %%mm4, %%mm3                           \n\t"
728                 "psubusb %%mm5, %%mm4                           \n\t"
729                 "por %%mm4, %%mm3                               \n\t" // |l5 - l6|
730                 PAVGB(%%mm3, %%mm0)                                   // (|l2 - l3| + |l5 - l6|)/2
731                 "movq %%mm2, %%mm1                              \n\t" // line 4
732                 "psubusb %%mm5, %%mm2                           \n\t"
733                 "movq %%mm2, %%mm4                              \n\t"
734                 "pcmpeqb %%mm7, %%mm2                           \n\t" // (l4 - l5) <= 0 ? -1 : 0
735                 "psubusb %%mm1, %%mm5                           \n\t"
736                 "por %%mm5, %%mm4                               \n\t" // |l4 - l5|
737                 "psubusb %%mm0, %%mm4           \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
738                 "movq %%mm4, %%mm3                              \n\t" // d
739                 "psubusb pQPb, %%mm4                            \n\t"
740                 "pcmpeqb %%mm7, %%mm4                           \n\t" // d <= QP ? -1 : 0
741                 "psubusb b01, %%mm3                             \n\t"
742                 "pand %%mm4, %%mm3                              \n\t" // d <= QP ? d : 0
743
744                 PAVGB(%%mm7, %%mm3)                                   // d/2
745                 "movq %%mm3, %%mm1                              \n\t" // d/2
746                 PAVGB(%%mm7, %%mm3)                                   // d/4
747                 PAVGB(%%mm1, %%mm3)                                   // 3*d/8
748
749                 "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
750                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
751                 "psubusb %%mm3, %%mm0                           \n\t"
752                 "pxor %%mm2, %%mm0                              \n\t"
753                 "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
754
755                 "movq (%%ebx), %%mm0                            \n\t" // line 5
756                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
757                 "paddusb %%mm3, %%mm0                           \n\t"
758                 "pxor %%mm2, %%mm0                              \n\t"
759                 "movq %%mm0, (%%ebx)                            \n\t" // line 5
760
761                 PAVGB(%%mm7, %%mm1)                                   // d/4
762
763                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
764                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
765                 "psubusb %%mm1, %%mm0                           \n\t"
766                 "pxor %%mm2, %%mm0                              \n\t"
767                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // line 3
768
769                 "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
770                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
771                 "paddusb %%mm1, %%mm0                           \n\t"
772                 "pxor %%mm2, %%mm0                              \n\t"
773                 "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
774
775                 PAVGB(%%mm7, %%mm1)                                   // d/8
776
777                 "movq (%%eax, %1), %%mm0                        \n\t" // line 2
778                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
779                 "psubusb %%mm1, %%mm0                           \n\t"
780                 "pxor %%mm2, %%mm0                              \n\t"
781                 "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
782
783                 "movq (%%ebx, %1, 2), %%mm0                     \n\t" // line 7
784                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
785                 "paddusb %%mm1, %%mm0                           \n\t"
786                 "pxor %%mm2, %%mm0                              \n\t"
787                 "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // line 7
788
789                 :
790                 : "r" (src), "r" (stride)
791                 : "%eax", "%ebx"
792         );
793 #else
794
795         const int l1= stride;
796         const int l2= stride + l1;
797         const int l3= stride + l2;
798         const int l4= stride + l3;
799         const int l5= stride + l4;
800         const int l6= stride + l5;
801         const int l7= stride + l6;
802 //      const int l8= stride + l7;
803 //      const int l9= stride + l8;
804         int x;
805
806         src+= stride*3;
807         for(x=0; x<BLOCK_SIZE; x++)
808         {
809                 int a= src[l3] - src[l4];
810                 int b= src[l4] - src[l5];
811                 int c= src[l5] - src[l6];
812
813                 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
814                 d= MAX(d, 0);
815
816                 if(d < QP)
817                 {
818                         int v = d * SIGN(-b);
819
820                         src[l2] +=v>>3;
821                         src[l3] +=v>>2;
822                         src[l4] +=(3*v)>>3;
823                         src[l5] -=(3*v)>>3;
824                         src[l6] -=v>>2;
825                         src[l7] -=v>>3;
826
827                 }
828                 src++;
829         }
830         /*
831         const int l1= stride;
832         const int l2= stride + l1;
833         const int l3= stride + l2;
834         const int l4= stride + l3;
835         const int l5= stride + l4;
836         const int l6= stride + l5;
837         const int l7= stride + l6;
838         const int l8= stride + l7;
839         const int l9= stride + l8;
840         for(int x=0; x<BLOCK_SIZE; x++)
841         {
842                 int v2= src[l2];
843                 int v3= src[l3];
844                 int v4= src[l4];
845                 int v5= src[l5];
846                 int v6= src[l6];
847                 int v7= src[l7];
848
849                 if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
850                 {
851                         src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
852                         src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
853                         src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
854                         src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
855                 }
856                 src++;
857         }
858 */
859 #endif
860 }
861
862 /**
863  * Experimental Filter 1 (Horizontal)
864  * will not damage linear gradients
865  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
866  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
867  * MMX2 version does correct clipping C version doesnt
868  * not identical with the vertical one
869  */
870 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
871 {
872         int y;
873 //FIXME (has little in common with the mmx2 version)
874         for(y=0; y<BLOCK_SIZE; y++)
875         {
876                 int a= src[1] - src[2];
877                 int b= src[3] - src[4];
878                 int c= src[5] - src[6];
879
880                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
881
882                 if(d < QP)
883                 {
884                         int v = d * SIGN(-b);
885
886                         src[1] +=v/8;
887                         src[2] +=v/4;
888                         src[3] +=3*v/8;
889                         src[4] -=3*v/8;
890                         src[5] -=v/4;
891                         src[6] -=v/8;
892
893                 }
894                 src+=stride;
895         }
896 }
897
898
899 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
900 {
901 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
902 /*
903         uint8_t tmp[16];
904         const int l1= stride;
905         const int l2= stride + l1;
906         const int l3= stride + l2;
907         const int l4= (int)tmp - (int)src - stride*3;
908         const int l5= (int)tmp - (int)src - stride*3 + 8;
909         const int l6= stride*3 + l3;
910         const int l7= stride + l6;
911         const int l8= stride + l7;
912
913         memcpy(tmp, src+stride*7, 8);
914         memcpy(tmp+8, src+stride*8, 8);
915 */
916         src+= stride*4;
917         asm volatile(
918
919 #if 0 //sligtly more accurate and slightly slower
920                 "pxor %%mm7, %%mm7                              \n\t" // 0
921                 "leal (%0, %1), %%eax                           \n\t"
922                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
923 //      0       1       2       3       4       5       6       7
924 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
925 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
926
927
928                 "movq (%0, %1, 2), %%mm0                        \n\t" // l2
929                 "movq (%0), %%mm1                               \n\t" // l0
930                 "movq %%mm0, %%mm2                              \n\t" // l2
931                 PAVGB(%%mm7, %%mm0)                                   // ~l2/2
932                 PAVGB(%%mm1, %%mm0)                                   // ~(l2 + 2l0)/4
933                 PAVGB(%%mm2, %%mm0)                                   // ~(5l2 + 2l0)/8
934
935                 "movq (%%eax), %%mm1                            \n\t" // l1
936                 "movq (%%eax, %1, 2), %%mm3                     \n\t" // l3
937                 "movq %%mm1, %%mm4                              \n\t" // l1
938                 PAVGB(%%mm7, %%mm1)                                   // ~l1/2
939                 PAVGB(%%mm3, %%mm1)                                   // ~(l1 + 2l3)/4
940                 PAVGB(%%mm4, %%mm1)                                   // ~(5l1 + 2l3)/8
941
942                 "movq %%mm0, %%mm4                              \n\t" // ~(5l2 + 2l0)/8
943                 "psubusb %%mm1, %%mm0                           \n\t"
944                 "psubusb %%mm4, %%mm1                           \n\t"
945                 "por %%mm0, %%mm1                               \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
946 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
947
948                 "movq (%0, %1, 4), %%mm0                        \n\t" // l4
949                 "movq %%mm0, %%mm4                              \n\t" // l4
950                 PAVGB(%%mm7, %%mm0)                                   // ~l4/2
951                 PAVGB(%%mm2, %%mm0)                                   // ~(l4 + 2l2)/4
952                 PAVGB(%%mm4, %%mm0)                                   // ~(5l4 + 2l2)/8
953
954                 "movq (%%ebx), %%mm2                            \n\t" // l5
955                 "movq %%mm3, %%mm5                              \n\t" // l3
956                 PAVGB(%%mm7, %%mm3)                                   // ~l3/2
957                 PAVGB(%%mm2, %%mm3)                                   // ~(l3 + 2l5)/4
958                 PAVGB(%%mm5, %%mm3)                                   // ~(5l3 + 2l5)/8
959
960                 "movq %%mm0, %%mm6                              \n\t" // ~(5l4 + 2l2)/8
961                 "psubusb %%mm3, %%mm0                           \n\t"
962                 "psubusb %%mm6, %%mm3                           \n\t"
963                 "por %%mm0, %%mm3                               \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
964                 "pcmpeqb %%mm7, %%mm0                           \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
965 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
966
967                 "movq (%%ebx, %1), %%mm6                        \n\t" // l6
968                 "movq %%mm6, %%mm5                              \n\t" // l6
969                 PAVGB(%%mm7, %%mm6)                                   // ~l6/2
970                 PAVGB(%%mm4, %%mm6)                                   // ~(l6 + 2l4)/4
971                 PAVGB(%%mm5, %%mm6)                                   // ~(5l6 + 2l4)/8
972
973                 "movq (%%ebx, %1, 2), %%mm5                     \n\t" // l7
974                 "movq %%mm2, %%mm4                              \n\t" // l5
975                 PAVGB(%%mm7, %%mm2)                                   // ~l5/2
976                 PAVGB(%%mm5, %%mm2)                                   // ~(l5 + 2l7)/4
977                 PAVGB(%%mm4, %%mm2)                                   // ~(5l5 + 2l7)/8
978
979                 "movq %%mm6, %%mm4                              \n\t" // ~(5l6 + 2l4)/8
980                 "psubusb %%mm2, %%mm6                           \n\t"
981                 "psubusb %%mm4, %%mm2                           \n\t"
982                 "por %%mm6, %%mm2                               \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
983 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
984
985
986                 PMINUB(%%mm2, %%mm1, %%mm4)                           // MIN(|lenergy|,|renergy|)/8
987                 "movq pQPb, %%mm4                               \n\t" // QP //FIXME QP+1 ?
988                 "paddusb b01, %%mm4                             \n\t"
989                 "pcmpgtb %%mm3, %%mm4                           \n\t" // |menergy|/8 < QP
990                 "psubusb %%mm1, %%mm3                           \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
991                 "pand %%mm4, %%mm3                              \n\t"
992
993                 "movq %%mm3, %%mm1                              \n\t"
994 //              "psubusb b01, %%mm3                             \n\t"
995                 PAVGB(%%mm7, %%mm3)
996                 PAVGB(%%mm7, %%mm3)
997                 "paddusb %%mm1, %%mm3                           \n\t"
998 //              "paddusb b01, %%mm3                             \n\t"
999
1000                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //l3
1001                 "movq (%0, %1, 4), %%mm5                        \n\t" //l4
1002                 "movq (%0, %1, 4), %%mm4                        \n\t" //l4
1003                 "psubusb %%mm6, %%mm5                           \n\t"
1004                 "psubusb %%mm4, %%mm6                           \n\t"
1005                 "por %%mm6, %%mm5                               \n\t" // |l3-l4|
1006                 "pcmpeqb %%mm7, %%mm6                           \n\t" // SIGN(l3-l4)
1007                 "pxor %%mm6, %%mm0                              \n\t"
1008                 "pand %%mm0, %%mm3                              \n\t"
1009                 PMINUB(%%mm5, %%mm3, %%mm0)
1010
1011                 "psubusb b01, %%mm3                             \n\t"
1012                 PAVGB(%%mm7, %%mm3)
1013
1014                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1015                 "movq (%0, %1, 4), %%mm2                        \n\t"
1016                 "pxor %%mm6, %%mm0                              \n\t"
1017                 "pxor %%mm6, %%mm2                              \n\t"
1018                 "psubb %%mm3, %%mm0                             \n\t"
1019                 "paddb %%mm3, %%mm2                             \n\t"
1020                 "pxor %%mm6, %%mm0                              \n\t"
1021                 "pxor %%mm6, %%mm2                              \n\t"
1022                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1023                 "movq %%mm2, (%0, %1, 4)                        \n\t"
1024 #endif
1025
1026                 "leal (%0, %1), %%eax                           \n\t"
1027                 "pcmpeqb %%mm6, %%mm6                           \n\t" // -1
1028 //      0       1       2       3       4       5       6       7
1029 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1030 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1031
1032
1033                 "movq (%%eax, %1, 2), %%mm1                     \n\t" // l3
1034                 "movq (%0, %1, 4), %%mm0                        \n\t" // l4
1035                 "pxor %%mm6, %%mm1                              \n\t" // -l3-1
1036                 PAVGB(%%mm1, %%mm0)                                   // -q+128 = (l4-l3+256)/2
1037 // mm1=-l3-1, mm0=128-q
1038
1039                 "movq (%%eax, %1, 4), %%mm2                     \n\t" // l5
1040                 "movq (%%eax, %1), %%mm3                        \n\t" // l2
1041                 "pxor %%mm6, %%mm2                              \n\t" // -l5-1
1042                 "movq %%mm2, %%mm5                              \n\t" // -l5-1
1043                 "movq b80, %%mm4                                \n\t" // 128
1044                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1045                 PAVGB(%%mm3, %%mm2)                                   // (l2-l5+256)/2
1046                 PAVGB(%%mm0, %%mm4)                                   // ~(l4-l3)/4 + 128
1047                 PAVGB(%%mm2, %%mm4)                                   // ~(l2-l5)/4 +(l4-l3)/8 + 128
1048                 PAVGB(%%mm0, %%mm4)                                   // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1049 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1050
1051                 "movq (%%eax), %%mm2                            \n\t" // l1
1052                 "pxor %%mm6, %%mm2                              \n\t" // -l1-1
1053                 PAVGB(%%mm3, %%mm2)                                   // (l2-l1+256)/2
1054                 PAVGB((%0), %%mm1)                                    // (l0-l3+256)/2
1055                 "movq b80, %%mm3                                \n\t" // 128
1056                 PAVGB(%%mm2, %%mm3)                                   // ~(l2-l1)/4 + 128
1057                 PAVGB(%%mm1, %%mm3)                                   // ~(l0-l3)/4 +(l2-l1)/8 + 128
1058                 PAVGB(%%mm2, %%mm3)                                   // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1059 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1060
1061                 PAVGB((%%ebx, %1), %%mm5)                             // (l6-l5+256)/2
1062                 "movq (%%ebx, %1, 2), %%mm1                     \n\t" // l7
1063                 "pxor %%mm6, %%mm1                              \n\t" // -l7-1
1064                 PAVGB((%0, %1, 4), %%mm1)                             // (l4-l7+256)/2
1065                 "movq b80, %%mm2                                \n\t" // 128
1066                 PAVGB(%%mm5, %%mm2)                                   // ~(l6-l5)/4 + 128
1067                 PAVGB(%%mm1, %%mm2)                                   // ~(l4-l7)/4 +(l6-l5)/8 + 128
1068                 PAVGB(%%mm5, %%mm2)                                   // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1069 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1070
1071                 "movq b00, %%mm1                                \n\t" // 0
1072                 "movq b00, %%mm5                                \n\t" // 0
1073                 "psubb %%mm2, %%mm1                             \n\t" // 128 - renergy/16
1074                 "psubb %%mm3, %%mm5                             \n\t" // 128 - lenergy/16
1075                 PMAXUB(%%mm1, %%mm2)                                  // 128 + |renergy/16|
1076                 PMAXUB(%%mm5, %%mm3)                                  // 128 + |lenergy/16|
1077                 PMINUB(%%mm2, %%mm3, %%mm1)                           // 128 + MIN(|lenergy|,|renergy|)/16
1078
1079 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1080
1081                 "movq b00, %%mm7                                \n\t" // 0
1082                 "movq pQPb, %%mm2                               \n\t" // QP
1083                 PAVGB(%%mm6, %%mm2)                                   // 128 + QP/2
1084                 "psubb %%mm6, %%mm2                             \n\t"
1085
1086                 "movq %%mm4, %%mm1                              \n\t"
1087                 "pcmpgtb %%mm7, %%mm1                           \n\t" // SIGN(menergy)
1088                 "pxor %%mm1, %%mm4                              \n\t"
1089                 "psubb %%mm1, %%mm4                             \n\t" // 128 + |menergy|/16
1090                 "pcmpgtb %%mm4, %%mm2                           \n\t" // |menergy|/16 < QP/2
1091                 "psubusb %%mm3, %%mm4                           \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1092 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1093
1094                 "movq %%mm4, %%mm3                              \n\t" // d
1095                 "psubusb b01, %%mm4                             \n\t"
1096                 PAVGB(%%mm7, %%mm4)                                   // d/32
1097                 PAVGB(%%mm7, %%mm4)                                   // (d + 32)/64
1098                 "paddb %%mm3, %%mm4                             \n\t" // 5d/64
1099                 "pand %%mm2, %%mm4                              \n\t"
1100
1101                 "movq b80, %%mm5                                \n\t" // 128
1102                 "psubb %%mm0, %%mm5                             \n\t" // q
1103                 "paddsb %%mm6, %%mm5                            \n\t" // fix bad rounding
1104                 "pcmpgtb %%mm5, %%mm7                           \n\t" // SIGN(q)
1105                 "pxor %%mm7, %%mm5                              \n\t"
1106
1107                 PMINUB(%%mm5, %%mm4, %%mm3)                           // MIN(|q|, 5d/64)
1108                 "pxor %%mm1, %%mm7                              \n\t" // SIGN(d*q)
1109
1110                 "pand %%mm7, %%mm4                              \n\t"
1111                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1112                 "movq (%0, %1, 4), %%mm2                        \n\t"
1113                 "pxor %%mm1, %%mm0                              \n\t"
1114                 "pxor %%mm1, %%mm2                              \n\t"
1115                 "paddb %%mm4, %%mm0                             \n\t"
1116                 "psubb %%mm4, %%mm2                             \n\t"
1117                 "pxor %%mm1, %%mm0                              \n\t"
1118                 "pxor %%mm1, %%mm2                              \n\t"
1119                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1120                 "movq %%mm2, (%0, %1, 4)                        \n\t"
1121
1122                 :
1123                 : "r" (src), "r" (stride)
1124                 : "%eax", "%ebx"
1125         );
1126
1127 /*
1128         {
1129         int x;
1130         src-= stride;
1131         for(x=0; x<BLOCK_SIZE; x++)
1132         {
1133                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1134                 if(ABS(middleEnergy)< 8*QP)
1135                 {
1136                         const int q=(src[l4] - src[l5])/2;
1137                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1138                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1139
1140                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1141                         d= MAX(d, 0);
1142
1143                         d= (5*d + 32) >> 6;
1144                         d*= SIGN(-middleEnergy);
1145
1146                         if(q>0)
1147                         {
1148                                 d= d<0 ? 0 : d;
1149                                 d= d>q ? q : d;
1150                         }
1151                         else
1152                         {
1153                                 d= d>0 ? 0 : d;
1154                                 d= d<q ? q : d;
1155                         }
1156
1157                         src[l4]-= d;
1158                         src[l5]+= d;
1159                 }
1160                 src++;
1161         }
1162 src-=8;
1163         for(x=0; x<8; x++)
1164         {
1165                 int y;
1166                 for(y=4; y<6; y++)
1167                 {
1168                         int d= src[x+y*stride] - tmp[x+(y-4)*8];
1169                         int ad= ABS(d);
1170                         static int max=0;
1171                         static int sum=0;
1172                         static int num=0;
1173                         static int bias=0;
1174
1175                         if(max<ad) max=ad;
1176                         sum+= ad>3 ? 1 : 0;
1177                         if(ad>3)
1178                         {
1179                                 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1180                         }
1181                         if(y==4) bias+=d;
1182                         num++;
1183                         if(num%1000000 == 0)
1184                         {
1185                                 printf(" %d %d %d %d\n", num, sum, max, bias);
1186                         }
1187                 }
1188         }
1189 }
1190 */
1191 #elif defined (HAVE_MMX)
1192         src+= stride*4;
1193
1194         asm volatile(
1195                 "pxor %%mm7, %%mm7                              \n\t"
1196                 "leal (%0, %1), %%eax                           \n\t"
1197                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1198 //      0       1       2       3       4       5       6       7
1199 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1200 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1201
1202                 "movq (%0), %%mm0                               \n\t"
1203                 "movq %%mm0, %%mm1                              \n\t"
1204                 "punpcklbw %%mm7, %%mm0                         \n\t" // low part of line 0
1205                 "punpckhbw %%mm7, %%mm1                         \n\t" // high part of line 0
1206
1207                 "movq (%%eax), %%mm2                            \n\t"
1208                 "movq %%mm2, %%mm3                              \n\t"
1209                 "punpcklbw %%mm7, %%mm2                         \n\t" // low part of line 1
1210                 "punpckhbw %%mm7, %%mm3                         \n\t" // high part of line 1
1211
1212                 "movq (%%eax, %1), %%mm4                        \n\t"
1213                 "movq %%mm4, %%mm5                              \n\t"
1214                 "punpcklbw %%mm7, %%mm4                         \n\t" // low part of line 2
1215                 "punpckhbw %%mm7, %%mm5                         \n\t" // high part of line 2
1216
1217                 "paddw %%mm0, %%mm0                             \n\t" // 2L0
1218                 "paddw %%mm1, %%mm1                             \n\t" // 2H0
1219                 "psubw %%mm4, %%mm2                             \n\t" // L1 - L2
1220                 "psubw %%mm5, %%mm3                             \n\t" // H1 - H2
1221                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - L1 + L2
1222                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - H1 + H2
1223
1224                 "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1225                 "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1226                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2
1227                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2
1228
1229                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
1230                 "movq %%mm2, %%mm3                              \n\t"
1231                 "punpcklbw %%mm7, %%mm2                         \n\t" // L3
1232                 "punpckhbw %%mm7, %%mm3                         \n\t" // H3
1233
1234                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - L3
1235                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - H3
1236                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1237                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1238                 "movq %%mm0, temp0                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1239                 "movq %%mm1, temp1                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1240
1241                 "movq (%0, %1, 4), %%mm0                        \n\t"
1242                 "movq %%mm0, %%mm1                              \n\t"
1243                 "punpcklbw %%mm7, %%mm0                         \n\t" // L4
1244                 "punpckhbw %%mm7, %%mm1                         \n\t" // H4
1245
1246                 "psubw %%mm0, %%mm2                             \n\t" // L3 - L4
1247                 "psubw %%mm1, %%mm3                             \n\t" // H3 - H4
1248                 "movq %%mm2, temp2                              \n\t" // L3 - L4
1249                 "movq %%mm3, temp3                              \n\t" // H3 - H4
1250                 "paddw %%mm4, %%mm4                             \n\t" // 2L2
1251                 "paddw %%mm5, %%mm5                             \n\t" // 2H2
1252                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - L3 + L4
1253                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - H3 + H4
1254
1255                 "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1256                 "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1257                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4
1258                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4
1259 //50 opcodes so far
1260                 "movq (%%ebx), %%mm2                            \n\t"
1261                 "movq %%mm2, %%mm3                              \n\t"
1262                 "punpcklbw %%mm7, %%mm2                         \n\t" // L5
1263                 "punpckhbw %%mm7, %%mm3                         \n\t" // H5
1264                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - L5
1265                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - H5
1266                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1267                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1268
1269                 "movq (%%ebx, %1), %%mm6                        \n\t"
1270                 "punpcklbw %%mm7, %%mm6                         \n\t" // L6
1271                 "psubw %%mm6, %%mm2                             \n\t" // L5 - L6
1272                 "movq (%%ebx, %1), %%mm6                        \n\t"
1273                 "punpckhbw %%mm7, %%mm6                         \n\t" // H6
1274                 "psubw %%mm6, %%mm3                             \n\t" // H5 - H6
1275
1276                 "paddw %%mm0, %%mm0                             \n\t" // 2L4
1277                 "paddw %%mm1, %%mm1                             \n\t" // 2H4
1278                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - L5 + L6
1279                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - H5 + H6
1280
1281                 "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1282                 "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1283                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6
1284                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6
1285
1286                 "movq (%%ebx, %1, 2), %%mm2                     \n\t"
1287                 "movq %%mm2, %%mm3                              \n\t"
1288                 "punpcklbw %%mm7, %%mm2                         \n\t" // L7
1289                 "punpckhbw %%mm7, %%mm3                         \n\t" // H7
1290
1291                 "paddw %%mm2, %%mm2                             \n\t" // 2L7
1292                 "paddw %%mm3, %%mm3                             \n\t" // 2H7
1293                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1294                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1295
1296                 "movq temp0, %%mm2                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1297                 "movq temp1, %%mm3                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1298
1299 #ifdef HAVE_MMX2
1300                 "movq %%mm7, %%mm6                              \n\t" // 0
1301                 "psubw %%mm0, %%mm6                             \n\t"
1302                 "pmaxsw %%mm6, %%mm0                            \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1303                 "movq %%mm7, %%mm6                              \n\t" // 0
1304                 "psubw %%mm1, %%mm6                             \n\t"
1305                 "pmaxsw %%mm6, %%mm1                            \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1306                 "movq %%mm7, %%mm6                              \n\t" // 0
1307                 "psubw %%mm2, %%mm6                             \n\t"
1308                 "pmaxsw %%mm6, %%mm2                            \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1309                 "movq %%mm7, %%mm6                              \n\t" // 0
1310                 "psubw %%mm3, %%mm6                             \n\t"
1311                 "pmaxsw %%mm6, %%mm3                            \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1312 #else
1313                 "movq %%mm7, %%mm6                              \n\t" // 0
1314                 "pcmpgtw %%mm0, %%mm6                           \n\t"
1315                 "pxor %%mm6, %%mm0                              \n\t"
1316                 "psubw %%mm6, %%mm0                             \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1317                 "movq %%mm7, %%mm6                              \n\t" // 0
1318                 "pcmpgtw %%mm1, %%mm6                           \n\t"
1319                 "pxor %%mm6, %%mm1                              \n\t"
1320                 "psubw %%mm6, %%mm1                             \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1321                 "movq %%mm7, %%mm6                              \n\t" // 0
1322                 "pcmpgtw %%mm2, %%mm6                           \n\t"
1323                 "pxor %%mm6, %%mm2                              \n\t"
1324                 "psubw %%mm6, %%mm2                             \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1325                 "movq %%mm7, %%mm6                              \n\t" // 0
1326                 "pcmpgtw %%mm3, %%mm6                           \n\t"
1327                 "pxor %%mm6, %%mm3                              \n\t"
1328                 "psubw %%mm6, %%mm3                             \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1329 #endif
1330
1331 #ifdef HAVE_MMX2
1332                 "pminsw %%mm2, %%mm0                            \n\t"
1333                 "pminsw %%mm3, %%mm1                            \n\t"
1334 #else
1335                 "movq %%mm0, %%mm6                              \n\t"
1336                 "psubusw %%mm2, %%mm6                           \n\t"
1337                 "psubw %%mm6, %%mm0                             \n\t"
1338                 "movq %%mm1, %%mm6                              \n\t"
1339                 "psubusw %%mm3, %%mm6                           \n\t"
1340                 "psubw %%mm6, %%mm1                             \n\t"
1341 #endif
1342
1343                 "movq %%mm7, %%mm6                              \n\t" // 0
1344                 "pcmpgtw %%mm4, %%mm6                           \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1345                 "pxor %%mm6, %%mm4                              \n\t"
1346                 "psubw %%mm6, %%mm4                             \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1347                 "pcmpgtw %%mm5, %%mm7                           \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1348                 "pxor %%mm7, %%mm5                              \n\t"
1349                 "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1350 // 100 opcodes
1351                 "movd %2, %%mm2                                 \n\t" // QP
1352                 "punpcklwd %%mm2, %%mm2                         \n\t"
1353                 "punpcklwd %%mm2, %%mm2                         \n\t"
1354                 "psllw $3, %%mm2                                \n\t" // 8QP
1355                 "movq %%mm2, %%mm3                              \n\t" // 8QP
1356                 "pcmpgtw %%mm4, %%mm2                           \n\t"
1357                 "pcmpgtw %%mm5, %%mm3                           \n\t"
1358                 "pand %%mm2, %%mm4                              \n\t"
1359                 "pand %%mm3, %%mm5                              \n\t"
1360
1361
1362                 "psubusw %%mm0, %%mm4                           \n\t" // hd
1363                 "psubusw %%mm1, %%mm5                           \n\t" // ld
1364
1365
1366                 "movq w05, %%mm2                                \n\t" // 5
1367                 "pmullw %%mm2, %%mm4                            \n\t"
1368                 "pmullw %%mm2, %%mm5                            \n\t"
1369                 "movq w20, %%mm2                                \n\t" // 32
1370                 "paddw %%mm2, %%mm4                             \n\t"
1371                 "paddw %%mm2, %%mm5                             \n\t"
1372                 "psrlw $6, %%mm4                                \n\t"
1373                 "psrlw $6, %%mm5                                \n\t"
1374
1375 /*
1376                 "movq w06, %%mm2                                \n\t" // 6
1377                 "paddw %%mm2, %%mm4                             \n\t"
1378                 "paddw %%mm2, %%mm5                             \n\t"
1379                 "movq w1400, %%mm2                              \n\t" // 1400h = 5120 = 5/64*2^16
1380 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1381                 "pmulhw %%mm2, %%mm4                            \n\t" // hd/13
1382                 "pmulhw %%mm2, %%mm5                            \n\t" // ld/13
1383 */
1384
1385                 "movq temp2, %%mm0                              \n\t" // L3 - L4
1386                 "movq temp3, %%mm1                              \n\t" // H3 - H4
1387
1388                 "pxor %%mm2, %%mm2                              \n\t"
1389                 "pxor %%mm3, %%mm3                              \n\t"
1390
1391                 "pcmpgtw %%mm0, %%mm2                           \n\t" // sign (L3-L4)
1392                 "pcmpgtw %%mm1, %%mm3                           \n\t" // sign (H3-H4)
1393                 "pxor %%mm2, %%mm0                              \n\t"
1394                 "pxor %%mm3, %%mm1                              \n\t"
1395                 "psubw %%mm2, %%mm0                             \n\t" // |L3-L4|
1396                 "psubw %%mm3, %%mm1                             \n\t" // |H3-H4|
1397                 "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1398                 "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1399
1400                 "pxor %%mm6, %%mm2                              \n\t"
1401                 "pxor %%mm7, %%mm3                              \n\t"
1402                 "pand %%mm2, %%mm4                              \n\t"
1403                 "pand %%mm3, %%mm5                              \n\t"
1404
1405 #ifdef HAVE_MMX2
1406                 "pminsw %%mm0, %%mm4                            \n\t"
1407                 "pminsw %%mm1, %%mm5                            \n\t"
1408 #else
1409                 "movq %%mm4, %%mm2                              \n\t"
1410                 "psubusw %%mm0, %%mm2                           \n\t"
1411                 "psubw %%mm2, %%mm4                             \n\t"
1412                 "movq %%mm5, %%mm2                              \n\t"
1413                 "psubusw %%mm1, %%mm2                           \n\t"
1414                 "psubw %%mm2, %%mm5                             \n\t"
1415 #endif
1416                 "pxor %%mm6, %%mm4                              \n\t"
1417                 "pxor %%mm7, %%mm5                              \n\t"
1418                 "psubw %%mm6, %%mm4                             \n\t"
1419                 "psubw %%mm7, %%mm5                             \n\t"
1420                 "packsswb %%mm5, %%mm4                          \n\t"
1421                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1422                 "paddb   %%mm4, %%mm0                           \n\t"
1423                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1424                 "movq (%0, %1, 4), %%mm0                        \n\t"
1425                 "psubb %%mm4, %%mm0                             \n\t"
1426                 "movq %%mm0, (%0, %1, 4)                        \n\t"
1427
1428                 :
1429                 : "r" (src), "r" (stride), "r" (QP)
1430                 : "%eax", "%ebx"
1431         );
1432 #else
1433         const int l1= stride;
1434         const int l2= stride + l1;
1435         const int l3= stride + l2;
1436         const int l4= stride + l3;
1437         const int l5= stride + l4;
1438         const int l6= stride + l5;
1439         const int l7= stride + l6;
1440         const int l8= stride + l7;
1441 //      const int l9= stride + l8;
1442         int x;
1443         src+= stride*3;
1444         for(x=0; x<BLOCK_SIZE; x++)
1445         {
1446                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1447                 if(ABS(middleEnergy) < 8*QP)
1448                 {
1449                         const int q=(src[l4] - src[l5])/2;
1450                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1451                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1452
1453                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1454                         d= MAX(d, 0);
1455
1456                         d= (5*d + 32) >> 6;
1457                         d*= SIGN(-middleEnergy);
1458
1459                         if(q>0)
1460                         {
1461                                 d= d<0 ? 0 : d;
1462                                 d= d>q ? q : d;
1463                         }
1464                         else
1465                         {
1466                                 d= d>0 ? 0 : d;
1467                                 d= d<q ? q : d;
1468                         }
1469
1470                         src[l4]-= d;
1471                         src[l5]+= d;
1472                 }
1473                 src++;
1474         }
1475 #endif
1476 }
1477
1478 /**
1479  * Check if the given 8x8 Block is mostly "flat"
1480  */
1481 static inline int isHorizDC(uint8_t src[], int stride)
1482 {
1483         int numEq= 0;
1484         int y;
1485         for(y=0; y<BLOCK_SIZE; y++)
1486         {
1487                 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1488                 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1489                 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1490                 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1491                 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1492                 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1493                 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1494                 src+= stride;
1495         }
1496         return numEq > hFlatnessThreshold;
1497 }
1498
/**
 * Check whether the leftmost and rightmost pixel of the first row are close enough.
 * Only src[0] and src[7] are examined; stride is accepted but not used.
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
        const int edgeDiff= abs(src[0] - src[7]);
        return edgeDiff <= 2*QP;
}
1505
1506 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1507 {
1508         int y;
1509         for(y=0; y<BLOCK_SIZE; y++)
1510         {
1511                 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1512
1513                 if(ABS(middleEnergy) < 8*QP)
1514                 {
1515                         const int q=(dst[3] - dst[4])/2;
1516                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1517                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1518
1519                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1520                         d= MAX(d, 0);
1521
1522                         d= (5*d + 32) >> 6;
1523                         d*= SIGN(-middleEnergy);
1524
1525                         if(q>0)
1526                         {
1527                                 d= d<0 ? 0 : d;
1528                                 d= d>q ? q : d;
1529                         }
1530                         else
1531                         {
1532                                 d= d>0 ? 0 : d;
1533                                 d= d<q ? q : d;
1534                         }
1535
1536                         dst[3]-= d;
1537                         dst[4]+= d;
1538                 }
1539                 dst+= stride;
1540         }
1541 }
1542
1543 /**
1544  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1545  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1546  */
1547 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1548 {
1549
1550         int y;
1551         for(y=0; y<BLOCK_SIZE; y++)
1552         {
1553                 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1554                 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1555
1556                 int sums[9];
1557                 sums[0] = first + dst[0];
1558                 sums[1] = dst[0] + dst[1];
1559                 sums[2] = dst[1] + dst[2];
1560                 sums[3] = dst[2] + dst[3];
1561                 sums[4] = dst[3] + dst[4];
1562                 sums[5] = dst[4] + dst[5];
1563                 sums[6] = dst[5] + dst[6];
1564                 sums[7] = dst[6] + dst[7];
1565                 sums[8] = dst[7] + last;
1566
1567                 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1568                 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1569                 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1570                 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1571                 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1572                 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1573                 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1574                 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1575
1576                 dst+= stride;
1577         }
1578 }
1579
1580
1581 static inline void dering(uint8_t src[], int stride, int QP)
1582 {
1583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1584         asm volatile(
1585                 "movq pQPb, %%mm0                               \n\t"
1586                 "paddusb %%mm0, %%mm0                           \n\t"
1587                 "movq %%mm0, pQPb2                              \n\t"
1588
1589                 "leal (%0, %1), %%eax                           \n\t"
1590                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1591 //      0       1       2       3       4       5       6       7       8       9
1592 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1593
1594                 "pcmpeqb %%mm6, %%mm6                           \n\t"
1595                 "pxor %%mm7, %%mm7                              \n\t"
1596 #ifdef HAVE_MMX2
1597 #define FIND_MIN_MAX(addr)\
1598                 "movq " #addr ", %%mm0                          \n\t"\
1599                 "pminub %%mm0, %%mm6                            \n\t"\
1600                 "pmaxub %%mm0, %%mm7                            \n\t"
1601 #else
1602 #define FIND_MIN_MAX(addr)\
1603                 "movq " #addr ", %%mm0                          \n\t"\
1604                 "movq %%mm6, %%mm1                              \n\t"\
1605                 "psubusb %%mm0, %%mm7                           \n\t"\
1606                 "paddb %%mm0, %%mm7                             \n\t"\
1607                 "psubusb %%mm0, %%mm1                           \n\t"\
1608                 "psubb %%mm1, %%mm6                             \n\t"
1609 #endif
1610
1611 FIND_MIN_MAX((%%eax))
1612 FIND_MIN_MAX((%%eax, %1))
1613 FIND_MIN_MAX((%%eax, %1, 2))
1614 FIND_MIN_MAX((%0, %1, 4))
1615 FIND_MIN_MAX((%%ebx))
1616 FIND_MIN_MAX((%%ebx, %1))
1617 FIND_MIN_MAX((%%ebx, %1, 2))
1618 FIND_MIN_MAX((%0, %1, 8))
1619
1620                 "movq %%mm6, %%mm4                              \n\t"
1621                 "psrlq $8, %%mm6                                \n\t"
1622 #ifdef HAVE_MMX2
1623                 "pminub %%mm4, %%mm6                            \n\t" // min of pixels
1624                 "pshufw $0xF9, %%mm6, %%mm4                     \n\t"
1625                 "pminub %%mm4, %%mm6                            \n\t" // min of pixels
1626                 "pshufw $0xFE, %%mm6, %%mm4                     \n\t"
1627                 "pminub %%mm4, %%mm6                            \n\t"
1628 #else
1629                 "movq %%mm6, %%mm1                              \n\t"
1630                 "psubusb %%mm4, %%mm1                           \n\t"
1631                 "psubb %%mm1, %%mm6                             \n\t"
1632                 "movq %%mm6, %%mm4                              \n\t"
1633                 "psrlq $16, %%mm6                               \n\t"
1634                 "movq %%mm6, %%mm1                              \n\t"
1635                 "psubusb %%mm4, %%mm1                           \n\t"
1636                 "psubb %%mm1, %%mm6                             \n\t"
1637                 "movq %%mm6, %%mm4                              \n\t"
1638                 "psrlq $32, %%mm6                               \n\t"
1639                 "movq %%mm6, %%mm1                              \n\t"
1640                 "psubusb %%mm4, %%mm1                           \n\t"
1641                 "psubb %%mm1, %%mm6                             \n\t"
1642 #endif
1643
1644
1645                 "movq %%mm7, %%mm4                              \n\t"
1646                 "psrlq $8, %%mm7                                \n\t"
1647 #ifdef HAVE_MMX2
1648                 "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
1649                 "pshufw $0xF9, %%mm7, %%mm4                     \n\t"
1650                 "pmaxub %%mm4, %%mm7                            \n\t"
1651                 "pshufw $0xFE, %%mm7, %%mm4                     \n\t"
1652                 "pmaxub %%mm4, %%mm7                            \n\t"
1653 #else
1654                 "psubusb %%mm4, %%mm7                           \n\t"
1655                 "paddb %%mm4, %%mm7                             \n\t"
1656                 "movq %%mm7, %%mm4                              \n\t"
1657                 "psrlq $16, %%mm7                               \n\t"
1658                 "psubusb %%mm4, %%mm7                           \n\t"
1659                 "paddb %%mm4, %%mm7                             \n\t"
1660                 "movq %%mm7, %%mm4                              \n\t"
1661                 "psrlq $32, %%mm7                               \n\t"
1662                 "psubusb %%mm4, %%mm7                           \n\t"
1663                 "paddb %%mm4, %%mm7                             \n\t"
1664 #endif
1665                 PAVGB(%%mm6, %%mm7)                                   // a=(max + min)/2
1666                 "punpcklbw %%mm7, %%mm7                         \n\t"
1667                 "punpcklbw %%mm7, %%mm7                         \n\t"
1668                 "punpcklbw %%mm7, %%mm7                         \n\t"
1669                 "movq %%mm7, temp0                              \n\t"
1670
1671                 "movq (%0), %%mm0                               \n\t" // L10
1672                 "movq %%mm0, %%mm1                              \n\t" // L10
1673                 "movq %%mm0, %%mm2                              \n\t" // L10
1674                 "psllq $8, %%mm1                                \n\t"
1675                 "psrlq $8, %%mm2                                \n\t"
1676                 "movd -4(%0), %%mm3                             \n\t"
1677                 "movd 8(%0), %%mm4                              \n\t"
1678                 "psrlq $24, %%mm3                               \n\t"
1679                 "psllq $56, %%mm4                               \n\t"
1680                 "por %%mm3, %%mm1                               \n\t" // L00
1681                 "por %%mm4, %%mm2                               \n\t" // L20
1682                 "movq %%mm1, %%mm3                              \n\t" // L00
1683                 PAVGB(%%mm2, %%mm1)                                   // (L20 + L00)/2
1684                 PAVGB(%%mm0, %%mm1)                                   // (L20 + L00 + 2L10)/4
1685                 "psubusb %%mm7, %%mm0                           \n\t"
1686                 "psubusb %%mm7, %%mm2                           \n\t"
1687                 "psubusb %%mm7, %%mm3                           \n\t"
1688                 "pcmpeqb b00, %%mm0                             \n\t" // L10 > a ? 0 : -1
1689                 "pcmpeqb b00, %%mm2                             \n\t" // L20 > a ? 0 : -1
1690                 "pcmpeqb b00, %%mm3                             \n\t" // L00 > a ? 0 : -1
1691                 "paddb %%mm2, %%mm0                             \n\t"
1692                 "paddb %%mm3, %%mm0                             \n\t"
1693
1694                 "movq (%%eax), %%mm2                            \n\t" // L11
1695                 "movq %%mm2, %%mm3                              \n\t" // L11
1696                 "movq %%mm2, %%mm4                              \n\t" // L11
1697                 "psllq $8, %%mm3                                \n\t"
1698                 "psrlq $8, %%mm4                                \n\t"
1699                 "movd -4(%%eax), %%mm5                          \n\t"
1700                 "movd 8(%%eax), %%mm6                           \n\t"
1701                 "psrlq $24, %%mm5                               \n\t"
1702                 "psllq $56, %%mm6                               \n\t"
1703                 "por %%mm5, %%mm3                               \n\t" // L01
1704                 "por %%mm6, %%mm4                               \n\t" // L21
1705                 "movq %%mm3, %%mm5                              \n\t" // L01
1706                 PAVGB(%%mm4, %%mm3)                                   // (L21 + L01)/2
1707                 PAVGB(%%mm2, %%mm3)                                   // (L21 + L01 + 2L11)/4
1708                 "psubusb %%mm7, %%mm2                           \n\t"
1709                 "psubusb %%mm7, %%mm4                           \n\t"
1710                 "psubusb %%mm7, %%mm5                           \n\t"
1711                 "pcmpeqb b00, %%mm2                             \n\t" // L11 > a ? 0 : -1
1712                 "pcmpeqb b00, %%mm4                             \n\t" // L21 > a ? 0 : -1
1713                 "pcmpeqb b00, %%mm5                             \n\t" // L01 > a ? 0 : -1
1714                 "paddb %%mm4, %%mm2                             \n\t"
1715                 "paddb %%mm5, %%mm2                             \n\t"
1716 // 0, 2, 3, 1
1717 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1718                 "movq " #src ", " #sx "                         \n\t" /* src[0] */\
1719                 "movq " #sx ", " #lx "                          \n\t" /* src[0] */\
1720                 "movq " #sx ", " #t0 "                          \n\t" /* src[0] */\
1721                 "psllq $8, " #lx "                              \n\t"\
1722                 "psrlq $8, " #t0 "                              \n\t"\
1723                 "movd -4" #src ", " #t1 "                       \n\t"\
1724                 "psrlq $24, " #t1 "                             \n\t"\
1725                 "por " #t1 ", " #lx "                           \n\t" /* src[-1] */\
1726                 "movd 8" #src ", " #t1 "                        \n\t"\
1727                 "psllq $56, " #t1 "                             \n\t"\
1728                 "por " #t1 ", " #t0 "                           \n\t" /* src[+1] */\
1729                 "movq " #lx ", " #t1 "                          \n\t" /* src[-1] */\
1730                 PAVGB(t0, lx)                                         /* (src[-1] + src[+1])/2 */\
1731                 PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
1732                 PAVGB(lx, pplx)                                      \
1733                 "movq " #lx ", temp1                            \n\t"\
1734                 "movq temp0, " #lx "                            \n\t"\
1735                 "psubusb " #lx ", " #t1 "                       \n\t"\
1736                 "psubusb " #lx ", " #t0 "                       \n\t"\
1737                 "psubusb " #lx ", " #sx "                       \n\t"\
1738                 "movq b00, " #lx "                              \n\t"\
1739                 "pcmpeqb " #lx ", " #t1 "                       \n\t" /* src[-1] > a ? 0 : -1*/\
1740                 "pcmpeqb " #lx ", " #t0 "                       \n\t" /* src[+1] > a ? 0 : -1*/\
1741                 "pcmpeqb " #lx ", " #sx "                       \n\t" /* src[0]  > a ? 0 : -1*/\
1742                 "paddb " #t1 ", " #t0 "                         \n\t"\
1743                 "paddb " #t0 ", " #sx "                         \n\t"\
1744 \
1745                 PAVGB(plx, pplx)                                      /* filtered */\
1746                 "movq " #dst ", " #t0 "                         \n\t" /* dst */\
1747                 "movq " #t0 ", " #t1 "                          \n\t" /* dst */\
1748                 "psubusb pQPb2, " #t0 "                         \n\t"\
1749                 "paddusb pQPb2, " #t1 "                         \n\t"\
1750                 PMAXUB(t0, pplx)\
1751                 PMINUB(t1, pplx, t0)\
1752                 "paddb " #sx ", " #ppsx "                       \n\t"\
1753                 "paddb " #psx ", " #ppsx "                      \n\t"\
1754         "#paddb b02, " #ppsx "                          \n\t"\
1755                 "pand b08, " #ppsx "                            \n\t"\
1756                 "pcmpeqb " #lx ", " #ppsx "                     \n\t"\
1757                 "pand " #ppsx ", " #pplx "                      \n\t"\
1758                 "pandn " #dst ", " #ppsx "                      \n\t"\
1759                 "por " #pplx ", " #ppsx "                       \n\t"\
1760                 "movq " #ppsx ", " #dst "                       \n\t"\
1761                 "movq temp1, " #lx "                            \n\t"
1762
1763 /*
1764 0000000
1765 1111111
1766
1767 1111110
1768 1111101
1769 1111100
1770 1111011
1771 1111010
1772 1111001
1773
1774 1111000
1775 1110111
1776
1777 */
1778 //DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1779 DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1780 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1781 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1782 DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1783 DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1784 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1785 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1786 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1787
1788
1789                 : : "r" (src), "r" (stride), "r" (QP)
1790                 : "%eax", "%ebx"
1791         );
1792 #else
1793         int y;
1794         int min=255;
1795         int max=0;
1796         int avg;
1797         uint8_t *p;
1798         int s[10];
1799
1800         for(y=1; y<9; y++)
1801         {
1802                 int x;
1803                 p= src + stride*y;
1804                 for(x=1; x<9; x++)
1805                 {
1806                         p++;
1807                         if(*p > max) max= *p;
1808                         if(*p < min) min= *p;
1809                 }
1810         }
1811         avg= (min + max + 1)/2;
1812
1813         for(y=0; y<10; y++)
1814         {
1815                 int x;
1816                 int t = 0;
1817                 p= src + stride*y;
1818                 for(x=0; x<10; x++)
1819                 {
1820                         if(*p > avg) t |= (1<<x);
1821                         p++;
1822                 }
1823                 t |= (~t)<<16;
1824                 t &= (t<<1) & (t>>1);
1825                 s[y] = t;
1826         }
1827
1828         for(y=1; y<9; y++)
1829         {
1830                 int x;
1831                 int t = s[y-1] & s[y] & s[y+1];
1832                 t|= t>>16;
1833
1834                 p= src + stride*y;
1835                 for(x=1; x<9; x++)
1836                 {
1837                         p++;
1838                         if(t & (1<<x))
1839                         {
1840                                 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1841                                       +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1842                                       +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1843                                 f= (f + 8)>>4;
1844
1845                                 if     (*p + 2*QP < f) *p= *p + 2*QP;
1846                                 else if(*p - 2*QP > f) *p= *p - 2*QP;
1847                                 else *p=f;
1848                         }
1849                 }
1850         }
1851
1852 #endif
1853 }
1854
/**
 * Deinterlaces the given block by linear interpolation.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Counting rows from line 4 of the block, rows 1, 3, 5 and 7 are each replaced
 * by the average of the row directly above and the row directly below; rows 0-8
 * are read. 8 columns are processed.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t"
                "movq (%%eax, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%ebx, %1, 2)                     \n\t"

                : : "r" (src), "r" (stride)
                /* "memory": the asm loads and stores pixel rows through src,
                   which is only listed as an input operand */
                : "%eax", "%ebx", "memory"
        );
#else
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                int y;
                /* NOTE(review): this average truncates; the asm path uses PAVGB
                   (defined elsewhere in this file), which presumably rounds up,
                   so both paths may differ by 1 - confirm */
                for(y=1; y<8; y+=2)
                        src[stride*y] = (src[stride*(y-1)] + src[stride*(y+1)])>>1;
                src++;
        }
#endif
}
1901
/**
 * Deinterlaces the given block by cubic interpolation.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * This filter reads lines 3-15 and writes lines 6, 8, 10 and 12: each written
 * line becomes (-a + 9b + 9d - e)/16 of the lines 3 above, 1 above, 1 below and
 * 3 below it. The result is clipped to 0..255 in both the asm path (packuswb)
 * and the C path.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= stride*3;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
                "leal (%%ebx, %1, 4), %%ecx                     \n\t"
                "addl %1, %%ecx                                 \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx

#define DEINT_CUBIC(a,b,c,d,e)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm1                             \n\t"\
                "movq " #d ", %%mm2                             \n\t"\
                "movq " #e ", %%mm3                             \n\t"\
                PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
                PAVGB(%%mm3, %%mm0)                                     /* (a+e) /2 */\
                "movq %%mm0, %%mm2                              \n\t"\
                "punpcklbw %%mm7, %%mm0                         \n\t"\
                "punpckhbw %%mm7, %%mm2                         \n\t"\
                "movq %%mm1, %%mm3                              \n\t"\
                "punpcklbw %%mm7, %%mm1                         \n\t"\
                "punpckhbw %%mm7, %%mm3                         \n\t"\
                "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
                "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
                "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
                "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
                "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
                "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
                "packuswb %%mm3, %%mm1                          \n\t"\
                "movq %%mm1, " #c "                             \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

                : : "r" (src), "r" (stride)
                /* "memory": the asm loads and stores pixel rows through src,
                   which is only listed as an input operand */
                : "%eax", "%ebx", "ecx", "memory"
        );
#else
        int x;
        src+= stride*3;
        for(x=0; x<8; x++)
        {
                int y;
                for(y=3; y<=9; y+=2)
                {
                        int v= (-src[stride*(y-3)] + 9*src[stride*(y-1)]
                                + 9*src[stride*(y+1)] - src[stride*(y+3)])>>4;
                        /* the cubic kernel can under-/overshoot; without clipping
                           the store to uint8_t would wrap around */
                        if(v < 0)        v= 0;
                        else if(v > 255) v= 255;
                        src[stride*y]= v;
                }
                src++;
        }
#endif
}
1966
/**
 * Deinterlaces the given block with a vertical (1 2 1)/4 blend.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * Will shift the image up by 1 line (FIXME if this is a problem).
 * This filter reads lines 4-13 and writes lines 4-11: each written line becomes
 * a blend of itself and the two lines below it, weighted 1:2:1.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                   // L0+L2
                "movq (%%eax), %%mm2                            \n\t" // L1
                PAVGB(%%mm2, %%mm0)
                "movq %%mm0, (%0)                               \n\t"
                "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
                PAVGB(%%mm0, %%mm2)                                   // L1+L3
                PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
                "movq %%mm2, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
                PAVGB(%%mm2, %%mm1)                                   // L2+L4
                PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
                "movq %%mm1, (%%eax, %1)                        \n\t"
                "movq (%%ebx), %%mm1                            \n\t" // L5
                PAVGB(%%mm1, %%mm0)                                   // L3+L5
                PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
                PAVGB(%%mm0, %%mm2)                                   // L4+L6
                PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
                "movq %%mm2, (%0, %1, 4)                        \n\t"
                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                PAVGB(%%mm2, %%mm1)                                   // L5+L7
                PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
                "movq %%mm1, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
                PAVGB(%%mm1, %%mm0)                                   // L6+L8
                PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
                "movq %%mm0, (%%ebx, %1)                        \n\t"
                "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
                PAVGB(%%mm0, %%mm2)                                   // L7+L9
                PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                /* "memory": the asm loads and stores pixel rows through src,
                   which is only listed as an input operand */
                : "%eax", "%ebx", "memory"
        );
#else
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                int y;
                /* rows are updated top to bottom; row y only reads rows y..y+2,
                   none of which has been overwritten yet */
                for(y=0; y<8; y++)
                        src[stride*y] = (src[stride*y] + 2*src[stride*(y+1)] + src[stride*(y+2)])>>2;
                src++;
        }
#endif
}
2041
/**
 * Deinterlaces the given block with a vertical 3-tap median filter.
 * Will be called for every 8x8 block and can read & write lines 4-15.
 * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Counting rows from line 4 of the block, rows 1, 3, 5 and 7 are each replaced
 * by the median of the row above, the row itself and the row below; rows 0-8
 * are read. The MMX2 path builds the median from pmaxub/pminub, the plain MMX
 * path from saturating-subtract/compare masks, and the C path from scalar
 * min/max.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
        src+= 4*stride;
#ifdef HAVE_MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" //
                "movq (%%eax, %1), %%mm2                        \n\t" //
                "movq (%%eax), %%mm1                            \n\t" //
                "movq %%mm0, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm3, %%mm1                            \n\t" //
                "pmaxub %%mm2, %%mm1                            \n\t" //
                "pminub %%mm1, %%mm0                            \n\t"
                "movq %%mm0, (%%eax)                            \n\t"

                "movq (%0, %1, 4), %%mm0                        \n\t" //
                "movq (%%eax, %1, 2), %%mm1                     \n\t" //
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm1                            \n\t" //
                "pmaxub %%mm0, %%mm1                            \n\t" //
                "pminub %%mm1, %%mm2                            \n\t"
                "movq %%mm2, (%%eax, %1, 2)                     \n\t"

                "movq (%%ebx), %%mm2                            \n\t" //
                "movq (%%ebx, %1), %%mm1                        \n\t" //
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm0                            \n\t" //
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx)                            \n\t"

                "movq (%%ebx, %1, 2), %%mm2                     \n\t" //
                "movq (%0, %1, 8), %%mm0                        \n\t" //
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm0                            \n\t" //
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                /* "memory": the asm loads and stores pixel rows through src,
                   which is only listed as an input operand */
                : "%eax", "%ebx", "memory"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                "pxor %%mm7, %%mm7                              \n\t"

#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm2                             \n\t"\
                "movq " #c ", %%mm1                             \n\t"\
                "movq %%mm0, %%mm3                              \n\t"\
                "movq %%mm1, %%mm4                              \n\t"\
                "movq %%mm2, %%mm5                              \n\t"\
                "psubusb %%mm1, %%mm3                           \n\t"\
                "psubusb %%mm2, %%mm4                           \n\t"\
                "psubusb %%mm0, %%mm5                           \n\t"\
                "pcmpeqb %%mm7, %%mm3                           \n\t"\
                "pcmpeqb %%mm7, %%mm4                           \n\t"\
                "pcmpeqb %%mm7, %%mm5                           \n\t"\
                "movq %%mm3, %%mm6                              \n\t"\
                "pxor %%mm4, %%mm3                              \n\t"\
                "pxor %%mm5, %%mm4                              \n\t"\
                "pxor %%mm6, %%mm5                              \n\t"\
                "por %%mm3, %%mm1                               \n\t"\
                "por %%mm4, %%mm2                               \n\t"\
                "por %%mm5, %%mm0                               \n\t"\
                "pand %%mm2, %%mm0                              \n\t"\
                "pand %%mm1, %%mm0                              \n\t"\
                "movq %%mm0, " #b "                             \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                /* "memory": the asm loads and stores pixel rows through src,
                   which is only listed as an input operand */
                : "%eax", "%ebx", "memory"
        );
#endif // HAVE_MMX2
#else
        /* Scalar median, same rows as the asm paths: only odd rows are written
           and only their even neighbours are read besides the row itself, so
           the rows can be processed in any order. */
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                int y;
                for(y=1; y<8; y+=2)
                {
                        const int a= src[stride*(y-1)];
                        const int b= src[stride* y   ];
                        const int c= src[stride*(y+1)];
                        const int lo= a < b ? a : b;
                        const int hi= a < b ? b : a;
                        const int m = lo > c ? lo : c;   /* max(min(a,b), c) */
                        src[stride*y]= m < hi ? m : hi;  /* min(max(a,b), m) = median */
                }
                src++;
        }
#endif
}
2160
2161 #ifdef HAVE_MMX
2162 /**
2163  * transposes and shift the given 8x8 Block into dst1 and dst2
2164  */
2165 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2166 {
2167         asm(
2168                 "leal (%0, %1), %%eax                           \n\t"
2169                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
2170 //      0       1       2       3       4       5       6       7       8       9
2171 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2172                 "movq (%0), %%mm0               \n\t" // 12345678
2173                 "movq (%%eax), %%mm1            \n\t" // abcdefgh
2174                 "movq %%mm0, %%mm2              \n\t" // 12345678
2175                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2176                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2177
2178                 "movq (%%eax, %1), %%mm1        \n\t"
2179                 "movq (%%eax, %1, 2), %%mm3     \n\t"
2180                 "movq %%mm1, %%mm4              \n\t"
2181                 "punpcklbw %%mm3, %%mm1         \n\t"
2182                 "punpckhbw %%mm3, %%mm4         \n\t"
2183
2184                 "movq %%mm0, %%mm3              \n\t"
2185                 "punpcklwd %%mm1, %%mm0         \n\t"
2186                 "punpckhwd %%mm1, %%mm3         \n\t"
2187                 "movq %%mm2, %%mm1              \n\t"
2188                 "punpcklwd %%mm4, %%mm2         \n\t"
2189                 "punpckhwd %%mm4, %%mm1         \n\t"
2190
2191                 "movd %%mm0, 128(%2)            \n\t"
2192                 "psrlq $32, %%mm0               \n\t"
2193                 "movd %%mm0, 144(%2)            \n\t"
2194                 "movd %%mm3, 160(%2)            \n\t"
2195                 "psrlq $32, %%mm3               \n\t"
2196                 "movd %%mm3, 176(%2)            \n\t"
2197                 "movd %%mm3, 48(%3)             \n\t"
2198                 "movd %%mm2, 192(%2)            \n\t"
2199                 "movd %%mm2, 64(%3)             \n\t"
2200                 "psrlq $32, %%mm2               \n\t"
2201                 "movd %%mm2, 80(%3)             \n\t"
2202                 "movd %%mm1, 96(%3)             \n\t"
2203                 "psrlq $32, %%mm1               \n\t"
2204                 "movd %%mm1, 112(%3)            \n\t"
2205
2206                 "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
2207                 "movq (%%ebx), %%mm1            \n\t" // abcdefgh
2208                 "movq %%mm0, %%mm2              \n\t" // 12345678
2209                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2210                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2211
2212                 "movq (%%ebx, %1), %%mm1        \n\t"
2213                 "movq (%%ebx, %1, 2), %%mm3     \n\t"
2214                 "movq %%mm1, %%mm4              \n\t"
2215                 "punpcklbw %%mm3, %%mm1         \n\t"
2216                 "punpckhbw %%mm3, %%mm4         \n\t"
2217
2218                 "movq %%mm0, %%mm3              \n\t"
2219                 "punpcklwd %%mm1, %%mm0         \n\t"
2220                 "punpckhwd %%mm1, %%mm3         \n\t"
2221                 "movq %%mm2, %%mm1              \n\t"
2222                 "punpcklwd %%mm4, %%mm2         \n\t"
2223                 "punpckhwd %%mm4, %%mm1         \n\t"
2224
2225                 "movd %%mm0, 132(%2)            \n\t"
2226                 "psrlq $32, %%mm0               \n\t"
2227                 "movd %%mm0, 148(%2)            \n\t"
2228                 "movd %%mm3, 164(%2)            \n\t"
2229                 "psrlq $32, %%mm3               \n\t"
2230                 "movd %%mm3, 180(%2)            \n\t"
2231                 "movd %%mm3, 52(%3)             \n\t"
2232                 "movd %%mm2, 196(%2)            \n\t"
2233                 "movd %%mm2, 68(%3)             \n\t"
2234                 "psrlq $32, %%mm2               \n\t"
2235                 "movd %%mm2, 84(%3)             \n\t"
2236                 "movd %%mm1, 100(%3)            \n\t"
2237                 "psrlq $32, %%mm1               \n\t"
2238                 "movd %%mm1, 116(%3)            \n\t"
2239
2240
2241         :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2242         : "%eax", "%ebx"
2243         );
2244 }
2245
2246 /**
2247  * transposes the given 8x8 block
2248  */
2249 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2250 {
2251         asm(
2252                 "leal (%0, %1), %%eax                           \n\t"
2253                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
2254 //      0       1       2       3       4       5       6       7       8       9
2255 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2256                 "movq (%2), %%mm0               \n\t" // 12345678
2257                 "movq 16(%2), %%mm1             \n\t" // abcdefgh
2258                 "movq %%mm0, %%mm2              \n\t" // 12345678
2259                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2260                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2261
2262                 "movq 32(%2), %%mm1             \n\t"
2263                 "movq 48(%2), %%mm3             \n\t"
2264                 "movq %%mm1, %%mm4              \n\t"
2265                 "punpcklbw %%mm3, %%mm1         \n\t"
2266                 "punpckhbw %%mm3, %%mm4         \n\t"
2267
2268                 "movq %%mm0, %%mm3              \n\t"
2269                 "punpcklwd %%mm1, %%mm0         \n\t"
2270                 "punpckhwd %%mm1, %%mm3         \n\t"
2271                 "movq %%mm2, %%mm1              \n\t"
2272                 "punpcklwd %%mm4, %%mm2         \n\t"
2273                 "punpckhwd %%mm4, %%mm1         \n\t"
2274
2275                 "movd %%mm0, (%0)               \n\t"
2276                 "psrlq $32, %%mm0               \n\t"
2277                 "movd %%mm0, (%%eax)            \n\t"
2278                 "movd %%mm3, (%%eax, %1)        \n\t"
2279                 "psrlq $32, %%mm3               \n\t"
2280                 "movd %%mm3, (%%eax, %1, 2)     \n\t"
2281                 "movd %%mm2, (%0, %1, 4)        \n\t"
2282                 "psrlq $32, %%mm2               \n\t"
2283                 "movd %%mm2, (%%ebx)            \n\t"
2284                 "movd %%mm1, (%%ebx, %1)        \n\t"
2285                 "psrlq $32, %%mm1               \n\t"
2286                 "movd %%mm1, (%%ebx, %1, 2)     \n\t"
2287
2288
2289                 "movq 64(%2), %%mm0             \n\t" // 12345678
2290                 "movq 80(%2), %%mm1             \n\t" // abcdefgh
2291                 "movq %%mm0, %%mm2              \n\t" // 12345678
2292                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2293                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2294
2295                 "movq 96(%2), %%mm1             \n\t"
2296                 "movq 112(%2), %%mm3            \n\t"
2297                 "movq %%mm1, %%mm4              \n\t"
2298                 "punpcklbw %%mm3, %%mm1         \n\t"
2299                 "punpckhbw %%mm3, %%mm4         \n\t"
2300
2301                 "movq %%mm0, %%mm3              \n\t"
2302                 "punpcklwd %%mm1, %%mm0         \n\t"
2303                 "punpckhwd %%mm1, %%mm3         \n\t"
2304                 "movq %%mm2, %%mm1              \n\t"
2305                 "punpcklwd %%mm4, %%mm2         \n\t"
2306                 "punpckhwd %%mm4, %%mm1         \n\t"
2307
2308                 "movd %%mm0, 4(%0)              \n\t"
2309                 "psrlq $32, %%mm0               \n\t"
2310                 "movd %%mm0, 4(%%eax)           \n\t"
2311                 "movd %%mm3, 4(%%eax, %1)       \n\t"
2312                 "psrlq $32, %%mm3               \n\t"
2313                 "movd %%mm3, 4(%%eax, %1, 2)    \n\t"
2314                 "movd %%mm2, 4(%0, %1, 4)       \n\t"
2315                 "psrlq $32, %%mm2               \n\t"
2316                 "movd %%mm2, 4(%%ebx)           \n\t"
2317                 "movd %%mm1, 4(%%ebx, %1)       \n\t"
2318                 "psrlq $32, %%mm1               \n\t"
2319                 "movd %%mm1, 4(%%ebx, %1, 2)    \n\t"
2320
2321         :: "r" (dst), "r" (dstStride), "r" (src)
2322         : "%eax", "%ebx"
2323         );
2324 }
2325 #endif
2326 //static int test=0;
2327
/**
 * Temporal noise reducer for one 8x8 block.
 *
 * Compares the 8x8 block at src with the block at the same offsets in
 * tempBlured, derives a difference measure d, mixes d with four neighbouring
 * entries of tempBluredPast, and then, depending on how d compares with three
 * thresholds, either copies src into tempBlured or blends the two blocks and
 * writes the blend to both src and tempBlured.
 *
 * @param src            current block, 8 rows of 8 bytes, rows "stride" bytes apart; may be modified
 * @param stride         distance in bytes between rows, used for both src and tempBlured
 * @param tempBlured     blurred reference block, addressed with the same stride; always modified
 * @param tempBluredPast pointer to this block's entry in a table of uint32_t difference values;
 *                       the entries at -256, -1, +1 and +256 elements are read and
 *                       *tempBluredPast is written
 * @param maxNoise       three thresholds maxNoise[0..2]; only read by the C path
 *
 * NOTE(review): the MMX2/3DNow path does not use the maxNoise parameter, it
 * compares against the symbol maxTmpNoise (at byte offsets 0, 4 and 8), which is
 * declared outside this chunk -- presumably a global copy of the same thresholds; confirm.
 * NOTE(review): the asm path stores the smoothed value ((4*d + neighbours + 4)>>3)
 * into *tempBluredPast, while the C path stores the unsmoothed sum (i). The two
 * paths also differ when d is exactly equal to a threshold (asm uses unsigned
 * "below", C uses ">" / "<"). Confirm which behaviour is intended.
 */
2328 static void inline tempNoiseReducer(uint8_t *src, int stride,
2329                                     uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2330 {
// FAST_L2_DIFF selects the second of the three difference kernels in the asm path below
2331 #define FAST_L2_DIFF
2332 //#define L1_DIFF //u should change the thresholds too if u try that one
2333 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        // operands: %0 = src, %1 = tempBlured, %2 = stride, %3 = tempBluredPast (memory operand)
2334         asm volatile(
2335                 "leal (%2, %2, 2), %%eax                        \n\t" // 3*stride
2336                 "leal (%2, %2, 4), %%ebx                        \n\t" // 5*stride
2337                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2338 //      0       1       2       3       4       5       6       7       8       9
2339 //      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+ebx  %x+2eax %x+ecx  %x+8%2
2340 //FIXME reorder?
// Variant 1 (disabled unless L1_DIFF is defined): sum of absolute differences of the 8 rows, accumulated in mm0
2341 #ifdef L1_DIFF //needs mmx2
2342                 "movq (%0), %%mm0                               \n\t" // L0
2343                 "psadbw (%1), %%mm0                             \n\t" // |L0-R0|
2344                 "movq (%0, %2), %%mm1                           \n\t" // L1
2345                 "psadbw (%1, %2), %%mm1                         \n\t" // |L1-R1|
2346                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2347                 "psadbw (%1, %2, 2), %%mm2                      \n\t" // |L2-R2|
2348                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2349                 "psadbw (%1, %%eax), %%mm3                      \n\t" // |L3-R3|
2350
2351                 "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2352                 "paddw %%mm1, %%mm0                             \n\t"
2353                 "psadbw (%1, %2, 4), %%mm4                      \n\t" // |L4-R4|
2354                 "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2355                 "paddw %%mm2, %%mm0                             \n\t"
2356                 "psadbw (%1, %%ebx), %%mm5                      \n\t" // |L5-R5|
2357                 "movq (%0, %%eax, 2), %%mm6                     \n\t" // L6
2358                 "paddw %%mm3, %%mm0                             \n\t"
2359                 "psadbw (%1, %%eax, 2), %%mm6                   \n\t" // |L6-R6|
2360                 "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2361                 "paddw %%mm4, %%mm0                             \n\t"
2362                 "psadbw (%1, %%ecx), %%mm7                      \n\t" // |L7-R7|
2363                 "paddw %%mm5, %%mm6                             \n\t"
2364                 "paddw %%mm7, %%mm6                             \n\t"
2365                 "paddw %%mm6, %%mm0                             \n\t"
// Variant 2 (the one enabled by FAST_L2_DIFF above): byte-wise kernel using PAVGB with the
// complemented reference and the constant b80 (both PAVGB and b80 are defined outside this chunk).
// NOTE(review): presumably an approximate squared difference computed on halved differences -- confirm.
2366 #elif defined (FAST_L2_DIFF)
2367                 "pcmpeqb %%mm7, %%mm7                           \n\t"
2368                 "movq b80, %%mm6                                \n\t"
2369                 "pxor %%mm0, %%mm0                              \n\t"
2370 #define L2_DIFF_CORE(a, b)\
2371                 "movq " #a ", %%mm5                             \n\t"\
2372                 "movq " #b ", %%mm2                             \n\t"\
2373                 "pxor %%mm7, %%mm2                              \n\t"\
2374                 PAVGB(%%mm2, %%mm5)\
2375                 "paddb %%mm6, %%mm5                             \n\t"\
2376                 "movq %%mm5, %%mm2                              \n\t"\
2377                 "psllw $8, %%mm5                                \n\t"\
2378                 "pmaddwd %%mm5, %%mm5                           \n\t"\
2379                 "pmaddwd %%mm2, %%mm2                           \n\t"\
2380                 "paddd %%mm2, %%mm5                             \n\t"\
2381                 "psrld $14, %%mm5                               \n\t"\
2382                 "paddd %%mm5, %%mm0                             \n\t"
2383
2384 L2_DIFF_CORE((%0), (%1))
2385 L2_DIFF_CORE((%0, %2), (%1, %2))
2386 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2387 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2388 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2389 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2390 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2391 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2392
// Variant 3: unpack both rows to 16 bit words, subtract, and accumulate the squared differences (pmaddwd) in mm0
2393 #else
2394                 "pxor %%mm7, %%mm7                              \n\t"
2395                 "pxor %%mm0, %%mm0                              \n\t"
2396 #define L2_DIFF_CORE(a, b)\
2397                 "movq " #a ", %%mm5                             \n\t"\
2398                 "movq " #b ", %%mm2                             \n\t"\
2399                 "movq %%mm5, %%mm1                              \n\t"\
2400                 "movq %%mm2, %%mm3                              \n\t"\
2401                 "punpcklbw %%mm7, %%mm5                         \n\t"\
2402                 "punpckhbw %%mm7, %%mm1                         \n\t"\
2403                 "punpcklbw %%mm7, %%mm2                         \n\t"\
2404                 "punpckhbw %%mm7, %%mm3                         \n\t"\
2405                 "psubw %%mm2, %%mm5                             \n\t"\
2406                 "psubw %%mm3, %%mm1                             \n\t"\
2407                 "pmaddwd %%mm5, %%mm5                           \n\t"\
2408                 "pmaddwd %%mm1, %%mm1                           \n\t"\
2409                 "paddd %%mm1, %%mm5                             \n\t"\
2410                 "paddd %%mm5, %%mm0                             \n\t"
2411
2412 L2_DIFF_CORE((%0), (%1))
2413 L2_DIFF_CORE((%0, %2), (%1, %2))
2414 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2415 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2416 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2417 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
2418 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2419 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2420
2421 #endif
2422
                // add the two dwords of mm0 -> ecx, then ecx = (4*ecx + past[-1] + past[+1]
                // + past[-256] + past[+256] + 4) >> 3, where past = tempBluredPast
                // (byte offsets 4 and 1024 correspond to 1 and 256 uint32_t entries);
                // the result is written back to *tempBluredPast. ebx is used as the pointer
                // here, so 5*stride is recomputed into ebx afterwards.
2423                 "movq %%mm0, %%mm4                              \n\t"
2424                 "psrlq $32, %%mm0                               \n\t"
2425                 "paddd %%mm0, %%mm4                             \n\t"
2426                 "movd %%mm4, %%ecx                              \n\t"
2427                 "shll $2, %%ecx                                 \n\t"
2428                 "movl %3, %%ebx                                 \n\t"
2429                 "addl -4(%%ebx), %%ecx                          \n\t"
2430                 "addl 4(%%ebx), %%ecx                           \n\t"
2431                 "addl -1024(%%ebx), %%ecx                       \n\t"
2432                 "addl $4, %%ecx                                 \n\t"
2433                 "addl 1024(%%ebx), %%ecx                        \n\t"
2434                 "shrl $3, %%ecx                                 \n\t"
2435                 "movl %%ecx, (%%ebx)                            \n\t"
2436                 "leal (%%eax, %2, 2), %%ebx                     \n\t" // 5*stride
2437
2438 //              "movl %3, %%ecx                         \n\t"
2439 //              "movl %%ecx, test                               \n\t"
2440 //              "jmp 4f \n\t"
                // dispatch on ecx (unsigned compares):
                //   ecx <  maxTmpNoise[1] -> label 2 (which goes on to label 3 if ecx < maxTmpNoise[0])
                //   ecx <  maxTmpNoise[2] -> label 1
                //   otherwise             -> fall through
2441                 "cmpl 4+maxTmpNoise, %%ecx                      \n\t"
2442                 " jb 2f                                         \n\t"
2443                 "cmpl 8+maxTmpNoise, %%ecx                      \n\t"
2444                 " jb 1f                                         \n\t"
2445
                // ecx >= maxTmpNoise[2]: copy the 8 rows of src into tempBlured, src is left unchanged
                // (ecx held the difference value until here and is now reloaded with 7*stride)
2446                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2447                 "movq (%0), %%mm0                               \n\t" // L0
2448                 "movq (%0, %2), %%mm1                           \n\t" // L1
2449                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2450                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2451                 "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2452                 "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2453                 "movq (%0, %%eax, 2), %%mm6                     \n\t" // L6
2454                 "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2455                 "movq %%mm0, (%1)                               \n\t" // L0
2456                 "movq %%mm1, (%1, %2)                           \n\t" // L1
2457                 "movq %%mm2, (%1, %2, 2)                        \n\t" // L2
2458                 "movq %%mm3, (%1, %%eax)                        \n\t" // L3
2459                 "movq %%mm4, (%1, %2, 4)                        \n\t" // L4
2460                 "movq %%mm5, (%1, %%ebx)                        \n\t" // L5
2461                 "movq %%mm6, (%1, %%eax, 2)                     \n\t" // L6
2462                 "movq %%mm7, (%1, %%ecx)                        \n\t" // L7
2463                 "jmp 4f                                         \n\t"
2464
                // label 1: maxTmpNoise[1] <= ecx < maxTmpNoise[2]:
                // byte-wise average (pavgb) of src and tempBlured, written to both
2465                 "1:                                             \n\t"
2466                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2467                 "movq (%0), %%mm0                               \n\t" // L0
2468                 "pavgb (%1), %%mm0                              \n\t" // L0
2469                 "movq (%0, %2), %%mm1                           \n\t" // L1
2470                 "pavgb (%1, %2), %%mm1                          \n\t" // L1
2471                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2472                 "pavgb (%1, %2, 2), %%mm2                       \n\t" // L2
2473                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2474                 "pavgb (%1, %%eax), %%mm3                       \n\t" // L3
2475                 "movq (%0, %2, 4), %%mm4                        \n\t" // L4
2476                 "pavgb (%1, %2, 4), %%mm4                       \n\t" // L4
2477                 "movq (%0, %%ebx), %%mm5                        \n\t" // L5
2478                 "pavgb (%1, %%ebx), %%mm5                       \n\t" // L5
2479                 "movq (%0, %%eax, 2), %%mm6                     \n\t" // L6
2480                 "pavgb (%1, %%eax, 2), %%mm6                    \n\t" // L6
2481                 "movq (%0, %%ecx), %%mm7                        \n\t" // L7
2482                 "pavgb (%1, %%ecx), %%mm7                       \n\t" // L7
2483                 "movq %%mm0, (%1)                               \n\t" // R0
2484                 "movq %%mm1, (%1, %2)                           \n\t" // R1
2485                 "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2486                 "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2487                 "movq %%mm4, (%1, %2, 4)                        \n\t" // R4
2488                 "movq %%mm5, (%1, %%ebx)                        \n\t" // R5
2489                 "movq %%mm6, (%1, %%eax, 2)                     \n\t" // R6
2490                 "movq %%mm7, (%1, %%ecx)                        \n\t" // R7
2491                 "movq %%mm0, (%0)                               \n\t" // L0
2492                 "movq %%mm1, (%0, %2)                           \n\t" // L1
2493                 "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2494                 "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2495                 "movq %%mm4, (%0, %2, 4)                        \n\t" // L4
2496                 "movq %%mm5, (%0, %%ebx)                        \n\t" // L5
2497                 "movq %%mm6, (%0, %%eax, 2)                     \n\t" // L6
2498                 "movq %%mm7, (%0, %%ecx)                        \n\t" // L7
2499                 "jmp 4f                                         \n\t"
2500
                // label 2: ecx < maxTmpNoise[1]; if also ecx < maxTmpNoise[0] continue at label 3.
                // Otherwise PAVGB is applied twice per row with the tempBlured row (R) as first argument,
                // rows 0-3 first and then rows 4-7; results go to both buffers.
                // NOTE(review): presumably yields about (3*R + L)/4, matching the C path -- PAVGB is
                // defined outside this chunk, confirm its operand order.
2501                 "2:                                             \n\t"
2502                 "cmpl maxTmpNoise, %%ecx                        \n\t"
2503                 " jb 3f                                         \n\t"
2504
2505                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2506                 "movq (%0), %%mm0                               \n\t" // L0
2507                 "movq (%0, %2), %%mm1                           \n\t" // L1
2508                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2509                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2510                 "movq (%1), %%mm4                               \n\t" // R0
2511                 "movq (%1, %2), %%mm5                           \n\t" // R1
2512                 "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2513                 "movq (%1, %%eax), %%mm7                        \n\t" // R3
2514                 PAVGB(%%mm4, %%mm0)
2515                 PAVGB(%%mm5, %%mm1)
2516                 PAVGB(%%mm6, %%mm2)
2517                 PAVGB(%%mm7, %%mm3)
2518                 PAVGB(%%mm4, %%mm0)
2519                 PAVGB(%%mm5, %%mm1)
2520                 PAVGB(%%mm6, %%mm2)
2521                 PAVGB(%%mm7, %%mm3)
2522                 "movq %%mm0, (%1)                               \n\t" // R0
2523                 "movq %%mm1, (%1, %2)                           \n\t" // R1
2524                 "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2525                 "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2526                 "movq %%mm0, (%0)                               \n\t" // L0
2527                 "movq %%mm1, (%0, %2)                           \n\t" // L1
2528                 "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2529                 "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2530
2531                 "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2532                 "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2533                 "movq (%0, %%eax, 2), %%mm2                     \n\t" // L6
2534                 "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2535                 "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2536                 "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2537                 "movq (%1, %%eax, 2), %%mm6                     \n\t" // R6
2538                 "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2539                 PAVGB(%%mm4, %%mm0)
2540                 PAVGB(%%mm5, %%mm1)
2541                 PAVGB(%%mm6, %%mm2)
2542                 PAVGB(%%mm7, %%mm3)
2543                 PAVGB(%%mm4, %%mm0)
2544                 PAVGB(%%mm5, %%mm1)
2545                 PAVGB(%%mm6, %%mm2)
2546                 PAVGB(%%mm7, %%mm3)
2547                 "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2548                 "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2549                 "movq %%mm2, (%1, %%eax, 2)                     \n\t" // R6
2550                 "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2551                 "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2552                 "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2553                 "movq %%mm2, (%0, %%eax, 2)                     \n\t" // L6
2554                 "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2555                 "jmp 4f                                         \n\t"
2556
                // label 3: ecx < maxTmpNoise[0]: same structure as above but PAVGB is applied
                // three times per row. NOTE(review): presumably about (7*R + L)/8 -- confirm.
2557                 "3:                                             \n\t"
2558                 "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
2559                 "movq (%0), %%mm0                               \n\t" // L0
2560                 "movq (%0, %2), %%mm1                           \n\t" // L1
2561                 "movq (%0, %2, 2), %%mm2                        \n\t" // L2
2562                 "movq (%0, %%eax), %%mm3                        \n\t" // L3
2563                 "movq (%1), %%mm4                               \n\t" // R0
2564                 "movq (%1, %2), %%mm5                           \n\t" // R1
2565                 "movq (%1, %2, 2), %%mm6                        \n\t" // R2
2566                 "movq (%1, %%eax), %%mm7                        \n\t" // R3
2567                 PAVGB(%%mm4, %%mm0)
2568                 PAVGB(%%mm5, %%mm1)
2569                 PAVGB(%%mm6, %%mm2)
2570                 PAVGB(%%mm7, %%mm3)
2571                 PAVGB(%%mm4, %%mm0)
2572                 PAVGB(%%mm5, %%mm1)
2573                 PAVGB(%%mm6, %%mm2)
2574                 PAVGB(%%mm7, %%mm3)
2575                 PAVGB(%%mm4, %%mm0)
2576                 PAVGB(%%mm5, %%mm1)
2577                 PAVGB(%%mm6, %%mm2)
2578                 PAVGB(%%mm7, %%mm3)
2579                 "movq %%mm0, (%1)                               \n\t" // R0
2580                 "movq %%mm1, (%1, %2)                           \n\t" // R1
2581                 "movq %%mm2, (%1, %2, 2)                        \n\t" // R2
2582                 "movq %%mm3, (%1, %%eax)                        \n\t" // R3
2583                 "movq %%mm0, (%0)                               \n\t" // L0
2584                 "movq %%mm1, (%0, %2)                           \n\t" // L1
2585                 "movq %%mm2, (%0, %2, 2)                        \n\t" // L2
2586                 "movq %%mm3, (%0, %%eax)                        \n\t" // L3
2587
2588                 "movq (%0, %2, 4), %%mm0                        \n\t" // L4
2589                 "movq (%0, %%ebx), %%mm1                        \n\t" // L5
2590                 "movq (%0, %%eax, 2), %%mm2                     \n\t" // L6
2591                 "movq (%0, %%ecx), %%mm3                        \n\t" // L7
2592                 "movq (%1, %2, 4), %%mm4                        \n\t" // R4
2593                 "movq (%1, %%ebx), %%mm5                        \n\t" // R5
2594                 "movq (%1, %%eax, 2), %%mm6                     \n\t" // R6
2595                 "movq (%1, %%ecx), %%mm7                        \n\t" // R7
2596                 PAVGB(%%mm4, %%mm0)
2597                 PAVGB(%%mm5, %%mm1)
2598                 PAVGB(%%mm6, %%mm2)
2599                 PAVGB(%%mm7, %%mm3)
2600                 PAVGB(%%mm4, %%mm0)
2601                 PAVGB(%%mm5, %%mm1)
2602                 PAVGB(%%mm6, %%mm2)
2603                 PAVGB(%%mm7, %%mm3)
2604                 PAVGB(%%mm4, %%mm0)
2605                 PAVGB(%%mm5, %%mm1)
2606                 PAVGB(%%mm6, %%mm2)
2607                 PAVGB(%%mm7, %%mm3)
2608                 "movq %%mm0, (%1, %2, 4)                        \n\t" // R4
2609                 "movq %%mm1, (%1, %%ebx)                        \n\t" // R5
2610                 "movq %%mm2, (%1, %%eax, 2)                     \n\t" // R6
2611                 "movq %%mm3, (%1, %%ecx)                        \n\t" // R7
2612                 "movq %%mm0, (%0, %2, 4)                        \n\t" // L4
2613                 "movq %%mm1, (%0, %%ebx)                        \n\t" // L5
2614                 "movq %%mm2, (%0, %%eax, 2)                     \n\t" // L6
2615                 "movq %%mm3, (%0, %%ecx)                        \n\t" // L7
2616
                // label 4: common exit of all four cases
2617                 "4:                                             \n\t"
2618
2619                 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2620                 : "%eax", "%ebx", "%ecx", "memory"
2621                 );
2622 //printf("%d\n", test);
2623 #else
        // plain C path
2624         int y;
2625         int d=0;
2626         int sysd=0;
2627         int i;
2628
        // d = sum of squared differences between tempBlured and src over the 8x8 block;
        // sysd accumulates the signed differences but is not read anywhere in this function
2629         for(y=0; y<8; y++)
2630         {
2631                 int x;
2632                 for(x=0; x<8; x++)
2633                 {
2634                         int ref= tempBlured[ x + y*stride ];
2635                         int cur= src[ x + y*stride ];
2636                         int d1=ref - cur;
2637 //                      if(x==0 || x==7) d1+= d1>>1;
2638 //                      if(y==0 || y==7) d1+= d1>>1;
2639 //                      d+= ABS(d1);
2640                         d+= d1*d1;
2641                         sysd+= d1;
2642                 }
2643         }
        // keep the raw sum in i, then mix d with the four neighbouring table entries
        // (-256, -1, +1, +256 elements) using weights 4:1:1:1:1 and rounding; the raw
        // sum, not the mixed value, is what gets stored for this block
2644         i=d;
2645         d=      (
2646                 4*d
2647                 +(*(tempBluredPast-256))
2648                 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2649                 +(*(tempBluredPast+256))
2650                 +4)>>3;
2651         *tempBluredPast=i;
2652 //      ((*tempBluredPast)*3 + d + 2)>>2;
2653
2654 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2655 /*
2656 Switch between
2657  1  0  0  0  0  0  0  (0)
2658 64 32 16  8  4  2  1  (1)
2659 64 48 36 27 20 15 11 (33) (approx)
2660 64 56 49 43 37 33 29 (200) (approx)
2661 */
        // d >  maxNoise[1] and d <  maxNoise[2]: both buffers = (ref + cur + 1)>>1
        // d >  maxNoise[1] and d >= maxNoise[2]: tempBlured = src, src unchanged
        // d <= maxNoise[1] and d <  maxNoise[0]: both buffers = (7*ref + cur + 4)>>3
        // d <= maxNoise[1] and d >= maxNoise[0]: both buffers = (3*ref + cur + 2)>>2
2662         if(d > maxNoise[1])
2663         {
2664                 if(d < maxNoise[2])
2665                 {
2666                         for(y=0; y<8; y++)
2667                         {
2668                                 int x;
2669                                 for(x=0; x<8; x++)
2670                                 {
2671                                         int ref= tempBlured[ x + y*stride ];
2672                                         int cur= src[ x + y*stride ];
2673                                         tempBlured[ x + y*stride ]=
2674                                         src[ x + y*stride ]=
2675                                                 (ref + cur + 1)>>1;
2676                                 }
2677                         }
2678                 }
2679                 else
2680                 {
2681                         for(y=0; y<8; y++)
2682                         {
2683                                 int x;
2684                                 for(x=0; x<8; x++)
2685                                 {
2686                                         tempBlured[ x + y*stride ]= src[ x + y*stride ];
2687                                 }
2688                         }
2689                 }
2690         }
2691         else
2692         {
2693                 if(d < maxNoise[0])
2694                 {
2695                         for(y=0; y<8; y++)
2696                         {
2697                                 int x;
2698                                 for(x=0; x<8; x++)
2699                                 {
2700                                         int ref= tempBlured[ x + y*stride ];
2701                                         int cur= src[ x + y*stride ];
2702                                         tempBlured[ x + y*stride ]=
2703                                         src[ x + y*stride ]=
2704                                                 (ref*7 + cur + 4)>>3;
2705                                 }
2706                         }
2707                 }
2708                 else
2709                 {
2710                         for(y=0; y<8; y++)
2711                         {
2712                                 int x;
2713                                 for(x=0; x<8; x++)
2714                                 {
2715                                         int ref= tempBlured[ x + y*stride ];
2716                                         int cur= src[ x + y*stride ];
2717                                         tempBlured[ x + y*stride ]=
2718                                         src[ x + y*stride ]=
2719                                                 (ref*3 + cur + 2)>>2;
2720                                 }
2721                         }
2722                 }
2723         }
2724 #endif
2725 }
2726
// Only compiled when HAVE_ODIVX_POSTPROCESS is defined: pulls in the OpenDivX
// postprocess header and defines the global flag use_old_pp, initialised to 0.
// NOTE(review): presumably a non-zero value selects the old OpenDivX postprocessor -- the
// code that reads the flag is outside this chunk, confirm.
2727 #ifdef HAVE_ODIVX_POSTPROCESS
2728 #include "../opendivx/postprocess.h"
2729 int use_old_pp=0;
2730 #endif
2731
// Forward declaration of the file-local postProcess(); its definition is outside this chunk.
// NOTE(review): presumably filters a single plane from src to dst (isColor selecting the
// chroma settings of ppMode) -- confirm against the definition.
2732 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2733         QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2734
2735 /* -pp Command line Help
2736 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2737
2738 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2739
2740 long form example:
2741 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint         -pp default,-vdeblock
2742 short form example:
2743 -pp vb:a,hb:a,lb                                        -pp de,-vb
2744 more examples:
2745 -pp tn:64:128:256
2746
2747 Filters                 Options
2748 short   long name       short   long option     Description
2749 *       *               a       autoq           cpu power dependent enabler
2750                         c       chrom           chrominance filtering enabled
2751                         y       nochrom         chrominance filtering disabled
2752 hb      hdeblock                                horizontal deblocking filter
2753 vb      vdeblock                                vertical deblocking filter
2754 vr      rkvdeblock
2755 h1      x1hdeblock                              Experimental horizontal deblock filter 1
2756 v1      x1vdeblock                              Experimental vertical deblock filter 1
2757 dr      dering                                  not implemented yet
2758 al      autolevels                              automatic brightness / contrast fixer
2759                         f       fullyrange      stretch luminance range to (0..255)
2760 lb      linblenddeint                           linear blend deinterlacer
2761 li      linipoldeint                            linear interpolating deinterlacer
2762 ci      cubicipoldeint                          cubic interpolating deinterlacer
2763 md      mediandeint                             median deinterlacer
2764 de      default                                 hdeblock:a,vdeblock:a,dering:a,autolevels
2765 fa      fast                                    x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2766 tn      tmpnoise        (3 Thresholds)          Temporal Noise Reducer
2767 */
2768
2769 /**
2770  * returns a PPMode struct which will have a non 0 error variable if an error occured
2771  * name is the string after "-pp" on the command line
2772  * quality is a number from 0 to GET_PP_QUALITY_MAX
2773  */
2774 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2775 {
2776         char temp[GET_MODE_BUFFER_SIZE];
2777         char *p= temp;
2778         char *filterDelimiters= ",";
2779         char *optionDelimiters= ":";
2780         struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
2781         char *filterToken;
2782
2783         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2784
2785         printf("%s\n", name);
2786
2787         for(;;){
2788                 char *filterName;
2789                 int q= 1000000; //GET_PP_QUALITY_MAX;
2790                 int chrom=-1;
2791                 char *option;
2792                 char *options[OPTIONS_ARRAY_SIZE];
2793                 int i;
2794                 int filterNameOk=0;
2795                 int numOfUnknownOptions=0;
2796                 int enable=1; //does the user want us to enabled or disabled the filter
2797
2798                 filterToken= strtok(p, filterDelimiters);
2799                 if(filterToken == NULL) break;
2800                 p+= strlen(filterToken) + 1; // p points to next filterToken
2801                 filterName= strtok(filterToken, optionDelimiters);
2802                 printf("%s::%s\n", filterToken, filterName);
2803
2804                 if(*filterName == '-')
2805                 {
2806                         enable=0;
2807                         filterName++;
2808                 }
2809
2810                 for(;;){ //for all options
2811                         option= strtok(NULL, optionDelimiters);
2812                         if(option == NULL) break;
2813
2814                         printf("%s\n", option);
2815                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2816                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2817                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2818                         else
2819                         {
2820                                 options[numOfUnknownOptions] = option;
2821                                 numOfUnknownOptions++;
2822                         }
2823                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2824                 }
2825                 options[numOfUnknownOptions] = NULL;
2826
2827                 /* replace stuff from the replace Table */
2828                 for(i=0; replaceTable[2*i]!=NULL; i++)
2829                 {
2830                         if(!strcmp(replaceTable[2*i], filterName))
2831                         {
2832                                 int newlen= strlen(replaceTable[2*i + 1]);
2833                                 int plen;
2834                                 int spaceLeft;
2835
2836                                 if(p==NULL) p= temp, *p=0;      //last filter
2837                                 else p--, *p=',';               //not last filter
2838
2839                                 plen= strlen(p);
2840                                 spaceLeft= (int)p - (int)temp + plen;
2841                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
2842                                 {
2843                                         ppMode.error++;
2844                                         break;
2845                                 }
2846                                 memmove(p + newlen, p, plen+1);
2847                                 memcpy(p, replaceTable[2*i + 1], newlen);
2848                                 filterNameOk=1;
2849                         }
2850                 }
2851
2852                 for(i=0; filters[i].shortName!=NULL; i++)
2853                 {
2854 //                      printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
2855                         if(   !strcmp(filters[i].longName, filterName)
2856                            || !strcmp(filters[i].shortName, filterName))
2857                         {
2858                                 ppMode.lumMode &= ~filters[i].mask;
2859                                 ppMode.chromMode &= ~filters[i].mask;
2860
2861                                 filterNameOk=1;
2862                                 if(!enable) break; // user wants to disable it
2863
2864                                 if(q >= filters[i].minLumQuality)
2865                                         ppMode.lumMode|= filters[i].mask;
2866                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2867                                         if(q >= filters[i].minChromQuality)
2868                                                 ppMode.chromMode|= filters[i].mask;
2869
2870                                 if(filters[i].mask == LEVEL_FIX)
2871                                 {
2872                                         int o;
2873                                         ppMode.minAllowedY= 16;
2874                                         ppMode.maxAllowedY= 234;
2875                                         for(o=0; options[o]!=NULL; o++)
2876                                                 if(  !strcmp(options[o],"fullyrange")
2877                                                    ||!strcmp(options[o],"f"))
2878                                                 {
2879                                                         ppMode.minAllowedY= 0;
2880                                                         ppMode.maxAllowedY= 255;
2881                                                         numOfUnknownOptions--;
2882                                                 }
2883                                 }
2884                                 else if(filters[i].mask == TEMP_NOISE_FILTER)
2885                                 {
2886                                         int o;
2887                                         int numOfNoises=0;
2888                                         ppMode.maxTmpNoise[0]= 150;
2889                                         ppMode.maxTmpNoise[1]= 200;
2890                                         ppMode.maxTmpNoise[2]= 400;
2891
2892                                         for(o=0; options[o]!=NULL; o++)
2893                                         {
2894                                                 char *tail;
2895                                                 ppMode.maxTmpNoise[numOfNoises]=
2896                                                         strtol(options[o], &tail, 0);
2897                                                 if(tail!=options[o])
2898                                                 {
2899                                                         numOfNoises++;
2900                                                         numOfUnknownOptions--;
2901                                                         if(numOfNoises >= 3) break;
2902                                                 }
2903                                         }
2904                                 }
2905                         }
2906                 }
2907                 if(!filterNameOk) ppMode.error++;
2908                 ppMode.error += numOfUnknownOptions;
2909         }
2910
2911 #ifdef HAVE_ODIVX_POSTPROCESS
2912         if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2913         if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2914         if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2915         if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2916         if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2917         if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2918 #endif
2919
2920         return ppMode;
2921 }
2922
2923 /**
2924  * Obsolete, dont use it, use postprocess2() instead
2925  */
2926 void  postprocess(unsigned char * src[], int src_stride,
2927                  unsigned char * dst[], int dst_stride,
2928                  int horizontal_size,   int vertical_size,
2929                  QP_STORE_T *QP_store,  int QP_stride,
2930                                           int mode)
2931 {
2932         struct PPMode ppMode;
2933         static QP_STORE_T zeroArray[2048/8];
2934 /*
2935         static int qual=0;
2936
2937         ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
2938         printf("OK\n");
2939         qual++;
2940         qual%=7;
2941         printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
2942                 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
2943         postprocess2(src, src_stride, dst, dst_stride,
2944                  horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2945
2946         return;
2947 */
2948         if(QP_store==NULL)
2949         {
2950                 QP_store= zeroArray;
2951                 QP_stride= 0;
2952         }
2953
2954         ppMode.lumMode= mode;
2955         mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2956         ppMode.chromMode= mode;
2957         ppMode.maxTmpNoise[0]= 700;
2958         ppMode.maxTmpNoise[1]= 1500;
2959         ppMode.maxTmpNoise[2]= 3000;
2960
2961 #ifdef HAVE_ODIVX_POSTPROCESS
2962 // Note: I could make this shit outside of this file, but it would mean one
2963 // more function call...
2964         if(use_old_pp){
2965             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2966             return;
2967         }
2968 #endif
2969
2970         postProcess(src[0], src_stride, dst[0], dst_stride,
2971                 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
2972
2973         horizontal_size >>= 1;
2974         vertical_size   >>= 1;
2975         src_stride      >>= 1;
2976         dst_stride      >>= 1;
2977
2978         if(1)
2979         {
2980                 postProcess(src[1], src_stride, dst[1], dst_stride,
2981                         horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
2982                 postProcess(src[2], src_stride, dst[2], dst_stride,
2983                         horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
2984         }
2985         else
2986         {
2987                 memset(dst[1], 128, dst_stride*vertical_size);
2988                 memset(dst[2], 128, dst_stride*vertical_size);
2989 //              memcpy(dst[1], src[1], src_stride*horizontal_size);
2990 //              memcpy(dst[2], src[2], src_stride*horizontal_size);
2991         }
2992 }
2993
2994 void  postprocess2(unsigned char * src[], int src_stride,
2995                  unsigned char * dst[], int dst_stride,
2996                  int horizontal_size,   int vertical_size,
2997                  QP_STORE_T *QP_store,  int QP_stride,
2998                  struct PPMode *mode)
2999 {
3000
3001         static QP_STORE_T zeroArray[2048/8];
3002         if(QP_store==NULL)
3003         {
3004                 QP_store= zeroArray;
3005                 QP_stride= 0;
3006         }
3007
3008 #ifdef HAVE_ODIVX_POSTPROCESS
3009 // Note: I could make this shit outside of this file, but it would mean one
3010 // more function call...
3011         if(use_old_pp){
3012             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
3013             mode->oldMode);
3014             return;
3015         }
3016 #endif
3017
3018         postProcess(src[0], src_stride, dst[0], dst_stride,
3019                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
3020
3021         horizontal_size >>= 1;
3022         vertical_size   >>= 1;
3023         src_stride      >>= 1;
3024         dst_stride      >>= 1;
3025
3026         postProcess(src[1], src_stride, dst[1], dst_stride,
3027                 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3028         postProcess(src[2], src_stride, dst[2], dst_stride,
3029                 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
3030 }
3031
3032
3033 /**
3034  * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
3035  * 0 <= quality <= 6
3036  */
3037 int getPpModeForQuality(int quality){
3038         int modes[1+GET_PP_QUALITY_MAX]= {
3039                 0,
3040 #if 1
3041                 // horizontal filters first
3042                 LUM_H_DEBLOCK,
3043                 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3044                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3045                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3046                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3047                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
3048 #else
3049                 // vertical filters first
3050                 LUM_V_DEBLOCK,
3051                 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3052                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3053                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3054                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3055                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
3056 #endif
3057         };
3058
3059 #ifdef HAVE_ODIVX_POSTPROCESS
3060         int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3061                 0,
3062                 PP_DEBLOCK_Y_H,
3063                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3064                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3065                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3066                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3067                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3068         };
3069         if(use_old_pp) return odivx_modes[quality];
3070 #endif
3071         return modes[quality];
3072 }
3073
3074 /**
3075  * Copies a block from src to dst and fixes the blacklevel
3076  * numLines must be a multiple of 4
3077  * levelFix == 0 -> dont touch the brighness & contrast
3078  */
3079 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
3080         int levelFix)
3081 {
3082 #ifndef HAVE_MMX
3083         int i;
3084 #endif
3085         if(levelFix)
3086         {
3087 #ifdef HAVE_MMX
3088                                         asm volatile(
3089                                                 "leal (%0,%2), %%eax    \n\t"
3090                                                 "leal (%1,%3), %%ebx    \n\t"
3091                                                 "movq packedYOffset, %%mm2      \n\t"
3092                                                 "movq packedYScale, %%mm3       \n\t"
3093                                                 "pxor %%mm4, %%mm4      \n\t"
3094
3095 #define SCALED_CPY(src1, src2, dst1, dst2)                                      \
3096                                                 "movq " #src1 ", %%mm0  \n\t"\
3097                                                 "movq " #src1 ", %%mm5  \n\t"\
3098                                                 "punpcklbw %%mm4, %%mm0 \n\t"\
3099                                                 "punpckhbw %%mm4, %%mm5 \n\t"\
3100                                                 "psubw %%mm2, %%mm0     \n\t"\
3101                                                 "psubw %%mm2, %%mm5     \n\t"\
3102                                                 "movq " #src2 ", %%mm1  \n\t"\
3103                                                 "psllw $6, %%mm0        \n\t"\
3104                                                 "psllw $6, %%mm5        \n\t"\
3105                                                 "pmulhw %%mm3, %%mm0    \n\t"\
3106                                                 "movq " #src2 ", %%mm6  \n\t"\
3107                                                 "pmulhw %%mm3, %%mm5    \n\t"\
3108                                                 "punpcklbw %%mm4, %%mm1 \n\t"\
3109                                                 "punpckhbw %%mm4, %%mm6 \n\t"\
3110                                                 "psubw %%mm2, %%mm1     \n\t"\
3111                                                 "psubw %%mm2, %%mm6     \n\t"\
3112                                                 "psllw $6, %%mm1        \n\t"\
3113                                                 "psllw $6, %%mm6        \n\t"\
3114                                                 "pmulhw %%mm3, %%mm1    \n\t"\
3115                                                 "pmulhw %%mm3, %%mm6    \n\t"\
3116                                                 "packuswb %%mm5, %%mm0  \n\t"\
3117                                                 "packuswb %%mm6, %%mm1  \n\t"\
3118                                                 "movq %%mm0, " #dst1 "  \n\t"\
3119                                                 "movq %%mm1, " #dst2 "  \n\t"\
3120
3121 SCALED_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
3122 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3123 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3124                                                 "leal (%%eax,%2,4), %%eax       \n\t"
3125                                                 "leal (%%ebx,%3,4), %%ebx       \n\t"
3126 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3127
3128
3129                                                 : : "r"(src),
3130                                                 "r"(dst),
3131                                                 "r" (srcStride),
3132                                                 "r" (dstStride)
3133                                                 : "%eax", "%ebx"
3134                                         );
3135 #else
3136                                 for(i=0; i<8; i++)
3137                                         memcpy( &(dst[dstStride*i]),
3138                                                 &(src[srcStride*i]), BLOCK_SIZE);
3139 #endif
3140         }
3141         else
3142         {
3143 #ifdef HAVE_MMX
3144                                         asm volatile(
3145                                                 "leal (%0,%2), %%eax    \n\t"
3146                                                 "leal (%1,%3), %%ebx    \n\t"
3147
3148 #define SIMPLE_CPY(src1, src2, dst1, dst2)                              \
3149                                                 "movq " #src1 ", %%mm0  \n\t"\
3150                                                 "movq " #src2 ", %%mm1  \n\t"\
3151                                                 "movq %%mm0, " #dst1 "  \n\t"\
3152                                                 "movq %%mm1, " #dst2 "  \n\t"\
3153
3154 SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
3155 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
3156 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
3157                                                 "leal (%%eax,%2,4), %%eax       \n\t"
3158                                                 "leal (%%ebx,%3,4), %%ebx       \n\t"
3159 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
3160
3161                                                 : : "r" (src),
3162                                                 "r" (dst),
3163                                                 "r" (srcStride),
3164                                                 "r" (dstStride)
3165                                                 : "%eax", "%ebx"
3166                                         );
3167 #else
3168                                 for(i=0; i<8; i++)
3169                                         memcpy( &(dst[dstStride*i]),
3170                                                 &(src[srcStride*i]), BLOCK_SIZE);
3171 #endif
3172         }
3173 }
3174
3175
3176 /**
3177  * Filters array of bytes (Y or U or V values)
3178  */
3179 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3180         QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3181 {
3182         int x,y;
3183         const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3184
3185         /* we need 64bit here otherwise we´ll going to have a problem
3186            after watching a black picture for 5 hours*/
3187         static uint64_t *yHistogram= NULL;
3188         int black=0, white=255; // blackest black and whitest white in the picture
3189         int QPCorrecture= 256;
3190
3191         /* Temporary buffers for handling the last row(s) */
3192         static uint8_t *tempDst= NULL;
3193         static uint8_t *tempSrc= NULL;
3194
3195         /* Temporary buffers for handling the last block */
3196         static uint8_t *tempDstBlock= NULL;
3197         static uint8_t *tempSrcBlock= NULL;
3198
3199         /* Temporal noise reducing buffers */
3200         static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
3201         static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
3202
3203         int copyAhead;
3204
3205 #ifdef PP_FUNNY_STRIDE
3206         uint8_t *dstBlockPtrBackup;
3207         uint8_t *srcBlockPtrBackup;
3208 #endif
3209
3210 #ifdef MORE_TIMING
3211         long long T0, T1, diffTime=0;
3212 #endif
3213 #ifdef TIMING
3214         long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3215         sumTime= rdtsc();
3216 #endif
3217 //mode= 0x7F;
3218 #ifdef HAVE_MMX
3219         maxTmpNoise[0]= ppMode->maxTmpNoise[0];
3220         maxTmpNoise[1]= ppMode->maxTmpNoise[1];
3221         maxTmpNoise[2]= ppMode->maxTmpNoise[2];
3222 #endif
3223
3224         if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3225         else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
3226         else if(   (mode & V_DEBLOCK)
3227                 || (mode & LINEAR_IPOL_DEINT_FILTER)
3228                 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
3229         else if(mode & V_X1_FILTER) copyAhead=11;
3230         else if(mode & V_RK1_FILTER) copyAhead=10;
3231         else if(mode & DERING) copyAhead=9;
3232         else copyAhead=8;
3233
3234         copyAhead-= 8;
3235
3236         if(tempDst==NULL)
3237         {
3238                 tempDst= (uint8_t*)memalign(8, 1024*24);
3239                 tempSrc= (uint8_t*)memalign(8, 1024*24);
3240                 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3241                 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3242         }
3243
3244         if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
3245         {
3246 //              printf("%d %d %d\n", isColor, dstStride, height);
3247                 //FIXME works only as long as the size doesnt increase
3248                 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
3249                 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
3250                 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
3251
3252                 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
3253                 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
3254         }
3255
3256         if(!yHistogram)
3257         {
3258                 int i;
3259                 yHistogram= (uint64_t*)malloc(8*256);
3260                 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3261
3262                 if(mode & FULL_Y_RANGE)
3263                 {
3264                         maxAllowedY=255;
3265                         minAllowedY=0;
3266                 }
3267         }
3268
3269         if(!isColor)
3270         {
3271                 uint64_t sum= 0;
3272                 int i;
3273                 static int framenum= -1;
3274                 uint64_t maxClipped;
3275                 uint64_t clipped;
3276                 double scale;
3277
3278                 framenum++;
3279                 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3280
3281                 for(i=0; i<256; i++)
3282                 {
3283                         sum+= yHistogram[i];
3284 //                      printf("%d ", yHistogram[i]);
3285                 }
3286 //              printf("\n\n");
3287
3288                 /* we allways get a completly black picture first */
3289                 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3290
3291                 clipped= sum;
3292                 for(black=255; black>0; black--)
3293                 {
3294                         if(clipped < maxClipped) break;
3295                         clipped-= yHistogram[black];
3296                 }
3297
3298                 clipped= sum;
3299                 for(white=0; white<256; white++)
3300                 {
3301                         if(clipped < maxClipped) break;
3302                         clipped-= yHistogram[white];
3303                 }
3304
3305                 packedYOffset= (black - minAllowedY) & 0xFFFF;
3306                 packedYOffset|= packedYOffset<<32;
3307                 packedYOffset|= packedYOffset<<16;
3308
3309                 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3310
3311                 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3312                 packedYScale|= packedYScale<<32;
3313                 packedYScale|= packedYScale<<16;
3314         }
3315         else
3316         {
3317                 packedYScale= 0x0100010001000100LL;
3318                 packedYOffset= 0;
3319         }
3320
3321         if(mode & LEVEL_FIX)    QPCorrecture= packedYScale &0xFFFF;
3322         else                    QPCorrecture= 256;
3323
3324         /* copy & deinterlace first row of blocks */
3325         y=-BLOCK_SIZE;
3326         {
3327                 //1% speedup if these are here instead of the inner loop
3328                 uint8_t *srcBlock= &(src[y*srcStride]);
3329                 uint8_t *dstBlock= &(dst[y*dstStride]);
3330
3331                 dstBlock= tempDst + dstStride;
3332
3333                 // From this point on it is guranteed that we can read and write 16 lines downward
3334                 // finish 1 block before the next otherwise we´ll might have a problem
3335                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3336                 for(x=0; x<width; x+=BLOCK_SIZE)
3337                 {
3338
3339 #ifdef HAVE_MMX2
3340 /*
3341                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3342                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3343                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3344                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3345 */
3346
3347                         asm(
3348                                 "movl %4, %%eax                 \n\t"
3349                                 "shrl $2, %%eax                 \n\t"
3350                                 "andl $6, %%eax                 \n\t"
3351                                 "addl %5, %%eax                 \n\t"
3352                                 "movl %%eax, %%ebx              \n\t"
3353                                 "imul %1, %%eax                 \n\t"
3354                                 "imul %3, %%ebx                 \n\t"
3355                                 "prefetchnta 32(%%eax, %0)      \n\t"
3356                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3357                                 "addl %1, %%eax                 \n\t"
3358                                 "addl %3, %%ebx                 \n\t"
3359                                 "prefetchnta 32(%%eax, %0)      \n\t"
3360                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3361                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3362                         "m" (x), "m" (copyAhead)
3363                         : "%eax", "%ebx"
3364                         );
3365
3366 #elif defined(HAVE_3DNOW)
3367 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3368 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3369                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3370                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3371                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3372 */
3373 #endif
3374
3375                         blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3376                                 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3377
3378                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3379                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3380                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3381                                 deInterlaceBlendLinear(dstBlock, dstStride);
3382                         else if(mode & MEDIAN_DEINT_FILTER)
3383                                 deInterlaceMedian(dstBlock, dstStride);
3384                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3385                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3386 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3387                                 deInterlaceBlendCubic(dstBlock, dstStride);
3388 */
3389                         dstBlock+=8;
3390                         srcBlock+=8;
3391                 }
3392                 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
3393         }
3394
3395         for(y=0; y<height; y+=BLOCK_SIZE)
3396         {
3397                 //1% speedup if these are here instead of the inner loop
3398                 uint8_t *srcBlock= &(src[y*srcStride]);
3399                 uint8_t *dstBlock= &(dst[y*dstStride]);
3400 #ifdef ARCH_X86
3401                 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3402                 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3403                 int QPFrac= QPDelta;
3404                 uint8_t *tempBlock1= tempBlocks;
3405                 uint8_t *tempBlock2= tempBlocks + 8;
3406 #endif
3407                 int QP=0;
3408                 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3409                    if not than use a temporary buffer */
3410                 if(y+15 >= height)
3411                 {
3412                         int i;
3413                         /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3414                            blockcopy to dst later */
3415                         memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3416                                 srcStride*MAX(height-y-copyAhead, 0) );
3417
3418                         /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3419                         for(i=MAX(height-y, 8); i<copyAhead+8; i++)
3420                                 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
3421
3422                         /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3423                         memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
3424
3425                         /* duplicate last line of dst to fill the void upto line (copyAhead) */
3426                         for(i=height-y+1; i<=copyAhead; i++)
3427                                 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
3428
3429                         dstBlock= tempDst + dstStride;
3430                         srcBlock= tempSrc;
3431                 }
3432
3433                 // From this point on it is guranteed that we can read and write 16 lines downward
3434                 // finish 1 block before the next otherwise we´ll might have a problem
3435                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3436                 for(x=0; x<width; x+=BLOCK_SIZE)
3437                 {
3438                         const int stride= dstStride;
3439                         uint8_t *tmpXchg;
3440 #ifdef ARCH_X86
3441                         QP= *QPptr;
3442                         asm volatile(
3443                                 "addl %2, %1            \n\t"
3444                                 "sbbl %%eax, %%eax      \n\t"
3445                                 "shll $2, %%eax         \n\t"
3446                                 "subl %%eax, %0         \n\t"
3447                                 : "+r" (QPptr), "+m" (QPFrac)
3448                                 : "r" (QPDelta)
3449                                 : "%eax"
3450                         );
3451 #else
3452                         QP= isColor ?
3453                                 QPs[(y>>3)*QPStride + (x>>3)]:
3454                                 QPs[(y>>4)*QPStride + (x>>4)];
3455 #endif
3456                         if(!isColor)
3457                         {
3458                                 QP= (QP* QPCorrecture)>>8;
3459                                 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3460                         }
3461 #ifdef HAVE_MMX
3462                         asm volatile(
3463                                 "movd %0, %%mm7                                 \n\t"
3464                                 "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3465                                 "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3466                                 "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
3467                                 "movq %%mm7, pQPb                               \n\t"
3468                                 : : "r" (QP)
3469                         );
3470 #endif
3471
3472 #ifdef MORE_TIMING
3473                         T0= rdtsc();
3474 #endif
3475
3476 #ifdef HAVE_MMX2
3477 /*
3478                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3479                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3480                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3481                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3482 */
3483
3484                         asm(
3485                                 "movl %4, %%eax                 \n\t"
3486                                 "shrl $2, %%eax                 \n\t"
3487                                 "andl $6, %%eax                 \n\t"
3488                                 "addl %5, %%eax                 \n\t"
3489                                 "movl %%eax, %%ebx              \n\t"
3490                                 "imul %1, %%eax                 \n\t"
3491                                 "imul %3, %%ebx                 \n\t"
3492                                 "prefetchnta 32(%%eax, %0)      \n\t"
3493                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3494                                 "addl %1, %%eax                 \n\t"
3495                                 "addl %3, %%ebx                 \n\t"
3496                                 "prefetchnta 32(%%eax, %0)      \n\t"
3497                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3498                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3499                         "m" (x), "m" (copyAhead)
3500                         : "%eax", "%ebx"
3501                         );
3502
3503 #elif defined(HAVE_3DNOW)
3504 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3505 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3506                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3507                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3508                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3509 */
3510 #endif
3511
3512 #ifdef PP_FUNNY_STRIDE
3513                         //can we mess with a 8x16 block, if not use a temp buffer, yes again
3514                         if(x+7 >= width)
3515                         {
3516                                 int i;
3517                                 dstBlockPtrBackup= dstBlock;
3518                                 srcBlockPtrBackup= srcBlock;
3519
3520                                 for(i=0;i<BLOCK_SIZE*2; i++)
3521                                 {
3522                                         memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3523                                         memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3524                                 }
3525
3526                                 dstBlock= tempDstBlock;
3527                                 srcBlock= tempSrcBlock;
3528                         }
3529 #endif
3530
3531                         blockCopy(dstBlock + dstStride*copyAhead, dstStride,
3532                                 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3533
3534                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3535                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3536                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3537                                 deInterlaceBlendLinear(dstBlock, dstStride);
3538                         else if(mode & MEDIAN_DEINT_FILTER)
3539                                 deInterlaceMedian(dstBlock, dstStride);
3540                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3541                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3542 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3543                                 deInterlaceBlendCubic(dstBlock, dstStride);
3544 */
3545
3546                         /* only deblock if we have 2 blocks */
3547                         if(y + 8 < height)
3548                         {
3549 #ifdef MORE_TIMING
3550                                 T1= rdtsc();
3551                                 memcpyTime+= T1-T0;
3552                                 T0=T1;
3553 #endif
3554                                 if(mode & V_RK1_FILTER)
3555                                         vertRK1Filter(dstBlock, stride, QP);
3556                                 else if(mode & V_X1_FILTER)
3557                                         vertX1Filter(dstBlock, stride, QP);
3558                                 else if(mode & V_DEBLOCK)
3559                                 {
3560                                         if( isVertDC(dstBlock, stride))
3561                                         {
3562                                                 if(isVertMinMaxOk(dstBlock, stride, QP))
3563                                                         doVertLowPass(dstBlock, stride, QP);
3564                                         }
3565                                         else
3566                                                 doVertDefFilter(dstBlock, stride, QP);
3567                                 }
3568 #ifdef MORE_TIMING
3569                                 T1= rdtsc();
3570                                 vertTime+= T1-T0;
3571                                 T0=T1;
3572 #endif
3573                         }
3574
3575 #ifdef HAVE_MMX
3576                         transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3577 #endif
3578                         /* check if we have a previous block to deblock it with dstBlock */
3579                         if(x - 8 >= 0)
3580                         {
3581 #ifdef MORE_TIMING
3582                                 T0= rdtsc();
3583 #endif
3584 #ifdef HAVE_MMX
3585                                 if(mode & H_RK1_FILTER)
3586                                         vertRK1Filter(tempBlock1, 16, QP);
3587                                 else if(mode & H_X1_FILTER)
3588                                         vertX1Filter(tempBlock1, 16, QP);
3589                                 else if(mode & H_DEBLOCK)
3590                                 {
3591                                         if( isVertDC(tempBlock1, 16) )
3592                                         {
3593                                                 if(isVertMinMaxOk(tempBlock1, 16, QP))
3594                                                         doVertLowPass(tempBlock1, 16, QP);
3595                                         }
3596                                         else
3597                                                 doVertDefFilter(tempBlock1, 16, QP);
3598                                 }
3599
3600                                 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3601
3602 #else
3603                                 if(mode & H_X1_FILTER)
3604                                         horizX1Filter(dstBlock-4, stride, QP);
3605                                 else if(mode & H_DEBLOCK)
3606                                 {
3607                                         if( isHorizDC(dstBlock-4, stride))
3608                                         {
3609                                                 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3610                                                         doHorizLowPass(dstBlock-4, stride, QP);
3611                                         }
3612                                         else
3613                                                 doHorizDefFilter(dstBlock-4, stride, QP);
3614                                 }
3615 #endif
3616 #ifdef MORE_TIMING
3617                                 T1= rdtsc();
3618                                 horizTime+= T1-T0;
3619                                 T0=T1;
3620 #endif
3621                                 if(mode & DERING)
3622                                 {
3623                                 //FIXME filter first line
3624                                         if(y>0) dering(dstBlock - stride - 8, stride, QP);
3625                                 }
3626
3627                                 if(mode & TEMP_NOISE_FILTER)
3628                                 {
3629                                         tempNoiseReducer(dstBlock-8, stride,
3630                                                 tempBlured[isColor] + y*dstStride + x,
3631                                                 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3632                                                 ppMode->maxTmpNoise);
3633                                 }
3634                         }
3635
3636 #ifdef PP_FUNNY_STRIDE
3637                         /* did we use a tmp-block buffer */
3638                         if(x+7 >= width)
3639                         {
3640                                 int i;
3641                                 dstBlock= dstBlockPtrBackup;
3642                                 srcBlock= srcBlockPtrBackup;
3643
3644                                 for(i=0;i<BLOCK_SIZE*2; i++)
3645                                 {
3646                                         memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3647                                 }
3648                         }
3649 #endif
3650
3651                         dstBlock+=8;
3652                         srcBlock+=8;
3653
3654 #ifdef HAVE_MMX
3655                         tmpXchg= tempBlock1;
3656                         tempBlock1= tempBlock2;
3657                         tempBlock2 = tmpXchg;
3658 #endif
3659                 }
3660
3661                 if(mode & DERING)
3662                 {
3663                                 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP);
3664                 }
3665
3666                 if((mode & TEMP_NOISE_FILTER))
3667                 {
3668                         tempNoiseReducer(dstBlock-8, dstStride,
3669                                 tempBlured[isColor] + y*dstStride + x,
3670                                 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3671                                 ppMode->maxTmpNoise);
3672                 }
3673
3674                 /* did we use a tmp buffer for the last lines*/
3675                 if(y+15 >= height)
3676                 {
3677                         uint8_t *dstBlock= &(dst[y*dstStride]);
3678                         memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3679                 }
3680 /*
3681                 for(x=0; x<width; x+=32)
3682                 {
3683                         volatile int i;
3684                         i+=     + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3685                                 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3686                                 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3687 //                              + dstBlock[x +13*dstStride]
3688 //                              + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3689                 }*/
3690         }
3691 #ifdef HAVE_3DNOW
3692         asm volatile("femms");
3693 #elif defined (HAVE_MMX)
3694         asm volatile("emms");
3695 #endif
3696
3697 #ifdef TIMING
3698         // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3699         sumTime= rdtsc() - sumTime;
3700         if(!isColor)
3701                 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3702                         (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3703                         (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3704                         , black, white);
3705 #endif
3706 #ifdef DEBUG_BRIGHTNESS
3707         if(!isColor)
3708         {
3709                 int max=1;
3710                 int i;
3711                 for(i=0; i<256; i++)
3712                         if(yHistogram[i] > max) max=yHistogram[i];
3713
3714                 for(i=1; i<256; i++)
3715                 {
3716                         int x;
3717                         int start=yHistogram[i-1]/(max/256+1);
3718                         int end=yHistogram[i]/(max/256+1);
3719                         int inc= end > start ? 1 : -1;
3720                         for(x=start; x!=end+inc; x+=inc)
3721                                 dst[ i*dstStride + x]+=128;
3722                 }
3723
3724                 for(i=0; i<100; i+=2)
3725                 {
3726                         dst[ (white)*dstStride + i]+=128;
3727                         dst[ (black)*dstStride + i]+=128;
3728                 }
3729
3730         }
3731 #endif
3732
3733 }