]> git.sesse.net Git - ffmpeg/blob - postproc/postprocess_template.c
QP_store==null bugfix and no opendivx bugfix
[ffmpeg] / postproc / postprocess_template.c
1 /*
2     Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20                         C       MMX     MMX2    3DNow
21 isVertDC                Ec      Ec
22 isVertMinMaxOk          Ec      Ec
23 doVertLowPass           E               e       e
24 doVertDefFilter         Ec      Ec      Ec
25 isHorizDC               Ec      Ec
26 isHorizMinMaxOk         a       E
27 doHorizLowPass          E               e       e
28 doHorizDefFilter        Ec      Ec      Ec
29 deRing                  E               e       e*
30 Vertical RKAlgo1        E               a       a
31 Horizontal RKAlgo1                      a       a
32 Vertical X1             a               E       E
33 Horizontal X1           a               E       E
34 LinIpolDeinterlace      e               E       E*
35 CubicIpolDeinterlace    a               e       e*
36 LinBlendDeinterlace     e               E       E*
37 MedianDeinterlace               Ec      Ec
38
39
40 * I don't have a 3DNow! CPU -> it's untested
41 E = Exact implementation
42 e = almost exact implementation (slightly different rounding, ...)
43 a = alternative / approximate implementation
44 c = checked against the other implementations (-vo md5)
45 */
46
47 /*
48 TODO:
49 verify that everything works as it should (how?)
50 reduce the time wasted on the mem transfer
51 implement dering
52 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP
56 write a faster and higher quality deblocking filter :)
57 do something about the speed of the horizontal filters
58 make the mainloop more flexible (variable number of blocks at once
59         (the if/else stuff per block is slowing things down)
60 compare the quality & speed of all filters
61 split this huge file
62 fix warnings (unused vars, ...)
63 noise reduction filters
64 border remover
65 optimize c versions
66 ...
67
68 Notes:
69 */
70
71 //Changelog: use the CVS log
72
73 #include "../config.h"
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <stdlib.h>
77 #include <string.h>
78 #ifdef HAVE_MALLOC_H
79 #include <malloc.h>
80 #endif
81 //#undef HAVE_MMX2
82 //#define HAVE_3DNOW
83 //#undef HAVE_MMX
84 #include "postprocess.h"
85
// Small arithmetic helpers. They are plain function-like macros, so an
// argument with side effects (e.g. i++) can be evaluated more than once.
86 #define MIN(a,b) ((a) > (b) ? (b) : (a))
87 #define MAX(a,b) ((a) < (b) ? (b) : (a))
88 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
// Note: there is no zero case, SIGN(0) evaluates to -1.
89 #define SIGN(a) ((a) > 0 ? 1 : -1)
90
// The macros below expand to inline-asm text (AT&T operand order: the last
// register operand is the destination).

// PAVGB(a,b): packed unsigned byte average of a and b, result in b.
// Uses pavgb on MMX2 and pavgusb on 3DNow; it is not defined at all when
// neither HAVE_MMX2 nor HAVE_3DNOW is set.
91 #ifdef HAVE_MMX2
92 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
93 #elif defined (HAVE_3DNOW)
94 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
95 #endif
96
// PMINUB(x,y,t): packed unsigned byte minimum, result in the second argument.
// MMX2 has a native instruction and ignores t. The plain-MMX fallback names
// its parameters (b,a,t), i.e. swapped, but the result still ends up in the
// second argument: t = a -sat b; a = a - t = min(a,b). t is clobbered.
97 #ifdef HAVE_MMX2
98 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
99 #elif defined (HAVE_MMX)
100 #define PMINUB(b,a,t) \
101         "movq " #a ", " #t " \n\t"\
102         "psubusb " #b ", " #t " \n\t"\
103         "psubb " #t ", " #a " \n\t"
104 #endif
105
// PMAXUB(a,b): packed unsigned byte maximum, result in b.
// The plain-MMX fallback computes b = (b -sat a) + a = max(a,b).
106 #ifdef HAVE_MMX2
107 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
108 #elif defined (HAVE_MMX)
109 #define PMAXUB(a,b) \
110         "psubusb " #a ", " #b " \n\t"\
111         "paddb " #a ", " #b " \n\t"
112 #endif
113
114
// NOTE(review): neither constant is used in the part of the file visible here;
// presumably they size the buffers used when parsing the filter/mode string
// (see filters[] / replaceTable[]) -- confirm against the parser.
115 #define GET_MODE_BUFFER_SIZE 500
116 #define OPTIONS_ARRAY_SIZE 10
117
// File-scope constants and scratch storage.
// With HAVE_MMX these 8-byte aligned quadwords are referenced *by symbol name*
// from the inline asm in this file (e.g. "movq b7E, %%mm7"), so their names
// are part of the asm text and must not be changed independently.
//
// Naming scheme, as can be read off the initializers:
//   wXXXX       16-bit word XXXX (hex) replicated 4 times
//   bXX         byte XX (hex) replicated 8 times
//   bmDDDDDDDD  byte mask, one digit per byte, most significant byte first;
//               digit 1 -> 0xFF, digit 0 -> 0x00
118 #ifdef HAVE_MMX
119 static volatile uint64_t __attribute__((aligned(8))) packedYOffset=     0x0000000000000000LL;
120 static volatile uint64_t __attribute__((aligned(8))) packedYScale=      0x0100010001000100LL;
121 static uint64_t __attribute__((aligned(8))) w05=                0x0005000500050005LL;
122 static uint64_t __attribute__((aligned(8))) w20=                0x0020002000200020LL;
123 static uint64_t __attribute__((aligned(8))) w1400=              0x1400140014001400LL;
124 static uint64_t __attribute__((aligned(8))) bm00000001=         0x00000000000000FFLL;
125 static uint64_t __attribute__((aligned(8))) bm00010000=         0x000000FF00000000LL;
126 static uint64_t __attribute__((aligned(8))) bm00001000=         0x00000000FF000000LL;
127 static uint64_t __attribute__((aligned(8))) bm10000000=         0xFF00000000000000LL;
128 static uint64_t __attribute__((aligned(8))) bm10000001=         0xFF000000000000FFLL;
129 static uint64_t __attribute__((aligned(8))) bm11000011=         0xFFFF00000000FFFFLL;
130 static uint64_t __attribute__((aligned(8))) bm00000011=         0x000000000000FFFFLL;
131 static uint64_t __attribute__((aligned(8))) bm11111110=         0xFFFFFFFFFFFFFF00LL;
132 static uint64_t __attribute__((aligned(8))) bm11000000=         0xFFFF000000000000LL;
133 static uint64_t __attribute__((aligned(8))) bm00011000=         0x000000FFFF000000LL;
134 static uint64_t __attribute__((aligned(8))) bm00110011=         0x0000FFFF0000FFFFLL;
135 static uint64_t __attribute__((aligned(8))) bm11001100=         0xFFFF0000FFFF0000LL;
136 static uint64_t __attribute__((aligned(8))) b00=                0x0000000000000000LL;
137 static uint64_t __attribute__((aligned(8))) b01=                0x0101010101010101LL;
138 static uint64_t __attribute__((aligned(8))) b02=                0x0202020202020202LL;
139 static uint64_t __attribute__((aligned(8))) b0F=                0x0F0F0F0F0F0F0F0FLL;
140 static uint64_t __attribute__((aligned(8))) b04=                0x0404040404040404LL;
141 static uint64_t __attribute__((aligned(8))) b08=                0x0808080808080808LL;
142 static uint64_t __attribute__((aligned(8))) bFF=                0xFFFFFFFFFFFFFFFFLL;
143 static uint64_t __attribute__((aligned(8))) b20=                0x2020202020202020LL;
144 static uint64_t __attribute__((aligned(8))) b80=                0x8080808080808080LL;
145 static uint64_t __attribute__((aligned(8))) b7E=                0x7E7E7E7E7E7E7E7ELL;
146 static uint64_t __attribute__((aligned(8))) b7C=                0x7C7C7C7C7C7C7C7CLL;
147 static uint64_t __attribute__((aligned(8))) b3F=                0x3F3F3F3F3F3F3F3FLL;
// NOTE(review): temp0..temp5 are not touched by the code visible here;
// presumably scratch qwords for asm blocks further down -- confirm.
148 static uint64_t __attribute__((aligned(8))) temp0=0;
149 static uint64_t __attribute__((aligned(8))) temp1=0;
150 static uint64_t __attribute__((aligned(8))) temp2=0;
151 static uint64_t __attribute__((aligned(8))) temp3=0;
152 static uint64_t __attribute__((aligned(8))) temp4=0;
153 static uint64_t __attribute__((aligned(8))) temp5=0;
// pQPb is loaded by the asm as "QP,..., QP" (the quantizer in every byte).
// It is only read in the visible code, so it has to be set up elsewhere
// before the MMX filters run. pQPb2 is not used in the visible code.
154 static uint64_t __attribute__((aligned(8))) pQPb=0;
155 static uint64_t __attribute__((aligned(8))) pQPb2=0;
156 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
157 #else
// Without MMX only the luma offset/scale pair and the horizontal scratch
// buffer exist.
158 static uint64_t packedYOffset=  0x0000000000000000LL;
159 static uint64_t packedYScale=   0x0100010001000100LL;
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
161 #endif
162
// Tunables with external linkage (not static), so other translation units can
// change them.
//
// isVertDC() reports a block as flat when its count of near-equal neighbouring
// pixels (at most 7 row pairs * 8 columns = 56) exceeds vFlatnessThreshold.
// NOTE(review): hFlatnessThreshold is not used in the visible code; presumably
// the horizontal counterpart -- confirm.
163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16;
165
166 //amount of "black" you are willing to lose to get a brightness corrected picture
167 double maxClippedThreshold= 0.01;
168
// NOTE(review): not used in the visible code; presumably the luma range the
// level-fix ("autolevels") filter maps to -- confirm.
169 int maxAllowedY=234;
170 int minAllowedY=16;
171
// Table of the filters that can be selected by name, terminated by an entry
// whose names are NULL. struct PPFilter is declared outside this chunk
// (presumably in postprocess.h); from the initializers the fields look like
// { short name, long name, <flag>, <int>, <int>, filter mask } -- confirm the
// meaning of the three integers against the struct declaration.
//
// NOTE(review): the "vr"/"rkvdeblock" entry (a *vertical* deblock filter by
// name, with the same integers as the other vertical entries) is mapped to
// H_RK1_FILTER. This looks like it should be a V_ mask; verify against
// postprocess.h before changing it.
172 static struct PPFilter filters[]=
173 {
174         {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
175         {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
176         {"vr", "rkvdeblock",            1, 2, 4, H_RK1_FILTER},
177         {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
178         {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
179         {"dr", "dering",                1, 5, 6, DERING},
180         {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
181         {"lb", "linblenddeint",         0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
182         {"li", "linipoldeint",          0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
183         {"ci", "cubicipoldeint",        0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
184         {"md", "mediandeint",           0, 1, 6, MEDIAN_DEINT_FILTER},
185         {NULL, NULL,0,0,0,0} //End Marker
186 };
187
// Flat list of (alias, expansion) string pairs, terminated by a single NULL.
// Each alias ("default"/"de", "fast"/"fa") stands for the comma separated
// filter list that follows it; the names in the expansions are long names
// from filters[].
// NOTE(review): the entries are string literals held through non-const
// char *; they must not be written to.
188 static char *replaceTable[]=
189 {
190         "default",      "hdeblock:a,vdeblock:a,dering:a,autolevels",
191         "de",           "hdeblock:a,vdeblock:a,dering:a,autolevels",
192         "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
193         "fa",           "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels",
194         NULL //End Marker
195 };
196
#ifdef HAVE_MMX
/**
 * Dummy function whose only purpose is to reference every MMX constant and
 * scratch variable from C, so the compiler does not warn about unused static
 * variables (they are otherwise only referenced from inside asm strings).
 * The store to b00 writes the value it already has to hold for the test to
 * succeed, so it is harmless.
 */
static inline void unusedVariableWarningFixer()
{
        uint64_t total= packedYOffset + packedYScale;

        total+= w05 + w20 + w1400;
        total+= bm00000001 + bm00010000 + bm00001000 + bm10000000;
        total+= bm10000001 + bm11000011 + bm00000011 + bm11111110;
        total+= bm11000000 + bm00011000 + bm00110011 + bm11001100;
        total+= b00 + b01 + b02 + b0F + bFF + b20 + b04 + b08;
        total+= pQPb2 + b80 + b7E + b7C + b3F;
        total+= temp0 + temp1 + temp2 + temp3 + temp4 + temp5;
        total+= pQPb;

        if(total == 0)
                b00= 0;
}
#endif
208
209 #ifdef TIMING
/**
 * Read the CPU time stamp counter (x86 only, used for TIMING builds).
 *
 * RDTSC returns the counter split over EDX:EAX. The two halves are read
 * through separate "=a" / "=d" outputs and combined in C. The previous "=A"
 * constraint only describes the EDX:EAX pair on 32-bit x86; on x86-64 it
 * selects a single 64-bit register and the result is truncated.
 * __asm__ is used instead of asm so the function also builds in strict ISO
 * C modes.
 *
 * @return the current 64-bit time stamp counter value
 */
static inline long long rdtsc(void)
{
        uint32_t lo, hi;

        __asm__ volatile(       "rdtsc\n\t"
                : "=a" (lo), "=d" (hi)
        );
        return (long long)(((uint64_t)hi << 32) | lo);
}
219 #endif
220
221 #ifdef HAVE_MMX2
222 static inline void prefetchnta(void *p)
223 {
224         asm volatile(   "prefetchnta (%0)\n\t"
225                 : : "r" (p)
226         );
227 }
228
229 static inline void prefetcht0(void *p)
230 {
231         asm volatile(   "prefetcht0 (%0)\n\t"
232                 : : "r" (p)
233         );
234 }
235
236 static inline void prefetcht1(void *p)
237 {
238         asm volatile(   "prefetcht1 (%0)\n\t"
239                 : : "r" (p)
240         );
241 }
242
243 static inline void prefetcht2(void *p)
244 {
245         asm volatile(   "prefetcht2 (%0)\n\t"
246                 : : "r" (p)
247         );
248 }
249 #endif
250
251 //FIXME? |255-0| = 1 in the MMX path, because psubb wraps modulo 256 (shouldn't be a problem ...)
252 /**
253  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * For the 8 lines starting at src + 4*stride, every pixel is compared with
 * the pixel directly below it (7 line pairs * 8 columns = 56 comparisons).
 * A pair counts as "equal" when the two values differ by at most 1.
 *
 * @param src    top-left pixel of the 8x16 block
 * @param stride distance in bytes between two lines
 * @return 1 if the number of "equal" pairs is greater than
 *         vFlatnessThreshold, 0 otherwise
254  */
255 static inline int isVertDC(uint8_t src[], int stride){
256         int numEq= 0;
257 #ifndef HAVE_MMX
258         int y;
259 #endif
260         src+= stride*4; // src points to begin of the 8x8 Block
261 #ifdef HAVE_MMX
// MMX path: per line pair, (upper - lower + 0x7E) > 0x7C as a signed byte
// compare is true exactly for differences of -1, 0 and +1 and yields 0xFF (-1)
// in that byte. The eight per-column byte counters are accumulated in mm0 as
// negative counts and summed horizontally at the end. Clobbers eax and ebx.
262 asm volatile(
263                 "leal (%1, %2), %%eax                           \n\t"
264                 "leal (%%eax, %2, 4), %%ebx                     \n\t"
265 //      0       1       2       3       4       5       6       7       8       9
266 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ebx     ebx+%2  ebx+2%2 %1+8%2  ebx+4%2
267                 "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7E in every byte
268                 "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7C in every byte
269                 "movq (%1), %%mm0                               \n\t"
270                 "movq (%%eax), %%mm1                            \n\t"
271                 "psubb %%mm1, %%mm0                             \n\t" // mm0 = difference
272                 "paddb %%mm7, %%mm0                             \n\t"
273                 "pcmpgtb %%mm6, %%mm0                           \n\t"
274
275                 "movq (%%eax,%2), %%mm2                         \n\t"
276                 "psubb %%mm2, %%mm1                             \n\t"
277                 "paddb %%mm7, %%mm1                             \n\t"
278                 "pcmpgtb %%mm6, %%mm1                           \n\t"
279                 "paddb %%mm1, %%mm0                             \n\t"
280
281                 "movq (%%eax, %2, 2), %%mm1                     \n\t"
282                 "psubb %%mm1, %%mm2                             \n\t"
283                 "paddb %%mm7, %%mm2                             \n\t"
284                 "pcmpgtb %%mm6, %%mm2                           \n\t"
285                 "paddb %%mm2, %%mm0                             \n\t"
286
287                 "movq (%1, %2, 4), %%mm2                        \n\t"
288                 "psubb %%mm2, %%mm1                             \n\t"
289                 "paddb %%mm7, %%mm1                             \n\t"
290                 "pcmpgtb %%mm6, %%mm1                           \n\t"
291                 "paddb %%mm1, %%mm0                             \n\t"
292
293                 "movq (%%ebx), %%mm1                            \n\t"
294                 "psubb %%mm1, %%mm2                             \n\t"
295                 "paddb %%mm7, %%mm2                             \n\t"
296                 "pcmpgtb %%mm6, %%mm2                           \n\t"
297                 "paddb %%mm2, %%mm0                             \n\t"
298
299                 "movq (%%ebx, %2), %%mm2                        \n\t"
300                 "psubb %%mm2, %%mm1                             \n\t"
301                 "paddb %%mm7, %%mm1                             \n\t"
302                 "pcmpgtb %%mm6, %%mm1                           \n\t"
303                 "paddb %%mm1, %%mm0                             \n\t"
304
305                 "movq (%%ebx, %2, 2), %%mm1                     \n\t"
306                 "psubb %%mm1, %%mm2                             \n\t"
307                 "paddb %%mm7, %%mm2                             \n\t"
308                 "pcmpgtb %%mm6, %%mm2                           \n\t"
309                 "paddb %%mm2, %%mm0                             \n\t"
310
// Horizontal sum of the 8 byte counters: first fold the two bytes of every
// 16 bit word together, then fold the words, so the low byte of mm0 ends up
// holding the total (still as a negative count, modulo 256).
311                 "                                               \n\t"
312                 "movq %%mm0, %%mm1                              \n\t"
313                 "psrlw $8, %%mm0                                \n\t"
314                 "paddb %%mm1, %%mm0                             \n\t"
315 #ifdef HAVE_MMX2
316                 "pshufw $0xF9, %%mm0, %%mm1                     \n\t"
317                 "paddb %%mm1, %%mm0                             \n\t"
318                 "pshufw $0xFE, %%mm0, %%mm1                     \n\t"
319 #else
320                 "movq %%mm0, %%mm1                              \n\t"
321                 "psrlq $16, %%mm0                               \n\t"
322                 "paddb %%mm1, %%mm0                             \n\t"
323                 "movq %%mm0, %%mm1                              \n\t"
324                 "psrlq $32, %%mm0                               \n\t"
325 #endif
326                 "paddb %%mm1, %%mm0                             \n\t"
327                 "movd %%mm0, %0                                 \n\t"
328                 : "=r" (numEq)
329                 : "r" (src), "r" (stride)
330                 : "%eax", "%ebx"
331                 );
332
// Negate the low byte to turn the negative byte counter into the positive count.
333         numEq= (256 - numEq) &0xFF;
334
335 #else
// C path: (upper - lower + 1) is 0, 1 or 2 exactly for differences of -1, 0
// and +1; the &0xFFFF turns negative results into large values so that one
// unsigned-style "< 3" test covers both bounds.
336         for(y=0; y<BLOCK_SIZE-1; y++)
337         {
338                 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
339                 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
340                 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
341                 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
342                 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
343                 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
344                 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
345                 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
346                 src+= stride;
347         }
348 #endif
349 /*      if(abs(numEq - asmEq) > 0)
350         {
351                 printf("\nasm:%d  c:%d\n", asmEq, numEq);
352                 for(int y=0; y<8; y++)
353                 {
354                         for(int x=0; x<8; x++)
355                         {
356                                 printf("%d ", temp[x + y*stride]);
357                         }
358                         printf("\n");
359                 }
360         }
361 */
362 //      for(int i=0; i<numEq/8; i++) src[i]=255;
363         return (numEq > vFlatnessThreshold) ? 1 : 0;
364 }
365
366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
367 {
368 #ifdef HAVE_MMX
369         int isOk;
370         src+= stride*3;
371         asm volatile(
372 //              "int $3 \n\t"
373                 "movq (%1, %2), %%mm0                           \n\t"
374                 "movq (%1, %2, 8), %%mm1                        \n\t"
375                 "movq %%mm0, %%mm2                              \n\t"
376                 "psubusb %%mm1, %%mm0                           \n\t"
377                 "psubusb %%mm2, %%mm1                           \n\t"
378                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
379
380                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
381                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
382                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
383                 "pcmpeqd b00, %%mm0                             \n\t"
384                 "psrlq $16, %%mm0                               \n\t"
385                 "pcmpeqd bFF, %%mm0                             \n\t"
386 //              "movd %%mm0, (%1, %2, 4)\n\t"
387                 "movd %%mm0, %0                                 \n\t"
388                 : "=r" (isOk)
389                 : "r" (src), "r" (stride)
390                 );
391         return isOk;
392 #else
393
394         int isOk2= 1;
395         int x;
396         src+= stride*3;
397         for(x=0; x<BLOCK_SIZE; x++)
398         {
399                 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
400         }
401 /*      if(isOk && !isOk2 || !isOk && isOk2)
402         {
403                 printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
404                 for(int y=0; y<9; y++)
405                 {
406                         for(int x=0; x<8; x++)
407                         {
408                                 printf("%d ", src[x + y*stride]);
409                         }
410                         printf("\n");
411                 }
412         } */
413
414         return isOk2;
415 #endif
416
417 }
418
419 /**
420  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
421  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * Works in place on the 8 lines src + 4*stride ... src + 11*stride, for 8
 * columns. The line above and the line below that range are only read; where
 * the step to the next outer line is small compared to QP, the outer line is
 * used as the padding value for the filter taps that would fall outside,
 * otherwise the first/last filtered line itself is used.
 *
 * @param src    top-left pixel of the 8x16 block
 * @param stride distance in bytes between two lines
 * @param QP     quantizer; only used by the C version, the MMX2/3DNow version
 *               reads it from the global pQPb instead
422  */
423 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
424 {
425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
426         src+= stride*3;
// The filter is built from chains of PAVGB (byte averages); the column of
// digits in the trailing comments tracks the weight each source line has in
// the register after the instruction, the "/n" is the common divisor.
// Clobbers eax and ebx.
// NOTE(review): the asm temporarily changes %0 ("addl %1, %0" ... "subl %1,
// %0") although src is declared as an input-only operand, and it stores to
// memory without a "memory" clobber. GCC's extended asm rules do not allow
// either -- confirm and fix the constraints if this is ever touched.
427         asm volatile(   //"movv %0 %1 %2\n\t"
428                 "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
429
430                 "movq (%0), %%mm6                               \n\t"
431                 "movq (%0, %1), %%mm5                           \n\t"
432                 "movq %%mm5, %%mm1                              \n\t"
433                 "movq %%mm6, %%mm2                              \n\t"
434                 "psubusb %%mm6, %%mm5                           \n\t"
435                 "psubusb %%mm1, %%mm2                           \n\t"
436                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
437                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
438                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
439
440                 "pand %%mm2, %%mm6                              \n\t"
441                 "pandn %%mm1, %%mm2                             \n\t"
442                 "por %%mm2, %%mm6                               \n\t"// First Line to Filter
443
444                 "movq (%0, %1, 8), %%mm5                        \n\t"
445                 "leal (%0, %1, 4), %%eax                        \n\t"
446                 "leal (%0, %1, 8), %%ebx                        \n\t"
447                 "subl %1, %%ebx                                 \n\t"
448                 "addl %1, %0                                    \n\t" // %0 points to line 1 not 0
449                 "movq (%0, %1, 8), %%mm7                        \n\t"
450                 "movq %%mm5, %%mm1                              \n\t"
451                 "movq %%mm7, %%mm2                              \n\t"
452                 "psubusb %%mm7, %%mm5                           \n\t"
453                 "psubusb %%mm1, %%mm2                           \n\t"
454                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
455                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
456                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
457
458                 "pand %%mm2, %%mm7                              \n\t"
459                 "pandn %%mm1, %%mm2                             \n\t"
460                 "por %%mm2, %%mm7                               \n\t" // Last Line to Filter
461
462
463                 //      1       2       3       4       5       6       7       8
464                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ebx     eax+4%1
465                 // 6 4 2 2 1 1
466                 // 6 4 4 2
467                 // 6 8 2
468
469                 "movq (%0, %1), %%mm0                           \n\t" //  1
470                 "movq %%mm0, %%mm1                              \n\t" //  1
471                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
472                 PAVGB(%%mm6, %%mm0)                                   //3 1     /4
473
474                 "movq (%0, %1, 4), %%mm2                        \n\t" //     1
475                 "movq %%mm2, %%mm5                              \n\t" //     1
476                 PAVGB((%%eax), %%mm2)                                 //    11  /2
477                 PAVGB((%0, %1, 2), %%mm2)                             //   211  /4
478                 "movq %%mm2, %%mm3                              \n\t" //   211  /4
479                 "movq (%0), %%mm4                               \n\t" // 1
480                 PAVGB(%%mm4, %%mm3)                                   // 4 211  /8
481                 PAVGB(%%mm0, %%mm3)                                   //642211  /16
482                 "movq %%mm3, (%0)                               \n\t" // X
483                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
484                 "movq %%mm1, %%mm0                              \n\t" //  1
485                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
486                 "movq %%mm4, %%mm3                              \n\t" // 1
487                 PAVGB((%0,%1,2), %%mm3)                               // 1 1    /2
488                 PAVGB((%%eax,%1,2), %%mm5)                            //     11 /2
489                 PAVGB((%%eax), %%mm5)                                 //    211 /4
490                 PAVGB(%%mm5, %%mm3)                                   // 2 2211 /8
491                 PAVGB(%%mm0, %%mm3)                                   //4242211 /16
492                 "movq %%mm3, (%0,%1)                            \n\t" //  X
493                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
494                 PAVGB(%%mm4, %%mm6)                                   //11      /2
495                 "movq (%%ebx), %%mm0                            \n\t" //       1
496                 PAVGB((%%eax, %1, 2), %%mm0)                          //      11/2
497                 "movq %%mm0, %%mm3                              \n\t" //      11/2
498                 PAVGB(%%mm1, %%mm0)                                   //  2   11/4
499                 PAVGB(%%mm6, %%mm0)                                   //222   11/8
500                 PAVGB(%%mm2, %%mm0)                                   //22242211/16
501                 "movq (%0, %1, 2), %%mm2                        \n\t" //   1
502                 "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
503                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
504                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
505                 PAVGB((%%ebx), %%mm0)                                 //       11       /2
506                 PAVGB(%%mm0, %%mm6)                                   //11     11       /4
507                 PAVGB(%%mm1, %%mm4)                                   // 11             /2
508                 PAVGB(%%mm2, %%mm1)                                   //  11            /2
509                 PAVGB(%%mm1, %%mm6)                                   //1122   11       /8
510                 PAVGB(%%mm5, %%mm6)                                   //112242211       /16
511                 "movq (%%eax), %%mm5                            \n\t" //    1
512                 "movq %%mm6, (%%eax)                            \n\t" //    X
513                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
514                 "movq (%%eax, %1, 4), %%mm6                     \n\t" //        1
515                 PAVGB(%%mm7, %%mm6)                                   //        11      /2
516                 PAVGB(%%mm4, %%mm6)                                   // 11     11      /4
517                 PAVGB(%%mm3, %%mm6)                                   // 11   2211      /8
518                 PAVGB(%%mm5, %%mm2)                                   //   11           /2
519                 "movq (%0, %1, 4), %%mm4                        \n\t" //     1
520                 PAVGB(%%mm4, %%mm2)                                   //   112          /4
521                 PAVGB(%%mm2, %%mm6)                                   // 112242211      /16
522                 "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
523                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
524                 PAVGB(%%mm7, %%mm1)                                   //  11     2      /4
525                 PAVGB(%%mm4, %%mm5)                                   //    11          /2
526                 PAVGB(%%mm5, %%mm0)                                   //    11 11       /4
527                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
528                 PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
529                 PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
530                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
531                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
532                 PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
533                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
534                 PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
535                 PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
536                 PAVGB(%%mm2, %%mm6)                                   //   1122424      /16
537                 "movq %%mm6, (%%ebx)                            \n\t" //       X
538                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
539                 PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
540                 PAVGB(%%mm7, %%mm5)                                   //    11   6      /8
541
542                 PAVGB(%%mm3, %%mm0)                                   //      112       /4
543                 PAVGB(%%mm0, %%mm5)                                   //    112246      /16
544                 "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
545                 "subl %1, %0                                    \n\t" // undo the "addl %1, %0" from above
546
547                 :
548                 : "r" (src), "r" (stride)
549                 : "%eax", "%ebx"
550         );
551 #else
// C version. l1..l9 are the byte offsets of the lines 1..9 below src (after
// src has been advanced by 3 lines); lines l1..l8 are rewritten.
552         const int l1= stride;
553         const int l2= stride + l1;
554         const int l3= stride + l2;
555         const int l4= stride + l3;
556         const int l5= stride + l4;
557         const int l6= stride + l5;
558         const int l7= stride + l6;
559         const int l8= stride + l7;
560         const int l9= stride + l8;
561         int x;
562         src+= stride*3;
563         for(x=0; x<BLOCK_SIZE; x++)
564         {
// Padding values for the taps above l1 / below l8.
// NOTE(review): this path tests "< QP" while the asm path above selects on
// "diff <= QP" -- the two implementations differ when the difference is
// exactly QP.
565                 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
566                 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
567
// Sums of vertically adjacent pixels, all taken from the unfiltered input
// before any line is overwritten below.
568                 int sums[9];
569                 sums[0] = first + src[l1];
570                 sums[1] = src[l1] + src[l2];
571                 sums[2] = src[l2] + src[l3];
572                 sums[3] = src[l3] + src[l4];
573                 sums[4] = src[l4] + src[l5];
574                 sums[5] = src[l5] + src[l6];
575                 sums[6] = src[l6] + src[l7];
576                 sums[7] = src[l7] + src[l8];
577                 sums[8] = src[l8] + last;
578
// Each output is a weighted sum with total weight 16, rounded (+8) and
// shifted. Every direct src[] read on the right-hand side refers to a line
// that has not been written yet in this iteration.
579                 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
580                 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
581                 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
582                 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
583                 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
584                 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
585                 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
586                 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
587
// next column
588                 src++;
589         }
590
591 #endif
592 }
593
594 /**
595  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
596  * values are correctly clipped (MMX2)
597  * values are wraparound (C)
598  * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
599         0 8 16 24
600         x = 8
601         x/2 = 4
602         x/8 = 1
603         1 12 12 23
604  */
605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
606 {
607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
608         src+= stride*3;
609 // FIXME rounding
610         asm volatile(
611                 "pxor %%mm7, %%mm7                              \n\t" // 0
612                 "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
613                 "leal (%0, %1), %%eax                           \n\t"
614                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
615 //      0       1       2       3       4       5       6       7       8       9
616 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
617                 "movq pQPb, %%mm0                               \n\t" // QP,..., QP
618                 "movq %%mm0, %%mm1                              \n\t" // QP,..., QP
619                 "paddusb b02, %%mm0                             \n\t"
620                 "psrlw $2, %%mm0                                \n\t"
621                 "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
622                 "paddusb %%mm1, %%mm0                           \n\t" // QP*1.25 ...
623                 "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
624                 "movq (%%ebx), %%mm3                            \n\t" // line 5
625                 "movq %%mm2, %%mm4                              \n\t" // line 4
626                 "pcmpeqb %%mm5, %%mm5                           \n\t" // -1
627                 "pxor %%mm2, %%mm5                              \n\t" // -line 4 - 1
628                 PAVGB(%%mm3, %%mm5)
629                 "paddb %%mm6, %%mm5                             \n\t" // (l5-l4)/2
630                 "psubusb %%mm3, %%mm4                           \n\t"
631                 "psubusb %%mm2, %%mm3                           \n\t"
632                 "por %%mm3, %%mm4                               \n\t" // |l4 - l5|
633                 "psubusb %%mm0, %%mm4                           \n\t"
634                 "pcmpeqb %%mm7, %%mm4                           \n\t"
635                 "pand %%mm4, %%mm5                              \n\t" // d/2
636
637 //              "paddb %%mm6, %%mm2                             \n\t" // line 4 + 0x80
638                 "paddb %%mm5, %%mm2                             \n\t"
639 //              "psubb %%mm6, %%mm2                             \n\t"
640                 "movq %%mm2, (%0,%1, 4)                         \n\t"
641
642                 "movq (%%ebx), %%mm2                            \n\t"
643 //              "paddb %%mm6, %%mm2                             \n\t" // line 5 + 0x80
644                 "psubb %%mm5, %%mm2                             \n\t"
645 //              "psubb %%mm6, %%mm2                             \n\t"
646                 "movq %%mm2, (%%ebx)                            \n\t"
647
648                 "paddb %%mm6, %%mm5                             \n\t"
649                 "psrlw $2, %%mm5                                \n\t"
650                 "pand b3F, %%mm5                                \n\t"
651                 "psubb b20, %%mm5                               \n\t" // (l5-l4)/8
652
653                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
654                 "paddb %%mm6, %%mm2                             \n\t" // line 3 + 0x80
655                 "paddsb %%mm5, %%mm2                            \n\t"
656                 "psubb %%mm6, %%mm2                             \n\t"
657                 "movq %%mm2, (%%eax, %1, 2)                     \n\t"
658
659                 "movq (%%ebx, %1), %%mm2                        \n\t"
660                 "paddb %%mm6, %%mm2                             \n\t" // line 6 + 0x80
661                 "psubsb %%mm5, %%mm2                            \n\t"
662                 "psubb %%mm6, %%mm2                             \n\t"
663                 "movq %%mm2, (%%ebx, %1)                        \n\t"
664
665                 :
666                 : "r" (src), "r" (stride)
667                 : "%eax", "%ebx"
668         );
669 #else
670         const int l1= stride;
671         const int l2= stride + l1;
672         const int l3= stride + l2;
673         const int l4= stride + l3;
674         const int l5= stride + l4;
675         const int l6= stride + l5;
676 //      const int l7= stride + l6;
677 //      const int l8= stride + l7;
678 //      const int l9= stride + l8;
679         int x;
680         const int QP15= QP + (QP>>2);
681         src+= stride*3;
682         for(x=0; x<BLOCK_SIZE; x++)
683         {
684                 const int v = (src[x+l5] - src[x+l4]);
685                 if(ABS(v) < QP15)
686                 {
687                         src[x+l3] +=v>>3;
688                         src[x+l4] +=v>>1;
689                         src[x+l5] -=v>>1;
690                         src[x+l6] -=v>>3;
691
692                 }
693         }
694
695 #endif
696 }
697
698 /**
699  * Experimental Filter 1
 *
 * Smooths the boundary between line 4 and line 5 (counted from src + 3*stride) by touching
 * lines 2..7. Per column the C path computes
 *     a = l3 - l4,  b = l4 - l5,  c = l5 - l6
 *     d = MAX(|b| - (|a| + |c|)/2, 0)
 * and, if d < QP, with v = d*SIGN(-b):
 *     l2 += v/8, l3 += v/4, l4 += 3v/8, l5 -= 3v/8, l6 -= v/4, l7 -= v/8
 * The asm path handles 8 columns at once (one movq per line) and takes QP from the symbol
 * pQPb (defined outside this function) instead of the QP argument.
 *
 * @param src    pointer to the block; both paths advance it by 3*stride before filtering
 * @param stride distance in bytes between two consecutive lines
 * @param QP     quantizer used as the filtering threshold (read by the C path only)
 *
700  * will not damage linear gradients
701  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
702  * can only smooth blocks at the expected locations (it can't smooth them if they did move)
703  * MMX2 version does correct clipping, C version doesn't
704  */
705 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
706 {
707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
708         src+= stride*3;
709
710         asm volatile(
711                 "pxor %%mm7, %%mm7                              \n\t" // 0
712 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
713                 "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
714                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
715 //      0       1       2       3       4       5       6       7       8       9
716 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
717                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
718                 "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
719                 "movq %%mm1, %%mm2                              \n\t" // line 4
720                 "psubusb %%mm0, %%mm1                           \n\t" // MAX(l4 - l3, 0)
721                 "psubusb %%mm2, %%mm0                           \n\t" // MAX(l3 - l4, 0)
722                 "por %%mm1, %%mm0                               \n\t" // |l3 - l4|
723                 "movq (%%ebx), %%mm3                            \n\t" // line 5
724                 "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
725                 "movq %%mm3, %%mm5                              \n\t" // line 5
726                 "psubusb %%mm4, %%mm3                           \n\t" // MAX(l5 - l6, 0)
727                 "psubusb %%mm5, %%mm4                           \n\t" // MAX(l6 - l5, 0)
728                 "por %%mm4, %%mm3                               \n\t" // |l5 - l6|
729                 PAVGB(%%mm3, %%mm0)                                   // (|l3 - l4| + |l5 - l6|)/2
730                 "movq %%mm2, %%mm1                              \n\t" // line 4
731                 "psubusb %%mm5, %%mm2                           \n\t" // MAX(l4 - l5, 0)
732                 "movq %%mm2, %%mm4                              \n\t"
733                 "pcmpeqb %%mm7, %%mm2                           \n\t" // (l4 - l5) <= 0 ? -1 : 0
734                 "psubusb %%mm1, %%mm5                           \n\t" // MAX(l5 - l4, 0)
735                 "por %%mm5, %%mm4                               \n\t" // |l4 - l5|
736                 "psubusb %%mm0, %%mm4           \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
737                 "movq %%mm4, %%mm3                              \n\t" // d
738                 "psubusb pQPb, %%mm4                            \n\t" // 0 where d <= QP
739                 "pcmpeqb %%mm7, %%mm4                           \n\t" // d <= QP ? -1 : 0
740                 "psubusb b01, %%mm3                             \n\t" // MAX(d - b01, 0) (b01 is defined elsewhere, presumably 0x01 per byte)
741                 "pand %%mm4, %%mm3                              \n\t" // d <= QP ? reduced d : 0
742
743                 PAVGB(%%mm7, %%mm3)                                   // d/2
744                 "movq %%mm3, %%mm1                              \n\t" // d/2
745                 PAVGB(%%mm7, %%mm3)                                   // d/4
746                 PAVGB(%%mm1, %%mm3)                                   // 3*d/8
747
                // mm2 is -1 in the columns where l4 <= l5: xor-ing with it complements the pixel, so the
                // unsigned saturating sub/add below acts in the opposite direction for those columns
748                 "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
749                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
750                 "psubusb %%mm3, %%mm0                           \n\t"
751                 "pxor %%mm2, %%mm0                              \n\t"
752                 "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
753
754                 "movq (%%ebx), %%mm0                            \n\t" // line 5
755                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
756                 "paddusb %%mm3, %%mm0                           \n\t"
757                 "pxor %%mm2, %%mm0                              \n\t"
758                 "movq %%mm0, (%%ebx)                            \n\t" // line 5
759
760                 PAVGB(%%mm7, %%mm1)                                   // d/4
761
762                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
763                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
764                 "psubusb %%mm1, %%mm0                           \n\t"
765                 "pxor %%mm2, %%mm0                              \n\t"
766                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // line 3
767
768                 "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
769                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
770                 "paddusb %%mm1, %%mm0                           \n\t"
771                 "pxor %%mm2, %%mm0                              \n\t"
772                 "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
773
774                 PAVGB(%%mm7, %%mm1)                                   // d/8
775
776                 "movq (%%eax, %1), %%mm0                        \n\t" // line 2
777                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
778                 "psubusb %%mm1, %%mm0                           \n\t"
779                 "pxor %%mm2, %%mm0                              \n\t"
780                 "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
781
782                 "movq (%%ebx, %1, 2), %%mm0                     \n\t" // line 7
783                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
784                 "paddusb %%mm1, %%mm0                           \n\t"
785                 "pxor %%mm2, %%mm0                              \n\t"
786                 "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // line 7
787
                // NOTE(review): the asm stores to memory and uses mm0-mm5/mm7, but only eax/ebx are
                // listed as clobbers -- confirm this is safe with the compiler in use
788                 :
789                 : "r" (src), "r" (stride)
790                 : "%eax", "%ebx"
791         );
792 #else
793
        // byte offsets of lines 1..7 relative to src (after src has been advanced below)
794         const int l1= stride;
795         const int l2= stride + l1;
796         const int l3= stride + l2;
797         const int l4= stride + l3;
798         const int l5= stride + l4;
799         const int l6= stride + l5;
800         const int l7= stride + l6;
801 //      const int l8= stride + l7;
802 //      const int l9= stride + l8;
803         int x;
804
805         src+= stride*3;
806         for(x=0; x<BLOCK_SIZE; x++)
807         {
                // differences between neighbouring lines around the line 4 / line 5 boundary
808                 int a= src[l3] - src[l4];
809                 int b= src[l4] - src[l5];
810                 int c= src[l5] - src[l6];
811
                // how much the step at the boundary exceeds the average step next to it
812                 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
813                 d= MAX(d, 0);
814
815                 if(d < QP)
816                 {
817                         int v = d * SIGN(-b);
818
                        // no clipping: the stores truncate to uint8_t
819                         src[l2] +=v>>3;
820                         src[l3] +=v>>2;
821                         src[l4] +=(3*v)>>3;
822                         src[l5] -=(3*v)>>3;
823                         src[l6] -=v>>2;
824                         src[l7] -=v>>3;
825
826                 }
827                 src++; // next column
828         }
829         /*
        (disabled alternative: fixed-weight smoothing of lines 3..6, kept for reference)
830         const int l1= stride;
831         const int l2= stride + l1;
832         const int l3= stride + l2;
833         const int l4= stride + l3;
834         const int l5= stride + l4;
835         const int l6= stride + l5;
836         const int l7= stride + l6;
837         const int l8= stride + l7;
838         const int l9= stride + l8;
839         for(int x=0; x<BLOCK_SIZE; x++)
840         {
841                 int v2= src[l2];
842                 int v3= src[l3];
843                 int v4= src[l4];
844                 int v5= src[l5];
845                 int v6= src[l6];
846                 int v7= src[l7];
847
848                 if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
849                 {
850                         src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
851                         src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
852                         src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
853                         src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
854                 }
855                 src++;
856         }
857 */
858 #endif
859 }
860
861 /**
862  * Experimental Filter 1 (Horizontal)
863  * will not damage linear gradients
864  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
866  * MMX2 version does correct clipping C version doesnt
867  * not identical with the vertical one
868  */
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
870 {
871         int y;
872         static uint64_t *lut= NULL;
873         if(lut==NULL)
874         {
875                 int i;
876                 lut= (uint64_t*)memalign(8, 256*8);
877                 for(i=0; i<256; i++)
878                 {
879                         int v= i < 128 ? 2*i : 2*(i-256);
880 /*
881 //Simulate 112242211 9-Tap filter
882                         uint64_t a= (v/16) & 0xFF;
883                         uint64_t b= (v/8) & 0xFF;
884                         uint64_t c= (v/4) & 0xFF;
885                         uint64_t d= (3*v/8) & 0xFF;
886 */
887 //Simulate piecewise linear interpolation
888                         uint64_t a= (v/16) & 0xFF;
889                         uint64_t b= (v*3/16) & 0xFF;
890                         uint64_t c= (v*5/16) & 0xFF;
891                         uint64_t d= (7*v/16) & 0xFF;
892                         uint64_t A= (0x100 - a)&0xFF;
893                         uint64_t B= (0x100 - b)&0xFF;
894                         uint64_t C= (0x100 - c)&0xFF;
895                         uint64_t D= (0x100 - c)&0xFF;
896
897                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
898                                 (D<<24) | (C<<16) | (B<<8) | (A);
899                         //lut[i] = (v<<32) | (v<<24);
900                 }
901         }
902
903 #if 0
904         asm volatile(
905                 "pxor %%mm7, %%mm7                              \n\t" // 0
906 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
907                 "leal (%0, %1), %%eax                           \n\t"
908                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
909
910                 "movq b80, %%mm6                                \n\t"
911                 "movd pQPb, %%mm5                               \n\t" // QP
912                 "movq %%mm5, %%mm4                              \n\t"
913                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
914                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
915                 "pxor %%mm5, %%mm5                              \n\t" // 0
916                 "psubb %%mm4, %%mm5                             \n\t" // -3QP
917                 "por bm11111110, %%mm5                          \n\t" // ...,FF,FF,-3QP
918                 "psllq $24, %%mm5                               \n\t"
919
920 //      0       1       2       3       4       5       6       7       8       9
921 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
922
923 #define HX1old(a) \
924                 "movd " #a ", %%mm0                             \n\t"\
925                 "movd 4" #a ", %%mm1                            \n\t"\
926                 "punpckldq %%mm1, %%mm0                         \n\t"\
927                 "movq %%mm0, %%mm1                              \n\t"\
928                 "movq %%mm0, %%mm2                              \n\t"\
929                 "psrlq $8, %%mm1                                \n\t"\
930                 "psubusb %%mm1, %%mm2                           \n\t"\
931                 "psubusb %%mm0, %%mm1                           \n\t"\
932                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
933                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
934                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
935                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
936                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
937                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
938                 "paddb %%mm5, %%mm1                             \n\t"\
939                 "psubusb %%mm5, %%mm1                           \n\t"\
940                 PAVGB(%%mm7, %%mm1)\
941                 "pxor %%mm2, %%mm1                              \n\t"\
942                 "psubb %%mm2, %%mm1                             \n\t"\
943                 "psrlq $24, %%mm1                               \n\t"\
944                 "movd %%mm1, %%ecx                              \n\t"\
945                 "paddb %%mm6, %%mm0                             \n\t"\
946                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
947                 "paddb %%mm6, %%mm0                             \n\t"\
948                 "movq %%mm0, " #a "                             \n\t"\
949
950 /*
951 HX1old((%0))
952 HX1old((%%eax))
953 HX1old((%%eax, %1))
954 HX1old((%%eax, %1, 2))
955 HX1old((%0, %1, 4))
956 HX1old((%%ebx))
957 HX1old((%%ebx, %1))
958 HX1old((%%ebx, %1, 2))
959 */
960
961 //FIXME add some comments, its unreadable ...
962 #define HX1b(a, c, b, d) \
963                 "movd " #a ", %%mm0                             \n\t"\
964                 "movd 4" #a ", %%mm1                            \n\t"\
965                 "punpckldq %%mm1, %%mm0                         \n\t"\
966                 "movd " #b ", %%mm4                             \n\t"\
967                 "movq %%mm0, %%mm1                              \n\t"\
968                 "movq %%mm0, %%mm2                              \n\t"\
969                 "psrlq $8, %%mm1                                \n\t"\
970                 "movd 4" #b ", %%mm3                            \n\t"\
971                 "psubusb %%mm1, %%mm2                           \n\t"\
972                 "psubusb %%mm0, %%mm1                           \n\t"\
973                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
974                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
975                 "punpckldq %%mm3, %%mm4                         \n\t"\
976                 "movq %%mm1, %%mm3                              \n\t"\
977                 "psllq $32, %%mm3                               \n\t" /* p´5 = |p1 - p2| */\
978                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
979                 "paddb %%mm6, %%mm0                             \n\t"\
980                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
981                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
982                 "movq %%mm4, %%mm3                              \n\t"\
983                 "paddb %%mm5, %%mm1                             \n\t"\
984                 "psubusb %%mm5, %%mm1                           \n\t"\
985                 "psrlq $8, %%mm3                                \n\t"\
986                 PAVGB(%%mm7, %%mm1)\
987                 "pxor %%mm2, %%mm1                              \n\t"\
988                 "psubb %%mm2, %%mm1                             \n\t"\
989                 "movq %%mm4, %%mm2                              \n\t"\
990                 "psrlq $24, %%mm1                               \n\t"\
991                 "psubusb %%mm3, %%mm2                           \n\t"\
992                 "movd %%mm1, %%ecx                              \n\t"\
993                 "psubusb %%mm4, %%mm3                           \n\t"\
994                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
995                 "por %%mm2, %%mm3                               \n\t" /* p´x = |px - p(x+1)| */\
996                 "paddb %%mm6, %%mm0                             \n\t"\
997                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
998                 "movq %%mm3, %%mm1                              \n\t"\
999                 "psllq $32, %%mm1                               \n\t" /* p´5 = |p1 - p2| */\
1000                 "movq %%mm0, " #a "                             \n\t"\
1001                 PAVGB(%%mm3, %%mm1)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
1002                 "paddb %%mm6, %%mm4                             \n\t"\
1003                 "psrlq $16, %%mm1                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
1004                 "psubusb %%mm1, %%mm3                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
1005                 "paddb %%mm5, %%mm3                             \n\t"\
1006                 "psubusb %%mm5, %%mm3                           \n\t"\
1007                 PAVGB(%%mm7, %%mm3)\
1008                 "pxor %%mm2, %%mm3                              \n\t"\
1009                 "psubb %%mm2, %%mm3                             \n\t"\
1010                 "psrlq $24, %%mm3                               \n\t"\
1011                 "movd " #c ", %%mm0                             \n\t"\
1012                 "movd 4" #c ", %%mm1                            \n\t"\
1013                 "punpckldq %%mm1, %%mm0                         \n\t"\
1014                 "paddb %%mm6, %%mm0                             \n\t"\
1015                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
1016                 "paddb %%mm6, %%mm0                             \n\t"\
1017                 "movq %%mm0, " #c "                             \n\t"\
1018                 "movd %%mm3, %%ecx                              \n\t"\
1019                 "movd " #d ", %%mm0                             \n\t"\
1020                 "paddsb (%2, %%ecx, 8), %%mm4                   \n\t"\
1021                 "movd 4" #d ", %%mm1                            \n\t"\
1022                 "paddb %%mm6, %%mm4                             \n\t"\
1023                 "punpckldq %%mm1, %%mm0                         \n\t"\
1024                 "movq %%mm4, " #b "                             \n\t"\
1025                 "paddb %%mm6, %%mm0                             \n\t"\
1026                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
1027                 "paddb %%mm6, %%mm0                             \n\t"\
1028                 "movq %%mm0, " #d "                             \n\t"\
1029
1030 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1031 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1032
1033
1034                 :
1035                 : "r" (src), "r" (stride), "r" (lut)
1036                 : "%eax", "%ebx", "%ecx"
1037         );
1038 #else
1039
1040 //FIXME (has little in common with the mmx2 version)
1041         for(y=0; y<BLOCK_SIZE; y++)
1042         {
1043                 int a= src[1] - src[2];
1044                 int b= src[3] - src[4];
1045                 int c= src[5] - src[6];
1046
1047                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1048
1049                 if(d < QP)
1050                 {
1051                         int v = d * SIGN(-b);
1052
1053                         src[1] +=v/8;
1054                         src[2] +=v/4;
1055                         src[3] +=3*v/8;
1056                         src[4] -=3*v/8;
1057                         src[5] -=v/4;
1058                         src[6] -=v/8;
1059
1060                 }
1061                 src+=stride;
1062         }
1063 #endif
1064 }
1065
1066
1067 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1068 {
1069 #ifdef HAVE_MMX
1070         src+= stride*4;
1071         //FIXME try pmul for *5 stuff
1072 //      src[0]=0;
1073         asm volatile(
1074                 "pxor %%mm7, %%mm7                              \n\t"
1075                 "leal (%0, %1), %%eax                           \n\t"
1076                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1077 //      0       1       2       3       4       5       6       7
1078 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1079 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1080
1081                 "movq (%0), %%mm0                               \n\t"
1082                 "movq %%mm0, %%mm1                              \n\t"
1083                 "punpcklbw %%mm7, %%mm0                         \n\t" // low part of line 0
1084                 "punpckhbw %%mm7, %%mm1                         \n\t" // high part of line 0
1085
1086                 "movq (%%eax), %%mm2                            \n\t"
1087                 "movq %%mm2, %%mm3                              \n\t"
1088                 "punpcklbw %%mm7, %%mm2                         \n\t" // low part of line 1
1089                 "punpckhbw %%mm7, %%mm3                         \n\t" // high part of line 1
1090
1091                 "movq (%%eax, %1), %%mm4                        \n\t"
1092                 "movq %%mm4, %%mm5                              \n\t"
1093                 "punpcklbw %%mm7, %%mm4                         \n\t" // low part of line 2
1094                 "punpckhbw %%mm7, %%mm5                         \n\t" // high part of line 2
1095
1096                 "paddw %%mm0, %%mm0                             \n\t" // 2L0
1097                 "paddw %%mm1, %%mm1                             \n\t" // 2H0
1098                 "psubw %%mm4, %%mm2                             \n\t" // L1 - L2
1099                 "psubw %%mm5, %%mm3                             \n\t" // H1 - H2
1100                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - L1 + L2
1101                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - H1 + H2
1102
1103                 "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1104                 "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1105                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2
1106                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2
1107
1108                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
1109                 "movq %%mm2, %%mm3                              \n\t"
1110                 "punpcklbw %%mm7, %%mm2                         \n\t" // L3
1111                 "punpckhbw %%mm7, %%mm3                         \n\t" // H3
1112
1113                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - L3
1114                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - H3
1115                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1116                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1117                 "movq %%mm0, temp0                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1118                 "movq %%mm1, temp1                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1119
1120                 "movq (%0, %1, 4), %%mm0                        \n\t"
1121                 "movq %%mm0, %%mm1                              \n\t"
1122                 "punpcklbw %%mm7, %%mm0                         \n\t" // L4
1123                 "punpckhbw %%mm7, %%mm1                         \n\t" // H4
1124
1125                 "psubw %%mm0, %%mm2                             \n\t" // L3 - L4
1126                 "psubw %%mm1, %%mm3                             \n\t" // H3 - H4
1127                 "movq %%mm2, temp2                              \n\t" // L3 - L4
1128                 "movq %%mm3, temp3                              \n\t" // H3 - H4
1129                 "paddw %%mm4, %%mm4                             \n\t" // 2L2
1130                 "paddw %%mm5, %%mm5                             \n\t" // 2H2
1131                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - L3 + L4
1132                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - H3 + H4
1133
1134                 "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1135                 "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1136                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4
1137                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4
1138 //50 opcodes so far
1139                 "movq (%%ebx), %%mm2                            \n\t"
1140                 "movq %%mm2, %%mm3                              \n\t"
1141                 "punpcklbw %%mm7, %%mm2                         \n\t" // L5
1142                 "punpckhbw %%mm7, %%mm3                         \n\t" // H5
1143                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - L5
1144                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - H5
1145                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1146                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1147
1148                 "movq (%%ebx, %1), %%mm6                        \n\t"
1149                 "punpcklbw %%mm7, %%mm6                         \n\t" // L6
1150                 "psubw %%mm6, %%mm2                             \n\t" // L5 - L6
1151                 "movq (%%ebx, %1), %%mm6                        \n\t"
1152                 "punpckhbw %%mm7, %%mm6                         \n\t" // H6
1153                 "psubw %%mm6, %%mm3                             \n\t" // H5 - H6
1154
1155                 "paddw %%mm0, %%mm0                             \n\t" // 2L4
1156                 "paddw %%mm1, %%mm1                             \n\t" // 2H4
1157                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - L5 + L6
1158                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - H5 + H6
1159
1160                 "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1161                 "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1162                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6
1163                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6
1164
1165                 "movq (%%ebx, %1, 2), %%mm2                     \n\t"
1166                 "movq %%mm2, %%mm3                              \n\t"
1167                 "punpcklbw %%mm7, %%mm2                         \n\t" // L7
1168                 "punpckhbw %%mm7, %%mm3                         \n\t" // H7
1169
1170                 "paddw %%mm2, %%mm2                             \n\t" // 2L7
1171                 "paddw %%mm3, %%mm3                             \n\t" // 2H7
1172                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1173                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1174
1175                 "movq temp0, %%mm2                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1176                 "movq temp1, %%mm3                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1177
1178 #ifdef HAVE_MMX2
1179                 "movq %%mm7, %%mm6                              \n\t" // 0
1180                 "psubw %%mm0, %%mm6                             \n\t"
1181                 "pmaxsw %%mm6, %%mm0                            \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1182                 "movq %%mm7, %%mm6                              \n\t" // 0
1183                 "psubw %%mm1, %%mm6                             \n\t"
1184                 "pmaxsw %%mm6, %%mm1                            \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1185                 "movq %%mm7, %%mm6                              \n\t" // 0
1186                 "psubw %%mm2, %%mm6                             \n\t"
1187                 "pmaxsw %%mm6, %%mm2                            \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1188                 "movq %%mm7, %%mm6                              \n\t" // 0
1189                 "psubw %%mm3, %%mm6                             \n\t"
1190                 "pmaxsw %%mm6, %%mm3                            \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1191 #else
1192                 "movq %%mm7, %%mm6                              \n\t" // 0
1193                 "pcmpgtw %%mm0, %%mm6                           \n\t"
1194                 "pxor %%mm6, %%mm0                              \n\t"
1195                 "psubw %%mm6, %%mm0                             \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1196                 "movq %%mm7, %%mm6                              \n\t" // 0
1197                 "pcmpgtw %%mm1, %%mm6                           \n\t"
1198                 "pxor %%mm6, %%mm1                              \n\t"
1199                 "psubw %%mm6, %%mm1                             \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1200                 "movq %%mm7, %%mm6                              \n\t" // 0
1201                 "pcmpgtw %%mm2, %%mm6                           \n\t"
1202                 "pxor %%mm6, %%mm2                              \n\t"
1203                 "psubw %%mm6, %%mm2                             \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1204                 "movq %%mm7, %%mm6                              \n\t" // 0
1205                 "pcmpgtw %%mm3, %%mm6                           \n\t"
1206                 "pxor %%mm6, %%mm3                              \n\t"
1207                 "psubw %%mm6, %%mm3                             \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1208 #endif
1209
1210 #ifdef HAVE_MMX2
1211                 "pminsw %%mm2, %%mm0                            \n\t"
1212                 "pminsw %%mm3, %%mm1                            \n\t"
1213 #else
1214                 "movq %%mm0, %%mm6                              \n\t"
1215                 "psubusw %%mm2, %%mm6                           \n\t"
1216                 "psubw %%mm6, %%mm0                             \n\t"
1217                 "movq %%mm1, %%mm6                              \n\t"
1218                 "psubusw %%mm3, %%mm6                           \n\t"
1219                 "psubw %%mm6, %%mm1                             \n\t"
1220 #endif
1221
1222                 "movq %%mm7, %%mm6                              \n\t" // 0
1223                 "pcmpgtw %%mm4, %%mm6                           \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1224                 "pxor %%mm6, %%mm4                              \n\t"
1225                 "psubw %%mm6, %%mm4                             \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1226                 "pcmpgtw %%mm5, %%mm7                           \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1227                 "pxor %%mm7, %%mm5                              \n\t"
1228                 "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1229 // 100 opcodes
1230                 "movd %2, %%mm2                                 \n\t" // QP
1231                 "punpcklwd %%mm2, %%mm2                         \n\t"
1232                 "punpcklwd %%mm2, %%mm2                         \n\t"
1233                 "psllw $3, %%mm2                                \n\t" // 8QP
1234                 "movq %%mm2, %%mm3                              \n\t" // 8QP
1235                 "pcmpgtw %%mm4, %%mm2                           \n\t"
1236                 "pcmpgtw %%mm5, %%mm3                           \n\t"
1237                 "pand %%mm2, %%mm4                              \n\t"
1238                 "pand %%mm3, %%mm5                              \n\t"
1239
1240
1241                 "psubusw %%mm0, %%mm4                           \n\t" // hd
1242                 "psubusw %%mm1, %%mm5                           \n\t" // ld
1243
1244
1245                 "movq w05, %%mm2                                \n\t" // 5
1246                 "pmullw %%mm2, %%mm4                            \n\t"
1247                 "pmullw %%mm2, %%mm5                            \n\t"
1248                 "movq w20, %%mm2                                \n\t" // 32
1249                 "paddw %%mm2, %%mm4                             \n\t"
1250                 "paddw %%mm2, %%mm5                             \n\t"
1251                 "psrlw $6, %%mm4                                \n\t"
1252                 "psrlw $6, %%mm5                                \n\t"
1253
1254 /*
1255                 "movq w06, %%mm2                                \n\t" // 6
1256                 "paddw %%mm2, %%mm4                             \n\t"
1257                 "paddw %%mm2, %%mm5                             \n\t"
1258                 "movq w1400, %%mm2                              \n\t" // 1400h = 5120 = 5/64*2^16
1259 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1260                 "pmulhw %%mm2, %%mm4                            \n\t" // hd/13
1261                 "pmulhw %%mm2, %%mm5                            \n\t" // ld/13
1262 */
1263
1264                 "movq temp2, %%mm0                              \n\t" // L3 - L4
1265                 "movq temp3, %%mm1                              \n\t" // H3 - H4
1266
1267                 "pxor %%mm2, %%mm2                              \n\t"
1268                 "pxor %%mm3, %%mm3                              \n\t"
1269
1270                 "pcmpgtw %%mm0, %%mm2                           \n\t" // sign (L3-L4)
1271                 "pcmpgtw %%mm1, %%mm3                           \n\t" // sign (H3-H4)
1272                 "pxor %%mm2, %%mm0                              \n\t"
1273                 "pxor %%mm3, %%mm1                              \n\t"
1274                 "psubw %%mm2, %%mm0                             \n\t" // |L3-L4|
1275                 "psubw %%mm3, %%mm1                             \n\t" // |H3-H4|
1276                 "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1277                 "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1278
1279                 "pxor %%mm6, %%mm2                              \n\t"
1280                 "pxor %%mm7, %%mm3                              \n\t"
1281                 "pand %%mm2, %%mm4                              \n\t"
1282                 "pand %%mm3, %%mm5                              \n\t"
1283
1284 #ifdef HAVE_MMX2
1285                 "pminsw %%mm0, %%mm4                            \n\t"
1286                 "pminsw %%mm1, %%mm5                            \n\t"
1287 #else
1288                 "movq %%mm4, %%mm2                              \n\t"
1289                 "psubusw %%mm0, %%mm2                           \n\t"
1290                 "psubw %%mm2, %%mm4                             \n\t"
1291                 "movq %%mm5, %%mm2                              \n\t"
1292                 "psubusw %%mm1, %%mm2                           \n\t"
1293                 "psubw %%mm2, %%mm5                             \n\t"
1294 #endif
1295                 "pxor %%mm6, %%mm4                              \n\t"
1296                 "pxor %%mm7, %%mm5                              \n\t"
1297                 "psubw %%mm6, %%mm4                             \n\t"
1298                 "psubw %%mm7, %%mm5                             \n\t"
1299                 "packsswb %%mm5, %%mm4                          \n\t"
1300                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1301                 "paddb   %%mm4, %%mm0                           \n\t"
1302                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1303                 "movq (%0, %1, 4), %%mm0                        \n\t"
1304                 "psubb %%mm4, %%mm0                             \n\t"
1305                 "movq %%mm0, (%0, %1, 4)                        \n\t"
1306
1307                 :
1308                 : "r" (src), "r" (stride), "r" (QP)
1309                 : "%eax", "%ebx"
1310         );
1311 #else
1312         const int l1= stride;
1313         const int l2= stride + l1;
1314         const int l3= stride + l2;
1315         const int l4= stride + l3;
1316         const int l5= stride + l4;
1317         const int l6= stride + l5;
1318         const int l7= stride + l6;
1319         const int l8= stride + l7;
1320 //      const int l9= stride + l8;
1321         int x;
1322         src+= stride*3;
1323         for(x=0; x<BLOCK_SIZE; x++)
1324         {
1325                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1326                 if(ABS(middleEnergy) < 8*QP)
1327                 {
1328                         const int q=(src[l4] - src[l5])/2;
1329                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1330                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1331
1332                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1333                         d= MAX(d, 0);
1334
1335                         d= (5*d + 32) >> 6;
1336                         d*= SIGN(-middleEnergy);
1337
1338                         if(q>0)
1339                         {
1340                                 d= d<0 ? 0 : d;
1341                                 d= d>q ? q : d;
1342                         }
1343                         else
1344                         {
1345                                 d= d>0 ? 0 : d;
1346                                 d= d<q ? q : d;
1347                         }
1348
1349                         src[l4]-= d;
1350                         src[l5]+= d;
1351                 }
1352                 src++;
1353         }
1354 #endif
1355 }
1356
1357 //FIXME?  |255-0| = 1
1358 /**
1359  * Check if the given 8x8 Block is mostly "flat"
1360  */
1361 static inline int isHorizDC(uint8_t src[], int stride)
1362 {
1363 //      src++;
1364         int numEq= 0;
1365 #if 0
1366 asm volatile (
1367 //              "int $3 \n\t"
1368                 "leal (%1, %2), %%ecx                           \n\t"
1369                 "leal (%%ecx, %2, 4), %%ebx                     \n\t"
1370 //      0       1       2       3       4       5       6       7       8       9
1371 //      %1      ecx     ecx+%2  ecx+2%2 %1+4%2  ebx     ebx+%2  ebx+2%2 %1+8%2  ebx+4%2
1372                 "movq b7E, %%mm7                                \n\t" // mm7 = 0x7F
1373                 "movq b7C, %%mm6                                \n\t" // mm6 = 0x7D
1374                 "pxor %%mm0, %%mm0                              \n\t"
1375                 "movl %1, %%eax                                 \n\t"
1376                 "andl $0x1F, %%eax                              \n\t"
1377                 "cmpl $24, %%eax                                \n\t"
1378                 "leal tempBlock, %%eax                          \n\t"
1379                 "jb 1f                                          \n\t"
1380
1381 #define HDC_CHECK_AND_CPY(src, dst) \
1382                 "movd " #src ", %%mm2                           \n\t"\
1383                 "punpckldq 4" #src ", %%mm2                             \n\t" /* (%1) */\
1384                 "movq %%mm2, %%mm1                              \n\t"\
1385                 "psrlq $8, %%mm2                                \n\t"\
1386                 "psubb %%mm1, %%mm2                             \n\t"\
1387                 "paddb %%mm7, %%mm2                             \n\t"\
1388                 "pcmpgtb %%mm6, %%mm2                           \n\t"\
1389                 "paddb %%mm2, %%mm0                             \n\t"\
1390                 "movq %%mm1," #dst "(%%eax)                     \n\t"
1391
1392                 HDC_CHECK_AND_CPY((%1),0)
1393                 HDC_CHECK_AND_CPY((%%ecx),8)
1394                 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1395                 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1396                 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1397                 HDC_CHECK_AND_CPY((%%ebx),40)
1398                 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1399                 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1400                 "jmp 2f                                         \n\t"
1401                 "1:                                             \n\t"
1402 // src does not cross a 32 byte cache line so dont waste time with alignment
1403 #define HDC_CHECK_AND_CPY2(src, dst) \
1404                 "movq " #src ", %%mm2                           \n\t"\
1405                 "movq " #src ", %%mm1                           \n\t"\
1406                 "psrlq $8, %%mm2                                \n\t"\
1407                 "psubb %%mm1, %%mm2                             \n\t"\
1408                 "paddb %%mm7, %%mm2                             \n\t"\
1409                 "pcmpgtb %%mm6, %%mm2                           \n\t"\
1410                 "paddb %%mm2, %%mm0                             \n\t"\
1411                 "movq %%mm1," #dst "(%%eax)                     \n\t"
1412
1413                 HDC_CHECK_AND_CPY2((%1),0)
1414                 HDC_CHECK_AND_CPY2((%%ecx),8)
1415                 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1416                 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1417                 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1418                 HDC_CHECK_AND_CPY2((%%ebx),40)
1419                 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1420                 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1421                 "2:                                             \n\t"
1422                 "psllq $8, %%mm0                                \n\t" // remove dummy value
1423                 "movq %%mm0, %%mm1                              \n\t"
1424                 "psrlw $8, %%mm0                                \n\t"
1425                 "paddb %%mm1, %%mm0                             \n\t"
1426                 "movq %%mm0, %%mm1                              \n\t"
1427                 "psrlq $16, %%mm0                               \n\t"
1428                 "paddb %%mm1, %%mm0                             \n\t"
1429                 "movq %%mm0, %%mm1                              \n\t"
1430                 "psrlq $32, %%mm0                               \n\t"
1431                 "paddb %%mm1, %%mm0                             \n\t"
1432                 "movd %%mm0, %0                                 \n\t"
1433                 : "=r" (numEq)
1434                 : "r" (src), "r" (stride)
1435                 : "%eax", "%ebx", "%ecx"
1436                 );
1437 //      printf("%d\n", numEq);
1438         numEq= (256 - numEq) &0xFF;
1439 #else
1440         int y;
1441         for(y=0; y<BLOCK_SIZE; y++)
1442         {
1443                 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1444                 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1445                 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1446                 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1447                 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1448                 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1449                 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1450                 src+= stride;
1451         }
1452 #endif
1453 /*      if(abs(numEq - asmEq) > 0)
1454         {
1455 //              printf("\nasm:%d  c:%d\n", asmEq, numEq);
1456                 for(int y=0; y<8; y++)
1457                 {
1458                         for(int x=0; x<8; x++)
1459                         {
1460                                 printf("%d ", src[x + y*stride]);
1461                         }
1462                         printf("\n");
1463                 }
1464         }
1465 */
1466 //      printf("%d\n", numEq);
1467         return numEq > hFlatnessThreshold;
1468 }
1469
/**
 * Check that the first and the last pixel of the first row of the block
 * differ by no more than 2*QP.
 * Only src[0] and src[7] are examined; stride is not used.
 * Returns 1 if the difference is within the limit, 0 otherwise.
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
        const int edgeDiff= abs(src[0] - src[7]);

        return edgeDiff <= 2*QP;
}
1476
1477 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1478 {
1479 #if 0
1480         asm volatile(
1481                 "leal (%0, %1), %%ecx                           \n\t"
1482                 "leal (%%ecx, %1, 4), %%ebx                     \n\t"
1483 //      0       1       2       3       4       5       6       7       8       9
1484 //      %0      ecx     ecx+%1  ecx+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1485                 "pxor %%mm7, %%mm7                              \n\t"
1486                 "movq bm00001000, %%mm6                         \n\t"
1487                 "movd %2, %%mm5                                 \n\t" // QP
1488                 "movq %%mm5, %%mm4                              \n\t"
1489                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
1490                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
1491                 "psllq $24, %%mm4                               \n\t"
1492                 "pxor %%mm5, %%mm5                              \n\t" // 0
1493                 "psubb %%mm4, %%mm5                             \n\t" // -QP
1494                 "leal tempBlock, %%eax                          \n\t"
1495
1496 //FIXME? "unroll by 2" and mix
1497 #ifdef HAVE_MMX2
1498 #define HDF(src, dst)   \
1499                 "movq " #src "(%%eax), %%mm0                    \n\t"\
1500                 "movq " #src "(%%eax), %%mm1                    \n\t"\
1501                 "movq " #src "(%%eax), %%mm2                    \n\t"\
1502                 "psrlq $8, %%mm1                                \n\t"\
1503                 "psubusb %%mm1, %%mm2                           \n\t"\
1504                 "psubusb %%mm0, %%mm1                           \n\t"\
1505                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1506                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1507                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
1508                 "pminub %%mm1, %%mm3                            \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1509                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1510                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1511                 "paddb %%mm5, %%mm1                             \n\t"\
1512                 "psubusb %%mm5, %%mm1                           \n\t"\
1513                 "psrlw $2, %%mm1                                \n\t"\
1514                 "pxor %%mm2, %%mm1                              \n\t"\
1515                 "psubb %%mm2, %%mm1                             \n\t"\
1516                 "pand %%mm6, %%mm1                              \n\t"\
1517                 "psubb %%mm1, %%mm0                             \n\t"\
1518                 "psllq $8, %%mm1                                \n\t"\
1519                 "paddb %%mm1, %%mm0                             \n\t"\
1520                 "movd %%mm0, " #dst"                            \n\t"\
1521                 "psrlq $32, %%mm0                               \n\t"\
1522                 "movd %%mm0, 4" #dst"                           \n\t"
1523 #else
1524 #define HDF(src, dst)\
1525                 "movq " #src "(%%eax), %%mm0                    \n\t"\
1526                 "movq %%mm0, %%mm1                              \n\t"\
1527                 "movq %%mm0, %%mm2                              \n\t"\
1528                 "psrlq $8, %%mm1                                \n\t"\
1529                 "psubusb %%mm1, %%mm2                           \n\t"\
1530                 "psubusb %%mm0, %%mm1                           \n\t"\
1531                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1532                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1533                 "movq %%mm1, %%mm3                              \n\t"\
1534                 "psllq $32, %%mm3                               \n\t"\
1535                 "movq %%mm3, %%mm4                              \n\t"\
1536                 "psubusb %%mm1, %%mm4                           \n\t"\
1537                 "psubb %%mm4, %%mm3                             \n\t"\
1538                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1539                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
1540                 "paddb %%mm5, %%mm1                             \n\t"\
1541                 "psubusb %%mm5, %%mm1                           \n\t"\
1542                 "psrlw $2, %%mm1                                \n\t"\
1543                 "pxor %%mm2, %%mm1                              \n\t"\
1544                 "psubb %%mm2, %%mm1                             \n\t"\
1545                 "pand %%mm6, %%mm1                              \n\t"\
1546                 "psubb %%mm1, %%mm0                             \n\t"\
1547                 "psllq $8, %%mm1                                \n\t"\
1548                 "paddb %%mm1, %%mm0                             \n\t"\
1549                 "movd %%mm0, " #dst "                           \n\t"\
1550                 "psrlq $32, %%mm0                               \n\t"\
1551                 "movd %%mm0, 4" #dst "                          \n\t"
1552 #endif
1553                 HDF(0,(%0))
1554                 HDF(8,(%%ecx))
1555                 HDF(16,(%%ecx, %1))
1556                 HDF(24,(%%ecx, %1, 2))
1557                 HDF(32,(%0, %1, 4))
1558                 HDF(40,(%%ebx))
1559                 HDF(48,(%%ebx, %1))
1560                 HDF(56,(%%ebx, %1, 2))
1561                 :
1562                 : "r" (dst), "r" (stride), "r" (QP)
1563                 : "%eax", "%ebx", "%ecx"
1564         );
1565 #else
1566         int y;
1567         for(y=0; y<BLOCK_SIZE; y++)
1568         {
1569                 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1570
1571                 if(ABS(middleEnergy) < 8*QP)
1572                 {
1573                         const int q=(dst[3] - dst[4])/2;
1574                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1575                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1576
1577                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1578                         d= MAX(d, 0);
1579
1580                         d= (5*d + 32) >> 6;
1581                         d*= SIGN(-middleEnergy);
1582
1583                         if(q>0)
1584                         {
1585                                 d= d<0 ? 0 : d;
1586                                 d= d>q ? q : d;
1587                         }
1588                         else
1589                         {
1590                                 d= d>0 ? 0 : d;
1591                                 d= d<q ? q : d;
1592                         }
1593
1594                         dst[3]-= d;
1595                         dst[4]+= d;
1596                 }
1597                 dst+= stride;
1598         }
1599 #endif
1600 }
1601
1602 /**
1603  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1604  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1605  * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1606  */
1607 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1608 {
1609
1610 #if 0
1611         asm volatile(
1612                 "leal (%0, %1), %%ecx                           \n\t"
1613                 "leal (%%ecx, %1, 4), %%ebx                     \n\t"
1614 //      0       1       2       3       4       5       6       7       8       9
1615 //      %0      ecx     ecx+%1  ecx+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1616                 "pxor %%mm7, %%mm7                                      \n\t"
1617                 "leal tempBlock, %%eax                                  \n\t"
1618 /*
1619 #define HLP1    "movq (%0), %%mm0                                       \n\t"\
1620                 "movq %%mm0, %%mm1                                      \n\t"\
1621                 "psllq $8, %%mm0                                        \n\t"\
1622                 PAVGB(%%mm1, %%mm0)\
1623                 "psrlw $8, %%mm0                                        \n\t"\
1624                 "pxor %%mm1, %%mm1                                      \n\t"\
1625                 "packuswb %%mm1, %%mm0                                  \n\t"\
1626                 "movq %%mm0, %%mm1                                      \n\t"\
1627                 "movq %%mm0, %%mm2                                      \n\t"\
1628                 "psllq $32, %%mm0                                       \n\t"\
1629                 "paddb %%mm0, %%mm1                                     \n\t"\
1630                 "psllq $16, %%mm2                                       \n\t"\
1631                 PAVGB(%%mm2, %%mm0)\
1632                 "movq %%mm0, %%mm3                                      \n\t"\
1633                 "pand bm11001100, %%mm0                                 \n\t"\
1634                 "paddusb %%mm0, %%mm3                                   \n\t"\
1635                 "psrlq $8, %%mm3                                        \n\t"\
1636                 PAVGB(%%mm1, %%mm4)\
1637                 PAVGB(%%mm3, %%mm2)\
1638                 "psrlq $16, %%mm2                                       \n\t"\
1639                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1640                 "movq %%mm2, (%0)                                       \n\t"\
1641
1642 #define HLP2    "movq (%0), %%mm0                                       \n\t"\
1643                 "movq %%mm0, %%mm1                                      \n\t"\
1644                 "psllq $8, %%mm0                                        \n\t"\
1645                 PAVGB(%%mm1, %%mm0)\
1646                 "psrlw $8, %%mm0                                        \n\t"\
1647                 "pxor %%mm1, %%mm1                                      \n\t"\
1648                 "packuswb %%mm1, %%mm0                                  \n\t"\
1649                 "movq %%mm0, %%mm2                                      \n\t"\
1650                 "psllq $32, %%mm0                                       \n\t"\
1651                 "psllq $16, %%mm2                                       \n\t"\
1652                 PAVGB(%%mm2, %%mm0)\
1653                 "movq %%mm0, %%mm3                                      \n\t"\
1654                 "pand bm11001100, %%mm0                                 \n\t"\
1655                 "paddusb %%mm0, %%mm3                                   \n\t"\
1656                 "psrlq $8, %%mm3                                        \n\t"\
1657                 PAVGB(%%mm3, %%mm2)\
1658                 "psrlq $16, %%mm2                                       \n\t"\
1659                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1660                 "movq %%mm2, (%0)                                       \n\t"\
1661 */
1662 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1663 /*
1664 Implemented     Exact 7-Tap
1665  9421           A321
1666  36421          64321
1667  334321         =
1668  1234321        =
1669   1234321       =
1670    123433       =
1671     12463         12346
1672      1249          123A
1673
1674 */
1675
1676 #ifdef HAVE_MMX2
1677 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1678                 "movq %%mm0, %%mm1                                      \n\t"\
1679                 "movq %%mm0, %%mm2                                      \n\t"\
1680                 "movq %%mm0, %%mm3                                      \n\t"\
1681                 "movq %%mm0, %%mm4                                      \n\t"\
1682                 "psllq $8, %%mm1                                        \n\t"\
1683                 "psrlq $8, %%mm2                                        \n\t"\
1684                 "pand bm00000001, %%mm3                                 \n\t"\
1685                 "pand bm10000000, %%mm4                                 \n\t"\
1686                 "por %%mm3, %%mm1                                       \n\t"\
1687                 "por %%mm4, %%mm2                                       \n\t"\
1688                 PAVGB(%%mm2, %%mm1)\
1689                 PAVGB(%%mm1, %%mm0)\
1690 \
1691                 "pshufw $0xF9, %%mm0, %%mm3                             \n\t"\
1692                 "pshufw $0x90, %%mm0, %%mm4                             \n\t"\
1693                 PAVGB(%%mm3, %%mm4)\
1694                 PAVGB(%%mm4, %%mm0)\
1695                 "movd %%mm0, (%0)                                       \n\t"\
1696                 "psrlq $32, %%mm0                                       \n\t"\
1697                 "movd %%mm0, 4(%0)                                      \n\t"
1698 #else
1699 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1700                 "movq %%mm0, %%mm1                                      \n\t"\
1701                 "movq %%mm0, %%mm2                                      \n\t"\
1702                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1703                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1704                 "psllq $8, %%mm1                                        \n\t"\
1705                 "psrlq $8, %%mm2                                        \n\t"\
1706                 "psrlq $24, %%mm3                                       \n\t"\
1707                 "psllq $56, %%mm4                                       \n\t"\
1708                 "por %%mm3, %%mm1                                       \n\t"\
1709                 "por %%mm4, %%mm2                                       \n\t"\
1710                 PAVGB(%%mm2, %%mm1)\
1711                 PAVGB(%%mm1, %%mm0)\
1712 \
1713                 "movq %%mm0, %%mm3                                      \n\t"\
1714                 "movq %%mm0, %%mm4                                      \n\t"\
1715                 "movq %%mm0, %%mm5                                      \n\t"\
1716                 "psrlq $16, %%mm3                                       \n\t"\
1717                 "psllq $16, %%mm4                                       \n\t"\
1718                 "pand bm11000000, %%mm5                                 \n\t"\
1719                 "por %%mm5, %%mm3                                       \n\t"\
1720                 "movq %%mm0, %%mm5                                      \n\t"\
1721                 "pand bm00000011, %%mm5                                 \n\t"\
1722                 "por %%mm5, %%mm4                                       \n\t"\
1723                 PAVGB(%%mm3, %%mm4)\
1724                 PAVGB(%%mm4, %%mm0)\
1725                 "movd %%mm0, (%0)                                       \n\t"\
1726                 "psrlq $32, %%mm0                                       \n\t"\
1727                 "movd %%mm0, 4(%0)                                      \n\t"
1728 #endif
1729
1730 /* uses the 7-Tap Filter: 1112111 */
1731 #define NEW_HLP(src, dst)\
1732                 "movq " #src "(%%eax), %%mm1                            \n\t"\
1733                 "movq " #src "(%%eax), %%mm2                            \n\t"\
1734                 "psllq $8, %%mm1                                        \n\t"\
1735                 "psrlq $8, %%mm2                                        \n\t"\
1736                 "movd -4" #dst ", %%mm3                                 \n\t" /*0001000*/\
1737                 "movd 8" #dst ", %%mm4                                  \n\t" /*0001000*/\
1738                 "psrlq $24, %%mm3                                       \n\t"\
1739                 "psllq $56, %%mm4                                       \n\t"\
1740                 "por %%mm3, %%mm1                                       \n\t"\
1741                 "por %%mm4, %%mm2                                       \n\t"\
1742                 "movq %%mm1, %%mm5                                      \n\t"\
1743                 PAVGB(%%mm2, %%mm1)\
1744                 "movq " #src "(%%eax), %%mm0                            \n\t"\
1745                 PAVGB(%%mm1, %%mm0)\
1746                 "psllq $8, %%mm5                                        \n\t"\
1747                 "psrlq $8, %%mm2                                        \n\t"\
1748                 "por %%mm3, %%mm5                                       \n\t"\
1749                 "por %%mm4, %%mm2                                       \n\t"\
1750                 "movq %%mm5, %%mm1                                      \n\t"\
1751                 PAVGB(%%mm2, %%mm5)\
1752                 "psllq $8, %%mm1                                        \n\t"\
1753                 "psrlq $8, %%mm2                                        \n\t"\
1754                 "por %%mm3, %%mm1                                       \n\t"\
1755                 "por %%mm4, %%mm2                                       \n\t"\
1756                 PAVGB(%%mm2, %%mm1)\
1757                 PAVGB(%%mm1, %%mm5)\
1758                 PAVGB(%%mm5, %%mm0)\
1759                 "movd %%mm0, " #dst "                                   \n\t"\
1760                 "psrlq $32, %%mm0                                       \n\t"\
1761                 "movd %%mm0, 4" #dst "                                  \n\t"
1762
1763 /* uses the 9-Tap Filter: 112242211 */
1764 #define NEW_HLP2(i)\
1765                 "movq " #i "(%%eax), %%mm0                              \n\t" /*0001000*/\
1766                 "movq %%mm0, %%mm1                                      \n\t" /*0001000*/\
1767                 "movq %%mm0, %%mm2                                      \n\t" /*0001000*/\
1768                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1769                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1770                 "psllq $8, %%mm1                                        \n\t"\
1771                 "psrlq $8, %%mm2                                        \n\t"\
1772                 "psrlq $24, %%mm3                                       \n\t"\
1773                 "psllq $56, %%mm4                                       \n\t"\
1774                 "por %%mm3, %%mm1                                       \n\t" /*0010000*/\
1775                 "por %%mm4, %%mm2                                       \n\t" /*0000100*/\
1776                 "movq %%mm1, %%mm5                                      \n\t" /*0010000*/\
1777                 PAVGB(%%mm2, %%mm1)                                           /*0010100*/\
1778                 PAVGB(%%mm1, %%mm0)                                           /*0012100*/\
1779                 "psllq $8, %%mm5                                        \n\t"\
1780                 "psrlq $8, %%mm2                                        \n\t"\
1781                 "por %%mm3, %%mm5                                       \n\t" /*0100000*/\
1782                 "por %%mm4, %%mm2                                       \n\t" /*0000010*/\
1783                 "movq %%mm5, %%mm1                                      \n\t" /*0100000*/\
1784                 PAVGB(%%mm2, %%mm5)                                           /*0100010*/\
1785                 "psllq $8, %%mm1                                        \n\t"\
1786                 "psrlq $8, %%mm2                                        \n\t"\
1787                 "por %%mm3, %%mm1                                       \n\t" /*1000000*/\
1788                 "por %%mm4, %%mm2                                       \n\t" /*0000001*/\
1789                 "movq %%mm1, %%mm6                                      \n\t" /*1000000*/\
1790                 PAVGB(%%mm2, %%mm1)                                           /*1000001*/\
1791                 "psllq $8, %%mm6                                        \n\t"\
1792                 "psrlq $8, %%mm2                                        \n\t"\
1793                 "por %%mm3, %%mm6                                       \n\t"/*100000000*/\
1794                 "por %%mm4, %%mm2                                       \n\t"/*000000001*/\
1795                 PAVGB(%%mm2, %%mm6)                                          /*100000001*/\
1796                 PAVGB(%%mm6, %%mm1)                                          /*110000011*/\
1797                 PAVGB(%%mm1, %%mm5)                                          /*112000211*/\
1798                 PAVGB(%%mm5, %%mm0)                                          /*112242211*/\
1799                 "movd %%mm0, (%0)                                       \n\t"\
1800                 "psrlq $32, %%mm0                                       \n\t"\
1801                 "movd %%mm0, 4(%0)                                      \n\t"
1802
1803 #define HLP(src, dst) NEW_HLP(src, dst)
1804
1805                 HLP(0, (%0))
1806                 HLP(8, (%%ecx))
1807                 HLP(16, (%%ecx, %1))
1808                 HLP(24, (%%ecx, %1, 2))
1809                 HLP(32, (%0, %1, 4))
1810                 HLP(40, (%%ebx))
1811                 HLP(48, (%%ebx, %1))
1812                 HLP(56, (%%ebx, %1, 2))
1813
1814                 :
1815                 : "r" (dst), "r" (stride)
1816                 : "%eax", "%ebx", "%ecx"
1817         );
1818
1819 #else
1820         int y;
1821         for(y=0; y<BLOCK_SIZE; y++)
1822         {
1823                 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1824                 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1825
1826                 int sums[9];
1827                 sums[0] = first + dst[0];
1828                 sums[1] = dst[0] + dst[1];
1829                 sums[2] = dst[1] + dst[2];
1830                 sums[3] = dst[2] + dst[3];
1831                 sums[4] = dst[3] + dst[4];
1832                 sums[5] = dst[4] + dst[5];
1833                 sums[6] = dst[5] + dst[6];
1834                 sums[7] = dst[6] + dst[7];
1835                 sums[8] = dst[7] + last;
1836
1837                 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1838                 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1839                 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1840                 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1841                 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1842                 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1843                 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1844                 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1845
1846                 dst+= stride;
1847         }
1848 #endif
1849 }
1850
/**
 * Deringing filter for one 8x8 block, applied in place.
 *
 * C version: reads the 10x10 pixel window starting at src and may rewrite the
 * inner 8x8 pixels (rows 1-8, columns 1-8 of that window).
 *  1. min and max of the inner 8x8 pixels are found, avg= (min + max + 1)/2
 *  2. every pixel of the 10x10 window is classified as "above avg" or not
 *  3. a pixel whose whole 3x3 neighbourhood lies on the same side of avg is
 *     replaced by the 3x3 low pass (1 2 1 / 2 4 2 / 1 2 1)/16, but it is never
 *     moved by more than 2*QP away from its old value
 * Pixels are processed in raster order and written back immediately, so
 * already filtered neighbours feed into the following pixels.
 *
 * MMX2 / 3DNow version: the limit is not taken from the QP argument but from
 * the external variable pQPb (the asm never references operand %2); pQPb2,
 * temp0, temp1, b00 and b08 are external 64bit variables too, and PAVGB,
 * PMAXUB, PMINUB are macros defined elsewhere in this file.
 * NOTE(review): the asm path filters columns 0-7 relative to src (using
 * columns -1 and 8 as neighbours) while the C path filters columns 1-8;
 * confirm which layout the caller expects.
 *
 * @param src    top left pixel of the window
 * @param stride distance in bytes between 2 lines
 * @param QP     quantizer of the block, limits the correction to +-2*QP
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq pQPb, %%mm0                               \n\t"
                "paddusb %%mm0, %%mm0                           \n\t"
                "movq %%mm0, pQPb2                              \n\t"

                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "pcmpeqb %%mm6, %%mm6                           \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
// mm6 starts as all ones and collects the bytewise minimum, mm7 starts as 0 and collects the maximum
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                          \n\t"\
                "pminub %%mm0, %%mm6                            \n\t"\
                "pmaxub %%mm0, %%mm7                            \n\t"
#else
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                          \n\t"\
                "movq %%mm6, %%mm1                              \n\t"\
                "psubusb %%mm0, %%mm7                           \n\t"\
                "paddb %%mm0, %%mm7                             \n\t"\
                "psubusb %%mm0, %%mm1                           \n\t"\
                "psubb %%mm1, %%mm6                             \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

// reduce the 8 per column minima in mm6 to a single minimum in the lowest byte
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $8, %%mm6                                \n\t"
#ifdef HAVE_MMX2
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels
                "pshufw $0xF9, %%mm6, %%mm4                     \n\t"
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels
                "pshufw $0xFE, %%mm6, %%mm4                     \n\t"
                "pminub %%mm4, %%mm6                            \n\t"
#else
                "movq %%mm6, %%mm1                              \n\t"
                "psubusb %%mm4, %%mm1                           \n\t"
                "psubb %%mm1, %%mm6                             \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $16, %%mm6                               \n\t"
                "movq %%mm6, %%mm1                              \n\t"
                "psubusb %%mm4, %%mm1                           \n\t"
                "psubb %%mm1, %%mm6                             \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $32, %%mm6                               \n\t"
                "movq %%mm6, %%mm1                              \n\t"
                "psubusb %%mm4, %%mm1                           \n\t"
                "psubb %%mm1, %%mm6                             \n\t"
#endif


// same reduction for the per column maxima in mm7
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $8, %%mm7                                \n\t"
#ifdef HAVE_MMX2
                "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
                "pshufw $0xF9, %%mm7, %%mm4                     \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
                "pshufw $0xFE, %%mm7, %%mm4                     \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
#else
                "psubusb %%mm4, %%mm7                           \n\t"
                "paddb %%mm4, %%mm7                             \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $16, %%mm7                               \n\t"
                "psubusb %%mm4, %%mm7                           \n\t"
                "paddb %%mm4, %%mm7                             \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $32, %%mm7                               \n\t"
                "psubusb %%mm4, %%mm7                           \n\t"
                "paddb %%mm4, %%mm7                             \n\t"
#endif
                PAVGB(%%mm6, %%mm7)                                   // a=(max + min)/2
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "movq %%mm7, temp0                              \n\t"

                "movq (%0), %%mm0                               \n\t" // L10
                "movq %%mm0, %%mm1                              \n\t" // L10
                "movq %%mm0, %%mm2                              \n\t" // L10
                "psllq $8, %%mm1                                \n\t"
                "psrlq $8, %%mm2                                \n\t"
                "movd -4(%0), %%mm3                             \n\t"
                "movd 8(%0), %%mm4                              \n\t"
                "psrlq $24, %%mm3                               \n\t"
                "psllq $56, %%mm4                               \n\t"
                "por %%mm3, %%mm1                               \n\t" // L00
                "por %%mm4, %%mm2                               \n\t" // L20
                "movq %%mm1, %%mm3                              \n\t" // L00
                PAVGB(%%mm2, %%mm1)                                   // (L20 + L00)/2
                PAVGB(%%mm0, %%mm1)                                   // (L20 + L00 + 2L10)/4
                "psubusb %%mm7, %%mm0                           \n\t"
                "psubusb %%mm7, %%mm2                           \n\t"
                "psubusb %%mm7, %%mm3                           \n\t"
                "pcmpeqb b00, %%mm0                             \n\t" // L10 > a ? 0 : -1
                "pcmpeqb b00, %%mm2                             \n\t" // L20 > a ? 0 : -1
                "pcmpeqb b00, %%mm3                             \n\t" // L00 > a ? 0 : -1
                "paddb %%mm2, %%mm0                             \n\t"
                "paddb %%mm3, %%mm0                             \n\t"

                "movq (%%eax), %%mm2                            \n\t" // L11
                "movq %%mm2, %%mm3                              \n\t" // L11
                "movq %%mm2, %%mm4                              \n\t" // L11
                "psllq $8, %%mm3                                \n\t"
                "psrlq $8, %%mm4                                \n\t"
                "movd -4(%%eax), %%mm5                          \n\t"
                "movd 8(%%eax), %%mm6                           \n\t"
                "psrlq $24, %%mm5                               \n\t"
                "psllq $56, %%mm6                               \n\t"
                "por %%mm5, %%mm3                               \n\t" // L01
                "por %%mm6, %%mm4                               \n\t" // L21
                "movq %%mm3, %%mm5                              \n\t" // L01
                PAVGB(%%mm4, %%mm3)                                   // (L21 + L01)/2
                PAVGB(%%mm2, %%mm3)                                   // (L21 + L01 + 2L11)/4
                "psubusb %%mm7, %%mm2                           \n\t"
                "psubusb %%mm7, %%mm4                           \n\t"
                "psubusb %%mm7, %%mm5                           \n\t"
                "pcmpeqb b00, %%mm2                             \n\t" // L11 > a ? 0 : -1
                "pcmpeqb b00, %%mm4                             \n\t" // L21 > a ? 0 : -1
                "pcmpeqb b00, %%mm5                             \n\t" // L01 > a ? 0 : -1
                "paddb %%mm4, %%mm2                             \n\t"
                "paddb %%mm5, %%mm2                             \n\t"
// 0, 2, 3, 1
// DERING_CORE filters the line at dst after loading the line below it (src).
// sx / psx / ppsx hold the summed threshold flags of the new / previous / pre-previous line,
// lx / plx / pplx the horizontally low passed pixels of the same 3 lines, t0 and t1 are scratch.
// The callers rotate the registers so that every line is loaded and low passed only once.
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                "movq " #src ", " #sx "                         \n\t" /* src[0] */\
                "movq " #sx ", " #lx "                          \n\t" /* src[0] */\
                "movq " #sx ", " #t0 "                          \n\t" /* src[0] */\
                "psllq $8, " #lx "                              \n\t"\
                "psrlq $8, " #t0 "                              \n\t"\
                "movd -4" #src ", " #t1 "                       \n\t"\
                "psrlq $24, " #t1 "                             \n\t"\
                "por " #t1 ", " #lx "                           \n\t" /* src[-1] */\
                "movd 8" #src ", " #t1 "                        \n\t"\
                "psllq $56, " #t1 "                             \n\t"\
                "por " #t1 ", " #t0 "                           \n\t" /* src[+1] */\
                "movq " #lx ", " #t1 "                          \n\t" /* src[-1] */\
                PAVGB(t0, lx)                                         /* (src[-1] + src[+1])/2 */\
                PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
                PAVGB(lx, pplx)                                      \
                "movq " #lx ", temp1                            \n\t"\
                "movq temp0, " #lx "                            \n\t"\
                "psubusb " #lx ", " #t1 "                       \n\t"\
                "psubusb " #lx ", " #t0 "                       \n\t"\
                "psubusb " #lx ", " #sx "                       \n\t"\
                "movq b00, " #lx "                              \n\t"\
                "pcmpeqb " #lx ", " #t1 "                       \n\t" /* src[-1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #t0 "                       \n\t" /* src[+1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #sx "                       \n\t" /* src[0]  > a ? 0 : -1*/\
                "paddb " #t1 ", " #t0 "                         \n\t"\
                "paddb " #t0 ", " #sx "                         \n\t"\
\
                PAVGB(plx, pplx)                                      /* filtered */\
                "movq " #dst ", " #t0 "                         \n\t" /* dst */\
                "movq " #t0 ", " #t1 "                          \n\t" /* dst */\
                "psubusb pQPb2, " #t0 "                         \n\t"\
                "paddusb pQPb2, " #t1 "                         \n\t"\
                PMAXUB(t0, pplx)\
                PMINUB(t1, pplx, t0)\
                "paddb " #sx ", " #ppsx "                       \n\t"\
                "paddb " #psx ", " #ppsx "                      \n\t"\
        "#paddb b02, " #ppsx "                          \n\t"\
                "pand b08, " #ppsx "                            \n\t"\
                "pcmpeqb " #lx ", " #ppsx "                     \n\t"\
                "pand " #ppsx ", " #pplx "                      \n\t"\
                "pandn " #dst ", " #ppsx "                      \n\t"\
                "por " #pplx ", " #ppsx "                       \n\t"\
                "movq " #ppsx ", " #dst "                       \n\t"\
                "movq temp1, " #lx "                            \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


                : : "r" (src), "r" (stride), "r" (QP)
                : "%eax", "%ebx"
        );
#else
        int y;
        int min=255;
        int max=0;
        int avg;
        uint8_t *p;
        // per line flags: bit x (0-9) = the 3 pixels x-1, x, x+1 are all above avg,
        // bit 16+x = they are all not above avg
        // unsigned, so that the shifts below are well defined (shifting a negative int is not)
        unsigned int s[10];

        // min / max of the inner 8x8 pixels
        for(y=1; y<9; y++)
        {
                int x;
                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(*p > max) max= *p;
                        if(*p < min) min= *p;
                }
        }
        avg= (min + max + 1)/2;

        // classify all 10x10 pixels against avg
        for(y=0; y<10; y++)
        {
                int x;
                unsigned int t = 0;
                p= src + stride*y;
                for(x=0; x<10; x++)
                {
                        if(*p > avg) t |= (1u<<x);
                        p++;
                }
                t |= (~t)<<16;          // upper half = inverted lower half
                t &= (t<<1) & (t>>1);   // keep a bit only if both horizontal neighbours agree
                s[y] = t;
        }

        for(y=1; y<9; y++)
        {
                int x;
                // a bit survives only if the line above and the line below agree too
                unsigned int t = s[y-1] & s[y] & s[y+1];
                t|= t>>16;              // "all above" or "all not above" -> filter this pixel

                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(t & (1u<<x))
                        {
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                                f= (f + 8)>>4;

                                // never move a pixel by more than 2*QP
                                if     (*p + 2*QP < f) *p= *p + 2*QP;
                                else if(*p - 2*QP > f) *p= *p - 2*QP;
                                else *p=f;
                        }
                }
        }

#endif
}
2124
2125 /**
2126  * Deinterlaces the given block by linear interpolation
2127  * will be called for every 8x8 block and can read & write from line 4-15
2128  * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2129  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
      * this filter reads lines 4, 6, 8, 10 and 12 and replaces each of the lines 5, 7, 9 and 11
      * by the average of the line above and the line below it, 8 pixels per line
      * the C version truncates ((a+b)>>1), the MMX2 / 3DNow version averages through the
      * PAVGB macro, which is defined elsewhere in this file
      * @param src    line 0 of the block
      * @param stride distance in bytes between 2 lines
2130  */
2131 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
2132 {
2133 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        // the line offsets in the table below are relative to line 4 of the block
2134         src+= 4*stride;
2135         asm volatile(
2136                 "leal (%0, %1), %%eax                           \n\t"
2137                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
2138 //      0       1       2       3       4       5       6       7       8       9
2139 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2140
        // mm0 and mm1 alternately hold the last 2 even lines, so every even line is loaded only once
2141                 "movq (%0), %%mm0                               \n\t" // L0
2142                 "movq (%%eax, %1), %%mm1                        \n\t" // L2
2143                 PAVGB(%%mm1, %%mm0)
2144                 "movq %%mm0, (%%eax)                            \n\t" // L1 = avg(L0, L2)
2145                 "movq (%0, %1, 4), %%mm0                        \n\t" // L4
2146                 PAVGB(%%mm0, %%mm1)
2147                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" // L3 = avg(L2, L4)
2148                 "movq (%%ebx, %1), %%mm1                        \n\t" // L6
2149                 PAVGB(%%mm1, %%mm0)
2150                 "movq %%mm0, (%%ebx)                            \n\t" // L5 = avg(L4, L6)
2151                 "movq (%0, %1, 8), %%mm0                        \n\t" // L8
2152                 PAVGB(%%mm0, %%mm1)
2153                 "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // L7 = avg(L6, L8)
2154
2155                 : : "r" (src), "r" (stride)
2156                 : "%eax", "%ebx"
2157         );
2158 #else
2159         int x;
        // start at line 4 of the block, then walk over the 8 columns
2160         src+= 4*stride;
2161         for(x=0; x<8; x++)
2162         {
                // every odd line becomes the truncated mean of its 2 vertical neighbours
2163                 src[stride]   = (src[0]        + src[stride*2])>>1;
2164                 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
2165                 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
2166                 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
2167                 src++;
2168         }
2169 #endif
2170 }
2171
/**
 * Deinterlaces the given block by cubic interpolation
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter reads lines 3, 5, 7, 9, 11, 13 and 15 and rewrites lines 6, 8, 10 and 12,
 * each as (-a + 9b + 9d - e)/16 where b and d are the lines directly above and below
 * the rewritten line and a and e are the lines 3 above and 3 below it
 * the result is limited to 0..255: the MMX2 / 3DNow version saturates through packuswb,
 * the C version clips explicitly (it used to wrap around on over- and undershoot)
 * @param src    line 0 of the block
 * @param stride distance in bytes between 2 lines
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        // the line offsets in the table below are relative to line 3 of the block
        src+= stride*3;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
                "leal (%%ebx, %1, 4), %%ecx                     \n\t"
                "addl %1, %%ecx                                 \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx

// c is the line which is written, a b d e are the lines it is interpolated from
#define DEINT_CUBIC(a,b,c,d,e)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm1                             \n\t"\
                "movq " #d ", %%mm2                             \n\t"\
                "movq " #e ", %%mm3                             \n\t"\
                PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
                PAVGB(%%mm3, %%mm0)                                     /* (a+e) /2 */\
                "movq %%mm0, %%mm2                              \n\t"\
                "punpcklbw %%mm7, %%mm0                         \n\t"\
                "punpckhbw %%mm7, %%mm2                         \n\t"\
                "movq %%mm1, %%mm3                              \n\t"\
                "punpcklbw %%mm7, %%mm1                         \n\t"\
                "punpckhbw %%mm7, %%mm3                         \n\t"\
                "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
                "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
                "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
                "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
                "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
                "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
                "packuswb %%mm3, %%mm1                          \n\t"\
                "movq %%mm1, " #c "                             \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx", "ecx"
        );
#else
        int x;
        // the line offsets below are relative to line 3 of the block
        src+= stride*3;
        for(x=0; x<8; x++)
        {
                int line;
                // lines 3, 5, 7 and 9 are rebuilt, only the even lines are read,
                // so the 4 results of a column do not depend on each other
                for(line=3; line<=9; line+=2)
                {
                        const int sum= - src[stride*(line-3)] + 9*src[stride*(line-1)]
                                       + 9*src[stride*(line+1)] - src[stride*(line+3)];
                        int v;

                        // clip to 0..255, a negative sum is never shifted
                        if(sum < 0) v= 0;
                        else
                        {
                                v= sum>>4;
                                if(v > 255) v= 255;
                        }
                        src[stride*line]= v;
                }
                src++;
        }
#endif
}
2236
2237 /**
2238  * Deinterlaces the given block
2239  * will be called for every 8x8 block and can read & write from line 4-15
2240  * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
2241  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2242  * will shift the image up by 1 line (FIXME if this is a problem)
2243  * this filter will read lines 4-13 and write 4-11
2244  */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
        /*
         * Works on an 8 pixel wide column starting 4 lines below src.
         * Every one of the 8 processed lines is replaced by
         *     (line + 2*(line below) + (2 lines below)) / 4
         * The lines are processed top down, so each line only reads lines which
         * have not been modified yet.
         */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        // PAVGB is a macro defined elsewhere in this file, presumably the packed
        // byte average of the selected instruction set (TODO confirm); applying it
        // twice gives the (a + 2b + c)/4 blend noted in the comments below
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                   // L0+L2
                "movq (%%eax), %%mm2                            \n\t" // L1
                PAVGB(%%mm2, %%mm0)
                "movq %%mm0, (%0)                               \n\t"
                "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
                PAVGB(%%mm0, %%mm2)                                   // L1+L3
                PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
                "movq %%mm2, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
                PAVGB(%%mm2, %%mm1)                                   // L2+L4
                PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
                "movq %%mm1, (%%eax, %1)                        \n\t"
                "movq (%%ebx), %%mm1                            \n\t" // L5
                PAVGB(%%mm1, %%mm0)                                   // L3+L5
                PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
                PAVGB(%%mm0, %%mm2)                                   // L4+L6
                PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
                "movq %%mm2, (%0, %1, 4)                        \n\t"
                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                PAVGB(%%mm2, %%mm1)                                   // L5+L7
                PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
                "movq %%mm1, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
                PAVGB(%%mm1, %%mm0)                                   // L6+L8
                PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
                "movq %%mm0, (%%ebx, %1)                        \n\t"
                "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
                PAVGB(%%mm0, %%mm2)                                   // L7+L9
                PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                // "memory": the image is read and written through the pointer
                // operands, which the compiler can not see from the operand list
                : "%eax", "%ebx", "memory"
        );
#else
        int x, y;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                uint8_t *line= src + x;
                for(y=0; y<8; y++)
                {
                        // reads this line and the 2 lines below it, none of them has been written yet
                        line[0]= (line[0] + 2*line[stride] + line[stride*2])>>2;
                        line+= stride;
                }
        }
#endif
}
2311
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * Lines 5, 7, 9 and 11 are replaced by the per pixel median of the line itself and the
 * lines directly above and below it; lines 4-12 are read, the even lines are not changed.
 * Only the first 8 pixels of every line are touched.
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
        src+= 4*stride;
#ifdef HAVE_MMX2
        // median(a,b,c) = min( max(a,b), max( min(a,b), c ) )
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm2                        \n\t" // L2
                "movq (%%eax), %%mm1                            \n\t" // L1
                "movq %%mm0, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
                "pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
                "pmaxub %%mm2, %%mm1                            \n\t" // max(min(L0, L1), L2)
                "pminub %%mm1, %%mm0                            \n\t"
                "movq %%mm0, (%%eax)                            \n\t"

                "movq (%0, %1, 4), %%mm0                        \n\t" // L4
                "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
                "pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
                "pmaxub %%mm0, %%mm1                            \n\t" // max(min(L2, L3), L4)
                "pminub %%mm1, %%mm2                            \n\t"
                "movq %%mm2, (%%eax, %1, 2)                     \n\t"

                "movq (%%ebx), %%mm2                            \n\t" // L5
                "movq (%%ebx, %1), %%mm1                        \n\t" // L6
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" // max(L5, L4)
                "pminub %%mm3, %%mm0                            \n\t" // min(L5, L4)
                "pmaxub %%mm1, %%mm0                            \n\t" // max(min(L5, L4), L6)
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx)                            \n\t"

                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                "movq (%0, %1, 8), %%mm0                        \n\t" // L8
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" // max(L7, L8)
                "pminub %%mm3, %%mm0                            \n\t" // min(L7, L8)
                "pmaxub %%mm1, %%mm0                            \n\t" // max(min(L7, L8), L6)
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                // "memory": the image is read and written through the pointer operands
                : "%eax", "%ebx", "memory"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                "pxor %%mm7, %%mm7                              \n\t"

// MEDIAN(a,b,c) loads the 3 lines a, b, c and stores its result to b, it selects the
// result with masks built by saturated subtract + compare against 0 (no pminub / pmaxub
// is used here); presumably bit exact with the MMX2 version above - TODO confirm
#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm2                             \n\t"\
                "movq " #c ", %%mm1                             \n\t"\
                "movq %%mm0, %%mm3                              \n\t"\
                "movq %%mm1, %%mm4                              \n\t"\
                "movq %%mm2, %%mm5                              \n\t"\
                "psubusb %%mm1, %%mm3                           \n\t"\
                "psubusb %%mm2, %%mm4                           \n\t"\
                "psubusb %%mm0, %%mm5                           \n\t"\
                "pcmpeqb %%mm7, %%mm3                           \n\t"\
                "pcmpeqb %%mm7, %%mm4                           \n\t"\
                "pcmpeqb %%mm7, %%mm5                           \n\t"\
                "movq %%mm3, %%mm6                              \n\t"\
                "pxor %%mm4, %%mm3                              \n\t"\
                "pxor %%mm5, %%mm4                              \n\t"\
                "pxor %%mm6, %%mm5                              \n\t"\
                "por %%mm3, %%mm1                               \n\t"\
                "por %%mm4, %%mm2                               \n\t"\
                "por %%mm5, %%mm0                               \n\t"\
                "pand %%mm2, %%mm0                              \n\t"\
                "pand %%mm1, %%mm0                              \n\t"\
                "movq %%mm0, " #b "                             \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                // "memory": the image is read and written through the pointer operands
                : "%eax", "%ebx", "memory"
        );
#endif // MMX
#else
        // plain C version of the same 3 tap vertical median
        int x, y;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                uint8_t *col= src + x;
                for(y=0; y<4; y++)
                {
                        const int above= col[0       ];
                        const int cur  = col[stride  ];
                        const int below= col[stride*2];
                        int lo= above < cur ? above : cur;      // min(above, cur)
                        int hi= above < cur ? cur : above;      // max(above, cur)

                        if(below > lo) lo= below;               // max(min(above, cur), below)
                        col[stride]= hi < lo ? hi : lo;         // the median of the 3
                        col+= stride*2;                         // the even lines are only read
                }
        }
#endif
}
2430
2431 #ifdef HAVE_MMX
2432 /**
2433  * transposes and shift the given 8x8 Block into dst1 and dst2
2434  */
2435 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2436 {
2437         asm(
2438                 "leal (%0, %1), %%eax                           \n\t"
2439                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
2440 //      0       1       2       3       4       5       6       7       8       9
2441 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2442                 "movq (%0), %%mm0               \n\t" // 12345678
2443                 "movq (%%eax), %%mm1            \n\t" // abcdefgh
2444                 "movq %%mm0, %%mm2              \n\t" // 12345678
2445                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2446                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2447
2448                 "movq (%%eax, %1), %%mm1        \n\t"
2449                 "movq (%%eax, %1, 2), %%mm3     \n\t"
2450                 "movq %%mm1, %%mm4              \n\t"
2451                 "punpcklbw %%mm3, %%mm1         \n\t"
2452                 "punpckhbw %%mm3, %%mm4         \n\t"
2453
2454                 "movq %%mm0, %%mm3              \n\t"
2455                 "punpcklwd %%mm1, %%mm0         \n\t"
2456                 "punpckhwd %%mm1, %%mm3         \n\t"
2457                 "movq %%mm2, %%mm1              \n\t"
2458                 "punpcklwd %%mm4, %%mm2         \n\t"
2459                 "punpckhwd %%mm4, %%mm1         \n\t"
2460
2461                 "movd %%mm0, 128(%2)            \n\t"
2462                 "psrlq $32, %%mm0               \n\t"
2463                 "movd %%mm0, 144(%2)            \n\t"
2464                 "movd %%mm3, 160(%2)            \n\t"
2465                 "psrlq $32, %%mm3               \n\t"
2466                 "movd %%mm3, 176(%2)            \n\t"
2467                 "movd %%mm3, 48(%3)             \n\t"
2468                 "movd %%mm2, 192(%2)            \n\t"
2469                 "movd %%mm2, 64(%3)             \n\t"
2470                 "psrlq $32, %%mm2               \n\t"
2471                 "movd %%mm2, 80(%3)             \n\t"
2472                 "movd %%mm1, 96(%3)             \n\t"
2473                 "psrlq $32, %%mm1               \n\t"
2474                 "movd %%mm1, 112(%3)            \n\t"
2475
2476                 "movq (%0, %1, 4), %%mm0        \n\t" // 12345678
2477                 "movq (%%ebx), %%mm1            \n\t" // abcdefgh
2478                 "movq %%mm0, %%mm2              \n\t" // 12345678
2479                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2480                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2481
2482                 "movq (%%ebx, %1), %%mm1        \n\t"
2483                 "movq (%%ebx, %1, 2), %%mm3     \n\t"
2484                 "movq %%mm1, %%mm4              \n\t"
2485                 "punpcklbw %%mm3, %%mm1         \n\t"
2486                 "punpckhbw %%mm3, %%mm4         \n\t"
2487
2488                 "movq %%mm0, %%mm3              \n\t"
2489                 "punpcklwd %%mm1, %%mm0         \n\t"
2490                 "punpckhwd %%mm1, %%mm3         \n\t"
2491                 "movq %%mm2, %%mm1              \n\t"
2492                 "punpcklwd %%mm4, %%mm2         \n\t"
2493                 "punpckhwd %%mm4, %%mm1         \n\t"
2494
2495                 "movd %%mm0, 132(%2)            \n\t"
2496                 "psrlq $32, %%mm0               \n\t"
2497                 "movd %%mm0, 148(%2)            \n\t"
2498                 "movd %%mm3, 164(%2)            \n\t"
2499                 "psrlq $32, %%mm3               \n\t"
2500                 "movd %%mm3, 180(%2)            \n\t"
2501                 "movd %%mm3, 52(%3)             \n\t"
2502                 "movd %%mm2, 196(%2)            \n\t"
2503                 "movd %%mm2, 68(%3)             \n\t"
2504                 "psrlq $32, %%mm2               \n\t"
2505                 "movd %%mm2, 84(%3)             \n\t"
2506                 "movd %%mm1, 100(%3)            \n\t"
2507                 "psrlq $32, %%mm1               \n\t"
2508                 "movd %%mm1, 116(%3)            \n\t"
2509
2510
2511         :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2512         : "%eax", "%ebx"
2513         );
2514 }
2515
2516 /**
2517  * transposes the given 8x8 block
2518  */
2519 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2520 {
2521         asm(
2522                 "leal (%0, %1), %%eax                           \n\t"
2523                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
2524 //      0       1       2       3       4       5       6       7       8       9
2525 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2526                 "movq (%2), %%mm0               \n\t" // 12345678
2527                 "movq 16(%2), %%mm1             \n\t" // abcdefgh
2528                 "movq %%mm0, %%mm2              \n\t" // 12345678
2529                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2530                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2531
2532                 "movq 32(%2), %%mm1             \n\t"
2533                 "movq 48(%2), %%mm3             \n\t"
2534                 "movq %%mm1, %%mm4              \n\t"
2535                 "punpcklbw %%mm3, %%mm1         \n\t"
2536                 "punpckhbw %%mm3, %%mm4         \n\t"
2537
2538                 "movq %%mm0, %%mm3              \n\t"
2539                 "punpcklwd %%mm1, %%mm0         \n\t"
2540                 "punpckhwd %%mm1, %%mm3         \n\t"
2541                 "movq %%mm2, %%mm1              \n\t"
2542                 "punpcklwd %%mm4, %%mm2         \n\t"
2543                 "punpckhwd %%mm4, %%mm1         \n\t"
2544
2545                 "movd %%mm0, (%0)               \n\t"
2546                 "psrlq $32, %%mm0               \n\t"
2547                 "movd %%mm0, (%%eax)            \n\t"
2548                 "movd %%mm3, (%%eax, %1)        \n\t"
2549                 "psrlq $32, %%mm3               \n\t"
2550                 "movd %%mm3, (%%eax, %1, 2)     \n\t"
2551                 "movd %%mm2, (%0, %1, 4)        \n\t"
2552                 "psrlq $32, %%mm2               \n\t"
2553                 "movd %%mm2, (%%ebx)            \n\t"
2554                 "movd %%mm1, (%%ebx, %1)        \n\t"
2555                 "psrlq $32, %%mm1               \n\t"
2556                 "movd %%mm1, (%%ebx, %1, 2)     \n\t"
2557
2558
2559                 "movq 64(%2), %%mm0             \n\t" // 12345678
2560                 "movq 80(%2), %%mm1             \n\t" // abcdefgh
2561                 "movq %%mm0, %%mm2              \n\t" // 12345678
2562                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2563                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2564
2565                 "movq 96(%2), %%mm1             \n\t"
2566                 "movq 112(%2), %%mm3            \n\t"
2567                 "movq %%mm1, %%mm4              \n\t"
2568                 "punpcklbw %%mm3, %%mm1         \n\t"
2569                 "punpckhbw %%mm3, %%mm4         \n\t"
2570
2571                 "movq %%mm0, %%mm3              \n\t"
2572                 "punpcklwd %%mm1, %%mm0         \n\t"
2573                 "punpckhwd %%mm1, %%mm3         \n\t"
2574                 "movq %%mm2, %%mm1              \n\t"
2575                 "punpcklwd %%mm4, %%mm2         \n\t"
2576                 "punpckhwd %%mm4, %%mm1         \n\t"
2577
2578                 "movd %%mm0, 4(%0)              \n\t"
2579                 "psrlq $32, %%mm0               \n\t"
2580                 "movd %%mm0, 4(%%eax)           \n\t"
2581                 "movd %%mm3, 4(%%eax, %1)       \n\t"
2582                 "psrlq $32, %%mm3               \n\t"
2583                 "movd %%mm3, 4(%%eax, %1, 2)    \n\t"
2584                 "movd %%mm2, 4(%0, %1, 4)       \n\t"
2585                 "psrlq $32, %%mm2               \n\t"
2586                 "movd %%mm2, 4(%%ebx)           \n\t"
2587                 "movd %%mm1, 4(%%ebx, %1)       \n\t"
2588                 "psrlq $32, %%mm1               \n\t"
2589                 "movd %%mm1, 4(%%ebx, %1, 2)    \n\t"
2590
2591         :: "r" (dst), "r" (dstStride), "r" (src)
2592         : "%eax", "%ebx"
2593         );
2594 }
2595 #endif
2596
/* optional bridge to the OpenDivX postprocessing code */
#ifdef HAVE_ODIVX_POSTPROCESS
#include "../opendivx/postprocess.h"
/* if non zero, postprocess() and postprocess2() hand the whole picture to odivx_postprocess()
   and getPpModeForQuality() returns the OpenDivX mode flags instead of the flags of this file */
int use_old_pp=0;
#endif

/* forward declaration, needed because postprocess() and postprocess2() call it;
   they pass isColor=0 for plane 0 and isColor=1 / 2 for planes 1 / 2 */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
        QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2604
/* -pp Command line Help
NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?

-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...

long form example:
-pp vdeblock:autoq,hdeblock:autoq,linblenddeint         -pp default,-vdeblock
short form example:
-pp vb:a,hb:a,lb                                        -pp de,-vb

Filters                 Options
short   long name       short   long option     Description
*       *               a       autoq           cpu power dependent enabler
                        c       chrom           chrominance filtering enabled
                        y       nochrom         chrominance filtering disabled
hb      hdeblock                                horizontal deblocking filter
vb      vdeblock                                vertical deblocking filter
vr      rkvdeblock
h1      x1hdeblock                              Experimental horizontal deblock filter 1
v1      x1vdeblock                              Experimental vertical deblock filter 1
dr      dering                                  not implemented yet
al      autolevels                              automatic brightness / contrast fixer
                        f       fullyrange      stretch luminance range to (0..255)
lb      linblenddeint                           linear blend deinterlacer
li      linipoldeint                            linear interpolating deinterlacer
ci      cubicipoldeint                          cubic interpolating deinterlacer
md      mediandeint                             median deinterlacer
de      default                                 hdeblock:a,vdeblock:a,dering:a,autolevels
fa      fast                                    x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
*/
2635
2636 /**
2637  * returns a PPMode struct which will have a non 0 error variable if an error occured
2638  * name is the string after "-pp" on the command line
2639  * quality is a number from 0 to GET_PP_QUALITY_MAX
2640  */
2641 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2642 {
2643         char temp[GET_MODE_BUFFER_SIZE];
2644         char *p= temp;
2645         char *filterDelimiters= ",";
2646         char *optionDelimiters= ":";
2647         struct PPMode ppMode= {0,0,0,0,0,0};
2648         char *filterToken;
2649
2650         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2651
2652         for(;;){
2653                 char *filterName;
2654                 int q= GET_PP_QUALITY_MAX;
2655                 int chrom=-1;
2656                 char *option;
2657                 char *options[OPTIONS_ARRAY_SIZE];
2658                 int i;
2659                 int filterNameOk=0;
2660                 int numOfUnknownOptions=0;
2661                 int enable=1; //does the user want us to enabled or disabled the filter
2662
2663                 filterToken= strtok(p, filterDelimiters);
2664                 if(filterToken == NULL) break;
2665                 p+= strlen(filterToken) + 1;
2666                 filterName= strtok(filterToken, optionDelimiters);
2667                 printf("%s::%s\n", filterToken, filterName);
2668
2669                 if(*filterName == '-')
2670                 {
2671                         enable=0;
2672                         filterName++;
2673                 }
2674                 for(;;){ //for all options
2675                         option= strtok(NULL, optionDelimiters);
2676                         if(option == NULL) break;
2677
2678                         printf("%s\n", option);
2679                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2680                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2681                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2682                         else
2683                         {
2684                                 options[numOfUnknownOptions] = option;
2685                                 numOfUnknownOptions++;
2686                                 options[numOfUnknownOptions] = NULL;
2687                         }
2688                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2689                 }
2690
2691                 /* replace stuff from the replace Table */
2692                 for(i=0; replaceTable[2*i]!=NULL; i++)
2693                 {
2694                         if(!strcmp(replaceTable[2*i], filterName))
2695                         {
2696                                 int newlen= strlen(replaceTable[2*i + 1]);
2697                                 int plen;
2698                                 int spaceLeft;
2699
2700                                 if(p==NULL) p= temp, *p=0;      //last filter
2701                                 else p--, *p=',';               //not last filter
2702
2703                                 plen= strlen(p);
2704                                 spaceLeft= (int)p - (int)temp + plen;
2705                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
2706                                 {
2707                                         ppMode.error++;
2708                                         break;
2709                                 }
2710                                 memmove(p + newlen, p, plen+1);
2711                                 memcpy(p, replaceTable[2*i + 1], newlen);
2712                                 filterNameOk=1;
2713                         }
2714                 }
2715
2716                 for(i=0; filters[i].shortName!=NULL; i++)
2717                 {
2718                         if(   !strcmp(filters[i].longName, filterName)
2719                            || !strcmp(filters[i].shortName, filterName))
2720                         {
2721                                 ppMode.lumMode &= ~filters[i].mask;
2722                                 ppMode.chromMode &= ~filters[i].mask;
2723
2724                                 filterNameOk=1;
2725                                 if(!enable) break; // user wants to disable it
2726
2727                                 if(q >= filters[i].minLumQuality)
2728                                         ppMode.lumMode|= filters[i].mask;
2729                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2730                                         if(q >= filters[i].minChromQuality)
2731                                                 ppMode.chromMode|= filters[i].mask;
2732
2733                                 if(filters[i].mask == LEVEL_FIX)
2734                                 {
2735                                         int o;
2736                                         ppMode.minAllowedY= 16;
2737                                         ppMode.maxAllowedY= 234;
2738                                         for(o=0; options[o]!=NULL; o++)
2739                                                 if(  !strcmp(options[o],"fullyrange")
2740                                                    ||!strcmp(options[o],"f"))
2741                                                 {
2742                                                         ppMode.minAllowedY= 0;
2743                                                         ppMode.maxAllowedY= 255;
2744                                                         numOfUnknownOptions--;
2745                                                 }
2746                                 }
2747                         }
2748                 }
2749                 if(!filterNameOk) ppMode.error++;
2750                 ppMode.error += numOfUnknownOptions;
2751         }
2752
2753 #ifdef HAVE_ODIVX_POSTPROCESS
2754         if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2755         if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2756         if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2757         if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2758         if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2759         if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2760 #endif
2761
2762         return ppMode;
2763 }
2764
2765 /**
2766  * ...
2767  */
2768 void  postprocess(unsigned char * src[], int src_stride,
2769                  unsigned char * dst[], int dst_stride,
2770                  int horizontal_size,   int vertical_size,
2771                  QP_STORE_T *QP_store,  int QP_stride,
2772                                           int mode)
2773 {
2774 /*
2775         static int qual=0;
2776
2777         struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2778         qual++;
2779         qual%=7;
2780         printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2781         postprocess2(src, src_stride, dst, dst_stride,
2782                  horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2783
2784         return;
2785 */
2786         static QP_STORE_T zeroArray[2048/8];
2787         if(QP_store==NULL)
2788         {
2789                 QP_store= zeroArray;
2790                 QP_stride= 0;
2791         }
2792
2793 #ifdef HAVE_ODIVX_POSTPROCESS
2794 // Note: I could make this shit outside of this file, but it would mean one
2795 // more function call...
2796         if(use_old_pp){
2797             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2798             return;
2799         }
2800 #endif
2801
2802         postProcess(src[0], src_stride, dst[0], dst_stride,
2803                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
2804
2805         horizontal_size >>= 1;
2806         vertical_size   >>= 1;
2807         src_stride      >>= 1;
2808         dst_stride      >>= 1;
2809         mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2810 //      mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
2811 //               MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
2812
2813         if(1)
2814         {
2815                 postProcess(src[1], src_stride, dst[1], dst_stride,
2816                         horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2817                 postProcess(src[2], src_stride, dst[2], dst_stride,
2818                         horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
2819         }
2820         else
2821         {
2822                 memcpy(dst[1], src[1], src_stride*horizontal_size);
2823                 memcpy(dst[2], src[2], src_stride*horizontal_size);
2824         }
2825 }
2826
2827 void  postprocess2(unsigned char * src[], int src_stride,
2828                  unsigned char * dst[], int dst_stride,
2829                  int horizontal_size,   int vertical_size,
2830                  QP_STORE_T *QP_store,  int QP_stride,
2831                  struct PPMode *mode)
2832 {
2833
2834         static QP_STORE_T zeroArray[2048/8];
2835         if(QP_store==NULL)
2836         {
2837                 QP_store= zeroArray;
2838                 QP_stride= 0;
2839         }
2840
2841 #ifdef HAVE_ODIVX_POSTPROCESS
2842 // Note: I could make this shit outside of this file, but it would mean one
2843 // more function call...
2844         if(use_old_pp){
2845             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
2846             mode->oldMode);
2847             return;
2848         }
2849 #endif
2850
2851         postProcess(src[0], src_stride, dst[0], dst_stride,
2852                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
2853
2854         horizontal_size >>= 1;
2855         vertical_size   >>= 1;
2856         src_stride      >>= 1;
2857         dst_stride      >>= 1;
2858
2859         postProcess(src[1], src_stride, dst[1], dst_stride,
2860                 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2861         postProcess(src[2], src_stride, dst[2], dst_stride,
2862                 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2863 }
2864
2865
2866 /**
2867  * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
2868  * 0 <= quality <= 6
2869  */
2870 int getPpModeForQuality(int quality){
2871         int modes[1+GET_PP_QUALITY_MAX]= {
2872                 0,
2873 #if 1
2874                 // horizontal filters first
2875                 LUM_H_DEBLOCK,
2876                 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2877                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2878                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2879                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2880                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2881 #else
2882                 // vertical filters first
2883                 LUM_V_DEBLOCK,
2884                 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2885                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2886                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2887                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2888                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2889 #endif
2890         };
2891
2892 #ifdef HAVE_ODIVX_POSTPROCESS
2893         int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2894                 0,
2895                 PP_DEBLOCK_Y_H,
2896                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2897                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2898                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2899                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2900                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2901         };
2902         if(use_old_pp) return odivx_modes[quality];
2903 #endif
2904         return modes[quality];
2905 }
2906
2907 /**
2908  * Copies a block from src to dst and fixes the blacklevel
2909  * numLines must be a multiple of 4
2910  * levelFix == 0 -> don't touch the brightness & contrast
 *
 * Every copied line is 8 bytes wide in the MMX paths (one movq) and BLOCK_SIZE bytes wide in
 * the C paths.
 *
 * dst / dstStride: first destination line and the distance in bytes between destination lines
 * src / srcStride: first source line and the distance in bytes between source lines
 * numLines:        number of lines to copy; the MMX level-fix path does not read it and
 *                  always processes 8 lines
 * levelFix:        non-zero selects the level-fixing copy; only the MMX build actually
 *                  rescales the pixels, the C build performs a plain copy in both cases
2911  */
2912 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2913         int numLines, int levelFix)
2914 {
2915 #ifndef HAVE_MMX
2916         int i;
2917 #endif
2918         if(levelFix)
2919         {
2920 #ifdef HAVE_MMX
                                        /* Level-fixing copy. eax / ebx hold 2*srcStride / 2*dstStride, mm2 the
                                           packed offset, mm3 the packed scale and mm4 is zero.
                                           Each SCALED_CPY handles 2 lines: the 8 bytes of a line are widened to
                                           16 bit, packedYOffset is subtracted, the result is shifted left by 6 and
                                           multiplied with packedYScale keeping the high 16 bits (pmulhw), and the
                                           words are packed back to bytes with unsigned saturation.
                                           SCALED_CPY advances the source pointer itself; the destination pointer
                                           is advanced between the 4 expansions, so 8 lines are written in total.
                                           src and dst are read-write operands: the local copies are modified. */
2921                                         asm volatile(
2922                                                 "leal (%2,%2), %%eax    \n\t"
2923                                                 "leal (%3,%3), %%ebx    \n\t"
2924                                                 "movq packedYOffset, %%mm2      \n\t"
2925                                                 "movq packedYScale, %%mm3       \n\t"
2926                                                 "pxor %%mm4, %%mm4      \n\t"
2927
2928 #define SCALED_CPY                                      \
2929                                                 "movq (%0), %%mm0       \n\t"\
2930                                                 "movq (%0), %%mm5       \n\t"\
2931                                                 "punpcklbw %%mm4, %%mm0 \n\t"\
2932                                                 "punpckhbw %%mm4, %%mm5 \n\t"\
2933                                                 "psubw %%mm2, %%mm0     \n\t"\
2934                                                 "psubw %%mm2, %%mm5     \n\t"\
2935                                                 "movq (%0,%2), %%mm1    \n\t"\
2936                                                 "psllw $6, %%mm0        \n\t"\
2937                                                 "psllw $6, %%mm5        \n\t"\
2938                                                 "pmulhw %%mm3, %%mm0    \n\t"\
2939                                                 "movq (%0,%2), %%mm6    \n\t"\
2940                                                 "pmulhw %%mm3, %%mm5    \n\t"\
2941                                                 "punpcklbw %%mm4, %%mm1 \n\t"\
2942                                                 "punpckhbw %%mm4, %%mm6 \n\t"\
2943                                                 "psubw %%mm2, %%mm1     \n\t"\
2944                                                 "psubw %%mm2, %%mm6     \n\t"\
2945                                                 "psllw $6, %%mm1        \n\t"\
2946                                                 "psllw $6, %%mm6        \n\t"\
2947                                                 "pmulhw %%mm3, %%mm1    \n\t"\
2948                                                 "pmulhw %%mm3, %%mm6    \n\t"\
2949                                                 "addl %%eax, %0         \n\t"\
2950                                                 "packuswb %%mm5, %%mm0  \n\t"\
2951                                                 "packuswb %%mm6, %%mm1  \n\t"\
2952                                                 "movq %%mm0, (%1)       \n\t"\
2953                                                 "movq %%mm1, (%1, %3)   \n\t"\
2954
2955 SCALED_CPY
2956                                                 "addl %%ebx, %1         \n\t"
2957 SCALED_CPY
2958                                                 "addl %%ebx, %1         \n\t"
2959 SCALED_CPY
2960                                                 "addl %%ebx, %1         \n\t"
2961 SCALED_CPY
2962
2963                                                 : "+r"(src),
2964                                                 "+r"(dst)
2965                                                 :"r" (srcStride),
2966                                                 "r" (dstStride)
2967                                                 : "%eax", "%ebx"
2968                                         );
2969 #else
                                /* C fallback: plain copy of numLines lines, the levels are NOT adjusted here */
2970                                 for(i=0; i<numLines; i++)
2971                                         memcpy( &(dst[dstStride*i]),
2972                                                 &(src[srcStride*i]), BLOCK_SIZE);
2973 #endif
2974         }
2975         else
2976         {
2977 #ifdef HAVE_MMX
                                        /* Plain copy. The loop counter numLines>>2 is kept in the global temp0 and
                                           every iteration copies 4 lines (2 x SIMPLE_CPY), hence the requirement
                                           that numLines is a multiple of 4.
                                           src and dst are input-only operands whose registers are modified by the
                                           loop, so they are saved with pushl and restored with popl.
                                           packedYOffset / packedYScale are loaded into mm2 / mm3 but SIMPLE_CPY
                                           does not use them. */
2978                                         asm volatile(
2979                                                 "movl %4, %%eax \n\t"
2980                                                 "movl %%eax, temp0\n\t"
2981                                                 "pushl %0 \n\t"
2982                                                 "pushl %1 \n\t"
2983                                                 "leal (%2,%2), %%eax    \n\t"
2984                                                 "leal (%3,%3), %%ebx    \n\t"
2985                                                 "movq packedYOffset, %%mm2      \n\t"
2986                                                 "movq packedYScale, %%mm3       \n\t"
2987
2988 #define SIMPLE_CPY                                      \
2989                                                 "movq (%0), %%mm0       \n\t"\
2990                                                 "movq (%0,%2), %%mm1    \n\t"\
2991                                                 "movq %%mm0, (%1)       \n\t"\
2992                                                 "movq %%mm1, (%1, %3)   \n\t"\
2993
2994                                                 "1:                     \n\t"
2995 SIMPLE_CPY
2996                                                 "addl %%eax, %0         \n\t"
2997                                                 "addl %%ebx, %1         \n\t"
2998 SIMPLE_CPY
2999                                                 "addl %%eax, %0         \n\t"
3000                                                 "addl %%ebx, %1         \n\t"
3001                                                 "decl temp0             \n\t"
3002                                                 "jnz 1b                 \n\t"
3003
3004                                                 "popl %1 \n\t"
3005                                                 "popl %0 \n\t"
3006                                                 : : "r" (src),
3007                                                 "r" (dst),
3008                                                 "r" (srcStride),
3009                                                 "r" (dstStride),
3010                                                 "m" (numLines>>2)
3011                                                 : "%eax", "%ebx"
3012                                         );
3013 #else
                                /* C fallback: copy numLines lines of BLOCK_SIZE bytes each */
3014                                 for(i=0; i<numLines; i++)
3015                                         memcpy( &(dst[dstStride*i]),
3016                                                 &(src[srcStride*i]), BLOCK_SIZE);
3017 #endif
3018         }
3019 }
3020
3021
3022 /**
3023  * Filters array of bytes (Y or U or V values)
3024  */
3025 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3026         QP_STORE_T QPs[], int QPStride, int isColor, int mode)
3027 {
3028         int x,y;
3029         /* we need 64bit here otherwise we´ll going to have a problem
3030            after watching a black picture for 5 hours*/
3031         static uint64_t *yHistogram= NULL;
3032         int black=0, white=255; // blackest black and whitest white in the picture
3033         int QPCorrecture= 256;
3034
3035         /* Temporary buffers for handling the last row(s) */
3036         static uint8_t *tempDst= NULL;
3037         static uint8_t *tempSrc= NULL;
3038
3039         /* Temporary buffers for handling the last block */
3040         static uint8_t *tempDstBlock= NULL;
3041         static uint8_t *tempSrcBlock= NULL;
3042
3043 #ifdef PP_FUNNY_STRIDE
3044         uint8_t *dstBlockPtrBackup;
3045         uint8_t *srcBlockPtrBackup;
3046 #endif
3047
3048 #ifdef MORE_TIMING
3049         long long T0, T1, diffTime=0;
3050 #endif
3051 #ifdef TIMING
3052         long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3053         sumTime= rdtsc();
3054 #endif
3055 //mode= 0x7F;
3056
3057         if(tempDst==NULL)
3058         {
3059                 tempDst= (uint8_t*)memalign(8, 1024*24);
3060                 tempSrc= (uint8_t*)memalign(8, 1024*24);
3061                 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3062                 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3063         }
3064
3065         if(!yHistogram)
3066         {
3067                 int i;
3068                 yHistogram= (uint64_t*)malloc(8*256);
3069                 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3070
3071                 if(mode & FULL_Y_RANGE)
3072                 {
3073                         maxAllowedY=255;
3074                         minAllowedY=0;
3075                 }
3076         }
3077
3078         if(!isColor)
3079         {
3080                 uint64_t sum= 0;
3081                 int i;
3082                 static int framenum= -1;
3083                 uint64_t maxClipped;
3084                 uint64_t clipped;
3085                 double scale;
3086
3087                 framenum++;
3088                 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3089
3090                 for(i=0; i<256; i++)
3091                 {
3092                         sum+= yHistogram[i];
3093 //                      printf("%d ", yHistogram[i]);
3094                 }
3095 //              printf("\n\n");
3096
3097                 /* we allways get a completly black picture first */
3098                 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3099
3100                 clipped= sum;
3101                 for(black=255; black>0; black--)
3102                 {
3103                         if(clipped < maxClipped) break;
3104                         clipped-= yHistogram[black];
3105                 }
3106
3107                 clipped= sum;
3108                 for(white=0; white<256; white++)
3109                 {
3110                         if(clipped < maxClipped) break;
3111                         clipped-= yHistogram[white];
3112                 }
3113
3114                 packedYOffset= (black - minAllowedY) & 0xFFFF;
3115                 packedYOffset|= packedYOffset<<32;
3116                 packedYOffset|= packedYOffset<<16;
3117
3118                 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3119
3120                 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3121                 packedYScale|= packedYScale<<32;
3122                 packedYScale|= packedYScale<<16;
3123         }
3124         else
3125         {
3126                 packedYScale= 0x0100010001000100LL;
3127                 packedYOffset= 0;
3128         }
3129
3130         if(mode & LEVEL_FIX)    QPCorrecture= packedYScale &0xFFFF;
3131         else                    QPCorrecture= 256;
3132
3133         /* copy & deinterlace first row of blocks */
3134         y=-BLOCK_SIZE;
3135         {
3136                 //1% speedup if these are here instead of the inner loop
3137                 uint8_t *srcBlock= &(src[y*srcStride]);
3138                 uint8_t *dstBlock= &(dst[y*dstStride]);
3139
3140                 dstBlock= tempDst + dstStride;
3141
3142                 // From this point on it is guranteed that we can read and write 16 lines downward
3143                 // finish 1 block before the next otherwise we´ll might have a problem
3144                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3145                 for(x=0; x<width; x+=BLOCK_SIZE)
3146                 {
3147
3148 #ifdef HAVE_MMX2
3149 /*
3150                         prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3151                         prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3152                         prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3153                         prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3154 */
3155 /*
3156                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3157                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3158                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3159                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3160 */
3161
3162                         asm(
3163                                 "movl %4, %%eax                 \n\t"
3164                                 "shrl $2, %%eax                 \n\t"
3165                                 "andl $6, %%eax                 \n\t"
3166                                 "addl $8, %%eax                 \n\t"
3167                                 "movl %%eax, %%ebx              \n\t"
3168                                 "imul %1, %%eax                 \n\t"
3169                                 "imul %3, %%ebx                 \n\t"
3170                                 "prefetchnta 32(%%eax, %0)      \n\t"
3171                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3172                                 "addl %1, %%eax                 \n\t"
3173                                 "addl %3, %%ebx                 \n\t"
3174                                 "prefetchnta 32(%%eax, %0)      \n\t"
3175                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3176                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3177                         "m" (x)
3178                         : "%eax", "%ebx"
3179                         );
3180
3181 #elif defined(HAVE_3DNOW)
3182 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3183 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3184                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3185                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3186                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3187 */
3188 #endif
3189
3190                         blockCopy(dstBlock + dstStride*8, dstStride,
3191                                 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3192
3193                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3194                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3195                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3196                                 deInterlaceBlendLinear(dstBlock, dstStride);
3197                         else if(mode & MEDIAN_DEINT_FILTER)
3198                                 deInterlaceMedian(dstBlock, dstStride);
3199                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3200                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3201 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3202                                 deInterlaceBlendCubic(dstBlock, dstStride);
3203 */
3204                         dstBlock+=8;
3205                         srcBlock+=8;
3206                 }
3207                 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride );
3208         }
3209
3210         for(y=0; y<height; y+=BLOCK_SIZE)
3211         {
3212                 //1% speedup if these are here instead of the inner loop
3213                 uint8_t *srcBlock= &(src[y*srcStride]);
3214                 uint8_t *dstBlock= &(dst[y*dstStride]);
3215 #ifdef ARCH_X86
3216                 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3217                 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3218                 int QPFrac= QPDelta;
3219                 uint8_t *tempBlock1= tempBlocks;
3220                 uint8_t *tempBlock2= tempBlocks + 8;
3221 #endif
3222                 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3223                    if not than use a temporary buffer */
3224                 if(y+15 >= height)
3225                 {
3226                         /* copy from line 8 to 15 of src, these will be copied with
3227                            blockcopy to dst later */
3228                         memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8,
3229                                 srcStride*MAX(height-y-8, 0) );
3230
3231                         /* duplicate last line to fill the void upto line 15 */
3232                         if(y+15 >= height)
3233                         {
3234                                 int i;
3235                                 for(i=height-y; i<=15; i++)
3236                                         memcpy(tempSrc + srcStride*i,
3237                                                 src + srcStride*(height-1), srcStride);
3238                         }
3239
3240                         /* copy up to 9 lines of dst */
3241                         memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) );
3242                         dstBlock= tempDst + dstStride;
3243                         srcBlock= tempSrc;
3244                 }
3245
3246                 // From this point on it is guranteed that we can read and write 16 lines downward
3247                 // finish 1 block before the next otherwise we´ll might have a problem
3248                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3249                 for(x=0; x<width; x+=BLOCK_SIZE)
3250                 {
3251                         const int stride= dstStride;
3252                         uint8_t *tmpXchg;
3253 #ifdef ARCH_X86
3254                         int QP= *QPptr;
3255                         asm volatile(
3256                                 "addl %2, %1            \n\t"
3257                                 "sbbl %%eax, %%eax      \n\t"
3258                                 "shll $2, %%eax         \n\t"
3259                                 "subl %%eax, %0         \n\t"
3260                                 : "+r" (QPptr), "+m" (QPFrac)
3261                                 : "r" (QPDelta)
3262                                 : "%eax"
3263                         );
3264 #else
3265                         int QP= isColor ?
3266                                 QPs[(y>>3)*QPStride + (x>>3)]:
3267                                 QPs[(y>>4)*QPStride + (x>>4)];
3268 #endif
3269                         if(!isColor)
3270                         {
3271                                 QP= (QP* QPCorrecture)>>8;
3272                                 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3273                         }
3274 #ifdef HAVE_MMX
3275                         asm volatile(
3276                                 "movd %0, %%mm7                                 \n\t"
3277                                 "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3278                                 "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3279                                 "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
3280                                 "movq %%mm7, pQPb                               \n\t"
3281                                 : : "r" (QP)
3282                         );
3283 #endif
3284
3285 #ifdef MORE_TIMING
3286                         T0= rdtsc();
3287 #endif
3288
3289 #ifdef HAVE_MMX2
3290 /*
3291                         prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3292                         prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3293                         prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3294                         prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3295 */
3296 /*
3297                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3298                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3299                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3300                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3301 */
3302
3303                         asm(
3304                                 "movl %4, %%eax                 \n\t"
3305                                 "shrl $2, %%eax                 \n\t"
3306                                 "andl $6, %%eax                 \n\t"
3307                                 "addl $8, %%eax                 \n\t"
3308                                 "movl %%eax, %%ebx              \n\t"
3309                                 "imul %1, %%eax                 \n\t"
3310                                 "imul %3, %%ebx                 \n\t"
3311                                 "prefetchnta 32(%%eax, %0)      \n\t"
3312                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3313                                 "addl %1, %%eax                 \n\t"
3314                                 "addl %3, %%ebx                 \n\t"
3315                                 "prefetchnta 32(%%eax, %0)      \n\t"
3316                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3317                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3318                         "m" (x)
3319                         : "%eax", "%ebx"
3320                         );
3321
3322 #elif defined(HAVE_3DNOW)
3323 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3324 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3325                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3326                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3327                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3328 */
3329 #endif
3330
3331 #ifdef PP_FUNNY_STRIDE
3332                         //can we mess with a 8x16 block, if not use a temp buffer, yes again
3333                         if(x+7 >= width)
3334                         {
3335                                 int i;
3336                                 dstBlockPtrBackup= dstBlock;
3337                                 srcBlockPtrBackup= srcBlock;
3338
3339                                 for(i=0;i<BLOCK_SIZE*2; i++)
3340                                 {
3341                                         memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3342                                         memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3343                                 }
3344
3345                                 dstBlock= tempDstBlock;
3346                                 srcBlock= tempSrcBlock;
3347                         }
3348 #endif
3349
3350                         blockCopy(dstBlock + dstStride*8, dstStride,
3351                                 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3352
3353                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3354                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3355                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3356                                 deInterlaceBlendLinear(dstBlock, dstStride);
3357                         else if(mode & MEDIAN_DEINT_FILTER)
3358                                 deInterlaceMedian(dstBlock, dstStride);
3359                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3360                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3361 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3362                                 deInterlaceBlendCubic(dstBlock, dstStride);
3363 */
3364
3365                         /* only deblock if we have 2 blocks */
3366                         if(y + 8 < height)
3367                         {
3368 #ifdef MORE_TIMING
3369                                 T1= rdtsc();
3370                                 memcpyTime+= T1-T0;
3371                                 T0=T1;
3372 #endif
3373                                 if(mode & V_RK1_FILTER)
3374                                         vertRK1Filter(dstBlock, stride, QP);
3375                                 else if(mode & V_X1_FILTER)
3376                                         vertX1Filter(dstBlock, stride, QP);
3377                                 else if(mode & V_DEBLOCK)
3378                                 {
3379                                         if( isVertDC(dstBlock, stride))
3380                                         {
3381                                                 if(isVertMinMaxOk(dstBlock, stride, QP))
3382                                                         doVertLowPass(dstBlock, stride, QP);
3383                                         }
3384                                         else
3385                                                 doVertDefFilter(dstBlock, stride, QP);
3386                                 }
3387 #ifdef MORE_TIMING
3388                                 T1= rdtsc();
3389                                 vertTime+= T1-T0;
3390                                 T0=T1;
3391 #endif
3392                         }
3393
3394 #ifdef HAVE_MMX
3395                         transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3396 #endif
3397                         /* check if we have a previous block to deblock it with dstBlock */
3398                         if(x - 8 >= 0)
3399                         {
3400 #ifdef MORE_TIMING
3401                                 T0= rdtsc();
3402 #endif
3403 #ifdef HAVE_MMX
3404                                 if(mode & H_RK1_FILTER)
3405                                         vertRK1Filter(tempBlock1, 16, QP);
3406                                 else if(mode & H_X1_FILTER)
3407                                         vertX1Filter(tempBlock1, 16, QP);
3408                                 else if(mode & H_DEBLOCK)
3409                                 {
3410                                         if( isVertDC(tempBlock1, 16))
3411                                         {
3412                                                 if(isVertMinMaxOk(tempBlock1, 16, QP))
3413                                                         doVertLowPass(tempBlock1, 16, QP);
3414                                         }
3415                                         else
3416                                                 doVertDefFilter(tempBlock1, 16, QP);
3417                                 }
3418
3419                                 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3420
3421 #else
3422                                 if(mode & H_X1_FILTER)
3423                                         horizX1Filter(dstBlock-4, stride, QP);
3424                                 else if(mode & H_DEBLOCK)
3425                                 {
3426                                         if( isHorizDC(dstBlock-4, stride))
3427                                         {
3428                                                 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3429                                                         doHorizLowPass(dstBlock-4, stride, QP);
3430                                         }
3431                                         else
3432                                                 doHorizDefFilter(dstBlock-4, stride, QP);
3433                                 }
3434 #endif
3435 #ifdef MORE_TIMING
3436                                 T1= rdtsc();
3437                                 horizTime+= T1-T0;
3438                                 T0=T1;
3439 #endif
3440                                 if(mode & DERING)
3441                                 {
3442                                 //FIXME filter first line
3443                                         if(y>0) dering(dstBlock - stride - 8, stride, QP);
3444                                 }
3445                         }
3446                         else if(mode & DERING)
3447                         {
3448                          //FIXME y+15 is required cuz of the tempBuffer thing -> bottom right block isnt filtered
3449                                         if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3450                         }
3451
3452
3453 #ifdef PP_FUNNY_STRIDE
3454                         /* did we use a tmp-block buffer */
3455                         if(x+7 >= width)
3456                         {
3457                                 int i;
3458                                 dstBlock= dstBlockPtrBackup;
3459                                 srcBlock= srcBlockPtrBackup;
3460
3461                                 for(i=0;i<BLOCK_SIZE*2; i++)
3462                                 {
3463                                         memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3464                                 }
3465                         }
3466 #endif
3467
3468                         dstBlock+=8;
3469                         srcBlock+=8;
3470
3471 #ifdef HAVE_MMX
3472                         tmpXchg= tempBlock1;
3473                         tempBlock1= tempBlock2;
3474                         tempBlock2 = tmpXchg;
3475 #endif
3476                 }
3477
3478                 /* did we use a tmp buffer for the last lines*/
3479                 if(y+15 >= height)
3480                 {
3481                         uint8_t *dstBlock= &(dst[y*dstStride]);
3482                         memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3483                 }
3484         }
3485 #ifdef HAVE_3DNOW
3486         asm volatile("femms");
3487 #elif defined (HAVE_MMX)
3488         asm volatile("emms");
3489 #endif
3490
3491 #ifdef TIMING
3492         // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3493         sumTime= rdtsc() - sumTime;
3494         if(!isColor)
3495                 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3496                         (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3497                         (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3498                         , black, white);
3499 #endif
3500 }