]> git.sesse.net Git - ffmpeg/blob - postproc/postprocess.c
fixed yv12toyuy2
[ffmpeg] / postproc / postprocess.c
1 /*
2     Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20                         C       MMX     MMX2    3DNow
21 isVertDC                Ec      Ec
22 isVertMinMaxOk          Ec      Ec
23 doVertLowPass           E               e       e
24 doVertDefFilter         Ec      Ec      Ec
25 isHorizDC               Ec      Ec
26 isHorizMinMaxOk         a       E
27 doHorizLowPass          E               e       e
28 doHorizDefFilter        Ec      Ec      Ec
29 deRing                  E               e       e*
30 Vertical RKAlgo1        E               a       a
31 Horizontal RKAlgo1                      a       a
32 Vertical X1             a               E       E
33 Horizontal X1           a               E       E
34 LinIpolDeinterlace      e               E       E*
35 CubicIpolDeinterlace    a               e       e*
36 LinBlendDeinterlace     e               E       E*
37 MedianDeinterlace               Ec      Ec
38
39
40 * I don't have a 3DNow CPU -> it's untested
41 E = Exact implementation
42 e = almost exact implementation (slightly different rounding,...)
43 a = alternative / approximate impl
44 c = checked against the other implementations (-vo md5)
45 */
46
47 /*
48 TODO:
49 verify that everything works as it should (how?)
50 reduce the time wasted on the mem transfer
51 implement dering
52 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP
56 write a faster and higher quality deblocking filter :)
57 do something about the speed of the horizontal filters
58 make the mainloop more flexible (variable number of blocks at once
59         (the if/else stuff per block is slowing things down)
60 compare the quality & speed of all filters
61 split this huge file
62 fix warnings (unused vars, ...)
63 noise reduction filters
64 border remover
65 optimize c versions
66 ...
67
68 Notes:
69 */
70
71 //Changelog: use the CVS log
72
73 #include <inttypes.h>
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <string.h>
77 #include "../config.h"
78 #ifdef HAVE_MALLOC_H
79 #include <malloc.h>
80 #endif
81 //#undef HAVE_MMX2
82 //#define HAVE_3DNOW
83 //#undef HAVE_MMX
84 #include "postprocess.h"
85
/* Small arithmetic helpers. They are plain macros, so an argument can be
   evaluated more than once: do not pass expressions with side effects. */
86 #define MIN(a,b) ((a) > (b) ? (b) : (a))
87 #define MAX(a,b) ((a) < (b) ? (b) : (a))
88 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
/* Note: SIGN(0) evaluates to -1, not 0. */
89 #define SIGN(a) ((a) > 0 ? 1 : -1)
90
/* PAVGB(a,b): expands to the asm text of a packed byte average of a into b,
   "pavgb" when MMX2 is available, "pavgusb" when only 3DNow is available.
   The macro is left undefined when neither is available. */
91 #ifdef HAVE_MMX2
92 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
93 #elif defined (HAVE_3DNOW)
94 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
95 #endif
96
/* PMINUB(x,y,t): expands to asm text computing the unsigned per-byte minimum
   of x and y; in both variants the result is left in the second argument.
   MMX2: a single "pminub" (t is unused).
   Plain MMX: emulated as t = y; t = saturate(t - x); y = y - t, with t as a
   scratch register. Note that this variant names its first two parameters
   (b,a), i.e. in swapped order relative to the MMX2 variant. */
97 #ifdef HAVE_MMX2
98 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
99 #elif defined (HAVE_MMX)
100 #define PMINUB(b,a,t) \
101         "movq " #a ", " #t " \n\t"\
102         "psubusb " #b ", " #t " \n\t"\
103         "psubb " #t ", " #a " \n\t"
104 #endif
105
/* PMAXUB(a,b): expands to asm text computing the unsigned per-byte maximum of
   a and b, result in b.
   MMX2: a single "pmaxub".
   Plain MMX: emulated as b = saturate(b - a); b = b + a. */
106 #ifdef HAVE_MMX2
107 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
108 #elif defined (HAVE_MMX)
109 #define PMAXUB(a,b) \
110         "psubusb " #a ", " #b " \n\t"\
111         "paddb " #a ", " #b " \n\t"
112 #endif
113
114
/* Size limits; the code using them is outside this excerpt.
   NOTE(review): presumably the working-buffer size and the maximum number of
   per-filter options used when parsing a mode string -- confirm at the users. */
115 #define GET_MODE_BUFFER_SIZE 500
116 #define OPTIONS_ARRAY_SIZE 10
117
118
/*
 * 64-bit packed constants and scratch variables.
 * The MMX inline assembly in this file refers to them directly by symbol name
 * (e.g. "movq b7E, %%mm7"), so they must keep their names.
 */
/* NOTE(review): users of the next two are outside this excerpt; presumably the
   luma offset/scale applied by the level-fix code -- confirm. */
119 static uint64_t packedYOffset=  0x0000000000000000LL;
120 static uint64_t packedYScale=   0x0100010001000100LL;
/* wNN: the 16-bit hex value NN replicated in all four words */
121 static uint64_t w05=            0x0005000500050005LL;
122 static uint64_t w20=            0x0020002000200020LL;
123 static uint64_t w1400=          0x1400140014001400LL;
/* bmXXXXXXXX: byte masks; each digit of the name stands for one byte, most
   significant byte first (1 -> 0xFF, 0 -> 0x00) */
124 static uint64_t bm00000001=     0x00000000000000FFLL;
125 static uint64_t bm00010000=     0x000000FF00000000LL;
126 static uint64_t bm00001000=     0x00000000FF000000LL;
127 static uint64_t bm10000000=     0xFF00000000000000LL;
128 static uint64_t bm10000001=     0xFF000000000000FFLL;
129 static uint64_t bm11000011=     0xFFFF00000000FFFFLL;
130 static uint64_t bm00000011=     0x000000000000FFFFLL;
131 static uint64_t bm11111110=     0xFFFFFFFFFFFFFF00LL;
132 static uint64_t bm11000000=     0xFFFF000000000000LL;
133 static uint64_t bm00011000=     0x000000FFFF000000LL;
134 static uint64_t bm00110011=     0x0000FFFF0000FFFFLL;
135 static uint64_t bm11001100=     0xFFFF0000FFFF0000LL;
/* bNN: the byte 0xNN replicated in all eight bytes */
136 static uint64_t b00=            0x0000000000000000LL;
137 static uint64_t b01=            0x0101010101010101LL;
138 static uint64_t b02=            0x0202020202020202LL;
139 static uint64_t b0F=            0x0F0F0F0F0F0F0F0FLL;
140 static uint64_t b04=            0x0404040404040404LL;
141 static uint64_t b08=            0x0808080808080808LL;
142 static uint64_t bFF=            0xFFFFFFFFFFFFFFFFLL;
143 static uint64_t b20=            0x2020202020202020LL;
144 static uint64_t b80=            0x8080808080808080LL;
145 static uint64_t b7E=            0x7E7E7E7E7E7E7E7ELL;
146 static uint64_t b7C=            0x7C7C7C7C7C7C7C7CLL;
147 static uint64_t b3F=            0x3F3F3F3F3F3F3F3FLL;
/* scratch storage -- NOTE(review): presumably spill space for the asm code;
   the users are outside this excerpt */
148 static uint64_t temp0=0;
149 static uint64_t temp1=0;
150 static uint64_t temp2=0;
151 static uint64_t temp3=0;
152 static uint64_t temp4=0;
153 static uint64_t temp5=0;
/* pQPb is read by the asm filters as "QP,..., QP" (the quantizer in every
   byte); it is written outside this excerpt.
   NOTE(review): the meaning of pQPb2 is not visible here. */
154 static uint64_t pQPb=0;
155 static uint64_t pQPb2=0;
156 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
157
/* Flatness thresholds: isVertDC() reports a block as flat when its count of
   nearly-equal vertical neighbour pairs exceeds vFlatnessThreshold.
   NOTE(review): hFlatnessThreshold is presumably the horizontal counterpart;
   its user is outside this excerpt. */
158 int hFlatnessThreshold= 56 - 16;
159 int vFlatnessThreshold= 56 - 16;
160
161 //amount of "black" you are willing to lose to get a brightness-corrected picture
162 double maxClippedThreshold= 0.01;
163
/* NOTE(review): presumably the luma range the level correction maps into;
   the users are outside this excerpt -- confirm. */
164 int maxAllowedY=234;
165 int minAllowedY=16;
166
167 static struct PPFilter filters[]=
168 {
169         {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
170         {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
171         {"vr", "rkvdeblock",            1, 2, 4, H_RK1_FILTER},
172         {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
173         {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
174         {"dr", "dering",                1, 5, 6, DERING},
175         {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
176         {"lb", "linblenddeint",         0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
177         {"li", "linipoldeint",          0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
178         {"ci", "cubicipoldeint",        0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
179         {"md", "mediandeint",           0, 1, 6, MEDIAN_DEINT_FILTER},
180         {NULL, NULL,0,0,0,0} //End Marker
181 };
182
/* Filter chain shared by the "default" and "de" entries. */
#define PP_CHAIN_DEFAULT "hdeblock:a,vdeblock:a,dering:a,autolevels"
/* Filter chain shared by the "fast" and "fa" entries. */
#define PP_CHAIN_FAST    "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels"

/**
 * Flat list of string pairs, terminated by a single NULL.
 * Even indices hold a name, the following odd index holds the filter list
 * belonging to it.
 * NOTE(review): presumably the name is replaced by the list when a mode
 * string is parsed; the consumer is outside this excerpt.
 */
static char *replaceTable[]=
{
        /* name */      /* filter list */
        "default",      PP_CHAIN_DEFAULT,
        "de",           PP_CHAIN_DEFAULT,
        "fast",         PP_CHAIN_FAST,
        "fa",           PP_CHAIN_FAST,
        NULL //End Marker
};
191
192 static inline void unusedVariableWarningFixer()
193 {
194 if(
195  packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
196  + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
197  + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
198  + bFF + b20 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
199  + temp5 + pQPb== 0) b00=0;
200 }
201
202 #ifdef TIMING
/**
 * Read the CPU time stamp counter with the RDTSC instruction.
 *
 * RDTSC delivers its 64-bit result split over EDX (high half) and EAX (low
 * half). The halves are fetched through separate "=a" / "=d" outputs and
 * combined in C: the "=A" constraint used before only denotes the EDX:EAX
 * pair on 32-bit x86; on x86-64 the compiler may pick a single register for
 * it and the other half of the counter is lost.
 * __asm__ is used instead of asm so the function also builds in strict ISO C
 * modes, where the plain asm keyword is not available.
 *
 * @return the current counter value
 */
static inline long long rdtsc(void)
{
        uint32_t lo, hi;
        __asm__ __volatile__(   "rdtsc\n\t"
                : "=a" (lo), "=d" (hi)
        );
        return (long long)(((uint64_t)hi << 32) | lo);
}
212 #endif
213
214 #ifdef HAVE_MMX2
/**
 * Issue a PREFETCHNTA hint for the memory at p.
 * This is only a hint to the CPU: from the program's point of view the memory
 * is neither read nor written, hence the const pointer.
 * __asm__ is used instead of asm so this also builds in strict ISO C modes.
 * @param p address to prefetch
 */
static inline void prefetchnta(const void *p)
{
        __asm__ __volatile__(   "prefetchnta (%0)\n\t"
                : : "r" (p)
        );
}
221
/**
 * Issue a PREFETCHT0 hint for the memory at p.
 * This is only a hint to the CPU: from the program's point of view the memory
 * is neither read nor written, hence the const pointer.
 * __asm__ is used instead of asm so this also builds in strict ISO C modes.
 * @param p address to prefetch
 */
static inline void prefetcht0(const void *p)
{
        __asm__ __volatile__(   "prefetcht0 (%0)\n\t"
                : : "r" (p)
        );
}
228
/**
 * Issue a PREFETCHT1 hint for the memory at p.
 * This is only a hint to the CPU: from the program's point of view the memory
 * is neither read nor written, hence the const pointer.
 * __asm__ is used instead of asm so this also builds in strict ISO C modes.
 * @param p address to prefetch
 */
static inline void prefetcht1(const void *p)
{
        __asm__ __volatile__(   "prefetcht1 (%0)\n\t"
                : : "r" (p)
        );
}
235
/**
 * Issue a PREFETCHT2 hint for the memory at p.
 * This is only a hint to the CPU: from the program's point of view the memory
 * is neither read nor written, hence the const pointer.
 * __asm__ is used instead of asm so this also builds in strict ISO C modes.
 * @param p address to prefetch
 */
static inline void prefetcht2(const void *p)
{
        __asm__ __volatile__(   "prefetcht2 (%0)\n\t"
                : : "r" (p)
        );
}
242 #endif
243
244 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
245 /**
246  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * Looks at the block starting 4 lines below src and counts the pixels whose
 * difference to the pixel directly below them is -1, 0 or +1. The MMX path
 * compares 7 pairs of lines, the C path BLOCK_SIZE-1 pairs, 8 pixels each.
 * The MMX path works on wrapped bytes, so a 255 next to a 0 also counts as a
 * difference of 1.
 *
 * @param src    top-left pixel of the 8x16 block
 * @param stride offset in bytes from one line to the next
 * @return 1 if the count is larger than vFlatnessThreshold, otherwise 0
247  */
248 static inline int isVertDC(uint8_t src[], int stride){
249         int numEq= 0;
250 #ifndef HAVE_MMX
251         int y;
252 #endif
253         src+= stride*4; // src points to begin of the 8x8 Block
254 #ifdef HAVE_MMX
255 asm volatile(
256                 "leal (%1, %2), %%eax                           \n\t"
257                 "leal (%%eax, %2, 4), %%ebx                     \n\t"
258 //      0       1       2       3       4       5       6       7       8       9
259 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ebx     ebx+%2  ebx+2%2 %1+8%2  ebx+4%2
// Per line pair: (upper - lower) + 0x7E, then a signed compare against 0x7C.
// Only the differences -1, 0, +1 give 0x7D..0x7F and pass, producing 0xFF (-1)
// in that byte; the per-pair results are accumulated in mm0.
260                 "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7E in every byte
261                 "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7C in every byte
262                 "movq (%1), %%mm0                               \n\t"
263                 "movq (%%eax), %%mm1                            \n\t"
264                 "psubb %%mm1, %%mm0                             \n\t" // mm0 = difference
265                 "paddb %%mm7, %%mm0                             \n\t"
266                 "pcmpgtb %%mm6, %%mm0                           \n\t"
267
268                 "movq (%%eax,%2), %%mm2                         \n\t"
269                 "psubb %%mm2, %%mm1                             \n\t"
270                 "paddb %%mm7, %%mm1                             \n\t"
271                 "pcmpgtb %%mm6, %%mm1                           \n\t"
272                 "paddb %%mm1, %%mm0                             \n\t"
273
274                 "movq (%%eax, %2, 2), %%mm1                     \n\t"
275                 "psubb %%mm1, %%mm2                             \n\t"
276                 "paddb %%mm7, %%mm2                             \n\t"
277                 "pcmpgtb %%mm6, %%mm2                           \n\t"
278                 "paddb %%mm2, %%mm0                             \n\t"
279
280                 "movq (%1, %2, 4), %%mm2                        \n\t"
281                 "psubb %%mm2, %%mm1                             \n\t"
282                 "paddb %%mm7, %%mm1                             \n\t"
283                 "pcmpgtb %%mm6, %%mm1                           \n\t"
284                 "paddb %%mm1, %%mm0                             \n\t"
285
286                 "movq (%%ebx), %%mm1                            \n\t"
287                 "psubb %%mm1, %%mm2                             \n\t"
288                 "paddb %%mm7, %%mm2                             \n\t"
289                 "pcmpgtb %%mm6, %%mm2                           \n\t"
290                 "paddb %%mm2, %%mm0                             \n\t"
291
292                 "movq (%%ebx, %2), %%mm2                        \n\t"
293                 "psubb %%mm2, %%mm1                             \n\t"
294                 "paddb %%mm7, %%mm1                             \n\t"
295                 "pcmpgtb %%mm6, %%mm1                           \n\t"
296                 "paddb %%mm1, %%mm0                             \n\t"
297
298                 "movq (%%ebx, %2, 2), %%mm1                     \n\t"
299                 "psubb %%mm1, %%mm2                             \n\t"
300                 "paddb %%mm7, %%mm2                             \n\t"
301                 "pcmpgtb %%mm6, %%mm2                           \n\t"
302                 "paddb %%mm2, %%mm0                             \n\t"
303
// Horizontal add: fold the 8 per-column byte counters of mm0 into its lowest byte.
304                 "                                               \n\t"
305                 "movq %%mm0, %%mm1                              \n\t"
306                 "psrlw $8, %%mm0                                \n\t"
307                 "paddb %%mm1, %%mm0                             \n\t"
308 #ifdef HAVE_MMX2
309                 "pshufw $0xF9, %%mm0, %%mm1                     \n\t"
310                 "paddb %%mm1, %%mm0                             \n\t"
311                 "pshufw $0xFE, %%mm0, %%mm1                     \n\t"
312 #else
313                 "movq %%mm0, %%mm1                              \n\t"
314                 "psrlq $16, %%mm0                               \n\t"
315                 "paddb %%mm1, %%mm0                             \n\t"
316                 "movq %%mm0, %%mm1                              \n\t"
317                 "psrlq $32, %%mm0                               \n\t"
318 #endif
319                 "paddb %%mm1, %%mm0                             \n\t"
320                 "movd %%mm0, %0                                 \n\t"
321                 : "=r" (numEq)
322                 : "r" (src), "r" (stride)
323                 : "%eax", "%ebx"
324                 );
325
// Every match added -1 to the byte counter, so negate it (mod 256) to get the count.
326         numEq= (256 - numEq) &0xFF;
327
328 #else
329         for(y=0; y<BLOCK_SIZE-1; y++)
330         {
// (difference + 1) is 0, 1 or 2 exactly when the difference is -1, 0 or +1
331                 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
332                 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
333                 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
334                 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
335                 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
336                 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
337                 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
338                 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
339                 src+= stride;
340         }
341 #endif
342 /*      if(abs(numEq - asmEq) > 0)
343         {
344                 printf("\nasm:%d  c:%d\n", asmEq, numEq);
345                 for(int y=0; y<8; y++)
346                 {
347                         for(int x=0; x<8; x++)
348                         {
349                                 printf("%d ", temp[x + y*stride]);
350                         }
351                         printf("\n");
352                 }
353         }
354 */
355 //      for(int i=0; i<numEq/8; i++) src[i]=255;
356         return (numEq > vFlatnessThreshold) ? 1 : 0;
357 }
358
/**
 * Check that, in each of the 8 columns, the pixel 4 lines below src and the
 * pixel 11 lines below src differ by at most 2*QP.
 *
 * The MMX path does not use the QP argument; it reads the quantizer from the
 * global pQPb instead, which must have been set up by the caller.
 *
 * @param src    top-left pixel of the block
 * @param stride offset in bytes from one line to the next
 * @param QP     quantizer (used by the C path only)
 * @return non-zero if all 8 columns pass, 0 otherwise; the C path returns 1
 *         for "ok", the MMX path returns an all-ones word (-1)
 */
359 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
360 {
361 #ifdef HAVE_MMX
362         int isOk;
363         src+= stride*3;
364         asm volatile(
365 //              "int $3 \n\t"
366                 "movq (%1, %2), %%mm0                           \n\t"
367                 "movq (%1, %2, 8), %%mm1                        \n\t"
368                 "movq %%mm0, %%mm2                              \n\t"
369                 "psubusb %%mm1, %%mm0                           \n\t"
370                 "psubusb %%mm2, %%mm1                           \n\t"
371                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
372
373                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
374                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
375                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
// Each 32-bit half becomes all ones if its 4 bytes are all zero; the shift and
// second compare then leave all ones in the low half only if both halves passed.
376                 "pcmpeqd b00, %%mm0                             \n\t"
377                 "psrlq $16, %%mm0                               \n\t"
378                 "pcmpeqd bFF, %%mm0                             \n\t"
379 //              "movd %%mm0, (%1, %2, 4)\n\t"
380                 "movd %%mm0, %0                                 \n\t"
381                 : "=r" (isOk)
382                 : "r" (src), "r" (stride)
383                 );
384         return isOk;
385 #else
386
387         int isOk2= 1;
388         int x;
389         src+= stride*3;
390         for(x=0; x<BLOCK_SIZE; x++)
391         {
392                 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
393         }
394 /*      if(isOk && !isOk2 || !isOk && isOk2)
395         {
396                 printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
397                 for(int y=0; y<9; y++)
398                 {
399                         for(int x=0; x<8; x++)
400                         {
401                                 printf("%d ", src[x + y*stride]);
402                         }
403                         printf("\n");
404                 }
405         } */
406
407         return isOk2;
408 #endif
409
410 }
411
412 /**
413  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
414  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * The 8 lines starting 4 lines below src are rewritten in place. Beyond the
 * two ends of the filtered range the filter uses a "first" and a "last" line:
 * per pixel, the outer neighbour line if it is within QP of the adjacent
 * filtered line, otherwise the adjacent filtered line itself.
 * The C path tests "difference < QP", the MMX2/3DNow path "difference <= QP".
 *
 * The MMX2/3DNow path does not use the QP argument; it reads the quantizer
 * from the global pQPb instead, which must have been set up by the caller.
 *
 * @param src    top-left pixel of the 8x16 block
 * @param stride offset in bytes from one line to the next
 * @param QP     quantizer (used by the C path only)
415  */
416 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
417 {
418 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
419         src+= stride*3;
420         asm volatile(   //"movv %0 %1 %2\n\t"
421                 "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
422
423                 "movq (%0), %%mm6                               \n\t"
424                 "movq (%0, %1), %%mm5                           \n\t"
425                 "movq %%mm5, %%mm1                              \n\t"
426                 "movq %%mm6, %%mm2                              \n\t"
427                 "psubusb %%mm6, %%mm5                           \n\t"
428                 "psubusb %%mm1, %%mm2                           \n\t"
429                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
430                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
431                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
432
433                 "pand %%mm2, %%mm6                              \n\t"
434                 "pandn %%mm1, %%mm2                             \n\t"
435                 "por %%mm2, %%mm6                               \n\t"// First Line to Filter
436
437                 "movq (%0, %1, 8), %%mm5                        \n\t"
438                 "leal (%0, %1, 4), %%eax                        \n\t"
439                 "leal (%0, %1, 8), %%ebx                        \n\t"
440                 "subl %1, %%ebx                                 \n\t"
// %0 is advanced by one line here and restored by the "subl" at the very end
441                 "addl %1, %0                                    \n\t" // %0 points to line 1 not 0
442                 "movq (%0, %1, 8), %%mm7                        \n\t"
443                 "movq %%mm5, %%mm1                              \n\t"
444                 "movq %%mm7, %%mm2                              \n\t"
445                 "psubusb %%mm7, %%mm5                           \n\t"
446                 "psubusb %%mm1, %%mm2                           \n\t"
447                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
448                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
449                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
450
451                 "pand %%mm2, %%mm7                              \n\t"
452                 "pandn %%mm1, %%mm2                             \n\t"
453                 "por %%mm2, %%mm7                               \n\t" // Last Line to Filter
454
455
456                 //      1       2       3       4       5       6       7       8
457                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ebx     eax+4%1
458                 // 6 4 2 2 1 1
459                 // 6 4 4 2
460                 // 6 8 2
461
// In the comments below each digit column stands for one input line and gives
// its weight in the running average; "X" marks the line being written.
462                 "movq (%0, %1), %%mm0                           \n\t" //  1
463                 "movq %%mm0, %%mm1                              \n\t" //  1
464                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
465                 PAVGB(%%mm6, %%mm0)                                   //3 1     /4
466
467                 "movq (%0, %1, 4), %%mm2                        \n\t" //     1
468                 "movq %%mm2, %%mm5                              \n\t" //     1
469                 PAVGB((%%eax), %%mm2)                                 //    11  /2
470                 PAVGB((%0, %1, 2), %%mm2)                             //   211  /4
471                 "movq %%mm2, %%mm3                              \n\t" //   211  /4
472                 "movq (%0), %%mm4                               \n\t" // 1
473                 PAVGB(%%mm4, %%mm3)                                   // 4 211  /8
474                 PAVGB(%%mm0, %%mm3)                                   //642211  /16
475                 "movq %%mm3, (%0)                               \n\t" // X
476                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
477                 "movq %%mm1, %%mm0                              \n\t" //  1
478                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
479                 "movq %%mm4, %%mm3                              \n\t" // 1
480                 PAVGB((%0,%1,2), %%mm3)                               // 1 1    /2
481                 PAVGB((%%eax,%1,2), %%mm5)                            //     11 /2
482                 PAVGB((%%eax), %%mm5)                                 //    211 /4
483                 PAVGB(%%mm5, %%mm3)                                   // 2 2211 /8
484                 PAVGB(%%mm0, %%mm3)                                   //4242211 /16
485                 "movq %%mm3, (%0,%1)                            \n\t" //  X
486                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
487                 PAVGB(%%mm4, %%mm6)                                   //11      /2
488                 "movq (%%ebx), %%mm0                            \n\t" //       1
489                 PAVGB((%%eax, %1, 2), %%mm0)                          //      11/2
490                 "movq %%mm0, %%mm3                              \n\t" //      11/2
491                 PAVGB(%%mm1, %%mm0)                                   //  2   11/4
492                 PAVGB(%%mm6, %%mm0)                                   //222   11/8
493                 PAVGB(%%mm2, %%mm0)                                   //22242211/16
494                 "movq (%0, %1, 2), %%mm2                        \n\t" //   1
495                 "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
496                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
497                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
498                 PAVGB((%%ebx), %%mm0)                                 //       11       /2
499                 PAVGB(%%mm0, %%mm6)                                   //11     11       /4
500                 PAVGB(%%mm1, %%mm4)                                   // 11             /2
501                 PAVGB(%%mm2, %%mm1)                                   //  11            /2
502                 PAVGB(%%mm1, %%mm6)                                   //1122   11       /8
503                 PAVGB(%%mm5, %%mm6)                                   //112242211       /16
504                 "movq (%%eax), %%mm5                            \n\t" //    1
505                 "movq %%mm6, (%%eax)                            \n\t" //    X
506                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
507                 "movq (%%eax, %1, 4), %%mm6                     \n\t" //        1
508                 PAVGB(%%mm7, %%mm6)                                   //        11      /2
509                 PAVGB(%%mm4, %%mm6)                                   // 11     11      /4
510                 PAVGB(%%mm3, %%mm6)                                   // 11   2211      /8
511                 PAVGB(%%mm5, %%mm2)                                   //   11           /2
512                 "movq (%0, %1, 4), %%mm4                        \n\t" //     1
513                 PAVGB(%%mm4, %%mm2)                                   //   112          /4
514                 PAVGB(%%mm2, %%mm6)                                   // 112242211      /16
515                 "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
516                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
517                 PAVGB(%%mm7, %%mm1)                                   //  11     2      /4
518                 PAVGB(%%mm4, %%mm5)                                   //    11          /2
519                 PAVGB(%%mm5, %%mm0)                                   //    11 11       /4
520                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
521                 PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
522                 PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
523                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
524                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
525                 PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
526                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
527                 PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
528                 PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
529                 PAVGB(%%mm2, %%mm6)                                   //   1122424      /4
530                 "movq %%mm6, (%%ebx)                            \n\t" //       X
531                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
532                 PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
533                 PAVGB(%%mm7, %%mm5)                                   //    11   6      /8
534
535                 PAVGB(%%mm3, %%mm0)                                   //      112       /4
536                 PAVGB(%%mm0, %%mm5)                                   //    112246      /16
537                 "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
538                 "subl %1, %0                                    \n\t"
539
540                 :
541                 : "r" (src), "r" (stride)
542                 : "%eax", "%ebx"
543         );
544 #else
// l1..l9: byte offsets of the lines 1..9 below src (after src is advanced below)
545         const int l1= stride;
546         const int l2= stride + l1;
547         const int l3= stride + l2;
548         const int l4= stride + l3;
549         const int l5= stride + l4;
550         const int l6= stride + l5;
551         const int l7= stride + l6;
552         const int l8= stride + l7;
553         const int l9= stride + l8;
554         int x;
555         src+= stride*3;
// one column per iteration; lines l1..l8 of the column are rewritten
556         for(x=0; x<BLOCK_SIZE; x++)
557         {
558                 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
559                 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
560
// sums[i]: sum of two vertically adjacent (still unfiltered) pixels, with
// first/last standing in at the two ends; all sums are taken before any write
561                 int sums[9];
562                 sums[0] = first + src[l1];
563                 sums[1] = src[l1] + src[l2];
564                 sums[2] = src[l2] + src[l3];
565                 sums[3] = src[l3] + src[l4];
566                 sums[4] = src[l4] + src[l5];
567                 sums[5] = src[l5] + src[l6];
568                 sums[6] = src[l6] + src[l7];
569                 sums[7] = src[l7] + src[l8];
570                 sums[8] = src[l8] + last;
571
// each output is a weighted sum with total weight 16; "+ 8" rounds to nearest
572                 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
573                 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
574                 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
575                 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
576                 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
577                 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
578                 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
579                 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
580
581                 src++;
582         }
583
584 #endif
585 }
586
587 /**
588  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
589  * values are correctly clipped (MMX2)
590  * values are wraparound (C)
591  * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
592         0 8 16 24
593         x = 8
594         x/2 = 4
595         x/8 = 1
596         1 12 12 23
597  */
598 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
599 {
600 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
601         src+= stride*3;
602 // FIXME rounding
603         asm volatile(
604                 "pxor %%mm7, %%mm7                              \n\t" // 0
605                 "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
606                 "leal (%0, %1), %%eax                           \n\t"
607                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
608 //      0       1       2       3       4       5       6       7       8       9
609 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
610                 "movq pQPb, %%mm0                               \n\t" // QP,..., QP
611                 "movq %%mm0, %%mm1                              \n\t" // QP,..., QP
612                 "paddusb b02, %%mm0                             \n\t"
613                 "psrlw $2, %%mm0                                \n\t"
614                 "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
615                 "paddusb %%mm1, %%mm0                           \n\t" // QP*1.25 ...
616                 "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
617                 "movq (%%ebx), %%mm3                            \n\t" // line 5
618                 "movq %%mm2, %%mm4                              \n\t" // line 4
619                 "pcmpeqb %%mm5, %%mm5                           \n\t" // -1
620                 "pxor %%mm2, %%mm5                              \n\t" // -line 4 - 1
621                 PAVGB(%%mm3, %%mm5)
622                 "paddb %%mm6, %%mm5                             \n\t" // (l5-l4)/2
623                 "psubusb %%mm3, %%mm4                           \n\t"
624                 "psubusb %%mm2, %%mm3                           \n\t"
625                 "por %%mm3, %%mm4                               \n\t" // |l4 - l5|
626                 "psubusb %%mm0, %%mm4                           \n\t"
627                 "pcmpeqb %%mm7, %%mm4                           \n\t"
628                 "pand %%mm4, %%mm5                              \n\t" // d/2
629
630 //              "paddb %%mm6, %%mm2                             \n\t" // line 4 + 0x80
631                 "paddb %%mm5, %%mm2                             \n\t"
632 //              "psubb %%mm6, %%mm2                             \n\t"
633                 "movq %%mm2, (%0,%1, 4)                         \n\t"
634
635                 "movq (%%ebx), %%mm2                            \n\t"
636 //              "paddb %%mm6, %%mm2                             \n\t" // line 5 + 0x80
637                 "psubb %%mm5, %%mm2                             \n\t"
638 //              "psubb %%mm6, %%mm2                             \n\t"
639                 "movq %%mm2, (%%ebx)                            \n\t"
640
641                 "paddb %%mm6, %%mm5                             \n\t"
642                 "psrlw $2, %%mm5                                \n\t"
643                 "pand b3F, %%mm5                                \n\t"
644                 "psubb b20, %%mm5                               \n\t" // (l5-l4)/8
645
646                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
647                 "paddb %%mm6, %%mm2                             \n\t" // line 3 + 0x80
648                 "paddsb %%mm5, %%mm2                            \n\t"
649                 "psubb %%mm6, %%mm2                             \n\t"
650                 "movq %%mm2, (%%eax, %1, 2)                     \n\t"
651
652                 "movq (%%ebx, %1), %%mm2                        \n\t"
653                 "paddb %%mm6, %%mm2                             \n\t" // line 6 + 0x80
654                 "psubsb %%mm5, %%mm2                            \n\t"
655                 "psubb %%mm6, %%mm2                             \n\t"
656                 "movq %%mm2, (%%ebx, %1)                        \n\t"
657
658                 :
659                 : "r" (src), "r" (stride)
660                 : "%eax", "%ebx"
661         );
662 #else
663         const int l1= stride;
664         const int l2= stride + l1;
665         const int l3= stride + l2;
666         const int l4= stride + l3;
667         const int l5= stride + l4;
668         const int l6= stride + l5;
669 //      const int l7= stride + l6;
670 //      const int l8= stride + l7;
671 //      const int l9= stride + l8;
672         int x;
673         const int QP15= QP + (QP>>2);
674         src+= stride*3;
675         for(x=0; x<BLOCK_SIZE; x++)
676         {
677                 const int v = (src[x+l5] - src[x+l4]);
678                 if(ABS(v) < QP15)
679                 {
680                         src[x+l3] +=v>>3;
681                         src[x+l4] +=v>>1;
682                         src[x+l5] -=v>>1;
683                         src[x+l6] -=v>>3;
684
685                 }
686         }
687
688 #endif
689 }
690
691 /**
692  * Experimental Filter 1
693  * will not damage linear gradients
694  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
695  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
696  * MMX2 version does correct clipping C version doesnt
697  */
698 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
699 {
700 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
701         src+= stride*3;
702
703         asm volatile(
704                 "pxor %%mm7, %%mm7                              \n\t" // 0
705 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
706                 "leal (%0, %1), %%eax                           \n\t"
707                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
708 //      0       1       2       3       4       5       6       7       8       9
709 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
710                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
711                 "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
712                 "movq %%mm1, %%mm2                              \n\t" // line 4
713                 "psubusb %%mm0, %%mm1                           \n\t"
714                 "psubusb %%mm2, %%mm0                           \n\t"
715                 "por %%mm1, %%mm0                               \n\t" // |l2 - l3|
716                 "movq (%%ebx), %%mm3                            \n\t" // line 5
717                 "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
718                 "movq %%mm3, %%mm5                              \n\t" // line 5
719                 "psubusb %%mm4, %%mm3                           \n\t"
720                 "psubusb %%mm5, %%mm4                           \n\t"
721                 "por %%mm4, %%mm3                               \n\t" // |l5 - l6|
722                 PAVGB(%%mm3, %%mm0)                                   // (|l2 - l3| + |l5 - l6|)/2
723                 "movq %%mm2, %%mm1                              \n\t" // line 4
724                 "psubusb %%mm5, %%mm2                           \n\t"
725                 "movq %%mm2, %%mm4                              \n\t"
726                 "pcmpeqb %%mm7, %%mm2                           \n\t" // (l4 - l5) <= 0 ? -1 : 0
727                 "psubusb %%mm1, %%mm5                           \n\t"
728                 "por %%mm5, %%mm4                               \n\t" // |l4 - l5|
729                 "psubusb %%mm0, %%mm4           \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
730                 "movq %%mm4, %%mm3                              \n\t" // d
731                 "psubusb pQPb, %%mm4                            \n\t"
732                 "pcmpeqb %%mm7, %%mm4                           \n\t" // d <= QP ? -1 : 0
733                 "psubusb b01, %%mm3                             \n\t"
734                 "pand %%mm4, %%mm3                              \n\t" // d <= QP ? d : 0
735
736                 PAVGB(%%mm7, %%mm3)                                   // d/2
737                 "movq %%mm3, %%mm1                              \n\t" // d/2
738                 PAVGB(%%mm7, %%mm3)                                   // d/4
739                 PAVGB(%%mm1, %%mm3)                                   // 3*d/8
740
741                 "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
742                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
743                 "psubusb %%mm3, %%mm0                           \n\t"
744                 "pxor %%mm2, %%mm0                              \n\t"
745                 "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
746
747                 "movq (%%ebx), %%mm0                            \n\t" // line 5
748                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
749                 "paddusb %%mm3, %%mm0                           \n\t"
750                 "pxor %%mm2, %%mm0                              \n\t"
751                 "movq %%mm0, (%%ebx)                            \n\t" // line 5
752
753                 PAVGB(%%mm7, %%mm1)                                   // d/4
754
755                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
756                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
757                 "psubusb %%mm1, %%mm0                           \n\t"
758                 "pxor %%mm2, %%mm0                              \n\t"
759                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // line 3
760
761                 "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
762                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
763                 "paddusb %%mm1, %%mm0                           \n\t"
764                 "pxor %%mm2, %%mm0                              \n\t"
765                 "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
766
767                 PAVGB(%%mm7, %%mm1)                                   // d/8
768
769                 "movq (%%eax, %1), %%mm0                        \n\t" // line 2
770                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
771                 "psubusb %%mm1, %%mm0                           \n\t"
772                 "pxor %%mm2, %%mm0                              \n\t"
773                 "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
774
775                 "movq (%%ebx, %1, 2), %%mm0                     \n\t" // line 7
776                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
777                 "paddusb %%mm1, %%mm0                           \n\t"
778                 "pxor %%mm2, %%mm0                              \n\t"
779                 "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // line 7
780
781                 :
782                 : "r" (src), "r" (stride)
783                 : "%eax", "%ebx"
784         );
785 #else
786
787         const int l1= stride;
788         const int l2= stride + l1;
789         const int l3= stride + l2;
790         const int l4= stride + l3;
791         const int l5= stride + l4;
792         const int l6= stride + l5;
793         const int l7= stride + l6;
794 //      const int l8= stride + l7;
795 //      const int l9= stride + l8;
796         int x;
797
798         src+= stride*3;
799         for(x=0; x<BLOCK_SIZE; x++)
800         {
801                 int a= src[l3] - src[l4];
802                 int b= src[l4] - src[l5];
803                 int c= src[l5] - src[l6];
804
805                 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
806                 d= MAX(d, 0);
807
808                 if(d < QP)
809                 {
810                         int v = d * SIGN(-b);
811
812                         src[l2] +=v>>3;
813                         src[l3] +=v>>2;
814                         src[l4] +=(3*v)>>3;
815                         src[l5] -=(3*v)>>3;
816                         src[l6] -=v>>2;
817                         src[l7] -=v>>3;
818
819                 }
820                 src++;
821         }
822         /*
823         const int l1= stride;
824         const int l2= stride + l1;
825         const int l3= stride + l2;
826         const int l4= stride + l3;
827         const int l5= stride + l4;
828         const int l6= stride + l5;
829         const int l7= stride + l6;
830         const int l8= stride + l7;
831         const int l9= stride + l8;
832         for(int x=0; x<BLOCK_SIZE; x++)
833         {
834                 int v2= src[l2];
835                 int v3= src[l3];
836                 int v4= src[l4];
837                 int v5= src[l5];
838                 int v6= src[l6];
839                 int v7= src[l7];
840
841                 if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
842                 {
843                         src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
844                         src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
845                         src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
846                         src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
847                 }
848                 src++;
849         }
850 */
851 #endif
852 }
853
854 /**
855  * Experimental Filter 1 (Horizontal)
856  * will not damage linear gradients
857  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
858  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
859  * MMX2 version does correct clipping C version doesnt
860  * not identical with the vertical one
861  */
862 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
863 {
864         int y;
865         static uint64_t *lut= NULL;
866         if(lut==NULL)
867         {
868                 int i;
869                 lut= (uint64_t*)memalign(8, 256*8);
870                 for(i=0; i<256; i++)
871                 {
872                         int v= i < 128 ? 2*i : 2*(i-256);
873 /*
874 //Simulate 112242211 9-Tap filter
875                         uint64_t a= (v/16) & 0xFF;
876                         uint64_t b= (v/8) & 0xFF;
877                         uint64_t c= (v/4) & 0xFF;
878                         uint64_t d= (3*v/8) & 0xFF;
879 */
880 //Simulate piecewise linear interpolation
881                         uint64_t a= (v/16) & 0xFF;
882                         uint64_t b= (v*3/16) & 0xFF;
883                         uint64_t c= (v*5/16) & 0xFF;
884                         uint64_t d= (7*v/16) & 0xFF;
885                         uint64_t A= (0x100 - a)&0xFF;
886                         uint64_t B= (0x100 - b)&0xFF;
887                         uint64_t C= (0x100 - c)&0xFF;
888                         uint64_t D= (0x100 - c)&0xFF;
889
890                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
891                                 (D<<24) | (C<<16) | (B<<8) | (A);
892                         //lut[i] = (v<<32) | (v<<24);
893                 }
894         }
895
896 #if 0
897         asm volatile(
898                 "pxor %%mm7, %%mm7                              \n\t" // 0
899 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
900                 "leal (%0, %1), %%eax                           \n\t"
901                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
902
903                 "movq b80, %%mm6                                \n\t"
904                 "movd pQPb, %%mm5                               \n\t" // QP
905                 "movq %%mm5, %%mm4                              \n\t"
906                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
907                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
908                 "pxor %%mm5, %%mm5                              \n\t" // 0
909                 "psubb %%mm4, %%mm5                             \n\t" // -3QP
910                 "por bm11111110, %%mm5                          \n\t" // ...,FF,FF,-3QP
911                 "psllq $24, %%mm5                               \n\t"
912
913 //      0       1       2       3       4       5       6       7       8       9
914 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
915
916 #define HX1old(a) \
917                 "movd " #a ", %%mm0                             \n\t"\
918                 "movd 4" #a ", %%mm1                            \n\t"\
919                 "punpckldq %%mm1, %%mm0                         \n\t"\
920                 "movq %%mm0, %%mm1                              \n\t"\
921                 "movq %%mm0, %%mm2                              \n\t"\
922                 "psrlq $8, %%mm1                                \n\t"\
923                 "psubusb %%mm1, %%mm2                           \n\t"\
924                 "psubusb %%mm0, %%mm1                           \n\t"\
925                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
926                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
927                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
928                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
929                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
930                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
931                 "paddb %%mm5, %%mm1                             \n\t"\
932                 "psubusb %%mm5, %%mm1                           \n\t"\
933                 PAVGB(%%mm7, %%mm1)\
934                 "pxor %%mm2, %%mm1                              \n\t"\
935                 "psubb %%mm2, %%mm1                             \n\t"\
936                 "psrlq $24, %%mm1                               \n\t"\
937                 "movd %%mm1, %%ecx                              \n\t"\
938                 "paddb %%mm6, %%mm0                             \n\t"\
939                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
940                 "paddb %%mm6, %%mm0                             \n\t"\
941                 "movq %%mm0, " #a "                             \n\t"\
942
943 /*
944 HX1old((%0))
945 HX1old((%%eax))
946 HX1old((%%eax, %1))
947 HX1old((%%eax, %1, 2))
948 HX1old((%0, %1, 4))
949 HX1old((%%ebx))
950 HX1old((%%ebx, %1))
951 HX1old((%%ebx, %1, 2))
952 */
953
954 //FIXME add some comments, its unreadable ...
955 #define HX1b(a, c, b, d) \
956                 "movd " #a ", %%mm0                             \n\t"\
957                 "movd 4" #a ", %%mm1                            \n\t"\
958                 "punpckldq %%mm1, %%mm0                         \n\t"\
959                 "movd " #b ", %%mm4                             \n\t"\
960                 "movq %%mm0, %%mm1                              \n\t"\
961                 "movq %%mm0, %%mm2                              \n\t"\
962                 "psrlq $8, %%mm1                                \n\t"\
963                 "movd 4" #b ", %%mm3                            \n\t"\
964                 "psubusb %%mm1, %%mm2                           \n\t"\
965                 "psubusb %%mm0, %%mm1                           \n\t"\
966                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
967                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
968                 "punpckldq %%mm3, %%mm4                         \n\t"\
969                 "movq %%mm1, %%mm3                              \n\t"\
970                 "psllq $32, %%mm3                               \n\t" /* p´5 = |p1 - p2| */\
971                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
972                 "paddb %%mm6, %%mm0                             \n\t"\
973                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
974                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
975                 "movq %%mm4, %%mm3                              \n\t"\
976                 "paddb %%mm5, %%mm1                             \n\t"\
977                 "psubusb %%mm5, %%mm1                           \n\t"\
978                 "psrlq $8, %%mm3                                \n\t"\
979                 PAVGB(%%mm7, %%mm1)\
980                 "pxor %%mm2, %%mm1                              \n\t"\
981                 "psubb %%mm2, %%mm1                             \n\t"\
982                 "movq %%mm4, %%mm2                              \n\t"\
983                 "psrlq $24, %%mm1                               \n\t"\
984                 "psubusb %%mm3, %%mm2                           \n\t"\
985                 "movd %%mm1, %%ecx                              \n\t"\
986                 "psubusb %%mm4, %%mm3                           \n\t"\
987                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
988                 "por %%mm2, %%mm3                               \n\t" /* p´x = |px - p(x+1)| */\
989                 "paddb %%mm6, %%mm0                             \n\t"\
990                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
991                 "movq %%mm3, %%mm1                              \n\t"\
992                 "psllq $32, %%mm1                               \n\t" /* p´5 = |p1 - p2| */\
993                 "movq %%mm0, " #a "                             \n\t"\
994                 PAVGB(%%mm3, %%mm1)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
995                 "paddb %%mm6, %%mm4                             \n\t"\
996                 "psrlq $16, %%mm1                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
997                 "psubusb %%mm1, %%mm3                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
998                 "paddb %%mm5, %%mm3                             \n\t"\
999                 "psubusb %%mm5, %%mm3                           \n\t"\
1000                 PAVGB(%%mm7, %%mm3)\
1001                 "pxor %%mm2, %%mm3                              \n\t"\
1002                 "psubb %%mm2, %%mm3                             \n\t"\
1003                 "psrlq $24, %%mm3                               \n\t"\
1004                 "movd " #c ", %%mm0                             \n\t"\
1005                 "movd 4" #c ", %%mm1                            \n\t"\
1006                 "punpckldq %%mm1, %%mm0                         \n\t"\
1007                 "paddb %%mm6, %%mm0                             \n\t"\
1008                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
1009                 "paddb %%mm6, %%mm0                             \n\t"\
1010                 "movq %%mm0, " #c "                             \n\t"\
1011                 "movd %%mm3, %%ecx                              \n\t"\
1012                 "movd " #d ", %%mm0                             \n\t"\
1013                 "paddsb (%2, %%ecx, 8), %%mm4                   \n\t"\
1014                 "movd 4" #d ", %%mm1                            \n\t"\
1015                 "paddb %%mm6, %%mm4                             \n\t"\
1016                 "punpckldq %%mm1, %%mm0                         \n\t"\
1017                 "movq %%mm4, " #b "                             \n\t"\
1018                 "paddb %%mm6, %%mm0                             \n\t"\
1019                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
1020                 "paddb %%mm6, %%mm0                             \n\t"\
1021                 "movq %%mm0, " #d "                             \n\t"\
1022
1023 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
1024 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
1025
1026
1027                 :
1028                 : "r" (src), "r" (stride), "r" (lut)
1029                 : "%eax", "%ebx", "%ecx"
1030         );
1031 #else
1032
1033 //FIXME (has little in common with the mmx2 version)
1034         for(y=0; y<BLOCK_SIZE; y++)
1035         {
1036                 int a= src[1] - src[2];
1037                 int b= src[3] - src[4];
1038                 int c= src[5] - src[6];
1039
1040                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
1041
1042                 if(d < QP)
1043                 {
1044                         int v = d * SIGN(-b);
1045
1046                         src[1] +=v/8;
1047                         src[2] +=v/4;
1048                         src[3] +=3*v/8;
1049                         src[4] -=3*v/8;
1050                         src[5] -=v/4;
1051                         src[6] -=v/8;
1052
1053                 }
1054                 src+=stride;
1055         }
1056 #endif
1057 }
1058
1059
1060 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1061 {
1062 #ifdef HAVE_MMX
1063         src+= stride*4;
1064         //FIXME try pmul for *5 stuff
1065 //      src[0]=0;
1066         asm volatile(
1067                 "pxor %%mm7, %%mm7                              \n\t"
1068                 "leal (%0, %1), %%eax                           \n\t"
1069                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1070 //      0       1       2       3       4       5       6       7
1071 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1072 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1073
1074                 "movq (%0), %%mm0                               \n\t"
1075                 "movq %%mm0, %%mm1                              \n\t"
1076                 "punpcklbw %%mm7, %%mm0                         \n\t" // low part of line 0
1077                 "punpckhbw %%mm7, %%mm1                         \n\t" // high part of line 0
1078
1079                 "movq (%%eax), %%mm2                            \n\t"
1080                 "movq %%mm2, %%mm3                              \n\t"
1081                 "punpcklbw %%mm7, %%mm2                         \n\t" // low part of line 1
1082                 "punpckhbw %%mm7, %%mm3                         \n\t" // high part of line 1
1083
1084                 "movq (%%eax, %1), %%mm4                        \n\t"
1085                 "movq %%mm4, %%mm5                              \n\t"
1086                 "punpcklbw %%mm7, %%mm4                         \n\t" // low part of line 2
1087                 "punpckhbw %%mm7, %%mm5                         \n\t" // high part of line 2
1088
1089                 "paddw %%mm0, %%mm0                             \n\t" // 2L0
1090                 "paddw %%mm1, %%mm1                             \n\t" // 2H0
1091                 "psubw %%mm4, %%mm2                             \n\t" // L1 - L2
1092                 "psubw %%mm5, %%mm3                             \n\t" // H1 - H2
1093                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - L1 + L2
1094                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - H1 + H2
1095
1096                 "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1097                 "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1098                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2
1099                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2
1100
1101                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
1102                 "movq %%mm2, %%mm3                              \n\t"
1103                 "punpcklbw %%mm7, %%mm2                         \n\t" // L3
1104                 "punpckhbw %%mm7, %%mm3                         \n\t" // H3
1105
1106                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - L3
1107                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - H3
1108                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1109                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1110                 "movq %%mm0, temp0                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1111                 "movq %%mm1, temp1                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1112
1113                 "movq (%0, %1, 4), %%mm0                        \n\t"
1114                 "movq %%mm0, %%mm1                              \n\t"
1115                 "punpcklbw %%mm7, %%mm0                         \n\t" // L4
1116                 "punpckhbw %%mm7, %%mm1                         \n\t" // H4
1117
1118                 "psubw %%mm0, %%mm2                             \n\t" // L3 - L4
1119                 "psubw %%mm1, %%mm3                             \n\t" // H3 - H4
1120                 "movq %%mm2, temp2                              \n\t" // L3 - L4
1121                 "movq %%mm3, temp3                              \n\t" // H3 - H4
1122                 "paddw %%mm4, %%mm4                             \n\t" // 2L2
1123                 "paddw %%mm5, %%mm5                             \n\t" // 2H2
1124                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - L3 + L4
1125                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - H3 + H4
1126
1127                 "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1128                 "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1129                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4
1130                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4
1131 //50 opcodes so far
1132                 "movq (%%ebx), %%mm2                            \n\t"
1133                 "movq %%mm2, %%mm3                              \n\t"
1134                 "punpcklbw %%mm7, %%mm2                         \n\t" // L5
1135                 "punpckhbw %%mm7, %%mm3                         \n\t" // H5
1136                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - L5
1137                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - H5
1138                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1139                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1140
1141                 "movq (%%ebx, %1), %%mm6                        \n\t"
1142                 "punpcklbw %%mm7, %%mm6                         \n\t" // L6
1143                 "psubw %%mm6, %%mm2                             \n\t" // L5 - L6
1144                 "movq (%%ebx, %1), %%mm6                        \n\t"
1145                 "punpckhbw %%mm7, %%mm6                         \n\t" // H6
1146                 "psubw %%mm6, %%mm3                             \n\t" // H5 - H6
1147
1148                 "paddw %%mm0, %%mm0                             \n\t" // 2L4
1149                 "paddw %%mm1, %%mm1                             \n\t" // 2H4
1150                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - L5 + L6
1151                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - H5 + H6
1152
1153                 "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1154                 "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1155                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6
1156                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6
1157
1158                 "movq (%%ebx, %1, 2), %%mm2                     \n\t"
1159                 "movq %%mm2, %%mm3                              \n\t"
1160                 "punpcklbw %%mm7, %%mm2                         \n\t" // L7
1161                 "punpckhbw %%mm7, %%mm3                         \n\t" // H7
1162
1163                 "paddw %%mm2, %%mm2                             \n\t" // 2L7
1164                 "paddw %%mm3, %%mm3                             \n\t" // 2H7
1165                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1166                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1167
1168                 "movq temp0, %%mm2                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1169                 "movq temp1, %%mm3                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1170
1171 #ifdef HAVE_MMX2
1172                 "movq %%mm7, %%mm6                              \n\t" // 0
1173                 "psubw %%mm0, %%mm6                             \n\t"
1174                 "pmaxsw %%mm6, %%mm0                            \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1175                 "movq %%mm7, %%mm6                              \n\t" // 0
1176                 "psubw %%mm1, %%mm6                             \n\t"
1177                 "pmaxsw %%mm6, %%mm1                            \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1178                 "movq %%mm7, %%mm6                              \n\t" // 0
1179                 "psubw %%mm2, %%mm6                             \n\t"
1180                 "pmaxsw %%mm6, %%mm2                            \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1181                 "movq %%mm7, %%mm6                              \n\t" // 0
1182                 "psubw %%mm3, %%mm6                             \n\t"
1183                 "pmaxsw %%mm6, %%mm3                            \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1184 #else
1185                 "movq %%mm7, %%mm6                              \n\t" // 0
1186                 "pcmpgtw %%mm0, %%mm6                           \n\t"
1187                 "pxor %%mm6, %%mm0                              \n\t"
1188                 "psubw %%mm6, %%mm0                             \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1189                 "movq %%mm7, %%mm6                              \n\t" // 0
1190                 "pcmpgtw %%mm1, %%mm6                           \n\t"
1191                 "pxor %%mm6, %%mm1                              \n\t"
1192                 "psubw %%mm6, %%mm1                             \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1193                 "movq %%mm7, %%mm6                              \n\t" // 0
1194                 "pcmpgtw %%mm2, %%mm6                           \n\t"
1195                 "pxor %%mm6, %%mm2                              \n\t"
1196                 "psubw %%mm6, %%mm2                             \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1197                 "movq %%mm7, %%mm6                              \n\t" // 0
1198                 "pcmpgtw %%mm3, %%mm6                           \n\t"
1199                 "pxor %%mm6, %%mm3                              \n\t"
1200                 "psubw %%mm6, %%mm3                             \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1201 #endif
1202
1203 #ifdef HAVE_MMX2
1204                 "pminsw %%mm2, %%mm0                            \n\t"
1205                 "pminsw %%mm3, %%mm1                            \n\t"
1206 #else
1207                 "movq %%mm0, %%mm6                              \n\t"
1208                 "psubusw %%mm2, %%mm6                           \n\t"
1209                 "psubw %%mm6, %%mm0                             \n\t"
1210                 "movq %%mm1, %%mm6                              \n\t"
1211                 "psubusw %%mm3, %%mm6                           \n\t"
1212                 "psubw %%mm6, %%mm1                             \n\t"
1213 #endif
1214
1215                 "movq %%mm7, %%mm6                              \n\t" // 0
1216                 "pcmpgtw %%mm4, %%mm6                           \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1217                 "pxor %%mm6, %%mm4                              \n\t"
1218                 "psubw %%mm6, %%mm4                             \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1219                 "pcmpgtw %%mm5, %%mm7                           \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1220                 "pxor %%mm7, %%mm5                              \n\t"
1221                 "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1222 // 100 opcodes
1223                 "movd %2, %%mm2                                 \n\t" // QP
1224                 "punpcklwd %%mm2, %%mm2                         \n\t"
1225                 "punpcklwd %%mm2, %%mm2                         \n\t"
1226                 "psllw $3, %%mm2                                \n\t" // 8QP
1227                 "movq %%mm2, %%mm3                              \n\t" // 8QP
1228                 "pcmpgtw %%mm4, %%mm2                           \n\t"
1229                 "pcmpgtw %%mm5, %%mm3                           \n\t"
1230                 "pand %%mm2, %%mm4                              \n\t"
1231                 "pand %%mm3, %%mm5                              \n\t"
1232
1233
1234                 "psubusw %%mm0, %%mm4                           \n\t" // hd
1235                 "psubusw %%mm1, %%mm5                           \n\t" // ld
1236
1237
1238                 "movq w05, %%mm2                                \n\t" // 5
1239                 "pmullw %%mm2, %%mm4                            \n\t"
1240                 "pmullw %%mm2, %%mm5                            \n\t"
1241                 "movq w20, %%mm2                                \n\t" // 32
1242                 "paddw %%mm2, %%mm4                             \n\t"
1243                 "paddw %%mm2, %%mm5                             \n\t"
1244                 "psrlw $6, %%mm4                                \n\t"
1245                 "psrlw $6, %%mm5                                \n\t"
1246
1247 /*
1248                 "movq w06, %%mm2                                \n\t" // 6
1249                 "paddw %%mm2, %%mm4                             \n\t"
1250                 "paddw %%mm2, %%mm5                             \n\t"
1251                 "movq w1400, %%mm2                              \n\t" // 1400h = 5120 = 5/64*2^16
1252 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1253                 "pmulhw %%mm2, %%mm4                            \n\t" // hd/13
1254                 "pmulhw %%mm2, %%mm5                            \n\t" // ld/13
1255 */
1256
1257                 "movq temp2, %%mm0                              \n\t" // L3 - L4
1258                 "movq temp3, %%mm1                              \n\t" // H3 - H4
1259
1260                 "pxor %%mm2, %%mm2                              \n\t"
1261                 "pxor %%mm3, %%mm3                              \n\t"
1262
1263                 "pcmpgtw %%mm0, %%mm2                           \n\t" // sign (L3-L4)
1264                 "pcmpgtw %%mm1, %%mm3                           \n\t" // sign (H3-H4)
1265                 "pxor %%mm2, %%mm0                              \n\t"
1266                 "pxor %%mm3, %%mm1                              \n\t"
1267                 "psubw %%mm2, %%mm0                             \n\t" // |L3-L4|
1268                 "psubw %%mm3, %%mm1                             \n\t" // |H3-H4|
1269                 "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1270                 "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1271
1272                 "pxor %%mm6, %%mm2                              \n\t"
1273                 "pxor %%mm7, %%mm3                              \n\t"
1274                 "pand %%mm2, %%mm4                              \n\t"
1275                 "pand %%mm3, %%mm5                              \n\t"
1276
1277 #ifdef HAVE_MMX2
1278                 "pminsw %%mm0, %%mm4                            \n\t"
1279                 "pminsw %%mm1, %%mm5                            \n\t"
1280 #else
1281                 "movq %%mm4, %%mm2                              \n\t"
1282                 "psubusw %%mm0, %%mm2                           \n\t"
1283                 "psubw %%mm2, %%mm4                             \n\t"
1284                 "movq %%mm5, %%mm2                              \n\t"
1285                 "psubusw %%mm1, %%mm2                           \n\t"
1286                 "psubw %%mm2, %%mm5                             \n\t"
1287 #endif
1288                 "pxor %%mm6, %%mm4                              \n\t"
1289                 "pxor %%mm7, %%mm5                              \n\t"
1290                 "psubw %%mm6, %%mm4                             \n\t"
1291                 "psubw %%mm7, %%mm5                             \n\t"
1292                 "packsswb %%mm5, %%mm4                          \n\t"
1293                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1294                 "paddb   %%mm4, %%mm0                           \n\t"
1295                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1296                 "movq (%0, %1, 4), %%mm0                        \n\t"
1297                 "psubb %%mm4, %%mm0                             \n\t"
1298                 "movq %%mm0, (%0, %1, 4)                        \n\t"
1299
1300                 :
1301                 : "r" (src), "r" (stride), "r" (QP)
1302                 : "%eax", "%ebx"
1303         );
1304 #else
1305         const int l1= stride;
1306         const int l2= stride + l1;
1307         const int l3= stride + l2;
1308         const int l4= stride + l3;
1309         const int l5= stride + l4;
1310         const int l6= stride + l5;
1311         const int l7= stride + l6;
1312         const int l8= stride + l7;
1313 //      const int l9= stride + l8;
1314         int x;
1315         src+= stride*3;
1316         for(x=0; x<BLOCK_SIZE; x++)
1317         {
1318                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1319                 if(ABS(middleEnergy) < 8*QP)
1320                 {
1321                         const int q=(src[l4] - src[l5])/2;
1322                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1323                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1324
1325                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1326                         d= MAX(d, 0);
1327
1328                         d= (5*d + 32) >> 6;
1329                         d*= SIGN(-middleEnergy);
1330
1331                         if(q>0)
1332                         {
1333                                 d= d<0 ? 0 : d;
1334                                 d= d>q ? q : d;
1335                         }
1336                         else
1337                         {
1338                                 d= d>0 ? 0 : d;
1339                                 d= d<q ? q : d;
1340                         }
1341
1342                         src[l4]-= d;
1343                         src[l5]+= d;
1344                 }
1345                 src++;
1346         }
1347 #endif
1348 }
1349
1350 //FIXME?  |255-0| = 1
1351 /**
1352  * Check if the given 8x8 Block is mostly "flat"
1353  */
1354 static inline int isHorizDC(uint8_t src[], int stride)
1355 {
1356 //      src++;
1357         int numEq= 0;
1358 #if 0
1359 asm volatile (
1360 //              "int $3 \n\t"
1361                 "leal (%1, %2), %%ecx                           \n\t"
1362                 "leal (%%ecx, %2, 4), %%ebx                     \n\t"
1363 //      0       1       2       3       4       5       6       7       8       9
1364 //      %1      ecx     ecx+%2  ecx+2%2 %1+4%2  ebx     ebx+%2  ebx+2%2 %1+8%2  ebx+4%2
1365                 "movq b7E, %%mm7                                \n\t" // mm7 = 0x7F
1366                 "movq b7C, %%mm6                                \n\t" // mm6 = 0x7D
1367                 "pxor %%mm0, %%mm0                              \n\t"
1368                 "movl %1, %%eax                                 \n\t"
1369                 "andl $0x1F, %%eax                              \n\t"
1370                 "cmpl $24, %%eax                                \n\t"
1371                 "leal tempBlock, %%eax                          \n\t"
1372                 "jb 1f                                          \n\t"
1373
1374 #define HDC_CHECK_AND_CPY(src, dst) \
1375                 "movd " #src ", %%mm2                           \n\t"\
1376                 "punpckldq 4" #src ", %%mm2                             \n\t" /* (%1) */\
1377                 "movq %%mm2, %%mm1                              \n\t"\
1378                 "psrlq $8, %%mm2                                \n\t"\
1379                 "psubb %%mm1, %%mm2                             \n\t"\
1380                 "paddb %%mm7, %%mm2                             \n\t"\
1381                 "pcmpgtb %%mm6, %%mm2                           \n\t"\
1382                 "paddb %%mm2, %%mm0                             \n\t"\
1383                 "movq %%mm1," #dst "(%%eax)                     \n\t"
1384
1385                 HDC_CHECK_AND_CPY((%1),0)
1386                 HDC_CHECK_AND_CPY((%%ecx),8)
1387                 HDC_CHECK_AND_CPY((%%ecx, %2),16)
1388                 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
1389                 HDC_CHECK_AND_CPY((%1, %2, 4),32)
1390                 HDC_CHECK_AND_CPY((%%ebx),40)
1391                 HDC_CHECK_AND_CPY((%%ebx, %2),48)
1392                 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
1393                 "jmp 2f                                         \n\t"
1394                 "1:                                             \n\t"
1395 // src does not cross a 32 byte cache line so dont waste time with alignment
1396 #define HDC_CHECK_AND_CPY2(src, dst) \
1397                 "movq " #src ", %%mm2                           \n\t"\
1398                 "movq " #src ", %%mm1                           \n\t"\
1399                 "psrlq $8, %%mm2                                \n\t"\
1400                 "psubb %%mm1, %%mm2                             \n\t"\
1401                 "paddb %%mm7, %%mm2                             \n\t"\
1402                 "pcmpgtb %%mm6, %%mm2                           \n\t"\
1403                 "paddb %%mm2, %%mm0                             \n\t"\
1404                 "movq %%mm1," #dst "(%%eax)                     \n\t"
1405
1406                 HDC_CHECK_AND_CPY2((%1),0)
1407                 HDC_CHECK_AND_CPY2((%%ecx),8)
1408                 HDC_CHECK_AND_CPY2((%%ecx, %2),16)
1409                 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
1410                 HDC_CHECK_AND_CPY2((%1, %2, 4),32)
1411                 HDC_CHECK_AND_CPY2((%%ebx),40)
1412                 HDC_CHECK_AND_CPY2((%%ebx, %2),48)
1413                 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
1414                 "2:                                             \n\t"
1415                 "psllq $8, %%mm0                                \n\t" // remove dummy value
1416                 "movq %%mm0, %%mm1                              \n\t"
1417                 "psrlw $8, %%mm0                                \n\t"
1418                 "paddb %%mm1, %%mm0                             \n\t"
1419                 "movq %%mm0, %%mm1                              \n\t"
1420                 "psrlq $16, %%mm0                               \n\t"
1421                 "paddb %%mm1, %%mm0                             \n\t"
1422                 "movq %%mm0, %%mm1                              \n\t"
1423                 "psrlq $32, %%mm0                               \n\t"
1424                 "paddb %%mm1, %%mm0                             \n\t"
1425                 "movd %%mm0, %0                                 \n\t"
1426                 : "=r" (numEq)
1427                 : "r" (src), "r" (stride)
1428                 : "%eax", "%ebx", "%ecx"
1429                 );
1430 //      printf("%d\n", numEq);
1431         numEq= (256 - numEq) &0xFF;
1432 #else
1433         int y;
1434         for(y=0; y<BLOCK_SIZE; y++)
1435         {
1436                 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1437                 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1438                 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1439                 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1440                 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1441                 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1442                 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1443                 src+= stride;
1444         }
1445 #endif
1446 /*      if(abs(numEq - asmEq) > 0)
1447         {
1448 //              printf("\nasm:%d  c:%d\n", asmEq, numEq);
1449                 for(int y=0; y<8; y++)
1450                 {
1451                         for(int x=0; x<8; x++)
1452                         {
1453                                 printf("%d ", src[x + y*stride]);
1454                         }
1455                         printf("\n");
1456                 }
1457         }
1458 */
1459 //      printf("%d\n", numEq);
1460         return numEq > hFlatnessThreshold;
1461 }
1462
1463 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1464 {
1465         if(abs(src[0] - src[7]) > 2*QP) return 0;
1466
1467         return 1;
1468 }
1469
1470 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1471 {
1472 #if 0
1473         asm volatile(
1474                 "leal (%0, %1), %%ecx                           \n\t"
1475                 "leal (%%ecx, %1, 4), %%ebx                     \n\t"
1476 //      0       1       2       3       4       5       6       7       8       9
1477 //      %0      ecx     ecx+%1  ecx+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1478                 "pxor %%mm7, %%mm7                              \n\t"
1479                 "movq bm00001000, %%mm6                         \n\t"
1480                 "movd %2, %%mm5                                 \n\t" // QP
1481                 "movq %%mm5, %%mm4                              \n\t"
1482                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
1483                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
1484                 "psllq $24, %%mm4                               \n\t"
1485                 "pxor %%mm5, %%mm5                              \n\t" // 0
1486                 "psubb %%mm4, %%mm5                             \n\t" // -QP
1487                 "leal tempBlock, %%eax                          \n\t"
1488
1489 //FIXME? "unroll by 2" and mix
1490 #ifdef HAVE_MMX2
1491 #define HDF(src, dst)   \
1492                 "movq " #src "(%%eax), %%mm0                    \n\t"\
1493                 "movq " #src "(%%eax), %%mm1                    \n\t"\
1494                 "movq " #src "(%%eax), %%mm2                    \n\t"\
1495                 "psrlq $8, %%mm1                                \n\t"\
1496                 "psubusb %%mm1, %%mm2                           \n\t"\
1497                 "psubusb %%mm0, %%mm1                           \n\t"\
1498                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1499                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1500                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
1501                 "pminub %%mm1, %%mm3                            \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1502                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1503                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1504                 "paddb %%mm5, %%mm1                             \n\t"\
1505                 "psubusb %%mm5, %%mm1                           \n\t"\
1506                 "psrlw $2, %%mm1                                \n\t"\
1507                 "pxor %%mm2, %%mm1                              \n\t"\
1508                 "psubb %%mm2, %%mm1                             \n\t"\
1509                 "pand %%mm6, %%mm1                              \n\t"\
1510                 "psubb %%mm1, %%mm0                             \n\t"\
1511                 "psllq $8, %%mm1                                \n\t"\
1512                 "paddb %%mm1, %%mm0                             \n\t"\
1513                 "movd %%mm0, " #dst"                            \n\t"\
1514                 "psrlq $32, %%mm0                               \n\t"\
1515                 "movd %%mm0, 4" #dst"                           \n\t"
1516 #else
1517 #define HDF(src, dst)\
1518                 "movq " #src "(%%eax), %%mm0                    \n\t"\
1519                 "movq %%mm0, %%mm1                              \n\t"\
1520                 "movq %%mm0, %%mm2                              \n\t"\
1521                 "psrlq $8, %%mm1                                \n\t"\
1522                 "psubusb %%mm1, %%mm2                           \n\t"\
1523                 "psubusb %%mm0, %%mm1                           \n\t"\
1524                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1525                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1526                 "movq %%mm1, %%mm3                              \n\t"\
1527                 "psllq $32, %%mm3                               \n\t"\
1528                 "movq %%mm3, %%mm4                              \n\t"\
1529                 "psubusb %%mm1, %%mm4                           \n\t"\
1530                 "psubb %%mm4, %%mm3                             \n\t"\
1531                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1532                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
1533                 "paddb %%mm5, %%mm1                             \n\t"\
1534                 "psubusb %%mm5, %%mm1                           \n\t"\
1535                 "psrlw $2, %%mm1                                \n\t"\
1536                 "pxor %%mm2, %%mm1                              \n\t"\
1537                 "psubb %%mm2, %%mm1                             \n\t"\
1538                 "pand %%mm6, %%mm1                              \n\t"\
1539                 "psubb %%mm1, %%mm0                             \n\t"\
1540                 "psllq $8, %%mm1                                \n\t"\
1541                 "paddb %%mm1, %%mm0                             \n\t"\
1542                 "movd %%mm0, " #dst "                           \n\t"\
1543                 "psrlq $32, %%mm0                               \n\t"\
1544                 "movd %%mm0, 4" #dst "                          \n\t"
1545 #endif
1546                 HDF(0,(%0))
1547                 HDF(8,(%%ecx))
1548                 HDF(16,(%%ecx, %1))
1549                 HDF(24,(%%ecx, %1, 2))
1550                 HDF(32,(%0, %1, 4))
1551                 HDF(40,(%%ebx))
1552                 HDF(48,(%%ebx, %1))
1553                 HDF(56,(%%ebx, %1, 2))
1554                 :
1555                 : "r" (dst), "r" (stride), "r" (QP)
1556                 : "%eax", "%ebx", "%ecx"
1557         );
1558 #else
1559         int y;
1560         for(y=0; y<BLOCK_SIZE; y++)
1561         {
1562                 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1563
1564                 if(ABS(middleEnergy) < 8*QP)
1565                 {
1566                         const int q=(dst[3] - dst[4])/2;
1567                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1568                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1569
1570                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1571                         d= MAX(d, 0);
1572
1573                         d= (5*d + 32) >> 6;
1574                         d*= SIGN(-middleEnergy);
1575
1576                         if(q>0)
1577                         {
1578                                 d= d<0 ? 0 : d;
1579                                 d= d>q ? q : d;
1580                         }
1581                         else
1582                         {
1583                                 d= d>0 ? 0 : d;
1584                                 d= d<q ? q : d;
1585                         }
1586
1587                         dst[3]-= d;
1588                         dst[4]+= d;
1589                 }
1590                 dst+= stride;
1591         }
1592 #endif
1593 }
1594
1595 /**
1596  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1597  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1598  * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1599  */
1600 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1601 {
1602
1603 #if 0
1604         asm volatile(
1605                 "leal (%0, %1), %%ecx                           \n\t"
1606                 "leal (%%ecx, %1, 4), %%ebx                     \n\t"
1607 //      0       1       2       3       4       5       6       7       8       9
1608 //      %0      ecx     ecx+%1  ecx+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1609                 "pxor %%mm7, %%mm7                                      \n\t"
1610                 "leal tempBlock, %%eax                                  \n\t"
1611 /*
1612 #define HLP1    "movq (%0), %%mm0                                       \n\t"\
1613                 "movq %%mm0, %%mm1                                      \n\t"\
1614                 "psllq $8, %%mm0                                        \n\t"\
1615                 PAVGB(%%mm1, %%mm0)\
1616                 "psrlw $8, %%mm0                                        \n\t"\
1617                 "pxor %%mm1, %%mm1                                      \n\t"\
1618                 "packuswb %%mm1, %%mm0                                  \n\t"\
1619                 "movq %%mm0, %%mm1                                      \n\t"\
1620                 "movq %%mm0, %%mm2                                      \n\t"\
1621                 "psllq $32, %%mm0                                       \n\t"\
1622                 "paddb %%mm0, %%mm1                                     \n\t"\
1623                 "psllq $16, %%mm2                                       \n\t"\
1624                 PAVGB(%%mm2, %%mm0)\
1625                 "movq %%mm0, %%mm3                                      \n\t"\
1626                 "pand bm11001100, %%mm0                                 \n\t"\
1627                 "paddusb %%mm0, %%mm3                                   \n\t"\
1628                 "psrlq $8, %%mm3                                        \n\t"\
1629                 PAVGB(%%mm1, %%mm4)\
1630                 PAVGB(%%mm3, %%mm2)\
1631                 "psrlq $16, %%mm2                                       \n\t"\
1632                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1633                 "movq %%mm2, (%0)                                       \n\t"\
1634
1635 #define HLP2    "movq (%0), %%mm0                                       \n\t"\
1636                 "movq %%mm0, %%mm1                                      \n\t"\
1637                 "psllq $8, %%mm0                                        \n\t"\
1638                 PAVGB(%%mm1, %%mm0)\
1639                 "psrlw $8, %%mm0                                        \n\t"\
1640                 "pxor %%mm1, %%mm1                                      \n\t"\
1641                 "packuswb %%mm1, %%mm0                                  \n\t"\
1642                 "movq %%mm0, %%mm2                                      \n\t"\
1643                 "psllq $32, %%mm0                                       \n\t"\
1644                 "psllq $16, %%mm2                                       \n\t"\
1645                 PAVGB(%%mm2, %%mm0)\
1646                 "movq %%mm0, %%mm3                                      \n\t"\
1647                 "pand bm11001100, %%mm0                                 \n\t"\
1648                 "paddusb %%mm0, %%mm3                                   \n\t"\
1649                 "psrlq $8, %%mm3                                        \n\t"\
1650                 PAVGB(%%mm3, %%mm2)\
1651                 "psrlq $16, %%mm2                                       \n\t"\
1652                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1653                 "movq %%mm2, (%0)                                       \n\t"\
1654 */
1655 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1656 /*
1657 Implemented     Exact 7-Tap
1658  9421           A321
1659  36421          64321
1660  334321         =
1661  1234321        =
1662   1234321       =
1663    123433       =
1664     12463         12346
1665      1249          123A
1666
1667 */
1668
1669 #ifdef HAVE_MMX2
1670 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1671                 "movq %%mm0, %%mm1                                      \n\t"\
1672                 "movq %%mm0, %%mm2                                      \n\t"\
1673                 "movq %%mm0, %%mm3                                      \n\t"\
1674                 "movq %%mm0, %%mm4                                      \n\t"\
1675                 "psllq $8, %%mm1                                        \n\t"\
1676                 "psrlq $8, %%mm2                                        \n\t"\
1677                 "pand bm00000001, %%mm3                                 \n\t"\
1678                 "pand bm10000000, %%mm4                                 \n\t"\
1679                 "por %%mm3, %%mm1                                       \n\t"\
1680                 "por %%mm4, %%mm2                                       \n\t"\
1681                 PAVGB(%%mm2, %%mm1)\
1682                 PAVGB(%%mm1, %%mm0)\
1683 \
1684                 "pshufw $0xF9, %%mm0, %%mm3                             \n\t"\
1685                 "pshufw $0x90, %%mm0, %%mm4                             \n\t"\
1686                 PAVGB(%%mm3, %%mm4)\
1687                 PAVGB(%%mm4, %%mm0)\
1688                 "movd %%mm0, (%0)                                       \n\t"\
1689                 "psrlq $32, %%mm0                                       \n\t"\
1690                 "movd %%mm0, 4(%0)                                      \n\t"
1691 #else
1692 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1693                 "movq %%mm0, %%mm1                                      \n\t"\
1694                 "movq %%mm0, %%mm2                                      \n\t"\
1695                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1696                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1697                 "psllq $8, %%mm1                                        \n\t"\
1698                 "psrlq $8, %%mm2                                        \n\t"\
1699                 "psrlq $24, %%mm3                                       \n\t"\
1700                 "psllq $56, %%mm4                                       \n\t"\
1701                 "por %%mm3, %%mm1                                       \n\t"\
1702                 "por %%mm4, %%mm2                                       \n\t"\
1703                 PAVGB(%%mm2, %%mm1)\
1704                 PAVGB(%%mm1, %%mm0)\
1705 \
1706                 "movq %%mm0, %%mm3                                      \n\t"\
1707                 "movq %%mm0, %%mm4                                      \n\t"\
1708                 "movq %%mm0, %%mm5                                      \n\t"\
1709                 "psrlq $16, %%mm3                                       \n\t"\
1710                 "psllq $16, %%mm4                                       \n\t"\
1711                 "pand bm11000000, %%mm5                                 \n\t"\
1712                 "por %%mm5, %%mm3                                       \n\t"\
1713                 "movq %%mm0, %%mm5                                      \n\t"\
1714                 "pand bm00000011, %%mm5                                 \n\t"\
1715                 "por %%mm5, %%mm4                                       \n\t"\
1716                 PAVGB(%%mm3, %%mm4)\
1717                 PAVGB(%%mm4, %%mm0)\
1718                 "movd %%mm0, (%0)                                       \n\t"\
1719                 "psrlq $32, %%mm0                                       \n\t"\
1720                 "movd %%mm0, 4(%0)                                      \n\t"
1721 #endif
1722
1723 /* uses the 7-Tap Filter: 1112111 */
1724 #define NEW_HLP(src, dst)\
1725                 "movq " #src "(%%eax), %%mm1                            \n\t"\
1726                 "movq " #src "(%%eax), %%mm2                            \n\t"\
1727                 "psllq $8, %%mm1                                        \n\t"\
1728                 "psrlq $8, %%mm2                                        \n\t"\
1729                 "movd -4" #dst ", %%mm3                                 \n\t" /*0001000*/\
1730                 "movd 8" #dst ", %%mm4                                  \n\t" /*0001000*/\
1731                 "psrlq $24, %%mm3                                       \n\t"\
1732                 "psllq $56, %%mm4                                       \n\t"\
1733                 "por %%mm3, %%mm1                                       \n\t"\
1734                 "por %%mm4, %%mm2                                       \n\t"\
1735                 "movq %%mm1, %%mm5                                      \n\t"\
1736                 PAVGB(%%mm2, %%mm1)\
1737                 "movq " #src "(%%eax), %%mm0                            \n\t"\
1738                 PAVGB(%%mm1, %%mm0)\
1739                 "psllq $8, %%mm5                                        \n\t"\
1740                 "psrlq $8, %%mm2                                        \n\t"\
1741                 "por %%mm3, %%mm5                                       \n\t"\
1742                 "por %%mm4, %%mm2                                       \n\t"\
1743                 "movq %%mm5, %%mm1                                      \n\t"\
1744                 PAVGB(%%mm2, %%mm5)\
1745                 "psllq $8, %%mm1                                        \n\t"\
1746                 "psrlq $8, %%mm2                                        \n\t"\
1747                 "por %%mm3, %%mm1                                       \n\t"\
1748                 "por %%mm4, %%mm2                                       \n\t"\
1749                 PAVGB(%%mm2, %%mm1)\
1750                 PAVGB(%%mm1, %%mm5)\
1751                 PAVGB(%%mm5, %%mm0)\
1752                 "movd %%mm0, " #dst "                                   \n\t"\
1753                 "psrlq $32, %%mm0                                       \n\t"\
1754                 "movd %%mm0, 4" #dst "                                  \n\t"
1755
1756 /* uses the 9-Tap Filter: 112242211 */
1757 #define NEW_HLP2(i)\
1758                 "movq " #i "(%%eax), %%mm0                              \n\t" /*0001000*/\
1759                 "movq %%mm0, %%mm1                                      \n\t" /*0001000*/\
1760                 "movq %%mm0, %%mm2                                      \n\t" /*0001000*/\
1761                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1762                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1763                 "psllq $8, %%mm1                                        \n\t"\
1764                 "psrlq $8, %%mm2                                        \n\t"\
1765                 "psrlq $24, %%mm3                                       \n\t"\
1766                 "psllq $56, %%mm4                                       \n\t"\
1767                 "por %%mm3, %%mm1                                       \n\t" /*0010000*/\
1768                 "por %%mm4, %%mm2                                       \n\t" /*0000100*/\
1769                 "movq %%mm1, %%mm5                                      \n\t" /*0010000*/\
1770                 PAVGB(%%mm2, %%mm1)                                           /*0010100*/\
1771                 PAVGB(%%mm1, %%mm0)                                           /*0012100*/\
1772                 "psllq $8, %%mm5                                        \n\t"\
1773                 "psrlq $8, %%mm2                                        \n\t"\
1774                 "por %%mm3, %%mm5                                       \n\t" /*0100000*/\
1775                 "por %%mm4, %%mm2                                       \n\t" /*0000010*/\
1776                 "movq %%mm5, %%mm1                                      \n\t" /*0100000*/\
1777                 PAVGB(%%mm2, %%mm5)                                           /*0100010*/\
1778                 "psllq $8, %%mm1                                        \n\t"\
1779                 "psrlq $8, %%mm2                                        \n\t"\
1780                 "por %%mm3, %%mm1                                       \n\t" /*1000000*/\
1781                 "por %%mm4, %%mm2                                       \n\t" /*0000001*/\
1782                 "movq %%mm1, %%mm6                                      \n\t" /*1000000*/\
1783                 PAVGB(%%mm2, %%mm1)                                           /*1000001*/\
1784                 "psllq $8, %%mm6                                        \n\t"\
1785                 "psrlq $8, %%mm2                                        \n\t"\
1786                 "por %%mm3, %%mm6                                       \n\t"/*100000000*/\
1787                 "por %%mm4, %%mm2                                       \n\t"/*000000001*/\
1788                 PAVGB(%%mm2, %%mm6)                                          /*100000001*/\
1789                 PAVGB(%%mm6, %%mm1)                                          /*110000011*/\
1790                 PAVGB(%%mm1, %%mm5)                                          /*112000211*/\
1791                 PAVGB(%%mm5, %%mm0)                                          /*112242211*/\
1792                 "movd %%mm0, (%0)                                       \n\t"\
1793                 "psrlq $32, %%mm0                                       \n\t"\
1794                 "movd %%mm0, 4(%0)                                      \n\t"
1795
1796 #define HLP(src, dst) NEW_HLP(src, dst)
1797
1798                 HLP(0, (%0))
1799                 HLP(8, (%%ecx))
1800                 HLP(16, (%%ecx, %1))
1801                 HLP(24, (%%ecx, %1, 2))
1802                 HLP(32, (%0, %1, 4))
1803                 HLP(40, (%%ebx))
1804                 HLP(48, (%%ebx, %1))
1805                 HLP(56, (%%ebx, %1, 2))
1806
1807                 :
1808                 : "r" (dst), "r" (stride)
1809                 : "%eax", "%ebx", "%ecx"
1810         );
1811
1812 #else
1813         int y;
1814         for(y=0; y<BLOCK_SIZE; y++)
1815         {
1816                 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1817                 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1818
1819                 int sums[9];
1820                 sums[0] = first + dst[0];
1821                 sums[1] = dst[0] + dst[1];
1822                 sums[2] = dst[1] + dst[2];
1823                 sums[3] = dst[2] + dst[3];
1824                 sums[4] = dst[3] + dst[4];
1825                 sums[5] = dst[4] + dst[5];
1826                 sums[6] = dst[5] + dst[6];
1827                 sums[7] = dst[6] + dst[7];
1828                 sums[8] = dst[7] + last;
1829
1830                 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1831                 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1832                 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1833                 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1834                 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1835                 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1836                 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1837                 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1838
1839                 dst+= stride;
1840         }
1841 #endif
1842 }
1843
/**
 * Deringing filter for one 8x8 block.
 *
 * C fallback (the reference described here):
 *  - reads rows 0-9 / columns 0-9 relative to src, writes rows 1-8 / columns 1-8
 *  - avg = rounded mean of the minimum and maximum of the inner 8x8 pixels
 *  - a pixel is filtered only if its whole 3x3 neighbourhood lies on the same
 *    side of avg (all > avg or all <= avg)
 *  - filtered pixels get the 3x3 kernel (1 2 1 / 2 4 2 / 1 2 1)/16, with the
 *    change limited to +-2*QP
 *
 * The MMX2/3DNow path takes its threshold from the global pQPb (doubled into
 * pQPb2) and never references the QP operand passed to the asm statement.
 *
 * @param src    pointer to row 0 / column 0 of the 10x10 area described above
 * @param stride distance in bytes between two rows
 * @param QP     quantiser; limits how far a pixel may be moved (C path)
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq pQPb, %%mm0                               \n\t"
                "paddusb %%mm0, %%mm0                           \n\t"
                "movq %%mm0, pQPb2                              \n\t"

                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "pcmpeqb %%mm6, %%mm6                           \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                          \n\t"\
                "pminub %%mm0, %%mm6                            \n\t"\
                "pmaxub %%mm0, %%mm7                            \n\t"
#else
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                          \n\t"\
                "movq %%mm6, %%mm1                              \n\t"\
                "psubusb %%mm0, %%mm7                           \n\t"\
                "paddb %%mm0, %%mm7                             \n\t"\
                "psubusb %%mm0, %%mm1                           \n\t"\
                "psubb %%mm1, %%mm6                             \n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $8, %%mm6                                \n\t"
#ifdef HAVE_MMX2
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels
                "pshufw $0xF9, %%mm6, %%mm4                     \n\t"
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels
                "pshufw $0xFE, %%mm6, %%mm4                     \n\t"
                "pminub %%mm4, %%mm6                            \n\t"
#else
                "movq %%mm6, %%mm1                              \n\t"
                "psubusb %%mm4, %%mm1                           \n\t"
                "psubb %%mm1, %%mm6                             \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $16, %%mm6                               \n\t"
                "movq %%mm6, %%mm1                              \n\t"
                "psubusb %%mm4, %%mm1                           \n\t"
                "psubb %%mm1, %%mm6                             \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $32, %%mm6                               \n\t"
                "movq %%mm6, %%mm1                              \n\t"
                "psubusb %%mm4, %%mm1                           \n\t"
                "psubb %%mm1, %%mm6                             \n\t"
#endif


                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $8, %%mm7                                \n\t"
#ifdef HAVE_MMX2
                "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
                "pshufw $0xF9, %%mm7, %%mm4                     \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
                "pshufw $0xFE, %%mm7, %%mm4                     \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
#else
                "psubusb %%mm4, %%mm7                           \n\t"
                "paddb %%mm4, %%mm7                             \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $16, %%mm7                               \n\t"
                "psubusb %%mm4, %%mm7                           \n\t"
                "paddb %%mm4, %%mm7                             \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $32, %%mm7                               \n\t"
                "psubusb %%mm4, %%mm7                           \n\t"
                "paddb %%mm4, %%mm7                             \n\t"
#endif
                PAVGB(%%mm6, %%mm7)                                   // a=(max + min)/2
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "punpcklbw %%mm7, %%mm7                         \n\t"
                "movq %%mm7, temp0                              \n\t"

                "movq (%0), %%mm0                               \n\t" // L10
                "movq %%mm0, %%mm1                              \n\t" // L10
                "movq %%mm0, %%mm2                              \n\t" // L10
                "psllq $8, %%mm1                                \n\t"
                "psrlq $8, %%mm2                                \n\t"
                "movd -4(%0), %%mm3                             \n\t"
                "movd 8(%0), %%mm4                              \n\t"
                "psrlq $24, %%mm3                               \n\t"
                "psllq $56, %%mm4                               \n\t"
                "por %%mm3, %%mm1                               \n\t" // L00
                "por %%mm4, %%mm2                               \n\t" // L20
                "movq %%mm1, %%mm3                              \n\t" // L00
                PAVGB(%%mm2, %%mm1)                                   // (L20 + L00)/2
                PAVGB(%%mm0, %%mm1)                                   // (L20 + L00 + 2L10)/4
                "psubusb %%mm7, %%mm0                           \n\t"
                "psubusb %%mm7, %%mm2                           \n\t"
                "psubusb %%mm7, %%mm3                           \n\t"
                "pcmpeqb b00, %%mm0                             \n\t" // L10 > a ? 0 : -1
                "pcmpeqb b00, %%mm2                             \n\t" // L20 > a ? 0 : -1
                "pcmpeqb b00, %%mm3                             \n\t" // L00 > a ? 0 : -1
                "paddb %%mm2, %%mm0                             \n\t"
                "paddb %%mm3, %%mm0                             \n\t"

                "movq (%%eax), %%mm2                            \n\t" // L11
                "movq %%mm2, %%mm3                              \n\t" // L11
                "movq %%mm2, %%mm4                              \n\t" // L11
                "psllq $8, %%mm3                                \n\t"
                "psrlq $8, %%mm4                                \n\t"
                "movd -4(%%eax), %%mm5                          \n\t"
                "movd 8(%%eax), %%mm6                           \n\t"
                "psrlq $24, %%mm5                               \n\t"
                "psllq $56, %%mm6                               \n\t"
                "por %%mm5, %%mm3                               \n\t" // L01
                "por %%mm6, %%mm4                               \n\t" // L21
                "movq %%mm3, %%mm5                              \n\t" // L01
                PAVGB(%%mm4, %%mm3)                                   // (L21 + L01)/2
                PAVGB(%%mm2, %%mm3)                                   // (L21 + L01 + 2L11)/4
                "psubusb %%mm7, %%mm2                           \n\t"
                "psubusb %%mm7, %%mm4                           \n\t"
                "psubusb %%mm7, %%mm5                           \n\t"
                "pcmpeqb b00, %%mm2                             \n\t" // L11 > a ? 0 : -1
                "pcmpeqb b00, %%mm4                             \n\t" // L21 > a ? 0 : -1
                "pcmpeqb b00, %%mm5                             \n\t" // L01 > a ? 0 : -1
                "paddb %%mm4, %%mm2                             \n\t"
                "paddb %%mm5, %%mm2                             \n\t"
// 0, 2, 3, 1
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                "movq " #src ", " #sx "                         \n\t" /* src[0] */\
                "movq " #sx ", " #lx "                          \n\t" /* src[0] */\
                "movq " #sx ", " #t0 "                          \n\t" /* src[0] */\
                "psllq $8, " #lx "                              \n\t"\
                "psrlq $8, " #t0 "                              \n\t"\
                "movd -4" #src ", " #t1 "                       \n\t"\
                "psrlq $24, " #t1 "                             \n\t"\
                "por " #t1 ", " #lx "                           \n\t" /* src[-1] */\
                "movd 8" #src ", " #t1 "                        \n\t"\
                "psllq $56, " #t1 "                             \n\t"\
                "por " #t1 ", " #t0 "                           \n\t" /* src[+1] */\
                "movq " #lx ", " #t1 "                          \n\t" /* src[-1] */\
                PAVGB(t0, lx)                                         /* (src[-1] + src[+1])/2 */\
                PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
                PAVGB(lx, pplx)                                      \
                "movq " #lx ", temp1                            \n\t"\
                "movq temp0, " #lx "                            \n\t"\
                "psubusb " #lx ", " #t1 "                       \n\t"\
                "psubusb " #lx ", " #t0 "                       \n\t"\
                "psubusb " #lx ", " #sx "                       \n\t"\
                "movq b00, " #lx "                              \n\t"\
                "pcmpeqb " #lx ", " #t1 "                       \n\t" /* src[-1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #t0 "                       \n\t" /* src[+1] > a ? 0 : -1*/\
                "pcmpeqb " #lx ", " #sx "                       \n\t" /* src[0]  > a ? 0 : -1*/\
                "paddb " #t1 ", " #t0 "                         \n\t"\
                "paddb " #t0 ", " #sx "                         \n\t"\
\
                PAVGB(plx, pplx)                                      /* filtered */\
                "movq " #dst ", " #t0 "                         \n\t" /* dst */\
                "movq " #t0 ", " #t1 "                          \n\t" /* dst */\
                "psubusb pQPb2, " #t0 "                         \n\t"\
                "paddusb pQPb2, " #t1 "                         \n\t"\
                PMAXUB(t0, pplx)\
                PMINUB(t1, pplx, t0)\
                "paddb " #sx ", " #ppsx "                       \n\t"\
                "paddb " #psx ", " #ppsx "                      \n\t"\
        "#paddb b02, " #ppsx "                          \n\t"\
                "pand b08, " #ppsx "                            \n\t"\
                "pcmpeqb " #lx ", " #ppsx "                     \n\t"\
                "pand " #ppsx ", " #pplx "                      \n\t"\
                "pandn " #dst ", " #ppsx "                      \n\t"\
                "por " #pplx ", " #ppsx "                       \n\t"\
                "movq " #ppsx ", " #dst "                       \n\t"\
                "movq temp1, " #lx "                            \n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


                : : "r" (src), "r" (stride), "r" (QP)
                : "%eax", "%ebx"
        );
#else
        int y;
        int min=255;
        int max=0;
        int avg;
        const int QP2= 2*QP;            // maximum allowed change of a pixel
        uint8_t *p;
        /* per row: bit x      = columns x-1..x+1 are all >  avg
                    bit 16+x   = columns x-1..x+1 are all <= avg
           unsigned, because the mask construction shifts bits into and out of
           the top bit, which is undefined for signed int */
        unsigned int s[10];

        /* minimum / maximum of the inner 8x8 pixels */
        for(y=1; y<9; y++)
        {
                int x;
                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(*p > max) max= *p;
                        if(*p < min) min= *p;
                }
        }
        avg= (min + max + 1)/2;

        /* classify every pixel of the 10x10 area against avg */
        for(y=0; y<10; y++)
        {
                int x;
                unsigned int t = 0;
                p= src + stride*y;
                for(x=0; x<10; x++)
                {
                        if(*p > avg) t |= (1u<<x);
                        p++;
                }
                t |= (~t)<<16;                  // upper half: "<= avg" flags
                t &= (t<<1) & (t>>1);           // keep bits whose both horizontal neighbours agree
                s[y] = t;
        }

        for(y=1; y<9; y++)
        {
                int x;
                /* vertical agreement of the three rows, then merge both halves */
                unsigned int t = s[y-1] & s[y] & s[y+1];
                t|= t>>16;

                p= src + stride*y;
                for(x=1; x<9; x++)
                {
                        p++;
                        if(t & (1u<<x))
                        {
                                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                                f= (f + 8)>>4;

                                /* limit the correction to +-2*QP */
                                if     (*p + QP2 < f) *p= *p + QP2;
                                else if(*p - QP2 > f) *p= *p - QP2;
                                else *p=f;
                        }
                }
        }

#endif
}
2117
/**
 * Deinterlaces the given block by linear interpolation
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter reads lines 4, 6, 8, 10, 12 and replaces lines 5, 7, 9, 11 by the
 * rounded-up average of the line above and the line below
 * @param src    pointer to line 0 of the block (8 pixels wide)
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t"
                "movq (%%eax, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm1                        \n\t"
                PAVGB(%%mm1, %%mm0)
                "movq %%mm0, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm0                        \n\t"
                PAVGB(%%mm0, %%mm1)
                "movq %%mm1, (%%ebx, %1, 2)                     \n\t"

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#else
        int x;
        src+= 4*stride;
        for(x=0; x<8; x++)
        {
                /* +1: round up like the pavgb instruction the asm path is built on
                   (NOTE(review): PAVGB macro is defined elsewhere in this file - confirm its rounding),
                   so that both builds produce the same pixels */
                src[stride]   = (src[0]        + src[stride*2] + 1)>>1;
                src[stride*3] = (src[stride*2] + src[stride*4] + 1)>>1;
                src[stride*5] = (src[stride*4] + src[stride*6] + 1)>>1;
                src[stride*7] = (src[stride*6] + src[stride*8] + 1)>>1;
                src++;
        }
#endif
}
2164
/**
 * Deinterlaces the given block by cubic interpolation
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 3-15 (odd lines only) and write lines 6, 8, 10, 12
 * each written line is (-a + 9b + 9d - e)/16 of the 4 nearest odd lines,
 * clipped to 0..255 in both the C and the asm version
 * @param src    pointer to line 0 of the block (8 pixels wide)
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= stride*3;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
                "leal (%%ebx, %1, 4), %%ecx                     \n\t"
                "addl %1, %%ecx                                 \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx

#define DEINT_CUBIC(a,b,c,d,e)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm1                             \n\t"\
                "movq " #d ", %%mm2                             \n\t"\
                "movq " #e ", %%mm3                             \n\t"\
                PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
                PAVGB(%%mm3, %%mm0)                                     /* (a+e) /2 */\
                "movq %%mm0, %%mm2                              \n\t"\
                "punpcklbw %%mm7, %%mm0                         \n\t"\
                "punpckhbw %%mm7, %%mm2                         \n\t"\
                "movq %%mm1, %%mm3                              \n\t"\
                "punpcklbw %%mm7, %%mm1                         \n\t"\
                "punpckhbw %%mm7, %%mm3                         \n\t"\
                "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
                "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
                "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
                "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
                "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
                "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
                "packuswb %%mm3, %%mm1                          \n\t"\
                "movq %%mm1, " #c "                             \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx", "ecx"
        );
#else
        int x;
        src+= stride*3;
        for(x=0; x<8; x++)
        {
                int y;
                /* y = 3, 5, 7, 9 relative to line 3; only the untouched lines
                   in between are read, so the order of the writes is irrelevant */
                for(y=3; y<=9; y+=2)
                {
                        const int sum= 9*(src[stride*(y-1)] + src[stride*(y+1)])
                                       - src[stride*(y-3)] - src[stride*(y+3)];
                        /* the cubic kernel over/undershoots at sharp edges: clip instead of
                           letting the store to uint8_t wrap around */
                        src[stride*y]= sum<0 ? 0 : (sum>=(256<<4) ? 255 : sum>>4);
                }
                src++;
        }
#endif
}
2229
/**
 * Deinterlaces the given block with a vertical (1 2 1)/4 blend
 * is called for every 8x8 block and may read & write lines 4-15
 * lines 0-3 already went through the deblock / dering filters, but may be read too
 * lines 4-12 are read by the deblocking filter afterwards and should be deinterlaced
 * every output line k is built from the input lines k, k+1 and k+2, so the
 * image moves up by 1 line (FIXME if this is a problem)
 * reads lines 4-13, writes lines 4-11
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        src+= 4*stride;
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm1                        \n\t" // L2
                PAVGB(%%mm1, %%mm0)                                   // L0+L2
                "movq (%%eax), %%mm2                            \n\t" // L1
                PAVGB(%%mm2, %%mm0)
                "movq %%mm0, (%0)                               \n\t"
                "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
                PAVGB(%%mm0, %%mm2)                                   // L1+L3
                PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
                "movq %%mm2, (%%eax)                            \n\t"
                "movq (%0, %1, 4), %%mm2                        \n\t" // L4
                PAVGB(%%mm2, %%mm1)                                   // L2+L4
                PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
                "movq %%mm1, (%%eax, %1)                        \n\t"
                "movq (%%ebx), %%mm1                            \n\t" // L5
                PAVGB(%%mm1, %%mm0)                                   // L3+L5
                PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
                "movq %%mm0, (%%eax, %1, 2)                     \n\t"
                "movq (%%ebx, %1), %%mm0                        \n\t" // L6
                PAVGB(%%mm0, %%mm2)                                   // L4+L6
                PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
                "movq %%mm2, (%0, %1, 4)                        \n\t"
                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                PAVGB(%%mm2, %%mm1)                                   // L5+L7
                PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
                "movq %%mm1, (%%ebx)                            \n\t"
                "movq (%0, %1, 8), %%mm1                        \n\t" // L8
                PAVGB(%%mm1, %%mm0)                                   // L6+L8
                PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
                "movq %%mm0, (%%ebx, %1)                        \n\t"
                "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
                PAVGB(%%mm0, %%mm2)                                   // L7+L9
                PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t"


                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#else
        int col;
        src+= 4*stride;
        for(col=0; col<8; col++)
        {
                uint8_t *line= src + col;
                int row;
                /* top to bottom: the two lines below the current one have not been
                   overwritten yet, so every output uses unfiltered input only */
                for(row=0; row<8; row++)
                {
                        line[0]= (line[0] + 2*line[stride] + line[stride*2])>>2;
                        line+= stride;
                }
        }
#endif
}
2304
/**
 * Deinterlaces the given block with a vertical 3-tap median filter.
 * Will be called for every 8x8 block and can read & write lines 4-15;
 * lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Line numbers below are relative to src + 4*stride. Lines 1, 3, 5 and 7 are each replaced by
 * the bytewise median of themselves and the lines directly above and below; the even lines
 * are only read, never written, so every median is computed from unfiltered neighbours.
 * The MMX2, MMX and plain C code paths all implement this same filter.
 *
 * @param src    top-left byte of the block (line 0, i.e. before the 4 line offset is applied)
 * @param stride distance in bytes between two consecutive lines
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
	src+= 4*stride;
#ifdef HAVE_MMX2
	asm volatile(
		"leal (%0, %1), %%eax                           \n\t"
		"leal (%%eax, %1, 4), %%ebx                     \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0                               \n\t" // L0
		"movq (%%eax, %1), %%mm2                        \n\t" // L2
		"movq (%%eax), %%mm1                            \n\t" // L1
		"movq %%mm0, %%mm3                              \n\t"
		"pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
		"pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
		"pmaxub %%mm2, %%mm1                            \n\t" // max(min(L0, L1), L2)
		"pminub %%mm1, %%mm0                            \n\t" // median(L0, L1, L2)
		"movq %%mm0, (%%eax)                            \n\t" // -> L1

		"movq (%0, %1, 4), %%mm0                        \n\t" // L4
		"movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
		"movq %%mm2, %%mm3                              \n\t"
		"pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
		"pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
		"pmaxub %%mm0, %%mm1                            \n\t" // max(min(L2, L3), L4)
		"pminub %%mm1, %%mm2                            \n\t" // median(L2, L3, L4)
		"movq %%mm2, (%%eax, %1, 2)                     \n\t" // -> L3

		"movq (%%ebx), %%mm2                            \n\t" // L5
		"movq (%%ebx, %1), %%mm1                        \n\t" // L6
		"movq %%mm2, %%mm3                              \n\t"
		"pmaxub %%mm0, %%mm2                            \n\t" // max(L4, L5)
		"pminub %%mm3, %%mm0                            \n\t" // min(L4, L5)
		"pmaxub %%mm1, %%mm0                            \n\t" // max(min(L4, L5), L6)
		"pminub %%mm0, %%mm2                            \n\t" // median(L4, L5, L6)
		"movq %%mm2, (%%ebx)                            \n\t" // -> L5

		"movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
		"movq (%0, %1, 8), %%mm0                        \n\t" // L8
		"movq %%mm2, %%mm3                              \n\t"
		"pmaxub %%mm0, %%mm2                            \n\t" // max(L7, L8)
		"pminub %%mm3, %%mm0                            \n\t" // min(L7, L8)
		"pmaxub %%mm1, %%mm0                            \n\t" // max(min(L7, L8), L6)
		"pminub %%mm0, %%mm2                            \n\t" // median(L6, L7, L8)
		"movq %%mm2, (%%ebx, %1, 2)                     \n\t" // -> L7


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);

#else // MMX without MMX2
	asm volatile(
		"leal (%0, %1), %%eax                           \n\t"
		"leal (%%eax, %1, 4), %%ebx                     \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7                              \n\t"

// MEDIAN(a,b,c) stores the bytewise median of a, b and c into b.
// psubusb + pcmpeqb against zero builds the masks (a<=c), (c<=b), (b<=a); the XOR of two of
// them is 0 exactly for an operand that lies between the other two, every other operand is
// forced to 0xFF by the por and so drops out of the final pand.
#define MEDIAN(a,b,c)\
		"movq " #a ", %%mm0                             \n\t"\
		"movq " #b ", %%mm2                             \n\t"\
		"movq " #c ", %%mm1                             \n\t"\
		"movq %%mm0, %%mm3                              \n\t"\
		"movq %%mm1, %%mm4                              \n\t"\
		"movq %%mm2, %%mm5                              \n\t"\
		"psubusb %%mm1, %%mm3                           \n\t"\
		"psubusb %%mm2, %%mm4                           \n\t"\
		"psubusb %%mm0, %%mm5                           \n\t"\
		"pcmpeqb %%mm7, %%mm3                           \n\t"\
		"pcmpeqb %%mm7, %%mm4                           \n\t"\
		"pcmpeqb %%mm7, %%mm5                           \n\t"\
		"movq %%mm3, %%mm6                              \n\t"\
		"pxor %%mm4, %%mm3                              \n\t"\
		"pxor %%mm5, %%mm4                              \n\t"\
		"pxor %%mm6, %%mm5                              \n\t"\
		"por %%mm3, %%mm1                               \n\t"\
		"por %%mm4, %%mm2                               \n\t"\
		"por %%mm5, %%mm0                               \n\t"\
		"pand %%mm2, %%mm0                              \n\t"\
		"pand %%mm1, %%mm0                              \n\t"\
		"movq %%mm0, " #b "                             \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#endif // MMX
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		int y;
		// only the odd lines are written, so each median sees unfiltered neighbours
		for(y=1; y<8; y+=2)
		{
			const int above= src[(y-1)*stride];
			const int cur  = src[ y   *stride];
			const int below= src[(y+1)*stride];
			const int lo= above < cur ? above : cur;
			const int hi= above < cur ? cur : above;
			// median(a,b,c) = min( max(a,b), max( min(a,b), c ) )
			const int mid= lo > below ? lo : below;

			src[y*stride]= hi < mid ? hi : mid;
		}
		src++;
	}
#endif
}
2423
2424 #ifdef HAVE_MMX
2425 /**
2426  * Transposes the given 8x8 block and stores it, shifted, into dst1 and dst2.
      *
      * The 8 source lines are read from src, srcStride bytes apart. After the transpose each
      * source column is one 8 byte line. Columns 0-4 are stored at dst1 + 128, 144, 160, 176
      * and 192, columns 3-7 at dst2 + 48, 64, 80, 96 and 112, i.e. the stores are 16 bytes
      * apart in both destinations and columns 3 and 4 are written to both.
      * The block is processed as two 4 line halves; the second half fills bytes 4-7 of every
      * destination line.
      *
      * @param dst1      first destination, written within byte offsets 128-199
      * @param dst2      second destination, written within byte offsets 48-119
      * @param src       top-left byte of the 8x8 source block
      * @param srcStride distance in bytes between two source lines
2427  */
2428 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2429 {
2430         asm(
2431                 "leal (%0, %1), %%eax                           \n\t" // eax = address of source line 1
2432                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of source line 5
2433 //      0       1       2       3       4       5       6       7       8       9
2434 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2435                 "movq (%0), %%mm0               \n\t" // 12345678 (line 0)
2436                 "movq (%%eax), %%mm1            \n\t" // abcdefgh (line 1)
2437                 "movq %%mm0, %%mm2              \n\t" // 12345678
2438                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2439                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2440
2441                 "movq (%%eax, %1), %%mm1        \n\t" // line 2
2442                 "movq (%%eax, %1, 2), %%mm3     \n\t" // line 3
2443                 "movq %%mm1, %%mm4              \n\t"
2444                 "punpcklbw %%mm3, %%mm1         \n\t" // lines 2/3 interleaved, columns 0-3
2445                 "punpckhbw %%mm3, %%mm4         \n\t" // lines 2/3 interleaved, columns 4-7
2446
2447                 "movq %%mm0, %%mm3              \n\t"
2448                 "punpcklwd %%mm1, %%mm0         \n\t" // columns 0 and 1 of lines 0-3
2449                 "punpckhwd %%mm1, %%mm3         \n\t" // columns 2 and 3 of lines 0-3
2450                 "movq %%mm2, %%mm1              \n\t"
2451                 "punpcklwd %%mm4, %%mm2         \n\t" // columns 4 and 5 of lines 0-3
2452                 "punpckhwd %%mm4, %%mm1         \n\t" // columns 6 and 7 of lines 0-3
2453
2454                 "movd %%mm0, 128(%2)            \n\t" // column 0 -> dst1 + 128
2455                 "psrlq $32, %%mm0               \n\t"
2456                 "movd %%mm0, 144(%2)            \n\t" // column 1 -> dst1 + 144
2457                 "movd %%mm3, 160(%2)            \n\t" // column 2 -> dst1 + 160
2458                 "psrlq $32, %%mm3               \n\t"
2459                 "movd %%mm3, 176(%2)            \n\t" // column 3 -> dst1 + 176
2460                 "movd %%mm3, 48(%3)             \n\t" // column 3 -> dst2 + 48
2461                 "movd %%mm2, 192(%2)            \n\t" // column 4 -> dst1 + 192
2462                 "movd %%mm2, 64(%3)             \n\t" // column 4 -> dst2 + 64
2463                 "psrlq $32, %%mm2               \n\t"
2464                 "movd %%mm2, 80(%3)             \n\t" // column 5 -> dst2 + 80
2465                 "movd %%mm1, 96(%3)             \n\t" // column 6 -> dst2 + 96
2466                 "psrlq $32, %%mm1               \n\t"
2467                 "movd %%mm1, 112(%3)            \n\t" // column 7 -> dst2 + 112
2468
2469                 "movq (%0, %1, 4), %%mm0        \n\t" // 12345678 (line 4)
2470                 "movq (%%ebx), %%mm1            \n\t" // abcdefgh (line 5)
2471                 "movq %%mm0, %%mm2              \n\t" // 12345678
2472                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2473                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2474
2475                 "movq (%%ebx, %1), %%mm1        \n\t" // line 6
2476                 "movq (%%ebx, %1, 2), %%mm3     \n\t" // line 7
2477                 "movq %%mm1, %%mm4              \n\t"
2478                 "punpcklbw %%mm3, %%mm1         \n\t" // lines 6/7 interleaved, columns 0-3
2479                 "punpckhbw %%mm3, %%mm4         \n\t" // lines 6/7 interleaved, columns 4-7
2480
2481                 "movq %%mm0, %%mm3              \n\t"
2482                 "punpcklwd %%mm1, %%mm0         \n\t" // columns 0 and 1 of lines 4-7
2483                 "punpckhwd %%mm1, %%mm3         \n\t" // columns 2 and 3 of lines 4-7
2484                 "movq %%mm2, %%mm1              \n\t"
2485                 "punpcklwd %%mm4, %%mm2         \n\t" // columns 4 and 5 of lines 4-7
2486                 "punpckhwd %%mm4, %%mm1         \n\t" // columns 6 and 7 of lines 4-7
2487
2488                 "movd %%mm0, 132(%2)            \n\t" // same destinations as above, bytes 4-7
2489                 "psrlq $32, %%mm0               \n\t"
2490                 "movd %%mm0, 148(%2)            \n\t"
2491                 "movd %%mm3, 164(%2)            \n\t"
2492                 "psrlq $32, %%mm3               \n\t"
2493                 "movd %%mm3, 180(%2)            \n\t"
2494                 "movd %%mm3, 52(%3)             \n\t"
2495                 "movd %%mm2, 196(%2)            \n\t"
2496                 "movd %%mm2, 68(%3)             \n\t"
2497                 "psrlq $32, %%mm2               \n\t"
2498                 "movd %%mm2, 84(%3)             \n\t"
2499                 "movd %%mm1, 100(%3)            \n\t"
2500                 "psrlq $32, %%mm1               \n\t"
2501                 "movd %%mm1, 116(%3)            \n\t"
2502
2503
2504         :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) // %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2
2505         : "%eax", "%ebx" // NOTE(review): mm0-mm4 and the memory written through %2 / %3 are not declared here
2506         );
2507 }
2508
2509 /**
2510  * Transposes the given 8x8 block.
      *
      * The 8 source lines are read from src at byte offsets 0, 16, 32, ... 112, i.e. 16 bytes
      * apart. Source column n is written as line n of dst, with the dst lines dstStride bytes
      * apart. The block is processed as two halves: source lines 0-3 fill bytes 0-3 of every
      * dst line, source lines 4-7 fill bytes 4-7.
      *
      * @param dst       top-left byte of the 8x8 destination block
      * @param dstStride distance in bytes between two destination lines
      * @param src       source block, read within byte offsets 0-119
2511  */
2512 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2513 {
2514         asm(
2515                 "leal (%0, %1), %%eax                           \n\t" // eax = address of dst line 1
2516                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of dst line 5
2517 //      0       1       2       3       4       5       6       7       8       9
2518 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
2519                 "movq (%2), %%mm0               \n\t" // 12345678 (source line 0)
2520                 "movq 16(%2), %%mm1             \n\t" // abcdefgh (source line 1)
2521                 "movq %%mm0, %%mm2              \n\t" // 12345678
2522                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2523                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2524
2525                 "movq 32(%2), %%mm1             \n\t" // source line 2
2526                 "movq 48(%2), %%mm3             \n\t" // source line 3
2527                 "movq %%mm1, %%mm4              \n\t"
2528                 "punpcklbw %%mm3, %%mm1         \n\t" // lines 2/3 interleaved, columns 0-3
2529                 "punpckhbw %%mm3, %%mm4         \n\t" // lines 2/3 interleaved, columns 4-7
2530
2531                 "movq %%mm0, %%mm3              \n\t"
2532                 "punpcklwd %%mm1, %%mm0         \n\t" // columns 0 and 1 of source lines 0-3
2533                 "punpckhwd %%mm1, %%mm3         \n\t" // columns 2 and 3 of source lines 0-3
2534                 "movq %%mm2, %%mm1              \n\t"
2535                 "punpcklwd %%mm4, %%mm2         \n\t" // columns 4 and 5 of source lines 0-3
2536                 "punpckhwd %%mm4, %%mm1         \n\t" // columns 6 and 7 of source lines 0-3
2537
2538                 "movd %%mm0, (%0)               \n\t" // column 0 -> dst line 0
2539                 "psrlq $32, %%mm0               \n\t"
2540                 "movd %%mm0, (%%eax)            \n\t" // column 1 -> dst line 1
2541                 "movd %%mm3, (%%eax, %1)        \n\t" // column 2 -> dst line 2
2542                 "psrlq $32, %%mm3               \n\t"
2543                 "movd %%mm3, (%%eax, %1, 2)     \n\t" // column 3 -> dst line 3
2544                 "movd %%mm2, (%0, %1, 4)        \n\t" // column 4 -> dst line 4
2545                 "psrlq $32, %%mm2               \n\t"
2546                 "movd %%mm2, (%%ebx)            \n\t" // column 5 -> dst line 5
2547                 "movd %%mm1, (%%ebx, %1)        \n\t" // column 6 -> dst line 6
2548                 "psrlq $32, %%mm1               \n\t"
2549                 "movd %%mm1, (%%ebx, %1, 2)     \n\t" // column 7 -> dst line 7
2550
2551
2552                 "movq 64(%2), %%mm0             \n\t" // 12345678 (source line 4)
2553                 "movq 80(%2), %%mm1             \n\t" // abcdefgh (source line 5)
2554                 "movq %%mm0, %%mm2              \n\t" // 12345678
2555                 "punpcklbw %%mm1, %%mm0         \n\t" // 1a2b3c4d
2556                 "punpckhbw %%mm1, %%mm2         \n\t" // 5e6f7g8h
2557
2558                 "movq 96(%2), %%mm1             \n\t" // source line 6
2559                 "movq 112(%2), %%mm3            \n\t" // source line 7
2560                 "movq %%mm1, %%mm4              \n\t"
2561                 "punpcklbw %%mm3, %%mm1         \n\t" // lines 6/7 interleaved, columns 0-3
2562                 "punpckhbw %%mm3, %%mm4         \n\t" // lines 6/7 interleaved, columns 4-7
2563
2564                 "movq %%mm0, %%mm3              \n\t"
2565                 "punpcklwd %%mm1, %%mm0         \n\t" // columns 0 and 1 of source lines 4-7
2566                 "punpckhwd %%mm1, %%mm3         \n\t" // columns 2 and 3 of source lines 4-7
2567                 "movq %%mm2, %%mm1              \n\t"
2568                 "punpcklwd %%mm4, %%mm2         \n\t" // columns 4 and 5 of source lines 4-7
2569                 "punpckhwd %%mm4, %%mm1         \n\t" // columns 6 and 7 of source lines 4-7
2570
2571                 "movd %%mm0, 4(%0)              \n\t" // same dst lines as above, bytes 4-7
2572                 "psrlq $32, %%mm0               \n\t"
2573                 "movd %%mm0, 4(%%eax)           \n\t"
2574                 "movd %%mm3, 4(%%eax, %1)       \n\t"
2575                 "psrlq $32, %%mm3               \n\t"
2576                 "movd %%mm3, 4(%%eax, %1, 2)    \n\t"
2577                 "movd %%mm2, 4(%0, %1, 4)       \n\t"
2578                 "psrlq $32, %%mm2               \n\t"
2579                 "movd %%mm2, 4(%%ebx)           \n\t"
2580                 "movd %%mm1, 4(%%ebx, %1)       \n\t"
2581                 "psrlq $32, %%mm1               \n\t"
2582                 "movd %%mm1, 4(%%ebx, %1, 2)    \n\t"
2583
2584         :: "r" (dst), "r" (dstStride), "r" (src) // %0 = dst, %1 = dstStride, %2 = src
2585         : "%eax", "%ebx" // NOTE(review): mm0-mm4 and the memory written through %0 are not declared here
2586         );
2587 }
2588 #endif
2589
     // Optional fallback to the OpenDivX postprocessing code.
2590 #ifdef HAVE_ODIVX_POSTPROCESS
2591 #include "../opendivx/postprocess.h"
2592 int use_old_pp=0; // if non 0, postprocess() / postprocess2() hand the frame to odivx_postprocess() instead
2593 #endif
2594
     // Forward declaration of the per plane worker. postprocess() and postprocess2() call it
     // once per plane with isColor = 0 for the first plane and 1 / 2 for the second / third.
2595 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2596         QP_STORE_T QPs[], int QPStride, int isColor, int mode);
2597
2598 /* -pp Command line Help
2599 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2600
2601 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2602
2603 long form example:
2604 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint         -pp default,-vdeblock
2605 short form example:
2606 -pp vb:a,hb:a,lb                                        -pp de,-vb
2607
2608 Filters                 Options
2609 short   long name       short   long option     Description
2610 *       *               a       autoq           cpu power dependent enabler
2611                         c       chrom           chrominance filtering enabled
2612                         y       nochrom         chrominance filtering disabled
2613 hb      hdeblock                                horizontal deblocking filter
2614 vb      vdeblock                                vertical deblocking filter
2615 vr      rkvdeblock
2616 h1      x1hdeblock                              Experimental horizontal deblock filter 1
2617 v1      x1vdeblock                              Experimental vertical deblock filter 1
2618 dr      dering                                  not implemented yet
2619 al      autolevels                              automatic brightness / contrast fixer
2620                         f       fullyrange      stretch luminance range to (0..255)
2621 lb      linblenddeint                           linear blend deinterlacer
2622 li      linipoldeint                            linear interpolating deinterlacer
2623 ci      cubicipoldeint                          cubic interpolating deinterlacer
2624 md      mediandeint                             median deinterlacer
2625 de      default                                 hdeblock:a,vdeblock:a,dering:a,autolevels
2626 fa      fast                                    x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2627 */
2628
2629 /**
2630  * returns a PPMode struct which will have a non 0 error variable if an error occured
2631  * name is the string after "-pp" on the command line
2632  * quality is a number from 0 to GET_PP_QUALITY_MAX
2633  */
2634 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2635 {
2636         char temp[GET_MODE_BUFFER_SIZE];
2637         char *p= temp;
2638         char *filterDelimiters= ",";
2639         char *optionDelimiters= ":";
2640         struct PPMode ppMode= {0,0,0,0,0,0};
2641         char *filterToken;
2642
2643         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2644
2645         for(;;){
2646                 char *filterName;
2647                 int q= GET_PP_QUALITY_MAX;
2648                 int chrom=-1;
2649                 char *option;
2650                 char *options[OPTIONS_ARRAY_SIZE];
2651                 int i;
2652                 int filterNameOk=0;
2653                 int numOfUnknownOptions=0;
2654                 int enable=1; //does the user want us to enabled or disabled the filter
2655
2656                 filterToken= strtok(p, filterDelimiters);
2657                 if(filterToken == NULL) break;
2658                 p+= strlen(filterToken) + 1;
2659                 filterName= strtok(filterToken, optionDelimiters);
2660                 printf("%s::%s\n", filterToken, filterName);
2661
2662                 if(*filterName == '-')
2663                 {
2664                         enable=0;
2665                         filterName++;
2666                 }
2667                 for(;;){ //for all options
2668                         option= strtok(NULL, optionDelimiters);
2669                         if(option == NULL) break;
2670
2671                         printf("%s\n", option);
2672                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2673                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2674                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2675                         else
2676                         {
2677                                 options[numOfUnknownOptions] = option;
2678                                 numOfUnknownOptions++;
2679                                 options[numOfUnknownOptions] = NULL;
2680                         }
2681                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2682                 }
2683
2684                 /* replace stuff from the replace Table */
2685                 for(i=0; replaceTable[2*i]!=NULL; i++)
2686                 {
2687                         if(!strcmp(replaceTable[2*i], filterName))
2688                         {
2689                                 int newlen= strlen(replaceTable[2*i + 1]);
2690                                 int plen;
2691                                 int spaceLeft;
2692
2693                                 if(p==NULL) p= temp, *p=0;      //last filter
2694                                 else p--, *p=',';               //not last filter
2695
2696                                 plen= strlen(p);
2697                                 spaceLeft= (int)p - (int)temp + plen;
2698                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
2699                                 {
2700                                         ppMode.error++;
2701                                         break;
2702                                 }
2703                                 memmove(p + newlen, p, plen+1);
2704                                 memcpy(p, replaceTable[2*i + 1], newlen);
2705                                 filterNameOk=1;
2706                         }
2707                 }
2708
2709                 for(i=0; filters[i].shortName!=NULL; i++)
2710                 {
2711                         if(   !strcmp(filters[i].longName, filterName)
2712                            || !strcmp(filters[i].shortName, filterName))
2713                         {
2714                                 ppMode.lumMode &= ~filters[i].mask;
2715                                 ppMode.chromMode &= ~filters[i].mask;
2716
2717                                 filterNameOk=1;
2718                                 if(!enable) break; // user wants to disable it
2719
2720                                 if(q >= filters[i].minLumQuality)
2721                                         ppMode.lumMode|= filters[i].mask;
2722                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2723                                         if(q >= filters[i].minChromQuality)
2724                                                 ppMode.chromMode|= filters[i].mask;
2725
2726                                 if(filters[i].mask == LEVEL_FIX)
2727                                 {
2728                                         int o;
2729                                         ppMode.minAllowedY= 16;
2730                                         ppMode.maxAllowedY= 234;
2731                                         for(o=0; options[o]!=NULL; o++)
2732                                                 if(  !strcmp(options[o],"fullyrange")
2733                                                    ||!strcmp(options[o],"f"))
2734                                                 {
2735                                                         ppMode.minAllowedY= 0;
2736                                                         ppMode.maxAllowedY= 255;
2737                                                         numOfUnknownOptions--;
2738                                                 }
2739                                 }
2740                         }
2741                 }
2742                 if(!filterNameOk) ppMode.error++;
2743                 ppMode.error += numOfUnknownOptions;
2744         }
2745
2746         if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2747         if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2748         if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2749         if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2750         if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2751         if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2752
2753         return ppMode;
2754 }
2755
2756 /**
2757  * ...
2758  */
2759 void  postprocess(unsigned char * src[], int src_stride,
2760                  unsigned char * dst[], int dst_stride,
2761                  int horizontal_size,   int vertical_size,
2762                  QP_STORE_T *QP_store,  int QP_stride,
2763                                           int mode)
2764 {
2765 /*
2766         static int qual=0;
2767
2768         struct PPMode ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock", qual);
2769         qual++;
2770         qual%=7;
2771         printf("\n%d %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error);
2772         postprocess2(src, src_stride, dst, dst_stride,
2773                  horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
2774
2775         return;
2776 */
2777
2778 #ifdef HAVE_ODIVX_POSTPROCESS
2779 // Note: I could make this shit outside of this file, but it would mean one
2780 // more function call...
2781         if(use_old_pp){
2782             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2783             return;
2784         }
2785 #endif
2786
2787         postProcess(src[0], src_stride, dst[0], dst_stride,
2788                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
2789
2790         horizontal_size >>= 1;
2791         vertical_size   >>= 1;
2792         src_stride      >>= 1;
2793         dst_stride      >>= 1;
2794         mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2795 //      mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER |
2796 //               MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER);
2797
2798         if(1)
2799         {
2800                 postProcess(src[1], src_stride, dst[1], dst_stride,
2801                         horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2802                 postProcess(src[2], src_stride, dst[2], dst_stride,
2803                         horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
2804         }
2805         else
2806         {
2807                 memcpy(dst[1], src[1], src_stride*horizontal_size);
2808                 memcpy(dst[2], src[2], src_stride*horizontal_size);
2809         }
2810 }
2811
2812 void  postprocess2(unsigned char * src[], int src_stride,
2813                  unsigned char * dst[], int dst_stride,
2814                  int horizontal_size,   int vertical_size,
2815                  QP_STORE_T *QP_store,  int QP_stride,
2816                  struct PPMode *mode)
2817 {
2818
2819 #ifdef HAVE_ODIVX_POSTPROCESS
2820 // Note: I could make this shit outside of this file, but it would mean one
2821 // more function call...
2822         if(use_old_pp){
2823             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
2824             mode->oldMode);
2825             return;
2826         }
2827 #endif
2828
2829         postProcess(src[0], src_stride, dst[0], dst_stride,
2830                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);
2831
2832         horizontal_size >>= 1;
2833         vertical_size   >>= 1;
2834         src_stride      >>= 1;
2835         dst_stride      >>= 1;
2836
2837         postProcess(src[1], src_stride, dst[1], dst_stride,
2838                 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode->chromMode);
2839         postProcess(src[2], src_stride, dst[2], dst_stride,
2840                 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode->chromMode);
2841 }
2842
2843
2844 /**
2845  * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
2846  * 0 <= quality <= 6
2847  */
2848 int getPpModeForQuality(int quality){
2849         int modes[1+GET_PP_QUALITY_MAX]= {
2850                 0,
2851 #if 1
2852                 // horizontal filters first
2853                 LUM_H_DEBLOCK,
2854                 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2855                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2856                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2857                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2858                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2859 #else
2860                 // vertical filters first
2861                 LUM_V_DEBLOCK,
2862                 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2863                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2864                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2865                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2866                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2867 #endif
2868         };
2869
2870 #ifdef HAVE_ODIVX_POSTPROCESS
2871         int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2872                 0,
2873                 PP_DEBLOCK_Y_H,
2874                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2875                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2876                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2877                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2878                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2879         };
2880         if(use_old_pp) return odivx_modes[quality];
2881 #endif
2882         return modes[quality];
2883 }
2884
2885 /**
2886  * Copies a block of 8-byte-wide rows from src to dst and optionally fixes the black level
2887  * numLines must be a multiple of 4
2888  * levelFix == 0 -> don't touch the brightness & contrast
      *
      * dst / dstStride : destination pointer and distance in bytes between two destination rows
      * src / srcStride : source pointer and distance in bytes between two source rows
      * numLines        : number of rows to copy (see the notes below, not every path reads it)
      * levelFix        : non-zero selects the level-correcting copy, zero the plain copy
      *
      * HAVE_MMX, levelFix != 0:
      *   every source byte is widened to 16 bit, packedYOffset is subtracted, the result is
      *   shifted left by 6 and multiplied (high word, pmulhw) with packedYScale, then packed
      *   back to bytes with unsigned saturation.
      *   NOTE(review): this path is unrolled to exactly 8 rows (four 2-row SCALED_CPY
      *   expansions) and never reads numLines.
      * HAVE_MMX, levelFix == 0:
      *   plain 8-byte row copy, 4 rows per loop iteration, numLines>>2 iterations; the loop
      *   counter lives in the symbol temp0, which is defined outside this block.
      * no HAVE_MMX:
      *   both branches memcpy numLines rows of BLOCK_SIZE bytes.
      *   NOTE(review): no level correction is applied here even if levelFix != 0.
2889  */
2890 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2891         int numLines, int levelFix)
2892 {
2893 #ifndef HAVE_MMX
2894         int i;
2895 #endif
2896         if(levelFix)
2897         {
2898 #ifdef HAVE_MMX
                                             /* eax = 2*srcStride, ebx = 2*dstStride, mm2 = offset, mm3 = scale,
                                                mm4 = 0 (used to widen bytes to words).
                                                src and dst are in/out operands, the asm advances them. */
2899                                         asm volatile(
2900                                                 "leal (%2,%2), %%eax    \n\t"
2901                                                 "leal (%3,%3), %%ebx    \n\t"
2902                                                 "movq packedYOffset, %%mm2      \n\t"
2903                                                 "movq packedYScale, %%mm3       \n\t"
2904                                                 "pxor %%mm4, %%mm4      \n\t"
2905
2906 #define SCALED_CPY                                      \
2907                                                 "movq (%0), %%mm0       \n\t"\
2908                                                 "movq (%0), %%mm5       \n\t"\
2909                                                 "punpcklbw %%mm4, %%mm0 \n\t"\
2910                                                 "punpckhbw %%mm4, %%mm5 \n\t"\
2911                                                 "psubw %%mm2, %%mm0     \n\t"\
2912                                                 "psubw %%mm2, %%mm5     \n\t"\
2913                                                 "movq (%0,%2), %%mm1    \n\t"\
2914                                                 "psllw $6, %%mm0        \n\t"\
2915                                                 "psllw $6, %%mm5        \n\t"\
2916                                                 "pmulhw %%mm3, %%mm0    \n\t"\
2917                                                 "movq (%0,%2), %%mm6    \n\t"\
2918                                                 "pmulhw %%mm3, %%mm5    \n\t"\
2919                                                 "punpcklbw %%mm4, %%mm1 \n\t"\
2920                                                 "punpckhbw %%mm4, %%mm6 \n\t"\
2921                                                 "psubw %%mm2, %%mm1     \n\t"\
2922                                                 "psubw %%mm2, %%mm6     \n\t"\
2923                                                 "psllw $6, %%mm1        \n\t"\
2924                                                 "psllw $6, %%mm6        \n\t"\
2925                                                 "pmulhw %%mm3, %%mm1    \n\t"\
2926                                                 "pmulhw %%mm3, %%mm6    \n\t"\
2927                                                 "addl %%eax, %0         \n\t"\
2928                                                 "packuswb %%mm5, %%mm0  \n\t"\
2929                                                 "packuswb %%mm6, %%mm1  \n\t"\
2930                                                 "movq %%mm0, (%1)       \n\t"\
2931                                                 "movq %%mm1, (%1, %3)   \n\t"\
2932
2933 SCALED_CPY
2934                                                 "addl %%ebx, %1         \n\t"
2935 SCALED_CPY
2936                                                 "addl %%ebx, %1         \n\t"
2937 SCALED_CPY
2938                                                 "addl %%ebx, %1         \n\t"
2939 SCALED_CPY
2940
2941                                                 : "+r"(src),
2942                                                 "+r"(dst)
2943                                                 :"r" (srcStride),
2944                                                 "r" (dstStride)
2945                                                 : "%eax", "%ebx"
2946                                         );
2947 #else
                                     /* portable fallback: plain row copy, no level correction */
2948                                 for(i=0; i<numLines; i++)
2949                                         memcpy( &(dst[dstStride*i]),
2950                                                 &(src[srcStride*i]), BLOCK_SIZE);
2951 #endif
2952         }
2953         else
2954         {
2955 #ifdef HAVE_MMX
                                             /* Plain copy: temp0 = numLines>>2 is the loop counter, the src/dst
                                                registers are saved with pushl and restored with popl around the loop.
                                                NOTE(review): mm2/mm3 are loaded with packedYOffset/packedYScale
                                                but SIMPLE_CPY only uses mm0/mm1. */
2956                                         asm volatile(
2957                                                 "movl %4, %%eax \n\t"
2958                                                 "movl %%eax, temp0\n\t"
2959                                                 "pushl %0 \n\t"
2960                                                 "pushl %1 \n\t"
2961                                                 "leal (%2,%2), %%eax    \n\t"
2962                                                 "leal (%3,%3), %%ebx    \n\t"
2963                                                 "movq packedYOffset, %%mm2      \n\t"
2964                                                 "movq packedYScale, %%mm3       \n\t"
2965
2966 #define SIMPLE_CPY                                      \
2967                                                 "movq (%0), %%mm0       \n\t"\
2968                                                 "movq (%0,%2), %%mm1    \n\t"\
2969                                                 "movq %%mm0, (%1)       \n\t"\
2970                                                 "movq %%mm1, (%1, %3)   \n\t"\
2971
2972                                                 "1:                     \n\t"
2973 SIMPLE_CPY
2974                                                 "addl %%eax, %0         \n\t"
2975                                                 "addl %%ebx, %1         \n\t"
2976 SIMPLE_CPY
2977                                                 "addl %%eax, %0         \n\t"
2978                                                 "addl %%ebx, %1         \n\t"
2979                                                 "decl temp0             \n\t"
2980                                                 "jnz 1b                 \n\t"
2981
2982                                                 "popl %1 \n\t"
2983                                                 "popl %0 \n\t"
2984                                                 : : "r" (src),
2985                                                 "r" (dst),
2986                                                 "r" (srcStride),
2987                                                 "r" (dstStride),
2988                                                 "m" (numLines>>2)
2989                                                 : "%eax", "%ebx"
2990                                         );
2991 #else
                                     /* portable fallback: copy numLines rows of BLOCK_SIZE bytes */
2992                                 for(i=0; i<numLines; i++)
2993                                         memcpy( &(dst[dstStride*i]),
2994                                                 &(src[srcStride*i]), BLOCK_SIZE);
2995 #endif
2996         }
2997 }
2998
2999
3000 /**
3001  * Filters array of bytes (Y or U or V values)
      *
      * The plane is walked in 8x8 blocks. For every block the source rows 8..15 below the
      * current position are copied into dst (blockCopy, optionally level-corrected), an
      * optional deinterlace filter is run, then the vertical and horizontal deblocking
      * filters and the dering filter selected by the bits in mode are applied in place on dst.
      *
      * src / srcStride : input plane and distance in bytes between two input rows
      * dst / dstStride : output plane and distance in bytes between two output rows
      * width / height  : size of the plane
      * QPs / QPStride  : quantizer table; indexed with (y>>3)*QPStride + (x>>3) when isColor
      *                   is set and with (y>>4)*QPStride + (x>>4) otherwise
      * isColor         : zero for the plane that feeds the brightness histogram and gets the
      *                   level fix, non-zero for the other planes
      * mode            : bit mask of filter flags (LEVEL_FIX, FULL_Y_RANGE, V_DEBLOCK, ...)
      *
      * The function keeps static state (scratch buffers, histogram, frame counter) and writes
      * the file-level variables packedYOffset, packedYScale, maxAllowedY and minAllowedY.
3002  */
3003 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3004         QP_STORE_T QPs[], int QPStride, int isColor, int mode)
3005 {
3006         int x,y;
3007         /* we need 64 bit here, otherwise we will have a problem
3008            after watching a black picture for 5 hours*/
3009         static uint64_t *yHistogram= NULL;
3010         int black=0, white=255; // blackest black and whitest white in the picture
3011         int QPCorrecture= 256;
3012
3013         /* Temporary buffers for handling the last row(s) */
3014         static uint8_t *tempDst= NULL;
3015         static uint8_t *tempSrc= NULL;
3016
3017         /* Temporary buffers for handling the last block */
3018         static uint8_t *tempDstBlock= NULL;
3019         static uint8_t *tempSrcBlock= NULL;
3020
3021 #ifdef PP_FUNNY_STRIDE
3022         uint8_t *dstBlockPtrBackup;
3023         uint8_t *srcBlockPtrBackup;
3024 #endif
3025
             /* NOTE(review): the MORE_TIMING code below adds to memcpyTime/vertTime/horizTime,
                which are only declared when TIMING is defined as well */
3026 #ifdef MORE_TIMING
3027         long long T0, T1, diffTime=0;
3028 #endif
3029 #ifdef TIMING
3030         long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
3031         sumTime= rdtsc();
3032 #endif
3033 //mode= 0x7F;
3034
             /* One-time allocation of the four 24 KiB scratch buffers.
                NOTE(review): the memalign results are not checked, nothing in this function
                frees the buffers, and the size does not depend on the strides although the
                code below addresses up to 17 rows of dstStride bytes in tempDst - confirm
                the maximum supported stride. */
3035         if(tempDst==NULL)
3036         {
3037                 tempDst= (uint8_t*)memalign(8, 1024*24);
3038                 tempSrc= (uint8_t*)memalign(8, 1024*24);
3039                 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
3040                 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
3041         }
3042
             /* One-time creation of the 256 bin histogram, every bin starts with the same value.
                FULL_Y_RANGE is only evaluated here, i.e. on the first call.
                NOTE(review): the malloc result is not checked. */
3043         if(!yHistogram)
3044         {
3045                 int i;
3046                 yHistogram= (uint64_t*)malloc(8*256);
3047                 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
3048
3049                 if(mode & FULL_Y_RANGE)
3050                 {
3051                         maxAllowedY=255;
3052                         minAllowedY=0;
3053                 }
3054         }
3055
             /* !isColor: derive the black and white point from the histogram collected so far
                and build the packed offset/scale constants that blockCopy reads */
3056         if(!isColor)
3057         {
3058                 uint64_t sum= 0;
3059                 int i;
3060                 static int framenum= -1;
3061                 uint64_t maxClipped;
3062                 uint64_t clipped;
3063                 double scale;
3064
3065                 framenum++;
                     /* on the second call bin 0 is reset to its initial value */
3066                 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
3067
3068                 for(i=0; i<256; i++)
3069                 {
3070                         sum+= yHistogram[i];
3071 //                      printf("%d ", yHistogram[i]);
3072                 }
3073 //              printf("\n\n");
3074
3075                 /* we always get a completely black picture first */
3076                 maxClipped= (uint64_t)(sum * maxClippedThreshold);
3077
                     /* at each test clipped is the sum of bins 0..black; walk down from 255
                        until that sum drops below maxClipped */
3078                 clipped= sum;
3079                 for(black=255; black>0; black--)
3080                 {
3081                         if(clipped < maxClipped) break;
3082                         clipped-= yHistogram[black];
3083                 }
3084
                     /* at each test clipped is the sum of bins white..255; walk up from 0
                        until that sum drops below maxClipped */
3085                 clipped= sum;
3086                 for(white=0; white<256; white++)
3087                 {
3088                         if(clipped < maxClipped) break;
3089                         clipped-= yHistogram[white];
3090                 }
3091
                     /* replicate the 16 bit offset into all four 16 bit lanes */
3092                 packedYOffset= (black - minAllowedY) & 0xFFFF;
3093                 packedYOffset|= packedYOffset<<32;
3094                 packedYOffset|= packedYOffset<<16;
3095
                     /* NOTE(review): white == black makes this a division by zero - confirm it cannot happen */
3096                 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
3097
                     /* scale in units of 1/1024, rounded, replicated into all four 16 bit lanes */
3098                 packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3099                 packedYScale|= packedYScale<<32;
3100                 packedYScale|= packedYScale<<16;
3101         }
3102         else
3103         {
                     /* isColor: fixed scale of 0x0100 per lane and no offset */
3104                 packedYScale= 0x0100010001000100LL;
3105                 packedYOffset= 0;
3106         }
3107
             /* QPCorrecture is applied below as (QP*QPCorrecture)>>8.
                NOTE(review): for !isColor packedYScale is in units of 1/1024 (see above), so a
                scale of 1.0 gives QPCorrecture 1024 and QP*4 - confirm this is intended. */
3108         if(mode & LEVEL_FIX)    QPCorrecture= packedYScale &0xFFFF;
3109         else                    QPCorrecture= 256;
3110
3111         /* line before the first one */
             /* Pre-pass with y = -BLOCK_SIZE: srcBlock points 8 rows above src so that
                srcBlock + srcStride*8 is row 0; rows 0..7 are copied (and deinterlaced) into
                tempDst and from there into the first 8 rows of dst. No deblocking is done here. */
3112         y=-BLOCK_SIZE;
3113         {
3114                 //1% speedup if these are here instead of the inner loop
3115                 uint8_t *srcBlock= &(src[y*srcStride]);
3116                 uint8_t *dstBlock= &(dst[y*dstStride]);
3117
                     /* the dst pointer computed above is immediately replaced by the scratch buffer */
3118                 dstBlock= tempDst + dstStride;
3119
3120                 // From this point on it is guaranteed that we can read and write 16 lines downward
3121                 // finish 1 block before the next otherwise we might have a problem
3122                 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3123                 for(x=0; x<width; x+=BLOCK_SIZE)
3124                 {
3125
3126 #ifdef HAVE_MMX2
3127 /*
3128                         prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3129                         prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3130                         prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3131                         prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3132 */
3133 /*
3134                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3135                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3136                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3137                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3138 */
3139
                             /* prefetch rows ((x>>2)&6)+8 and the following row of src and dst, 32 bytes ahead */
3140                         asm(
3141                                 "movl %4, %%eax                 \n\t"
3142                                 "shrl $2, %%eax                 \n\t"
3143                                 "andl $6, %%eax                 \n\t"
3144                                 "addl $8, %%eax                 \n\t"
3145                                 "movl %%eax, %%ebx              \n\t"
3146                                 "imul %1, %%eax                 \n\t"
3147                                 "imul %3, %%ebx                 \n\t"
3148                                 "prefetchnta 32(%%eax, %0)      \n\t"
3149                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3150                                 "addl %1, %%eax                 \n\t"
3151                                 "addl %3, %%ebx                 \n\t"
3152                                 "prefetchnta 32(%%eax, %0)      \n\t"
3153                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3154                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3155                         "m" (x)
3156                         : "%eax", "%ebx"
3157                         );
3158
3159 #elif defined(HAVE_3DNOW)
3160 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
3161 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3162                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3163                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3164                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3165 */
3166 #endif
3167
3168                         blockCopy(dstBlock + dstStride*8, dstStride,
3169                                 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3170
                             /* at most one deinterlace filter runs, the first matching flag wins */
3171                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3172                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3173                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3174                                 deInterlaceBlendLinear(dstBlock, dstStride);
3175                         else if(mode & MEDIAN_DEINT_FILTER)
3176                                 deInterlaceMedian(dstBlock, dstStride);
3177                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3178                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3179 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3180                                 deInterlaceBlendCubic(dstBlock, dstStride);
3181 */
3182                         dstBlock+=8;
3183                         srcBlock+=8;
3184                 }
                     /* rows 9..16 of tempDst hold picture rows 0..7, copy them to the top of dst */
3185                 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride );
3186         }
3187
             /* main pass, one row of 8x8 blocks per iteration */
3188         for(y=0; y<height; y+=BLOCK_SIZE)
3189         {
3190                 //1% speedup if these are here instead of the inner loop
3191                 uint8_t *srcBlock= &(src[y*srcStride]);
3192                 uint8_t *dstBlock= &(dst[y*dstStride]);
3193 #ifdef ARCH_X86
                     /* QPptr is stepped by the asm in the inner loop: QPDelta is added to QPFrac
                        and the pointer moves on by 4 bytes whenever that addition carries.
                        NOTE(review): assumes QP_STORE_T is a 4 byte type - TODO confirm.
                        NOTE(review): with these deltas the carry happens every 8 (isColor) or 16
                        inner iterations, while the portable branch changes its index every 1 or 2
                        iterations (x>>3 / x>>4 with x advancing by 8) - confirm which is intended. */
3194                 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3195                 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3196                 int QPFrac= QPDelta;
3197                 uint8_t *tempBlock1= tempBlocks;
3198                 uint8_t *tempBlock2= tempBlocks + 8;
3199 #endif
3200                 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3201                    if not then use a temporary buffer */
3202                 if(y+15 >= height)
3203                 {
3204                         /* copy from line 8 to 15 of src, these will be copied with
3205                            blockcopy to dst later */
3206                         memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8,
3207                                 srcStride*MAX(height-y-8, 0) );
3208
3209                         /* duplicate last line to fill the void up to line 15 */
                             /* NOTE(review): this condition repeats the enclosing one and is always true here */
3210                         if(y+15 >= height)
3211                         {
3212                                 int i;
3213                                 for(i=height-y; i<=15; i++)
3214                                         memcpy(tempSrc + srcStride*i,
3215                                                 src + srcStride*(height-1), srcStride);
3216                         }
3217
3218                         /* copy up to 9 lines of dst */
3219                         memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) );
                             /* from here on the inner loop works on the scratch buffers.
                                NOTE(review): rows of tempSrc below min(height-y, 8) are not written
                                in this block, yet the histogram sample below reads row 4 of
                                srcBlock - confirm that stale data there is acceptable. */
3220                         dstBlock= tempDst + dstStride;
3221                         srcBlock= tempSrc;
3222                 }
3223
3224                 // From this point on it is guaranteed that we can read and write 16 lines downward
3225                 // finish 1 block before the next otherwise we might have a problem
3226                 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3227                 for(x=0; x<width; x+=BLOCK_SIZE)
3228                 {
3229                         const int stride= dstStride;
3230                         uint8_t *tmpXchg;
3231 #ifdef ARCH_X86
                             /* read the current QP, then step QPptr: carry of QPFrac+=QPDelta -> eax = -1
                                -> eax = -4 -> QPptr -= eax, i.e. QPptr advances by 4 bytes on carry */
3232                         int QP= *QPptr;
3233                         asm volatile(
3234                                 "addl %2, %1            \n\t"
3235                                 "sbbl %%eax, %%eax      \n\t"
3236                                 "shll $2, %%eax         \n\t"
3237                                 "subl %%eax, %0         \n\t"
3238                                 : "+r" (QPptr), "+m" (QPFrac)
3239                                 : "r" (QPDelta)
3240                                 : "%eax"
3241                         );
3242 #else
3243                         int QP= isColor ?
3244                                 QPs[(y>>3)*QPStride + (x>>3)]:
3245                                 QPs[(y>>4)*QPStride + (x>>4)];
3246 #endif
3247                         if(!isColor)
3248                         {
                                     /* rescale QP and count one sample (row 4, column 4 of the block) in the histogram */
3249                                 QP= (QP* QPCorrecture)>>8;
3250                                 yHistogram[ srcBlock[srcStride*4 + 4] ]++;
3251                         }
3252 #ifdef HAVE_MMX
                             /* store QP, saturated to a byte and replicated 8 times, in pQPb */
3253                         asm volatile(
3254                                 "movd %0, %%mm7                                 \n\t"
3255                                 "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3256                                 "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3257                                 "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
3258                                 "movq %%mm7, pQPb                               \n\t"
3259                                 : : "r" (QP)
3260                         );
3261 #endif
3262
3263 #ifdef MORE_TIMING
3264                         T0= rdtsc();
3265 #endif
3266
3267 #ifdef HAVE_MMX2
3268 /*
3269                         prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3270                         prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3271                         prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3272                         prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3273 */
3274 /*
3275                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3276                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3277                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3278                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3279 */
3280
                             /* prefetch rows ((x>>2)&6)+8 and the following row of src and dst, 32 bytes ahead */
3281                         asm(
3282                                 "movl %4, %%eax                 \n\t"
3283                                 "shrl $2, %%eax                 \n\t"
3284                                 "andl $6, %%eax                 \n\t"
3285                                 "addl $8, %%eax                 \n\t"
3286                                 "movl %%eax, %%ebx              \n\t"
3287                                 "imul %1, %%eax                 \n\t"
3288                                 "imul %3, %%ebx                 \n\t"
3289                                 "prefetchnta 32(%%eax, %0)      \n\t"
3290                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3291                                 "addl %1, %%eax                 \n\t"
3292                                 "addl %3, %%ebx                 \n\t"
3293                                 "prefetchnta 32(%%eax, %0)      \n\t"
3294                                 "prefetcht0 32(%%ebx, %2)       \n\t"
3295                         :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3296                         "m" (x)
3297                         : "%eax", "%ebx"
3298                         );
3299
3300 #elif defined(HAVE_3DNOW)
3301 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
3302 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3303                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3304                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3305                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3306 */
3307 #endif
3308
3309 #ifdef PP_FUNNY_STRIDE
3310                         //can we mess with a 8x16 block, if not use a temp buffer, yes again
3311                         if(x+7 >= width)
3312                         {
3313                                 int i;
3314                                 dstBlockPtrBackup= dstBlock;
3315                                 srcBlockPtrBackup= srcBlock;
3316
                                     /* copy the width-x valid columns of 16 rows into the block scratch buffers */
3317                                 for(i=0;i<BLOCK_SIZE*2; i++)
3318                                 {
3319                                         memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
3320                                         memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
3321                                 }
3322
3323                                 dstBlock= tempDstBlock;
3324                                 srcBlock= tempSrcBlock;
3325                         }
3326 #endif
3327
                             /* bring rows 8..15 below the current position from src into dst */
3328                         blockCopy(dstBlock + dstStride*8, dstStride,
3329                                 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX);
3330
                             /* at most one deinterlace filter runs, the first matching flag wins */
3331                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3332                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
3333                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3334                                 deInterlaceBlendLinear(dstBlock, dstStride);
3335                         else if(mode & MEDIAN_DEINT_FILTER)
3336                                 deInterlaceMedian(dstBlock, dstStride);
3337                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3338                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
3339 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
3340                                 deInterlaceBlendCubic(dstBlock, dstStride);
3341 */
3342
3343                         /* only deblock if we have 2 blocks */
3344                         if(y + 8 < height)
3345                         {
3346 #ifdef MORE_TIMING
3347                                 T1= rdtsc();
3348                                 memcpyTime+= T1-T0;
3349                                 T0=T1;
3350 #endif
                                     /* vertical filter, the first matching flag wins */
3351                                 if(mode & V_RK1_FILTER)
3352                                         vertRK1Filter(dstBlock, stride, QP);
3353                                 else if(mode & V_X1_FILTER)
3354                                         vertX1Filter(dstBlock, stride, QP);
3355                                 else if(mode & V_DEBLOCK)
3356                                 {
3357                                         if( isVertDC(dstBlock, stride))
3358                                         {
3359                                                 if(isVertMinMaxOk(dstBlock, stride, QP))
3360                                                         doVertLowPass(dstBlock, stride, QP);
3361                                         }
3362                                         else
3363                                                 doVertDefFilter(dstBlock, stride, QP);
3364                                 }
3365 #ifdef MORE_TIMING
3366                                 T1= rdtsc();
3367                                 vertTime+= T1-T0;
3368                                 T0=T1;
3369 #endif
3370                         }
3371
3372 #ifdef HAVE_MMX
                             /* the MMX build filters horizontally by running the vertical filters on
                                the transposed scratch block (stride 16) and transposing the result back */
3373                         transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3374 #endif
3375                         /* check if we have a previous block to deblock it with dstBlock */
3376                         if(x - 8 >= 0)
3377                         {
3378 #ifdef MORE_TIMING
3379                                 T0= rdtsc();
3380 #endif
3381 #ifdef HAVE_MMX
3382                                 if(mode & H_RK1_FILTER)
3383                                         vertRK1Filter(tempBlock1, 16, QP);
3384                                 else if(mode & H_X1_FILTER)
3385                                         vertX1Filter(tempBlock1, 16, QP);
3386                                 else if(mode & H_DEBLOCK)
3387                                 {
3388                                         if( isVertDC(tempBlock1, 16))
3389                                         {
3390                                                 if(isVertMinMaxOk(tempBlock1, 16, QP))
3391                                                         doVertLowPass(tempBlock1, 16, QP);
3392                                         }
3393                                         else
3394                                                 doVertDefFilter(tempBlock1, 16, QP);
3395                                 }
3396
3397                                 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3398
3399 #else
                                     /* portable build: horizontal filters work directly on dst, starting 4 pixels
                                        left of the block; H_RK1_FILTER is not handled in this branch */
3400                                 if(mode & H_X1_FILTER)
3401                                         horizX1Filter(dstBlock-4, stride, QP);
3402                                 else if(mode & H_DEBLOCK)
3403                                 {
3404                                         if( isHorizDC(dstBlock-4, stride))
3405                                         {
3406                                                 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3407                                                         doHorizLowPass(dstBlock-4, stride, QP);
3408                                         }
3409                                         else
3410                                                 doHorizDefFilter(dstBlock-4, stride, QP);
3411                                 }
3412 #endif
3413 #ifdef MORE_TIMING
3414                                 T1= rdtsc();
3415                                 horizTime+= T1-T0;
3416                                 T0=T1;
3417 #endif
3418                                 if(mode & DERING)
3419                                 {
3420                                 //FIXME filter first line
                                             /* dering is called one row up and 8 pixels left of the current position */
3421                                         if(y>0) dering(dstBlock - stride - 8, stride, QP);
3422                                 }
3423                         }
3424                         else if(mode & DERING)
3425                         {
3426                          //FIXME y+15 is required because of the temp buffer handling -> bottom right block isn't filtered
                                     /* first block of a row: dering is called 9 rows up at column width-8 */
3427                                         if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
3428                         }
3429
3430
3431 #ifdef PP_FUNNY_STRIDE
3432                         /* did we use a tmp-block buffer */
3433                         if(x+7 >= width)
3434                         {
3435                                 int i;
3436                                 dstBlock= dstBlockPtrBackup;
3437                                 srcBlock= srcBlockPtrBackup;
3438
                                     /* write the width-x valid columns of the 16 filtered rows back to dst */
3439                                 for(i=0;i<BLOCK_SIZE*2; i++)
3440                                 {
3441                                         memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3442                                 }
3443                         }
3444 #endif
3445
3446                         dstBlock+=8;
3447                         srcBlock+=8;
3448
3449 #ifdef HAVE_MMX
                             /* swap the two transposed scratch blocks for the next iteration */
3450                         tmpXchg= tempBlock1;
3451                         tempBlock1= tempBlock2;
3452                         tempBlock2 = tmpXchg;
3453 #endif
3454                 }
3455
3456                 /* did we use a tmp buffer for the last lines*/
3457                 if(y+15 >= height)
3458                 {
                             /* copy the height-y filtered rows from the scratch buffer back into dst */
3459                         uint8_t *dstBlock= &(dst[y*dstStride]);
3460                         memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3461                 }
3462         }
     /* leave MMX state so that the FPU can be used again */
3463 #ifdef HAVE_3DNOW
3464         asm volatile("femms");
3465 #elif defined (HAVE_MMX)
3466         asm volatile("emms");
3467 #endif
3468
3469 #ifdef TIMING
3470         // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3471         sumTime= rdtsc() - sumTime;
3472         if(!isColor)
3473                 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
3474                         (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3475                         (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3476                         , black, white);
3477 #endif
3478 }