git.sesse.net Git - ffmpeg/blob - libavcodec/sh4/qpel.c

   1 /*
   2  * This is optimized for SH, which has post-increment addressing (*p++).
   3  * Some CPUs may handle indexed access (p[n]) faster than post-increment (*p++).
   4  *
   5  * copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
   6  *
   7  * This file is part of FFmpeg.
   8  *
   9  * FFmpeg is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * FFmpeg is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with FFmpeg; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "libavutil/common.h"
  25
  26 #define PIXOP2(OPNAME, OP) \
  27 \
  28 static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  29 {\
  30         do {\
  31                 OP(LP(dst  ),rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
  32                 src1+=src_stride1; \
  33                 src2+=src_stride2; \
  34                 dst+=dst_stride; \
  35         } while(--h); \
  36 }\
  37 \
  38 static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  39 {\
  40         do {\
  41                 OP(LP(dst  ),rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
  42                 src1+=src_stride1; \
  43                 src2+=src_stride2; \
  44                 dst+=dst_stride; \
  45         } while(--h); \
  46 }\
  47 \
  48 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  49 {\
  50         do {\
  51                 OP(LP(dst  ),no_rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
  52                 OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  53                 OP(LP(dst+8),no_rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
  54                 OP(LP(dst+12),no_rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
  55                 src1+=src_stride1; \
  56                 src2+=src_stride2; \
  57                 dst+=dst_stride; \
  58         } while(--h); \
  59 }\
  60 \
  61 static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  62 {\
  63         do {\
  64                 OP(LP(dst  ),rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
  65                 OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  66                 OP(LP(dst+8),rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
  67                 OP(LP(dst+12),rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
  68                 src1+=src_stride1; \
  69                 src2+=src_stride2; \
  70                 dst+=dst_stride; \
  71         } while(--h); \
  72 }\
  73 \
  74 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  75 {\
  76         do { /* onlye src2 aligned */\
  77                 OP(LP(dst  ),no_rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
  78                 OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  79                 src1+=src_stride1; \
  80                 src2+=src_stride2; \
  81                 dst+=dst_stride; \
  82         } while(--h); \
  83 }\
  84 \
  85 static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  86 {\
  87         do {\
  88                 OP(LP(dst  ),rnd_avg32(AV_RN32(src1  ),LPC(src2  )) ); \
  89                 OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
  90                 src1+=src_stride1; \
  91                 src2+=src_stride2; \
  92                 dst+=dst_stride; \
  93         } while(--h); \
  94 }\
  95 \
  96 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
  97 {\
  98         do {\
  99                 OP(LP(dst  ),no_rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
 100                 OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
 101                 src1+=src_stride1; \
 102                 src2+=src_stride2; \
 103                 dst+=dst_stride; \
 104         } while(--h); \
 105 }\
 106 \
 107 static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 108 {\
 109         do {\
 110                 OP(LP(dst  ),rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
 111                 OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
 112                 src1+=src_stride1; \
 113                 src2+=src_stride2; \
 114                 dst+=dst_stride; \
 115         } while(--h); \
 116 }\
 117 \
 118 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 119 {\
 120         do {\
 121                 OP(LP(dst  ),no_rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
 122                 OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
 123                 OP(LP(dst+8),no_rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
 124                 OP(LP(dst+12),no_rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
 125                 src1+=src_stride1; \
 126                 src2+=src_stride2; \
 127                 dst+=dst_stride; \
 128         } while(--h); \
 129 }\
 130 \
 131 static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 132 {\
 133         do {\
 134                 OP(LP(dst  ),rnd_avg32(LPC(src1  ),LPC(src2  )) ); \
 135                 OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
 136                 OP(LP(dst+8),rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
 137                 OP(LP(dst+12),rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
 138                 src1+=src_stride1; \
 139                 src2+=src_stride2; \
 140                 dst+=dst_stride; \
 141         } while(--h); \
 142 }\
 143 \
 144 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 145 { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
 146 \
 147 static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 148 { OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
 149 \
 150 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 151 { OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
 152 \
 153 static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
 154 { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
 155 \
 156 static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 157         do { \
 158                 uint32_t a0,a1,a2,a3; \
 159                 UNPACK(a0,a1,LPC(src1),LPC(src2)); \
 160                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 161                 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
 162                 UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
 163                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 164                 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
 165                 src1+=src_stride1;\
 166                 src2+=src_stride2;\
 167                 src3+=src_stride3;\
 168                 src4+=src_stride4;\
 169                 dst+=dst_stride;\
 170         } while(--h); \
 171 } \
 172 \
 173 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 174         do { \
 175                 uint32_t a0,a1,a2,a3; \
 176                 UNPACK(a0,a1,LPC(src1),LPC(src2)); \
 177                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 178                 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
 179                 UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
 180                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 181                 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
 182                 src1+=src_stride1;\
 183                 src2+=src_stride2;\
 184                 src3+=src_stride3;\
 185                 src4+=src_stride4;\
 186                 dst+=dst_stride;\
 187         } while(--h); \
 188 } \
 189 \
 190 static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 191         do { \
 192                 uint32_t a0,a1,a2,a3; /* src1 only not aligned */\
 193                 UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
 194                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 195                 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
 196                 UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
 197                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 198                 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
 199                 src1+=src_stride1;\
 200                 src2+=src_stride2;\
 201                 src3+=src_stride3;\
 202                 src4+=src_stride4;\
 203                 dst+=dst_stride;\
 204         } while(--h); \
 205 } \
 206 \
 207 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 208         do { \
 209                 uint32_t a0,a1,a2,a3; \
 210                 UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
 211                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 212                 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
 213                 UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
 214                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 215                 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
 216                 src1+=src_stride1;\
 217                 src2+=src_stride2;\
 218                 src3+=src_stride3;\
 219                 src4+=src_stride4;\
 220                 dst+=dst_stride;\
 221         } while(--h); \
 222 } \
 223 \
 224 static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 225         do { \
 226                 uint32_t a0,a1,a2,a3; \
 227                 UNPACK(a0,a1,LPC(src1),LPC(src2)); \
 228                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 229                 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
 230                 UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
 231                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 232                 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
 233                 UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
 234                 UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
 235                 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
 236                 UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
 237                 UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
 238                 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
 239                 src1+=src_stride1;\
 240                 src2+=src_stride2;\
 241                 src3+=src_stride3;\
 242                 src4+=src_stride4;\
 243                 dst+=dst_stride;\
 244         } while(--h); \
 245 } \
 246 \
 247 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 248         do { \
 249                 uint32_t a0,a1,a2,a3; \
 250                 UNPACK(a0,a1,LPC(src1),LPC(src2)); \
 251                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 252                 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
 253                 UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
 254                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 255                 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
 256                 UNPACK(a0,a1,LPC(src1+8),LPC(src2+8)); \
 257                 UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
 258                 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
 259                 UNPACK(a0,a1,LPC(src1+12),LPC(src2+12)); \
 260                 UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
 261                 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
 262                 src1+=src_stride1;\
 263                 src2+=src_stride2;\
 264                 src3+=src_stride3;\
 265                 src4+=src_stride4;\
 266                 dst+=dst_stride;\
 267         } while(--h); \
 268 } \
 269 \
 270 static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 271         do { /* src1 is unaligned */\
 272                 uint32_t a0,a1,a2,a3; \
 273                 UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
 274                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 275                 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
 276                 UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
 277                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 278                 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
 279                 UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
 280                 UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
 281                 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
 282                 UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
 283                 UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
 284                 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
 285                 src1+=src_stride1;\
 286                 src2+=src_stride2;\
 287                 src3+=src_stride3;\
 288                 src4+=src_stride4;\
 289                 dst+=dst_stride;\
 290         } while(--h); \
 291 } \
 292 \
 293 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 294         do { \
 295                 uint32_t a0,a1,a2,a3; \
 296                 UNPACK(a0,a1,AV_RN32(src1),LPC(src2)); \
 297                 UNPACK(a2,a3,LPC(src3),LPC(src4)); \
 298                 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
 299                 UNPACK(a0,a1,AV_RN32(src1+4),LPC(src2+4)); \
 300                 UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
 301                 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
 302                 UNPACK(a0,a1,AV_RN32(src1+8),LPC(src2+8)); \
 303                 UNPACK(a2,a3,LPC(src3+8),LPC(src4+8)); \
 304                 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
 305                 UNPACK(a0,a1,AV_RN32(src1+12),LPC(src2+12)); \
 306                 UNPACK(a2,a3,LPC(src3+12),LPC(src4+12)); \
 307                 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
 308                 src1+=src_stride1;\
 309                 src2+=src_stride2;\
 310                 src3+=src_stride3;\
 311                 src4+=src_stride4;\
 312                 dst+=dst_stride;\
 313         } while(--h); \
 314 } \
 315 \
 316
 317 #define op_avg(a, b) a = rnd_avg32(a,b)
 318 #define op_put(a, b) a = b
 319
 320 PIXOP2(avg, op_avg)
 321 PIXOP2(put, op_put)
 322 #undef op_avg
 323 #undef op_put
 324
 325 #define avg2(a,b) ((a+b+1)>>1)
 326 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 327
 328
 329 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 330 {
 331     const int A=(16-x16)*(16-y16);
 332     const int B=(   x16)*(16-y16);
 333     const int C=(16-x16)*(   y16);
 334     const int D=(   x16)*(   y16);
 335
 336     do {
 337         int t0,t1,t2,t3;
 338         uint8_t *s0 = src;
 339         uint8_t *s1 = src+stride;
 340         t0 = *s0++; t2 = *s1++;
 341         t1 = *s0++; t3 = *s1++;
 342         dst[0]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
 343         t0 = *s0++; t2 = *s1++;
 344         dst[1]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
 345         t1 = *s0++; t3 = *s1++;
 346         dst[2]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
 347         t0 = *s0++; t2 = *s1++;
 348         dst[3]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
 349         t1 = *s0++; t3 = *s1++;
 350         dst[4]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
 351         t0 = *s0++; t2 = *s1++;
 352         dst[5]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
 353         t1 = *s0++; t3 = *s1++;
 354         dst[6]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
 355         t0 = *s0++; t2 = *s1++;
 356         dst[7]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
 357         dst+= stride;
 358         src+= stride;
 359     }while(--h);
 360 }
 361
 362 #define H264_CHROMA_MC(OPNAME, OP)\
 363 static void OPNAME ## h264_chroma_mc2_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
 364     const int A=(8-x)*(8-y);\
 365     const int B=(  x)*(8-y);\
 366     const int C=(8-x)*(  y);\
 367     const int D=(  x)*(  y);\
 368     \
 369     assert(x<8 && y<8 && x>=0 && y>=0);\
 370 \
 371     do {\
 372         int t0,t1,t2,t3; \
 373         uint8_t *s0 = src; \
 374         uint8_t *s1 = src+stride; \
 375         t0 = *s0++; t2 = *s1++; \
 376         t1 = *s0++; t3 = *s1++; \
 377         OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
 378         t0 = *s0++; t2 = *s1++; \
 379         OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
 380         dst+= stride;\
 381         src+= stride;\
 382     }while(--h);\
 383 }\
 384 \
 385 static void OPNAME ## h264_chroma_mc4_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
 386     const int A=(8-x)*(8-y);\
 387     const int B=(  x)*(8-y);\
 388     const int C=(8-x)*(  y);\
 389     const int D=(  x)*(  y);\
 390     \
 391     assert(x<8 && y<8 && x>=0 && y>=0);\
 392 \
 393     do {\
 394         int t0,t1,t2,t3; \
 395         uint8_t *s0 = src; \
 396         uint8_t *s1 = src+stride; \
 397         t0 = *s0++; t2 = *s1++; \
 398         t1 = *s0++; t3 = *s1++; \
 399         OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
 400         t0 = *s0++; t2 = *s1++; \
 401         OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
 402         t1 = *s0++; t3 = *s1++; \
 403         OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
 404         t0 = *s0++; t2 = *s1++; \
 405         OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
 406         dst+= stride;\
 407         src+= stride;\
 408     }while(--h);\
 409 }\
 410 \
 411 static void OPNAME ## h264_chroma_mc8_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
 412     const int A=(8-x)*(8-y);\
 413     const int B=(  x)*(8-y);\
 414     const int C=(8-x)*(  y);\
 415     const int D=(  x)*(  y);\
 416     \
 417     assert(x<8 && y<8 && x>=0 && y>=0);\
 418 \
 419     do {\
 420         int t0,t1,t2,t3; \
 421         uint8_t *s0 = src; \
 422         uint8_t *s1 = src+stride; \
 423         t0 = *s0++; t2 = *s1++; \
 424         t1 = *s0++; t3 = *s1++; \
 425         OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
 426         t0 = *s0++; t2 = *s1++; \
 427         OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
 428         t1 = *s0++; t3 = *s1++; \
 429         OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
 430         t0 = *s0++; t2 = *s1++; \
 431         OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
 432         t1 = *s0++; t3 = *s1++; \
 433         OP(dst[4], (A*t0 + B*t1 + C*t2 + D*t3));\
 434         t0 = *s0++; t2 = *s1++; \
 435         OP(dst[5], (A*t1 + B*t0 + C*t3 + D*t2));\
 436         t1 = *s0++; t3 = *s1++; \
 437         OP(dst[6], (A*t0 + B*t1 + C*t2 + D*t3));\
 438         t0 = *s0++; t2 = *s1++; \
 439         OP(dst[7], (A*t1 + B*t0 + C*t3 + D*t2));\
 440         dst+= stride;\
 441         src+= stride;\
 442     }while(--h);\
 443 }
 444
 445 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
 446 #define op_put(a, b) a = (((b) + 32)>>6)
 447
 448 H264_CHROMA_MC(put_       , op_put)
 449 H264_CHROMA_MC(avg_       , op_avg)
 450 #undef op_avg
 451 #undef op_put
 452
 453 #define QPEL_MC(r, OPNAME, RND, OP) \
 454 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 455     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 456     do {\
 457         uint8_t *s = src; \
 458         int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
 459         src0= *s++;\
 460         src1= *s++;\
 461         src2= *s++;\
 462         src3= *s++;\
 463         src4= *s++;\
 464         OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 465         src5= *s++;\
 466         OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 467         src6= *s++;\
 468         OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 469         src7= *s++;\
 470         OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 471         src8= *s++;\
 472         OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 473         OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 474         OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 475         OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 476         dst+=dstStride;\
 477         src+=srcStride;\
 478     }while(--h);\
 479 }\
 480 \
 481 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 482     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 483     int w=8;\
 484     do{\
 485         uint8_t *s = src, *d=dst;\
 486         int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
 487         src0 = *s; s+=srcStride; \
 488         src1 = *s; s+=srcStride; \
 489         src2 = *s; s+=srcStride; \
 490         src3 = *s; s+=srcStride; \
 491         src4 = *s; s+=srcStride; \
 492         OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
 493         src5 = *s; s+=srcStride; \
 494         OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
 495         src6 = *s; s+=srcStride; \
 496         OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
 497         src7 = *s; s+=srcStride; \
 498         OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
 499         src8 = *s; \
 500         OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
 501         OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
 502         OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
 503         OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 504         dst++;\
 505         src++;\
 506     }while(--w);\
 507 }\
 508 \
 509 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 510     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 511     do {\
 512         uint8_t *s = src;\
 513         int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
 514         int src9,src10,src11,src12,src13,src14,src15,src16;\
 515         src0= *s++;\
 516         src1= *s++;\
 517         src2= *s++;\
 518         src3= *s++;\
 519         src4= *s++;\
 520         OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 521         src5= *s++;\
 522         OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 523         src6= *s++;\
 524         OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 525         src7= *s++;\
 526         OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 527         src8= *s++;\
 528         OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 529         src9= *s++;\
 530         OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 531         src10= *s++;\
 532         OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 533         src11= *s++;\
 534         OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 535         src12= *s++;\
 536         OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 537         src13= *s++;\
 538         OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 539         src14= *s++;\
 540         OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 541         src15= *s++;\
 542         OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 543         src16= *s++;\
 544         OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 545         OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 546         OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 547         OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 548         dst+=dstStride;\
 549         src+=srcStride;\
 550     }while(--h);\
 551 }\
 552 \
 553 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 554     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 555     int w=16;\
 556     do {\
 557         uint8_t *s = src, *d=dst;\
 558         int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
 559         int src9,src10,src11,src12,src13,src14,src15,src16;\
 560         src0 = *s; s+=srcStride; \
 561         src1 = *s; s+=srcStride; \
 562         src2 = *s; s+=srcStride; \
 563         src3 = *s; s+=srcStride; \
 564         src4 = *s; s+=srcStride; \
 565         OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
 566         src5 = *s; s+=srcStride; \
 567         OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
 568         src6 = *s; s+=srcStride; \
 569         OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
 570         src7 = *s; s+=srcStride; \
 571         OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
 572         src8 = *s; s+=srcStride; \
 573         OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
 574         src9 = *s; s+=srcStride; \
 575         OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
 576         src10 = *s; s+=srcStride; \
 577         OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
 578         src11 = *s; s+=srcStride; \
 579         OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
 580         src12 = *s; s+=srcStride; \
 581         OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
 582         src13 = *s; s+=srcStride; \
 583         OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
 584         src14 = *s; s+=srcStride; \
 585         OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
 586         src15 = *s; s+=srcStride; \
 587         OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
 588         src16 = *s; \
 589         OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
 590         OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
 591         OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
 592         OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 593         dst++;\
 594         src++;\
 595     }while(--w);\
 596 }\
 597 \
 598 static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
 599     OPNAME ## pixels8_c(dst, src, stride, 8);\
 600 }\
 601 \
 602 static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
 603     uint8_t half[64];\
 604     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 605     OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
 606 }\
 607 \
 608 static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
 609     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 610 }\
 611 \
 612 static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
 613     uint8_t half[64];\
 614     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 615     OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
 616 }\
 617 \
 618 static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
 619     uint8_t full[16*9];\
 620     uint8_t half[64];\
 621     copy_block9(full, src, 16, stride, 9);\
 622     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 623     OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
 624 }\
 625 \
 626 static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
 627     uint8_t full[16*9];\
 628     copy_block9(full, src, 16, stride, 9);\
 629     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 630 }\
 631 \
 632 static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
 633     uint8_t full[16*9];\
 634     uint8_t half[64];\
 635     copy_block9(full, src, 16, stride, 9);\
 636     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 637     OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
 638 }\
 639 static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
 640     uint8_t full[16*9];\
 641     uint8_t halfH[72];\
 642     uint8_t halfHV[64];\
 643     copy_block9(full, src, 16, stride, 9);\
 644     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 645     put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
 646     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 647     OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
 648 }\
 649 static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
 650     uint8_t full[16*9];\
 651     uint8_t halfH[72];\
 652     uint8_t halfHV[64];\
 653     copy_block9(full, src, 16, stride, 9);\
 654     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 655     put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
 656     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 657     OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
 658 }\
 659 static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
 660     uint8_t full[16*9];\
 661     uint8_t halfH[72];\
 662     uint8_t halfHV[64];\
 663     copy_block9(full, src, 16, stride, 9);\
 664     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 665     put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
 666     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 667     OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 668 }\
 669 static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
 670     uint8_t full[16*9];\
 671     uint8_t halfH[72];\
 672     uint8_t halfHV[64];\
 673     copy_block9(full, src, 16, stride, 9);\
 674     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 675     put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
 676     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 677     OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 678 }\
 679 static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
 680     uint8_t halfH[72];\
 681     uint8_t halfHV[64];\
 682     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 683     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 684     OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
 685 }\
 686 static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
 687     uint8_t halfH[72];\
 688     uint8_t halfHV[64];\
 689     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 690     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 691     OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 692 }\
 693 static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
 694     uint8_t full[16*9];\
 695     uint8_t halfH[72];\
 696     copy_block9(full, src, 16, stride, 9);\
 697     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 698     put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
 699     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
 700 }\
 701 static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
 702     uint8_t full[16*9];\
 703     uint8_t halfH[72];\
 704     copy_block9(full, src, 16, stride, 9);\
 705     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 706     put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
 707     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
 708 }\
 709 static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
 710     uint8_t halfH[72];\
 711     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 712     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
 713 }\
 714 static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
 715     OPNAME ## pixels16_c(dst, src, stride, 16);\
 716 }\
 717 \
 718 static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
 719     uint8_t half[256];\
 720     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
 721     OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
 722 }\
 723 \
 724 static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
 725     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
 726 }\
 727 \
 728 static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
 729     uint8_t half[256];\
 730     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
 731     OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
 732 }\
 733 \
 734 static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
 735     uint8_t full[24*17];\
 736     uint8_t half[256];\
 737     copy_block17(full, src, 24, stride, 17);\
 738     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
 739     OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
 740 }\
 741 \
 742 static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
 743     uint8_t full[24*17];\
 744     copy_block17(full, src, 24, stride, 17);\
 745     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
 746 }\
 747 \
 748 static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
 749     uint8_t full[24*17];\
 750     uint8_t half[256];\
 751     copy_block17(full, src, 24, stride, 17);\
 752     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
 753     OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
 754 }\
 755 static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
 756     uint8_t full[24*17];\
 757     uint8_t halfH[272];\
 758     uint8_t halfHV[256];\
 759     copy_block17(full, src, 24, stride, 17);\
 760     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
 761     put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
 762     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
 763     OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
 764 }\
 765 static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
 766     uint8_t full[24*17];\
 767     uint8_t halfH[272];\
 768     uint8_t halfHV[256];\
 769     copy_block17(full, src, 24, stride, 17);\
 770     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
 771     put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
 772     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
 773     OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
 774 }\
 775 static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
 776     uint8_t full[24*17];\
 777     uint8_t halfH[272];\
 778     uint8_t halfHV[256];\
 779     copy_block17(full, src, 24, stride, 17);\
 780     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
 781     put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
 782     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
 783     OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
 784 }\
 785 static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
 786     uint8_t full[24*17];\
 787     uint8_t halfH[272];\
 788     uint8_t halfHV[256];\
 789     copy_block17(full, src, 24, stride, 17);\
 790     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
 791     put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
 792     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
 793     OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
 794 }\
 795 static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
 796     uint8_t halfH[272];\
 797     uint8_t halfHV[256];\
 798     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
 799     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
 800     OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
 801 }\
 802 static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
 803     uint8_t halfH[272];\
 804     uint8_t halfHV[256];\
 805     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
 806     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
 807     OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
 808 }\
 809 static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
 810     uint8_t full[24*17];\
 811     uint8_t halfH[272];\
 812     copy_block17(full, src, 24, stride, 17);\
 813     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
 814     put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
 815     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 816 }\
 817 static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
 818     uint8_t full[24*17];\
 819     uint8_t halfH[272];\
 820     copy_block17(full, src, 24, stride, 17);\
 821     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
 822     put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
 823     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 824 }\
 825 static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
 826     uint8_t halfH[272];\
 827     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
 828     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 829 }
 830
 /*
  * Store operators passed to QPEL_MC as its last argument.  Each one takes a
  * destination lvalue `a` and a raw filter sum `b`, and expects a lookup table
  * called `cm` to be in scope where it is expanded.
  *
  *   op_put / op_put_no_rnd : a = cm[(b + bias) >> 5]
  *   op_avg / op_avg_no_rnd : same lookup, then averaged with the value
  *                            already held in `a`
  *
  * The rounding variants use bias 16 (and "+1" before the halving shift in
  * op_avg); the no_rnd variants use bias 15 (and no "+1" in op_avg_no_rnd).
  *
  * NOTE(review): `cm` is presumably the clamp table ff_cropTbl + MAX_NEG_CROP,
  * as in the H.264 helpers later in this file - confirm against the part of
  * QPEL_MC that declares it.
  * NOTE(review): op_avg_no_rnd is not used by any instantiation visible here.
  */
 831 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
 832 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
 833 #define op_put(a, b) a = cm[((b) + 16)>>5]
 834 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
 835
 /*
  * Instantiate the MPEG-4 qpel function families.  In the macro body the
  * second argument is pasted in front of each function name (OPNAME) and the
  * third is pasted between "put" and the helper name (RND), so these lines
  * produce the put_, put_no_rnd_ and avg_ variants of qpel8/qpel16_mcXY_sh4.
  * NOTE(review): the meaning of the first argument (0/1) is defined in the
  * part of QPEL_MC that precedes this excerpt.
  */
 836 QPEL_MC(0, put_       , _       , op_put)
 837 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
 838 QPEL_MC(0, avg_       , _       , op_avg)
 /* The avg_no_rnd family is intentionally not generated (left disabled). */
 839 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
 /* Drop the operators so the H.264 section can redefine the same names. */
 840 #undef op_avg
 841 #undef op_avg_no_rnd
 842 #undef op_put
 843 #undef op_put_no_rnd
 844
 845 #define H264_LOWPASS(OPNAME, OP, OP2) \
 846 static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
 847     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 848     do {\
 849         int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
 850         uint8_t *s = src-2;\
 851         srcB = *s++;\
 852         srcA = *s++;\
 853         src0 = *s++;\
 854         src1 = *s++;\
 855         src2 = *s++;\
 856         src3 = *s++;\
 857         OP(dst[0], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
 858         src4 = *s++;\
 859         OP(dst[1], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
 860         src5 = *s++;\
 861         OP(dst[2], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
 862         src6 = *s++;\
 863         OP(dst[3], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
 864       if (w>4) { /* it optimized */ \
 865         int src7,src8,src9,src10; \
 866         src7 = *s++;\
 867         OP(dst[4], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
 868         src8 = *s++;\
 869         OP(dst[5], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
 870         src9 = *s++;\
 871         OP(dst[6], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
 872         src10 = *s++;\
 873         OP(dst[7], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
 874        if (w>8) { \
 875         int src11,src12,src13,src14,src15,src16,src17,src18; \
 876         src11 = *s++;\
 877         OP(dst[8] , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
 878         src12 = *s++;\
 879         OP(dst[9] , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
 880         src13 = *s++;\
 881         OP(dst[10], (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
 882         src14 = *s++;\
 883         OP(dst[11], (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
 884         src15 = *s++;\
 885         OP(dst[12], (src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
 886         src16 = *s++;\
 887         OP(dst[13], (src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
 888         src17 = *s++;\
 889         OP(dst[14], (src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
 890         src18 = *s++;\
 891         OP(dst[15], (src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
 892        } \
 893       } \
 894         dst+=dstStride;\
 895         src+=srcStride;\
 896     }while(--h);\
 897 }\
 898 \
 899 static inline void OPNAME ## h264_qpel_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
 900     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 901     do{\
 902         int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
 903         uint8_t *s = src-2*srcStride,*d=dst;\
 904         srcB = *s; s+=srcStride;\
 905         srcA = *s; s+=srcStride;\
 906         src0 = *s; s+=srcStride;\
 907         src1 = *s; s+=srcStride;\
 908         src2 = *s; s+=srcStride;\
 909         src3 = *s; s+=srcStride;\
 910         OP(*d, (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));d+=dstStride;\
 911         src4 = *s; s+=srcStride;\
 912         OP(*d, (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));d+=dstStride;\
 913         src5 = *s; s+=srcStride;\
 914         OP(*d, (src2+src3)*20 - (src1+src4)*5 + (src0+src5));d+=dstStride;\
 915         src6 = *s; s+=srcStride;\
 916         OP(*d, (src3+src4)*20 - (src2+src5)*5 + (src1+src6));d+=dstStride;\
 917       if (h>4) { \
 918         int src7,src8,src9,src10; \
 919         src7 = *s; s+=srcStride;\
 920         OP(*d, (src4+src5)*20 - (src3+src6)*5 + (src2+src7));d+=dstStride;\
 921         src8 = *s; s+=srcStride;\
 922         OP(*d, (src5+src6)*20 - (src4+src7)*5 + (src3+src8));d+=dstStride;\
 923         src9 = *s; s+=srcStride;\
 924         OP(*d, (src6+src7)*20 - (src5+src8)*5 + (src4+src9));d+=dstStride;\
 925         src10 = *s; s+=srcStride;\
 926         OP(*d, (src7+src8)*20 - (src6+src9)*5 + (src5+src10));d+=dstStride;\
 927        if (h>8) { \
 928         int src11,src12,src13,src14,src15,src16,src17,src18; \
 929         src11 = *s; s+=srcStride;\
 930         OP(*d , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));d+=dstStride;\
 931         src12 = *s; s+=srcStride;\
 932         OP(*d , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));d+=dstStride;\
 933         src13 = *s; s+=srcStride;\
 934         OP(*d, (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));d+=dstStride;\
 935         src14 = *s; s+=srcStride;\
 936         OP(*d, (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));d+=dstStride;\
 937         src15 = *s; s+=srcStride;\
 938         OP(*d, (src12+src13)*20 - (src11+src14)*5 + (src10+src15));d+=dstStride;\
 939         src16 = *s; s+=srcStride;\
 940         OP(*d, (src13+src14)*20 - (src12+src15)*5 + (src11+src16));d+=dstStride;\
 941         src17 = *s; s+=srcStride;\
 942         OP(*d, (src14+src15)*20 - (src13+src16)*5 + (src12+src17));d+=dstStride;\
 943         src18 = *s; s+=srcStride;\
 944         OP(*d, (src15+src16)*20 - (src14+src17)*5 + (src13+src18));d+=dstStride;\
 945        } \
 946       } \
 947         dst++;\
 948         src++;\
 949     }while(--w);\
 950 }\
 951 \
 952 static inline void OPNAME ## h264_qpel_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride,int w,int h){\
 953     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 954     int i;\
 955     src -= 2*srcStride;\
 956     i= h+5; \
 957     do {\
 958         int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
 959         uint8_t *s = src-2;\
 960         srcB = *s++;\
 961         srcA = *s++;\
 962         src0 = *s++;\
 963         src1 = *s++;\
 964         src2 = *s++;\
 965         src3 = *s++;\
 966         tmp[0] = ((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
 967         src4 = *s++;\
 968         tmp[1] = ((src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
 969         src5 = *s++;\
 970         tmp[2] = ((src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
 971         src6 = *s++;\
 972         tmp[3] = ((src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
 973       if (w>4) { /* it optimized */ \
 974         int src7,src8,src9,src10; \
 975         src7 = *s++;\
 976         tmp[4] = ((src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
 977         src8 = *s++;\
 978         tmp[5] = ((src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
 979         src9 = *s++;\
 980         tmp[6] = ((src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
 981         src10 = *s++;\
 982         tmp[7] = ((src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
 983        if (w>8) { \
 984         int src11,src12,src13,src14,src15,src16,src17,src18; \
 985         src11 = *s++;\
 986         tmp[8] = ((src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
 987         src12 = *s++;\
 988         tmp[9] = ((src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
 989         src13 = *s++;\
 990         tmp[10] = ((src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
 991         src14 = *s++;\
 992         tmp[11] = ((src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
 993         src15 = *s++;\
 994         tmp[12] = ((src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
 995         src16 = *s++;\
 996         tmp[13] = ((src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
 997         src17 = *s++;\
 998         tmp[14] = ((src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
 999         src18 = *s++;\
1000         tmp[15] = ((src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
1001        } \
1002       } \
1003         tmp+=tmpStride;\
1004         src+=srcStride;\
1005     }while(--i);\
1006     tmp -= tmpStride*(h+5-2);\
1007     i = w; \
1008     do {\
1009         int tmpB,tmpA,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6;\
1010         int16_t *s = tmp-2*tmpStride; \
1011         uint8_t *d=dst;\
1012         tmpB = *s; s+=tmpStride;\
1013         tmpA = *s; s+=tmpStride;\
1014         tmp0 = *s; s+=tmpStride;\
1015         tmp1 = *s; s+=tmpStride;\
1016         tmp2 = *s; s+=tmpStride;\
1017         tmp3 = *s; s+=tmpStride;\
1018         OP2(*d, (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));d+=dstStride;\
1019         tmp4 = *s; s+=tmpStride;\
1020         OP2(*d, (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));d+=dstStride;\
1021         tmp5 = *s; s+=tmpStride;\
1022         OP2(*d, (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));d+=dstStride;\
1023         tmp6 = *s; s+=tmpStride;\
1024         OP2(*d, (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));d+=dstStride;\
1025       if (h>4) { \
1026         int tmp7,tmp8,tmp9,tmp10; \
1027         tmp7 = *s; s+=tmpStride;\
1028         OP2(*d, (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));d+=dstStride;\
1029         tmp8 = *s; s+=tmpStride;\
1030         OP2(*d, (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));d+=dstStride;\
1031         tmp9 = *s; s+=tmpStride;\
1032         OP2(*d, (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));d+=dstStride;\
1033         tmp10 = *s; s+=tmpStride;\
1034         OP2(*d, (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));d+=dstStride;\
1035        if (h>8) { \
1036         int tmp11,tmp12,tmp13,tmp14,tmp15,tmp16,tmp17,tmp18; \
1037         tmp11 = *s; s+=tmpStride;\
1038         OP2(*d , (tmp8 +tmp9 )*20 - (tmp7 +tmp10)*5 + (tmp6 +tmp11));d+=dstStride;\
1039         tmp12 = *s; s+=tmpStride;\
1040         OP2(*d , (tmp9 +tmp10)*20 - (tmp8 +tmp11)*5 + (tmp7 +tmp12));d+=dstStride;\
1041         tmp13 = *s; s+=tmpStride;\
1042         OP2(*d, (tmp10+tmp11)*20 - (tmp9 +tmp12)*5 + (tmp8 +tmp13));d+=dstStride;\
1043         tmp14 = *s; s+=tmpStride;\
1044         OP2(*d, (tmp11+tmp12)*20 - (tmp10+tmp13)*5 + (tmp9 +tmp14));d+=dstStride;\
1045         tmp15 = *s; s+=tmpStride;\
1046         OP2(*d, (tmp12+tmp13)*20 - (tmp11+tmp14)*5 + (tmp10+tmp15));d+=dstStride;\
1047         tmp16 = *s; s+=tmpStride;\
1048         OP2(*d, (tmp13+tmp14)*20 - (tmp12+tmp15)*5 + (tmp11+tmp16));d+=dstStride;\
1049         tmp17 = *s; s+=tmpStride;\
1050         OP2(*d, (tmp14+tmp15)*20 - (tmp13+tmp16)*5 + (tmp12+tmp17));d+=dstStride;\
1051         tmp18 = *s; s+=tmpStride;\
1052         OP2(*d, (tmp15+tmp16)*20 - (tmp14+tmp17)*5 + (tmp13+tmp18));d+=dstStride;\
1053        } \
1054       } \
1055         dst++;\
1056         tmp++;\
1057     }while(--i);\
1058 }\
1059 \
1060 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1061     OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,4,4); \
1062 }\
1063 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1064    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,8,8); \
1065 }\
1066 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1067    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,16,16); \
1068 }\
1069 \
1070 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1071    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,4,4); \
1072 }\
1073 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1074    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,8,8); \
1075 }\
1076 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1077    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,16,16); \
1078 }\
1079 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1080    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,4,4); \
1081 }\
1082 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1083    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,8,8); \
1084 }\
1085 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1086    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,16,16); \
1087 }\
1088
/*
 * H264_MC(OPNAME, SIZE)
 *
 * Generates the sixteen entry points OPNAME##h264_qpel##SIZE##_mcXY_sh4 for a
 * SIZE x SIZE block.  Every function has the signature (dst, src, stride) and
 * builds its result from at most two intermediate planes:
 *   - the source itself,
 *   - a horizontally filtered plane (put_h264_qpelSIZE_h_lowpass),
 *   - a vertically filtered plane   (put_h264_qpelSIZE_v_lowpass),
 *   - a two-stage filtered plane    (put_h264_qpelSIZE_hv_lowpass),
 * which are then merged into dst by OPNAME##pixelsSIZE_l2_aligned[2].
 *
 * Vertical filtering works on a private copy of the source that starts two
 * rows above the block and spans SIZE+5 rows with a row pitch of SIZE;
 * `centre` points at the row of that copy corresponding to src.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: source block passed straight to the pixel store routine. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: source merged with the horizontally filtered plane. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, hplane, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontally filtered plane written directly to dst. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: like mc10, but merged with the source shifted one pixel right. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, hplane, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: source copy merged with the vertically filtered plane. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    uint8_t vplane[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, centre, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertically filtered plane written directly to dst. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, centre, stride, SIZE);\
}\
\
/* mc03: like mc01, but merged with the source copy shifted one row down. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    uint8_t vplane[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, centre+SIZE, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: h-plane of src merged with v-plane of src. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: h-plane of src merged with v-plane of the column one pixel right. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: h-plane of the row below src merged with v-plane of src. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: h-plane of the row below merged with v-plane one pixel right. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-stage filtered plane written directly to dst. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: h-plane of src merged with the hv-plane. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, hplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: h-plane of the row below src merged with the hv-plane. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, hplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: v-plane of src merged with the hv-plane. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, vplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: v-plane of the column one pixel right merged with the hv-plane. */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const centre = padded + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, vplane, hvplane, stride, SIZE, SIZE, SIZE);\
}
1225
/*
 * Store operators for the H.264 kernels.  `a` is the destination lvalue,
 * `b` the raw filter sum; `cm` is the table each kernel sets up as
 * ff_cropTbl + MAX_NEG_CROP.
 *
 *   op_put  / op_avg  : single-pass filters, sum biased by 16 and shifted
 *                       right by 5 before the table lookup.
 *   op2_put / op2_avg : second stage of the hv filter, whose input is the
 *                       still-unscaled first-stage sum, hence bias 512 and a
 *                       shift by 10.
 * The *_avg forms additionally average the looked-up value with the value
 * already in `a`, adding 1 before halving.
 */
1226 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* Disabled weighted variant; w1, w2 and o are not defined anywhere in view. */
1227 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
1228 #define op_put(a, b)  a = cm[((b) + 16)>>5]
1229 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
1230 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
1231
/*
 * Generate the put_ and avg_ low-pass kernels first: the H264_MC bodies call
 * the put_h264_qpelN_*_lowpass helpers regardless of their own OPNAME, so the
 * kernels must exist before any H264_MC expansion.
 */
1232 H264_LOWPASS(put_       , op_put, op2_put)
1233 H264_LOWPASS(avg_       , op_avg, op2_avg)
/* Motion-compensation entry points for 4x4, 8x8 and 16x16 blocks. */
1234 H264_MC(put_, 4)
1235 H264_MC(put_, 8)
1236 H264_MC(put_, 16)
1237 H264_MC(avg_, 4)
1238 H264_MC(avg_, 8)
1239 H264_MC(avg_, 16)
1240
/* The operators are only needed while the macros above are being expanded. */
1241 #undef op_avg
1242 #undef op_put
1243 #undef op2_avg
1244 #undef op2_put
1245
1246 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1247     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1248
1249     do{
1250         int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
1251         uint8_t *s = src;
1252         src_1 = s[-1];
1253         src0 = *s++;
1254         src1 = *s++;
1255         src2 = *s++;
1256         dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1257         src3 = *s++;
1258         dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1259         src4 = *s++;
1260         dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1261         src5 = *s++;
1262         dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1263         src6 = *s++;
1264         dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1265         src7 = *s++;
1266         dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1267         src8 = *s++;
1268         dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1269         src9 = *s++;
1270         dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1271         dst+=dstStride;
1272         src+=srcStride;
1273     }while(--h);
1274 }
1275
1276 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1277     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1278
1279     do{
1280         int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
1281         uint8_t *s = src,*d = dst;
1282         src_1 = *(s-srcStride);
1283         src0 = *s; s+=srcStride;
1284         src1 = *s; s+=srcStride;
1285         src2 = *s; s+=srcStride;
1286         *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
1287         src3 = *s; s+=srcStride;
1288         *d= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4]; d+=dstStride;
1289         src4 = *s; s+=srcStride;
1290         *d= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4]; d+=dstStride;
1291         src5 = *s; s+=srcStride;
1292         *d= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4]; d+=dstStride;
1293         src6 = *s; s+=srcStride;
1294         *d= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4]; d+=dstStride;
1295         src7 = *s; s+=srcStride;
1296         *d= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4]; d+=dstStride;
1297         src8 = *s; s+=srcStride;
1298         *d= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4]; d+=dstStride;
1299         src9 = *s;
1300         *d= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4]; d+=dstStride;
1301         src++;
1302         dst++;
1303     }while(--w);
1304 }
1305
/*
 * mspel8 "mc00" case: no filtering, simply forwards to put_pixels8_c with
 * the same stride for source and destination and a height of 8.
 */
static void put_mspel8_mc00_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
1309
/*
 * mspel8 "mc10" case: horizontally filter the 8x8 block at src into a
 * scratch buffer, then combine the unfiltered source with the filtered
 * block via put_pixels8_l2_aligned2 (presumably a rounded average of its
 * two inputs, like the pixels4 variant -- confirm against the macro).
 */
static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];    /* 8 rows with a row pitch of 8 bytes */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src, filtered, stride, stride, 8, 8);
}
1315
/*
 * mspel8 "mc20" case: the horizontally low-pass filtered 8x8 block is
 * written straight to dst; source and destination share the same stride.
 */
static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1319
/*
 * mspel8 "mc30" case: same as the mc10 case except that the unfiltered
 * input handed to put_pixels8_l2_aligned2 starts one pixel to the right
 * (src + 1). The combine step is presumably a rounded average of its two
 * inputs, like the pixels4 variant -- confirm against the macro.
 */
static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];    /* 8 rows with a row pitch of 8 bytes */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src + 1, filtered, stride, stride, 8, 8);
}
1325
/*
 * mspel8 "mc02" case: the vertically low-pass filtered block (8 columns,
 * 8 output rows each) is written straight to dst; source and destination
 * share the same stride.
 */
static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1329
/*
 * mspel8 "mc12" case: combines a vertically filtered block with a
 * horizontally-then-vertically filtered block via put_pixels8_l2_aligned
 * (presumably a rounded average of its two inputs -- confirm against the
 * macro).
 */
static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8 * 11];      /* 11 horizontally filtered rows, pitch 8 */
    uint8_t blockV[8 * 8];      /* vertically filtered source             */
    uint8_t blockHV[8 * 8];     /* vertical filter applied to rowsH       */

    /* The horizontal pass starts one row above src and covers 11 rows,
     * because the vertical pass below reads rows -1 .. 9 around its input. */
    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(blockV, src, 8, stride, 8);
    /* rowsH + 8 skips the extra leading row so it lines up with src. */
    wmv2_mspel8_v_lowpass(blockHV, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, blockV, blockHV, stride, 8, 8, 8);
}
/*
 * mspel8 "mc32" case: same as the mc12 case except that the vertically
 * filtered block is taken one pixel to the right (src + 1). The final
 * combine goes through put_pixels8_l2_aligned (presumably a rounded average
 * of its two inputs -- confirm against the macro).
 */
static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8 * 11];      /* 11 horizontally filtered rows, pitch 8 */
    uint8_t blockV[8 * 8];      /* vertically filtered source at src + 1  */
    uint8_t blockHV[8 * 8];     /* vertical filter applied to rowsH       */

    /* The horizontal pass starts one row above src and covers 11 rows,
     * because the vertical pass below reads rows -1 .. 9 around its input. */
    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(blockV, src + 1, 8, stride, 8);
    /* rowsH + 8 skips the extra leading row so it lines up with src. */
    wmv2_mspel8_v_lowpass(blockHV, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, blockV, blockHV, stride, 8, 8, 8);
}
/*
 * mspel8 "mc22" case: horizontal low-pass into a scratch buffer followed
 * by a vertical low-pass of that buffer written directly to dst.
 */
static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8 * 11];      /* 11 horizontally filtered rows, pitch 8 */

    /* Start one row above src and filter 11 rows: the vertical pass reads
     * rows -1 .. 9 relative to its input pointer. */
    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    /* rowsH + 8 skips the extra leading row so it lines up with src. */
    wmv2_mspel8_v_lowpass(dst, rowsH + 8, stride, 8, 8);
}