git.sesse.net Git - ffmpeg/blob - libavcodec/ppc/h264_altivec.c

   1 /*
   2  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "dsputil.h"
  22
  23 #include "gcc_fixes.h"
  24
  25 #include "dsputil_altivec.h"
  26 #include "types_altivec.h"
  27
   /*
    * Final "write-back" operation selected through OP_U8_ALTIVEC before each
    * inclusion of h264_template_altivec.c further down:
    *   PUT_OP_U8_ALTIVEC  assigns s to d and ignores dst;
    *   AVG_OP_U8_ALTIVEC  assigns vec_avg(dst, s) to d.
    * NOTE(review): presumably s is the freshly computed pixel vector and dst
    * the pixels already present at the destination - confirm in the template.
    * The arguments are not parenthesized and the expansion is a bare
    * assignment, so these macros are only meant to be used as statements.
    */
   28 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
   29 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
  30
   /*
    * First inclusion of the template: the "put" flavour.
    * OP_U8_ALTIVEC is bound to PUT_OP_U8_ALTIVEC and every PREFIX_* name to
    * its put_* / altivec_put_* counterpart.  The put_h264_chroma_mc8_altivec
    * and put_h264_qpel16_{h,v,hv}_lowpass_altivec functions used later in
    * this file are not defined anywhere else here, so they presumably come
    * out of this inclusion (TODO confirm against h264_template_altivec.c).
    * All bindings are #undef'd afterwards so the template can be included a
    * second time with different names.
    */
   31 #define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC
   32 #define PREFIX_h264_chroma_mc8_altivec         put_h264_chroma_mc8_altivec
   33 #define PREFIX_h264_chroma_mc8_num             altivec_put_h264_chroma_mc8_num
   34 #define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
   35 #define PREFIX_h264_qpel16_h_lowpass_num       altivec_put_h264_qpel16_h_lowpass_num
   36 #define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
   37 #define PREFIX_h264_qpel16_v_lowpass_num       altivec_put_h264_qpel16_v_lowpass_num
   38 #define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
   39 #define PREFIX_h264_qpel16_hv_lowpass_num      altivec_put_h264_qpel16_hv_lowpass_num
   40 #include "h264_template_altivec.c"
   41 #undef OP_U8_ALTIVEC
   42 #undef PREFIX_h264_chroma_mc8_altivec
   43 #undef PREFIX_h264_chroma_mc8_num
   44 #undef PREFIX_h264_qpel16_h_lowpass_altivec
   45 #undef PREFIX_h264_qpel16_h_lowpass_num
   46 #undef PREFIX_h264_qpel16_v_lowpass_altivec
   47 #undef PREFIX_h264_qpel16_v_lowpass_num
   48 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
   49 #undef PREFIX_h264_qpel16_hv_lowpass_num
  50
   /*
    * Second inclusion of the template: the "avg" flavour.
    * Same scheme as the put flavour, but OP_U8_ALTIVEC is bound to
    * AVG_OP_U8_ALTIVEC and the PREFIX_* names to their avg_* /
    * altivec_avg_* counterparts, which presumably yields
    * avg_h264_chroma_mc8_altivec and avg_h264_qpel16_{h,v,hv}_lowpass_altivec
    * (TODO confirm against h264_template_altivec.c).
    */
   51 #define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC
   52 #define PREFIX_h264_chroma_mc8_altivec         avg_h264_chroma_mc8_altivec
   53 #define PREFIX_h264_chroma_mc8_num             altivec_avg_h264_chroma_mc8_num
   54 #define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
   55 #define PREFIX_h264_qpel16_h_lowpass_num       altivec_avg_h264_qpel16_h_lowpass_num
   56 #define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
   57 #define PREFIX_h264_qpel16_v_lowpass_num       altivec_avg_h264_qpel16_v_lowpass_num
   58 #define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
   59 #define PREFIX_h264_qpel16_hv_lowpass_num      altivec_avg_h264_qpel16_hv_lowpass_num
   60 #include "h264_template_altivec.c"
   61 #undef OP_U8_ALTIVEC
   62 #undef PREFIX_h264_chroma_mc8_altivec
   63 #undef PREFIX_h264_chroma_mc8_num
   64 #undef PREFIX_h264_qpel16_h_lowpass_altivec
   65 #undef PREFIX_h264_qpel16_h_lowpass_num
   66 #undef PREFIX_h264_qpel16_v_lowpass_altivec
   67 #undef PREFIX_h264_qpel16_v_lowpass_num
   68 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
   69 #undef PREFIX_h264_qpel16_hv_lowpass_num
  70
   /*
    * H264_MC(OPNAME, SIZE, CODETYPE) defines the 16 static functions
    *
    *     OPNAME h264_qpel SIZE _mcXY_ CODETYPE (dst, src, stride)
    *
    * for X, Y in 0..3 (tokens pasted together, e.g. put_h264_qpel16_mc21_altivec).
    *
    * The put_* lowpass helpers always write into a local 16-byte-aligned
    * scratch buffer using a row pitch of SIZE; only the last call of each
    * function (prefixed with OPNAME) touches dst.  Writing "l2(a, b)" for
    * OPNAME pixels SIZE _l2_ CODETYPE (dst, a, b, ...), the variants are:
    *
    *   mc00         OPNAME pixels SIZE _ CODETYPE (dst, src, stride, SIZE)
    *   mc20 / mc02  OPNAME h_lowpass / v_lowpass of src straight into dst
    *   mc22         OPNAME hv_lowpass of src straight into dst (int16_t tmp scratch)
    *   mc10 / mc30  l2(src        / src+1,      h_lowpass(src))
    *   mc01 / mc03  l2(src        / src+stride, v_lowpass(src))
    *   mc11         l2(h_lowpass(src),          v_lowpass(src))
    *   mc31         l2(h_lowpass(src),          v_lowpass(src+1))
    *   mc13         l2(h_lowpass(src+stride),   v_lowpass(src))
    *   mc33         l2(h_lowpass(src+stride),   v_lowpass(src+1))
    *   mc21 / mc23  l2(h_lowpass(src / src+stride), hv_lowpass(src))
    *   mc12 / mc32  l2(v_lowpass(src / src+1),      hv_lowpass(src))
    *
    * NOTE(review): the *_l2_altivec helpers defined in this file read their
    * second source with a fixed 16-byte row pitch, which matches the scratch
    * buffers only for SIZE == 16 (the only size instantiated here).
    */
   71 #define H264_MC(OPNAME, SIZE, CODETYPE) \
   72 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
   73     OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
   74 }\
   75 \
   76 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
   77     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
   78     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
   79     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
   80 }\
   81 \
   82 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
   83     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
   84 }\
   85 \
   86 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
   87     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
   88     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
   89     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
   90 }\
   91 \
   92 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
   93     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
   94     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
   95     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
   96 }\
   97 \
   98 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
   99     OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
  100 }\
  101 \
  102 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  103     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
  104     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
  105     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
  106 }\
  107 \
  108 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  109     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
  110     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
  111     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
  112     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
  113     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
  114 }\
  115 \
  116 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  117     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
  118     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
  119     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
  120     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
  121     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
  122 }\
  123 \
  124 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  125     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
  126     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
  127     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
  128     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
  129     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
  130 }\
  131 \
  132 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  133     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
  134     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
  135     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
  136     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
  137     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
  138 }\
  139 \
  140 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  141     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
  142     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
  143 }\
  144 \
  145 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  146     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
  147     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
  148     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
  149     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
  150     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
  151     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
  152 }\
  153 \
  154 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  155     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
  156     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
  157     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
  158     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
  159     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
  160     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
  161 }\
  162 \
  163 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  164     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
  165     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
  166     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
  167     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
  168     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
  169     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
  170 }\
  171 \
  172 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
  173     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
  174     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
  175     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
  176     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
  177     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
  178     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
  179 }\
 180
 181 /* this code assume that stride % 16 == 0 */
 182 void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
 183    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
 184                         {((8 - x) * (8 - y)),
 185                           ((x) * (8 - y)),
 186                           ((8 - x) * (y)),
 187                           ((x) * (y))};
 188     register int i;
 189     vector unsigned char fperm;
 190     const vector signed int vABCD = vec_ld(0, ABCD);
 191     const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
 192     const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
 193     const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
 194     const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
 195     const vector signed int vzero = vec_splat_s32(0);
 196     const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
 197     const vector unsigned short v6us = vec_splat_u16(6);
 198     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
 199     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 200
 201     vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
 202     vector unsigned char vsrc0uc, vsrc1uc;
 203     vector signed short vsrc0ssH, vsrc1ssH;
 204     vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
 205     vector signed short vsrc2ssH, vsrc3ssH, psum;
 206     vector unsigned char vdst, ppsum, fsum;
 207
 208     if (((unsigned long)dst) % 16 == 0) {
 209       fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
 210                                         0x14, 0x15, 0x16, 0x17,
 211                                         0x08, 0x09, 0x0A, 0x0B,
 212                                         0x0C, 0x0D, 0x0E, 0x0F);
 213     } else {
 214       fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
 215                                         0x04, 0x05, 0x06, 0x07,
 216                                         0x18, 0x19, 0x1A, 0x1B,
 217                                         0x1C, 0x1D, 0x1E, 0x1F);
 218     }
 219
 220     vsrcAuc = vec_ld(0, src);
 221
 222     if (loadSecond)
 223       vsrcBuc = vec_ld(16, src);
 224     vsrcperm0 = vec_lvsl(0, src);
 225     vsrcperm1 = vec_lvsl(1, src);
 226
 227     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
 228     if (reallyBadAlign)
 229       vsrc1uc = vsrcBuc;
 230     else
 231       vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
 232
 233     vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
 234                                                (vector unsigned char)vsrc0uc);
 235     vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
 236                                                (vector unsigned char)vsrc1uc);
 237
 238     if (!loadSecond) {// -> !reallyBadAlign
 239       for (i = 0 ; i < h ; i++) {
 240
 241
 242         vsrcCuc = vec_ld(stride + 0, src);
 243
 244         vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
 245         vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 246
 247         vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
 248                                                 (vector unsigned char)vsrc2uc);
 249         vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
 250                                                 (vector unsigned char)vsrc3uc);
 251
 252         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
 253         psum = vec_mladd(vB, vsrc1ssH, psum);
 254         psum = vec_mladd(vC, vsrc2ssH, psum);
 255         psum = vec_mladd(vD, vsrc3ssH, psum);
 256         psum = vec_add(v28ss, psum);
 257         psum = vec_sra(psum, v6us);
 258
 259         vdst = vec_ld(0, dst);
 260         ppsum = (vector unsigned char)vec_packsu(psum, psum);
 261         fsum = vec_perm(vdst, ppsum, fperm);
 262
 263         vec_st(fsum, 0, dst);
 264
 265         vsrc0ssH = vsrc2ssH;
 266         vsrc1ssH = vsrc3ssH;
 267
 268         dst += stride;
 269         src += stride;
 270       }
 271     } else {
 272         vector unsigned char vsrcDuc;
 273       for (i = 0 ; i < h ; i++) {
 274         vsrcCuc = vec_ld(stride + 0, src);
 275         vsrcDuc = vec_ld(stride + 16, src);
 276
 277         vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
 278         if (reallyBadAlign)
 279           vsrc3uc = vsrcDuc;
 280         else
 281           vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 282
 283         vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
 284                                                 (vector unsigned char)vsrc2uc);
 285         vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
 286                                                 (vector unsigned char)vsrc3uc);
 287
 288         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
 289         psum = vec_mladd(vB, vsrc1ssH, psum);
 290         psum = vec_mladd(vC, vsrc2ssH, psum);
 291         psum = vec_mladd(vD, vsrc3ssH, psum);
 292         psum = vec_add(v28ss, psum);
 293         psum = vec_sr(psum, v6us);
 294
 295         vdst = vec_ld(0, dst);
 296         ppsum = (vector unsigned char)vec_pack(psum, psum);
 297         fsum = vec_perm(vdst, ppsum, fperm);
 298
 299         vec_st(fsum, 0, dst);
 300
 301         vsrc0ssH = vsrc2ssH;
 302         vsrc1ssH = vsrc3ssH;
 303
 304         dst += stride;
 305         src += stride;
 306       }
 307     }
 308 }
 309
 310 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
 311                                     const uint8_t * src2, int dst_stride,
 312                                     int src_stride1, int h)
 313 {
 314     int i;
 315     vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 316
 317     mask_ = vec_lvsl(0, src2);
 318
 319     for (i = 0; i < h; i++) {
 320
 321         tmp1 = vec_ld(i * src_stride1, src1);
 322         mask = vec_lvsl(i * src_stride1, src1);
 323         tmp2 = vec_ld(i * src_stride1 + 15, src1);
 324
 325         a = vec_perm(tmp1, tmp2, mask);
 326
 327         tmp1 = vec_ld(i * 16, src2);
 328         tmp2 = vec_ld(i * 16 + 15, src2);
 329
 330         b = vec_perm(tmp1, tmp2, mask_);
 331
 332         tmp1 = vec_ld(0, dst);
 333         mask = vec_lvsl(0, dst);
 334         tmp2 = vec_ld(15, dst);
 335
 336         d = vec_avg(a, b);
 337
 338         edges = vec_perm(tmp2, tmp1, mask);
 339
 340         align = vec_lvsr(0, dst);
 341
 342         tmp2 = vec_perm(d, edges, align);
 343         tmp1 = vec_perm(edges, d, align);
 344
 345         vec_st(tmp2, 15, dst);
 346         vec_st(tmp1, 0 , dst);
 347
 348         dst += dst_stride;
 349     }
 350 }
 351
 352 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
 353                                     const uint8_t * src2, int dst_stride,
 354                                     int src_stride1, int h)
 355 {
 356     int i;
 357     vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 358
 359     mask_ = vec_lvsl(0, src2);
 360
 361     for (i = 0; i < h; i++) {
 362
 363         tmp1 = vec_ld(i * src_stride1, src1);
 364         mask = vec_lvsl(i * src_stride1, src1);
 365         tmp2 = vec_ld(i * src_stride1 + 15, src1);
 366
 367         a = vec_perm(tmp1, tmp2, mask);
 368
 369         tmp1 = vec_ld(i * 16, src2);
 370         tmp2 = vec_ld(i * 16 + 15, src2);
 371
 372         b = vec_perm(tmp1, tmp2, mask_);
 373
 374         tmp1 = vec_ld(0, dst);
 375         mask = vec_lvsl(0, dst);
 376         tmp2 = vec_ld(15, dst);
 377
 378         d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
 379
 380         edges = vec_perm(tmp2, tmp1, mask);
 381
 382         align = vec_lvsr(0, dst);
 383
 384         tmp2 = vec_perm(d, edges, align);
 385         tmp1 = vec_perm(edges, d, align);
 386
 387         vec_st(tmp2, 15, dst);
 388         vec_st(tmp1, 0 , dst);
 389
 390         dst += dst_stride;
 391     }
 392 }
 393
 394 /* Implemented but could be faster
 395 #define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
 396 #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
 397  */
 398
  /* Expand H264_MC twice to define the static functions
   * put_h264_qpel16_mcXY_altivec and avg_h264_qpel16_mcXY_altivec
   * (X, Y in 0..3) that dsputil_h264_init_ppc() installs below. */
  399   H264_MC(put_, 16, altivec)
  400   H264_MC(avg_, 16, altivec)
 401
 402
 403 /****************************************************************************
 404  * IDCT transform:
 405  ****************************************************************************/
 406
  /*
   * One 8-point one-dimensional pass of the 8x8 inverse transform.
   *
   * s0..s7 are the eight input vectors and d0..d7 the lvalues that receive
   * the eight outputs; every operation is element-wise, so all lanes of the
   * vectors are transformed in parallel.  The comment above each statement
   * gives the scalar formula that statement implements.
   *
   * The macro expands to a braced block that declares its own temporaries
   * (a0v..a7v, b0v..b7v) and expects vectors named onev and twov (the shift
   * counts for >>1 and >>2) to be in scope at the expansion site.  Each
   * s-argument is used more than once, so pass plain variables.
   */
  407 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7) {\
  408     /*        a0  = SRC(0) + SRC(4); */ \
  409     vec_s16_t a0v = vec_add(s0, s4);    \
  410     /*        a2  = SRC(0) - SRC(4); */ \
  411     vec_s16_t a2v = vec_sub(s0, s4);    \
  412     /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
  413     vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6);    \
  414     /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
  415     vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2);    \
  416     /*        b0  =         a0 + a6; */ \
  417     vec_s16_t b0v = vec_add(a0v, a6v);  \
  418     /*        b2  =         a2 + a4; */ \
  419     vec_s16_t b2v = vec_add(a2v, a4v);  \
  420     /*        b4  =         a2 - a4; */ \
  421     vec_s16_t b4v = vec_sub(a2v, a4v);  \
  422     /*        b6  =         a0 - a6; */ \
  423     vec_s16_t b6v = vec_sub(a0v, a6v);  \
  424     /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
  425     /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
  426     vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
  427     /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
  428     /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
  429     vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
  430     /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
  431     /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */ \
  432     vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
  433     /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */ \
  434     vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
  435     /*        b1 =                  (a7>>2)  +  a1; */ \
  436     vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
  437     /*        b3 =          a3 +        (a5>>2); */ \
  438     vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
  439     /*        b5 =                  (a3>>2)  -   a5; */ \
  440     vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
  441     /*        b7 =           a7 -        (a1>>2); */ \
  442     vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
  443     /* DST(0,    b0 + b7); */ \
  444     d0 = vec_add(b0v, b7v); \
  445     /* DST(1,    b2 + b5); */ \
  446     d1 = vec_add(b2v, b5v); \
  447     /* DST(2,    b4 + b3); */ \
  448     d2 = vec_add(b4v, b3v); \
  449     /* DST(3,    b6 + b1); */ \
  450     d3 = vec_add(b6v, b1v); \
  451     /* DST(4,    b6 - b1); */ \
  452     d4 = vec_sub(b6v, b1v); \
  453     /* DST(5,    b4 - b3); */ \
  454     d5 = vec_sub(b4v, b3v); \
  455     /* DST(6,    b2 - b5); */ \
  456     d6 = vec_sub(b2v, b5v); \
  457     /* DST(7,    b0 - b7); */ \
  458     d7 = vec_sub(b0v, b7v); \
  459 }
 460
  /*
   * Add one row of transform output to 8 destination pixels.
   *
   *   dest      address of the 8 pixels to update
   *   idctv     eight 16-bit values to add after an arithmetic >> by sixv
   *   perm_ldv  permute used to realign the pixels loaded from dest
   *   perm_stv  permute used to move the result (and the sel mask) into
   *             position for the aligned stores
   *   sel       byte mask from which the vec_sel masks are derived
   *
   * Steps: load the aligned vectors at dest+0 and dest+7 and realign them,
   * widen the first 8 bytes to 16 bits, add the shifted idctv with signed
   * saturation (vec_adds), pack back to bytes with unsigned saturation
   * (vec_packsu), then merge the result into the two aligned vectors with
   * vec_sel and store them.
   *
   * Statement order matters: the vector at dest+7 is stored first and the
   * vector at dest+0 is re-loaded only afterwards - presumably so that the
   * second merge sees the first store when both offsets fall into the same
   * aligned vector.
   *
   * Expands to a braced block; expects sixv, zero_u8v and zero_s16v to be in
   * scope at the expansion site.  dest is evaluated several times, so pass
   * an expression without side effects.
   */
  461 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
  462     /* unaligned load */                                       \
  463     vec_u8_t hv = vec_ld( 0, dest );                           \
  464     vec_u8_t lv = vec_ld( 7, dest );                           \
  465     vec_u8_t dstv   = vec_perm( hv, lv, (vec_u8_t)perm_ldv );  \
  466     vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
  467     vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);   \
  468     vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
  469     vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
  470     vec_u8_t edgehv;                                           \
  471     /* unaligned store */                                      \
  472     vec_u8_t bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
  473     vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
  474     lv    = vec_sel( lv, bodyv, edgelv );                      \
  475     vec_st( lv, 7, dest );                                     \
  476     hv    = vec_ld( 0, dest );                                 \
  477     edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
  478     hv    = vec_sel( hv, bodyv, edgehv );                      \
  479     vec_st( hv, 0, dest );                                     \
  480  }
 481
 482 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
 483     vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
 484     vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
 485     vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 486
 487     vec_u8_t perm_ldv = vec_lvsl(0, dst);
 488     vec_u8_t perm_stv = vec_lvsr(8, dst);
 489
 490     const vec_u16_t onev = vec_splat_u16(1);
 491     const vec_u16_t twov = vec_splat_u16(2);
 492     const vec_u16_t sixv = vec_splat_u16(6);
 493
 494     const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
 495                                         -1,-1,-1,-1,-1,-1,-1,-1);
 496     LOAD_ZERO;
 497
 498     dct[0] += 32; // rounding for the >>6 at the end
 499
 500     s0 = vec_ld(0x00, (int16_t*)dct);
 501     s1 = vec_ld(0x10, (int16_t*)dct);
 502     s2 = vec_ld(0x20, (int16_t*)dct);
 503     s3 = vec_ld(0x30, (int16_t*)dct);
 504     s4 = vec_ld(0x40, (int16_t*)dct);
 505     s5 = vec_ld(0x50, (int16_t*)dct);
 506     s6 = vec_ld(0x60, (int16_t*)dct);
 507     s7 = vec_ld(0x70, (int16_t*)dct);
 508
 509     IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
 510                      d0, d1, d2, d3, d4, d5, d6, d7);
 511
 512     TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );
 513
 514     IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
 515                      idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
 516
 517     ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
 518     ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
 519     ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
 520     ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
 521     ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
 522     ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
 523     ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
 524     ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
 525 }
 526
 527 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
 528
 529 #ifdef HAVE_ALTIVEC
 530   if (has_altivec()) {
 531     c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
 532     c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
 533     c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
 534     c->h264_idct8_add = ff_h264_idct8_add_altivec;
 535
 536 #define dspfunc(PFX, IDX, NUM) \
 537     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
 538     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
 539     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
 540     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
 541     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
 542     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
 543     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
 544     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
 545     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
 546     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
 547     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
 548     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
 549     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
 550     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
 551     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
 552     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
 553
 554     dspfunc(put_h264_qpel, 0, 16);
 555     dspfunc(avg_h264_qpel, 0, 16);
 556 #undef dspfunc
 557
 558   } else
 559 #endif /* HAVE_ALTIVEC */
 560   {
 561     // Non-AltiVec PPC optimisations
 562
 563     // ... pending ...
 564   }
 565 }