git.sesse.net Git - ffmpeg/blob - libavcodec/ppc/pixblockdsp.c

   1 /*
   2  * Copyright (c) 2002 Brian Foley
   3  * Copyright (c) 2002 Dieter Shirley
   4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "config.h"
  24 #if HAVE_ALTIVEC_H
  25 #include <altivec.h>
  26 #endif
  27
  28 #include "libavutil/attributes.h"
  29 #include "libavutil/cpu.h"
  30 #include "libavutil/ppc/cpu.h"
  31 #include "libavutil/ppc/types_altivec.h"
  32 #include "libavutil/ppc/util_altivec.h"
  33 #include "libavcodec/avcodec.h"
  34 #include "libavcodec/pixblockdsp.h"
  35
  36 #if HAVE_ALTIVEC
  37
  38 #if HAVE_VSX
  39 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
  40                                ptrdiff_t stride)
  41 {
  42     int i;
  43     vector unsigned char perm =
  44         (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
  45             0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
  46     const vector unsigned char zero =
  47         (const vector unsigned char) vec_splat_u8(0);
  48
  49     for (i = 0; i < 8; i++) {
  50         /* Read potentially unaligned pixels.
  51          * We're reading 16 pixels, and actually only want 8,
  52          * but we simply ignore the extras. */
  53         vector unsigned char bytes = vec_vsx_ld(0, pixels);
  54
  55         // Convert the bytes into shorts.
  56         //vector signed short shorts = (vector signed short) vec_perm(zero, bytes, perm);
  57         vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
  58
  59         // Save the data to the block, we assume the block is 16-byte aligned.
  60         vec_vsx_st(shorts, i * 16, (vector signed short *) block);
  61
  62         pixels += stride;
  63     }
  64 }
  65 #else
  66 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
  67                                ptrdiff_t stride)
  68 {
  69     int i;
  70     const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
  71
  72     for (i = 0; i < 8; i++) {
  73         vec_u8 perm = vec_lvsl(0, pixels);
  74         /* Read potentially unaligned pixels.
  75          * We're reading 16 pixels, and actually only want 8,
  76          * but we simply ignore the extras. */
  77         vec_u8 pixl = vec_ld(0, pixels);
  78         vec_u8 pixr = vec_ld(7, pixels);
  79         vec_u8 bytes = vec_perm(pixl, pixr, perm);
  80
  81         // Convert the bytes into shorts.
  82         vec_s16 shorts = (vec_s16)vec_mergeh(zero, bytes);
  83
  84         // Save the data to the block, we assume the block is 16-byte aligned.
  85         vec_st(shorts, i * 16, (vec_s16 *)block);
  86
  87         pixels += stride;
  88     }
  89 }
  90
  91 #endif /* HAVE_VSX */
  92
  93 #if HAVE_VSX
  94 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
  95                                 const uint8_t *s2, ptrdiff_t stride)
  96 {
  97   int i;
  98   const vector unsigned char zero =
  99     (const vector unsigned char) vec_splat_u8(0);
 100   vector signed short shorts1, shorts2;
 101
 102   for (i = 0; i < 4; i++) {
 103     /* Read potentially unaligned pixels.
 104      * We're reading 16 pixels, and actually only want 8,
 105      * but we simply ignore the extras. */
 106     vector unsigned char bytes = vec_vsx_ld(0,  s1);
 107
 108     // Convert the bytes into shorts.
 109     shorts1 = (vector signed short) vec_mergeh(bytes, zero);
 110
 111     // Do the same for the second block of pixels.
 112     bytes =vec_vsx_ld(0,  s2);
 113
 114     // Convert the bytes into shorts.
 115     shorts2 = (vector signed short) vec_mergeh(bytes, zero);
 116
 117     // Do the subtraction.
 118     shorts1 = vec_sub(shorts1, shorts2);
 119
 120     // Save the data to the block, we assume the block is 16-byte aligned.
 121     vec_vsx_st(shorts1, 0, (vector signed short *) block);
 122
 123     s1    += stride;
 124     s2    += stride;
 125     block += 8;
 126
 127     /* The code below is a copy of the code above...
 128      * This is a manual unroll. */
 129
 130     /* Read potentially unaligned pixels.
 131      * We're reading 16 pixels, and actually only want 8,
 132      * but we simply ignore the extras. */
 133     bytes = vec_vsx_ld(0,  s1);
 134
 135     // Convert the bytes into shorts.
 136     shorts1 = (vector signed short) vec_mergeh(bytes, zero);
 137
 138     // Do the same for the second block of pixels.
 139     bytes = vec_vsx_ld(0,  s2);
 140
 141     // Convert the bytes into shorts.
 142     shorts2 = (vector signed short) vec_mergeh(bytes, zero);
 143
 144     // Do the subtraction.
 145     shorts1 = vec_sub(shorts1, shorts2);
 146
 147     // Save the data to the block, we assume the block is 16-byte aligned.
 148     vec_vsx_st(shorts1, 0, (vector signed short *) block);
 149
 150     s1    += stride;
 151     s2    += stride;
 152     block += 8;
 153   }
 154 }
 155 #else
 156 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
 157                                 const uint8_t *s2, ptrdiff_t stride)
 158 {
 159     int i;
 160     vec_u8 perm;
 161     const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
 162     vec_s16 shorts1, shorts2;
 163
 164     for (i = 0; i < 4; i++) {
 165         /* Read potentially unaligned pixels.
 166          * We're reading 16 pixels, and actually only want 8,
 167          * but we simply ignore the extras. */
 168         perm = vec_lvsl(0, s1);
 169         vec_u8 pixl  = vec_ld(0,  s1);
 170         vec_u8 pixr  = vec_ld(15, s1);
 171         vec_u8 bytes = vec_perm(pixl, pixr, perm);
 172
 173         // Convert the bytes into shorts.
 174         shorts1 = (vec_s16)vec_mergeh(zero, bytes);
 175
 176         // Do the same for the second block of pixels.
 177         perm = vec_lvsl(0, s2);
 178         pixl  = vec_ld(0,  s2);
 179         pixr  = vec_ld(15, s2);
 180         bytes = vec_perm(pixl, pixr, perm);
 181
 182         // Convert the bytes into shorts.
 183         shorts2 = (vec_s16)vec_mergeh(zero, bytes);
 184
 185         // Do the subtraction.
 186         shorts1 = vec_sub(shorts1, shorts2);
 187
 188         // Save the data to the block, we assume the block is 16-byte aligned.
 189         vec_st(shorts1, 0, (vec_s16 *)block);
 190
 191         s1    += stride;
 192         s2    += stride;
 193         block += 8;
 194
 195         /* The code below is a copy of the code above...
 196          * This is a manual unroll. */
 197
 198         /* Read potentially unaligned pixels.
 199          * We're reading 16 pixels, and actually only want 8,
 200          * but we simply ignore the extras. */
 201         perm = vec_lvsl(0, s1);
 202         pixl  = vec_ld(0,  s1);
 203         pixr  = vec_ld(15, s1);
 204         bytes = vec_perm(pixl, pixr, perm);
 205
 206         // Convert the bytes into shorts.
 207         shorts1 = (vec_s16)vec_mergeh(zero, bytes);
 208
 209         // Do the same for the second block of pixels.
 210         perm = vec_lvsl(0, s2);
 211         pixl  = vec_ld(0,  s2);
 212         pixr  = vec_ld(15, s2);
 213         bytes = vec_perm(pixl, pixr, perm);
 214
 215         // Convert the bytes into shorts.
 216         shorts2 = (vec_s16)vec_mergeh(zero, bytes);
 217
 218         // Do the subtraction.
 219         shorts1 = vec_sub(shorts1, shorts2);
 220
 221         // Save the data to the block, we assume the block is 16-byte aligned.
 222         vec_st(shorts1, 0, (vec_s16 *)block);
 223
 224         s1    += stride;
 225         s2    += stride;
 226         block += 8;
 227     }
 228 }
 229
 230 #endif /* HAVE_VSX */
 231
 232 #endif /* HAVE_ALTIVEC */
 233
 234 #if HAVE_VSX
 235 static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
 236                            ptrdiff_t stride)
 237 {
 238     int i;
 239     for (i = 0; i < 8; i++) {
 240         vec_s16 shorts = vsx_ld_u8_s16(0, pixels);
 241
 242         vec_vsx_st(shorts, i * 16, block);
 243
 244         pixels += stride;
 245     }
 246 }
 247
 248 static void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1,
 249                             const uint8_t *s2, ptrdiff_t stride)
 250 {
 251     int i;
 252     vec_s16 shorts1, shorts2;
 253     for (i = 0; i < 8; i++) {
 254         shorts1 = vsx_ld_u8_s16(0, s1);
 255         shorts2 = vsx_ld_u8_s16(0, s2);
 256
 257         shorts1 = vec_sub(shorts1, shorts2);
 258
 259         vec_vsx_st(shorts1, 0, block);
 260
 261         s1    += stride;
 262         s2    += stride;
 263         block += 8;
 264     }
 265 }
 266 #endif /* HAVE_VSX */
 267
 268 av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
 269                                      AVCodecContext *avctx,
 270                                      unsigned high_bit_depth)
 271 {
 272 #if HAVE_ALTIVEC
 273     if (!PPC_ALTIVEC(av_get_cpu_flags()))
 274         return;
 275
 276     c->diff_pixels = diff_pixels_altivec;
 277
 278     if (!high_bit_depth) {
 279         c->get_pixels = get_pixels_altivec;
 280     }
 281 #endif /* HAVE_ALTIVEC */
 282
 283 #if HAVE_VSX
 284     if (!PPC_VSX(av_get_cpu_flags()))
 285         return;
 286
 287     c->diff_pixels = diff_pixels_vsx;
 288
 289     if (!high_bit_depth)
 290         c->get_pixels = get_pixels_vsx;
 291 #endif /* HAVE_VSX */
 292 }