1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
28 #include <vlc_common.h>
29 #include <vlc_picture.h>
/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 * Four 16-byte XMM transfers per invocation; dstp/srcp must satisfy the
 * alignment requirement of the chosen instructions (movdqa/movntdqa/movntdq
 * need 16-byte aligned operands, movdqu does not). */
#define COPY64(dstp, srcp, load, store) \
    load " 0(%[src]), %%xmm1\n" \
    load " 16(%[src]), %%xmm2\n" \
    load " 32(%[src]), %%xmm3\n" \
    load " 48(%[src]), %%xmm4\n" \
    store " %%xmm1, 0(%[dst])\n" \
    store " %%xmm2, 16(%[dst])\n" \
    store " %%xmm3, 32(%[dst])\n" \
    store " %%xmm4, 48(%[dst])\n" \
    : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
/* Map the vlc_CPU_SSE* feature tests onto the `cpu` flag word that every
 * helper in this file receives, so the branches test the caller-provided
 * flags instead of re-querying the CPU each time. */
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
/* SSE2 guaranteed at build time: emit op unconditionally. */
# define ASM_SSE2(cpu, op) asm volatile (op)
/* Runtime-checked variant: emit op only when the cpu flags report SSE2. */
# define ASM_SSE2(cpu, op) do { \
    if (cpu & VLC_CPU_SSE2) \
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
/* SSE2 cannot be compiled at all: op degenerates to a no-op. */
# define ASM_SSE2(cpu, op)
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 * Copies width bytes per row for height rows, src_pitch/dst_pitch apart.
 * dst must be 16-byte aligned with a 16-byte multiple pitch (asserted);
 * src may be arbitrarily aligned — a scalar prologue realigns each row. */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
    /* Aligned SSE stores below require a 16-byte aligned destination. */
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
    /* Full memory fence before streaming from USWC memory (SSE2 only). */
    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Number of leading bytes until src reaches a 16-byte boundary. */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        /* Scalar copy of the unaligned head of the row. */
        for (; x < unaligned; x++)
#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            /* movntdqa: streaming (non-temporal) load — the efficient way
             * to read USWC memory; aligned vs unaligned store chosen per
             * destination alignment. */
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
#ifdef CAN_COMPILE_SSE2
        /* SSE2 fallback: plain aligned loads instead of streaming loads. */
        if (vlc_CPU_SSE2()) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        /* Scalar copy of the remaining tail (< 64 bytes) of the row. */
        for (; x < width; x++)
/* Plain 2D copy from ordinary (cacheable) memory, e.g. from the bounce
 * cache filled by CopyFromUswc, into the destination picture.
 * src must be 16-byte aligned with a 16-byte multiple pitch (asserted);
 * dst alignment is detected per row. */
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
    /* Aligned SSE loads below require a 16-byte aligned source. */
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
    /* Full memory fence before the copy (SSE2 only). */
    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
#ifdef CAN_COMPILE_SSE2
        /* Pick the store flavour from the destination's row alignment. */
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (vlc_CPU_SSE2()) {
            /* movntdq: non-temporal store — avoids polluting the cache
             * with pixels that will not be re-read (aligned dst path). */
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        /* Scalar copy of the remaining tail of the row. */
        for (; x < width; x++)
/* De-interleave an NV12-style packed chroma plane (U0 V0 U1 V1 ...) into
 * two separate planes: even bytes go to dstu, odd bytes to dstv.
 * width counts output samples per plane, so 2*width source bytes are read
 * per row. src must be 16-byte aligned (asserted). */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
    /* pshufb control: gathers even source bytes into the low 8 bytes of
     * an XMM register and odd bytes into the high 8 (SSSE3 path). */
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    /* pand mask keeping only even (U) bytes of each 16-bit pair
     * (SSE2 path). */
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
    /* Full memory fence before the split (SSE2 only). */
    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Load 64 interleaved source bytes into xmm0-xmm3. */
        "movdqa 0(%[src]), %%xmm0\n" \
        "movdqa 16(%[src]), %%xmm1\n" \
        "movdqa 32(%[src]), %%xmm2\n" \
        "movdqa 48(%[src]), %%xmm3\n"
        /* Write low halves (separated U) to dst1 and high halves
         * (separated V) to dst2, 32 bytes to each. */
        "movq %%xmm0, 0(%[dst1])\n" \
        "movq %%xmm1, 8(%[dst1])\n" \
        "movhpd %%xmm0, 0(%[dst2])\n" \
        "movhpd %%xmm1, 8(%[dst2])\n" \
        "movq %%xmm2, 16(%[dst1])\n" \
        "movq %%xmm3, 24(%[dst1])\n" \
        "movhpd %%xmm2, 16(%[dst2])\n" \
        "movhpd %%xmm3, 24(%[dst2])\n"
#ifdef CAN_COMPILE_SSSE3
        /* SSSE3 path: one pshufb per register splits even/odd bytes. */
        for (x = 0; x < (width & ~31); x += 32) {
            "movdqu (%[shuffle]), %%xmm7\n"
            "pshufb %%xmm7, %%xmm0\n"
            "pshufb %%xmm7, %%xmm1\n"
            "pshufb %%xmm7, %%xmm2\n"
            "pshufb %%xmm7, %%xmm3\n"
            : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
#ifdef CAN_COMPILE_SSE2
        /* SSE2 path: mask + pack to separate the byte pairs.
         * NOTE(review): operand names are swapped here — dst2 is bound to
         * dstu and dst1 to dstv, presumably matching how this sequence
         * packs U vs V halves; confirm against the full asm body. */
        if (vlc_CPU_SSE2()) {
            for (x = 0; x < (width & ~31); x += 32) {
                "movdqu (%[mask]), %%xmm7\n"
                "movdqa %%xmm0, %%xmm4\n"
                "movdqa %%xmm1, %%xmm5\n"
                "movdqa %%xmm2, %%xmm6\n"
                "pand %%xmm7, %%xmm4\n"
                "pand %%xmm7, %%xmm5\n"
                "pand %%xmm7, %%xmm6\n"
                "packuswb %%xmm4, %%xmm0\n"
                "packuswb %%xmm5, %%xmm1\n"
                "pand %%xmm3, %%xmm7\n"
                "packuswb %%xmm6, %%xmm2\n"
                "packuswb %%xmm7, %%xmm3\n"
                : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
        /* Scalar tail: split the remaining interleaved pairs. */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
/* Copy one full plane from (possibly USWC) source memory to the
 * destination, going through the 16-byte-aligned bounce cache in blocks
 * of hstep rows: USWC -> cache (CopyFromUswc), cache -> dst (Copy2d). */
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
    /* Cache row pitch: width rounded up to a multiple of 16 bytes. */
    const unsigned w16 = (width+15) & ~15;
    /* Number of whole rows that fit in the bounce cache per iteration. */
    const unsigned hstep = cache_size / w16;
    for (unsigned y = 0; y < height; y += hstep) {
        /* Last block may be shorter than hstep. */
        const unsigned hblock = __MIN(hstep, height - y);
        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16,
        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
        /* Advance both pointers past the rows just copied. */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    /* Final fence so the non-temporal stores are globally visible. */
    ASM_SSE2(cpu, "mfence");
/* Copy an interleaved NV12 chroma plane from (possibly USWC) source
 * memory and split it into separate U and V planes, going through the
 * bounce cache block-by-block: USWC -> cache, then cache -> dstu/dstv
 * via SplitUV. width/height are per-output-plane dimensions. */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
    /* Cache row pitch: 2*width source bytes rounded up to 16. */
    const unsigned w2_16 = (2*width+15) & ~15;
    /* Number of whole interleaved rows that fit in the bounce cache. */
    const unsigned hstep = cache_size / w2_16;
    for (unsigned y = 0; y < height; y += hstep) {
        /* Last block may be shorter than hstep. */
        const unsigned hblock = __MIN(hstep, height - y);
        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w2_16,
                     2*width, hblock, cpu);
        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch,
        /* Advance all three pointers past the rows just processed. */
        src += src_pitch * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    /* Final fence so the non-temporal stores are globally visible. */
    ASM_SSE2(cpu, "mfence");
/* Allocate the bounce cache used by CopyFromNv12/CopyFromYv12.
 * The buffer is 16-byte aligned (required by the SSE copy loops) and sized
 * to at least one 16-byte-rounded row of `width` bytes, minimum 4 KiB.
 * Returns an error code when the allocation fails — TODO confirm exact
 * return convention against the full function body. */
int CopyInitCache(copy_cache_t *cache, unsigned width)
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->buffer = vlc_memalign(16, cache->size);
/* Release the bounce cache allocated by CopyInitCache.
 * The pointer is reset to NULL so a double clean is harmless. */
void CopyCleanCache(copy_cache_t *cache)
    vlc_free(cache->buffer);
    cache->buffer = NULL;
/* Copy an NV12 frame (luma plane + interleaved UV plane) from `src` into
 * the planar picture `dst`: the luma plane is copied as-is, the packed
 * chroma plane is de-interleaved into two half-size planes.
 * NOTE(review): dstu goes to dst->p[2] and dstv to dst->p[1] — presumably
 * the YV12 (Y, V, U) plane order of the destination; verify with callers. */
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
    /* Query CPU features once; every helper branches on these flags. */
    const unsigned cpu = vlc_CPU();
    /* Luma: plain plane copy at full resolution. */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
    /* Chroma: split interleaved UV into separate half-size planes. */
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);
    /* Clear MMX state after the SIMD copies (SSE2 builds only). */
    ASM_SSE2(cpu, "emms");
/* Copy a planar YV12 frame from the three `src` planes into the planar
 * picture `dst`. Plane 0 (luma) is full size; planes 1 and 2 (chroma)
 * are copied at half width and half height (4:2:0 subsampling). */
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
    /* Query CPU features once; every helper branches on these flags. */
    const unsigned cpu = vlc_CPU();
    for (unsigned n = 0; n < 3; n++) {
        /* d == 2 halves the dimensions for the two chroma planes. */
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    /* Clear MMX state after the SIMD copies (SSE2 builds only). */
    ASM_SSE2(cpu, "emms");