/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

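/* Allocate the 16-byte-aligned bounce buffer used by the SSE2 copy paths
 * below. It is sized to hold at least one line padded to a multiple of
 * 16 bytes, and never less than 4096 bytes. Without SSE2 support no cache
 * is needed. */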
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->buffer = vlc_memalign(16, cache->size);
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache; (void) width;
#endif
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    vlc_free(cache->buffer);
    cache->buffer = NULL;
    cache->size   = 0;
#else
    (void) cache;
#endif
}

#ifdef CAN_COMPILE_SSE2
/* Copy 64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction given by "load" and storing it with the one given by "store".
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

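/* When the target instruction set is not guaranteed at compile time
 * (no __SSE2__/__SSSE3__/__SSE4_1__), test the runtime CPU capability
 * flags passed in via the local "cpu" variable instead. */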
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available: movntdqa
 * performs streaming loads, which is what makes reading back from
 * write-combining memory fast.
 */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = 0;

        for (; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

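/* Copy from the cacheable bounce buffer to the destination, using movntdq
 * streaming stores on aligned lines so the copy does not evict useful data
 * from the CPU caches. */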
VLC_SSE
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

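/* De-interleave a packed UVUVUV... plane (as in NV12) into two separate
 * U and V planes. */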
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, unsigned cpu)
{
    VLC_UNUSED(cpu);
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

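        /* With SSSE3, a single pshufb per register gathers the even (U)
         * bytes into the low quadword and the odd (V) bytes into the high
         * quadword. The SSE2 fallback gets the same split from a
         * mask/shift/pack sequence, but packuswb leaves the odd (V) bytes
         * in the low quadword, hence the swapped dst1/dst2 operands below. */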
#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3())
        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

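/* Copy a plane in blocks sized to fit the bounce buffer: each block of
 * lines is read from USWC memory into the buffer (with streaming loads
 * when SSE4.1 is available), then written out with streaming stores. */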
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
    asm volatile ("mfence");
}

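/* Same blocking scheme as SSE_CopyPlane, except that the cached lines hold
 * interleaved UV samples which are then split into the two chroma planes. */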
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16, src, src_pitch,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w2_16, width, hblock, cpu);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
    asm volatile ("mfence");
}

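/* NV12 -> YV12: the even (U) bytes of the source UV plane go to plane 2 and
 * the odd (V) bytes to plane 1, matching the Y/V/U plane order of the
 * destination picture. */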
static void SSE_CopyFromNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  width, height, cpu);
    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                    dst->p[1].p_pixels, dst->p[1].i_pitch,
                    src[1], src_pitch[1],
                    cache->buffer, cache->size,
                    width/2, height/2, cpu);
    asm volatile ("emms");
}

static void SSE_CopyFromYv12(picture_t *dst,
                             uint8_t *src[3], size_t src_pitch[3],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n],
                      cache->buffer, cache->size,
                      width/d, height/d, cpu);
    }
    asm volatile ("emms");
}
#undef COPY64
#endif /* CAN_COMPILE_SSE2 */

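/* Plain C fallbacks, used when SSE2 is unavailable at runtime or cannot be
 * compiled in. */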
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, width);
        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

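/* Public entry point: copy an NV12 picture (src[0] = Y plane, src[1] =
 * interleaved UV plane) into a planar YV12 destination, taking the SSE2
 * fast path when the CPU supports it. */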
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              width, height);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                width/2, height/2);
}

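/* Public entry point: copy a YV12 picture given as three separate planes
 * (full-resolution luma, half-resolution chroma) into the destination. */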
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], width, height);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], width / 2, height / 2);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], width / 2, height / 2);
}
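
/* A minimal usage sketch (hypothetical caller, not part of this file):
 * the cache is typically created once per decoder or filter instance and
 * reused for every frame.
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, width) != VLC_SUCCESS)
 *         return VLC_EGENERIC;
 *     ...
 *     CopyFromNv12(dst_picture, src_planes, src_pitches,
 *                  width, height, &cache);
 *     ...
 *     CopyCleanCache(&cache);
 */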