/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>

#include <assert.h>

#include "copy.h"

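/* This file implements fast copies from NV12 and YV12 video surfaces into
 * VLC pictures.  Both are 4:2:0 layouts: YV12 carries three separate planes
 * (Y, then V, then U at half resolution in each dimension), while NV12
 * carries a full-size Y plane followed by a single half-size plane of
 * interleaved U/V byte pairs, which is why the NV12 path needs the
 * SplitPlanes()/SSE_SplitUV() helpers below. */
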
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = __MAX((width + 0x3f) & ~ 0x3f, 4096);
    cache->buffer = vlc_memalign(64, cache->size);
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache; (void) width;
#endif
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    vlc_free(cache->buffer);
    cache->buffer = NULL;
    cache->size   = 0;
#else
    (void) cache;
#endif
}

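/* Typical usage, as a minimal sketch (the surrounding decoder logic and the
 * 1280x720 geometry are illustrative assumptions, not part of this file):
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, 1280))   // width in pixels
 *         return VLC_EGENERIC;
 *     // ... for each decoded NV12 frame:
 *     //     CopyFromNv12(picture, planes, pitches, 1280, 720, &cache);
 *     CopyCleanCache(&cache);
 */
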
#ifdef CAN_COMPILE_SSE2

/* Copy 16/64 bytes from srcp to dstp, loading data with the SSE>=2
 * instruction "load" and storing data with the SSE>=2 instruction "store".
 */
#define COPY16(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

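/* The load/store mnemonics are passed in as strings and spliced into the asm
 * template, so these two macros cover every combination used below: aligned
 * (movdqa) and unaligned (movdqu) accesses, plus the non-temporal variants
 * (movntdqa loads from USWC memory, movntdq streaming stores). */
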
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

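/* With the redefinitions above, each SSE helper below tests the "cpu" flag
 * word it receives from vlc_CPU() at run time; when the compiler is already
 * targeting that ISA level (e.g. -mssse3 defines __SSSE3__), the stock
 * vlc_CPU_*() checks from <vlc_cpu.h> are presumably constant-folded and are
 * left untouched. */
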
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
#ifndef CAN_COMPILE_SSSE3
    (void) cpu;
#endif
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

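    /* Each row is copied in up to three steps: an unaligned 16-byte head to
     * reach source alignment, 64-byte blocks using aligned loads (movntdqa
     * under SSE4.1 is what actually makes USWC reads fast), and a bytewise
     * tail. */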
    for (unsigned y = 0; y < height; y++) {
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = unaligned;

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    asm volatile ("mfence");
}

static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

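    /* The source here is the staging cache, hence always aligned (movdqa
     * loads).  Stores are non-temporal (movntdq) when the destination is
     * aligned, so the copied pixels do not evict useful cache lines, and
     * plain movdqu otherwise. */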
    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, unsigned cpu)
{
#ifndef CAN_COMPILE_SSSE3
    (void) cpu;
#endif
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

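    /* Two ways to deinterleave 64 bytes of UV pairs per iteration: with
     * SSSE3, one pshufb per register gathers the U bytes into the low
     * quadword and the V bytes into the high one; without it, pand with the
     * mask keeps the even (U) bytes while psrlw $8 brings the odd (V) bytes
     * down, and packuswb repacks words into bytes.  The two variants leave U
     * and V in opposite halves, which is why the [dst1]/[dst2] bindings are
     * swapped between them. */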
    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3()) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

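    /* The plane is processed in blocks of hstep rows staged through the
     * 64-byte-aligned cache buffer: CopyFromUswc() drains the USWC surface
     * sequentially, then Copy2d() pushes the now-cacheable rows to the
     * destination with streaming stores. */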
    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock);

        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
}

static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

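    /* Same blocking scheme as SSE_CopyPlane(), except each staged row holds
     * 2*width bytes of interleaved UV pairs that SSE_SplitUV() then
     * separates into the two destination planes. */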
    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16, src, src_pitch,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w16, width, hblock, cpu);

        src  += src_pitch * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
}

static void SSE_CopyFromNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  width, height, cpu);
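    /* In a YV12 picture plane 1 is V and plane 2 is U, so the U halves of
     * the NV12 byte pairs go to dst->p[2] and the V halves to dst->p[1];
     * the chroma plane is half-size in both dimensions, rounded up. */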
    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                    dst->p[1].p_pixels, dst->p[1].i_pitch,
                    src[1], src_pitch[1],
                    cache->buffer, cache->size,
                    (width+1)/2, (height+1)/2, cpu);
    asm volatile ("emms");
}

static void SSE_CopyFromYv12(picture_t *dst,
                             uint8_t *src[3], size_t src_pitch[3],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
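    /* Plane 0 is the full-size luma; planes 1 and 2 are the 4:2:0 chroma
     * planes, copied at half size in each dimension, rounded up. */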
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n],
                      cache->buffer, cache->size,
                      (width+d-1)/d, (height+d-1)/d, cpu);
    }
    asm volatile ("emms");
}

#undef COPY64
#endif /* CAN_COMPILE_SSE2 */

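/* Plain C fallbacks, used when SSE2 is not available at build or run time */
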
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, width);
        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();

    if (vlc_CPU_SSE2())
        return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              width, height);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                (width+1)/2, (height+1)/2);
}

void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();

    if (vlc_CPU_SSE2())
        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], width, height);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], width / 2, height / 2);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], width / 2, height / 2);
}