git.sesse.net Git - ffmpeg/blob - libavfilter/x86/vf_hflip.asm

   1 ;*****************************************************************************
   2 ;* x86-optimized functions for hflip filter
   3 ;*
   4 ;* Copyright (C) 2017 Paul B Mahol
   5 ;*
   6 ;* This file is part of FFmpeg.
   7 ;*
   8 ;* FFmpeg is free software; you can redistribute it and/or
   9 ;* modify it under the terms of the GNU Lesser General Public
  10 ;* License as published by the Free Software Foundation; either
  11 ;* version 2.1 of the License, or (at your option) any later version.
  12 ;*
  13 ;* FFmpeg is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 ;* Lesser General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU Lesser General Public
  19 ;* License along with FFmpeg; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21 ;*****************************************************************************
  22
  23 %include "libavutil/x86/x86util.asm"
  24
  25 SECTION_RODATA
  26
  27 pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  28 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
  29
  30 SECTION .text
  31
  32 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
  33 %macro HFLIP 3
  34 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
  35     VBROADCASTI128    m0, [pb_flip_%1]
  36     xor               xq, xq
  37 %if %3 == 1
  38     movsxdifnidn wq, wd
  39 %else ; short
  40     add     wd, wd
  41 %endif
  42     mov     rq, wq
  43     and     rq, 2 * mmsize - 1
  44     cmp     wq, 2 * mmsize
  45     jl .loop1
  46     sub     wq, rq
  47
  48     .loop0:
  49         neg     xq
  50 %if mmsize == 32
  51         vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
  52         vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
  53 %else
  54         movu    m1, [srcq + xq -     mmsize + %3]
  55         movu    m2, [srcq + xq - 2 * mmsize + %3]
  56 %endif
  57         pshufb  m1, m0
  58         pshufb  m2, m0
  59         neg     xq
  60         movu    [dstq + xq         ], m1
  61         movu    [dstq + xq + mmsize], m2
  62         add     xq, mmsize * 2
  63         cmp     xq, wq
  64         jl .loop0
  65
  66     cmp    rq, 0
  67     je .end
  68     add    wq, rq
  69
  70     .loop1:
  71         neg    xq
  72         mov    r%2, [srcq + xq]
  73         neg    xq
  74         mov    [dstq + xq], r%2
  75         add    xq, %3
  76         cmp    xq, wq
  77         jl .loop1
  78     .end:
  79         RET
  80 %endmacro
  81
  82 INIT_XMM ssse3
  83 HFLIP byte, b, 1
  84 HFLIP short, w, 2
  85
  86 %if HAVE_AVX2_EXTERNAL
  87 INIT_YMM avx2
  88 HFLIP byte, b, 1
  89 HFLIP short, w, 2
  90 %endif