1 ;************************************************************************
2 ;* SIMD-optimized lossless video encoding functions
3 ;* Copyright (c) 2000, 2001 Fabrice Bellard
4 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 ;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 ;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
9 ;* This file is part of FFmpeg.
11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* FFmpeg is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 %include "libavutil/x86/x86util.asm"
30 ; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; NOTE(review): the remainder of the C prototype comment (the width
; parameter) is elided in this excerpt.
32 %macro DIFF_BYTES_PROLOGUE 0
; Declares the diff_bytes entry point: 5 GPRs, 2 vector regs.
; Two cglobal variants exist; the preprocessor conditional selecting
; between them is elided here.  Presumably the 3-arg form is for targets
; where the width must be loaded manually (x86_32) and the 4-arg form
; receives it as the named register arg `w` -- confirm against full file.
34 cglobal diff_bytes, 3,5,2, dst, src1, src2
39 cglobal diff_bytes, 4,5,2, dst, src1, src2, w
45 ; label to jump to if w < regsize
46 %macro DIFF_BYTES_LOOP_PREP 1
; Prepares state for entering a vector loop of the current `regsize`;
; per the comment above, %1 is the label taken when fewer than regsize
; bytes remain.  The macro body is not visible in this excerpt.
56 ; mov type used for src1q, dstq, first reg, second reg
57 %macro DIFF_BYTES_LOOP_CORE 4
; One unrolled step of the diff loop: bytewise dst[i] = src1[i] - src2[i]
; (psubb), one vector register per statement.  %1/%2 select aligned vs
; unaligned mov forms ("a"/"u"); %3/%4 are the two scratch vector regs.
; Several lines of the macro body are elided in this excerpt.
;
; Path with src2 directly as a psubb memory operand (requires aligned src2):
60 mov%1 %4, [src1q + i + regsize]
62 psubb %4, [src2q + i + regsize]
64 mov%2 [regsize + dstq + i], %4
66 ; SSE enforces alignment of psubb operand
; Unaligned-src2 path: load src2 through movu into a register first,
; because an SSE psubb *memory* operand must be 16-byte aligned.  The
; psubb between %3 and %4 is elided from this view.
71 mov%1 %3, [src1q + i + regsize]
72 movu %4, [src2q + i + regsize]
74 mov%2 [regsize + dstq + i], %3
78 %macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
; Drives the full subtraction: a main loop at the current vector width,
; then progressively narrower fallbacks for the remaining tail bytes.
; Large parts of the macro body (loop labels, counters, branches) are
; elided in this excerpt.
;
; Main pass: full-width vector registers (m0/m1), 2*regsize per iteration
; (two LOOP_CORE register slots per trip -- inferred from the masks below).
79 %define regsize mmsize
81 DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
; keep only the sub-(2*regsize) remainder of the width in wq
85 and wq, 2 * regsize - 1
88 ; fall back to narrower xmm
; Half-width pass using the xmm views of the same registers.
89 %define regsize mmsize / 2
90 DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
92 DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
96 and wq, 2 * regsize - 1
; Scalar tail: one byte at a time through a GPR byte register
; (t0b), dst[w] = src1[w] - src2[w]; the store and loop control are
; elided from this view.
104 mov t0b, [src1q + wq]
105 sub t0b, [src2q + wq]
; NOTE(review): the lines below are fragments of the per-ISA diff_bytes
; instantiations (the INIT_MMX/INIT_XMM/INIT_YMM directives and the
; DIFF_BYTES_BODY invocations between them are elided).  The recurring
; `test dstq/src1q, regsize - 1` pairs presumably dispatch between
; aligned ("a") and unaligned ("u") loop variants based on the pointer
; alignment of dst/src1 -- confirm against the full file.
;
; First instantiation fragment: all-aligned (.skip_main_aa) path.
116 %define regsize mmsize
117 DIFF_BYTES_LOOP_PREP .skip_main_aa
; Second instantiation fragment: alignment-checking dispatch.
124 %define regsize mmsize
125 DIFF_BYTES_LOOP_PREP .skip_main_aa
126 test dstq, regsize - 1
128 test src1q, regsize - 1
; AVX2 build of diff_bytes, emitted only when the assembler supports it.
135 %if HAVE_AVX2_EXTERNAL
138 %define regsize mmsize
139 ; Directly using unaligned SSE2 version is marginally faster than
140 ; branching based on arguments.
141 DIFF_BYTES_LOOP_PREP .skip_main_uu
142 test dstq, regsize - 1
144 test src1q, regsize - 1