git.sesse.net Git - ffmpeg/blob - libavcodec/x86/lossless_videoencdsp.asm

   1 ;************************************************************************
   2 ;* SIMD-optimized lossless video encoding functions
   3 ;* Copyright (c) 2000, 2001 Fabrice Bellard
   4 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5 ;*
   6 ;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
   7 ;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
   8 ;*
   9 ;* This file is part of FFmpeg.
  10 ;*
  11 ;* FFmpeg is free software; you can redistribute it and/or
  12 ;* modify it under the terms of the GNU Lesser General Public
  13 ;* License as published by the Free Software Foundation; either
  14 ;* version 2.1 of the License, or (at your option) any later version.
  15 ;*
  16 ;* FFmpeg is distributed in the hope that it will be useful,
  17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 ;* Lesser General Public License for more details.
  20 ;*
  21 ;* You should have received a copy of the GNU Lesser General Public
  22 ;* License along with FFmpeg; if not, write to the Free Software
  23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24 ;******************************************************************************
  25
  26 %include "libavutil/x86/x86util.asm"
  27
  28 SECTION .text
  29
  30 ; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
  31 ;                    intptr_t w);
     ;
     ; DIFF_BYTES_PROLOGUE: emits the cglobal entry for diff_bytes and sets up
     ; the two aliases the other DIFF_BYTES_* macros rely on:
     ;   wq - the width argument w
     ;   i  - t0q, used as the loop index; its low byte (t0b) is also the
     ;        scratch byte of the scalar tail loop in DIFF_BYTES_BODY
     ; "i" is a plain %define, so every user has to "%undef i" once the
     ; function body has been emitted (the instantiations in this file do).
  32 %macro DIFF_BYTES_PROLOGUE 0
  33 %if ARCH_X86_32
     ; x86-32: only dst/src1/src2 are named arguments. w is fetched from its
     ; argument slot (r3mp) into r4q, which is what wq is aliased to here.
     ; NOTE(review): presumably done so that r3 can serve as t0 -- confirm
     ; against the register numbering in x86inc.
  34 cglobal diff_bytes, 3,5,2, dst, src1, src2
  35 %define wq r4q
  36     DECLARE_REG_TMP 3
  37     mov               wq, r3mp
  38 %else
     ; Otherwise w is the fourth named argument and the temporary is r4.
  39 cglobal diff_bytes, 4,5,2, dst, src1, src2, w
  40     DECLARE_REG_TMP 4
  41 %endif ; ARCH_X86_32
  42 %define i t0q
  43 %endmacro
  44
  45 ; DIFF_BYTES_LOOP_PREP: prepare a vector loop that handles 2 * regsize bytes
     ; per iteration.
     ;   first parameter:  label to jump to if 0 <= w < 2 * regsize
     ;                     (not even one full iteration)
     ;   second parameter: label to jump to if w < 0
     ; On fall-through, dstq/src1q/src2q have been advanced past the part of
     ; the buffers the loop will cover and i holds minus that byte count, so
     ; the loop can index with [ptr + i] and stop when i reaches zero.
     ; wq itself is left untouched.
     ; NOTE(review): the masking assumes 2 * regsize is a power of two.
  46 %macro DIFF_BYTES_LOOP_PREP 2
  47     mov                i, wq
     ; i = w rounded down to a multiple of 2 * regsize; the sign bit of w
     ; survives the mask, so a negative w yields a negative i.
  48     and                i, -2 * regsize
     ; Both branches test the flags produced by the "and" above.
  49         js            %2
  50         jz            %1
  51     add             dstq, i
  52     add            src1q, i
  53     add            src2q, i
  54     neg                i
  55 %endmacro
  56
  57 ; mov type used for src1q, dstq, first reg, second reg
     ;
     ; DIFF_BYTES_LOOP_CORE: one loop iteration, i.e. 2 * regsize bytes at
     ; offset i:  dst[] = src1[] - src2[]  (byte-wise, psubb).
     ;   parameter 1: mov suffix used for the loads from src1q
     ;   parameter 2: mov suffix used for the stores to dstq
     ;   parameters 3, 4: the two vector registers to work with
     ; i is not modified here; the caller advances it.
  58 %macro DIFF_BYTES_LOOP_CORE 4
  59 %if mmsize != 16
     ; src2 is used directly as the memory operand of psubb; both halves are
     ; loaded first, then subtracted, then stored.
  60     mov%1             %3, [src1q + i]
  61     mov%1             %4, [src1q + i + regsize]
  62     psubb             %3, [src2q + i]
  63     psubb             %4, [src2q + i + regsize]
  64     mov%2           [dstq + i], %3
  65     mov%2 [regsize + dstq + i], %4
  66 %else
  67     ; SSE enforces alignment of psubb operand
     ; so src2 is always brought in with an unaligned load into the second
     ; register; the two halves are processed one after the other, reusing
     ; the same register pair.
  68     mov%1             %3, [src1q + i]
  69     movu              %4, [src2q + i]
  70     psubb             %3, %4
  71     mov%2     [dstq + i], %3
  72     mov%1             %3, [src1q + i + regsize]
  73     movu              %4, [src2q + i + regsize]
  74     psubb             %3, %4
  75     mov%2 [regsize + dstq + i], %3
  76 %endif
  77 %endmacro
  78
  79 %macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
     ; Emits one complete variant of the function body: main vector loop,
     ; optional half-width vector loop (only when mmsize > 16), scalar tail
     ; loop and the return. All labels carry the two mov-type letters as a
     ; suffix so that several variants can live in the same function.
     ;
     ; Entry points used from outside this macro:
     ;   .loop_XY      - expects the state left by DIFF_BYTES_LOOP_PREP on
     ;                   fall-through (pointers advanced, i negative)
     ;   .skip_main_XY - w is too small for a single main-loop iteration
     ;   .end_XY       - nothing to do, just return
     ; regsize is (re)defined here because a previous expansion may have left
     ; it at the half-width value.
  80     %define regsize mmsize
  81 .loop_%1%2:
  82     DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
     ; i counts up from a negative multiple of 2 * regsize towards zero.
  83     add                i, 2 * regsize
  84         jl    .loop_%1%2
  85 .skip_main_%1%2:
     ; wq = number of bytes the main loop did not cover.
  86     and               wq, 2 * regsize - 1
  87         jz     .end_%1%2
  88 %if mmsize > 16
  89     ; fall back to narrower xmm
  90     %define regsize (mmsize / 2)
     ; The jump targets use this variant's own suffix (they used to be
     ; hard-coded to the "aa" variant, which only resolves when an a, a body
     ; is emitted in the same function). The tail code is identical in every
     ; variant, so the result is the same.
  91     DIFF_BYTES_LOOP_PREP .setup_loop_gpr_%1%2, .end_%1%2
  92 .loop2_%1%2:
  93     DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
  94     add                i, 2 * regsize
  95         jl   .loop2_%1%2
  96 .setup_loop_gpr_%1%2:
     ; wq = bytes still left after the half-width loop.
  97     and               wq, 2 * regsize - 1
  98         jz     .end_%1%2
  99 %endif
     ; Scalar tail: point past the remaining bytes and walk wq from -count up
     ; to zero, one byte per iteration. t0b is the low byte of i, which is no
     ; longer needed as an index at this point.
 100     add             dstq, wq
 101     add            src1q, wq
 102     add            src2q, wq
 103     neg               wq
 104 .loop_gpr_%1%2:
 105     mov              t0b, [src1q + wq]
 106     sub              t0b, [src2q + wq]
 107     mov      [dstq + wq], t0b
 108     inc               wq
 109         jl .loop_gpr_%1%2
 110 .end_%1%2:
 111     REP_RET
 112 %endmacro
 113
 114 %if ARCH_X86_32
     ; MMX version of diff_bytes, only built for x86-32. A single body
     ; variant (a, a) is emitted and no pointer-alignment test is made.
 115 INIT_MMX mmx
 116 DIFF_BYTES_PROLOGUE
 117     %define regsize mmsize
     ; Fewer than 2 * regsize bytes: skip the vector loop; negative w: return.
     ; Otherwise fall through into .loop_aa, the first label of the body.
 118     DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
 119     DIFF_BYTES_BODY    a, a
     ; Drop the loop-index alias set up by DIFF_BYTES_PROLOGUE.
 120 %undef i
 121 %endif
 122
 123 INIT_XMM sse2
     ; SSE2 version of diff_bytes. Three body variants are emitted and the
     ; main loop is picked from the alignment of dst and src1:
     ;   dst unaligned               -> .loop_uu (unaligned loads and stores)
     ;   dst aligned, src1 unaligned -> .loop_ua (unaligned loads, aligned stores)
     ;   both aligned                -> fall through into .loop_aa
     ; src2 needs no test: on this path DIFF_BYTES_LOOP_CORE always loads it
     ; with movu.
 124 DIFF_BYTES_PROLOGUE
 125     %define regsize mmsize
     ; Short or negative w never reaches the alignment tests; the code at
     ; .skip_main_aa only runs the scalar tail here (mmsize is not > 16).
 126     DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
     ; LOOP_PREP advanced the pointers by a multiple of 2 * regsize, so their
     ; alignment is the same as on entry.
 127     test            dstq, regsize - 1
 128         jnz     .loop_uu
 129     test           src1q, regsize - 1
 130         jnz     .loop_ua
 131     DIFF_BYTES_BODY    a, a
 132     DIFF_BYTES_BODY    u, a
 133     DIFF_BYTES_BODY    u, u
     ; Drop the loop-index alias set up by DIFF_BYTES_PROLOGUE.
 134 %undef i
 135
 136 %if HAVE_AVX2_EXTERNAL
     ; AVX2 version of diff_bytes, built only when HAVE_AVX2_EXTERNAL is set.
     ; Same three-way alignment dispatch as the SSE2 version, with regsize
     ; equal to the ymm width for the main loop. Because mmsize > 16, each
     ; body variant also contains the half-width (xmm) loop.
 137 INIT_YMM avx2
 138 DIFF_BYTES_PROLOGUE
 139     %define regsize mmsize
 140     ; Directly using unaligned SSE2 version is marginally faster than
 141     ; branching based on arguments.
     ; That is: when w is too small for the main loop, control goes to the
     ; u, u variant's .skip_main label before any alignment test has been
     ; made, so the half-width loop reached from there must be the one that
     ; uses unaligned moves.
 142     DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
 143     test            dstq, regsize - 1
 144         jnz     .loop_uu
 145     test           src1q, regsize - 1
 146         jnz     .loop_ua
 147     DIFF_BYTES_BODY    a, a
 148     DIFF_BYTES_BODY    u, a
 149     DIFF_BYTES_BODY    u, u
     ; Drop the loop-index alias set up by DIFF_BYTES_PROLOGUE.
 150 %undef i
 151 %endif