1 ;******************************************************************************
2 ;* Copyright (c) 2010 David Conrad
4 ;* This file is part of FFmpeg.
6 ;* FFmpeg is free software; you can redistribute it and/or
7 ;* modify it under the terms of the GNU Lesser General Public
8 ;* License as published by the Free Software Foundation; either
9 ;* version 2.1 of the License, or (at your option) any later version.
11 ;* FFmpeg is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;* Lesser General Public License for more details.
16 ;* You should have received a copy of the GNU Lesser General Public
17 ;* License along with FFmpeg; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 ;******************************************************************************
21 %include "libavutil/x86/x86util.asm"
; Four dwords, each 0x200 (512). put_signed_rect_clamped_10 loads this into m4.
25 convert_to_unsigned_10bit: times 4 dd 0x200
; Eight words, each 0x3ff (1023). put_signed_rect_clamped_10 loads this into m3
; and passes it as the last operand of CLIPW.
26 clip_10bit: times 8 dw 0x3ff
49 ; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
; Vertical pass of the half-pel filter: every tap below is addressed a whole
; number of strides away from src, so the src[n] indices in the tap comments
; are row offsets here.
; The %1 in the symbol name is a macro parameter, so this definition presumably
; sits inside a %macro whose header is outside this excerpt.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 general-purpose
; registers, 8 vector registers. src0 and stridex3 are extra named registers,
; not C arguments.
50 cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
; stridex3 = 3*stride, used below to reach row +3 from src.
52 lea stridex3q, [3*strideq]
; Rows 0 and +1. UNPACK_ADD is defined outside this excerpt; the trailing a,a
; presumably selects aligned loads for both operands (TODO confirm in the macro).
57 UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
61 ; 3*( ... + src[-2] + src[3])
; NOTE(review): for [src0q + strideq] to be row -2, src0 must hold
; src - 3*stride; its initialisation is not visible here - confirm.
62 UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
68 ; ... - 7*(src[-1] + src[2])
; Under the same assumption, src0 + 2*stride is row -1; src + 2*stride is row +2.
69 UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
75 ; ... - (src[-3] + src[4])
; Outermost taps: src0 itself (row -3 under the assumption above) and src + 4*stride.
76 UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
93 ; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
; Horizontal pass of the half-pel filter: taps are byte offsets -3..+4 around
; src + width, and width doubles as the running offset into both src and dst.
; cglobal operands (x86inc convention): 3 arguments loaded, 3 general-purpose
; registers, 8 vector registers.
94 cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
; Clear the low bits so the offset is a multiple of mmsize (the register width
; in bytes).
97 and widthd, ~(mmsize-1)
; Taps at offsets 0 and +1. The trailing u,u presumably selects unaligned loads
; (UNPACK_ADD is defined outside this excerpt - TODO confirm); the +/-1..4 byte
; offsets below cannot all be aligned.
100 UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
104 ; 3*( ... + src[-2] + src[3])
105 UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
111 ; ... - 7*(src[-1] + src[2])
112 UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
118 ; ... - (src[-3] + src[4])
119 UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
; Store one full register of results at dst + width using the aligned-move
; mnemonic; the steps that produce the final m0 are not visible in this excerpt.
128 mova [dstq + widthq], m0
135 ; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
; NOTE(review): the prototype above says put_rect_clamped, but the symbol
; declared below is put_signed_rect_clamped_%1.
; cglobal operands (x86inc convention): 5 arguments loaded, 9 general-purpose
; registers, 3 vector registers. height (6th C argument) is not among the named
; registers; how it is read is not visible in this excerpt. dst2 and src2 are
; scratch pointers.
136 cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
; Sign-extend the 32-bit int strides to full register width so they can be used
; in address arithmetic.
142 movsxd dst_strideq, dst_strided
143 movsxd src_strideq, src_strided
; src2/dst2 point one row below src/dst: two rows are handled together.
155 lea src2q, [srcq+src_strideq]
156 lea dst2q, [dstq+dst_strideq]
; src holds 16-bit samples, hence the 2*w byte offset. Load one register of
; words from each of the two rows.
159 mova m1, [srcq +2*wq]
160 mova m2, [src2q+2*wq]
; packsswb narrows signed words to signed bytes with saturation; the second
; half of each result comes from the next mmsize bytes of the same row.
161 packsswb m1, [srcq +2*wq+mmsize]
162 packsswb m2, [src2q+2*wq+mmsize]
; Advance both pointers by two rows, matching the two rows processed above.
169 lea srcq, [srcq+src_strideq*2]
170 lea dstq, [dstq+dst_strideq*2]
178 ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
; cglobal operands (x86inc convention): 7 arguments loaded, 9 general-purpose
; registers, 3 vector registers.
179 cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
; Sign-extend the 32-bit int strides to full register width so they can be used
; in address arithmetic.
185 movsxd strideq, strided
186 movsxd idwt_strideq, idwt_strided
; Load two consecutive registers of 16-bit src samples (2*w byte offset).
; Unaligned loads are used because src alignment is not guaranteed (see FIXME).
196 movu m1, [srcq +2*wq] ; FIXME: ensure alignment
199 movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
; Add the matching idwt words. The memory operands are used directly, so idwt
; is presumably expected to be aligned - TODO confirm with callers.
202 paddw m1, [idwtq+2*wq]
203 paddw m2, [idwtq+2*wq+mmsize]
; Advance src and idwt by 2*stride bytes each, consistent with 16-bit elements
; and strides counted in elements - TODO confirm the stride units.
208 lea srcq, [srcq + 2*strideq]
210 lea idwtq, [idwtq+ 2*idwt_strideq]
218 ; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
; %1 and %2 in the symbol name are macro parameters; the enclosing %macro
; header is outside this excerpt.
; NOTE(review): cglobal requests 6 arguments and 6 general-purpose registers
; (5 vector registers), but only five names are listed and the prototype above
; has five parameters - confirm the count is intentional.
219 cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
; i is an assemble-time offset whose definition is not visible here. dst is
; 16-bit, hence the 2*i byte offset; this reads the second register of dst words.
235 movu m3, [dstq+2*i+mmsize]
; Write m1 back to the same dst location read above (the steps that produce
; m1 are not visible in this excerpt).
239 movu [dstq+2*i+mmsize], m1
; Next row: src advances by stride bytes and dst by 2*stride bytes (dst
; elements are 16-bit).
242 lea srcq, [srcq+strideq]
243 lea dstq, [dstq+2*strideq]
271 ; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
; cglobal operands (x86inc convention): 7 arguments loaded, 7 general-purpose
; registers, 4 vector registers; the register names mirror the C parameters
; one-to-one. The function body is not visible in this excerpt.
272 cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
307 ; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
; The two cglobal lines below declare the same symbol with different register
; budgets: 6 arguments / 8 GPRs with h in a register, versus 5 arguments /
; 7 GPRs without it. They are presumably alternatives selected by a
; conditional-assembly directive not visible in this excerpt - TODO confirm.
; Both request 5 vector registers and two scratch registers t1, t2.
309 cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
311 cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
; m3 = eight words of 0x3ff, m4 = four dwords of 0x200 (file-level constants).
320 mova m3, [clip_10bit]
321 mova m4, [convert_to_unsigned_10bit]
; Load two consecutive registers from src + w with unaligned loads.
328 movu m0, [srcq+wq+0*mmsize]
329 movu m1, [srcq+wq+1*mmsize]
; CLIPW (defined outside this excerpt) clamps the words of m0 using m2 and m3
; as bounds; m3 is the 0x3ff constant, and the setup of m2 is not visible here.
334 CLIPW m0, m2, m3 ; packusdw saturates so it's fine
; Advance src by one row.
342 add srcq, src_strideq