1 ;******************************************************************************
2 ;* VC1 deblocking optimizations
3 ;* Copyright (c) 2009 David Conrad
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
30 ; dst_low, dst_high (src), zero
31 ; zero-extends one vector from 8 to 16 bits
38 %macro STORE_4_WORDS_MMX 6
54 %macro STORE_4_WORDS_SSE4 6
61 ; in: p1 p0 q0 q1, clobbers p0
62 ; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
63 %macro VC1_LOOP_FILTER_A0 4
83 pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
85 pmullw m3, [pw_5] ; 5*(a3 - a0)
87 psraw m2, 3 ; abs(d/8)
88 pxor m7, m3 ; d_sign ^= a0_sign
96 pcmpgtw m3, m4 ; if (a0 < pq)
103 pxor m3, m7 ; d_sign ^ clip_sign
105 pminsw m2, m4 ; min(d, clip)
107 pand m6, m4 ; filt3 (C return value)
109 ; each set of 4 pixels is filtered only if its 3rd pixel is filtered
120 pand m3, m2 ; d final
129 ; 1st param: size of filter
130 ; 2nd param: mov suffix equivalent to the filter size
131 %macro VC1_V_LOOP_FILTER 2
142 VC1_LOOP_FILTER_A0 m6, m4, m7, m0
148 VC1_LOOP_FILTER_A0 m7, m4, m1, m2
154 VC1_LOOP_FILTER_A0 m5, m2, m3, m4
161 ; 1st param: size of filter
162 ; NOTE: UNPACK_8TO16 assumes that this many 8-bit values occupy half a register
163 ; 2nd (optional) param: temp register to use for storing words
164 %macro VC1_H_LOOP_FILTER 1-2
170 TRANSPOSE4x4B 0, 1, 2, 3, 4
184 TRANSPOSE4x4W 0, 1, 2, 3, 4
188 UNPACK_8TO16 bw, 6, 0, 5
189 UNPACK_8TO16 bw, 7, 1, 5
190 VC1_LOOP_FILTER_A0 m6, m0, m7, m1
191 UNPACK_8TO16 bw, 4, 2, 5
192 mova m0, m1 ; m0 = p0
193 VC1_LOOP_FILTER_A0 m7, m1, m4, m2
194 UNPACK_8TO16 bw, 1, 3, 5
196 VC1_LOOP_FILTER_A0 m5, m2, m1, m3
202 STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
205 STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
208 STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
209 STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
214 %macro START_V_FILTER 0
222 %macro START_H_FILTER 1
230 ; I don't know why the sign extension is needed...
231 %macro PSIGNW_SRA_MMX 2
239 cglobal vc1_v_loop_filter_internal_%1
240 VC1_V_LOOP_FILTER 4, d
243 cglobal vc1_h_loop_filter_internal_%1
244 VC1_H_LOOP_FILTER 4, r4
247 ; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
248 cglobal vc1_v_loop_filter4_%1, 3,5,0
250 call vc1_v_loop_filter_internal_%1
253 ; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
254 cglobal vc1_h_loop_filter4_%1, 3,5,0
256 call vc1_h_loop_filter_internal_%1
259 ; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
260 cglobal vc1_v_loop_filter8_%1, 3,5,0
262 call vc1_v_loop_filter_internal_%1
265 call vc1_v_loop_filter_internal_%1
268 ; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
269 cglobal vc1_h_loop_filter8_%1, 3,5,0
271 call vc1_h_loop_filter_internal_%1
273 call vc1_h_loop_filter_internal_%1
277 %define PABSW PABSW_MMX
278 %define PSIGNW PSIGNW_SRA_MMX
281 %define PABSW PABSW_MMX2
285 ; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
286 cglobal vc1_v_loop_filter8_sse2, 3,5,8
288 VC1_V_LOOP_FILTER 8, q
291 ; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
292 cglobal vc1_h_loop_filter8_sse2, 3,6,8
294 VC1_H_LOOP_FILTER 8, r5
297 %define PABSW PABSW_SSSE3
298 %define PSIGNW PSIGNW_SSSE3
301 ; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
302 cglobal vc1_v_loop_filter4_ssse3, 3,5,0
304 VC1_V_LOOP_FILTER 4, d
307 ; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
308 cglobal vc1_h_loop_filter4_ssse3, 3,5,0
310 VC1_H_LOOP_FILTER 4, r4
314 ; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
315 cglobal vc1_v_loop_filter8_ssse3, 3,5,8
317 VC1_V_LOOP_FILTER 8, q
320 ; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
321 cglobal vc1_h_loop_filter8_ssse3, 3,6,8
323 VC1_H_LOOP_FILTER 8, r5
326 ; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
327 cglobal vc1_h_loop_filter8_sse4, 3,5,8