;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
%include "amd64inc.asm"

SECTION .rodata align=16
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
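; e.g. with the (illustrative) assignment base=rax, base3=r9=rax+3*r10,
; stride=r10, stride3=r11=3*r10, PASS8ROWS(rax, r9, r10, r11) expands to
; [rax], [rax+r10], [rax+r10*2], [r9], [r9+r10], [r9+r10*2], [r9+r11], [r9+r10*4]
; i.e. the addresses of 8 consecutive rows.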
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in mm0..mm3
%macro TRANSPOSE4x8_LOAD 8
; in: 4 rows of 8 bytes in mm0..mm3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
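; A plain-C picture (illustrative only, not used by the asm) of what the two
; transpose macros compute: the load gathers one byte column from each of 8
; rows, the store scatters 4 rows back into byte columns.
;
;   /* sketch: dst rows end up in mm0..mm3, row[j] are the PASS8ROWS addresses */
;   void transpose4x8_load( uint8_t dst[4][8], const uint8_t *row[8] )
;   {
;       for( int i = 0; i < 4; i++ )
;           for( int j = 0; j < 8; j++ )
;               dst[i][j] = row[j][i];
;   }
;   void transpose8x4_store( uint8_t *row[8], const uint8_t src[4][8] )
;   {
;       for( int j = 0; j < 8; j++ )
;           for( int i = 0; i < 4; i++ )
;               row[j][i] = src[i][j];
;   }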
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    SBUTTERFLY bw, mm0, %2, mm4
    SBUTTERFLY bw, mm1, %4, mm5
    SBUTTERFLY bw, mm2, %6, mm6
    SBUTTERFLY bw, mm3, %8, mm7
    SBUTTERFLY wd, mm0, mm1, mm5
    SBUTTERFLY wd, mm2, mm3, mm1
    SBUTTERFLY wd, mm4, [%9+0x10], mm3
    SBUTTERFLY wd, mm6, mm7, mm2
    SBUTTERFLY dq, mm4, mm6, mm0
    SBUTTERFLY dq, mm5, mm1, mm7
; out: %4 = |%1-%2|>%3
%macro DIFF_GT_MMX 5
    DIFF_GT q, %1, %2, %3, %4, %5
%endmacro

%macro DIFF_GT_SSE2 5
    DIFF_GT dqa, %1, %2, %3, %4, %5
%endmacro
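; Scalar model of the DIFF_GT predicate (a sketch; the SIMD body, built from
; unsigned saturating subtracts, leaves any nonzero byte where the comparison
; holds rather than a clean 0/1):
;
;   /* dst[i] is nonzero exactly when |a[i]-b[i]| > t[i] */
;   for( int i = 0; i < n; i++ )
;       dst[i] = abs( a[i] - b[i] ) > t[i];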
; out: %4 = |%1-%2|>%3
%macro DIFF_GT2_MMX 5
    DIFF_GT2 q, %1, %2, %3, %4, %5
%endmacro

%macro DIFF_GT2_SSE2 5
    DIFF_GT2 dqa, %1, %2, %3, %4, %5
%endmacro
; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
; out: mm5=beta-1, mm7=mask
%macro LOAD_MASK_MMX 2
    packuswb mm4, mm4                ; 8x alpha-1
    packuswb mm5, mm5                ; 8x beta-1
    DIFF_GT_MMX mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
    DIFF_GT_MMX mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
    DIFF_GT_MMX mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
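; Combined and inverted, the mask in mm7 encodes the standard H.264 filtering
; condition per pel (scalar model; "> alpha-1" is the same test as ">= alpha"):
;
;   /* filter this pel only if all three thresholds hold */
;   mask[i] = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta;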
%macro LOAD_MASK_SSE2 2
    pshuflw    xmm4, xmm4, 0
    pshuflw    xmm5, xmm5, 0
    punpcklqdq xmm4, xmm4
    punpcklqdq xmm5, xmm5
    packuswb   xmm4, xmm4            ; 16x alpha-1
    packuswb   xmm5, xmm5            ; 16x beta-1
    DIFF_GT_SSE2 xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1
    DIFF_GT_SSE2 xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1
    DIFF_GT_SSE2 xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1
; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
; out: mm1=p0' mm2=q0'
%macro DEBLOCK_P0_Q0 2
    mov%1   %2m5, %2m1
    pxor    %2m5, %2m2               ; p0^q0
    pand    %2m5, [pb_01 GLOBAL]     ; (p0^q0)&1
    pavgb   %2m3, %2m0               ; (p1 - q1 + 256)>>1
    pavgb   %2m3, [pb_03 GLOBAL]     ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   %2m4, %2m2               ; (q0 - p0 + 256)>>1
    paddusb %2m3, %2m4               ; d+128+33
    mov%1   %2m6, [pb_a1 GLOBAL]
    psubusb %2m6, %2m3
    psubusb %2m3, [pb_a1 GLOBAL]
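; The pavgb chain above evaluates, byte-wise and without overflow, the
; standard H.264 p0/q0 update in scalar form (pb_a1 = 0xa1 = 128+33 strips the
; bias noted in the d+128+33 comment; x264_clip3/x264_clip_uint8 are assumed
; helper names):
;
;   int delta = x264_clip3( (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
;   p0' = x264_clip_uint8( p0 + delta );
;   q0' = x264_clip_uint8( q0 - delta );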
%macro DEBLOCK_P0_Q0_MMX 0
    DEBLOCK_P0_Q0 q, m
%endmacro

%macro DEBLOCK_P0_Q0_SSE2 0
    DEBLOCK_P0_Q0 dqa, xm
%endmacro
; in: %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1_SSE2 6
    pavgb   %2, %6                   ; avg(p2,avg(p0,q0))
    pand    %6, [pb_01 GLOBAL]       ; (p2^avg(p0,q0))&1
    psubusb %2, %6                   ; (p2+((p0+q0+1)>>1))>>1
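; Scalar form of the update this macro performs, straight from the out:
; comment above (the same macro serves the p side with p/q roles swapped):
;
;   q1' = x264_clip3( ( q2 + ((p0 + q0 + 1) >> 1) ) >> 1, q1 - tc0, q1 + tc0 );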
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_luma_sse2
    movsxd rsi, esi                  ; stride
    movd   xmm8, [r8]                ; tc0
    sub    r8, rsi                   ; pix-3*stride
    movdqa xmm0, [r8+rsi]            ; p1
    movdqa xmm1, [r8+2*rsi]          ; p0
    movdqa xmm2, [rdi]               ; q0
    movdqa xmm3, [rdi+rsi]           ; q1
    LOAD_MASK_SSE2 edx, ecx
    punpcklbw xmm8, xmm8             ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    movdqa xmm3, [r8]                ; p2
    DIFF_GT2_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1
    LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4 ; writes p1'
    movdqa xmm4, [rdi+2*rsi]         ; q2
    DIFF_GT2_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1
    movdqa xmm3, [rdi+rsi]           ; q1
    LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6 ; writes q1'
    movdqa [r8+2*rsi], xmm1          ; store p0'
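; For reference, the whole per-pel luma filter in scalar C, in the spirit of
; x264's C fallback (a sketch: x264_clip3/x264_clip_uint8 are assumed helper
; names; for this vertical filter xstride = stride and ystride = 1):
;
;   for( int i = 0; i < 4; i++, pix += 4*ystride )      /* 4 pels per tc0[i] */
;   {
;       if( tc0[i] < 0 ) continue;                      /* group not filtered */
;       for( int d = 0; d < 4; d++ )
;       {
;           uint8_t *p = pix + d*ystride;
;           int p2 = p[-3*xstride], p1 = p[-2*xstride], p0 = p[-1*xstride];
;           int q0 = p[ 0*xstride], q1 = p[ 1*xstride], q2 = p[ 2*xstride];
;           if( abs(p0-q0) >= alpha || abs(p1-p0) >= beta || abs(q1-q0) >= beta )
;               continue;
;           int tc = tc0[i];
;           if( abs(p2-p0) < beta )                     /* p1' */
;           {
;               p[-2*xstride] = x264_clip3( (p2 + ((p0+q0+1)>>1)) >> 1,
;                                           p1-tc0[i], p1+tc0[i] );
;               tc++;
;           }
;           if( abs(q2-q0) < beta )                     /* q1' */
;           {
;               p[ 1*xstride] = x264_clip3( (q2 + ((p0+q0+1)>>1)) >> 1,
;                                           q1-tc0[i], q1+tc0[i] );
;               tc++;
;           }
;           int delta = x264_clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
;           p[-1*xstride] = x264_clip_uint8( p0 + delta );  /* p0' */
;           p[ 0*xstride] = x264_clip_uint8( q0 - delta );  /* q0' */
;       }
;   }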
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_sse2
    %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
    lea    rax, [rax+r10*8]          ; advance to the second 8 rows
    lea    r9,  [r9+r10*8]
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in edx, ecx, r8
    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    rdi, [pix_tmp+0x30]
    mov    esi, 0x10                 ; tmp buffer rows are 16 bytes apart
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    movq   mm0, [pix_tmp+0x18]
    movq   mm1, [pix_tmp+0x28]
    movq   mm2, [pix_tmp+0x38]
    movq   mm3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)

    movq   mm0, [pix_tmp+0x10]
    movq   mm1, [pix_tmp+0x20]
    movq   mm2, [pix_tmp+0x30]
    movq   mm3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
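; The strategy above in scalar terms (a sketch; the transpose_* helper names
; are hypothetical): rather than writing a separate horizontal filter, rotate
; the pixels so the vertical one applies, then rotate the changed rows back.
;
;   uint8_t tmp[16*6];                       /* 16 pels wide, 6 rows deep */
;   transpose_6x16( tmp, pix - 3, stride );  /* columns p2..q2 -> rows    */
;   x264_deblock_v_luma_sse2( tmp + 3*16, 16, alpha, beta, tc0 );
;   transpose_16x4( pix - 2, stride, tmp + 16 ); /* only p1,p0,q0,q1 changed */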
%macro CHROMA_V_START 0
    movsxd rsi, esi ; stride

%macro CHROMA_H_START 0
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext
    LOAD_MASK_MMX edx, ecx
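; Scalar sketch of the chroma edge filter this function vectorizes. Only
; p0/q0 move, and per the H.264 spec chroma clips with tc = tc0 + 1
; (tc0[i] < 0 marks an unfiltered group; x264_clip3/x264_clip_uint8 are
; assumed helper names):
;
;   for( int i = 0; i < 4; i++, pix += 2*ystride )      /* 2 pels per tc0[i] */
;   {
;       if( tc0[i] < 0 ) continue;
;       int tc = tc0[i] + 1;
;       for( int d = 0; d < 2; d++ )
;       {
;           uint8_t *p = pix + d*ystride;
;           int p1 = p[-2*xstride], p0 = p[-1*xstride];
;           int q0 = p[ 0*xstride], q1 = p[ 1*xstride];
;           if( abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta )
;           {
;               int delta = x264_clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc );
;               p[-1*xstride] = x264_clip_uint8( p0 + delta );
;               p[ 0*xstride] = x264_clip_uint8( q0 - delta );
;           }
;       }
;   }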
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext
    TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
    LOAD_MASK_MMX edx, ecx
    TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    pand  mm4, [pb_01 GLOBAL]        ; mm4 = (p0^q1)&1
    pavgb %1, %2                     ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
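; Why this is exact: pavgb rounds up, so subtracting the parity bit
; (p0^q1)&1 first turns avg(p0,q1) into the exact (p0+q1)>>1; averaging that
; with p1 (again rounding up) lands on the target value, in scalar form:
;
;   p0' = ( p0 + q1 + 2*p1 + 2 ) >> 2;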
%macro CHROMA_INTRA_BODY 0
    LOAD_MASK_MMX edx, ecx
    CHROMA_INTRA_P0 mm1, mm0, mm3
    CHROMA_INTRA_P0 mm2, mm3, mm0
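; Scalar form of the intra chroma update performed by the two macro calls
; above (no tc0 here; where the mask from LOAD_MASK_MMX holds, both edge pels
; are smoothed symmetrically):
;
;   p0' = ( 2*p1 + p0 + q1 + 2 ) >> 2;
;   q0' = ( 2*q1 + q0 + p1 + 2 ) >> 2;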
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext
    TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
    TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)