;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
%include "amd64inc.asm"

SECTION .rodata align=16
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1
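; Notes on the constants (inferred from their uses below): pavgb rounds up,
; so pb_01 isolates the carry bit (x^y)&1 needed to turn a rounding average
; into a truncating one; pb_03 supplies the "+4 then >>1" rounding term of
; the luma delta; and pb_a1 (0xa1 = 128+33) strips the bias from a delta
; carried as d+128+33, in scalar terms:
;     pos = sat_sub_u8(biased, 0xa1);   // = max(d, 0)   (psubusb)
;     neg = sat_sub_u8(0xa1, biased);   // = max(-d, 0)  (psubusb)
; where sat_sub_u8 is a hypothetical scalar stand-in for psubusb.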
SECTION .text

cglobal x264_deblock_v_luma_sse2
cglobal x264_deblock_h_luma_sse2
cglobal x264_deblock_v_chroma_mmxext
cglobal x264_deblock_h_chroma_mmxext
cglobal x264_deblock_v_chroma_intra_mmxext
cglobal x264_deblock_h_chroma_intra_mmxext
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
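; Example (hypothetical register assignment): PASS8ROWS(rax, r9, rsi, r10)
; with r9 = rax+3*rsi and r10 = 3*rsi expands to
;   [rax], [rax+rsi], [rax+rsi*2], [r9], [r9+rsi], [r9+rsi*2], [r9+r10], [r9+rsi*4]
; i.e. the 8 consecutive rows [rax] .. [rax+7*rsi].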
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in mm0..mm3
%macro TRANSPOSE4x8_LOAD 8

; in: 4 rows of 8 bytes in mm0..mm3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    SBUTTERFLY bw, mm0, %2, mm4
    SBUTTERFLY bw, mm1, %4, mm5
    SBUTTERFLY bw, mm2, %6, mm6
    SBUTTERFLY bw, mm3, %8, mm7
    SBUTTERFLY wd, mm0, mm1, mm5
    SBUTTERFLY wd, mm2, mm3, mm1
    SBUTTERFLY wd, mm4, [%9+0x10], mm3
    SBUTTERFLY wd, mm6, mm7, mm2
    SBUTTERFLY dq, mm4, mm6, mm0
    SBUTTERFLY dq, mm5, mm1, mm7
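; SBUTTERFLY is x264's shared interleave ("butterfly") step; roughly
; (a sketch, the exact definition lives with the other common macros):
;     %macro SBUTTERFLY 4        ; %1=element size  %2,%3=src  %2,%4=dst
;         movq      %4, %2
;         punpckl%1 %2, %3       ; %2 = low halves of %2,%3 interleaved
;         punpckh%1 %4, %3       ; %4 = high halves of %2,%3 interleaved
;     %endmacro
; Chaining it at bw, wd, dq granularity is the standard SIMD transpose.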
; out: %4 = |%1-%2|>%3
%macro DIFF_GT_MMX 5
    DIFF_GT q, %1, %2, %3, %4, %5
%endmacro

%macro DIFF_GT_SSE2 5
    DIFF_GT dqa, %1, %2, %3, %4, %5
%endmacro
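; Unsigned bytes have no native ">" comparison, so DIFF_GT builds one out of
; saturating arithmetic. A scalar sketch of the idea (illustrative only):
;     uint8_t d = sat_sub_u8(a, b) | sat_sub_u8(b, a); // |a-b|
;     uint8_t r = sat_sub_u8(d, t);                    // r != 0 iff |a-b| > t
; where sat_sub_u8 again stands in for psubusb.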
; out: %4 = |%1-%2|>%3
%macro DIFF_GT2_MMX 5
    DIFF_GT2 q, %1, %2, %3, %4, %5
%endmacro

%macro DIFF_GT2_SSE2 5
    DIFF_GT2 dqa, %1, %2, %3, %4, %5
%endmacro
; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 %1=alpha-1 %2=beta-1
; out: mm5=beta-1, mm7=mask
%macro LOAD_MASK_MMX 2
    movd     mm4, %1
    movd     mm5, %2
    pshufw   mm4, mm4, 0
    pshufw   mm5, mm5, 0
    packuswb mm4, mm4  ; 8x alpha-1
    packuswb mm5, mm5  ; 8x beta-1
    DIFF_GT_MMX  mm1, mm2, mm4, mm7, mm6 ; |p0-q0| > alpha-1
    DIFF_GT_MMX  mm0, mm1, mm5, mm4, mm6 ; |p1-p0| > beta-1
    DIFF_GT_MMX  mm3, mm2, mm5, mm4, mm6 ; |q1-q0| > beta-1
%macro LOAD_MASK_SSE2 2
    movd       xmm4, %1
    movd       xmm5, %2
    pshuflw    xmm4, xmm4, 0
    pshuflw    xmm5, xmm5, 0
    punpcklqdq xmm4, xmm4
    punpcklqdq xmm5, xmm5
    packuswb   xmm4, xmm4  ; 16x alpha-1
    packuswb   xmm5, xmm5  ; 16x beta-1
    DIFF_GT_SSE2  xmm1, xmm2, xmm4, xmm7, xmm6 ; |p0-q0| > alpha-1
    DIFF_GT_SSE2  xmm0, xmm1, xmm5, xmm4, xmm6 ; |p1-p0| > beta-1
    DIFF_GT_SSE2  xmm3, xmm2, xmm5, xmm4, xmm6 ; |q1-q0| > beta-1
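; Per the in/out contract above, the final mask in mm7/xmm7 is all-ones
; exactly where |p0-q0| < alpha and |p1-p0| < beta and |q1-q0| < beta,
; i.e. where the H.264 loop filter is allowed to touch the edge (an
; assumption on the tail of the macro: por the three DIFF_GT results
; together, then pcmpeqb against zero to invert).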
; in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
; out: mm1=p0' mm2=q0'
%macro DEBLOCK_P0_Q0 2
    pxor    %2m5, %2m2           ; p0^q0
    pand    %2m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pavgb   %2m3, %2m0           ; (p1 - q1 + 256)>>1
    pavgb   %2m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+((p1-q1)>>2)
    pavgb   %2m4, %2m2           ; (q0 - p0 + 256)>>1
    paddusb %2m3, %2m4           ; d+128+33
    mov%1   %2m6, [pb_a1 GLOBAL]
    psubusb %2m3, [pb_a1 GLOBAL]
%macro DEBLOCK_P0_Q0_MMX 0
    DEBLOCK_P0_Q0 q, m
%endmacro

%macro DEBLOCK_P0_Q0_SSE2 0
    DEBLOCK_P0_Q0 dqa, xm
%endmacro
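; In scalar terms DEBLOCK_P0_Q0 implements the standard luma edge update
; (shown for orientation; the asm keeps everything in unsigned bytes):
;     int d = clip3( -tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3 );
;     p0 = clip_uint8( p0 + d );
;     q0 = clip_uint8( q0 - d );
; The pavgb chain carries d biased by 128+33; psubusb against pb_a1 yields
; max(d,0), the mirrored subtraction via mm6 yields max(-d,0), and pminub
; against tc&mask (mm7) performs the clip.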
; in: %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1_SSE2 6
    pavgb   %2, %6               ; avg(p2,avg(p0,q0))
    pand    %6, [pb_01 GLOBAL]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6               ; (p2+((p0+q0+1)>>1))>>1
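; pavgb always rounds up, i.e. computes (a+b+1)>>1; the pxor/pand/psubusb
; sequence converts the second average into a truncating one so the result
; matches the spec expression exactly. Scalar sketch (illustrative only):
;     avg = (p0 + q0 + 1) >> 1;   // first pavgb
;     t   = (q2 + avg + 1) >> 1;  // second pavgb, rounds up
;     t  -= (q2 ^ avg) & 1;       // subtract the spurious carry
;     // now t == (q2 + ((p0+q0+1)>>1)) >> 1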
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_v_luma_sse2:
    movsxd rsi, esi              ; stride
    movd   xmm8, [r8]            ; tc0
    sub    r8, rsi               ; pix-3*stride
    movdqa xmm0, [r8+rsi]        ; p1
    movdqa xmm1, [r8+2*rsi]      ; p0
    movdqa xmm2, [rdi]           ; q0
    movdqa xmm3, [rdi+rsi]       ; q1
    LOAD_MASK_SSE2 edx, ecx
    punpcklbw xmm8, xmm8         ; xmm8 = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    movdqa xmm3, [r8]            ; p2
    DIFF_GT2_SSE2 xmm1, xmm3, xmm5, xmm6, xmm7 ; |p2-p0| > beta-1
    LUMA_Q1_SSE2 xmm0, xmm3, [r8], [r8+rsi], xmm6, xmm4
    movdqa xmm4, [rdi+2*rsi]     ; q2
    DIFF_GT2_SSE2 xmm2, xmm4, xmm5, xmm6, xmm3 ; |q2-q0| > beta-1
    movdqa xmm3, [rdi+rsi]       ; q1
    LUMA_Q1_SSE2 xmm3, xmm4, [rdi+2*rsi], [rdi+rsi], xmm8, xmm6
    movdqa [r8+2*rsi], xmm1      ; store p0'
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_h_luma_sse2:
    %define pix_tmp rsp-104 ; 16x6 for the buffer + 8 for x264_deblock_v_luma_sse2's return address
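    ; NB: rsp is not adjusted; on SysV AMD64 the buffer sits in the 128-byte
    ; red zone below rsp (96 bytes of data at rsp-104..rsp-9, with the top
    ; 8 bytes left for the return address pushed by the call below).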
    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8

    ; alpha, beta, tc0 are still in edx, ecx, r8
    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea  rdi, [pix_tmp+0x30]
    call x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    movq mm0, [pix_tmp+0x18]
    movq mm1, [pix_tmp+0x28]
    movq mm2, [pix_tmp+0x38]
    movq mm3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)

    movq mm0, [pix_tmp+0x10]
    movq mm1, [pix_tmp+0x20]
    movq mm2, [pix_tmp+0x30]
    movq mm3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
%macro CHROMA_V_START 0
    movsxd rsi, esi ; stride

%macro CHROMA_H_START 0
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_v_chroma_mmxext:
    LOAD_MASK_MMX edx, ecx
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
x264_deblock_h_chroma_mmxext:
    TRANSPOSE4x8_LOAD  PASS8ROWS(rax, rdi, rsi, r9)
    LOAD_MASK_MMX edx, ecx
    TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)
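; Chroma edges are only 8 pels long, so the horizontal case fits entirely in
; MMX registers: TRANSPOSE4x8_LOAD gathers the 4 columns around the edge
; into mm0..mm3, the vertical filter body runs unchanged, and
; TRANSPOSE8x4_STORE scatters the 4 bytes per row back, with no temp buffer
; or function call as in the 16-pel luma case above.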
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    pand  mm4, [pb_01 GLOBAL] ; mm4 = (p0^q1)&1
    pavgb %1, %2              ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
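; Scalar sketch of the identity this relies on (illustrative only):
;     t  = (p0 + q1 + 1) >> 1;   // pavgb
;     t -= (p0 ^ q1) & 1;        // truncate instead of rounding up
;     p0 = (p1 + t + 1) >> 1;    // == (p0 + q1 + 2*p1 + 2) >> 2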
%macro CHROMA_INTRA_BODY 0
    LOAD_MASK_MMX edx, ecx
    CHROMA_INTRA_P0 mm1, mm0, mm3
    CHROMA_INTRA_P0 mm2, mm3, mm0
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
x264_deblock_v_chroma_intra_mmxext:

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
x264_deblock_h_chroma_intra_mmxext:
    TRANSPOSE4x8_LOAD  PASS8ROWS(rax, rdi, rsi, r9)
    TRANSPOSE8x4_STORE PASS8ROWS(rax, rdi, rsi, r9)