1 ;*****************************************************************************
2 ;* deblock-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 ;*****************************************************************************
; Constant vectors: a single byte value replicated 16 times, used as memory
; operands by the filter macros below.
26 pb_01: times 16 db 0x01 ; bit-0 mask: pand with it keeps the (a^b)&1 term used to correct pavgb's round-up
27 pb_03: times 16 db 0x03 ; pavgb against 3 yields (x+4)>>1 in DEBLOCK_P0_Q0
28 pb_a1: times 16 db 0xa1 ; 0xa1 = 128+33, the bias carried by the delta in DEBLOCK_P0_Q0 ("d+128+33")
60 ; expands to [base],...,[base+7*stride]
; That holds only if the caller passes base3 = base+3*stride and
; stride3 = 3*stride (x86 index scales are limited to 1, 2, 4 and 8, so
; rows 3..7 are addressed relative to base3 rather than as base+stride*N).
; The result is a comma-separated list of 8 memory operands, meant to be
; passed as 8 macro arguments.
61 %define PASS8ROWS(base, base3, stride, stride3) \
62 [base], [base+stride], [base+stride*2], [base3], \
63 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
65 ; in: 8 rows of 4 bytes in %1..%8
66 ; out: 4 rows of 8 bytes in m0..m3
67 %macro TRANSPOSE4x8_LOAD 8
96 ; in: 4 rows of 8 bytes in m0..m3
97 ; out: 8 rows of 4 bytes in %1..%8
98 %macro TRANSPOSE8x4_STORE 8
138 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
139 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
140 %macro TRANSPOSE6x8_MEM 9
145 SBUTTERFLY bw, m0, %2, m4
146 SBUTTERFLY bw, m1, %4, m5
147 SBUTTERFLY bw, m2, %6, m6
149 SBUTTERFLY bw, m3, %8, m7
150 SBUTTERFLY wd, m0, m1, m5
151 SBUTTERFLY wd, m2, m3, m1
154 SBUTTERFLY wd, m4, [%9+0x10], m3
155 SBUTTERFLY wd, m6, m7, m2
156 SBUTTERFLY dq, m4, m6, m0
157 SBUTTERFLY dq, m5, m1, m7
166 ; out: %4 = |%1-%2|>%3
177 ; out: %4 = |%1-%2|>%3
198 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
199 ; out: m5=beta-1, m7=mask
206 packuswb m4, m4 ; 16x alpha-1
207 packuswb m5, m5 ; 16x beta-1
208 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
209 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
211 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
217 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
220 %macro DEBLOCK_P0_Q0 0
223 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
226 pavgb m3, m0 ; (p1 - q1 + 256)>>1
227 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
229 pavgb m4, m2 ; (q0 - p0 + 256)>>1
231 paddusb m3, m4 ; d+128+33
232 movq m6, [pb_a1 GLOBAL]
234 psubusb m3, [pb_a1 GLOBAL]
244 ; %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp (the p side passes p1, p2, [p2], [p1] instead)
245 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
246 ; clobbers: q2, tmp, tc0
250 pavgb %2, %6 ; avg(p2,avg(p0,q0))
252 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
253 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
262 ;-----------------------------------------------------------------------------
263 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
264 ;-----------------------------------------------------------------------------
267 cglobal x264_deblock_v_luma_sse2
273 add r4, r0 ; pix-3*stride
275 movdqa m0, [r4+r1] ; p1
276 movdqa m1, [r4+2*r1] ; p0
278 movdqa m3, [r0+r1] ; q1
282 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
289 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
294 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
296 movdqa m4, [r0+2*r1] ; q2
297 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
302 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
309 ;-----------------------------------------------------------------------------
310 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
311 ;-----------------------------------------------------------------------------
313 cglobal x264_deblock_h_luma_sse2
321 ; transpose 6x16 -> tmp space
322 TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
325 TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
328 ; alpha, beta, tc0 are still in r2d, r3d, r4
329 ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
330 lea r0, [pix_tmp+0x30]
332 call x264_deblock_v_luma_sse2
334 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
337 movq m0, [pix_tmp+0x18]
338 movq m1, [pix_tmp+0x28]
339 movq m2, [pix_tmp+0x38]
340 movq m3, [pix_tmp+0x48]
341 TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
347 movq m0, [pix_tmp+0x10]
348 movq m1, [pix_tmp+0x20]
349 movq m2, [pix_tmp+0x30]
350 movq m3, [pix_tmp+0x40]
351 TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
358 %macro DEBLOCK_LUMA 3
359 ;-----------------------------------------------------------------------------
360 ; void x264_deblock_v8_luma_mmxext / x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
361 ;-----------------------------------------------------------------------------
362 cglobal x264_deblock_%2_luma_%1, 5,5,1
367 add r4, r0 ; pix-3*stride
369 movq m0, [r4+r1] ; p1
370 movq m1, [r4+2*r1] ; p0
372 movq m3, [r0+r1] ; q1
386 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
387 movq [esp+%3], m4 ; tc
391 movq [esp], m4 ; mask
394 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
396 pand m4, [esp+%3] ; tc
400 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
402 movq m4, [r0+2*r1] ; q2
403 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
404 movq m5, [esp] ; mask
406 movq m5, [esp+%3] ; tc
410 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
423 ;-----------------------------------------------------------------------------
424 ; void x264_deblock_h_luma_mmxext / x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
425 ;-----------------------------------------------------------------------------
427 cglobal x264_deblock_h_luma_%1, 0,6
438 ; transpose 6x16 -> tmp space
439 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
442 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
445 lea r0, [pix_tmp+0x30]
451 call x264_deblock_%2_luma_%1
453 add dword [esp ], 8 ; pix_tmp+0x38
454 add dword [esp+16], 2 ; tc0+2
455 call x264_deblock_%2_luma_%1
459 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
464 movq m0, [pix_tmp+0x10]
465 movq m1, [pix_tmp+0x20]
466 movq m2, [pix_tmp+0x30]
467 movq m3, [pix_tmp+0x40]
468 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
472 movq m0, [pix_tmp+0x18]
473 movq m1, [pix_tmp+0x28]
474 movq m2, [pix_tmp+0x38]
475 movq m3, [pix_tmp+0x48]
476 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
480 %endmacro ; DEBLOCK_LUMA
483 DEBLOCK_LUMA mmxext, v8, 8
485 DEBLOCK_LUMA sse2, v, 16
493 %macro CHROMA_V_START 0
501 %macro CHROMA_H_START 0
513 ;-----------------------------------------------------------------------------
514 ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
515 ;-----------------------------------------------------------------------------
516 cglobal x264_deblock_v_chroma_mmxext, 5,6
535 ;-----------------------------------------------------------------------------
536 ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
537 ;-----------------------------------------------------------------------------
538 cglobal x264_deblock_h_chroma_mmxext, 5,7
540 %define buf0 [rsp-16]
548 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
561 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
566 ; in: %1=p0 %2=p1 %3=q1
567 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
568 %macro CHROMA_INTRA_P0 3
571 pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
574 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
577 %macro CHROMA_INTRA_BODY 0
581 CHROMA_INTRA_P0 m1, m0, m3
582 CHROMA_INTRA_P0 m2, m3, m0
594 ;-----------------------------------------------------------------------------
595 ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
596 ;-----------------------------------------------------------------------------
597 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5,1
611 ;-----------------------------------------------------------------------------
612 ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
613 ;-----------------------------------------------------------------------------
614 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6,1
616 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
618 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)