1 ;*****************************************************************************
2 ;* deblock-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 ;*****************************************************************************
26 pb_01: times 16 db 0x01 ; 16 bytes of 0x01: per-byte low-bit mask, used as (x^y)&1 to undo pavgb round-up
27 pb_03: times 16 db 0x03 ; 16 bytes of 0x03: pavgb operand in DEBLOCK_P0_Q0, pavgb(x,3) = (x+4)>>1
28 pb_a1: times 16 db 0xa1 ; 16 bytes of 0xa1 (= 128+33): bias carried by d+128+33 in DEBLOCK_P0_Q0
32 ; expands to [base],...,[base+7*stride]; caller must pass base3 = base+3*stride and stride3 = 3*stride
33 %define PASS8ROWS(base, base3, stride, stride3) \
34 [base], [base+stride], [base+stride*2], [base3], \
35 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
37 ; in: 8 rows of 4 bytes in %1..%8
38 ; out: 4 rows of 8 bytes in m0..m3
39 %macro TRANSPOSE4x8_LOAD 8
68 ; in: 4 rows of 8 bytes in m0..m3
69 ; out: 8 rows of 4 bytes in %1..%8
70 %macro TRANSPOSE8x4_STORE 8
110 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
111 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
112 %macro TRANSPOSE6x8_MEM 9
117 SBUTTERFLY bw, m0, %2, m4
118 SBUTTERFLY bw, m1, %4, m5
119 SBUTTERFLY bw, m2, %6, m6
121 SBUTTERFLY bw, m3, %8, m7
122 SBUTTERFLY wd, m0, m1, m5
123 SBUTTERFLY wd, m2, m3, m1
126 SBUTTERFLY wd, m4, [%9+0x10], m3
127 SBUTTERFLY wd, m6, m7, m2
128 SBUTTERFLY dq, m4, m6, m0
129 SBUTTERFLY dq, m5, m1, m7
138 ; out: %4 = |%1-%2|>%3
149 ; out: %4 = |%1-%2|>%3
170 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
171 ; out: m5=beta-1, m7=mask
178 packuswb m4, m4 ; 16x alpha-1
179 packuswb m5, m5 ; 16x beta-1
180 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
181 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
183 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
189 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
192 %macro DEBLOCK_P0_Q0 0
195 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
198 pavgb m3, m0 ; (p1 - q1 + 256)>>1
199 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
201 pavgb m4, m2 ; (q0 - p0 + 256)>>1
203 paddusb m3, m4 ; d+128+33
204 mova m6, [pb_a1 GLOBAL]
206 psubusb m3, [pb_a1 GLOBAL]
216 ; %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp (also invoked for the p side, with p1/p2 in place of q1/q2)
217 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
218 ; clobbers: q2, tmp, tc0
222 pavgb %2, %6 ; avg(p2,avg(p0,q0))
224 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
225 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
234 ;-----------------------------------------------------------------------------
235 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
236 ;-----------------------------------------------------------------------------
239 cglobal x264_deblock_v_luma_sse2
245 add r4, r0 ; pix-3*stride
247 mova m0, [r4+r1] ; p1
248 mova m1, [r4+2*r1] ; p0
250 mova m3, [r0+r1] ; q1
254 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
261 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
266 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
268 movdqa m4, [r0+2*r1] ; q2
269 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
274 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
281 ;-----------------------------------------------------------------------------
282 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
283 ;-----------------------------------------------------------------------------
285 cglobal x264_deblock_h_luma_sse2
293 ; transpose 6x16 -> tmp space
294 TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
297 TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
300 ; alpha, beta, tc0 are still in r2d, r3d, r4
301 ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
302 lea r0, [pix_tmp+0x30]
304 call x264_deblock_v_luma_sse2
306 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
309 movq m0, [pix_tmp+0x18]
310 movq m1, [pix_tmp+0x28]
311 movq m2, [pix_tmp+0x38]
312 movq m3, [pix_tmp+0x48]
313 TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
319 movq m0, [pix_tmp+0x10]
320 movq m1, [pix_tmp+0x20]
321 movq m2, [pix_tmp+0x30]
322 movq m3, [pix_tmp+0x40]
323 TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
330 %macro DEBLOCK_LUMA 3
331 ;-----------------------------------------------------------------------------
332 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -- or x264_deblock_v_luma_sse2, depending on the DEBLOCK_LUMA arguments
333 ;-----------------------------------------------------------------------------
334 cglobal x264_deblock_%2_luma_%1, 5,5,1
339 add r4, r0 ; pix-3*stride
341 mova m0, [r4+r1] ; p1
342 mova m1, [r4+2*r1] ; p0
344 mova m3, [r0+r1] ; q1
358 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
359 mova [esp+%3], m4 ; tc
363 mova [esp], m4 ; mask
366 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
368 pand m4, [esp+%3] ; tc
372 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
374 mova m4, [r0+2*r1] ; q2
375 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
376 mova m5, [esp] ; mask
378 mova m5, [esp+%3] ; tc
382 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
395 ;-----------------------------------------------------------------------------
396 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -- or x264_deblock_h_luma_sse2, depending on the DEBLOCK_LUMA arguments
397 ;-----------------------------------------------------------------------------
399 cglobal x264_deblock_h_luma_%1, 0,6
410 ; transpose 6x16 -> tmp space
411 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
414 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
417 lea r0, [pix_tmp+0x30]
423 call x264_deblock_%2_luma_%1
425 add dword [esp ], 8 ; pix_tmp+0x38
426 add dword [esp+16], 2 ; tc0+2
427 call x264_deblock_%2_luma_%1
431 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
436 movq m0, [pix_tmp+0x10]
437 movq m1, [pix_tmp+0x20]
438 movq m2, [pix_tmp+0x30]
439 movq m3, [pix_tmp+0x40]
440 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
444 movq m0, [pix_tmp+0x18]
445 movq m1, [pix_tmp+0x28]
446 movq m2, [pix_tmp+0x38]
447 movq m3, [pix_tmp+0x48]
448 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
452 %endmacro ; DEBLOCK_LUMA
455 DEBLOCK_LUMA mmxext, v8, 8 ; emits x264_deblock_v8_luma_mmxext and x264_deblock_h_luma_mmxext; tc spill slot at [esp+8], mask at [esp]
457 DEBLOCK_LUMA sse2, v, 16 ; emits x264_deblock_v_luma_sse2 and x264_deblock_h_luma_sse2; tc spill slot at [esp+16], mask at [esp]
465 %macro CHROMA_V_START 0
473 %macro CHROMA_H_START 0
485 ;-----------------------------------------------------------------------------
486 ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
487 ;-----------------------------------------------------------------------------
488 cglobal x264_deblock_v_chroma_mmxext, 5,6
507 ;-----------------------------------------------------------------------------
508 ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
509 ;-----------------------------------------------------------------------------
510 cglobal x264_deblock_h_chroma_mmxext, 5,7
512 %define buf0 [rsp-16]
520 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
533 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
538 ; in: %1=p0 %2=p1 %3=q1
539 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
540 %macro CHROMA_INTRA_P0 3
543 pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
546 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
549 %macro CHROMA_INTRA_BODY 0
553 CHROMA_INTRA_P0 m1, m0, m3
554 CHROMA_INTRA_P0 m2, m3, m0
566 ;-----------------------------------------------------------------------------
567 ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
568 ;-----------------------------------------------------------------------------
569 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5,1
583 ;-----------------------------------------------------------------------------
584 ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
585 ;-----------------------------------------------------------------------------
586 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6,1
588 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
590 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)