1 ;*****************************************************************************
2 ;* predict-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 ;*****************************************************************************
25 ;=============================================================================
26 ; Macros and other preprocessor constants
27 ;=============================================================================
29 %include "amd64inc.asm"
; Store an 8x8 luma block: writes the 8-byte MMX value %1 to rows 0-3 and
; %2 to rows 4-7 of the destination at parm1q (rows FDEC_STRIDE bytes apart).
; NOTE(review): the opening %macro line and the closing %endmacro fall in
; lines elided from this excerpt (embedded original line numbers jump from
; 29 to 32) -- presumably a 2-argument STORE8x8-style macro; confirm against
; the full file before editing.
32 movq [parm1q + 0*FDEC_STRIDE], %1
33 movq [parm1q + 1*FDEC_STRIDE], %1
34 movq [parm1q + 2*FDEC_STRIDE], %1
35 movq [parm1q + 3*FDEC_STRIDE], %1
36 movq [parm1q + 4*FDEC_STRIDE], %2
37 movq [parm1q + 5*FDEC_STRIDE], %2
38 movq [parm1q + 6*FDEC_STRIDE], %2
39 movq [parm1q + 7*FDEC_STRIDE], %2
; Store four 16-byte-wide rows: %1 goes to the left 8 bytes and %2 to the
; right 8 bytes (+8) of rows 1-4, then the destination pointer advances by
; 4 rows.  The 1..4 row offsets plus the trailing lea suggest this is the
; body of a loop iterated 4 times to cover a 16x16 block (a
; STORE16x16-style macro) -- the %macro opener, loop counter/branch and
; %endmacro are elided from this excerpt; verify against the full file.
46 movq [parm1q + 1*FDEC_STRIDE], %1
47 movq [parm1q + 2*FDEC_STRIDE], %1
48 movq [parm1q + 3*FDEC_STRIDE], %1
49 movq [parm1q + 4*FDEC_STRIDE], %1
50 movq [parm1q + 1*FDEC_STRIDE + 8], %2
51 movq [parm1q + 2*FDEC_STRIDE + 8], %2
52 movq [parm1q + 3*FDEC_STRIDE + 8], %2
53 movq [parm1q + 4*FDEC_STRIDE + 8], %2
; advance dest by the 4 rows just written
55 lea parm1q, [parm1q + 4*FDEC_STRIDE]
79 ;=============================================================================
81 ;=============================================================================
; 3-tap (1,2,1)/4 lowpass filter shared by the 8x8 intra prediction modes.
; Args: dest, left, right, src, tmp; the 6th parameter of the base macro
; selects the instruction-suffix width (q = 64-bit MMX forms, dqa = 128-bit
; SSE2 forms), which is how the two wrappers below differ.
; NOTE(review): only one body line of PRED8x8_LOWPASS0 is visible in this
; excerpt and all three %endmacro lines are elided -- the full rounding
; arithmetic (pavgb/pxor-style) must be checked in the complete file.
85 ; dest, left, right, src, tmp
86 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
87 %macro PRED8x8_LOWPASS0 6
; pb_1 = bytes of 0x01; presumably isolates the per-byte rounding LSB --
; TODO(review): confirm against the elided body lines
92 pand %3, [pb_1 GLOBAL]
; 8-pixel variant: expands with 'q' (MMX register) instruction forms.
96 %macro PRED8x8_LOWPASS 5
97 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
; 16-pixel variant: expands with 'dqa' (SSE2 XMM register) forms.
99 %macro PRED8x8_LOWPASS_XMM 5
100 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
104 ;-----------------------------------------------------------------------------
105 ; void predict_4x4_ddl_mmxext( uint8_t *src )
106 ;-----------------------------------------------------------------------------
; H.264 4x4 intra prediction, diagonal-down-left mode (MMXEXT).
; In: parm1q = src, pointing into the decoded-picture buffer; rows are
;     FDEC_STRIDE apart and the row above src holds the top neighbours.
; NOTE(review): several body lines are elided from this excerpt (loads,
; the shift/blend setup, the %rep store loop defining Y, and ret).
107 cglobal predict_4x4_ddl_mmxext
; step back one row to address the top-neighbour pixels
108 sub parm1q, FDEC_STRIDE
; pb_0s_ff: constant mask -- presumably used to replicate the last top
; pixel when building the shifted neighbour vector; confirm in full file
112 movq mm4, [pb_0s_ff GLOBAL]
; mm0 = (left + 2*src + right + 2) >> 2 per byte
117 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
; store one 4-byte row; Y comes from a %assign/%rep elided from this view
122 movd [parm1q+Y*FDEC_STRIDE], mm0
128 ;-----------------------------------------------------------------------------
129 ; void predict_4x4_vl_mmxext( uint8_t *src )
130 ;-----------------------------------------------------------------------------
; H.264 4x4 intra prediction, vertical-left mode (MMXEXT).
; Even rows (0,2) are written from mm4, odd rows (1,3) from the lowpass
; result mm0 -- mm4 is presumably the 2-tap average (t[n]+t[n+1]+1)>>1
; computed in the elided lines 133-139/141/144-145; confirm in full file.
131 cglobal predict_4x4_vl_mmxext
; load the 8 pixels of the row above the block (top neighbours)
132 movq mm1, [parm1q-FDEC_STRIDE]
; mm0 = 3-tap lowpass of the top neighbours
140 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
142 movd [parm1q+0*FDEC_STRIDE], mm4
143 movd [parm1q+1*FDEC_STRIDE], mm0
; rows 2/3 reuse the same vectors shifted by one pixel (shifts elided)
146 movd [parm1q+2*FDEC_STRIDE], mm4
147 movd [parm1q+3*FDEC_STRIDE], mm0
151 ;-----------------------------------------------------------------------------
152 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
153 ;-----------------------------------------------------------------------------
; H.264 8x8 intra prediction, vertical mode: replicate the 8 filtered top
; neighbours into all 8 rows.  parm2q points at the precomputed edge
; buffer; offset +16 appears to hold the 8 top pixels (the same offset is
; used for top sums in the dc variants below) -- confirm against the
; edge-filter routine.  The replicated stores and ret are elided here.
154 cglobal predict_8x8_v_mmxext
155 movq mm0, [parm2q+16]
159 ;-----------------------------------------------------------------------------
160 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
161 ;-----------------------------------------------------------------------------
; H.264 8x8 intra prediction, DC mode: fill the block with
; (sum(left) + sum(top) + 8) >> 4.  psadbw against a zeroed register
; yields a byte-sum; mm0/mm1 are presumably zeroed in the elided lines
; 163-164.  The >>4, broadcast, stores and ret are also elided.
162 cglobal predict_8x8_dc_mmxext
; sum of the 8 left neighbours (edge+7..+14 appears to be the left column)
165 psadbw mm0, [parm2q+7]
; sum of the 8 top neighbours (edge+16..+23)
166 psadbw mm1, [parm2q+16]
; +8 rounding term for the (sum16 + 8) >> 4 average
167 paddw mm0, [pw_8 GLOBAL]
175 ;-----------------------------------------------------------------------------
176 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
177 ;-----------------------------------------------------------------------------
; DC mode using only the 8 top neighbours: (sum(top) + 4) >> 3.
; mm0 is presumably zeroed in elided line 179 (psadbw computes a byte-sum
; against zero); the shift, broadcast, stores and ret are elided too.
178 cglobal predict_8x8_dc_top_mmxext
180 psadbw mm0, [parm2q+16]
; +4 rounding term for the (sum8 + 4) >> 3 average
181 paddw mm0, [pw_4 GLOBAL]
188 ;-----------------------------------------------------------------------------
189 ; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
190 ;-----------------------------------------------------------------------------
; DC mode using only the 8 left neighbours: (sum(left) + 4) >> 3.
; Mirrors predict_8x8_dc_top above but reads edge+7 (left column).
; mm0 zeroing, shift, broadcast, stores and ret are elided in this excerpt.
191 cglobal predict_8x8_dc_left_mmxext
193 psadbw mm0, [parm2q+7]
194 paddw mm0, [pw_4 GLOBAL]
201 ;-----------------------------------------------------------------------------
202 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
203 ;-----------------------------------------------------------------------------
; H.264 8x8 intra prediction, diagonal-down-left (MMXEXT): filters the 16
; top/top-right neighbours (edge+16..+31) in two 8-pixel halves, then
; writes the 8 rows as sliding 8-byte windows over the filtered result.
; NOTE(review): the shifting/merging between stores and the %assign/%rep
; machinery that defines Y are elided from this excerpt.
204 cglobal predict_8x8_ddl_mmxext
205 movq mm5, [parm2q+16]
206 movq mm2, [parm2q+17]
207 movq mm3, [parm2q+23]
208 movq mm4, [parm2q+25]
; first filtered half: mm0 = lowpass(t0..t7 region)
211 PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
; second filtered half: mm1 = lowpass(t8..t15 region), src read directly
212 PRED8x8_LOWPASS mm1, mm3, mm4, [parm2q+24], mm6
; three store sites from what appear to be separate %rep expansions
216 movq [parm1q+Y*FDEC_STRIDE], mm1
224 movq [parm1q+Y*FDEC_STRIDE], mm1
229 movq [parm1q+Y*FDEC_STRIDE], mm1
233 ;-----------------------------------------------------------------------------
234 ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
235 ;-----------------------------------------------------------------------------
; SSE2 version of diagonal-down-left: one 16-byte lowpass over the 16
; top/top-right neighbours, then each row is an 8-byte window shifted one
; pixel further (the per-row shifts, the %rep defining Y, and ret are
; elided).  movdqa is safe at +16 (edge buffer aligned); +17 needs movdqu.
236 cglobal predict_8x8_ddl_sse2
237 movdqa xmm3, [parm2q+16]
238 movdqu xmm2, [parm2q+17]
; xmm0 = (left + 2*src + right + 2) >> 2 for all 16 neighbours at once
241 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
246 movq [parm1q+Y*FDEC_STRIDE], xmm0
251 ;-----------------------------------------------------------------------------
252 ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
253 ;-----------------------------------------------------------------------------
; H.264 8x8 intra prediction, diagonal-down-right (SSE2): lowpass over the
; left+topleft+top span of the edge buffer (unaligned loads at +7/+8),
; then rows are written bottom-up in pairs from sliding windows.
; NOTE(review): the loads of xmm2, the inter-store shifts, the %rep/%assign
; defining Y, and ret are elided from this excerpt.
254 cglobal predict_8x8_ddr_sse2
255 movdqu xmm3, [parm2q+8]
256 movdqu xmm1, [parm2q+7]
259 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
; paired stores per loop step: row Y from xmm0, row Y-1 from xmm1
265 movq [parm1q+Y*FDEC_STRIDE], xmm0
266 movq [parm1q+(Y-1)*FDEC_STRIDE], xmm1
; final two rows written outside the loop
271 movq [parm1q+1*FDEC_STRIDE], xmm0
272 movq [parm1q+0*FDEC_STRIDE], xmm1
276 ;-----------------------------------------------------------------------------
277 ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
278 ;-----------------------------------------------------------------------------
; H.264 8x8 intra prediction, vertical-left (SSE2).  Per the original
; comments below, xmm3 holds the 2-tap averages and xmm0 the 3-tap lowpass
; of the top neighbours; rows alternate between the two, shifting one
; pixel every two rows.  The avg computation (elided lines 281-286), the
; shifts between store pairs, the Y definition and ret are not visible.
279 cglobal predict_8x8_vl_sse2
; 16 top/top-right neighbours from the aligned edge buffer
280 movdqa xmm4, [parm2q+16]
287 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
288 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
289 ; xmm3: (t0 + t1 + 1) >> 1
; even row from the averages, odd row from the lowpass (two store sites,
; presumably from a %rep and its tail)
294 movq [parm1q+ Y *FDEC_STRIDE], xmm3
295 movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0
300 movq [parm1q+ Y *FDEC_STRIDE], xmm3
301 movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0
305 ;-----------------------------------------------------------------------------
306 ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
307 ;-----------------------------------------------------------------------------
; H.264 8x8 intra prediction, vertical-right -- core only: per the
; original note this routine fills only part of the block (the remaining
; pixels are presumably completed by the C caller; the elided diagram at
; orig. lines 310-319 documented exactly which).  Even rows come from mm3
; (computed in elided lines, presumably the 2-tap average), odd rows from
; the 3-tap lowpass mm0.
309 ; fills only some pixels:
320 cglobal predict_8x8_vr_core_mmxext
; t0.. at edge+16, with the windows one and two pixels to the left
321 movq mm2, [parm2q+16]
322 movq mm3, [parm2q+15]
323 movq mm1, [parm2q+14]
326 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
; alternating row pairs; Y and the inter-store shifts are elided
330 movq [parm1q+ Y *FDEC_STRIDE], mm3
331 movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
336 movq [parm1q+ Y *FDEC_STRIDE], mm3
337 movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
341 ;-----------------------------------------------------------------------------
342 ; void predict_8x8c_v_mmx( uint8_t *src )
343 ;-----------------------------------------------------------------------------
; 8x8 chroma intra prediction, vertical mode: load the 8 pixels of the row
; above the block and replicate them into all 8 rows.  The replicated
; stores (presumably via the 8x8 store macro at the top of this file) and
; ret are elided from this excerpt.
344 cglobal predict_8x8c_v_mmx
345 movq mm0, [parm1q - FDEC_STRIDE]
349 ;-----------------------------------------------------------------------------
350 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
351 ;-----------------------------------------------------------------------------
; 8x8 chroma DC prediction core: computes four 4x4 DC values (dc0..dc3,
; one per quadrant).  s2/s3 are partial neighbour sums supplied by the C
; caller; the elided lines 354-365 presumably combine them with psadbw
; sums of the top row loaded here -- confirm against the full file.
; dc0/dc3 use both top and left sums (>>3), dc1/dc2 only one side (>>2),
; matching the shift amounts below; the final quadrant stores are elided.
352 cglobal predict_8x8c_dc_core_mmxext
; 8 top-neighbour pixels
353 movq mm0, [parm1q - FDEC_STRIDE]
; +2 rounding for the (sum4 + 2) >> 2 cases
366 paddw mm1, [pw_2 GLOBAL]
; broadcast dc0 across all 4 words
369 pshufw mm0, mm0, 0 ; dc0 (w)
370 ; (elided lines 370, 374 between the shifts/packs below)
371 psrlw mm3, 3 ; dc3 (w)
372 psrlw mm2, 2 ; dc2 (w)
373 psrlw mm1, 2 ; dc1 (w)
; pack word DCs to bytes: top half of the block uses dc0|dc1, bottom dc2|dc3
375 packuswb mm0, mm1 ; dc0,dc1 (b)
376 packuswb mm2, mm3 ; dc2,dc3 (b)
381 ;-----------------------------------------------------------------------------
382 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
383 ;-----------------------------------------------------------------------------
; 8x8 chroma planar prediction core: pixel(x,y) = clip((i00 + b*x + c*y)>>5).
; mm2 is presumably b broadcast to 4 words (setup elided); multiplying by
; pw_3210 = {0,1,2,3} gives the per-column b*x terms, accumulated below.
; The per-row add of c, the >>5 + pack + store loop body (elided lines
; 396-408, 410-415) and ret are not visible in this excerpt.
384 cglobal predict_8x8c_p_core_mmxext
; {0*b, 1*b, 2*b, 3*b}
392 pmullw mm2, [pw_3210 GLOBAL]
; saturating adds build the two 4-pixel row halves
394 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
395 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
; advance to the next row inside the (elided) per-row loop
409 add parm1q, FDEC_STRIDE
416 ;-----------------------------------------------------------------------------
417 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
418 ;-----------------------------------------------------------------------------
; 16x16 luma planar prediction core; same scheme as the 8x8c version above
; but with four 4-word accumulators covering the 16 columns.  mm5 is
; presumably b broadcast to 4 words (setup elided); the per-row add of c,
; the >>5 + pack + store loop body and ret are elided from this excerpt.
419 cglobal predict_16x16_p_core_mmxext
; {0*b, 1*b, 2*b, 3*b}
428 pmullw mm5, [pw_3210 GLOBAL]
; build the four 4-pixel column groups by repeated saturating adds
432 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
433 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
434 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
435 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
; advance to the next row inside the (elided) per-row loop
458 add parm1q, FDEC_STRIDE
465 ;-----------------------------------------------------------------------------
466 ; void predict_16x16_v_mmx( uint8_t *src )
467 ;-----------------------------------------------------------------------------
; 16x16 luma intra prediction, vertical mode: replicate the 16 pixels of
; the row above into all 16 rows.  The left half load into mm0 (elided
; line 470), the replicated stores (presumably via the 16-wide store macro
; at the top of this file) and ret are not visible in this excerpt.
468 cglobal predict_16x16_v_mmx
; address the row of top neighbours
469 sub parm1q, FDEC_STRIDE
; right 8 of the 16 top pixels (left 8 loaded in the elided line)
471 movq mm1, [parm1q + 8]
475 ;-----------------------------------------------------------------------------
476 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
477 ;-----------------------------------------------------------------------------
; Shared 16x16 DC helper: %1 = rounding constant (memory operand), %2 =
; right-shift amount.  Sums the 16 top-neighbour pixels with psadbw (the
; left-half sum, register zeroing, the shift by %2, broadcast, stores and
; ret are in lines elided from this excerpt), then packs the word DC to
; replicated bytes.
479 %macro PRED16x16_DC 2
480 sub parm1q, FDEC_STRIDE
; byte-sum of the right 8 top pixels (left 8 summed in an elided line)
485 psadbw mm1, [parm1q + 8]
490 packuswb mm0, mm0 ; dc in bytes
; top+left DC: body elided (orig. lines 496-499); presumably invokes
; PRED16x16_DC with i_dc_left folded in -- confirm against the full file
495 cglobal predict_16x16_dc_core_mmxext
; top-only DC: (sum(top16) + 8) >> 4
500 cglobal predict_16x16_dc_top_mmxext
501 PRED16x16_DC [pw_8 GLOBAL], 4