1 ;*****************************************************************************
2 ;* predict-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 ;*****************************************************************************
25 ;=============================================================================
26 ; Macros and other preprocessor constants
27 ;=============================================================================
29 %include "amd64inc.asm"
; NOTE(review): this looks like the body of an 8x8 store macro (the %macro
; header and %endmacro are missing from this extract — confirm in full file).
; Stores %1 into rows 0-3 and %2 into rows 4-7 of the 8x8 block at parm1q.
; parm1q is presumably the first integer argument register per amd64inc.asm.
32 movq [parm1q + 0*FDEC_STRIDE], %1
33 movq [parm1q + 1*FDEC_STRIDE], %1
34 movq [parm1q + 2*FDEC_STRIDE], %1
35 movq [parm1q + 3*FDEC_STRIDE], %1
36 movq [parm1q + 4*FDEC_STRIDE], %2
37 movq [parm1q + 5*FDEC_STRIDE], %2
38 movq [parm1q + 6*FDEC_STRIDE], %2
39 movq [parm1q + 7*FDEC_STRIDE], %2
; NOTE(review): body of a 16-byte-wide store macro (header/%endmacro missing
; from this extract). Writes %1 to the left 8 bytes and %2 to the right 8
; bytes of rows 1-4, then advances parm1q by four rows so a repeated
; invocation covers the next four rows.
46 movq [parm1q + 1*FDEC_STRIDE], %1
47 movq [parm1q + 2*FDEC_STRIDE], %1
48 movq [parm1q + 3*FDEC_STRIDE], %1
49 movq [parm1q + 4*FDEC_STRIDE], %1
50 movq [parm1q + 1*FDEC_STRIDE + 8], %2
51 movq [parm1q + 2*FDEC_STRIDE + 8], %2
52 movq [parm1q + 3*FDEC_STRIDE + 8], %2
53 movq [parm1q + 4*FDEC_STRIDE + 8], %2
; Advance the destination pointer by 4 rows.
55 lea parm1q, [parm1q + 4*FDEC_STRIDE]
61 SECTION .rodata align=16
80 ;=============================================================================
82 ;=============================================================================
86 ; dest, left, right, src, tmp
87 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; NOTE(review): the macro bodies below are truncated in this extract — the
; %endmacro lines and most of PRED8x8_LOWPASS0's instructions are missing;
; consult the full file before editing.
; Per the header comment above: computes %1 = (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2
; with args (dest, left, right, src, tmp) and a 6th arg selecting mov size.
88 %macro PRED8x8_LOWPASS0 6
; Keep only bit 0 of each byte of %3; pb_1 is presumably a vector of byte 1s
; defined in the (not visible) .rodata block — confirm.
93 pand %3, [pb_1 GLOBAL]
; MMX wrapper: instantiates the lowpass with "q" (movq-sized) operations.
97 %macro PRED8x8_LOWPASS 5
98 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
; SSE2 wrapper: instantiates the lowpass with "dqa" (movdqa-sized) operations.
100 %macro PRED8x8_LOWPASS_XMM 5
101 PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
105 ;-----------------------------------------------------------------------------
106 ; void predict_4x4_ddl_mmxext( uint8_t *src )
107 ;-----------------------------------------------------------------------------
; NOTE(review): the embedded numbering jumps (109→113→118→123), so several
; instructions of this function (and its tail) are missing from this extract.
108 cglobal predict_4x4_ddl_mmxext
; Step back one row so parm1q points at the pixels above the 4x4 block.
109 sub parm1q, FDEC_STRIDE
113 movq mm4, [pb_0s_ff GLOBAL]
; Lowpass filter the top edge: mm0 = (left + 2*mid + right + 2) >> 2.
118 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
; Y is presumably set by a %assign in the missing lines — confirm.
123 movd [parm1q+Y*FDEC_STRIDE], mm0
129 ;-----------------------------------------------------------------------------
130 ; void predict_4x4_vl_mmxext( uint8_t *src )
131 ;-----------------------------------------------------------------------------
; NOTE(review): numbering jumps (133→141, 144→147) show missing instructions;
; the visible lines are not the complete function.
132 cglobal predict_4x4_vl_mmxext
; Load the row above the 4x4 block (top edge pixels).
133 movq mm1, [parm1q-FDEC_STRIDE]
141 PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
; Rows alternate between the averaged (mm4) and lowpassed (mm0) edge values,
; presumably shifted between stores in the missing lines — confirm.
143 movd [parm1q+0*FDEC_STRIDE], mm4
144 movd [parm1q+1*FDEC_STRIDE], mm0
147 movd [parm1q+2*FDEC_STRIDE], mm4
148 movd [parm1q+3*FDEC_STRIDE], mm0
152 ;-----------------------------------------------------------------------------
153 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
154 ;-----------------------------------------------------------------------------
; NOTE(review): only the edge load is visible; the stores and ret are missing
; from this extract.
155 cglobal predict_8x8_v_mmxext
; [parm2q+16] presumably holds the filtered top-edge pixels of the edge
; buffer (same offset used by the dc/ddl variants below) — confirm layout.
156 movq mm0, [parm2q+16]
160 ;-----------------------------------------------------------------------------
161 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
162 ;-----------------------------------------------------------------------------
; NOTE(review): loads of mm0/mm1 and the tail (average, broadcast, stores)
; are missing from this extract.
163 cglobal predict_8x8_dc_mmxext
; Sum 8 left-edge bytes ([parm2q+7]) and 8 top-edge bytes ([parm2q+16])
; via sum-of-absolute-differences against (presumably) zeroed registers.
166 psadbw mm0, [parm2q+7]
167 psadbw mm1, [parm2q+16]
; +8 for rounding before the (missing) shift that divides by 16.
168 paddw mm0, [pw_8 GLOBAL]
176 ;-----------------------------------------------------------------------------
177 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
178 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — the zeroing of mm0, the shift, broadcast and
; stores are missing from this extract.
179 cglobal predict_8x8_dc_top_mmxext
; Sum the 8 top-edge bytes; +4 rounds before the (missing) >>3.
181 psadbw mm0, [parm2q+16]
182 paddw mm0, [pw_4 GLOBAL]
189 ;-----------------------------------------------------------------------------
190 ; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
191 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — mirror of dc_top but summing the 8 left-edge
; bytes at [parm2q+7]; the remaining instructions are missing here.
192 cglobal predict_8x8_dc_left_mmxext
194 psadbw mm0, [parm2q+7]
; +4 rounds before the (missing) >>3.
195 paddw mm0, [pw_4 GLOBAL]
202 ;-----------------------------------------------------------------------------
203 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
204 ;-----------------------------------------------------------------------------
; NOTE(review): numbering jumps (209→212, 213→217, 217→225→230) show missing
; shift/loop scaffolding (Y is presumably maintained by missing %assign or
; loop code — confirm).
205 cglobal predict_8x8_ddl_mmxext
; Load overlapping windows of the top/top-right edge pixels.
206 movq mm5, [parm2q+16]
207 movq mm2, [parm2q+17]
208 movq mm3, [parm2q+23]
209 movq mm4, [parm2q+25]
; Two lowpass passes cover the 15 diagonal-down-left output pixels.
212 PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
213 PRED8x8_LOWPASS mm1, mm3, mm4, [parm2q+24], mm6
217 movq [parm1q+Y*FDEC_STRIDE], mm1
225 movq [parm1q+Y*FDEC_STRIDE], mm1
230 movq [parm1q+Y*FDEC_STRIDE], mm1
234 ;-----------------------------------------------------------------------------
235 ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
236 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — the third lowpass input, the per-row shift/store
; loop and ret are missing from this extract.
237 cglobal predict_8x8_ddl_sse2
; Aligned load of edge[16..31]; unaligned load of the same data shifted by
; one byte (edge[17..32]) for the neighbor taps.
238 movdqa xmm3, [parm2q+16]
239 movdqu xmm2, [parm2q+17]
242 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
; Y is presumably maintained by missing loop scaffolding — confirm.
247 movq [parm1q+Y*FDEC_STRIDE], xmm0
252 ;-----------------------------------------------------------------------------
253 ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
254 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — shift instructions between the stores are missing
; (numbering jumps 257→260, 267→272), so the visible stores are only part of
; the row loop.
255 cglobal predict_8x8_ddr_sse2
; Unaligned loads of edge windows offset by one byte for the lowpass taps.
256 movdqu xmm3, [parm2q+8]
257 movdqu xmm1, [parm2q+7]
260 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
; Store two rows per iteration; Y presumably decremented by missing code.
266 movq [parm1q+Y*FDEC_STRIDE], xmm0
267 movq [parm1q+(Y-1)*FDEC_STRIDE], xmm1
; Final two rows outside the loop.
272 movq [parm1q+1*FDEC_STRIDE], xmm0
273 movq [parm1q+0*FDEC_STRIDE], xmm1
277 ;-----------------------------------------------------------------------------
278 ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
279 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — the setup of xmm1/xmm2/xmm3 (including the
; averaging that produces xmm3) and the shift/loop scaffolding are missing.
280 cglobal predict_8x8_vl_sse2
; Aligned load of the top-edge pixels from the edge buffer.
281 movdqa xmm4, [parm2q+16]
288 PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
289 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
290 ; xmm3: (t0 + t1 + 1) >> 1
; Even rows take the 2-tap average (xmm3), odd rows the 3-tap lowpass (xmm0);
; Y presumably advanced by missing loop code — confirm.
295 movq [parm1q+ Y *FDEC_STRIDE], xmm3
296 movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0
301 movq [parm1q+ Y *FDEC_STRIDE], xmm3
302 movq [parm1q+(Y+1)*FDEC_STRIDE], xmm0
306 ;-----------------------------------------------------------------------------
307 ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
308 ;-----------------------------------------------------------------------------
310 ; fills only some pixels:
; NOTE(review): per the comment above this core intentionally fills only part
; of the block (the caller presumably completes it in C — confirm). The
; diagram lines and shift/loop scaffolding are missing from this extract.
321 cglobal predict_8x8_vr_core_mmxext
; Three overlapping windows around the top-left corner of the edge buffer.
322 movq mm2, [parm2q+16]
323 movq mm3, [parm2q+15]
324 movq mm1, [parm2q+14]
327 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
; Alternate rows take mm3 (presumably the 2-tap average built in missing
; lines) and mm0 (3-tap lowpass); Y maintained by missing loop code.
331 movq [parm1q+ Y *FDEC_STRIDE], mm3
332 movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
337 movq [parm1q+ Y *FDEC_STRIDE], mm3
338 movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
342 ;-----------------------------------------------------------------------------
343 ; void predict_8x8c_v_mmx( uint8_t *src )
344 ;-----------------------------------------------------------------------------
; NOTE(review): only the load of the row above the chroma block is visible;
; the 8 row stores (presumably via a STORE8x8-style macro) are missing.
345 cglobal predict_8x8c_v_mmx
346 movq mm0, [parm1q - FDEC_STRIDE]
350 ;-----------------------------------------------------------------------------
351 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
352 ;-----------------------------------------------------------------------------
; NOTE(review): heavily truncated — the computation of mm2/mm3 and the sums
; feeding mm1 (from the s2/s3 args and edge pixels) is missing; only the
; rounding/normalization and packing of the four per-quadrant DC values is
; visible here.
353 cglobal predict_8x8c_dc_core_mmxext
; Top row of the chroma block (pixels above src).
354 movq mm0, [parm1q - FDEC_STRIDE]
; Round dc1 before its >>2 below.
367 paddw mm1, [pw_2 GLOBAL]
; Broadcast dc0 across all four words.
370 pshufw mm0, mm0, 0 ; dc0 (w)
372 psrlw mm3, 3 ; dc3 (w)
373 psrlw mm2, 2 ; dc2 (w)
374 psrlw mm1, 2 ; dc1 (w)
; Pack word DCs to bytes: top half uses dc0|dc1, bottom half dc2|dc3.
376 packuswb mm0, mm1 ; dc0,dc1 (b)
377 packuswb mm2, mm3 ; dc2,dc3 (b)
382 ;-----------------------------------------------------------------------------
383 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
384 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — the broadcast of i00/b/c into registers and the
; per-row store loop are missing from this extract.
385 cglobal predict_8x8c_p_core_mmxext
; Scale the horizontal gradient by column index {3,2,1,0} (pw_3210).
393 pmullw mm2, [pw_3210 GLOBAL]
; Build the plane-prediction row: i + col*b for columns 0-3 and 4-7.
395 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
396 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
; Advance to the next row (c added per row in the missing loop — confirm).
410 add parm1q, FDEC_STRIDE
417 ;-----------------------------------------------------------------------------
418 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
419 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — register setup and the 16-row store loop are
; missing; only the initial row construction is visible.
420 cglobal predict_16x16_p_core_mmxext
; Scale the horizontal gradient by column index {3,2,1,0}.
429 pmullw mm5, [pw_3210 GLOBAL]
; Build i + col*b for the 16 columns, four words per register.
433 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
434 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
435 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
436 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
; Advance to the next row (c added per row in the missing loop — confirm).
459 add parm1q, FDEC_STRIDE
466 ;-----------------------------------------------------------------------------
467 ; void predict_16x16_v_mmx( uint8_t *src )
468 ;-----------------------------------------------------------------------------
; NOTE(review): truncated — the load of the left 8 bytes and the 16 row
; stores are missing from this extract.
469 cglobal predict_16x16_v_mmx
; Point at the row above the 16x16 block; mm1 gets its right 8 bytes.
470 sub parm1q, FDEC_STRIDE
472 movq mm1, [parm1q + 8]
476 ;-----------------------------------------------------------------------------
477 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
478 ;-----------------------------------------------------------------------------
; NOTE(review): macro truncated — %endmacro and most of the body (zeroing,
; first psadbw, the add of %1, shift by %2, broadcast and stores) are missing.
; Args appear to be: %1 = rounding constant, %2 = shift amount.
480 %macro PRED16x16_DC 2
; Point at the row above the block; sum its right 8 bytes into mm1.
481 sub parm1q, FDEC_STRIDE
486 psadbw mm1, [parm1q + 8]
491 packuswb mm0, mm0 ; dc in bytes
496 cglobal predict_16x16_dc_core_mmxext
501 cglobal predict_16x16_dc_top_mmxext
502 PRED16x16_DC [pw_8 GLOBAL], 4