1 ;*****************************************************************************
2 ;* predict-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 ;*****************************************************************************
25 ;=============================================================================
26 ; Macros and other preprocessor constants
27 ;=============================================================================
29 %include "i386inc.asm"
; Write eight 8-byte rows into the block addressed by edx, rows being
; FDEC_STRIDE bytes apart: rows 0-3 receive %1, rows 4-7 receive %2.
; NOTE(review): FDEC_STRIDE is defined outside this file section
; (presumably via i386inc.asm) - confirm.
32 movq [edx + 0*FDEC_STRIDE], %1
33 movq [edx + 1*FDEC_STRIDE], %1
34 movq [edx + 2*FDEC_STRIDE], %1
35 movq [edx + 3*FDEC_STRIDE], %1
36 movq [edx + 4*FDEC_STRIDE], %2
37 movq [edx + 5*FDEC_STRIDE], %2
38 movq [edx + 6*FDEC_STRIDE], %2
39 movq [edx + 7*FDEC_STRIDE], %2
66 ;=============================================================================
68 ;=============================================================================
; Declare the prediction entry points of this file.
; NOTE(review): cglobal is not defined here; presumably a macro from
; i386inc.asm that marks the symbol global - confirm.
72 cglobal predict_8x8_v_mmxext
73 cglobal predict_8x8_dc_mmxext
74 cglobal predict_8x8_dc_top_mmxext
75 cglobal predict_8x8_dc_left_mmxext
76 cglobal predict_8x8_ddl_mmxext
77 cglobal predict_8x8_ddr_mmxext
78 cglobal predict_8x8_vr_core_mmxext
79 cglobal predict_8x8c_v_mmx
80 cglobal predict_8x8c_dc_core_mmxext
81 cglobal predict_8x8c_p_core_mmxext
82 cglobal predict_16x16_p_core_mmxext
83 cglobal predict_16x16_v_mmx
84 cglobal predict_16x16_dc_core_mmxext
85 cglobal predict_16x16_dc_top_mmxext
88 ; dest, left, right, src, tmp
; (the five macro parameters, in order: %1=dest, %2=left, %3=right, %4=src, %5=tmp)
89 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
90 %macro PRED8x8_LOWPASS 5
95 pand %3, [pb_1 GOT_ebx] ; mask %3 with the pb_1 constant (presumably 0x01 in every byte - confirm)
101 ;-----------------------------------------------------------------------------
102 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
103 ;-----------------------------------------------------------------------------
106 predict_8x8_v_mmxext:
113 ;-----------------------------------------------------------------------------
114 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge )
115 ;-----------------------------------------------------------------------------
118 predict_8x8_dc_mmxext:
121 mov eax, [picesp + 8] ; eax = edge (2nd argument; picesp presumably compensates for the PIC register push)
122 mov edx, [picesp + 4] ; edx = src (1st argument)
127 paddw mm0, [pw_8 GOT_ebx] ; word-wise add of pw_8 (presumably the rounding term ahead of the final shift - confirm)
136 ;-----------------------------------------------------------------------------
137 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge )
; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge )
138 ;-----------------------------------------------------------------------------
144 mov eax, [picesp + 8] ; eax = edge (2nd argument)
145 mov edx, [picesp + 4] ; edx = src (1st argument)
148 paddw mm0, [pw_4 GOT_ebx] ; word-wise add of pw_4 (presumably the rounding term ahead of the final shift - confirm)
; Instantiate the two variants. The first macro argument is the generated
; symbol name; the second (16 vs 7) is presumably a byte offset into edge -
; confirm against the macro body.
157 PRED8x8_DC predict_8x8_dc_top_mmxext, 16
158 PRED8x8_DC predict_8x8_dc_left_mmxext, 7
160 ;-----------------------------------------------------------------------------
161 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
162 ;-----------------------------------------------------------------------------
165 predict_8x8_ddl_mmxext:
168 mov eax, [picesp + 8] ; eax = edge (2nd argument)
169 mov edx, [picesp + 4] ; edx = src (1st argument)
174 PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 16], mm7 ; mm0 = filtered bytes at edge+16 (neighbours in mm1/mm2, mm7 scratch)
175 PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 24], mm6 ; mm1 = filtered bytes at edge+24 (neighbours in mm3/mm4, mm6 scratch)
; Each store below writes the current 8 bytes of mm1 to row Y of the block.
; NOTE(review): Y is assigned outside the lines shown - confirm its sequence.
179 movq [edx + Y*FDEC_STRIDE], mm1
187 movq [edx + Y*FDEC_STRIDE], mm1
192 movq [edx + Y*FDEC_STRIDE], mm1
197 ;-----------------------------------------------------------------------------
198 ; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
199 ;-----------------------------------------------------------------------------
202 predict_8x8_ddr_mmxext:
205 mov eax, [picesp + 8] ; eax = edge (2nd argument)
206 mov edx, [picesp + 4] ; edx = src (1st argument)
211 PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 8], mm7 ; mm0 = filtered bytes at edge+8 (neighbours in mm1/mm2, mm7 scratch)
212 PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 16], mm6 ; mm1 = filtered bytes at edge+16 (neighbours in mm3/mm4, mm6 scratch)
; Each store below writes the current 8 bytes of mm0 to row Y of the block.
; NOTE(review): Y is assigned outside the lines shown - confirm its sequence.
216 movq [edx + Y*FDEC_STRIDE], mm0
224 movq [edx + Y*FDEC_STRIDE], mm0
229 movq [edx + Y*FDEC_STRIDE], mm0
234 ;-----------------------------------------------------------------------------
235 ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
236 ;-----------------------------------------------------------------------------
238 ; fills only some pixels:
250 predict_8x8_vr_core_mmxext:
253 mov eax, [picesp + 8] ; eax = edge (2nd argument)
254 mov edx, [picesp + 4] ; edx = src (1st argument)
260 PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7 ; mm0 = filtered mm4 (neighbours in mm1/mm2, mm7 scratch)
; Rows are written in pairs: row Y takes mm3 and row Y+1 takes mm0.
; NOTE(review): Y is assigned outside the lines shown - confirm its sequence.
264 movq [edx + Y *FDEC_STRIDE], mm3
265 movq [edx + (Y+1)*FDEC_STRIDE], mm0
270 movq [edx + Y *FDEC_STRIDE], mm3
271 movq [edx + (Y+1)*FDEC_STRIDE], mm0
276 ;-----------------------------------------------------------------------------
277 ; void predict_8x8c_v_mmx( uint8_t *src )
278 ;-----------------------------------------------------------------------------
283 movq mm0, [edx - FDEC_STRIDE] ; mm0 = 8 bytes of the row one FDEC_STRIDE before edx (edx presumably = src)
287 ;-----------------------------------------------------------------------------
288 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
289 ;-----------------------------------------------------------------------------
292 predict_8x8c_dc_core_mmxext:
296 mov edx, [picesp + 4] ; edx = src (1st argument)
298 movq mm0, [edx - FDEC_STRIDE] ; mm0 = 8 bytes of the row directly above the block
306 paddw mm0, [picesp + 8] ; word-wise add of the 8 bytes at picesp+8 (the s2 slot and the slot after it)
307 pshufw mm2, [picesp + 12], 0 ; broadcast the low 16 bits of s3 to all four words of mm2
309 paddw mm1, [pw_2 GOT_ebx] ; word-wise add of pw_2 (presumably the rounding term for the >>2 below - confirm)
312 pshufw mm0, mm0, 0 ; dc0 (w)
314 psrlw mm3, 3 ; dc3 (w)
315 psrlw mm2, 2 ; dc2 (w)
316 psrlw mm1, 2 ; dc1 (w)
; Narrow the word values to bytes with unsigned saturation, two DCs per register.
318 packuswb mm0, mm1 ; dc0,dc1 (b)
319 packuswb mm2, mm3 ; dc2,dc3 (b)
326 ;-----------------------------------------------------------------------------
327 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
328 ;-----------------------------------------------------------------------------
331 predict_8x8c_p_core_mmxext:
335 mov edx, [picesp + 4] ; edx = src (1st argument)
337 pshufw mm0, [picesp + 8], 0 ; mm0 = low 16 bits of i00 in all four words
338 pshufw mm2, [picesp +12], 0 ; mm2 = low 16 bits of b in all four words
339 pshufw mm4, [picesp +16], 0 ; mm4 = low 16 bits of c in all four words
341 pmullw mm2, [pw_3210 GOT_ebx] ; per-word multiply by pw_3210 (presumably the words 0,1,2,3 - see result below)
; paddsw is a signed saturating word add.
343 paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
344 paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
366 ;-----------------------------------------------------------------------------
367 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
368 ;-----------------------------------------------------------------------------
371 predict_16x16_p_core_mmxext:
376 mov edx, [picesp + 4] ; edx = src (1st argument)
378 pshufw mm0, [picesp + 8], 0 ; mm0 = low 16 bits of i00 in all four words
379 pshufw mm2, [picesp +12], 0 ; mm2 = low 16 bits of b in all four words
380 pshufw mm4, [picesp +16], 0 ; mm4 = low 16 bits of c in all four words
383 pmullw mm5, [pw_3210 GOT_ebx] ; per-word multiply by pw_3210 (presumably the words 0,1,2,3; mm5 presumably a copy of b)
; Build the 16 per-column values of one row in mm0..mm3 with signed
; saturating adds; each later register reuses an earlier partial sum.
387 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
388 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
389 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
390 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
421 ;-----------------------------------------------------------------------------
422 ; void predict_16x16_v_mmx( uint8_t *src )
423 ;-----------------------------------------------------------------------------
; Row addressing: ecx holds the stride and eax = 3*stride, so with edx as a
; moving base every row is reachable through one scaled-index address.
; The number after each SAVE_0_1 is the row it targets; rows 0..15 are each
; written exactly once, though not in ascending order.
; NOTE(review): SAVE_0_1 is defined elsewhere; presumably it stores mm0 and
; mm1 (16 bytes) at the given address - confirm.
426 predict_16x16_v_mmx :
430 sub edx, ecx ; edx <-- line -1
434 lea eax, [ecx + 2*ecx] ; eax <-- 3* stride
436 SAVE_0_1 (edx + ecx) ; 0
437 SAVE_0_1 (edx + 2 * ecx) ; 1
438 SAVE_0_1 (edx + eax) ; 2
439 SAVE_0_1 (edx + 4 * ecx) ; 3
440 SAVE_0_1 (edx + 2 * eax) ; 5
441 SAVE_0_1 (edx + 8 * ecx) ; 7
442 SAVE_0_1 (edx + 4 * eax) ; 11
443 add edx, ecx ; edx <-- line 0
444 SAVE_0_1 (edx + 4 * ecx) ; 4
445 SAVE_0_1 (edx + 2 * eax) ; 6
446 SAVE_0_1 (edx + 8 * ecx) ; 8
447 SAVE_0_1 (edx + 4 * eax) ; 12
448 lea edx, [edx + 8 * ecx] ; edx <-- line 8
449 SAVE_0_1 (edx + ecx) ; 9
450 SAVE_0_1 (edx + 2 * ecx) ; 10
451 lea edx, [edx + 4 * ecx] ; edx <-- line 12
452 SAVE_0_1 (edx + ecx) ; 13
453 SAVE_0_1 (edx + 2 * ecx) ; 14
454 SAVE_0_1 (edx + eax) ; 15
458 ;-----------------------------------------------------------------------------
459 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
460 ;-----------------------------------------------------------------------------
; PRED16x16_DC parameters:
;   %1 = operand added to the sum accumulated in mm0
;   %2 = immediate (5 for dc_core, 4 for dc_top), presumably the final shift count - confirm
;   %3 = stack base register (esp or picesp), presumably used to fetch src - confirm
462 %macro PRED16x16_DC 3
465 sub edx, ecx ; edx <-- line -1
470 psadbw mm1, [edx + 8] ; SAD of mm1 against bytes 8..15 of line -1 (a plain byte sum if mm1 was zeroed first - presumably)
472 paddusw mm0, %1 ; FIXME is stack alignment guaranteed?
476 lea eax, [ecx + 2*ecx] ; eax <-- 3* stride
477 packuswb mm0, mm0 ; dc in bytes
482 SAVE_0_0 (edx + ecx) ; 0
483 SAVE_0_0 (edx + 2 * ecx) ; 1
484 SAVE_0_0 (edx + eax) ; 2
485 SAVE_0_0 (edx + 4 * ecx) ; 3
487 lea edx, [edx + 4 * ecx] ; advance the base by four rows
494 predict_16x16_dc_core_mmxext:
495 PRED16x16_DC [esp+8], 5, esp ; %1 = i_dc_left (2nd argument)
499 predict_16x16_dc_top_mmxext:
502 PRED16x16_DC [pw_8 GOT_ebx], 4, picesp ; %1 = the pw_8 constant instead of a caller-supplied sum