/*****************************************************************************
 * predict-c.c: msa intra prediction
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH

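/* Vertical prediction: the row of pixels just above the block is loaded
 * once and stored unchanged into every destination row. */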
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint32_t u_src_data;

    u_src_data = LW( p_src );

    SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride );
}

static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint64_t u_out;

    u_out = LD( p_src );

    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
                                          int32_t i_dst_stride )
{
    v16u8 src0 = LD_UB( p_src );

    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
}

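/* Horizontal prediction: each left-neighbour pixel is broadcast across its
 * row, either by multiplying the byte with 0x01..01 in a general-purpose
 * register (4x4 and 8x8) or with the MSA fill instruction (16x16). */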
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;

    u_out0 = p_src[0 * i_src_stride] * 0x01010101;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101;

    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;

    u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull;
    u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull;
    u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull;
    u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull;
    u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull;
    u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull;
    u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull;
    u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull;

    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
}

static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    uint32_t u_row;
    uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
    v16u8 src0, src1, src2, src3;

    for ( u_row = 4; u_row--; )
    {
        u_inp0 = p_src[0];
        p_src += i_src_stride;
        u_inp1 = p_src[0];
        p_src += i_src_stride;
        u_inp2 = p_src[0];
        p_src += i_src_stride;
        u_inp3 = p_src[0];
        p_src += i_src_stride;

        src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
        src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
        src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
        src3 = ( v16u8 ) __msa_fill_b( u_inp3 );

        ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}

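/* DC prediction: fill the block with the rounded average of the available
 * neighbours. The divisor is width + height when both edges are present,
 * a single edge length when only one is, and the predictor falls back to
 * 128 when neither is available. */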
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      int32_t i_src_stride_left,
                                      uint8_t *p_dst, int32_t i_dst_stride,
                                      uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_out, u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum;

    if ( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );

        for ( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 4 ) >> 3;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if ( is_left )
    {
        for ( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 2 ) >> 2;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if ( is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }

    u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );

    SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

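/* DC for the luma 8x8 mode: the 8 top and 8 left neighbours (taken from the
 * filtered edge array) are packed into one vector and reduced by repeated
 * horizontal adds; the rounded average ( sum + 8 ) >> 4 is then splatted to
 * all 16 byte lanes and stored row by row. */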
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_val0, u_val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    u_val0 = LD( p_src_top );
    u_val1 = LD( p_src_left );
    INSERT_D2_UB( u_val0, u_val1, src );
    sum_h = __msa_hadd_u_h( src, src );
    sum_w = __msa_hadd_u_w( sum_h, sum_h );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
    store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
    u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );

    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
}

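/* 16x16 DC follows the same availability logic as the 4x4 version, with a
 * longer reduction chain for the 16-byte top edge and rounded shifts by
 * 5 (both edges) or 4 (a single edge). */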
static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                        int32_t i_src_stride_left,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    if ( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );

        for ( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 16 ) >> 5;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if ( is_left )
    {
        for ( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 8 ) >> 4;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if ( is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }

    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
}

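/* Plane prediction. Per H.264, pred[y][x] = clip( ( a + b*(x - W/2 + 1) +
 * c*(y - H/2 + 1) + 16 ) >> 5 ), where b and c are scaled gradients of the
 * top and left edges. The top-edge differences are formed with a shuffle
 * plus horizontal subtract and a weighted reduction; the per-pixel ramp is
 * then evaluated four columns at a time, two rows per loop iteration. */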
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res, i_res0, i_res1, i_res2, i_res3;
    uint64_t u_out0, u_out1;
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;

    p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );

    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    sum = __msa_hadd_s_d( vec8, vec8 );

    i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );

    i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
             3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
             4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );

    i_res0 *= 17;
    i_res1 *= 17;
    i_res0 = ( i_res0 + 16 ) >> 5;
    i_res1 = ( i_res1 + 16 ) >> 5;

    i_res3 = 3 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
    i_res = i_res2 - i_res3;

    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res );
    vec2 = __msa_fill_w( i_res1 );
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;

    for ( u_lpcnt = 4; u_lpcnt--; )
    {
        vec0 = vec5 + vec4;
        vec1 = vec0 + vec3;
        vec4 += vec2;
        vec6 = vec5 + vec4;
        vec7 = vec6 + vec3;
        vec4 += vec2;

        SRA_4V( vec0, vec1, vec6, vec7, 5 );
        PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
        CLIP_SH2_0_255( vec10, vec11 );
        PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );

        u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
        u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
        SD( u_out0, p_src );
        p_src += i_stride;
        SD( u_out1, p_src );
        p_src += i_stride;
    }
}

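/* The 16x16 plane mode repeats the 8x8 scheme with eight difference terms
 * per edge and gradient scaling ( 5*H + 32 ) >> 6 instead of
 * ( 17*H + 16 ) >> 5, storing a full 16-byte row per iteration. */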
static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res0, i_res1, i_res2, i_res3;
    uint64_t u_load0, u_load1;
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top = { 0 };
    v8i16 vec9, vec10;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;

    u_load0 = LD( p_src - ( i_stride + 1 ) );
    u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );

    INSERT_D2_UB( u_load0, u_load1, p_src_top );

    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );

    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );

    i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );

    i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
             2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
             3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
             4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
             5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
             7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
             8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );

    i_res0 *= 5;
    i_res1 *= 5;
    i_res0 = ( i_res0 + 32 ) >> 6;
    i_res1 = ( i_res1 + 32 ) >> 6;

    i_res3 = 7 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
    i_res2 -= i_res3;

    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res2 );
    vec5 = __msa_fill_w( i_res1 );
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;

    for ( u_lpcnt = 16; u_lpcnt--; )
    {
        vec0 = vec7 + vec4;
        vec1 = vec0 + vec6;
        vec2 = vec1 + vec6;
        vec3 = vec2 + vec6;

        SRA_4V( vec0, vec1, vec2, vec3, 5 );
        PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
        CLIP_SH2_0_255( vec9, vec10 );
        PCKEV_ST_SB( vec9, vec10, p_src );

        vec4 += vec5;
        p_src += i_stride;
    }
}

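/* DC for the four 4x4 sub-blocks of an 8x8 chroma block: the top-left block
 * averages its top and left edges, the top-right uses only its top edge,
 * the bottom-left only its left edge, and the bottom-right combines the
 * top-right and bottom-left sums. */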
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lp_cnt;
    uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 p_src_top;
    v8u16 add;
    v4u32 sum;

    p_src_top = LD_UB( p_src - i_stride );
    add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
    sum = __msa_hadd_u_w( add, add );
    u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
    u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );

    for ( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
    {
        u_src0 += p_src[u_lp_cnt * i_stride - 1];
        u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
    }

    u_src0 = ( u_src0 + 4 ) >> 3;
    u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
    u_src1 = ( u_src1 + 2 ) >> 2;
    u_src2 = ( u_src2 + 2 ) >> 2;

    u_out0 = u_src0 * 0x01010101;
    u_out1 = u_src1 * 0x01010101;
    u_out2 = u_src2 * 0x01010101;
    u_out3 = u_src3 * 0x01010101;

    for ( u_lp_cnt = 4; u_lp_cnt--; )
    {
        SW( u_out0, p_src );
        SW( u_out1, ( p_src + 4 ) );
        SW( u_out2, ( p_src + 4 * i_stride ) );
        SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
        p_src += i_stride;
    }
}

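/* Diagonal down-left 8x8: every output pixel is the 3-tap value
 * ( s[i] + 2*s[i+1] + s[i+2] + 2 ) >> 2 over the 16 top/top-right
 * neighbours; row r of the block is the filtered sequence starting at
 * offset r, produced here with byte shifts of one packed vector. */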
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                       int32_t i_dst_stride )
{
    uint8_t u_src_val = p_src[15];
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src, vec4, vec5, res0;
    v8u16 vec0, vec1, vec2, vec3;
    v2i64 res1, res2, res3;

    src = LD_UB( p_src );

    vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
    vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
    vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
    ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
    ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
    HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );

    vec0 += vec1;
    vec2 += vec3;
    vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
    vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );

    res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );

    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );

    res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );

    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

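/* DC fallback when neither edge is available: fill with mid-grey (128). */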
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );

    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
}

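/* Exported wrappers: adapt x264's fdec layout (rows FDEC_STRIDE apart) and,
 * for the 8x8 luma modes, the filtered edge array pu_xyz (top neighbours
 * from index 16 upward, left neighbours stored in reverse ending at index
 * 14, hence the -1 stride) to the workers above. */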
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
}

void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
}

void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
{
    intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
{
    intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
                                   p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
{
    intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
{
    intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
{
    intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
{
    intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
{
    intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
{
    intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
                              p_src, FDEC_STRIDE );
}

void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
}

void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
{
    intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                              FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
{
    intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
{
    intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}
#endif