/*****************************************************************************
 * pixel-c.c: msa pixel metrics
 *****************************************************************************
 * Copyright (C) 2015 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "pixel.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH

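/* MSA (MIPS SIMD Architecture) implementations of the x264 pixel metrics:
 * SAD (single-, three- and four-reference), SSD, SATD, SA8D, variance,
 * Hadamard AC energy and the SSIM 4x4x2 core. Each exported x264_* function
 * is a thin wrapper around a width-specialized static kernel below. */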
#define CALC_MSE_B( src, ref, var )                                   \
{                                                                     \
    v16u8 src_l0_m, src_l1_m;                                         \
    v8i16 res_l0_m, res_l1_m;                                         \
                                                                      \
    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m );                      \
    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m );            \
    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \
}

#define CALC_MSE_AVG_B( src, ref, var, sub )                          \
{                                                                     \
    v16u8 src_l0_m, src_l1_m;                                         \
    v8i16 res_l0_m, res_l1_m;                                         \
                                                                      \
    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m );                      \
    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m );            \
    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \
                                                                      \
    sub += res_l0_m + res_l1_m;                                       \
}

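/* Variance from the accumulated sums: var = sse - sum^2 / count, with
 * shift = log2( count ), e.g. 6 for an 8x8 block and 7 for an 8x16 block. */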
#define VARIANCE_WxH( sse, diff, shift ) \
    ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) )

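/* SAD for 4-pixel-wide blocks: four rows are loaded as 32-bit words and
 * packed into a single vector, so one absolute-difference and one
 * horizontal-add instruction cover four rows. i_height must be a
 * multiple of 4. */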
static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3;
    v16u8 src = { 0 }, ref = { 0 }, diff;
    v8u16 sad = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
        p_src += ( 4 * i_src_stride );
        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
        p_ref += ( 4 * i_ref_stride );

        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );

        diff = __msa_asub_u_b( src, ref );
        sad += __msa_hadd_u_h( diff, diff );
    }

    return ( HADD_UH_U32( sad ) );
}

static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    return ( HADD_UH_U32( sad ) );
}

static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t i_height )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB2( p_src, i_src_stride, src0, src1 );
        p_src += ( 2 * i_src_stride );
        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
        p_ref += ( 2 * i_ref_stride );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB2( p_src, i_src_stride, src0, src1 );
        p_src += ( 2 * i_src_stride );
        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
        p_ref += ( 2 * i_ref_stride );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    return ( HADD_UH_U32( sad ) );
}

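/* x3d kernels: SAD of one source block against three reference blocks that
 * share a stride, writing the three sums to pu_sad_array. */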
static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref0, uint8_t *p_ref1,
                                uint8_t *p_ref2, int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint32_t src0, src1, src2, src3, load0, load1, load2, load3;
    v16u8 src = { 0 };
    v16u8 ref0 = { 0 }, ref1 = { 0 }, ref2 = { 0 };
    v16u8 diff;
    v8u16 sad0 = { 0 }, sad1 = { 0 }, sad2 = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
        INSERT_W4_UB( src0, src1, src2, src3, src );
        p_src += ( 4 * i_src_stride );

        LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref0 );
        p_ref0 += ( 4 * i_ref_stride );

        LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref1 );
        p_ref1 += ( 4 * i_ref_stride );

        LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref2 );
        p_ref2 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );

        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );

        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref0, uint8_t *p_ref1,
                                uint8_t *p_ref2, int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
    v8u16 sad0 = { 0 }, sad1 = { 0 }, sad2 = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref0 += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                     src0, src1, ref0, ref1 );
        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref1 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref2 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_16width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref0, uint8_t *p_ref1,
                                 uint8_t *p_ref2, int32_t i_ref_stride,
                                 int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src, ref, diff;
    v8u16 sad0 = { 0 }, sad1 = { 0 }, sad2 = { 0 };

    for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;

        ref = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );

        src = LD_UB( p_src );
        p_src += i_src_stride;

        ref = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

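/* x4d kernels: as above, but with four reference pointers passed as an
 * array, matching the pixel_sad_x4 interface. */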
static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_aref[], int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    int32_t i_ht_cnt;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 }, ref = { 0 }, diff;
    v8u16 sad0 = { 0 }, sad1 = { 0 }, sad2 = { 0 }, sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
        INSERT_W4_UB( src0, src1, src2, src3, src );
        p_src += ( 4 * i_src_stride );

        LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref0 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref1 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref2 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref3 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad3 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_aref[], int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
    v8u16 sad0 = { 0 }, sad1 = { 0 }, sad2 = { 0 }, sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref0 += ( 4 * i_ref_stride );
        LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 );
        p_ref1 += ( 4 * i_ref_stride );
        LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 );
        p_ref2 += ( 4 * i_ref_stride );
        LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 );
        p_ref3 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 );
        PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 );
        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 );
        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 );
        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 );
        sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_aref[], int32_t i_ref_stride,
                                 int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    v16u8 src, ref0, ref1, ref2, ref3, diff;
    v8u16 sad0 = { 0 }, sad1 = { 0 }, sad2 = { 0 }, sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref0 = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        ref1 = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        ref2 = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        ref3 = LD_UB( p_ref3 );
        p_ref3 += i_ref_stride;

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref3 );
        sad3 += __msa_hadd_u_h( diff, diff );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref0 = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        ref1 = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        ref2 = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        ref3 = LD_UB( p_ref3 );
        p_ref3 += i_ref_stride;

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref3 );
        sad3 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

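/* The variance kernels return two packed 32-bit values in one uint64_t:
 * the pixel sum in the low half and the sum of squares in the high half;
 * the caller derives the variance from the pair. */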
static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride,
                                          int32_t i_height )
{
    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
    v16i8 pix, zero = { 0 };
    v8u16 add, pix_r, pix_l;
    v4u32 sqr = { 0 };

    for ( u_cnt = i_height; u_cnt--; )
    {
        pix = LD_SB( p_pix );
        p_pix += i_stride;
        add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix );
        u_sum += HADD_UH_U32( add );
        ILVRL_B2_UH( zero, pix, pix_r, pix_l );
        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
        sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l );
    }

    u_sqr_out = HADD_SW_S32( sqr );

    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
}

static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride,
                                         int32_t i_height )
{
    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
    v16i8 pix, zero = { 0 };
    v8u16 add, pix_r;
    v4u32 sqr = { 0 };

    for ( u_cnt = i_height; u_cnt--; )
    {
        pix = LD_SB( p_pix );
        p_pix += i_stride;
        pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix );
        add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r );
        u_sum += HADD_UH_U32( add );
        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
    }

    u_sqr_out = HADD_SW_S32( sqr );

    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
}

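/* SSE of an 8-pixel-wide block that also accumulates the signed sum of
 * differences into *p_diff, as needed by the var2 functions below. */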
static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                     uint8_t *p_ref, int32_t i_ref_stride,
                                     int32_t i_height, int32_t *p_diff )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v8i16 avg = { 0 };
    v4i32 vec, var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        CALC_MSE_AVG_B( src0, ref0, var, avg );
        CALC_MSE_AVG_B( src1, ref1, var, avg );
    }

    vec = __msa_hadd_s_w( avg, avg );
    *p_diff = HADD_SW_S32( vec );
    u_sse = HADD_SW_S32( var );

    return ( u_sse );
}

static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    uint32_t u_src0, u_src1, u_src2, u_src3;
    uint32_t u_ref0, u_ref1, u_ref2, u_ref3;
    v16u8 src = { 0 }, ref = { 0 };
    v4i32 var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
        p_src += ( 4 * i_src_stride );
        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
        p_ref += ( 4 * i_ref_stride );

        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );
        CALC_MSE_B( src, ref, var );
    }

    u_sse = HADD_SW_S32( var );

    return ( u_sse );
}

static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        CALC_MSE_B( src0, ref0, var );
        CALC_MSE_B( src1, ref1, var );
    }

    u_sse = HADD_SW_S32( var );

    return ( u_sse );
}

static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );
    }

    u_sse = HADD_SW_S32( var );

    return ( u_sse );
}

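/* SSIM 4x4x2 core: for two horizontally adjacent 4x4 blocks, computes the
 * four sums SSIM needs per block: sum(s), sum(r), sum(s^2) + sum(r^2) and
 * sum(s*r), stored as pi_sum_array[block][0..3]. */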
static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride,
                                 const uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t pi_sum_array[2][4] )
{
    v16i8 zero = { 0 };
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v4u32 tmp0;
    v4i32 tmp2, tmp3;

    LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
    p_src += ( 4 * i_src_stride );
    LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
    p_ref += ( 4 * i_ref_stride );

    ILVR_D2_UB( src1, src0, src3, src2, src0, src2 );
    ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 );
    HADD_UB2_UH( src0, src2, temp0, temp1 );

    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );

    pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 );
    pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 );

    HADD_UB2_UH( ref0, ref2, temp0, temp1 );

    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );

    pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 );
    pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 );

    ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2,
                vec4, vec6 );
    ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3,
                vec5, vec7 );

    tmp0 = __msa_dotp_u_w( vec0, vec0 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 );

    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );

    pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 );
    pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 );

    tmp0 = __msa_dotp_u_w( vec4, vec0 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 );

    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );

    pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 );
    pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 );
}

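/* SATD for 4-pixel-wide blocks: each 4x4 sub-block is run through a 4x4
 * Hadamard transform (two butterfly passes around a transpose), the
 * absolute coefficients are summed, and the total is halved. */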
static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                      uint8_t *p_ref, int32_t i_ref_stride,
                                      int32_t i_height )
{
    int32_t cnt;
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3;
    v16i8 ref0, ref1, ref2, ref3;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    for ( cnt = i_height >> 2; cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;
        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += 4 * i_ref_stride;

        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                    diff0, diff1, diff2, diff3 );
        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );
        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff1, diff2, diff3 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff1, diff2, diff3 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );

        diff0 = __msa_add_a_h( diff0, zero );
        diff1 = __msa_add_a_h( diff1, zero );
        diff2 = __msa_add_a_h( diff2, zero );
        diff3 = __msa_add_a_h( diff3, zero );
        diff0 = ( diff0 + diff1 + diff2 + diff3 );
        diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 );
        diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 );
        u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 );
    }

    return ( u_sum >> 1 );
}

static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                      uint8_t *p_ref, int32_t i_ref_stride,
                                      int32_t i_height )
{
    int32_t cnt;
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3;
    v16i8 ref0, ref1, ref2, ref3;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 temp0, temp1, temp2, temp3;

    for ( cnt = i_height >> 2; cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;
        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += 4 * i_ref_stride;

        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                    diff0, diff1, diff2, diff3 );
        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );
        TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff2, diff4, diff6 );

        diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 );
        diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 );
        diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 );
        diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 );

        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
        BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
        TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6,
                            diff7, diff0, diff1, diff2, diff3, diff4, diff5,
                            diff6, diff7 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );

        diff0 = __msa_add_a_h( diff0, zero );
        diff1 = __msa_add_a_h( diff1, zero );
        diff2 = __msa_add_a_h( diff2, zero );
        diff3 = __msa_add_a_h( diff3, zero );
        diff0 = ( diff0 + diff1 + diff2 + diff3 );
        u_sum += HADD_UH_U32( diff0 );
    }

    return ( u_sum >> 1 );
}

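/* SA8D: 8x8 Hadamard transform of the source/reference difference. The
 * final butterfly stage is folded into the |a + b| + |a - b| sums below;
 * the wrapper applies the remaining ( x + 2 ) >> 2 normalization. */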
static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_ref, int32_t i_ref_stride )
{
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
    v8i16 temp0, temp1, temp2, temp3;

    LD_SB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 );
    LD_SB8( p_ref, i_ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 );
    ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1,
                sub2, sub3 );
    ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5,
                sub6, sub7 );
    HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 );
    HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 );
    TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );
    BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 );
    BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    temp0 = diff0 + diff4;
    temp1 = diff1 + diff5;
    temp2 = diff2 + diff6;
    temp3 = diff3 + diff7;

    temp0 = __msa_add_a_h( temp0, zero );
    temp1 = __msa_add_a_h( temp1, zero );
    temp2 = __msa_add_a_h( temp2, zero );
    temp3 = __msa_add_a_h( temp3, zero );

    diff0 = temp0 + __msa_asub_s_h( diff0, diff4 );
    diff1 = temp1 + __msa_asub_s_h( diff1, diff5 );
    diff2 = temp2 + __msa_asub_s_h( diff2, diff6 );
    diff3 = temp3 + __msa_asub_s_h( diff3, diff7 );
    diff0 = ( diff0 + diff1 + diff2 + diff3 );

    u_sum = HADD_UH_U32( diff0 );

    return ( u_sum >> 1 );
}

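/* Hadamard AC energy of an 8x8 block, returned packed: the 8x8-transform
 * sum in the high 32 bits and the sum over the four 4x4 transforms in the
 * low 32 bits, with the DC terms (tmp0..tmp3) subtracted from both. */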
static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride )
{
    int16_t tmp0, tmp1, tmp2, tmp3;
    uint32_t u_sum4 = 0, u_sum8 = 0, u_dc;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
    v8i16 temp0, temp1, temp2, temp3;

    LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 );

    ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1,
                diff2, diff3 );
    ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5,
                diff6, diff7 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1,
                 temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2,
                 diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5,
                 temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2,
                 diff4, diff5, diff7, diff6 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    tmp0 = __msa_copy_s_h( diff0, 0 );
    tmp1 = __msa_copy_s_h( diff0, 4 );
    tmp2 = __msa_copy_s_h( diff4, 0 );
    tmp3 = __msa_copy_s_h( diff4, 4 );

    sub0 = __msa_add_a_h( diff0, zero );
    sub1 = __msa_add_a_h( diff1, zero );
    sub2 = __msa_add_a_h( diff2, zero );
    sub3 = __msa_add_a_h( diff3, zero );
    sub4 = __msa_add_a_h( diff4, zero );
    sub5 = __msa_add_a_h( diff5, zero );
    sub6 = __msa_add_a_h( diff6, zero );
    sub7 = __msa_add_a_h( diff7, zero );

    sub0 = ( sub0 + sub1 + sub2 + sub3 );
    sub1 = ( sub4 + sub5 + sub6 + sub7 );
    sub0 += sub1;

    u_sum4 += HADD_UH_U32( sub0 );

    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );

    ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 );
    ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 );

    diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 );
    diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 );
    diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 );

    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    sub0 = __msa_add_a_h( diff0, zero );
    sub1 = __msa_add_a_h( diff1, zero );
    sub2 = __msa_add_a_h( diff2, zero );
    sub3 = __msa_add_a_h( diff3, zero );
    sub4 = __msa_add_a_h( diff4, zero );
    sub5 = __msa_add_a_h( diff5, zero );
    sub6 = __msa_add_a_h( diff6, zero );
    sub7 = __msa_add_a_h( diff7, zero );

    sub0 = ( sub0 + sub1 + sub2 + sub3 );
    sub1 = ( sub4 + sub5 + sub6 + sub7 );
    sub0 += sub1;

    u_sum8 += HADD_UH_U32( sub0 );

    u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 );
    u_sum4 = u_sum4 - u_dc;
    u_sum8 = u_sum8 - u_dc;

    return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4;
}

int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
                         ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                         ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] )
{
    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                         i_ref_stride, 16, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] )
{
    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                         i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 16, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
}

int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

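/* Intra SAD helpers: each candidate prediction is rendered into the decode
 * buffer (or a temporary buffer for 8x8), then scored against the encoded
 * block with the matching SAD kernel, in the mode order the caller expects. */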
void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_4x4_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_hor_4x4_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_dc_4x4_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_16x16_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );

    x264_intra_predict_hor_16x16_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );

    x264_intra_predict_dc_16x16_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_intra_predict_v_8x8_msa( pix, p_edge );
    p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_h_8x8_msa( pix, p_edge );
    p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_dc_8x8_msa( pix, p_edge );
    p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_hor_8x8_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_vert_8x8_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
                               const uint8_t *p_pix2, intptr_t i_stride2,
                               int32_t i_sums[2][4] )
{
    ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums );
}

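/* The kernel's packed result holds twice the final AC values, so
 * ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ) halves both the
 * high (8x8) and low (4x4) fields at once. */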
uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );

    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );

    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );

    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
}

int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
}

int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
}

int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
}

int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
}

int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
}

int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    uint32_t u32Sum = 0;

    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
                                     p_pix2 + 8, i_stride2, 8 );

    return u32Sum;
}

int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 )
{
    uint32_t u32Sum = 0;

    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
                                     p_pix2 + 8, i_stride2, 16 );

    return u32Sum;
}

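/* sa8d_8x8_msa returns a halved Hadamard sum; ( x + 2 ) >> 2 completes
 * x264's sa8d normalization with rounding. */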
int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 );

    return ( i32Sum + 2 ) >> 2;
}

int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 )
{
    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8, i_stride,
                                   p_pix2 + 8, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride,
                                   p_pix2 + 8 * i_stride2, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride,
                                   p_pix2 + 8 + 8 * i_stride2, i_stride2 );

    return ( i32Sum + 2 ) >> 2;
}

void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_4x4_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_hor_4x4_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_dc_4x4_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_16x16_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );

    x264_intra_predict_hor_16x16_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );

    x264_intra_predict_dc_16x16_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );
}

void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_intra_predict_v_8x8_msa( pix, p_edge );
    p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_h_8x8_msa( pix, p_edge );
    p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_dc_8x8_msa( pix, p_edge );
    p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_hor_8x8_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_vert_8x8_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var16width_msa( p_pix, i_stride, 16 );
}

uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var8width_msa( p_pix, i_stride, 16 );
}

uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var8width_msa( p_pix, i_stride, 8 );
}

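/* var2: variance of the difference between two blocks, computed as
 * sse - diff^2 >> shift with shift = log2 of the pixel count; the raw SSE
 * is also stored through the output SSD pointer. */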
int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                  uint8_t *p_pix2, intptr_t i_stride2,
                                  int32_t *p_ssd )
{
    int32_t i_var = 0, i_diff = 0, i_sqr = 0;

    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16,
                                 &i_diff );
    i_var = VARIANCE_WxH( i_sqr, i_diff, 7 );
    *p_ssd = i_sqr;

    return i_var;
}

int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                 uint8_t *p_pix2, intptr_t i_stride2,
                                 int32_t *p_ssd )
{
    int32_t i_var = 0, i_diff = 0, i_sqr = 0;

    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1,
                                 p_pix2, i_stride2, 8, &i_diff );
    i_var = VARIANCE_WxH( i_sqr, i_diff, 6 );
    *p_ssd = i_sqr;

    return i_var;
}