1 /*****************************************************************************
2 * deblock-c.c: msa deblocking
3 *****************************************************************************
4 * Copyright (C) 2015 x264 project
6 * Authors: Neha Rana <neha.rana@imgtec.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at licensing@x264.com.
24 *****************************************************************************/
26 #include "common/common.h"
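/* Scalar reference for the strong (bs == 4) filter implemented by the macro
 * below; a sketch of the H.264 deblocking equations it vectorizes, with the
 * p/q roles swappable:
 *   p0' = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3
 *   p1' = ( p2 + p1 + p0 + q0 + 2 ) >> 2
 *   p2' = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3 */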
30 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \
31 q3_or_p3_org_in, p1_or_q1_org_in, \
32 p2_or_q2_org_in, q1_or_p1_org_in, \
33 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \
36 v8i16 const3 = __msa_ldi_h( 3 ); \
38 threshold = p0_or_q0_org_in + q3_or_p3_org_in; \
39 threshold += p1_or_q1_org_in; \
41 p0_or_q0_out = threshold << 1; \
42 p0_or_q0_out += p2_or_q2_org_in; \
43 p0_or_q0_out += q1_or_p1_org_in; \
44 p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \
46 p1_or_q1_out = p2_or_q2_org_in + threshold; \
47 p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \
49 p2_or_q2_out = p2_or_q2_org_in * const3; \
50 p2_or_q2_out += p3_or_q3_org_in; \
51 p2_or_q2_out += p3_or_q3_org_in; \
52 p2_or_q2_out += threshold; \
53 p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \
56 /* data[-u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */
57 #define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \
58 p1_or_q1_org_in, p0_or_q0_out ) \
60 p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \
61 p0_or_q0_out += p1_or_q1_org_in; \
62 p0_or_q0_out += p1_or_q1_org_in; \
63 p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \
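/* p1/q1 update for the normal (bs < 4) luma filter; scalar form:
 *   p1' = p1 + clip3( -tc0, tc0,
 *                     ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - 2*p1 ) >> 1 ) */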
66 #define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \
67 p1_or_q1_org_in, p2_or_q2_org_in, \
68 negate_tc_in, tc_in, p1_or_q1_out ) \
72 clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \
73 ( v8u16 ) q0_or_p0_org_in ); \
74 temp = p1_or_q1_org_in << 1; \
76 clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \
77 clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \
78 p1_or_q1_out = p1_or_q1_org_in + clip3; \
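/* p0/q0 update for the normal (bs < 4) filter; scalar form:
 *   delta = clip3( -tc, tc, ( ( ( q0 - p0 ) << 2 ) + ( p1 - q1 ) + 4 ) >> 3 )
 *   p0' = clip_0_255( p0 + delta ),  q0' = clip_0_255( q0 - delta ) */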
81 #define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \
82 p1_or_q1_org_in, q1_or_p1_org_in, \
83 negate_threshold_in, threshold_in, \
84 p0_or_q0_out, q0_or_p0_out ) \
86 v8i16 q0_sub_p0, p1_sub_q1, delta; \
88 q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
89 p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
92 delta = q0_sub_p0 + p1_sub_q1; \
95 delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \
97 p0_or_q0_out = p0_or_q0_org_in + delta; \
98 q0_or_p0_out = q0_or_p0_org_in - delta; \
100 CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \
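/* Intra (bs == 4) luma deblocking across a horizontal edge: p1/p0/q0/q1 sit
 * at +/- u_img_width around p_data.  A column is filtered only where
 * |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta. */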
103 static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data,
106 uint32_t u_img_width )
108 v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
110 v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
111 v16u8 p2, p1, p0, q0, q1, q2;
112 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
113 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
114 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
130 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
131 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
133 LD_UB4( p_data - ( u_img_width << 1 ), u_img_width,
134 p1_org, p0_org, q0_org, q1_org );
137 v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
139 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
140 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
141 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
143 is_less_than_alpha = ( p0_asub_q0 < alpha );
144 is_less_than_beta = ( p1_asub_p0 < beta );
145 is_less_than = is_less_than_beta & is_less_than_alpha;
146 is_less_than_beta = ( q1_asub_q0 < beta );
147 is_less_than = is_less_than_beta & is_less_than;
150 if( !__msa_test_bz_v( is_less_than ) )
152 q2_org = LD_UB( p_data + ( 2 * u_img_width ) );
153 p3_org = LD_UB( p_data - ( u_img_width << 2 ) );
154 p2_org = LD_UB( p_data - ( 3 * u_img_width ) );
156 UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
157 UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
158 UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
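/* Strong-filter gate: |p0-q0| < ( alpha >> 2 ) + 2.  Columns that also
 * satisfy |p2-p0| < beta take the 3-tap strong path; the remaining filtered
 * columns fall back to the 2-tap formula. */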
160 tmp_flag = alpha >> 2;
161 tmp_flag = tmp_flag + 2;
162 tmp_flag = ( p0_asub_q0 < tmp_flag );
164 p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
165 is_less_than_beta = ( p2_asub_p0 < beta );
166 is_less_than_beta = is_less_than_beta & tmp_flag;
167 negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
168 is_less_than_beta = is_less_than_beta & is_less_than;
169 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
171 v8u16 is_less_than_beta_l, is_less_than_beta_r;
173 q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
175 is_less_than_beta_r =
176 ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
177 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
181 ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
182 AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
184 p2_r, q1_org_r, p0_r, p1_r, p2_r );
187 q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
189 is_less_than_beta_l =
190 ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
192 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
196 ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
197 AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
199 p2_l, q1_org_l, p0_l, p1_l, p2_l );
202 /* combine and store */
203 if( !__msa_test_bz_v( is_less_than_beta ) )
205 PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
207 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
208 p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
209 p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
211 ST_UB( p1_org, p_data - ( 2 * u_img_width ) );
212 ST_UB( p2_org, p_data - ( 3 * u_img_width ) );
215 v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
217 negate_is_less_than_beta_r =
218 ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
220 if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
222 AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
225 negate_is_less_than_beta_l =
226 ( v8u16 ) __msa_sldi_b( zero,
227 ( v16i8 ) negate_is_less_than_beta, 8 );
228 if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
230 AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
233 if( !__msa_test_bz_v( negate_is_less_than_beta ) )
235 p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
236 p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
239 ST_UB( p0_org, p_data - u_img_width );
241 q3_org = LD_UB( p_data + ( 3 * u_img_width ) );
242 q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
243 is_less_than_beta = ( q2_asub_q0 < beta );
244 is_less_than_beta = is_less_than_beta & tmp_flag;
245 negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
246 is_less_than_beta = is_less_than_beta & is_less_than;
247 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
250 v8u16 is_less_than_beta_l, is_less_than_beta_r;
251 is_less_than_beta_r =
252 ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
253 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
257 ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
258 AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
260 q2_r, p1_org_r, q0_r, q1_r, q2_r );
262 is_less_than_beta_l =
263 ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
264 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
268 ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
269 AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
271 q2_l, p1_org_l, q0_l, q1_l, q2_l );
275 if( !__msa_test_bz_v( is_less_than_beta ) )
277 PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
278 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
279 q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
280 q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
282 ST_UB( q1_org, p_data + u_img_width );
283 ST_UB( q2_org, p_data + 2 * u_img_width );
286 v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
287 negate_is_less_than_beta_r =
288 ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
290 if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
292 AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
295 negate_is_less_than_beta_l =
296 ( v8u16 ) __msa_sldi_b( zero,
297 ( v16i8 ) negate_is_less_than_beta, 8 );
298 if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
300 AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
303 if( !__msa_test_bz_v( negate_is_less_than_beta ) )
305 q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
306 q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
309 ST_UB( q0_org, p_data );
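/* Same intra luma filter for a vertical edge: 16 rows are loaded and
 * transposed with TRANSPOSE16x8_UB_UB so the row-wise filtering above applies
 * unchanged, then the six filtered samples per row (p2..q2) are written back
 * with the narrow stores at the end of the function. */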
313 static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data,
316 uint32_t u_img_width )
319 v16u8 alpha, beta, p0_asub_q0;
320 v16u8 is_less_than_alpha, is_less_than;
321 v16u8 is_less_than_beta, negate_is_less_than_beta;
322 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
323 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
324 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
343 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
344 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
346 LD_UB8( p_src, u_img_width,
347 row0, row1, row2, row3, row4, row5, row6, row7 );
348 LD_UB8( p_src + ( 8 * u_img_width ), u_img_width,
349 row8, row9, row10, row11, row12, row13, row14, row15 );
351 TRANSPOSE16x8_UB_UB( row0, row1, row2, row3,
352 row4, row5, row6, row7,
353 row8, row9, row10, row11,
354 row12, row13, row14, row15,
355 p3_org, p2_org, p1_org, p0_org,
356 q0_org, q1_org, q2_org, q3_org );
359 UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
360 UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
361 UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
362 UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
365 v16u8 p1_asub_p0, q1_asub_q0;
367 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
368 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
369 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
371 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
372 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
374 is_less_than_alpha = ( p0_asub_q0 < alpha );
375 is_less_than_beta = ( p1_asub_p0 < beta );
376 is_less_than = is_less_than_beta & is_less_than_alpha;
377 is_less_than_beta = ( q1_asub_q0 < beta );
378 is_less_than = is_less_than_beta & is_less_than;
381 if( !__msa_test_bz_v( is_less_than ) )
383 tmp_flag = alpha >> 2;
384 tmp_flag = tmp_flag + 2;
385 tmp_flag = ( p0_asub_q0 < tmp_flag );
390 p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
391 is_less_than_beta = ( p2_asub_p0 < beta );
393 is_less_than_beta = tmp_flag & is_less_than_beta;
394 negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
395 is_less_than_beta = is_less_than_beta & is_less_than;
396 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
399 v16u8 is_less_than_beta_r;
401 is_less_than_beta_r =
402 ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
403 if( !__msa_test_bz_v( is_less_than_beta_r ) )
407 ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
408 AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
410 p2_r, q1_org_r, p0_r, p1_r, p2_r );
415 v16u8 is_less_than_beta_l;
417 is_less_than_beta_l =
418 ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
419 if( !__msa_test_bz_v( is_less_than_beta_l ) )
423 ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
424 AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
426 p2_l, q1_org_l, p0_l, p1_l, p2_l );
429 if( !__msa_test_bz_v( is_less_than_beta ) )
433 PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
434 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
435 p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
436 p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
439 v16u8 negate_is_less_than_beta_r;
441 negate_is_less_than_beta_r =
442 ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
445 if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
447 AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
451 v16u8 negate_is_less_than_beta_l;
453 negate_is_less_than_beta_l =
454 ( v16u8 ) __msa_sldi_b( zero,
455 ( v16i8 ) negate_is_less_than_beta, 8 );
456 if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
458 AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
462 if( !__msa_test_bz_v( negate_is_less_than_beta ) )
466 p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
467 p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
473 q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
474 is_less_than_beta = ( q2_asub_q0 < beta );
477 is_less_than_beta = is_less_than_beta & tmp_flag;
478 negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
480 is_less_than_beta = is_less_than_beta & is_less_than;
481 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
484 v16u8 is_less_than_beta_r;
486 is_less_than_beta_r =
487 ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
488 if( !__msa_test_bz_v( is_less_than_beta_r ) )
492 ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
493 AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
495 q2_r, p1_org_r, q0_r, q1_r, q2_r );
499 v16u8 is_less_than_beta_l;
501 is_less_than_beta_l =
502 ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
503 if( !__msa_test_bz_v( is_less_than_beta_l ) )
507 ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
508 AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
510 q2_l, p1_org_l, q0_l, q1_l, q2_l );
513 if( !__msa_test_bz_v( is_less_than_beta ) )
517 PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
518 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
519 q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
520 q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
524 v16u8 negate_is_less_than_beta_r;
526 negate_is_less_than_beta_r =
527 ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
529 if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
531 AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
535 v16u8 negate_is_less_than_beta_l;
537 negate_is_less_than_beta_l =
538 ( v16u8 ) __msa_sldi_b( zero,
539 ( v16i8 ) negate_is_less_than_beta, 8 );
540 if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
542 AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
545 if( !__msa_test_bz_v( negate_is_less_than_beta ) )
549 q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
550 q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
554 v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
556 ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 );
557 ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 );
558 ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );
560 ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 );
561 ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 );
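/* Write-back of the transposed result: each of the 16 rows receives its six
 * filtered samples as a 4-byte group ( p2 p1 p0 q0 ) followed by a 2-byte
 * group ( q1 q2 ). */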
564 ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width );
565 ST2x4_UB( tmp2, 0, p_src + 4, u_img_width );
566 p_src += 4 * u_img_width;
567 ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width );
568 ST2x4_UB( tmp2, 4, p_src + 4, u_img_width );
569 p_src += 4 * u_img_width;
571 ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width );
572 ST2x4_UB( tmp5, 0, p_src + 4, u_img_width );
573 p_src += 4 * u_img_width;
574 ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width );
575 ST2x4_UB( tmp5, 4, p_src + 4, u_img_width );
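/* Intra chroma deblocking, horizontal edge, with Cb and Cr interleaved in one
 * row (x264 keeps the two chroma planes interleaved).  Only p0 and q0 are
 * modified, using the 2-tap ( 2*p1 + p0 + q1 + 2 ) >> 2 formula. */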
579 static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma,
582 uint32_t u_img_width )
584 v16u8 alpha, beta, is_less_than;
585 v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
591 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
592 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
594 LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
595 p1_org, p0_org, q0_org, q1_org );
598 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
599 v16u8 is_less_than_alpha, is_less_than_beta;
601 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
602 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
603 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
605 is_less_than_alpha = ( p0_asub_q0 < alpha );
606 is_less_than_beta = ( p1_asub_p0 < beta );
607 is_less_than = is_less_than_beta & is_less_than_alpha;
608 is_less_than_beta = ( q1_asub_q0 < beta );
609 is_less_than = is_less_than_beta & is_less_than;
612 if( !__msa_test_bz_v( is_less_than ) )
615 v16u8 is_less_than_r, is_less_than_l;
617 is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
619 if( !__msa_test_bz_v( is_less_than_r ) )
621 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
623 ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
624 zero, q1_org, p1_org_r, p0_org_r, q0_org_r,
626 AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
627 AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
630 is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
631 ( v16i8 ) is_less_than, 8 );
632 if( !__msa_test_bz_v( is_less_than_l ) )
634 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
636 ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
637 zero, q1_org, p1_org_l, p0_org_l, q0_org_l,
639 AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
640 AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
643 PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
645 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
646 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
648 ST_UB( p0_org, ( p_chroma - u_img_width ) );
649 ST_UB( q0_org, p_chroma );
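/* Vertical-edge counterpart: 8 rows of 8 bytes (two CbCr pairs on each side
 * of the edge) are loaded, transposed, and regrouped with ILVR_D4_UB so Cb
 * and Cr are filtered together; the 4x8 store writes the two filtered pairs
 * per row back at p_chroma - 2. */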
653 static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma,
656 uint32_t u_img_width )
659 v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
664 v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
665 v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
666 v16i8 tmp0, tmp1, tmp2, tmp3;
668 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
670 LD_UB8( ( p_chroma - 4 ), u_img_width,
671 row0, row1, row2, row3, row4, row5, row6, row7 );
673 TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
674 p1_u_org, p1_v_org, p0_u_org, p0_v_org,
675 q0_u_org, q0_v_org, q1_u_org, q1_v_org );
677 ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
678 q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );
681 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
682 v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta;
684 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
685 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
686 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
688 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
689 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
691 is_less_than_alpha = ( p0_asub_q0 < alpha );
692 is_less_than_beta = ( p1_asub_p0 < beta );
693 is_less_than = is_less_than_beta & is_less_than_alpha;
694 is_less_than_beta = ( q1_asub_q0 < beta );
695 is_less_than = is_less_than_beta & is_less_than;
698 if( !__msa_test_bz_v( is_less_than ) )
700 v16u8 is_less_than_r, is_less_than_l;
703 is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
705 if( !__msa_test_bz_v( is_less_than_r ) )
707 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
709 ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
710 zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r );
711 AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
712 AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
715 is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
716 ( v16i8 ) is_less_than, 8 );
717 if( !__msa_test_bz_v( is_less_than_l ) )
719 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
721 ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
722 zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l );
723 AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
724 AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
727 PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
729 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
730 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
732 SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
733 ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
734 ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
735 ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );
737 ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
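/* Inter (bs 1..3) luma deblocking across a vertical edge.  u_bs0..u_bs3 and
 * u_tc0..u_tc3 describe the four 4-row groups along the edge; they are
 * broadcast into the bs / tc vectors so a group is filtered only where bs > 0
 * and the alpha/beta tests pass, with every update clipped to +/- tc. */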
741 static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data,
752 uint32_t u_img_width )
755 v16u8 beta, tmp_vec, bs = { 0 };
757 v16u8 is_less_than, is_less_than_beta;
758 v16u8 p1, p0, q0, q1;
759 v8i16 p0_r, q0_r, p1_r = { 0 };
761 v8i16 p0_l, q0_l, p1_l = { 0 };
763 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
764 v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
765 v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
768 v16u8 is_bs_greater_than0;
770 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
771 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
772 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
773 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
774 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
775 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
776 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
777 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );
779 if( !__msa_test_bz_v( bs ) )
781 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
782 tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
783 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
784 tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
785 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
786 tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
787 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
788 tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );
790 is_bs_greater_than0 = ( zero < bs );
793 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
794 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
799 LD_UB8( p_src, u_img_width,
800 row0, row1, row2, row3, row4, row5, row6, row7 );
801 p_src += ( 8 * u_img_width );
802 LD_UB8( p_src, u_img_width,
803 row8, row9, row10, row11, row12, row13, row14, row15 );
805 TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
806 row8, row9, row10, row11,
807 row12, row13, row14, row15,
808 p3_org, p2_org, p1_org, p0_org,
809 q0_org, q1_org, q2_org, q3_org );
812 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
813 v16u8 is_less_than_alpha;
815 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
816 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
817 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
819 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
820 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
822 is_less_than_alpha = ( p0_asub_q0 < alpha );
823 is_less_than_beta = ( p1_asub_p0 < beta );
824 is_less_than = is_less_than_beta & is_less_than_alpha;
825 is_less_than_beta = ( q1_asub_q0 < beta );
826 is_less_than = is_less_than_beta & is_less_than;
827 is_less_than = is_less_than & is_bs_greater_than0;
829 if( !__msa_test_bz_v( is_less_than ) )
831 v16i8 negate_tc, sign_negate_tc;
832 v8i16 negate_tc_r, i16_negatetc_l;
834 negate_tc = zero - ( v16i8 ) tc;
835 sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
837 ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
840 UNPCK_UB_SH( tc, tc_r, tc_l );
841 UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
842 UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
843 UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
847 v16u8 is_less_than_beta_r, is_less_than_beta_l;
849 p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
850 is_less_than_beta = ( p2_asub_p0 < beta );
851 is_less_than_beta = is_less_than_beta & is_less_than;
853 is_less_than_beta_r =
854 ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
856 if( !__msa_test_bz_v( is_less_than_beta_r ) )
858 p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );
860 AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
861 negate_tc_r, tc_r, p1_r );
864 is_less_than_beta_l =
865 ( v16u8 ) __msa_sldi_b( zero,
866 ( v16i8 ) is_less_than_beta, 8 );
867 if( !__msa_test_bz_v( is_less_than_beta_l ) )
869 p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );
871 AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
872 i16_negatetc_l, tc_l, p1_l );
876 if( !__msa_test_bz_v( is_less_than_beta ) )
878 p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
879 p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
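/* Per the spec, tc grows by one for each side whose extra |p2-p0| < beta
 * (resp. |q2-q0| < beta) test passed; the andi/add below turns the 0xff mask
 * into that +1. */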
881 is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
882 tc = tc + is_less_than_beta;
887 v16u8 is_less_than_beta_l, is_less_than_beta_r;
889 u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
890 is_less_than_beta = ( u8_q2asub_q0 < beta );
891 is_less_than_beta = is_less_than_beta & is_less_than;
893 q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
895 is_less_than_beta_r =
896 ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
898 if( !__msa_test_bz_v( is_less_than_beta_r ) )
900 q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
901 AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
902 negate_tc_r, tc_r, q1_r );
905 q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
907 is_less_than_beta_l =
908 ( v16u8 ) __msa_sldi_b( zero,
909 ( v16i8 ) is_less_than_beta, 8 );
910 if( !__msa_test_bz_v( is_less_than_beta_l ) )
912 q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
913 AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
914 i16_negatetc_l, tc_l, q1_l );
918 if( !__msa_test_bz_v( is_less_than_beta ) )
920 q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
921 q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
923 is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
924 tc = tc + is_less_than_beta;
928 v8i16 threshold_r, negate_thresh_r;
929 v8i16 threshold_l, negate_thresh_l;
930 v16i8 negate_thresh, sign_negate_thresh;
932 negate_thresh = zero - ( v16i8 ) tc;
933 sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );
935 ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
936 threshold_r, negate_thresh_r );
938 AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
939 negate_thresh_r, threshold_r, p0_r, q0_r );
941 threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc );
942 negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
945 AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
946 negate_thresh_l, threshold_l, p0_l, q0_l );
949 PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
951 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
952 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
955 v16i8 tp0, tp1, tp2, tp3;
957 v4i32 tmp3, tmp4, tmp6, tmp7;
958 uint32_t u_out0, u_out2;
959 uint16_t u_out1, u_out3;
963 ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 );
964 ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 );
965 ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );
967 ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 );
968 ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 );
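/* Scatter the transposed result: per output row, a 32-bit word
 * ( p2 p1 p0 q0 ) is stored with SW and the trailing ( q1 q2 ) pair with SH
 * at offset +4. */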
970 u_out0 = __msa_copy_u_w( tmp3, 0 );
971 u_out1 = __msa_copy_u_h( tmp2, 0 );
972 u_out2 = __msa_copy_u_w( tmp3, 1 );
973 u_out3 = __msa_copy_u_h( tmp2, 1 );
976 SH( u_out1, ( p_src + 4 ) );
977 p_src += u_img_width;
979 SH( u_out3, ( p_src + 4 ) );
981 u_out0 = __msa_copy_u_w( tmp3, 2 );
982 u_out1 = __msa_copy_u_h( tmp2, 2 );
983 u_out2 = __msa_copy_u_w( tmp3, 3 );
984 u_out3 = __msa_copy_u_h( tmp2, 3 );
986 p_src += u_img_width;
988 SH( u_out1, ( p_src + 4 ) );
989 p_src += u_img_width;
991 SH( u_out3, ( p_src + 4 ) );
993 u_out0 = __msa_copy_u_w( tmp4, 0 );
994 u_out1 = __msa_copy_u_h( tmp2, 4 );
995 u_out2 = __msa_copy_u_w( tmp4, 1 );
996 u_out3 = __msa_copy_u_h( tmp2, 5 );
998 p_src += u_img_width;
1000 SH( u_out1, ( p_src + 4 ) );
1001 p_src += u_img_width;
1002 SW( u_out2, p_src );
1003 SH( u_out3, ( p_src + 4 ) );
1005 u_out0 = __msa_copy_u_w( tmp4, 2 );
1006 u_out1 = __msa_copy_u_h( tmp2, 6 );
1007 u_out2 = __msa_copy_u_w( tmp4, 3 );
1008 u_out3 = __msa_copy_u_h( tmp2, 7 );
1010 p_src += u_img_width;
1011 SW( u_out0, p_src );
1012 SH( u_out1, ( p_src + 4 ) );
1013 p_src += u_img_width;
1014 SW( u_out2, p_src );
1015 SH( u_out3, ( p_src + 4 ) );
1017 u_out0 = __msa_copy_u_w( tmp6, 0 );
1018 u_out1 = __msa_copy_u_h( tmp5, 0 );
1019 u_out2 = __msa_copy_u_w( tmp6, 1 );
1020 u_out3 = __msa_copy_u_h( tmp5, 1 );
1022 p_src += u_img_width;
1023 SW( u_out0, p_src );
1024 SH( u_out1, ( p_src + 4 ) );
1025 p_src += u_img_width;
1026 SW( u_out2, p_src );
1027 SH( u_out3, ( p_src + 4 ) );
1029 u_out0 = __msa_copy_u_w( tmp6, 2 );
1030 u_out1 = __msa_copy_u_h( tmp5, 2 );
1031 u_out2 = __msa_copy_u_w( tmp6, 3 );
1032 u_out3 = __msa_copy_u_h( tmp5, 3 );
1034 p_src += u_img_width;
1035 SW( u_out0, p_src );
1036 SH( u_out1, ( p_src + 4 ) );
1037 p_src += u_img_width;
1038 SW( u_out2, p_src );
1039 SH( u_out3, ( p_src + 4 ) );
1041 u_out0 = __msa_copy_u_w( tmp7, 0 );
1042 u_out1 = __msa_copy_u_h( tmp5, 4 );
1043 u_out2 = __msa_copy_u_w( tmp7, 1 );
1044 u_out3 = __msa_copy_u_h( tmp5, 5 );
1046 p_src += u_img_width;
1047 SW( u_out0, p_src );
1048 SH( u_out1, ( p_src + 4 ) );
1049 p_src += u_img_width;
1050 SW( u_out2, p_src );
1051 SH( u_out3, ( p_src + 4 ) );
1053 u_out0 = __msa_copy_u_w( tmp7, 2 );
1054 u_out1 = __msa_copy_u_h( tmp5, 6 );
1055 u_out2 = __msa_copy_u_w( tmp7, 3 );
1056 u_out3 = __msa_copy_u_h( tmp5, 7 );
1058 p_src += u_img_width;
1059 SW( u_out0, p_src );
1060 SH( u_out1, ( p_src + 4 ) );
1061 p_src += u_img_width;
1062 SW( u_out2, p_src );
1063 SH( u_out3, ( p_src + 4 ) );
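/* Horizontal-edge (x264 "v") variant of the inter luma filter: p2..q1 are
 * loaded directly with LD_UB5, so no transpose is needed and the results are
 * written back with whole-vector ST_UB stores. */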
1068 static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data,
1079 uint32_t u_image_width )
1081 v16u8 p2_asub_p0, u8_q2asub_q0;
1082 v16u8 alpha, beta, is_less_than, is_less_than_beta;
1083 v16u8 p1, p0, q0, q1;
1085 v8i16 p0_r, q0_r, q1_r = { 0 };
1087 v8i16 p0_l, q0_l, q1_l = { 0 };
1088 v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1089 v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
1090 v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
1096 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
1097 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
1098 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
1099 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
1100 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
1101 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
1102 tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
1103 bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );
1105 if( !__msa_test_bz_v( bs ) )
1107 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
1108 tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
1109 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
1110 tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
1111 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
1112 tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
1113 tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
1114 tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );
1116 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
1117 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
1119 LD_UB5( p_data - ( 3 * u_image_width ), u_image_width,
1120 p2_org, p1_org, p0_org, q0_org, q1_org );
1123 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1124 v16u8 is_less_than_alpha, is_bs_greater_than0;
1126 is_bs_greater_than0 = ( ( v16u8 ) zero < bs );
1127 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
1128 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
1129 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
1131 is_less_than_alpha = ( p0_asub_q0 < alpha );
1132 is_less_than_beta = ( p1_asub_p0 < beta );
1133 is_less_than = is_less_than_beta & is_less_than_alpha;
1134 is_less_than_beta = ( q1_asub_q0 < beta );
1135 is_less_than = is_less_than_beta & is_less_than;
1136 is_less_than = is_less_than & is_bs_greater_than0;
1139 if( !__msa_test_bz_v( is_less_than ) )
1141 v16i8 sign_negate_tc, negate_tc;
1142 v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1144 q2_org = LD_UB( p_data + ( 2 * u_image_width ) );
1145 negate_tc = zero - tc;
1146 sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
1148 ILVRL_B2_SH( sign_negate_tc, negate_tc,
1149 negate_tc_r, i16_negatetc_l );
1151 UNPCK_UB_SH( tc, tc_r, tc_l );
1152 UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
1153 UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
1154 UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
1156 p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
1157 is_less_than_beta = ( p2_asub_p0 < beta );
1158 is_less_than_beta = is_less_than_beta & is_less_than;
1160 v8u16 is_less_than_beta_r, is_less_than_beta_l;
1162 is_less_than_beta_r =
1163 ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
1165 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
1167 p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );
1169 AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1170 negate_tc_r, tc_r, p1_r );
1173 is_less_than_beta_l =
1174 ( v8u16 ) __msa_sldi_b( zero,
1175 ( v16i8 ) is_less_than_beta, 8 );
1176 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
1178 p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );
1180 AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1181 i16_negatetc_l, tc_l, p1_l );
1184 if( !__msa_test_bz_v( is_less_than_beta ) )
1186 p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
1187 p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
1188 ST_UB( p1_org, p_data - ( 2 * u_image_width ) );
1190 is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
1191 tc = tc + ( v16i8 ) is_less_than_beta;
1194 u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
1195 is_less_than_beta = ( u8_q2asub_q0 < beta );
1196 is_less_than_beta = is_less_than_beta & is_less_than;
1199 v8u16 is_less_than_beta_r, is_less_than_beta_l;
1200 is_less_than_beta_r =
1201 ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
1204 q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
1205 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
1207 q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
1209 AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1210 negate_tc_r, tc_r, q1_r );
1212 is_less_than_beta_l =
1213 ( v8u16 ) __msa_sldi_b( zero,
1214 ( v16i8 ) is_less_than_beta, 8 );
1216 q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
1217 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
1219 q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
1221 AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1222 i16_negatetc_l, tc_l, q1_l );
1225 if( !__msa_test_bz_v( is_less_than_beta ) )
1227 q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
1228 q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
1229 ST_UB( q1_org, p_data + u_image_width );
1231 is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
1232 tc = tc + ( v16i8 ) is_less_than_beta;
1235 v16i8 negate_thresh, sign_negate_thresh;
1236 v8i16 threshold_r, threshold_l;
1237 v8i16 negate_thresh_l, negate_thresh_r;
1239 negate_thresh = zero - tc;
1240 sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );
1242 ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
1243 threshold_r, negate_thresh_r );
1244 AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1245 negate_thresh_r, threshold_r, p0_r, q0_r );
1247 threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc );
1248 negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
1250 AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1251 negate_thresh_l, threshold_l, p0_l, q0_l );
1254 PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
1256 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
1257 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
1259 ST_UB( p0_org, ( p_data - u_image_width ) );
1260 ST_UB( q0_org, p_data );
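/* Inter chroma deblocking, horizontal edge, interleaved Cb/Cr: only p0 and q0
 * change, via the clipped AVC_LPF_P0Q0 delta. */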
1265 static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma,
1276 uint32_t u_img_width )
1279 v4i32 tmp_vec, bs = { 0 };
1281 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1283 v8i16 is_less_than_r, is_less_than_l;
1284 v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1290 v16u8 p1_org, p0_org, q0_org, q1_org;
1291 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1292 v16i8 negate_tc, sign_negate_tc;
1293 v8i16 negate_tc_r, i16_negatetc_l;
1296 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1298 tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 );
1299 bs = __msa_insve_w( bs, 0, tmp_vec );
1300 tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 );
1301 bs = __msa_insve_w( bs, 1, tmp_vec );
1302 tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 );
1303 bs = __msa_insve_w( bs, 2, tmp_vec );
1304 tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 );
1305 bs = __msa_insve_w( bs, 3, tmp_vec );
1307 if( !__msa_test_bz_v( ( v16u8 ) bs ) )
1309 tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 );
1310 tc = __msa_insve_w( tc, 0, tmp_vec );
1311 tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 );
1312 tc = __msa_insve_w( tc, 1, tmp_vec );
1313 tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 );
1314 tc = __msa_insve_w( tc, 2, tmp_vec );
1315 tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 );
1316 tc = __msa_insve_w( tc, 3, tmp_vec );
1318 is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );
1320 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
1321 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
1323 LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
1324 p1_org, p0_org, q0_org, q1_org );
1326 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
1327 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
1328 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
1330 is_less_than_alpha = ( p0_asub_q0 < alpha );
1331 is_less_than_beta = ( p1_asub_p0 < beta );
1332 is_less_than = is_less_than_beta & is_less_than_alpha;
1333 is_less_than_beta = ( q1_asub_q0 < beta );
1334 is_less_than = is_less_than_beta & is_less_than;
1336 is_less_than = is_less_than & is_bs_greater_than0;
1338 if( !__msa_test_bz_v( is_less_than ) )
1340 negate_tc = zero - ( v16i8 ) tc;
1341 sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
1343 ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
1346 UNPCK_UB_SH( tc, tc_r, tc_l );
1347 UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
1348 UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
1349 UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
1350 UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
1353 ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 );
1354 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
1356 AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1357 negate_tc_r, tc_r, p0_r, q0_r );
1361 ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 );
1362 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
1364 AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1365 i16_negatetc_l, tc_l, p0_l, q0_l );
1368 PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
1370 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
1371 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
1373 ST_UB( p0_org, p_chroma - u_img_width );
1374 ST_UB( q0_org, p_chroma );
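/* Vertical-edge inter chroma filter: bs and tc0 are replicated once per CbCr
 * pair (two insve_h per value), rows are transposed and de-interleaved as in
 * the intra case, and only groups with 0 < bs < 4 are filtered here (bs == 4
 * edges go through the intra functions). */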
1379 static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma,
1390 uint32_t u_img_width )
1393 v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1394 v16u8 is_less_than, is_less_than1;
1395 v8i16 is_less_than_r, is_less_than_l;
1396 v16u8 is_less_than_beta, is_less_than_alpha;
1401 v16u8 p1_org, p0_org, q0_org, q1_org;
1402 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1403 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1404 v16u8 is_bs_less_than4, is_bs_greater_than0;
1405 v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
1408 v8i16 tmp_vec, bs = { 0 };
1410 v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
1411 v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
1412 v16i8 tmp0, tmp1, tmp2, tmp3;
1414 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1415 v16i8 negate_tc, sign_negate_tc;
1417 const4 = ( v16u8 ) __msa_ldi_b( 4 );
1419 tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 );
1420 bs = __msa_insve_h( bs, 0, tmp_vec );
1421 bs = __msa_insve_h( bs, 4, tmp_vec );
1423 tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 );
1424 bs = __msa_insve_h( bs, 1, tmp_vec );
1425 bs = __msa_insve_h( bs, 5, tmp_vec );
1427 tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 );
1428 bs = __msa_insve_h( bs, 2, tmp_vec );
1429 bs = __msa_insve_h( bs, 6, tmp_vec );
1431 tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 );
1432 bs = __msa_insve_h( bs, 3, tmp_vec );
1433 bs = __msa_insve_h( bs, 7, tmp_vec );
1435 if( !__msa_test_bz_v( ( v16u8 ) bs ) )
1437 tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 );
1438 tc = __msa_insve_h( tc, 0, tmp_vec );
1439 tc = __msa_insve_h( tc, 4, tmp_vec );
1441 tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 );
1442 tc = __msa_insve_h( tc, 1, tmp_vec );
1443 tc = __msa_insve_h( tc, 5, tmp_vec );
1445 tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 );
1446 tc = __msa_insve_h( tc, 2, tmp_vec );
1447 tc = __msa_insve_h( tc, 6, tmp_vec );
1449 tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 );
1450 tc = __msa_insve_h( tc, 3, tmp_vec );
1451 tc = __msa_insve_h( tc, 7, tmp_vec );
1453 is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );
1455 LD_UB8( ( p_chroma - 4 ), u_img_width,
1456 row0, row1, row2, row3, row4, row5, row6, row7 );
1458 TRANSPOSE8x8_UB_UB( row0, row1, row2, row3,
1459 row4, row5, row6, row7,
1460 p1_u_org, p1_v_org, p0_u_org, p0_v_org,
1461 q0_u_org, q0_v_org, q1_u_org, q1_v_org );
1463 ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
1464 q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );
1466 p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
1467 p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
1468 q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
1470 alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
1471 beta = ( v16u8 ) __msa_fill_b( u_beta_in );
1473 is_less_than_alpha = ( p0_asub_q0 < alpha );
1474 is_less_than_beta = ( p1_asub_p0 < beta );
1475 is_less_than = is_less_than_beta & is_less_than_alpha;
1476 is_less_than_beta = ( q1_asub_q0 < beta );
1477 is_less_than = is_less_than_beta & is_less_than;
1478 is_less_than = is_bs_greater_than0 & is_less_than;
1480 if( !__msa_test_bz_v( is_less_than ) )
1482 UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
1483 UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
1484 UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
1485 UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
1487 is_bs_less_than4 = ( ( v16u8 ) bs < const4 );
1489 is_less_than1 = is_less_than & is_bs_less_than4;
1490 if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) )
1492 negate_tc = zero - ( v16i8 ) tc;
1493 sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
1495 ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
1498 UNPCK_UB_SH( tc, tc_r, tc_l );
1501 ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 );
1502 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
1504 AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1505 negate_tc_r, tc_r, p0_r, q0_r );
1509 ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 );
1510 if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
1512 AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1513 i16_negatetc_l, tc_l, p0_l, q0_l );
1516 PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
1518 p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 );
1519 q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 );
1522 SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
1523 ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
1524 ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
1525 ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );
1526 ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
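/* Vector boundary-strength computation: for each 4-sample group along an
 * edge, bs = 2 if either neighbouring block has non-zero coefficients, else 1
 * if the list-0 refs differ, the |mvx| difference is >= 4, or the |mvy|
 * difference is >= i_mvy_limit, else 0.  Only pi_ref[0] / pi_mv[0] are
 * examined; edges that also need list-1 checks are handled by the scalar loop
 * in the exported wrapper below. */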
1531 static void avc_deblock_strength_msa( uint8_t *nnz,
1532 int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
1533 int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
1534 uint8_t pu_bs[2][8][4],
1535 int32_t i_mvy_limit )
1538 v16u8 nnz0, nnz1, nnz2, nnz3, nnz4;
1539 v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 };
1540 v16i8 ref0, ref1, ref2, ref3, ref4;
1541 v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5;
1542 v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b;
1543 v8u16 four, mvy_limit_vec, sub0, sub1;
1545 nnz0 = LD_UB( nnz + 4 );
1546 nnz2 = LD_UB( nnz + 20 );
1547 nnz4 = LD_UB( nnz + 36 );
1549 ref0 = LD_SB( pi_ref[0] + 4 );
1550 ref2 = LD_SB( pi_ref[0] + 20 );
1551 ref4 = LD_SB( pi_ref[0] + 36 );
1553 mv0 = LD_SH( ( pi_mv[0] + 4 )[0] );
1554 mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
1555 mv2 = LD_SH( ( pi_mv[0] + 20 )[0] );
1556 mv3 = LD_SH( ( pi_mv[0] + 28 )[0] );
1557 mv4 = LD_SH( ( pi_mv[0] + 36 )[0] );
1559 mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
1560 four = ( v8u16 ) __msa_fill_h( 4 );
1561 mask = ( v16u8 ) __msa_ldi_b( 0 );
1562 one = ( v16u8 ) __msa_ldi_b( 1 );
1563 two = ( v16u8 ) __msa_ldi_b( 2 );
1565 mv5 = __msa_pckod_h( mv0, mv0 );
1566 mv6 = __msa_pckod_h( mv1, mv1 );
1567 mv_a = __msa_pckev_h( mv0, mv0 );
1568 mv_b = __msa_pckev_h( mv1, mv1 );
1569 nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 );
1570 ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 );
1571 nnz_mask = nnz0 | nnz1;
1572 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1573 two = __msa_bmnz_v( two, mask, nnz_mask );
1575 ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
1576 ref_mask = ref_mask ^ 255;
1578 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1579 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1581 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1582 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1584 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1585 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1587 dst = __msa_bmnz_v( dst, one, ref_mask );
1588 dst = __msa_bmnz_v( two, dst, nnz_mask );
1590 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1591 SW( u_tmp, pu_bs[1][0] );
1593 dst = ( v16u8 ) __msa_ldi_b( 0 );
1594 two = ( v16u8 ) __msa_ldi_b( 2 );
1596 mv5 = __msa_pckod_h( mv1, mv1 );
1597 mv6 = __msa_pckod_h( mv2, mv2 );
1598 mv_a = __msa_pckev_h( mv1, mv1 );
1599 mv_b = __msa_pckev_h( mv2, mv2 );
1601 nnz_mask = nnz2 | nnz1;
1602 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1603 two = __msa_bmnz_v( two, mask, nnz_mask );
1605 ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
1606 ref_mask = ref_mask ^ 255;
1608 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1609 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1610 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1611 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1613 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1614 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1616 dst = __msa_bmnz_v( dst, one, ref_mask );
1617 dst = __msa_bmnz_v( two, dst, nnz_mask );
1619 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1620 SW( u_tmp, pu_bs[1][1] );
1622 dst = ( v16u8 ) __msa_ldi_b( 0 );
1623 two = ( v16u8 ) __msa_ldi_b( 2 );
1625 mv5 = __msa_pckod_h( mv2, mv2 );
1626 mv6 = __msa_pckod_h( mv3, mv3 );
1627 mv_a = __msa_pckev_h( mv2, mv2 );
1628 mv_b = __msa_pckev_h( mv3, mv3 );
1630 nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 );
1631 ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 );
1633 nnz_mask = nnz3 | nnz2;
1634 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1635 two = __msa_bmnz_v( two, mask, nnz_mask );
1637 ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
1638 ref_mask = ref_mask ^ 255;
1640 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1641 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1643 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1644 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1646 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1647 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1649 dst = __msa_bmnz_v( dst, one, ref_mask );
1650 dst = __msa_bmnz_v( two, dst, nnz_mask );
1652 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1653 SW( u_tmp, pu_bs[1][2] );
1655 dst = ( v16u8 ) __msa_ldi_b( 0 );
1656 two = ( v16u8 ) __msa_ldi_b( 2 );
1658 mv5 = __msa_pckod_h( mv3, mv3 );
1659 mv6 = __msa_pckod_h( mv4, mv4 );
1660 mv_a = __msa_pckev_h( mv3, mv3 );
1661 mv_b = __msa_pckev_h( mv4, mv4 );
1663 nnz_mask = nnz4 | nnz3;
1664 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1665 two = __msa_bmnz_v( two, mask, nnz_mask );
1667 ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
1668 ref_mask = ref_mask ^ 255;
1670 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1671 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1673 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1674 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1676 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1677 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1679 dst = __msa_bmnz_v( dst, one, ref_mask );
1680 dst = __msa_bmnz_v( two, dst, nnz_mask );
1682 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1683 SW( u_tmp, pu_bs[1][3] );
1685 nnz0 = LD_UB( nnz + 8 );
1686 nnz2 = LD_UB( nnz + 24 );
1688 ref0 = LD_SB( pi_ref[0] + 8 );
1689 ref2 = LD_SB( pi_ref[0] + 24 );
1691 mv0 = LD_SH( ( pi_mv[0] + 8 )[0] );
1692 mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
1693 mv2 = LD_SH( ( pi_mv[0] + 16 )[0] );
1694 mv3 = LD_SH( ( pi_mv[0] + 20 )[0] );
1695 mv4 = LD_SH( ( pi_mv[0] + 24 )[0] );
1696 mv7 = LD_SH( ( pi_mv[0] + 28 )[0] );
1697 mv8 = LD_SH( ( pi_mv[0] + 32 )[0] );
1698 mv9 = LD_SH( ( pi_mv[0] + 36 )[0] );
1700 nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 );
1701 nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 );
1703 ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 );
1705 ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 );
1707 nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
1708 nnz1 = ( v16u8 ) temp_vec4;
1709 nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 );
1710 nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 );
1711 nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 );
1713 ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 );
1714 ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 );
1716 ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 );
1718 ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 );
1720 ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
1722 ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 );
1723 ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 );
1724 ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 );
1726 TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 );
1727 TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 );
1729 mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
1730 four = ( v8u16 ) __msa_fill_h( 4 );
1731 mask = ( v16u8 ) __msa_ldi_b( 0 );
1732 one = ( v16u8 ) __msa_ldi_b( 1 );
1733 two = ( v16u8 ) __msa_ldi_b( 2 );
1734 dst = ( v16u8 ) __msa_ldi_b( 0 );
1736 mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 );
1737 mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
1741 nnz_mask = nnz0 | nnz1;
1742 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1743 two = __msa_bmnz_v( two, mask, nnz_mask );
1745 ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
1746 ref_mask = ref_mask ^ 255;
1748 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1749 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1751 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1752 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1754 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1755 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1757 dst = __msa_bmnz_v( dst, one, ref_mask );
1758 dst = __msa_bmnz_v( two, dst, nnz_mask );
1760 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1761 SW( u_tmp, pu_bs[0][0] );
1763 two = ( v16u8 ) __msa_ldi_b( 2 );
1764 dst = ( v16u8 ) __msa_ldi_b( 0 );
1766 mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
1767 mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
1771 nnz_mask = nnz1 | nnz2;
1772 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1773 two = __msa_bmnz_v( two, mask, nnz_mask );
1775 ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
1776 ref_mask = ref_mask ^ 255;
1778 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1779 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1780 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1781 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1783 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1784 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1786 dst = __msa_bmnz_v( dst, one, ref_mask );
1787 dst = __msa_bmnz_v( two, dst, nnz_mask );
1789 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1790 SW( u_tmp, pu_bs[0][1] );
1792 two = ( v16u8 ) __msa_ldi_b( 2 );
1793 dst = ( v16u8 ) __msa_ldi_b( 0 );
1795 mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
1796 mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
1800 nnz_mask = nnz2 | nnz3;
1801 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1802 two = __msa_bmnz_v( two, mask, nnz_mask );
1804 ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
1805 ref_mask = ref_mask ^ 255;
1807 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1808 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1809 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1810 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1812 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1813 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1815 dst = __msa_bmnz_v( dst, one, ref_mask );
1816 dst = __msa_bmnz_v( two, dst, nnz_mask );
1818 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1819 SW( u_tmp, pu_bs[0][2] );
1821 two = ( v16u8 ) __msa_ldi_b( 2 );
1822 dst = ( v16u8 ) __msa_ldi_b( 0 );
1824 mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
1825 mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 );
1829 nnz_mask = nnz3 | nnz4;
1830 nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1831 two = __msa_bmnz_v( two, mask, nnz_mask );
1833 ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
1834 ref_mask = ref_mask ^ 255;
1836 sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1837 sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1838 sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1839 sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1841 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1842 ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1844 dst = __msa_bmnz_v( dst, one, ref_mask );
1845 dst = __msa_bmnz_v( two, dst, nnz_mask );
1847 u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1848 SW( u_tmp, pu_bs[0][3] );
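/* Exported wrappers.  Note the naming flip: x264's _v_ entry points filter a
 * horizontal edge (neighbours at +/- stride) and therefore map to the
 * *_edge_hor_msa routines, while _h_ entry points filter a vertical edge and
 * map to *_edge_ver_msa. */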
1851 void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
1852 int32_t i_alpha, int32_t i_beta )
1854 avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
1855 ( uint8_t ) i_beta, i_stride );
1858 void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
1859 int32_t i_alpha, int32_t i_beta )
1861 avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
1862 ( uint8_t ) i_beta, i_stride );
1865 void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
1866 int32_t i_alpha, int32_t i_beta )
1868 avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
1869 ( uint8_t ) i_beta, i_stride );
1872 void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
1873 int32_t i_alpha, int32_t i_beta )
1875 avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
1876 ( uint8_t ) i_beta, i_stride );
1879 void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride,
1880 int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
1887 if( p_tc0[0] < 0 ) u_bs0 = 0;
1888 if( p_tc0[1] < 0 ) u_bs1 = 0;
1889 if( p_tc0[2] < 0 ) u_bs2 = 0;
1890 if( p_tc0[3] < 0 ) u_bs3 = 0;
1892 avc_loopfilter_luma_inter_edge_ver_msa( p_pix,
1893 u_bs0, u_bs1, u_bs2, u_bs3,
1894 p_tc0[0], p_tc0[1], p_tc0[2],
1895 p_tc0[3], i_alpha, i_beta,
1899 void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride,
1900 int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
1907 if( p_tc0[0] < 0 ) u_bs0 = 0;
1908 if( p_tc0[1] < 0 ) u_bs1 = 0;
1909 if( p_tc0[2] < 0 ) u_bs2 = 0;
1910 if( p_tc0[3] < 0 ) u_bs3 = 0;
1912 avc_loopfilter_luma_inter_edge_hor_msa( p_pix,
1913 u_bs0, u_bs1, u_bs2, u_bs3,
1914 p_tc0[0], p_tc0[1], p_tc0[2],
1915 p_tc0[3], i_alpha, i_beta,
1919 void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
1920 int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
1927 if( p_tc0[0] < 0 ) u_bs0 = 0;
1928 if( p_tc0[1] < 0 ) u_bs1 = 0;
1929 if( p_tc0[2] < 0 ) u_bs2 = 0;
1930 if( p_tc0[3] < 0 ) u_bs3 = 0;
1932 avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix,
1933 u_bs0, u_bs1, u_bs2, u_bs3,
1934 p_tc0[0], p_tc0[1], p_tc0[2],
1935 p_tc0[3], i_alpha, i_beta,
1939 void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
1940 int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
1947 if( p_tc0[0] < 0 ) u_bs0 = 0;
1948 if( p_tc0[1] < 0 ) u_bs1 = 0;
1949 if( p_tc0[2] < 0 ) u_bs2 = 0;
1950 if( p_tc0[3] < 0 ) u_bs3 = 0;
1952 avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix,
1953 u_bs0, u_bs1, u_bs2, u_bs3,
1954 p_tc0[0], p_tc0[1], p_tc0[2],
1955 p_tc0[3], i_alpha, i_beta,
1959 void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE],
1960 int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
1961 int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
1962 uint8_t pu_bs[2][8][4], int32_t i_mvy_limit,
1967 for( int32_t i_dir = 0; i_dir < 2; i_dir++ )
1969 int32_t s1 = i_dir ? 1 : 8;
1970 int32_t s2 = i_dir ? 8 : 1;
1972 for( int32_t i_edge = 0; i_edge < 4; i_edge++ )
1974 for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4;
1977 int32_t locn = loc - s2;
1978 if( u_nnz[loc] || u_nnz[locn] )
1980 pu_bs[i_dir][i_edge][i] = 2;
1982 else if( pi_ref[0][loc] != pi_ref[0][locn] ||
1983 abs( pi_mv[0][loc][0] -
1984 pi_mv[0][locn][0] ) >= 4 ||
1985 abs( pi_mv[0][loc][1] -
1986 pi_mv[0][locn][1] ) >= i_mvy_limit ||
1988 ( pi_ref[1][loc] != pi_ref[1][locn] ||
1989 abs( pi_mv[1][loc][0] -
1990 pi_mv[1][locn][0] ) >= 4 ||
1991 abs( pi_mv[1][loc][1] -
1992 pi_mv[1][locn][1] ) >= i_mvy_limit ) )
1995 pu_bs[i_dir][i_edge][i] = 1;
1999 pu_bs[i_dir][i_edge][i] = 0;
2007 avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit );