/*****************************************************************************
 * macros.h: msa macros
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
26 #ifndef X264_MIPS_MACROS_H
27 #define X264_MIPS_MACROS_H
32 #define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
33 #define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ )
34 #define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ )
36 #define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
37 #define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ )
39 #define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
40 #define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ )
42 #define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
43 #define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ )
44 #define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ )
46 #define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
47 #define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ )
48 #define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ )
50 #if ( __mips_isa_rev >= 6 )
53 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
57 "lh %[u_val_h_m], %[p_src_m] \n\t" \
59 : [u_val_h_m] "=r" ( u_val_h_m ) \
60 : [p_src_m] "m" ( *p_src_m ) \
68 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
72 "lw %[u_val_w_m], %[p_src_m] \n\t" \
74 : [u_val_w_m] "=r" ( u_val_w_m ) \
75 : [p_src_m] "m" ( *p_src_m ) \
84 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
85 uint64_t u_val_d_m = 0; \
88 "ld %[u_val_d_m], %[p_src_m] \n\t" \
90 : [u_val_d_m] "=r" ( u_val_d_m ) \
91 : [p_src_m] "m" ( *p_src_m ) \
96 #else // !( __mips == 64 )
99 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
100 uint32_t u_val0_m, u_val1_m; \
101 uint64_t u_val_d_m = 0; \
103 u_val0_m = LW( p_src_m ); \
104 u_val1_m = LW( p_src_m + 4 ); \
106 u_val_d_m = ( uint64_t ) ( u_val1_m ); \
107 u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \
108 0xFFFFFFFF00000000 ); \
109 u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \
113 #endif // ( __mips == 64 )
115 #define SH( u_val, p_dst ) \
117 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
118 uint16_t u_val_h_m = ( u_val ); \
121 "sh %[u_val_h_m], %[p_dst_m] \n\t" \
123 : [p_dst_m] "=m" ( *p_dst_m ) \
124 : [u_val_h_m] "r" ( u_val_h_m ) \
128 #define SW( u_val, p_dst ) \
130 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
131 uint32_t u_val_w_m = ( u_val ); \
134 "sw %[u_val_w_m], %[p_dst_m] \n\t" \
136 : [p_dst_m] "=m" ( *p_dst_m ) \
137 : [u_val_w_m] "r" ( u_val_w_m ) \
141 #define SD( u_val, p_dst ) \
143 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
144 uint64_t u_val_d_m = ( u_val ); \
147 "sd %[u_val_d_m], %[p_dst_m] \n\t" \
149 : [p_dst_m] "=m" ( *p_dst_m ) \
150 : [u_val_d_m] "r" ( u_val_d_m ) \
154 #else // !( __mips_isa_rev >= 6 )
155 #define LH( p_src ) \
157 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
158 uint16_t u_val_h_m; \
161 "ulh %[u_val_h_m], %[p_src_m] \n\t" \
163 : [u_val_h_m] "=r" ( u_val_h_m ) \
164 : [p_src_m] "m" ( *p_src_m ) \
170 #define LW( p_src ) \
172 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
173 uint32_t u_val_w_m; \
176 "ulw %[u_val_w_m], %[p_src_m] \n\t" \
178 : [u_val_w_m] "=r" ( u_val_w_m ) \
179 : [p_src_m] "m" ( *p_src_m ) \
186 #define LD( p_src ) \
188 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
189 uint64_t u_val_d_m = 0; \
192 "uld %[u_val_d_m], %[p_src_m] \n\t" \
194 : [u_val_d_m] "=r" ( u_val_d_m ) \
195 : [p_src_m] "m" ( *p_src_m ) \
200 #else // !( __mips == 64 )
201 #define LD( p_src ) \
203 uint8_t *psrc_m1 = ( uint8_t * ) ( p_src ); \
204 uint32_t u_val0_m, u_val1_m; \
205 uint64_t u_val_d_m = 0; \
207 u_val0_m = LW( psrc_m1 ); \
208 u_val1_m = LW( psrc_m1 + 4 ); \
210 u_val_d_m = ( uint64_t ) ( u_val1_m ); \
211 u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \
212 0xFFFFFFFF00000000 ); \
213 u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \
217 #endif // ( __mips == 64 )
219 #define SH( u_val, p_dst ) \
221 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
222 uint16_t u_val_h_m = ( u_val ); \
225 "ush %[u_val_h_m], %[p_dst_m] \n\t" \
227 : [p_dst_m] "=m" ( *p_dst_m ) \
228 : [u_val_h_m] "r" ( u_val_h_m ) \
232 #define SW( u_val, p_dst ) \
234 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
235 uint32_t u_val_w_m = ( u_val ); \
238 "usw %[u_val_w_m], %[p_dst_m] \n\t" \
240 : [p_dst_m] "=m" ( *p_dst_m ) \
241 : [u_val_w_m] "r" ( u_val_w_m ) \
245 #define SD( u_val, p_dst ) \
247 uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst ); \
248 uint32_t u_val0_m, u_val1_m; \
250 u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF ); \
251 u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \
253 SW( u_val0_m, p_dst_m1 ); \
254 SW( u_val1_m, p_dst_m1 + 4 ); \
257 #endif // ( __mips_isa_rev >= 6 )
259 /* Description : Load 4 words with stride
260 Arguments : Inputs - psrc (source pointer to load from)
262 Outputs - out0, out1, out2, out3
263 Details : Load word in 'out0' from (psrc)
264 Load word in 'out1' from (psrc + stride)
265 Load word in 'out2' from (psrc + 2 * stride)
266 Load word in 'out3' from (psrc + 3 * stride)
268 #define LW4( p_src, stride, out0, out1, out2, out3 ) \
270 out0 = LW( ( p_src ) ); \
271 out1 = LW( ( p_src ) + stride ); \
272 out2 = LW( ( p_src ) + 2 * stride ); \
273 out3 = LW( ( p_src ) + 3 * stride ); \
276 /* Description : Store 4 words with stride
277 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
278 Details : Store word from 'in0' to (pdst)
279 Store word from 'in1' to (pdst + stride)
280 Store word from 'in2' to (pdst + 2 * stride)
281 Store word from 'in3' to (pdst + 3 * stride)
283 #define SW4( in0, in1, in2, in3, p_dst, stride ) \
285 SW( in0, ( p_dst ) ) \
286 SW( in1, ( p_dst ) + stride ); \
287 SW( in2, ( p_dst ) + 2 * stride ); \
288 SW( in3, ( p_dst ) + 3 * stride ); \
291 /* Description : Store 4 double words with stride
292 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
293 Details : Store double word from 'in0' to (pdst)
294 Store double word from 'in1' to (pdst + stride)
295 Store double word from 'in2' to (pdst + 2 * stride)
296 Store double word from 'in3' to (pdst + 3 * stride)
298 #define SD4( in0, in1, in2, in3, p_dst, stride ) \
300 SD( in0, ( p_dst ) ) \
301 SD( in1, ( p_dst ) + stride ); \
302 SD( in2, ( p_dst ) + 2 * stride ); \
303 SD( in3, ( p_dst ) + 3 * stride ); \
306 /* Description : Load vectors with 16 byte elements with stride
307 Arguments : Inputs - psrc (source pointer to load from)
310 Return Type - as per RTYPE
311 Details : Load 16 byte elements in 'out0' from (psrc)
312 Load 16 byte elements in 'out1' from (psrc + stride)
314 #define LD_B2( RTYPE, p_src, stride, out0, out1 ) \
316 out0 = LD_B( RTYPE, ( p_src ) ); \
317 out1 = LD_B( RTYPE, ( p_src ) + stride ); \
319 #define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ )
320 #define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ )
322 #define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \
324 LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \
325 out2 = LD_B( RTYPE, ( p_src ) + 2 * stride ); \
327 #define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ )
328 #define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ )
330 #define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \
332 LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \
333 LD_B2( RTYPE, ( p_src ) + 2 * stride , stride, out2, out3 ); \
335 #define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ )
336 #define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ )
338 #define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \
340 LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \
341 out4 = LD_B( RTYPE, ( p_src ) + 4 * stride ); \
343 #define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ )
344 #define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ )
346 #define LD_B8( RTYPE, p_src, stride, \
347 out0, out1, out2, out3, out4, out5, out6, out7 ) \
349 LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \
350 LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \
352 #define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ )
353 #define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ )
355 /* Description : Load vectors with 8 halfword elements with stride
356 Arguments : Inputs - psrc (source pointer to load from)
359 Details : Load 8 halfword elements in 'out0' from (psrc)
360 Load 8 halfword elements in 'out1' from (psrc + stride)
362 #define LD_H2( RTYPE, p_src, stride, out0, out1 ) \
364 out0 = LD_H( RTYPE, ( p_src ) ); \
365 out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \
367 #define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ )
369 #define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \
371 LD_H2( RTYPE, ( p_src ), stride, out0, out1 ); \
372 LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \
374 #define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ )
376 #define LD_H8( RTYPE, p_src, stride, \
377 out0, out1, out2, out3, out4, out5, out6, out7 ) \
379 LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \
380 LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \
382 #define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ )
384 /* Description : Load 4x4 block of signed halfword elements from 1D source
385 data into 4 vectors (Each vector with 4 signed halfwords)
386 Arguments : Inputs - psrc
387 Outputs - out0, out1, out2, out3
389 #define LD4x4_SH( p_src, out0, out1, out2, out3 ) \
391 out0 = LD_SH( p_src ); \
392 out2 = LD_SH( p_src + 8 ); \
393 out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \
394 out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \
397 /* Description : Load 2 vectors of signed word elements with stride
398 Arguments : Inputs - psrc (source pointer to load from)
401 Return Type - signed word
403 #define LD_SW2( p_src, stride, out0, out1 ) \
405 out0 = LD_SW( ( p_src ) ); \
406 out1 = LD_SW( ( p_src ) + stride ); \
409 /* Description : Store vectors of 16 byte elements with stride
410 Arguments : Inputs - in0, in1, stride
411 - pdst (destination pointer to store to)
412 Details : Store 16 byte elements from 'in0' to (pdst)
413 Store 16 byte elements from 'in1' to (pdst + stride)
415 #define ST_B2( RTYPE, in0, in1, p_dst, stride ) \
417 ST_B( RTYPE, in0, ( p_dst ) ); \
418 ST_B( RTYPE, in1, ( p_dst ) + stride ); \
420 #define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ )
422 #define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \
424 ST_B2( RTYPE, in0, in1, ( p_dst ), stride ); \
425 ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \
427 #define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ )
428 #define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ )
430 #define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
433 ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ); \
434 ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \
436 #define ST_UB8( ... ) ST_B8( v16u8, __VA_ARGS__ )
438 /* Description : Store vectors of 8 halfword elements with stride
439 Arguments : Inputs - in0, in1, stride
440 - pdst (destination pointer to store to)
441 Details : Store 8 halfword elements from 'in0' to (pdst)
442 Store 8 halfword elements from 'in1' to (pdst + stride)
444 #define ST_H2( RTYPE, in0, in1, p_dst, stride ) \
446 ST_H( RTYPE, in0, ( p_dst ) ); \
447 ST_H( RTYPE, in1, ( p_dst ) + stride ); \
449 #define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ )
451 #define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \
453 ST_H2( RTYPE, in0, in1, ( p_dst ), stride ); \
454 ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \
456 #define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ )
458 #define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \
460 ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride ); \
461 ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \
463 #define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ )
465 /* Description : Store 2x4 byte block to destination memory from input vector
466 Arguments : Inputs - in, stidx, pdst, stride
467 Details : Index 'stidx' halfword element from 'in' vector is copied to
468 GP register and stored to (pdst)
469 Index 'stidx+1' halfword element from 'in' vector is copied to
470 GP register and stored to (pdst + stride)
471 Index 'stidx+2' halfword element from 'in' vector is copied to
472 GP register and stored to (pdst + 2 * stride)
473 Index 'stidx+3' halfword element from 'in' vector is copied to
474 GP register and stored to (pdst + 3 * stride)
476 #define ST2x4_UB( in, stidx, p_dst, stride ) \
478 uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
479 uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst ); \
481 u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) ); \
482 u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) ); \
483 u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) ); \
484 u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) ); \
486 SH( u_out0_m, pblk_2x4_m ); \
487 SH( u_out1_m, pblk_2x4_m + stride ); \
488 SH( u_out2_m, pblk_2x4_m + 2 * stride ); \
489 SH( u_out3_m, pblk_2x4_m + 3 * stride ); \
492 /* Description : Store 4x4 byte block to destination memory from input vector
493 Arguments : Inputs - in0, in1, pdst, stride
494 Details : 'Idx0' word element from input vector 'in0' is copied to
495 GP register and stored to (pdst)
496 'Idx1' word element from input vector 'in0' is copied to
497 GP register and stored to (pdst + stride)
498 'Idx2' word element from input vector 'in0' is copied to
499 GP register and stored to (pdst + 2 * stride)
500 'Idx3' word element from input vector 'in0' is copied to
501 GP register and stored to (pdst + 3 * stride)
503 #define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride ) \
505 uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
506 uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst ); \
508 u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 ); \
509 u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 ); \
510 u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 ); \
511 u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 ); \
513 SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \
516 #define ST4x8_UB( in0, in1, p_dst, stride ) \
518 uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst ); \
520 ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride ); \
521 ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride ); \
524 /* Description : Store 8x1 byte block to destination memory from input vector
525 Arguments : Inputs - in, pdst
526 Details : Index 0 double word element from 'in' vector is copied to
527 GP register and stored to (pdst)
529 #define ST8x1_UB( in, p_dst ) \
532 u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 ); \
533 SD( u_out0_m, p_dst ); \
536 /* Description : Store 8x4 byte block to destination memory from input
538 Arguments : Inputs - in0, in1, pdst, stride
539 Details : Index 0 double word element from 'in0' vector is copied to
540 GP register and stored to (pdst)
541 Index 1 double word element from 'in0' vector is copied to
542 GP register and stored to (pdst + stride)
543 Index 0 double word element from 'in1' vector is copied to
544 GP register and stored to (pdst + 2 * stride)
545 Index 1 double word element from 'in1' vector is copied to
546 GP register and stored to (pdst + 3 * stride)
548 #define ST8x4_UB( in0, in1, p_dst, stride ) \
550 uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
551 uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst ); \
553 u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 ); \
554 u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 ); \
555 u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 ); \
556 u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 ); \
558 SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \
561 /* Description : average with rounding (in0 + in1 + 1) / 2.
562 Arguments : Inputs - in0, in1, in2, in3,
564 Return Type - as per RTYPE
565 Details : Each unsigned byte element from 'in0' vector is added with
566 each unsigned byte element from 'in1' vector.
567 Average with rounding is calculated and written to 'out0'
569 #define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
571 out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \
572 out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \
574 #define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ )
576 #define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
577 out0, out1, out2, out3 ) \
579 AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
580 AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 ) \
582 #define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ )
584 /* Description : Immediate number of elements to slide with zero
585 Arguments : Inputs - in0, in1, slide_val
587 Return Type - as per RTYPE
588 Details : Byte elements from 'zero_m' vector are slide into 'in0' by
589 value specified in 'slide_val'
591 #define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val ) \
593 v16i8 zero_m = { 0 }; \
594 out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \
595 ( v16i8 ) in0, slide_val ); \
596 out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \
597 ( v16i8 ) in1, slide_val ); \
599 #define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ )
601 /* Description : Immediate number of elements to slide
602 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
604 Return Type - as per RTYPE
605 Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by
606 value specified in 'slide_val'
608 #define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \
610 out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0, \
612 out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1, \
615 #define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ )
617 /* Description : Shuffle byte vector elements as per mask vector
618 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
620 Return Type - as per RTYPE
621 Details : Selective byte elements from 'in0' & 'in1' are copied to
622 'out0' as per control vector 'mask0'
624 #define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \
626 out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0, \
627 ( v16i8 ) in1, ( v16i8 ) in0 ); \
628 out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1, \
629 ( v16i8 ) in3, ( v16i8 ) in2 ); \
631 #define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ )
632 #define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ )
634 /* Description : Shuffle halfword vector elements as per mask vector
635 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
637 Return Type - as per RTYPE
638 Details : Selective byte elements from 'in0' & 'in1' are copied to
639 'out0' as per control vector 'mask0'
641 #define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \
643 out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0, \
644 ( v8i16 ) in1, ( v8i16 ) in0 ); \
645 out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1, \
646 ( v8i16 ) in3, ( v8i16 ) in2 ); \
648 #define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ )
650 /* Description : Dot product of byte vector elements
651 Arguments : Inputs - mult0, mult1
654 Return Type - as per RTYPE
655 Details : Unsigned byte elements from 'mult0' are multiplied with
656 unsigned byte elements from 'cnst0' producing a result
657 twice the size of input i.e. unsigned halfword.
658 Multiplication result of adjacent odd-even elements
659 are added together and written to the 'out0' vector
661 #define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \
663 out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 ); \
664 out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 ); \
666 #define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ )
668 #define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3, \
669 cnst0, cnst1, cnst2, cnst3, \
670 out0, out1, out2, out3 ) \
672 DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \
673 DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \
675 #define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ )
677 /* Description : Dot product of byte vector elements
678 Arguments : Inputs - mult0, mult1
681 Return Type - as per RTYPE
682 Details : Signed byte elements from 'mult0' are multiplied with
683 signed byte elements from 'cnst0' producing a result
684 twice the size of input i.e. signed halfword.
685 Multiplication result of adjacent odd-even elements
686 are added together and written to the 'out0' vector
688 #define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \
690 out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0, \
691 ( v16i8 ) mult0, ( v16i8 ) cnst0 ); \
692 out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1, \
693 ( v16i8 ) mult1, ( v16i8 ) cnst1 ); \
695 #define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ )
697 #define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3, \
698 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 ) \
700 DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \
701 DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \
703 #define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ )
705 /* Description : Dot product of halfword vector elements
706 Arguments : Inputs - mult0, mult1
709 Return Type - as per RTYPE
710 Details : Signed halfword elements from 'mult0' are multiplied with
711 signed halfword elements from 'cnst0' producing a result
712 twice the size of input i.e. signed word.
713 Multiplication result of adjacent odd-even elements
714 are added together and written to the 'out0' vector
716 #define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \
718 out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0, \
719 ( v8i16 ) mult0, ( v8i16 ) cnst0 ); \
720 out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1, \
721 ( v8i16 ) mult1, ( v8i16 ) cnst1 ); \
723 #define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ )
725 /* Description : Clips all halfword elements of input vector between min & max
726 out = (in < min) ? min : ((in > max) ? max : in)
727 Arguments : Inputs - in, min, max
729 Return Type - signed halfword
731 #define CLIP_SH( in, min, max ) \
735 out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in ); \
736 out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m ); \
740 /* Description : Clips all signed halfword elements of input vector
742 Arguments : Input - in
744 Return Type - signed halfword
746 #define CLIP_SH_0_255( in ) \
748 v8i16 max_m = __msa_ldi_h( 255 ); \
751 out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 ); \
752 out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \
755 #define CLIP_SH2_0_255( in0, in1 ) \
757 in0 = CLIP_SH_0_255( in0 ); \
758 in1 = CLIP_SH_0_255( in1 ); \
760 #define CLIP_SH4_0_255( in0, in1, in2, in3 ) \
762 CLIP_SH2_0_255( in0, in1 ); \
763 CLIP_SH2_0_255( in2, in3 ); \
766 /* Description : Horizontal addition of 4 signed word elements of input vector
767 Arguments : Input - in (signed word vector)
768 Output - sum_m (i32 sum)
769 Return Type - signed word (GP)
770 Details : 4 signed word elements of 'in' vector are added together and
771 the resulting integer sum is returned
773 #define HADD_SW_S32( in ) \
775 v2i64 res0_m, res1_m; \
778 res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in ); \
779 res1_m = __msa_splati_d( res0_m, 1 ); \
780 res0_m = res0_m + res1_m; \
781 i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 ); \
785 /* Description : Horizontal addition of 4 signed word elements of input vector
786 Arguments : Input - in (signed word vector)
787 Output - sum_m (i32 sum)
788 Return Type - signed word (GP)
789 Details : 4 signed word elements of 'in' vector are added together and
790 the resulting integer sum is returned
792 #define HADD_UH_U32( in ) \
795 v2u64 res0_m, res1_m; \
798 res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in ); \
799 res0_m = __msa_hadd_u_d( res_m, res_m ); \
800 res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 ); \
801 res0_m = res0_m + res1_m; \
802 u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 ); \
806 /* Description : Horizontal addition of signed byte vector elements
807 Arguments : Inputs - in0, in1
809 Return Type - as per RTYPE
810 Details : Each signed odd byte element from 'in0' is added to
811 even signed byte element from 'in0' (pairwise) and the
812 halfword result is written in 'out0'
814 #define HADD_SB2( RTYPE, in0, in1, out0, out1 ) \
816 out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 ); \
817 out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 ); \
819 #define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
821 HADD_SB2( RTYPE, in0, in1, out0, out1 ); \
822 HADD_SB2( RTYPE, in2, in3, out2, out3 ); \
824 #define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ )
826 /* Description : Horizontal addition of unsigned byte vector elements
827 Arguments : Inputs - in0, in1
829 Return Type - as per RTYPE
830 Details : Each unsigned odd byte element from 'in0' is added to
831 even unsigned byte element from 'in0' (pairwise) and the
832 halfword result is written to 'out0'
834 #define HADD_UB2( RTYPE, in0, in1, out0, out1 ) \
836 out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \
837 out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \
839 #define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ )
841 #define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
843 HADD_UB2( RTYPE, in0, in1, out0, out1 ); \
844 HADD_UB2( RTYPE, in2, in3, out2, out3 ); \
846 #define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ )
848 /* Description : Horizontal subtraction of unsigned byte vector elements
849 Arguments : Inputs - in0, in1
851 Return Type - as per RTYPE
852 Details : Each unsigned odd byte element from 'in0' is subtracted from
853 even unsigned byte element from 'in0' (pairwise) and the
854 halfword result is written to 'out0'
856 #define HSUB_UB2( RTYPE, in0, in1, out0, out1 ) \
858 out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \
859 out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \
861 #define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ )
863 #define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
865 HSUB_UB2( RTYPE, in0, in1, out0, out1 ); \
866 HSUB_UB2( RTYPE, in2, in3, out2, out3 ); \
868 #define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ )
870 /* Description : SAD (Sum of Absolute Difference)
871 Arguments : Inputs - in0, in1, ref0, ref1
872 Outputs - sad_m (halfword vector)
873 Return Type - unsigned halfword
874 Details : Absolute difference of all the byte elements from 'in0' with
875 'ref0' is calculated and preserved in 'diff0'. Then even-odd
876 pairs are added together to generate 8 halfword results.
878 #define SAD_UB2_UH( in0, in1, ref0, ref1 ) \
880 v16u8 diff0_m, diff1_m; \
881 v8u16 sad_m = { 0 }; \
883 diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 ); \
884 diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 ); \
886 sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m ); \
887 sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m ); \
892 /* Description : Set element n input vector to GPR value
893 Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
894 Output - out (output vector)
895 Return Type - as per RTYPE
896 Details : Set element 0 in vector 'out' to value specified in 'in0'
898 #define INSERT_W2( RTYPE, in0, in1, out ) \
900 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \
901 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \
903 #define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ )
905 #define INSERT_W4( RTYPE, in0, in1, in2, in3, out ) \
907 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \
908 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \
909 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \
910 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \
912 #define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ )
913 #define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ )
915 #define INSERT_D2( RTYPE, in0, in1, out ) \
917 out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \
918 out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \
920 #define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ )
922 /* Description : Interleave even halfword elements from vectors
923 Arguments : Inputs - in0, in1, in2, in3
925 Return Type - as per RTYPE
926 Details : Even halfword elements of 'in0' and 'in1' are interleaved
927 and written to 'out0'
929 #define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
931 out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \
932 out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \
934 #define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ )
936 /* Description : Interleave even double word elements from vectors
937 Arguments : Inputs - in0, in1, in2, in3
939 Return Type - as per RTYPE
940 Details : Even double word elements of 'in0' and 'in1' are interleaved
941 and written to 'out0'
943 #define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
945 out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \
946 out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \
948 #define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ )
950 /* Description : Interleave left half of byte elements from vectors
951 Arguments : Inputs - in0, in1, in2, in3
953 Return Type - as per RTYPE
954 Details : Left half of byte elements of 'in0' and 'in1' are interleaved
955 and written to 'out0'.
957 #define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
959 out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
960 out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
962 #define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ )
963 #define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ )
965 #define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
966 out0, out1, out2, out3 ) \
968 ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
969 ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
971 #define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ )
972 #define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ )
973 #define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ )
974 #define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ )
976 /* Description : Interleave left half of halfword elements from vectors
977 Arguments : Inputs - in0, in1, in2, in3
979 Return Type - as per RTYPE
980 Details : Left half of halfword elements of 'in0' and 'in1' are
981 interleaved and written to 'out0'.
983 #define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
985 out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
986 out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
988 #define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ )
989 #define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ )
991 #define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
992 out0, out1, out2, out3 ) \
994 ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
995 ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
997 #define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ )
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \
}
#define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ )
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ )
#define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ )
#define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ )

/* 4-pair variant of ILVR_B2 */
#define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ )
#define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ )
#define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ )
#define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ )
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ )
#define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ )

/* 4-pair variant of ILVR_H2 */
#define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ )
#define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ )
/* Interleave right half of word elements from two vector pairs */
#define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \
}
#define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ )
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 )                    \
{                                                                           \
    out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) );  \
    out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) );  \
}
#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ )
#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ )
#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ )

/* 4-pair variant of ILVR_D2 */
#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ )
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of elements from 'in0' and 'in1' are interleaved
                 and written to 'out0'; left half to 'out1'.
*/
#define ILVRL_B2( RTYPE, in0, in1, out0, out1 )                    \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
}
#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ )
#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ )
#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ )
#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ )
#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ )

/* Halfword flavour of ILVRL_B2 */
#define ILVRL_H2( RTYPE, in0, in1, out0, out1 )                    \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
}
#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ )
#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ )

/* Word flavour of ILVRL_B2 */
#define ILVRL_W2( RTYPE, in0, in1, out0, out1 )                    \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
}
#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ )
#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ )
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written in place.
*/
#define MAXI_SH2( RTYPE, in0, in1, max_val )                      \
{                                                                 \
    in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) ); \
    in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) ); \
}
#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ )
#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ )

/* 4-vector variant of MAXI_SH2 */
#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val ) \
{                                                      \
    MAXI_SH2( RTYPE, in0, in1, max_val );              \
    MAXI_SH2( RTYPE, in2, in3, max_val );              \
}
#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ )
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place.
*/
#define SAT_UH2( RTYPE, in0, in1, sat_val )                 \
{                                                           \
    in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val ); \
    in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val ); \
}
#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ )

/* 4-vector variant of SAT_UH2 */
#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val ) \
{                                                     \
    SAT_UH2( RTYPE, in0, in1, sat_val );              \
    SAT_UH2( RTYPE, in2, in3, sat_val );              \
}
#define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ )
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place.
*/
#define SAT_SH2( RTYPE, in0, in1, sat_val )                  \
{                                                            \
    in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val ); \
    in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val ); \
}
#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ )

/* 4-vector variant of SAT_SH2 */
#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val ) \
{                                                     \
    SAT_SH2( RTYPE, in0, in1, sat_val );              \
    SAT_SH2( RTYPE, in2, in3, sat_val );              \
}
#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ )
/* Description : Saturate the word element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place.
*/
#define SAT_SW2( RTYPE, in0, in1, sat_val )                  \
{                                                            \
    in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val ); \
    in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val ); \
}
#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ )
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ )
#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ )
#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ )
#define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ )

/* 3-pair variant of PCKEV_B2 */
#define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \
{                                                                         \
    PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );                    \
    out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 );       \
}
#define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ )

/* 4-pair variant of PCKEV_B2 */
#define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ )
#define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ )
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'.
*/
#define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ )

/* 4-pair variant of PCKEV_H2 */
#define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ )
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'.
*/
#define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \
}
#define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ )

/* 4-pair variant of PCKEV_D2 */
#define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ )
/* Description : Pack odd byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd byte elements of 'in0' are copied to the left half of
                 'out0' & odd byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ )

/* 4-pair variant of PCKOD_B2 */
#define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ )
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word elements of 'in0' are copied to the left half
                 of 'out0' & odd double word elements of 'in1' are copied to
                 the right half of 'out0'.
*/
#define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \
}
#define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ )
#define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ )
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
*/
#define XORI_B2_128( RTYPE, in0, in1 )                    \
{                                                         \
    in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 );   \
    in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 );   \
}
#define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ )
#define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ )

/* 3-vector variant of XORI_B2_128 */
#define XORI_B3_128( RTYPE, in0, in1, in2 )               \
{                                                         \
    XORI_B2_128( RTYPE, in0, in1 );                       \
    in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 );   \
}
#define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ )

/* 4-vector variant of XORI_B2_128 */
#define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \
{                                                \
    XORI_B2_128( RTYPE, in0, in1 );              \
    XORI_B2_128( RTYPE, in2, in3 );              \
}
#define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ )
#define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ )

/* 5-vector variant of XORI_B2_128 */
#define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \
{                                                     \
    XORI_B3_128( RTYPE, in0, in1, in2 );              \
    XORI_B2_128( RTYPE, in3, in4 );                   \
}
#define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ )
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between the halfword data type range.
*/
#define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
{                                                                    \
    out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ )

/* 4-pair variant of ADDS_SH2 */
#define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ )
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place.
*/
#define SLLI_4V( in0, in1, in2, in3, shift ) \
{                                            \
    in0 = in0 << shift;                      \
    in1 = in1 << shift;                      \
    in2 = in2 << shift;                      \
    in3 = in3 << shift;                      \
}
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
*/
#define SRA_4V( in0, in1, in2, in3, shift ) \
{                                           \
    in0 = in0 >> shift;                     \
    in1 = in1 >> shift;                     \
    in2 = in2 >> shift;                     \
    in3 = in3 >> shift;                     \
}
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the number of bits the respective element holds in vector
                 'shift'. The last discarded bit is added to the shifted value
                 for rounding and the result is written in-place.
                 'shift' is a vector.
*/
#define SRAR_H2( RTYPE, in0, in1, shift )                            \
{                                                                    \
    in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift );  \
    in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift );  \
}
#define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ )

/* 4-vector variant of SRAR_H2 */
#define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \
{                                                   \
    SRAR_H2( RTYPE, in0, in1, shift );              \
    SRAR_H2( RTYPE, in2, in3, shift );              \
}
#define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ )
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by the
                 number of bits the respective element holds in vector 'shift'
                 and the result is stored in-place. 'shift' is a vector.
*/
#define SRL_H4( RTYPE, in0, in1, in2, in3, shift )                  \
{                                                                   \
    in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift );  \
    in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift );  \
    in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift );  \
    in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift );  \
}
#define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ )
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 the value in 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in-place.
                 'shift' is an immediate value.
*/
#define SRARI_H2( RTYPE, in0, in1, shift )                  \
{                                                           \
    in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift );  \
    in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift );  \
}
#define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ )
#define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ )

/* 4-vector variant of SRARI_H2 */
#define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \
{                                                    \
    SRARI_H2( RTYPE, in0, in1, shift );              \
    SRARI_H2( RTYPE, in2, in3, shift );              \
}
#define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ )
#define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ )

/* Word flavour of SRARI_H2 */
#define SRARI_W2( RTYPE, in0, in1, shift )                  \
{                                                           \
    in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift );  \
    in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift );  \
}
#define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ )

/* 4-vector variant of SRARI_W2 */
#define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \
{                                                    \
    SRARI_W2( RTYPE, in0, in1, shift );              \
    SRARI_W2( RTYPE, in2, in3, shift );              \
}
#define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ )
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'.
*/
#define MUL2( in0, in1, in2, in3, out0, out1 ) \
{                                              \
    out0 = in0 * in1;                          \
    out1 = in2 * in3;                          \
}

/* 4-pair variant of MUL2 */
#define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3 )                \
{                                                     \
    MUL2( in0, in1, in2, in3, out0, out1 );           \
    MUL2( in4, in5, in6, in7, out2, out3 );           \
}
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and the result is
                 written to 'out0'.
*/
#define ADD2( in0, in1, in2, in3, out0, out1 ) \
{                                              \
    out0 = in0 + in1;                          \
    out1 = in2 + in3;                          \
}

/* 4-pair variant of ADD2 */
#define ADD4( in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3 )                \
{                                                     \
    ADD2( in0, in1, in2, in3, out0, out1 );           \
    ADD2( in4, in5, in6, in7, out2, out3 );           \
}

/* Subtraction of 4 pairs of vectors: out = even input - odd input */
#define SUB4( in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3 )                \
{                                                     \
    out0 = in0 - in1;                                 \
    out1 = in2 - in3;                                 \
    out2 = in4 - in5;                                 \
    out3 = in6 - in7;                                 \
}
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in  (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 4 word elements keeping the sign intact.
*/
#define UNPCK_R_SH_SW( in, out )                               \
{                                                              \
    v8i16 sign_m;                                              \
                                                               \
    sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 );                \
    out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in );      \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0';
                 zero extended left half of vector is returned in 'out1'.
*/
#define UNPCK_UB_SH( in, out0, out1 )          \
{                                              \
    v16i8 zero_m = { 0 };                      \
                                               \
    ILVRL_B2_SH( zero_m, in, out0, out1 );     \
}
/* Description : Sign extend halfword elements from input vector and return
                 the result in a pair of vectors
   Arguments   : Input   - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in' to
                 generate 4 signed word elements in 'out0'.
                 Then interleaved left with the same vector 'in' to generate
                 4 signed word elements in 'out1'.
*/
#define UNPCK_SH_SW( in, out0, out1 )              \
{                                                  \
    v8i16 tmp_m;                                   \
                                                   \
    tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 );     \
    ILVRL_H2_SW( tmp_m, in, out0, out1 );          \
}
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation (sums in the first half,
                 differences in the second half).
*/
#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 ) \
{                                                                 \
    out0 = in0 + in3;                                             \
    out1 = in1 + in2;                                             \
                                                                  \
    out2 = in1 - in2;                                             \
    out3 = in0 - in3;                                             \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
   Details     : Butterfly operation (sums in the first half,
                 differences in the second half).
*/
#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7,          \
                     out0, out1, out2, out3, out4, out5, out6, out7 ) \
{                                                                     \
    out0 = in0 + in7;                                                 \
    out1 = in1 + in6;                                                 \
    out2 = in2 + in5;                                                 \
    out3 = in3 + in4;                                                 \
                                                                      \
    out4 = in3 - in4;                                                 \
    out5 = in2 - in5;                                                 \
    out6 = in1 - in6;                                                 \
    out7 = in0 - in7;                                                 \
}
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                         out0, out1, out2, out3, out4, out5, out6, out7 )  \
{                                                                          \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
                                                                           \
    ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5,                    \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m );                          \
    ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m );                         \
    ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m );                         \
    ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 );                         \
    ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 );                         \
    SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 );                         \
    SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 );                         \
}
#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ )
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7,         \
                             in8, in9, in10, in11, in12, in13, in14, in15,   \
                             out0, out1, out2, out3,                         \
                             out4, out5, out6, out7 )                        \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 );                           \
    ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 );                         \
    ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 );                         \
    ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 );                         \
                                                                             \
    tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 );      \
    tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 );      \
    tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 );      \
    tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 );      \
    out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 );        \
    tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 );      \
    out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 );        \
    tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 );      \
                                                                             \
    ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m );               \
    out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
    out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
                                                                             \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );  \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 );      \
    out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
    out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
                                                                             \
    ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m );           \
    out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
    out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
                                                                             \
    /* Note: the original repeated the next two assignments twice; the   */  \
    /* duplicates were redundant (same value recomputed) and are removed. */ \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );  \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );  \
    out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
    out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
}
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                         \
    v8i16 s0_m, s1_m;                                                     \
                                                                          \
    ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m );                         \
    ILVRL_W2_SH( s1_m, s0_m, out0, out2 );                                \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );      \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 );      \
}
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : Only 4 rows of data exist; out4..out7 are cleared to zero.
*/
#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7,        \
                            out0, out1, out2, out3,                        \
                            out4, out5, out6, out7 )                       \
{                                                                          \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                  \
    v8i16 zero_m = { 0 };                                                  \
                                                                           \
    ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6,                    \
                tmp0_n, tmp1_n, tmp2_n, tmp3_n );                          \
    ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m );                         \
    ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m );                         \
                                                                           \
    out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );   \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );   \
    out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );   \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );   \
                                                                           \
    out4 = zero_m;                                                         \
    out5 = zero_m;                                                         \
    out6 = zero_m;                                                         \
    out7 = zero_m;                                                         \
}
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                         \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                          \
    ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m );                     \
    ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m );                     \
    ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 );             \
    ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 );             \
}
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,      \
                        out0, out1, out2, out3, out4, out5, out6, out7 )    \
{                                                                           \
    v8i16 s0_m, s1_m;                                                       \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
                                                                            \
    ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                           \
    ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m );                              \
    ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                           \
    ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m );                              \
    ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                           \
    ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m );                              \
    ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                           \
    ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m );                              \
    PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,        \
              tmp3_m, tmp7_m, out0, out2, out4, out6 );                     \
    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );   \
    out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );   \
    out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );   \
    out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );   \
}
#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ )
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                         \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                         \
                                                                          \
    ILVRL_W2_SW( in1, in0, s0_m, s1_m );                                  \
    ILVRL_W2_SW( in3, in2, s2_m, s3_m );                                  \
                                                                          \
    out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
    out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
    out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
    out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
}
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, p_dst, stride
   Details    : Least significant 4 bytes from each input vector are added to
                the destination bytes, clipped between 0-255 and stored.
*/
#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )              \
{                                                                         \
    uint32_t src0_m, src1_m, src2_m, src3_m;                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                              \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                                 \
    v16i8 dst0_m = { 0 };                                                 \
    v16i8 dst1_m = { 0 };                                                 \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m );                     \
    LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m );                 \
    INSERT_W2_SB( src0_m, src1_m, dst0_m );                               \
    INSERT_W2_SB( src2_m, src3_m, dst1_m );                               \
    ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m );         \
    ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m );               \
    CLIP_SH2_0_255( res0_m, res1_m );                                     \
    PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m );        \
                                                                          \
    out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 );                       \
    out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 );                       \
    out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 );                       \
    out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 );                       \
    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );                 \
}
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0',
                 dot product of 'in1' with 'coeff1',
                 dot product of 'in2' with 'coeff2',
                 then addition of all the 3 vector results:
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 )               \
( {                                                                         \
    v8i16 tmp1_m;                                                           \
    v8i16 out0_m;                                                           \
                                                                            \
    out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 );             \
    out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 );    \
    tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 );             \
    out0_m = __msa_adds_s_h( out0_m, tmp1_m );                              \
                                                                            \
    out0_m;                                                                 \
} )
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB( in0, in1 )                                  \
( {                                                                   \
    v16u8 out_m;                                                      \
                                                                      \
    out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
    out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 );           \
                                                                      \
    out_m;                                                            \
} )
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs - in0, in1, in2, in3, p_dst, stride
*/
#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )  \
{                                                            \
    uint32_t out0_m, out1_m, out2_m, out3_m;                 \
    v16i8 tmp0_m, tmp1_m;                                    \
                                                             \
    PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m );       \
                                                             \
    out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 );          \
    out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 );          \
    out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 );          \
    out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 );          \
                                                             \
    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );    \
}
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, p_dst
*/
#define PCKEV_ST_SB( in0, in1, p_dst )                      \
{                                                           \
    v16i8 tmp_m;                                            \
                                                            \
    tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
    ST_SB( tmp_m, ( p_dst ) );                              \
}
/* Description : AVC 6-tap vertical filter on 6 halfword input rows with the
                 two distinct tap magnitudes (-5, 20); result is rounded
                 (>> 10 with rounding), saturated to 7 bits and packed back
                 to halfwords.
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5
                 Return Type - signed halfword
*/
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 )    \
( {                                                                        \
    v4i32 tmp0_m, tmp1_m;                                                  \
    v8i16 out0_m, out1_m, out2_m, out3_m;                                  \
    v8i16 minus5h_m = __msa_ldi_h( -5 );                                   \
    v8i16 plus20h_m = __msa_ldi_h( 20 );                                   \
                                                                           \
    ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m );                               \
                                                                           \
    /* Outermost taps (coefficient 1): horizontal pairwise add */          \
    tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m );         \
    tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m );         \
                                                                           \
    ILVRL_H2_SH( in1, in4, out0_m, out1_m );                               \
    DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m );  \
    ILVRL_H2_SH( in2, in3, out2_m, out3_m );                               \
    DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m );  \
                                                                           \
    SRARI_W2_SW( tmp0_m, tmp1_m, 10 );                                     \
    SAT_SW2_SW( tmp0_m, tmp1_m, 7 );                                       \
    out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );          \
                                                                           \
    out0_m;                                                                \
} )
/* Description : AVC 6-tap horizontal filter: gathers the tap pixels from
                 'in' via the three shuffle masks and accumulates them with
                 coefficients 1, -5 and 20.
   Arguments   : Inputs - in, mask0, mask1, mask2
                 Return Type - signed halfword
*/
#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 )      \
( {                                                        \
    v8i16 out0_m, out1_m;                                  \
    v16i8 tmp0_m, tmp1_m;                                  \
    v16i8 minus5b = __msa_ldi_b( -5 );                     \
    v16i8 plus20b = __msa_ldi_b( 20 );                     \
                                                           \
    /* Outermost taps (coefficient 1) */                   \
    tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in );      \
    out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m );             \
                                                           \
    tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in );      \
    out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m );   \
                                                           \
    tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in );  \
    out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m );   \
                                                           \
    out1_m;                                                \
} )
1952 #endif /* X264_MIPS_MACROS_H */