2 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavcodec/hevcdec.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "hevcpred_mips.h"
/* Signed angle steps used by the angular predictors below: 17 entries running
 * from -32 up to 32. The angular_upper_* functions index this table with
 * (mode - 18). */
25 static const int8_t intra_pred_angle_up[17] = {
26 -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
/* Second table of signed angle steps: 16 entries running from 32 down to -26.
 * NOTE(review): no user of this table is visible in this part of the file;
 * presumably it serves the lower-numbered angular modes - confirm. */
29 static const int8_t intra_pred_angle_low[16] = {
30 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
/*
 * Weighted-sum helper used by the 16-wide planar predictors.
 *
 * Four 16-bit accumulators are built:
 *   res0_m / res1_m start from mul_val_h0 * vec0 and mul_val_h2 * vec0,
 *   res2_m / res3_m start from mul_val_h0 * vec1 and mul_val_h2 * vec1
 *   (assuming MUL4 multiplies its argument pairs element-wise - confirm in
 *   generic_macros_msa.h).
 * Each accumulator then receives:
 *   - mul_val_h1 * tmp0 (res0_m, res2_m) or mul_val_h3 * tmp0 (res1_m, res3_m),
 *   - src0_r or src0_l scaled by mul_val_b0 for the first pair and by
 *     (mul_val_b0 - 1) for the second pair,
 *   - tmp1 scaled by mul_val_b1 for the first pair and by (mul_val_b1 + 1)
 *     for the second pair.
 * The sums are passed to SRARI_H4_SH with `round` and packed with PCKEV_B2_SH
 * into res0 (from res0_m/res1_m) and res1 (from res2_m/res3_m).
 *
 * The "- 1" / "+ 1" adjustment means one invocation yields two result vectors
 * whose b0/b1 weights differ by one step; presumably these are two consecutive
 * rows, as the "16x2" in the name suggests.
 *
 * Arguments are expanded without parentheses and several are evaluated more
 * than once, so callers should pass plain variables or constants.
 */
33 #define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, \
34 mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
35 res0, res1, mul_val_b0, mul_val_b1, round) \
37 v8i16 res0_m, res1_m, res2_m, res3_m; \
39 MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1, \
40 mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m); \
42 res0_m += mul_val_h1 * tmp0; \
43 res1_m += mul_val_h3 * tmp0; \
44 res2_m += mul_val_h1 * tmp0; \
45 res3_m += mul_val_h3 * tmp0; \
47 res0_m += mul_val_b0 * src0_r; \
48 res1_m += mul_val_b0 * src0_l; \
49 res2_m += (mul_val_b0 - 1) * src0_r; \
50 res3_m += (mul_val_b0 - 1) * src0_l; \
52 res0_m += mul_val_b1 * tmp1; \
53 res1_m += mul_val_b1 * tmp1; \
54 res2_m += (mul_val_b1 + 1) * tmp1; \
55 res3_m += (mul_val_b1 + 1) * tmp1; \
57 SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round); \
58 PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1); \
/*
 * 4x4 vertical intra prediction (per the function name).
 *
 * src_top  - samples above the block; one word is read from it
 * src_left - samples left of the block; src_left[-1] is read as well
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, some declarations and the
 * arithmetic between the widening step and the clip are not visible in this
 * excerpt, so only the visible steps are described.
 */
61 static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
62 const uint8_t *src_left,
63 uint8_t *dst, int32_t stride,
68 v8i16 vec0, vec1, vec2;
/* Replicate the word read from src_top into dst via SW4 (presumably one store
 * per row, four rows). */
71 src_data = LW(src_top);
72 SW4(src_data, src_data, src_data, src_data, dst, stride);
/* First-column adjustment: start from a word of left-neighbour samples. */
75 src_data = LW(src_left);
/* NOTE(review): vec2 is used as its own insert base with no earlier assignment
 * visible here; the 8x8 variant inserts into `zero` instead. */
77 vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
/* Broadcast src_left[-1] and the first top sample to all 16-bit lanes. */
79 vec0 = __msa_fill_h(src_left[-1]);
80 vec1 = __msa_fill_h(src_top[0]);
/* Interleave with zero to widen the left bytes to 16-bit lanes. */
82 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
86 vec2 = CLIP_SH_0_255(vec2);
/* Write lanes 0..3 down the first column of dst. */
88 for (col = 0; col < 4; col++) {
89 dst[stride * col] = (uint8_t) vec2[col];
/*
 * 8x8 vertical intra prediction (per the function name).
 *
 * src_top  - samples above the block; one LD() load is taken from it
 * src_left - samples left of the block; src_left[-1] is read as well
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, the per-row pointer advance,
 * the arithmetic before the clip and the extraction of val0..val3 are not
 * visible in this excerpt.
 */
94 static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
95 const uint8_t *src_left,
96 uint8_t *dst, int32_t stride,
99 uint8_t *tmp_dst = dst;
101 uint16_t val0, val1, val2, val3;
103 v8i16 vec0, vec1, vec2;
/* Store the loaded top data once per iteration, eight iterations. */
106 src_data1 = LD(src_top);
108 for (row = 8; row--;) {
109 SD(src_data1, tmp_dst);
/* First-column adjustment: left samples go into lane 0 of a zeroed vector. */
114 src_data1 = LD(src_left);
116 vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
/* Broadcast src_left[-1] and the first top sample to all 16-bit lanes. */
118 vec0 = __msa_fill_h(src_left[-1]);
119 vec1 = __msa_fill_h(src_top[0]);
/* Interleave with zero to widen the left bytes to 16-bit lanes. */
121 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
125 vec2 = CLIP_SH_0_255(vec2);
/* Scalar stores into the first byte of rows 2..7; the uint16_t values are
 * narrowed to a byte on assignment. */
134 dst[2 * stride] = val2;
135 dst[3 * stride] = val3;
142 dst[4 * stride] = val0;
143 dst[5 * stride] = val1;
144 dst[6 * stride] = val2;
145 dst[7 * stride] = val3;
/*
 * 16x16 vertical intra prediction (per the function name).
 *
 * src_top  - samples above the block; 16 bytes are loaded from it
 * src_left - samples left of the block; 16 bytes plus src_left[-1] are read
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, the body of the 16-iteration
 * row loop and the statements between SUB2 and ADD2 are not visible in this
 * excerpt.
 */
149 static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
150 const uint8_t *src_left,
151 uint8_t *dst, int32_t stride,
155 uint8_t *tmp_dst = dst;
158 v8i16 vec0, vec1, vec2, vec3;
160 src = LD_UB(src_top);
162 for (row = 16; row--;) {
/* First-column adjustment: reuse `src` for the 16 left-neighbour samples. */
168 src = LD_UB(src_left);
/* Broadcast src_left[-1] and the first top sample to all 16-bit lanes. */
170 vec0 = __msa_fill_h(src_left[-1]);
171 vec1 = __msa_fill_h(src_top[0]);
/* Widen the left samples to two 16-bit vectors and subtract vec0 from both. */
173 UNPCK_UB_SH(src, vec2, vec3);
174 SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
/* Add vec1, clip to 0..255 and pack back to 16 bytes. */
179 ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
180 CLIP_SH2_0_255(vec2, vec3);
182 src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
/* Write the 16 results down the first column of dst. */
184 for (col = 0; col < 16; col++) {
185 dst[stride * col] = src[col];
/*
 * 4x4 horizontal intra prediction (per the function name).
 *
 * src_top  - samples above the block; src_top[-1] is read
 * src_left - samples left of the block; src_left[0..3] are read
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, some declarations, the
 * statements just before the vector insert and the final store of val0 are
 * not visible in this excerpt.
 */
190 static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
191 const uint8_t *src_left,
192 uint8_t *dst, int32_t stride,
195 uint32_t val0, val1, val2, val3;
197 v8i16 src0_r, src_top_val, src_left_val;
/* Multiplying a byte by 0x01010101 copies it into all four bytes of the word,
 * so each word is one left sample repeated four times. */
200 val0 = src_left[0] * 0x01010101;
201 val1 = src_left[1] * 0x01010101;
202 val2 = src_left[2] * 0x01010101;
203 val3 = src_left[3] * 0x01010101;
204 SW4(val0, val1, val2, val3, dst, stride);
/* NOTE(review): src0 is used as its own insert base with no earlier
 * assignment visible; val0 may have been reloaded on a line that is not
 * visible here. */
208 src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
/* Broadcast src_top[-1] and the first left sample to all 16-bit lanes. */
209 src_top_val = __msa_fill_h(src_top[-1]);
210 src_left_val = __msa_fill_h(src_left[0]);
/* Widen to 16 bits, subtract src_top_val, add the first left sample, clip to
 * 0..255 and pack back to bytes. */
212 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
214 src0_r -= src_top_val;
216 src0_r += src_left_val;
217 src0_r = CLIP_SH_0_255(src0_r);
218 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
/* Extract the first four result bytes as one word. */
219 val0 = __msa_copy_s_w((v4i32) src0, 0);
/*
 * 8x8 horizontal intra prediction (per the function name).
 *
 * src_top  - samples above the block; src_top[-1] is read
 * src_left - samples left of the block; src_left[0..7] are read
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, some declarations, the
 * statements just before the vector insert and the final store of val0 are
 * not visible in this excerpt.
 */
224 static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
225 const uint8_t *src_left,
226 uint8_t *dst, int32_t stride,
229 uint64_t val0, val1, val2, val3;
231 v8i16 src0_r, src_top_val, src_left_val;
/* Multiplying a byte by 0x0101010101010101 copies it into all eight bytes, so
 * each 64-bit value is one left sample repeated eight times. Rows 0..3: */
234 val0 = src_left[0] * 0x0101010101010101;
235 val1 = src_left[1] * 0x0101010101010101;
236 val2 = src_left[2] * 0x0101010101010101;
237 val3 = src_left[3] * 0x0101010101010101;
238 SD4(val0, val1, val2, val3, dst, stride);
/* Rows 4..7, stored starting at dst + 4 * stride. */
240 val0 = src_left[4] * 0x0101010101010101;
241 val1 = src_left[5] * 0x0101010101010101;
242 val2 = src_left[6] * 0x0101010101010101;
243 val3 = src_left[7] * 0x0101010101010101;
244 SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
/* NOTE(review): src0 is used as its own insert base with no earlier
 * assignment visible; val0 may have been reloaded on a line that is not
 * visible here. */
248 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
/* Broadcast src_top[-1] and the first left sample to all 16-bit lanes. */
249 src_top_val = __msa_fill_h(src_top[-1]);
250 src_left_val = __msa_fill_h(src_left[0]);
/* Widen to 16 bits, subtract src_top_val, add the first left sample, clip to
 * 0..255 and pack back to bytes. */
252 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
254 src0_r -= src_top_val;
256 src0_r += src_left_val;
257 src0_r = CLIP_SH_0_255(src0_r);
258 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
/* Extract the first eight result bytes as one 64-bit value. */
259 val0 = __msa_copy_s_d((v2i64) src0, 0);
/*
 * 16x16 horizontal intra prediction (per the function name).
 *
 * src_top  - samples above the block; 16 bytes plus src_top[-1] are read
 * src_left - samples left of the block
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, the loads of inp0..inp3, the
 * statements between SUB2 and ADD2 and the final store are not visible in
 * this excerpt.
 */
264 static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
265 const uint8_t *src_left,
266 uint8_t *dst, int32_t stride,
269 uint8_t *tmp_dst = dst;
271 uint8_t inp0, inp1, inp2, inp3;
272 v16i8 src0, src1, src2, src3;
273 v8i16 src0_r, src0_l, src_left_val, src_top_val;
/* First left sample broadcast to 16-bit lanes, used after the row loop. */
275 src_left_val = __msa_fill_h(src_left[0]);
/* Four iterations of four rows: splat each of inp0..inp3 across a 16-byte
 * vector, store the four vectors, then step tmp_dst down by four rows. */
277 for (row = 4; row--;) {
284 src0 = __msa_fill_b(inp0);
285 src1 = __msa_fill_b(inp1);
286 src2 = __msa_fill_b(inp2);
287 src3 = __msa_fill_b(inp3);
289 ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
290 tmp_dst += (4 * stride);
/* Adjustment computed from the top samples: widen, subtract src_top[-1], add
 * the first left sample, clip to 0..255 and pack back to 16 bytes. */
294 src0 = LD_SB(src_top);
295 src_top_val = __msa_fill_h(src_top[-1]);
297 UNPCK_UB_SH(src0, src0_r, src0_l);
298 SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
303 ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
304 CLIP_SH2_0_255(src0_r, src0_l);
305 src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
/*
 * 32x32 horizontal intra prediction (per the function name).
 *
 * src_top  - not referenced by the visible statements
 * src_left - samples left of the block; src_left[0..31] are read
 * dst      - destination block
 * stride   - not referenced by the visible statements; presumably used to
 *            advance dst between the stores - confirm
 *
 * NOTE(review): the dst advance between the ST_SB2 calls is not visible in
 * this excerpt.
 */
310 static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
311 const uint8_t *src_left,
312 uint8_t *dst, int32_t stride)
315 uint8_t inp0, inp1, inp2, inp3;
316 v16i8 src0, src1, src2, src3;
/* Eight iterations, each consuming four consecutive left samples. */
318 for (row = 0; row < 8; row++) {
319 inp0 = src_left[row * 4];
320 inp1 = src_left[row * 4 + 1];
321 inp2 = src_left[row * 4 + 2];
322 inp3 = src_left[row * 4 + 3];
/* Splat each sample across a 16-byte vector. */
324 src0 = __msa_fill_b(inp0);
325 src1 = __msa_fill_b(inp1);
326 src2 = __msa_fill_b(inp2);
327 src3 = __msa_fill_b(inp3);
/* Each ST_SB2 stores the same vector twice with an offset of 16, presumably
 * filling one 32-byte row. */
329 ST_SB2(src0, src0, dst, 16);
331 ST_SB2(src1, src1, dst, 16);
333 ST_SB2(src2, src2, dst, 16);
335 ST_SB2(src3, src3, dst, 16);
/*
 * 4x4 DC intra prediction (per the function name).
 *
 * src_top  - samples above the block
 * src_left - samples left of the block
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, the loads of val0/val1, and
 * several statements of the edge processing (including the assignment of
 * val2) are not visible in this excerpt.
 */
340 static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
341 const uint8_t *src_left,
342 uint8_t *dst, int32_t stride,
345 uint8_t *tmp_dst = dst;
346 uint32_t addition = 0;
347 uint32_t val0, val1, val2;
351 v8u16 sum, vec0, vec1;
/* Put val0 and val1 into `src`, sum its bytes with successive horizontal adds,
 * then apply a rounding right shift by 3 (a divide by 8) to get the DC value. */
355 INSERT_W2_SB(val0, val1, src);
356 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357 sum = (v8u16) __msa_hadd_u_w(sum, sum);
358 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359 sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360 addition = __msa_copy_u_w((v4i32) sum, 0);
/* Splat the DC value to every byte and store one word of it per SW4 slot. */
361 store = (v16u8) __msa_fill_b(addition);
362 val0 = __msa_copy_u_w((v4i32) store, 0);
/* NOTE(review): no trailing semicolon on this SW4 call, unlike the other SW4
 * calls in this file; this relies on how the macro expands. */
363 SW4(val0, val0, val0, val0, dst, stride)
/* Edge processing: widen the DC bytes and the source samples to 16 bits. */
366 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
/* Rounding shift right by 2, then pack back to bytes. */
372 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
/* Byte 0 (the corner sample) is computed separately:
 * (left[0] + 2 * DC + top[0] + 2) >> 2. */
374 val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375 store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376 val0 = __msa_copy_u_w((v4i32) store, 0);
385 ADD2(val0, addition, val1, addition, val0, val1);
/* Scalar stores into the first byte of rows 1..3. */
395 tmp_dst[stride * 1] = val0;
396 tmp_dst[stride * 2] = val1;
397 tmp_dst[stride * 3] = val2;
/*
 * 8x8 DC intra prediction (per the function name).
 *
 * src_top  - samples above the block
 * src_left - samples left of the block
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, the loads of val0/val1, the
 * body of the 8-iteration row loop and parts of the edge arithmetic are not
 * visible in this excerpt.
 */
401 static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
402 const uint8_t *src_left,
403 uint8_t *dst, int32_t stride,
406 uint8_t *tmp_dst = dst;
407 uint32_t row, col, val;
408 uint32_t addition = 0;
412 v8u16 sum, vec0, vec1;
/* Put val0 and val1 into `src`, reduce all 16 bytes to one sum with horizontal
 * adds, then apply a rounding right shift by 4 (a divide by 16) to get the DC
 * value. */
417 INSERT_D2_UB(val0, val1, src);
418 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
419 sum = (v8u16) __msa_hadd_u_w(sum, sum);
420 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423 sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424 addition = __msa_copy_u_w((v4i32) sum, 0);
/* Splat the DC value to every byte; val0 holds eight of those bytes. */
425 store = (v16u8) __msa_fill_b(addition);
426 val0 = __msa_copy_u_d((v2i64) store, 0);
428 for (row = 8; row--;) {
/* Edge processing: widen the DC bytes and the source samples to 16 bits. */
434 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
/* Rounding shift right by 2, then pack back to bytes. */
439 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
/* Byte 0 (the corner sample) is computed separately:
 * (left[0] + 2 * DC + top[0] + 2) >> 2. */
441 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
443 val0 = __msa_copy_u_d((v2i64) store, 0);
/* Second pass: val0 is inserted into `src` and widened again, with vec0 set to
 * the broadcast DC value. */
447 src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
448 vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
449 vec0 = (v8u16) __msa_fill_h(addition);
452 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
/* Write lanes 1..7 into the first byte of rows 1..7; row 0 is skipped. */
454 for (col = 1; col < 8; col++) {
455 tmp_dst[stride * col] = vec1[col];
/*
 * 16x16 DC intra prediction (per the function name).
 *
 * src_top  - samples above the block; 16 bytes are loaded
 * src_left - samples left of the block; 16 bytes are loaded
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the end of the parameter list, the body of the 16-iteration
 * row loop and some statements of the edge arithmetic are not visible in this
 * excerpt, so the exact edge weights cannot be stated from here.
 */
460 static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
461 const uint8_t *src_left,
462 uint8_t *dst, int32_t stride,
465 uint8_t *tmp_dst = dst;
466 uint32_t row, col, val;
467 uint32_t addition = 0;
468 v16u8 src_above1, store, src_left1;
469 v8u16 sum, sum_above, sum_left;
470 v8u16 vec0, vec1, vec2;
473 src_above1 = LD_UB(src_top);
474 src_left1 = LD_UB(src_left);
/* Reduce the 32 neighbour samples to one sum, then apply a rounding right
 * shift by 5 (a divide by 32) to get the DC value. */
476 HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477 sum = sum_above + sum_left;
478 sum = (v8u16) __msa_hadd_u_w(sum, sum);
479 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482 sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483 addition = __msa_copy_u_w((v4i32) sum, 0);
484 store = (v16u8) __msa_fill_b(addition);
486 for (row = 16; row--;) {
/* Top-row edge: widen the DC bytes (vec0) and the top samples (vec1/vec2),
 * then add vec0 to the top samples in the two ADD2 steps below. */
492 vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
493 ILVRL_B2_UH(zero, src_above1, vec1, vec2);
494 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
496 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
/* Rounding shift right by 2 and pack back to 16 bytes. */
497 SRARI_H2_UH(vec1, vec2, 2);
498 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
/* Byte 0 (the corner sample) is computed separately:
 * (left[0] + 2 * DC + top[0] + 2) >> 2, then the row is stored at tmp_dst. */
499 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
501 ST_UB(store, tmp_dst);
/* Left-column edge: same pattern applied to the left samples. */
503 ILVRL_B2_UH(zero, src_left1, vec1, vec2);
504 vec0 = (v8u16) __msa_fill_h(addition);
506 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
507 SRARI_H2_UH(vec1, vec2, 2);
508 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
/* Write bytes 1..15 into the first byte of rows 1..15; row 0 is skipped. */
510 for (col = 1; col < 16; col++) {
511 tmp_dst[stride * col] = store[col];
/*
 * 32x32 DC intra prediction (per the function name).
 *
 * src_top  - samples above the block; 32 bytes are loaded
 * src_left - samples left of the block; 32 bytes are loaded
 * dst      - destination block
 * stride   - not referenced by the visible statements; presumably used to
 *            advance dst between the stores - confirm
 *
 * No edge processing appears in the visible statements of this size variant.
 * NOTE(review): the dst advance inside the store loop is not visible in this
 * excerpt.
 */
516 static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
517 const uint8_t *src_left,
518 uint8_t *dst, int32_t stride)
521 v16u8 src_above1, src_above2, store, src_left1, src_left2;
522 v8u16 sum_above1, sum_above2;
523 v8u16 sum_left1, sum_left2;
524 v8u16 sum, sum_above, sum_left;
/* Load two 16-byte vectors from each neighbour array. */
526 LD_UB2(src_top, 16, src_above1, src_above2);
527 LD_UB2(src_left, 16, src_left1, src_left2);
/* Reduce the 64 neighbour samples to one sum, then apply a rounding right
 * shift by 6 (a divide by 64) to get the DC value. */
528 HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529 HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530 sum_above = sum_above1 + sum_above2;
531 sum_left = sum_left1 + sum_left2;
532 sum = sum_above + sum_left;
533 sum = (v8u16) __msa_hadd_u_w(sum, sum);
534 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537 sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
/* Broadcast byte 0 of the result to every byte of `store`. */
538 store = (v16u8) __msa_splati_b((v16i8) sum, 0);
/* Sixteen iterations with two 32-byte stores each, presumably two rows per
 * iteration. */
540 for (row = 16; row--;) {
541 ST_UB2(store, store, dst, 16);
543 ST_UB2(store, store, dst, 16);
/*
 * 4x4 planar intra prediction (per the function name).
 *
 * src_top  - samples above the block; src_top[4] is read as a scalar
 * src_left - samples left of the block; src_left[4] is read as a scalar
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the loads of src0/src1 and some accumulation statements
 * (including every use of tmp1) are not visible in this excerpt, so which
 * neighbour array feeds src_vec0 versus src_vec1 cannot be stated from here.
 */
548 static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
549 const uint8_t *src_left,
550 uint8_t *dst, int32_t stride)
553 v16i8 src_vec0, src_vec1;
554 v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555 v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
/* Lanes 0..3 are the descending weights 3..0, lanes 4..7 the ascending
 * weights 1..4. */
556 v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
/* Presumably duplicates the upper half of mul_val0 (the 1..4 weights) into
 * both halves of mul_val1 - confirm against the pckod.d definition. */
562 mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
/* Place each source word in lane 0 of a zero vector and widen to 16 bits. */
564 src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
565 src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
567 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
/* Broadcast each of the first four samples of src1_r into its own vector,
 * one vector per output row. */
568 SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
/* Broadcast the two far-corner reference samples. */
570 tmp0 = __msa_fill_h(src_top[4]);
571 tmp1 = __msa_fill_h(src_left[4]);
/* Per row: mul_val0 * (that row's src1_r sample) ... */
573 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574 res0, res1, res2, res3);
/* ... plus mul_val1 * src_top[4] ... */
576 res0 += mul_val1 * tmp0;
577 res1 += mul_val1 * tmp0;
578 res2 += mul_val1 * tmp0;
579 res3 += mul_val1 * tmp0;
/* ... plus a row-dependent multiple of the src_vec0_r samples. */
581 res0 += 3 * src_vec0_r;
582 res1 += 2 * src_vec0_r;
/* Combine the row pairs, apply a rounding right shift by 3, pack to bytes and
 * store four words to dst. */
589 PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
590 SRARI_H2_SH(res0, res1, 3);
591 src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
592 ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
/*
 * 8x8 planar intra prediction (per the function name).
 *
 * src_top  - samples above the block; src_top[8] is read as a scalar
 * src_left - samples left of the block; src_left[8] is read as a scalar
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): the loads of src0/src1 and several accumulation statements
 * (the uses of tmp1 and tmp2, and the remaining row terms) are not visible in
 * this excerpt, so which neighbour array feeds src_vec0 versus src_vec1
 * cannot be stated from here.
 */
595 static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
596 const uint8_t *src_left,
597 uint8_t *dst, int32_t stride)
600 v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601 v8i16 src_vec0_r, src_vec1_r;
602 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604 v8i16 tmp0, tmp1, tmp2;
/* Ascending (1..8) and descending (7..0) per-column weights. */
605 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606 v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
/* Place each 64-bit source value in lane 0 of a zero vector and widen. */
612 src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
613 src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
615 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
/* Broadcast each of the eight src_vec1_r samples into its own vector, one
 * vector per output row. */
616 SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617 SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
/* Broadcast the two far-corner reference samples. */
619 tmp0 = __msa_fill_h(src_top[8]);
620 tmp1 = __msa_fill_h(src_left[8]);
/* Per row: mul_val0 * (that row's src_vec1_r sample). */
622 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623 res0, res1, res2, res3);
624 MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625 res4, res5, res6, res7);
/* The src_top[8] term is the same for every row, so it is computed once. */
627 tmp2 = mul_val1 * tmp0;
/* Row-dependent multiple of the src_vec0_r samples, descending from 7. */
637 res0 += 7 * src_vec0_r;
638 res1 += 6 * src_vec0_r;
639 res2 += 5 * src_vec0_r;
640 res3 += 4 * src_vec0_r;
641 res4 += 3 * src_vec0_r;
642 res5 += 2 * src_vec0_r;
/* Rounding right shift by 4, pack two rows per vector and store eight
 * 64-bit rows to dst. */
654 SRARI_H4_SH(res0, res1, res2, res3, 4);
655 SRARI_H4_SH(res4, res5, res6, res7, 4);
656 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657 src_vec0, src_vec1, src_vec2, src_vec3);
659 ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
660 0, 1, 0, 1, dst, stride);
/*
 * 16x16 planar intra prediction (per the function name).
 *
 * src_top  - samples above the block; 16 bytes plus src_top[16] are read
 * src_left - samples left of the block; 16 bytes plus src_left[16] are read
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * The block is produced in eight HEVC_PRED_PLANAR_16x2 steps. Each step
 * broadcasts two consecutive left samples, passes a pair of constants that
 * moves from (15, 1) to (1, 15) in steps of two, uses a rounding amount of 5
 * and stores res0/res1 via ST_SH2.
 *
 * NOTE(review): the dst advance between steps is not visible in this excerpt.
 */
663 static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
664 const uint8_t *src_left,
665 uint8_t *dst, int32_t stride)
668 v8i16 src0_r, src1_r, src0_l, src1_l;
670 v8i16 res0, res1, tmp0, tmp1;
671 v8i16 mul_val2, mul_val3;
672 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
673 v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
/* Load the 16 top and 16 left samples and widen each to two 16-bit vectors. */
675 src0 = LD_UB(src_top);
676 src1 = LD_UB(src_left);
678 UNPCK_UB_SH(src0, src0_r, src0_l);
679 UNPCK_UB_SH(src1, src1_r, src1_l);
/* Weights for the second group of eight columns; from the initialisers shown
 * above these are 7..0 and 9..16. */
681 mul_val2 = mul_val0 - 8;
682 mul_val3 = mul_val1 + 8;
/* Broadcast the two far-corner reference samples. */
684 tmp0 = __msa_fill_h(src_top[16]);
685 tmp1 = __msa_fill_h(src_left[16]);
/* Left samples 0 and 1. */
687 SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
688 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
689 mul_val0, mul_val1, mul_val2, mul_val3,
690 res0, res1, 15, 1, 5);
691 ST_SH2(res0, res1, dst, stride);
/* Left samples 2 and 3. */
694 SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
695 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
696 mul_val0, mul_val1, mul_val2, mul_val3,
697 res0, res1, 13, 3, 5);
698 ST_SH2(res0, res1, dst, stride);
/* Left samples 4 and 5. */
701 SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
702 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
703 mul_val0, mul_val1, mul_val2, mul_val3,
704 res0, res1, 11, 5, 5);
705 ST_SH2(res0, res1, dst, stride);
/* Left samples 6 and 7. */
708 SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
709 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
710 mul_val0, mul_val1, mul_val2, mul_val3,
711 res0, res1, 9, 7, 5);
712 ST_SH2(res0, res1, dst, stride);
/* Left samples 8 and 9 (from the second widened half). */
715 SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
716 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
717 mul_val0, mul_val1, mul_val2, mul_val3,
718 res0, res1, 7, 9, 5);
719 ST_SH2(res0, res1, dst, stride);
/* Left samples 10 and 11. */
722 SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
723 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
724 mul_val0, mul_val1, mul_val2, mul_val3,
725 res0, res1, 5, 11, 5);
726 ST_SH2(res0, res1, dst, stride);
/* Left samples 12 and 13. */
729 SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
730 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
731 mul_val0, mul_val1, mul_val2, mul_val3,
732 res0, res1, 3, 13, 5);
733 ST_SH2(res0, res1, dst, stride);
/* Left samples 14 and 15. */
736 SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
737 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
738 mul_val0, mul_val1, mul_val2, mul_val3,
739 res0, res1, 1, 15, 5);
740 ST_SH2(res0, res1, dst, stride);
/*
 * Helper for the 32x32 planar predictor: one 16x16 quadrant, called for the
 * two upper quadrants.
 *
 * src_top  - top samples for this quadrant; 16 bytes are loaded and
 *            src_top[32 - offset] is read as a scalar
 * src_left - left samples; 16 bytes are loaded and src_left[32] is read
 * dst      - destination of this quadrant
 * stride   - distance between consecutive dst rows
 * offset   - passed as 0 or 16 by the 32x32 caller (its declaration is not
 *            visible in this excerpt)
 *
 * The quadrant is produced in eight HEVC_PRED_PLANAR_16x2 steps. The constant
 * pair moves from (31, 1) to (17, 15) in steps of two and the rounding amount
 * is 6.
 *
 * NOTE(review): statements between the unpack and the mul_val2/mul_val3 setup,
 * and the dst advance between steps, are not visible in this excerpt. The
 * values of mul_val2/mul_val3 therefore cannot be read off the initialisers
 * alone.
 */
743 static void process_intra_upper_16x16_msa(const uint8_t *src_top,
744 const uint8_t *src_left,
745 uint8_t *dst, int32_t stride,
749 v8i16 src0_r, src1_r, src0_l, src1_l;
750 v8i16 vec0, vec1, res0, res1;
752 v8i16 mul_val2, mul_val3;
753 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
754 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
/* Far-corner reference samples; "32 - offset" compensates for a src_top
 * pointer that the caller has already advanced by offset. */
756 tmp0 = __msa_fill_h(src_top[32 - offset]);
757 tmp1 = __msa_fill_h(src_left[32]);
/* Load 16 top and 16 left samples and widen each to two 16-bit vectors. */
759 src0 = LD_SB(src_top);
760 src1 = LD_SB(src_left);
762 UNPCK_UB_SH(src0, src0_r, src0_l);
763 UNPCK_UB_SH(src1, src1_r, src1_l);
/* Weights for the second group of eight columns. */
767 mul_val2 = mul_val0 - 8;
768 mul_val3 = mul_val1 + 8;
/* Left samples 0 and 1. */
770 SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
771 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
772 mul_val0, mul_val1, mul_val2, mul_val3,
773 res0, res1, 31, 1, 6);
774 ST_SH2(res0, res1, dst, stride);
/* Left samples 2 and 3. */
777 SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
778 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
779 mul_val0, mul_val1, mul_val2, mul_val3,
780 res0, res1, 29, 3, 6);
781 ST_SH2(res0, res1, dst, stride);
/* Left samples 4 and 5. */
784 SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
785 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
786 mul_val0, mul_val1, mul_val2, mul_val3,
787 res0, res1, 27, 5, 6);
788 ST_SH2(res0, res1, dst, stride);
/* Left samples 6 and 7. */
791 SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
792 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
793 mul_val0, mul_val1, mul_val2, mul_val3,
794 res0, res1, 25, 7, 6);
795 ST_SH2(res0, res1, dst, stride);
/* Left samples 8 and 9 (from the second widened half). */
798 SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
799 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
800 mul_val0, mul_val1, mul_val2, mul_val3,
801 res0, res1, 23, 9, 6);
802 ST_SH2(res0, res1, dst, stride);
/* Left samples 10 and 11. */
805 SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
806 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
807 mul_val0, mul_val1, mul_val2, mul_val3,
808 res0, res1, 21, 11, 6);
809 ST_SH2(res0, res1, dst, stride);
/* Left samples 12 and 13. */
812 SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
813 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
814 mul_val0, mul_val1, mul_val2, mul_val3,
815 res0, res1, 19, 13, 6);
816 ST_SH2(res0, res1, dst, stride);
/* Left samples 14 and 15. */
819 SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
820 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
821 mul_val0, mul_val1, mul_val2, mul_val3,
822 res0, res1, 17, 15, 6);
823 ST_SH2(res0, res1, dst, stride);
/*
 * Helper for the 32x32 planar predictor: one 16x16 quadrant, called for the
 * two lower quadrants.
 *
 * src_top  - top samples for this quadrant; 16 bytes are loaded and
 *            src_top[32 - offset] is read as a scalar
 * src_left - left samples; 16 bytes are loaded and src_left[16] is read
 * dst      - destination of this quadrant
 * stride   - distance between consecutive dst rows
 * offset   - passed as 0 or 16 by the 32x32 caller (its declaration is not
 *            visible in this excerpt)
 *
 * The structure matches the upper-quadrant helper. Here the constant pair
 * moves from (15, 17) to (1, 31) in steps of two, and the second corner
 * sample is read at src_left[16] instead of src_left[32]; presumably the
 * caller has advanced src_left by 16 - confirm in the caller.
 *
 * NOTE(review): statements between the unpack and the mul_val2/mul_val3 setup,
 * and the dst advance between steps, are not visible in this excerpt.
 */
826 static void process_intra_lower_16x16_msa(const uint8_t *src_top,
827 const uint8_t *src_left,
828 uint8_t *dst, int32_t stride,
832 v8i16 src0_r, src1_r, src0_l, src1_l;
833 v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
834 v8i16 mul_val2, mul_val3;
835 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
836 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
/* Far-corner reference samples. */
838 tmp0 = __msa_fill_h(src_top[32 - offset]);
839 tmp1 = __msa_fill_h(src_left[16]);
/* Load 16 top and 16 left samples and widen each to two 16-bit vectors. */
841 src0 = LD_SB(src_top);
842 src1 = LD_SB(src_left);
844 UNPCK_UB_SH(src0, src0_r, src0_l);
845 UNPCK_UB_SH(src1, src1_r, src1_l);
/* Weights for the second group of eight columns. */
849 mul_val2 = mul_val0 - 8;
850 mul_val3 = mul_val1 + 8;
/* Left samples 0 and 1 of this quadrant. */
852 SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
853 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
854 mul_val0, mul_val1, mul_val2, mul_val3,
855 res0, res1, 15, 17, 6);
856 ST_SH2(res0, res1, dst, stride);
/* Left samples 2 and 3. */
859 SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
860 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
861 mul_val0, mul_val1, mul_val2, mul_val3,
862 res0, res1, 13, 19, 6);
863 ST_SH2(res0, res1, dst, stride);
/* Left samples 4 and 5. */
866 SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
867 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
868 mul_val0, mul_val1, mul_val2, mul_val3,
869 res0, res1, 11, 21, 6);
870 ST_SH2(res0, res1, dst, stride);
/* Left samples 6 and 7. */
873 SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
874 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
875 mul_val0, mul_val1, mul_val2, mul_val3,
876 res0, res1, 9, 23, 6);
877 ST_SH2(res0, res1, dst, stride);
/* Left samples 8 and 9 (from the second widened half). */
880 SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
881 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
882 mul_val0, mul_val1, mul_val2, mul_val3,
883 res0, res1, 7, 25, 6);
884 ST_SH2(res0, res1, dst, stride);
/* Left samples 10 and 11. */
887 SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
888 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
889 mul_val0, mul_val1, mul_val2, mul_val3,
890 res0, res1, 5, 27, 6);
891 ST_SH2(res0, res1, dst, stride);
/* Left samples 12 and 13. */
894 SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
895 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
896 mul_val0, mul_val1, mul_val2, mul_val3,
897 res0, res1, 3, 29, 6);
898 ST_SH2(res0, res1, dst, stride);
/* Left samples 14 and 15. */
901 SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
902 HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
903 mul_val0, mul_val1, mul_val2, mul_val3,
904 res0, res1, 1, 31, 6);
905 ST_SH2(res0, res1, dst, stride);
/*
 * 32x32 planar intra prediction, assembled from four 16x16 quadrants.
 *
 * src_top  - samples above the block
 * src_left - samples left of the block
 * dst      - destination block
 * stride   - distance between consecutive dst rows
 *
 * NOTE(review): statements between the dst advance and the lower-quadrant
 * calls are not visible in this excerpt; presumably src_left is advanced
 * there - confirm.
 */
908 static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
909 const uint8_t *src_left,
910 uint8_t *dst, int32_t stride)
/* Upper-left quadrant, then upper-right: top samples and dst shifted by 16
 * columns, with offset 16 passed to the helper. */
912 process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
913 process_intra_upper_16x16_msa((src_top + 16), src_left,
914 (dst + 16), stride, 16);
/* Move dst down 16 rows to the lower half. */
915 dst += (16 * stride);
/* Lower-left quadrant, then lower-right. */
918 process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
919 process_intra_lower_16x16_msa((src_top + 16), src_left,
920 (dst + 16), stride, 16);
/*
 * Angular intra prediction for a 4-wide block, for the modes indexed as
 * (mode - 18) in intra_pred_angle_up.
 *
 * src_top  - samples above the block
 * src_left - samples left of the block; read when the reference row is
 *            extended for negative angles
 *
 * Four rows are produced. Each row's angle accumulator is split into an
 * integer offset (>> 5) and a 5-bit fraction (& 31), and the two neighbouring
 * reference samples are blended with weights (32 - fraction) and fraction.
 *
 * NOTE(review): the remaining parameters, the declarations and assignments of
 * `ref`, `last`, `zero`, `dst_val0` and `angle_loop`, and the copy of the top
 * samples into ref_tmp are not visible in this excerpt.
 */
923 static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
924 const uint8_t *src_left,
929 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
/* Scratch reference row; ref_tmp starts 4 bytes in so that negative indices
 * stay inside ref_array. */
930 uint8_t ref_array[3 * 32 + 4];
931 uint8_t *ref_tmp = ref_array + 4;
934 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
935 int32_t idx2, fact_val2, idx3, fact_val3;
936 int32_t angle, angle_loop;
937 int32_t inv_angle_val, offset;
939 v16i8 top0, top1, top2, top3;
942 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
943 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
945 angle = intra_pred_angle_up[mode - 18];
/* NOTE(review): inv_angle has 8 entries while intra_pred_angle_up has 17.
 * This read is unconditional, so it is out of bounds when mode - 18 >= 8 -
 * confirm the caller's mode range, or rely only on the guarded read below. */
946 inv_angle_val = inv_angle[mode - 18];
/* Negative angles that reach past ref[-1]: extend the reference row to the
 * left with samples projected from src_left. */
951 if (angle < 0 && last < -1) {
952 inv_angle_val = inv_angle[mode - 18];
/* The inverse angle is in 1/256 units; + 128 rounds before the >> 8. */
957 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
958 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
959 ref_tmp[h_cnt] = src_left[offset];
/* Integer offset and fraction for each of the four rows (the updates of
 * angle_loop between rows are not visible here). */
965 idx0 = angle_loop >> 5;
966 fact_val0 = angle_loop & 31;
969 idx1 = angle_loop >> 5;
970 fact_val1 = angle_loop & 31;
973 idx2 = angle_loop >> 5;
974 fact_val2 = angle_loop & 31;
977 idx3 = angle_loop >> 5;
978 fact_val3 = angle_loop & 31;
/* Load the reference samples for each row, starting at ref[idx + 1]. */
980 top0 = LD_SB(ref + idx0 + 1);
981 top1 = LD_SB(ref + idx1 + 1);
982 top2 = LD_SB(ref + idx2 + 1);
983 top3 = LD_SB(ref + idx3 + 1);
/* Broadcast each row's weight pair: fraction and (32 - fraction). */
985 fact0 = __msa_fill_h(fact_val0);
986 fact1 = __msa_fill_h(32 - fact_val0);
988 fact2 = __msa_fill_h(fact_val1);
989 fact3 = __msa_fill_h(32 - fact_val1);
991 fact4 = __msa_fill_h(fact_val2);
992 fact5 = __msa_fill_h(32 - fact_val2);
994 fact6 = __msa_fill_h(fact_val3);
995 fact7 = __msa_fill_h(32 - fact_val3);
/* Pair the rows up, two rows per vector. */
997 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
998 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
/* Widen the samples to 16 bits; the 2-byte slide presumably yields each
 * sample's right-hand neighbour - confirm against SLDI_B4_0_SH. */
999 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1000 diff0, diff2, diff4, diff6);
1001 SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
1002 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1003 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
/* Blend: slid samples * fraction + original samples * (32 - fraction). */
1004 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1006 diff1 += diff0 * fact1;
1007 diff3 += diff2 * fact3;
/* Rounding right shift by 5 (the weights sum to 32), pack to bytes and store
 * four 4-byte rows. */
1009 SRARI_H2_SH(diff1, diff3, 5);
1010 dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1011 ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
/*
 * Angular intra prediction for an 8-wide block, for the modes indexed as
 * (mode - 18) in intra_pred_angle_up.
 *
 * src_top  - samples above the block
 * src_left - samples left of the block; read through src_left_tmp when the
 *            reference row is extended
 *
 * The block is produced four rows at a time in two iterations. Each row's
 * angle accumulator is split into an integer offset (>> 5) and a 5-bit
 * fraction (& 31), and the two neighbouring reference samples are blended
 * with weights (32 - fraction) and fraction.
 *
 * NOTE(review): the remaining parameters, the assignments of `ref` and
 * `angle_loop`, the condition guarding the reference extension and the loads
 * of tmp0..tmp2 are not visible in this excerpt.
 */
1014 static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
1015 const uint8_t *src_left,
1020 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
/* Scratch reference row; ref_tmp starts 8 bytes in so that negative indices
 * stay inside ref_array. */
1021 uint8_t ref_array[3 * 32 + 4];
1022 uint8_t *ref_tmp = ref_array + 8;
/* Pre-decremented left pointer, so the projection below needs no "- 1". */
1024 const uint8_t *src_left_tmp = src_left - 1;
1025 int32_t last, offset;
1026 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1027 int32_t idx2, fact_val2, idx3, fact_val3;
1028 int32_t angle, angle_loop;
1029 int32_t inv_angle_val, inv_angle_val_loop;
1030 int32_t tmp0, tmp1, tmp2;
1031 v16i8 top0, top1, top2, top3;
1032 v16u8 dst_val0, dst_val1;
1033 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1034 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1036 angle = intra_pred_angle_up[mode - 18];
/* NOTE(review): inv_angle has 8 entries while intra_pred_angle_up has 17.
 * This read is unconditional, so it is out of bounds when mode - 18 >= 8 -
 * confirm the caller's mode range. */
1037 inv_angle_val = inv_angle[mode - 18];
/* Leftmost reference index needed for this block width and angle. */
1038 last = (angle) >> 2;
/* Running product h_cnt * inv_angle_val, kept incrementally in the loop. */
1043 inv_angle_val_loop = inv_angle_val * last;
/* Copy previously loaded words into the scratch row. */
1049 SW(tmp1, ref_tmp + 4);
1050 SW(tmp2, ref_tmp + 8);
/* Fill ref_tmp[last..-1] with samples projected from the left column; the
 * inverse angle is in 1/256 units and + 128 rounds before the >> 8. */
1052 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1053 offset = (inv_angle_val_loop + 128) >> 8;
1054 ref_tmp[h_cnt] = src_left_tmp[offset];
1055 inv_angle_val_loop += inv_angle_val;
/* Two iterations of four rows each. */
1060 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
/* Integer offset and fraction for four consecutive rows. */
1061 idx0 = (angle_loop) >> 5;
1062 fact_val0 = (angle_loop) & 31;
1063 angle_loop += angle;
1065 idx1 = (angle_loop) >> 5;
1066 fact_val1 = (angle_loop) & 31;
1067 angle_loop += angle;
1069 idx2 = (angle_loop) >> 5;
1070 fact_val2 = (angle_loop) & 31;
1071 angle_loop += angle;
1073 idx3 = (angle_loop) >> 5;
1074 fact_val3 = (angle_loop) & 31;
1075 angle_loop += angle;
/* Load the reference samples for each row, starting at ref[idx + 1]. */
1077 top0 = LD_SB(ref + idx0 + 1);
1078 top1 = LD_SB(ref + idx1 + 1);
1079 top2 = LD_SB(ref + idx2 + 1);
1080 top3 = LD_SB(ref + idx3 + 1);
/* Broadcast each row's weight pair: fraction and (32 - fraction). */
1082 fact0 = __msa_fill_h(fact_val0);
1083 fact1 = __msa_fill_h(32 - fact_val0);
1084 fact2 = __msa_fill_h(fact_val1);
1085 fact3 = __msa_fill_h(32 - fact_val1);
1086 fact4 = __msa_fill_h(fact_val2);
1087 fact5 = __msa_fill_h(32 - fact_val2);
1088 fact6 = __msa_fill_h(fact_val3);
1089 fact7 = __msa_fill_h(32 - fact_val3);
/* Widen each row's samples to two 16-bit vectors. */
1091 UNPCK_UB_SH(top0, diff0, diff1);
1092 UNPCK_UB_SH(top1, diff2, diff3);
1093 UNPCK_UB_SH(top2, diff4, diff5);
1094 UNPCK_UB_SH(top3, diff6, diff7);
/* The 2-byte slide across the two halves presumably yields each sample's
 * right-hand neighbour - confirm against SLDI_B2_SH. */
1096 SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
1097 SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
/* Blend: slid samples * fraction + original samples * (32 - fraction). */
1098 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1099 diff1, diff3, diff5, diff7);
1101 diff1 += diff0 * fact1;
1102 diff3 += diff2 * fact3;
1103 diff5 += diff4 * fact5;
1104 diff7 += diff6 * fact7;
/* Rounding right shift by 5 (the weights sum to 32), pack two rows per
 * vector, store four 8-byte rows and move down four rows. */
1106 SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1107 PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1108 ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
1109 dst += (4 * stride);
/*
 * Angular intra prediction for a 16-wide block, for the modes indexed as
 * (mode - 18) in intra_pred_angle_up.
 *
 * src_top  - samples above the block
 * src_left - samples left of the block; read through src_left_tmp when the
 *            reference row is extended
 *
 * The block is produced four rows at a time in four iterations. Each row's
 * angle accumulator is split into an integer offset (>> 5) and a 5-bit
 * fraction (& 31), and the two neighbouring reference samples are blended
 * with weights (32 - fraction) and fraction.
 *
 * NOTE(review): the remaining parameters, the declarations and assignments of
 * `ref`, `last`, `tmp0` and `angle_loop`, the condition guarding the reference
 * extension and the load of top0 before it is copied are not visible in this
 * excerpt.
 */
1113 static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
1114 const uint8_t *src_left,
1119 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1120 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1121 int32_t idx2, fact_val2, idx3, fact_val3;
1123 int32_t angle, angle_loop, offset;
1124 int32_t inv_angle_val, inv_angle_val_loop;
/* Scratch reference row; ref_tmp starts 16 bytes in so that negative indices
 * stay inside ref_array. */
1125 uint8_t ref_array[3 * 32 + 4];
1126 uint8_t *ref_tmp = ref_array + 16;
/* Pre-decremented left pointer, so the projection below needs no "- 1". */
1128 const uint8_t *src_left_tmp = src_left - 1;
1130 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1131 v16i8 dst0, dst1, dst2, dst3;
1132 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1133 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1134 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1136 angle = intra_pred_angle_up[mode - 18];
/* NOTE(review): inv_angle has 8 entries while intra_pred_angle_up has 17.
 * This read is unconditional, so it is out of bounds when mode - 18 >= 8 -
 * confirm the caller's mode range. */
1137 inv_angle_val = inv_angle[mode - 18];
/* Running product h_cnt * inv_angle_val, kept incrementally in the loop. */
1143 inv_angle_val_loop = inv_angle_val * last;
/* Copy 16 + 4 reference bytes into the scratch row. */
1146 tmp0 = LW(ref + 16);
1147 ST_UB(top0, ref_tmp);
1148 SW(tmp0, ref_tmp + 16);
/* Fill ref_tmp[last..-1] with samples projected from the left column; the
 * inverse angle is in 1/256 units and + 128 rounds before the >> 8. */
1150 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1151 offset = (inv_angle_val_loop + 128) >> 8;
1152 ref_tmp[h_cnt] = src_left_tmp[offset];
1153 inv_angle_val_loop += inv_angle_val;
/* Four iterations of four rows each. */
1158 for (v_cnt = 4; v_cnt--;) {
/* Integer offset and fraction for four consecutive rows. */
1159 idx0 = (angle_loop) >> 5;
1160 fact_val0 = (angle_loop) & 31;
1161 angle_loop += angle;
1163 idx1 = (angle_loop) >> 5;
1164 fact_val1 = (angle_loop) & 31;
1165 angle_loop += angle;
1167 idx2 = (angle_loop) >> 5;
1168 fact_val2 = (angle_loop) & 31;
1169 angle_loop += angle;
1171 idx3 = (angle_loop) >> 5;
1172 fact_val3 = (angle_loop) & 31;
1173 angle_loop += angle;
/* Load 32 reference bytes per row: 16 samples plus the following 16, so that
 * the "+ 1" neighbours are available. */
1175 LD_UB2(ref + idx0 + 1, 16, top0, top1);
1176 LD_UB2(ref + idx1 + 1, 16, top2, top3);
1177 LD_UB2(ref + idx2 + 1, 16, top4, top5);
1178 LD_UB2(ref + idx3 + 1, 16, top6, top7);
/* Broadcast each row's weight pair: fraction and (32 - fraction). */
1180 fact0 = __msa_fill_h(fact_val0);
1181 fact1 = __msa_fill_h(32 - fact_val0);
1182 fact2 = __msa_fill_h(fact_val1);
1183 fact3 = __msa_fill_h(32 - fact_val1);
1184 fact4 = __msa_fill_h(fact_val2);
1185 fact5 = __msa_fill_h(32 - fact_val2);
1186 fact6 = __msa_fill_h(fact_val3);
1187 fact7 = __msa_fill_h(32 - fact_val3);
/* The 1-byte slide across each vector pair presumably yields the samples one
 * position to the right - confirm against SLDI_B2_UB. Afterwards the
 * odd-numbered top vectors hold the neighbours of the even-numbered ones. */
1189 SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
1190 SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
/* Widen every vector to two 16-bit halves. */
1191 UNPCK_UB_SH(top0, diff0, diff1);
1192 UNPCK_UB_SH(top1, diff2, diff3);
1193 UNPCK_UB_SH(top2, diff4, diff5);
1194 UNPCK_UB_SH(top3, diff6, diff7);
1195 UNPCK_UB_SH(top4, diff8, diff9);
1196 UNPCK_UB_SH(top5, diff10, diff11);
1197 UNPCK_UB_SH(top6, diff12, diff13);
1198 UNPCK_UB_SH(top7, diff14, diff15);
/* Blend: slid samples * fraction + original samples * (32 - fraction). */
1200 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1201 diff2, diff3, diff6, diff7);
1202 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1203 diff10, diff11, diff14, diff15);
1205 diff2 += diff0 * fact1;
1206 diff3 += diff1 * fact1;
1207 diff6 += diff4 * fact3;
1208 diff7 += diff5 * fact3;
1209 diff10 += diff8 * fact5;
1210 diff11 += diff9 * fact5;
1211 diff14 += diff12 * fact7;
1212 diff15 += diff13 * fact7;
/* Rounding right shift by 5 (the weights sum to 32), pack each row to 16
 * bytes, store four rows and move down four rows. */
1214 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1215 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1216 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1217 dst0, dst1, dst2, dst3);
1218 ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1219 dst += (4 * stride);
/*
 * Angular intra prediction for a 32-sample-wide block, using the angle table
 * intra_pred_angle_up (indexed by mode - 18).
 * NOTE(review): parts of this definition (remaining parameters, braces and
 * some assignments, e.g. the initial values of ref, last and angle_loop) are
 * not shown here; the comments cover only the visible statements.
 */
1223 static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
1224 const uint8_t *src_left,
/* Inverse-angle table with 8 entries, indexed by mode - 18. */
1229 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1230 uint8_t ref_array[3 * 32 + 4];
1233 const uint8_t *src_left_tmp = src_left - 1;
1234 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1235 int32_t tmp0, tmp1, tmp2, tmp3;
1236 int32_t angle, angle_loop;
1237 int32_t inv_angle_val, inv_angle_val_loop;
1238 int32_t last, offset;
1239 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1240 v16i8 dst0, dst1, dst2, dst3;
1241 v8i16 fact0, fact1, fact2, fact3;
1242 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1243 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
/* Working reference starts 32 bytes into ref_array so negative indices
 * (written by the h_cnt loop below) stay inside the array. */
1245 ref_tmp = ref_array + 32;
1247 angle = intra_pred_angle_up[mode - 18];
1248 inv_angle_val = inv_angle[mode - 18];
1254 inv_angle_val_loop = inv_angle_val * last;
/* Copy two 16-byte vectors from ref into the local reference buffer. */
1255 LD_UB2(ref, 16, top0, top1);
1261 ST_UB2(top0, top1, ref_tmp, 16);
/* Fill ref_tmp[last..-1] from the left neighbours: the running inverse-angle
 * product is rounded ((x + 128) >> 8) to get the source offset. */
1267 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1268 offset = (inv_angle_val_loop + 128) >> 8;
1269 ref_tmp[h_cnt] = src_left_tmp[offset];
1270 inv_angle_val_loop += inv_angle_val;
/* 16 iterations, each consuming two angle steps (idx0/fact_val0 and
 * idx1/fact_val1). */
1276 for (v_cnt = 16; v_cnt--;) {
/* angle_loop >> 5 is the whole-sample offset, angle_loop & 31 the 5-bit
 * fractional weight. */
1277 idx0 = (angle_loop) >> 5;
1278 fact_val0 = (angle_loop) & 31;
1279 angle_loop += angle;
1281 idx1 = (angle_loop) >> 5;
1282 fact_val1 = (angle_loop) & 31;
1283 angle_loop += angle;
/* Load reference vectors at offsets +1, +17 and +33 for each angle step. */
1285 top0 = LD_UB(ref + idx0 + 1);
1286 top4 = LD_UB(ref + idx1 + 1);
1287 top1 = LD_UB(ref + idx0 + 17);
1288 top5 = LD_UB(ref + idx1 + 17);
1289 top3 = LD_UB(ref + idx0 + 33);
1290 top7 = LD_UB(ref + idx1 + 33);
/* Splat the weights: factN (fraction) and its complement 32 - fraction. */
1292 fact0 = __msa_fill_h(fact_val0);
1293 fact1 = __msa_fill_h(32 - fact_val0);
1294 fact2 = __msa_fill_h(fact_val1);
1295 fact3 = __msa_fill_h(32 - fact_val1);
/* NOTE(review): top2 and top6 are consumed below but their assignment is not
 * visible here; SLDI/UNPCK presumably build the one-byte-shifted neighbours
 * and widen bytes to 16 bits - confirm against generic_macros_msa.h. */
1300 SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
1301 SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
1302 UNPCK_UB_SH(top0, diff0, diff1);
1303 UNPCK_UB_SH(top1, diff2, diff3);
1304 UNPCK_UB_SH(top2, diff4, diff5);
1305 UNPCK_UB_SH(top3, diff6, diff7);
1306 UNPCK_UB_SH(top4, diff8, diff9);
1307 UNPCK_UB_SH(top5, diff10, diff11);
1308 UNPCK_UB_SH(top6, diff12, diff13);
1309 UNPCK_UB_SH(top7, diff14, diff15);
/* Two-tap blend: shifted * fact + unshifted * (32 - fact). */
1311 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1312 diff2, diff3, diff6, diff7);
1313 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1314 diff10, diff11, diff14, diff15);
1316 diff2 += diff0 * fact1;
1317 diff3 += diff1 * fact1;
1318 diff6 += diff4 * fact1;
1319 diff7 += diff5 * fact1;
1320 diff10 += diff8 * fact3;
1321 diff11 += diff9 * fact3;
1322 diff14 += diff12 * fact3;
1323 diff15 += diff13 * fact3;
/* Scale back by 5 bits (presumably a rounding shift) and pack to bytes. */
1325 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1326 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1327 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1328 dst0, dst1, dst2, dst3);
/* Store 32 bytes per angle step; the dst advance between the two stores is
 * not visible here. */
1330 ST_SB2(dst0, dst1, dst, 16);
1332 ST_SB2(dst2, dst3, dst, 16);
/*
 * Angular intra prediction for a 4-sample-wide block, using the angle table
 * intra_pred_angle_low (indexed by mode - 2).
 * NOTE(review): parts of this definition (remaining parameters, braces, the
 * setup of ref, last, angle_loop and zero) are not shown here; the comments
 * cover only the visible statements.
 */
1337 static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
1338 const uint8_t *src_left,
/* Inverse-angle table with 7 entries, indexed by mode - 11. */
1343 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1344 uint8_t ref_array[3 * 32 + 4];
/* 4 bytes of headroom so negative indices of ref_tmp stay inside ref_array. */
1345 uint8_t *ref_tmp = ref_array + 4;
1347 int32_t last, offset;
1348 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1349 int32_t idx2, fact_val2, idx3, fact_val3;
1350 int32_t angle, angle_loop, inv_angle_val;
1352 v16i8 dst_val0, dst_val1;
1353 v16u8 top0, top1, top2, top3;
1355 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1356 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1358 angle = intra_pred_angle_low[mode - 2];
1364 inv_angle_val = inv_angle[mode - 11];
/* Fill ref_tmp[last..-1] from src_top; the offset is the rounded
 * (h_cnt * inv_angle_val + 128) >> 8, biased by -1. */
1369 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1370 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1371 ref_tmp[h_cnt] = src_top[offset];
/* Four angle steps: angle_loop >> 5 is the whole-sample offset,
 * angle_loop & 31 the 5-bit fractional weight. */
1377 idx0 = angle_loop >> 5;
1378 fact_val0 = angle_loop & 31;
1379 angle_loop += angle;
1381 idx1 = angle_loop >> 5;
1382 fact_val1 = angle_loop & 31;
1383 angle_loop += angle;
1385 idx2 = angle_loop >> 5;
1386 fact_val2 = angle_loop & 31;
1387 angle_loop += angle;
1389 idx3 = angle_loop >> 5;
1390 fact_val3 = angle_loop & 31;
/* One reference vector per angle step, starting at ref + idx + 1. */
1392 top0 = LD_UB(ref + idx0 + 1);
1393 top1 = LD_UB(ref + idx1 + 1);
1394 top2 = LD_UB(ref + idx2 + 1);
1395 top3 = LD_UB(ref + idx3 + 1);
/* Splat each fraction and its complement (32 - fraction). */
1397 fact0 = __msa_fill_h(fact_val0);
1398 fact1 = __msa_fill_h(32 - fact_val0);
1399 fact2 = __msa_fill_h(fact_val1);
1400 fact3 = __msa_fill_h(32 - fact_val1);
1401 fact4 = __msa_fill_h(fact_val2);
1402 fact5 = __msa_fill_h(32 - fact_val2);
1403 fact6 = __msa_fill_h(fact_val3);
1404 fact7 = __msa_fill_h(32 - fact_val3);
/* NOTE(review): the ILVR/SLDI macros presumably merge two angle steps per
 * vector and build the shifted neighbour samples - confirm against
 * generic_macros_msa.h. */
1406 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1407 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1408 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1409 diff0, diff2, diff4, diff6);
1410 SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
1411 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1412 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
/* Two-tap blend: shifted * fact + unshifted * (32 - fact), then scale back
 * by 5 bits. */
1413 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1415 diff1 += diff0 * fact1;
1416 diff3 += diff2 * fact3;
1418 SRARI_H2_SH(diff1, diff3, 5);
1419 PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
/* Even/odd byte and word packs rearrange the results before the store
 * (presumably a transpose - TODO confirm). */
1421 diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1422 diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1424 diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1426 dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1427 dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
/* Two word stores at dst and two more at dst + 2 * stride. */
1429 ST_W2(dst_val0, 0, 1, dst, stride);
1430 ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
/*
 * Angular intra prediction for an 8-sample-wide block, using the angle table
 * intra_pred_angle_low (indexed by mode - 2).
 * NOTE(review): parts of this definition (remaining parameters, braces, the
 * setup of ref, dst_org, angle_loop and tmp0..tmp2) are not shown here; the
 * comments cover only the visible statements.
 */
1433 static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
1434 const uint8_t *src_left,
/* Inverse-angle table with 7 entries, indexed by mode - 11. */
1439 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1440 uint8_t ref_array[3 * 32 + 4];
/* 8 bytes of headroom so negative indices of ref_tmp stay inside ref_array. */
1441 uint8_t *ref_tmp = ref_array + 8;
1443 const uint8_t *src_top_tmp = src_top - 1;
1445 int32_t last, offset, tmp0, tmp1, tmp2;
1446 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1447 int32_t idx2, fact_val2, idx3, fact_val3;
1448 int32_t angle, angle_loop, inv_angle_val;
1449 v16i8 top0, top1, top2, top3;
1450 v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1451 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1452 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1454 angle = intra_pred_angle_low[mode - 2];
/* Lowest (negative) reference index to fill: angle / 4 via arithmetic shift. */
1455 last = (angle) >> 2;
1460 inv_angle_val = inv_angle[mode - 11];
/* Copy words into the local reference at byte offsets 4 and 8 (the loads of
 * tmp1/tmp2 are not visible here). */
1466 SW(tmp1, ref_tmp + 4);
1467 SW(tmp2, ref_tmp + 8);
/* Fill ref_tmp[last..-1] from the top row using the rounded inverse-angle
 * offset (x + 128) >> 8. */
1469 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1470 offset = (h_cnt * inv_angle_val + 128) >> 8;
1471 ref_tmp[h_cnt] = src_top_tmp[offset];
/* Two iterations, each consuming four angle steps. */
1477 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
/* angle_loop >> 5: whole-sample offset; angle_loop & 31: 5-bit fraction. */
1480 idx0 = angle_loop >> 5;
1481 fact_val0 = angle_loop & 31;
1482 angle_loop += angle;
1484 idx1 = angle_loop >> 5;
1485 fact_val1 = angle_loop & 31;
1486 angle_loop += angle;
1488 idx2 = angle_loop >> 5;
1489 fact_val2 = angle_loop & 31;
1490 angle_loop += angle;
1492 idx3 = angle_loop >> 5;
1493 fact_val3 = angle_loop & 31;
1494 angle_loop += angle;
/* One reference vector per angle step, starting at ref + idx + 1. */
1496 top0 = LD_SB(ref + idx0 + 1);
1497 top1 = LD_SB(ref + idx1 + 1);
1498 top2 = LD_SB(ref + idx2 + 1);
1499 top3 = LD_SB(ref + idx3 + 1);
/* Splat each fraction and its complement (32 - fraction). */
1501 fact0 = __msa_fill_h(fact_val0);
1502 fact1 = __msa_fill_h(32 - fact_val0);
1503 fact2 = __msa_fill_h(fact_val1);
1504 fact3 = __msa_fill_h(32 - fact_val1);
1505 fact4 = __msa_fill_h(fact_val2);
1506 fact5 = __msa_fill_h(32 - fact_val2);
1507 fact6 = __msa_fill_h(fact_val3);
1508 fact7 = __msa_fill_h(32 - fact_val3);
/* NOTE(review): UNPCK presumably widens bytes to 16 bits and SLDI by 2 bytes
 * yields the next-sample neighbours - confirm against generic_macros_msa.h. */
1510 UNPCK_UB_SH(top0, diff0, diff1);
1511 UNPCK_UB_SH(top1, diff2, diff3);
1512 UNPCK_UB_SH(top2, diff4, diff5);
1513 UNPCK_UB_SH(top3, diff6, diff7);
1514 SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
1515 SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
/* Two-tap blend: shifted * fact + unshifted * (32 - fact), then scale back
 * by 5 bits. */
1516 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1517 diff1, diff3, diff5, diff7);
1519 diff1 += diff0 * fact1;
1520 diff3 += diff2 * fact3;
1521 diff5 += diff4 * fact5;
1522 diff7 += diff6 * fact7;
1524 SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
1525 PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1526 dst_val0, dst_val1, dst_val2, dst_val3);
/* Interleave the four result vectors (presumably a transpose - TODO confirm)
 * and store eight words through dst_org. */
1527 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1528 ILVRL_H2_SH(diff1, diff0, diff3, diff4);
1529 ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
/*
 * Angular intra prediction for a 16-sample-wide block, using the angle table
 * intra_pred_angle_low (indexed by mode - 2).
 * NOTE(review): parts of this definition (remaining parameters, braces, the
 * declaration of last and the setup of ref, dst_org, angle_loop and top0) are
 * not shown here; the comments cover only the visible statements.
 */
1534 static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
1535 const uint8_t *src_left,
/* Inverse-angle table with 7 entries, indexed by mode - 11. */
1540 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1541 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1542 int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1543 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1544 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1545 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1546 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1547 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1548 int32_t angle, angle_loop, inv_angle_val, offset;
1549 uint8_t ref_array[3 * 32 + 4];
/* 16 bytes of headroom so negative indices of ref_tmp stay inside ref_array. */
1550 uint8_t *ref_tmp = ref_array + 16;
1551 const uint8_t *ref, *src_top_tmp = src_top - 1;
1555 angle = intra_pred_angle_low[mode - 2];
/* Lowest (negative) reference index to fill: angle / 2 via arithmetic shift. */
1556 last = (angle) >> 1;
1561 inv_angle_val = inv_angle[mode - 11];
/* Copy 16 bytes plus one trailing word from ref into the local reference. */
1564 tmp0 = LW(ref + 16);
1565 ST_SB(top0, ref_tmp);
1566 SW(tmp0, ref_tmp + 16);
/* Fill ref_tmp[last..-1] from the top row using the rounded inverse-angle
 * offset (x + 128) >> 8. */
1568 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1569 offset = (h_cnt * inv_angle_val + 128) >> 8;
1570 ref_tmp[h_cnt] = src_top_tmp[offset];
/* Four iterations, each consuming four angle steps. */
1576 for (v_cnt = 0; v_cnt < 4; v_cnt++) {
/* angle_loop >> 5: whole-sample offset; angle_loop & 31: 5-bit fraction. */
1579 idx0 = angle_loop >> 5;
1580 fact_val0 = angle_loop & 31;
1581 angle_loop += angle;
1583 idx1 = angle_loop >> 5;
1584 fact_val1 = angle_loop & 31;
1585 angle_loop += angle;
1587 idx2 = angle_loop >> 5;
1588 fact_val2 = angle_loop & 31;
1589 angle_loop += angle;
1591 idx3 = angle_loop >> 5;
1592 fact_val3 = angle_loop & 31;
1593 angle_loop += angle;
/* Two consecutive 16-byte vectors per angle step, starting at ref + idx + 1. */
1595 LD_SB2(ref + idx0 + 1, 16, top0, top1);
1596 LD_SB2(ref + idx1 + 1, 16, top2, top3);
1597 LD_SB2(ref + idx2 + 1, 16, top4, top5);
1598 LD_SB2(ref + idx3 + 1, 16, top6, top7);
/* Splat each fraction and its complement (32 - fraction). */
1600 fact0 = __msa_fill_h(fact_val0);
1601 fact1 = __msa_fill_h(32 - fact_val0);
1602 fact2 = __msa_fill_h(fact_val1);
1603 fact3 = __msa_fill_h(32 - fact_val1);
1604 fact4 = __msa_fill_h(fact_val2);
1605 fact5 = __msa_fill_h(32 - fact_val2);
1606 fact6 = __msa_fill_h(fact_val3);
1607 fact7 = __msa_fill_h(32 - fact_val3);
/* NOTE(review): SLDI by 1 byte presumably yields the next-sample neighbours
 * and UNPCK widens bytes to 16 bits - confirm against generic_macros_msa.h. */
1609 SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
1610 SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
1612 UNPCK_UB_SH(top0, diff0, diff1);
1613 UNPCK_UB_SH(top1, diff2, diff3);
1614 UNPCK_UB_SH(top2, diff4, diff5);
1615 UNPCK_UB_SH(top3, diff6, diff7);
1616 UNPCK_UB_SH(top4, diff8, diff9);
1617 UNPCK_UB_SH(top5, diff10, diff11);
1618 UNPCK_UB_SH(top6, diff12, diff13);
1619 UNPCK_UB_SH(top7, diff14, diff15);
/* Two-tap blend: shifted * fact + unshifted * (32 - fact), then scale back
 * by 5 bits. */
1621 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1622 diff2, diff3, diff6, diff7);
1623 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1624 diff10, diff11, diff14, diff15);
1626 diff2 += diff0 * fact1;
1627 diff3 += diff1 * fact1;
1628 diff6 += diff4 * fact3;
1629 diff7 += diff5 * fact3;
1630 diff10 += diff8 * fact5;
1631 diff11 += diff9 * fact5;
1632 diff14 += diff12 * fact7;
1633 diff15 += diff13 * fact7;
1635 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1636 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1637 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1638 dst_val0, dst_val1, dst_val2, dst_val3);
/* Interleave the results (presumably a transpose - TODO confirm) and store
 * eight words, advance dst_org by 8 rows, then store eight more. */
1639 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1640 ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1641 ILVRL_H2_SH(diff1, diff0, diff4, diff5);
1642 ILVRL_H2_SH(diff3, diff2, diff6, diff7);
1643 ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1644 dst_org += (8 * stride);
1645 ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
/*
 * Angular intra prediction for a 32-sample-wide block, using the angle table
 * intra_pred_angle_low (indexed by mode - 2).
 * NOTE(review): parts of this definition (remaining parameters, braces, the
 * declaration of last and the setup of ref, dst_org, angle_loop, top2 and
 * top6) are not shown here; the comments cover only the visible statements.
 */
1650 static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
1651 const uint8_t *src_left,
/* Inverse-angle table with 7 entries, indexed by mode - 11. */
1656 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1657 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1658 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1659 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1660 v8i16 fact0, fact1, fact2, fact3;
1661 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1662 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1663 int32_t angle, angle_loop, inv_angle_val, offset;
1664 uint8_t ref_array[3 * 32 + 4];
/* 32 bytes of headroom so negative indices of ref_tmp stay inside ref_array. */
1665 uint8_t *ref_tmp = ref_array + 32;
1666 const uint8_t *ref, *src_top_tmp = src_top - 1;
1670 angle = intra_pred_angle_low[mode - 2];
1676 inv_angle_val = inv_angle[mode - 11];
/* Copy 32 bytes plus one trailing word from ref into the local reference. */
1678 LD_SB2(ref, 16, top0, top1);
1679 tmp0 = LW(ref + 32);
1680 ST_SB2(top0, top1, ref_tmp, 16);
1681 SW(tmp0, ref_tmp + 32);
/* Fill ref_tmp[last..-1] from the top row using the rounded inverse-angle
 * offset (x + 128) >> 8. */
1683 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1684 offset = (h_cnt * inv_angle_val + 128) >> 8;
1685 ref_tmp[h_cnt] = src_top_tmp[offset];
/* 16 iterations, each consuming two angle steps. */
1691 for (v_cnt = 0; v_cnt < 16; v_cnt++) {
/* angle_loop >> 5: whole-sample offset; angle_loop & 31: 5-bit fraction. */
1693 idx0 = angle_loop >> 5;
1694 fact_val0 = angle_loop & 31;
1695 angle_loop += angle;
1697 idx1 = angle_loop >> 5;
1698 fact_val1 = angle_loop & 31;
1699 angle_loop += angle;
/* Load reference vectors at offsets +1, +17 and +33 for each angle step. */
1701 top0 = LD_SB(ref + idx0 + 1);
1702 top4 = LD_SB(ref + idx1 + 1);
1703 top1 = LD_SB(ref + idx0 + 17);
1704 top5 = LD_SB(ref + idx1 + 17);
1705 top3 = LD_SB(ref + idx0 + 33);
1706 top7 = LD_SB(ref + idx1 + 33);
/* Splat each fraction and its complement (32 - fraction). */
1708 fact0 = __msa_fill_h(fact_val0);
1709 fact1 = __msa_fill_h(32 - fact_val0);
1710 fact2 = __msa_fill_h(fact_val1);
1711 fact3 = __msa_fill_h(32 - fact_val1);
/* NOTE(review): top2 and top6 are consumed below but their assignment is not
 * visible here; SLDI/UNPCK presumably build the one-byte-shifted neighbours
 * and widen bytes to 16 bits - confirm against generic_macros_msa.h. */
1716 SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
1717 SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
1719 UNPCK_UB_SH(top0, diff0, diff1);
1720 UNPCK_UB_SH(top1, diff2, diff3);
1721 UNPCK_UB_SH(top2, diff4, diff5);
1722 UNPCK_UB_SH(top3, diff6, diff7);
1723 UNPCK_UB_SH(top4, diff8, diff9);
1724 UNPCK_UB_SH(top5, diff10, diff11);
1725 UNPCK_UB_SH(top6, diff12, diff13);
1726 UNPCK_UB_SH(top7, diff14, diff15);
/* Two-tap blend: shifted * fact + unshifted * (32 - fact), then scale back
 * by 5 bits. */
1728 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1729 diff2, diff3, diff6, diff7);
1730 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1731 diff10, diff11, diff14, diff15);
1733 diff2 += diff0 * fact1;
1734 diff3 += diff1 * fact1;
1735 diff6 += diff4 * fact1;
1736 diff7 += diff5 * fact1;
1737 diff10 += diff8 * fact3;
1738 diff11 += diff9 * fact3;
1739 diff14 += diff12 * fact3;
1740 diff15 += diff13 * fact3;
1742 SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1743 SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1744 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1745 dst_val0, dst_val1, dst_val2, dst_val3);
/* Interleave the bytes of the two angle steps so each halfword holds one
 * sample from each step. */
1746 ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
1747 ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
/* Four groups of eight halfword stores, advancing dst_org by 8 rows each.
 * NOTE(review): the ST_H8 invocations have no trailing semicolon; this only
 * compiles if the macro expands to a complete statement - confirm against
 * the macro definition. */
1749 ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1750 dst_org += (8 * stride);
1751 ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1752 dst_org += (8 * stride);
1753 ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1754 dst_org += (8 * stride);
1755 ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1756 dst_org += (8 * stride);
/*
 * Vertical prediction for a 32x32 block: the same two 16-byte vectors are
 * written to each of 32 rows.
 * NOTE(review): the load of src1 and the per-row advance of dst are not
 * visible here - presumably src1 = LD_UB(src) and dst += dst_stride; confirm.
 */
1762 static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
/* Second half of the 32-byte source row. */
1769 src2 = LD_UB(src + 16);
/* One 32-byte store (two vectors, 16 bytes apart) per row. */
1771 for (row = 32; row--;) {
1772 ST_UB2(src1, src2, dst, 16);
/*
 * Planar prediction entry point, variant 0: forwards to the 4x4 MSA helper.
 * Note the argument order changes from (dst, src_top, src_left, stride) to
 * (src_top, src_left, dst, stride).
 */
1777 void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
1778 const uint8_t *src_top,
1779 const uint8_t *src_left,
1782 hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
/*
 * Planar prediction entry point, variant 1: forwards to the 8x8 MSA helper.
 * Note the argument order changes from (dst, src_top, src_left, stride) to
 * (src_top, src_left, dst, stride).
 */
1785 void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
1786 const uint8_t *src_top,
1787 const uint8_t *src_left,
1790 hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
/*
 * Planar prediction entry point, variant 2: forwards to the 16x16 MSA helper.
 * Note the argument order changes from (dst, src_top, src_left, stride) to
 * (src_top, src_left, dst, stride).
 */
1793 void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
1794 const uint8_t *src_top,
1795 const uint8_t *src_left,
1798 hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
/*
 * Planar prediction entry point, variant 3: forwards to the 32x32 MSA helper.
 * Note the argument order changes from (dst, src_top, src_left, stride) to
 * (src_top, src_left, dst, stride).
 */
1801 void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
1802 const uint8_t *src_top,
1803 const uint8_t *src_left,
1806 hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
/*
 * DC prediction entry point: dispatches to the 4x4, 8x8, 16x16 or 32x32 MSA
 * helper.
 * NOTE(review): the selection logic is not visible here - presumably a switch
 * on log2 (2..5); confirm.
 */
1809 void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1810 const uint8_t *src_left,
1811 ptrdiff_t stride, int log2, int c_idx)
1815 hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1819 hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1823 hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
/* Unlike the smaller sizes, the 32x32 helper is called without c_idx. */
1827 hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
/*
 * Angular prediction entry point for 4-wide blocks. Dispatch order: a first
 * branch to the horizontal helper, mode 26 to the vertical helper, other
 * modes >= 18 to the "upper" angular helper, everything else to the "lower"
 * one.
 * NOTE(review): the first condition is not visible here - presumably
 * mode == 10; confirm.
 */
1832 void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1833 const uint8_t *src_top,
1834 const uint8_t *src_left,
1835 ptrdiff_t stride, int c_idx, int mode)
1838 hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1839 } else if (mode == 26) {
1840 hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1841 } else if (mode >= 18) {
1842 hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1845 hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
/*
 * Angular prediction entry point for 8-wide blocks. Dispatch order: a first
 * branch to the horizontal helper, mode 26 to the vertical helper, other
 * modes >= 18 to the "upper" angular helper, everything else to the "lower"
 * one.
 * NOTE(review): the first condition is not visible here - presumably
 * mode == 10; confirm.
 */
1850 void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1851 const uint8_t *src_top,
1852 const uint8_t *src_left,
1853 ptrdiff_t stride, int c_idx, int mode)
1856 hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1857 } else if (mode == 26) {
1858 hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1859 } else if (mode >= 18) {
1860 hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1863 hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
/*
 * Angular prediction entry point for 16-wide blocks. Dispatch order: a first
 * branch to the horizontal helper, mode 26 to the vertical helper, other
 * modes >= 18 to the "upper" angular helper, everything else to the "lower"
 * one.
 * NOTE(review): the first condition is not visible here - presumably
 * mode == 10; confirm.
 */
1868 void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1869 const uint8_t *src_top,
1870 const uint8_t *src_left,
1871 ptrdiff_t stride, int c_idx, int mode)
1874 hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1875 } else if (mode == 26) {
1876 hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1877 } else if (mode >= 18) {
1878 hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1881 hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
/*
 * Angular prediction entry point for 32-wide blocks. Dispatch order: a first
 * branch to the horizontal helper, mode 26 to the vertical helper, other
 * modes >= 18 to the "upper" angular helper, everything else to the "lower"
 * one. Unlike the smaller sizes, the 32x32 horizontal and vertical helpers
 * are called without c_idx, and the vertical one takes only src_top.
 * NOTE(review): the first condition is not visible here - presumably
 * mode == 10; confirm.
 */
1886 void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1887 const uint8_t *src_top,
1888 const uint8_t *src_left,
1889 ptrdiff_t stride, int c_idx, int mode)
1892 hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1893 } else if (mode == 26) {
1894 intra_predict_vert_32x32_msa(src_top, dst, stride);
1895 } else if (mode >= 18) {
1896 hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1899 hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
/*
 * Builds the top/left reference sample arrays for a 16x16 8-bit intra block
 * at luma position (x0, y0) in plane c_idx, optionally smooths them with a
 * 3-tap filter, and invokes the planar / DC / angular predictor via s->hpc.
 * NOTE(review): many lines of this definition (declarations, conditions,
 * braces) are not shown here; the comments cover only the visible statements.
 */
1904 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1907 HEVCLocalContext *lc = s->HEVClc;
/* Per-plane subsampling shifts; block size in luma samples and min-TB units. */
1909 int hshift = s->ps.sps->hshift[c_idx];
1910 int vshift = s->ps.sps->vshift[c_idx];
1911 int size_in_luma_h = 16 << hshift;
1912 int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1913 int size_in_luma_v = 16 << vshift;
1914 int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
/* Block position in this plane's samples, and in min-TB units (masked). */
1915 int x = x0 >> hshift;
1916 int y = y0 >> vshift;
1917 int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1918 int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1921 s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
/* Row pitch of the plane and pointer to the block's first sample. */
1923 ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1924 uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1926 int min_pu_width = s->ps.sps->min_pu_width;
/* Chroma planes (c_idx != 0) use intra_pred_mode_c, luma intra_pred_mode. */
1928 enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1929 lc->tu.intra_pred_mode;
/* Reference buffers hold 2 * 32 samples plus one leading element, so index
 * -1 of the working pointers below is valid. */
1931 uint8_t left_array[2 * 32 + 1];
1932 uint8_t filtered_left_array[2 * 32 + 1];
1933 uint8_t top_array[2 * 32 + 1];
1934 uint8_t filtered_top_array[2 * 32 + 1];
1936 uint8_t *left = left_array + 1;
1937 uint8_t *top = top_array + 1;
1938 uint8_t *filtered_left = filtered_left_array + 1;
1939 uint8_t *filtered_top = filtered_top_array + 1;
/* Neighbour availability, seeded from lc->na (the bottom-left and up-right
 * expressions continue on lines not shown here). */
1940 int cand_bottom_left = lc->na.cand_bottom_left
1942 s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1943 (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1944 int cand_left = lc->na.cand_left;
1945 int cand_up_left = lc->na.cand_up_left;
1946 int cand_up = lc->na.cand_up;
1947 int cand_up_right = lc->na.cand_up_right
1949 s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1950 ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
/* Bottom-left / top-right extents: the 2x block extent is clamped to the
 * picture height/width, reduced by the block's own extent, then converted to
 * this plane's resolution. */
1952 int bottom_left_size =
1953 (((y0 + 2 * size_in_luma_v) >
1954 (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1955 2 * size_in_luma_v)) -
1956 (y0 + size_in_luma_v)) >> vshift;
1957 int top_right_size =
1958 (((x0 + 2 * size_in_luma_h) >
1959 (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1960 (x0 + size_in_luma_h)) >> hshift;
/* Constrained intra prediction: the visible code inspects
 * s->ref->tab_mvf[...].pred_flag == PF_INTRA for the neighbouring PUs. */
1962 if (s->ps.pps->constrained_intra_pred_flag == 1) {
1963 int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1964 int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
/* Non-zero when x0 / y0 is aligned to the minimum PU size. */
1965 int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1966 int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1967 if (!size_in_luma_pu_h)
1968 size_in_luma_pu_h++;
1969 if (cand_bottom_left == 1 && on_pu_edge_x) {
1970 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1972 ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1974 ((size_in_luma_pu_v) >
1975 (s->ps.sps->min_pu_height -
1976 y_bottom_pu) ? (s->ps.sps->min_pu_height -
1977 y_bottom_pu) : (size_in_luma_pu_v));
1978 cand_bottom_left = 0;
1979 for (i = 0; i < max; i += 2)
1981 ((s->ref->tab_mvf[(x_left_pu) +
1983 i) * min_pu_width]).pred_flag ==
1986 if (cand_left == 1 && on_pu_edge_x) {
1987 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1988 int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1990 ((size_in_luma_pu_v) >
1991 (s->ps.sps->min_pu_height -
1992 y_left_pu) ? (s->ps.sps->min_pu_height -
1993 y_left_pu) : (size_in_luma_pu_v));
1995 for (i = 0; i < max; i += 2)
1997 ((s->ref->tab_mvf[(x_left_pu) +
1999 i) * min_pu_width]).pred_flag ==
2002 if (cand_up_left == 1) {
2003 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2004 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2006 (s->ref->tab_mvf[(x_left_pu) +
2007 (y_top_pu) * min_pu_width]).pred_flag ==
2010 if (cand_up == 1 && on_pu_edge_y) {
2011 int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2012 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2014 ((size_in_luma_pu_h) >
2015 (s->ps.sps->min_pu_width -
2016 x_top_pu) ? (s->ps.sps->min_pu_width -
2017 x_top_pu) : (size_in_luma_pu_h));
2019 for (i = 0; i < max; i += 2)
2021 ((s->ref->tab_mvf[(x_top_pu + i) +
2023 min_pu_width]).pred_flag == PF_INTRA);
2025 if (cand_up_right == 1 && on_pu_edge_y) {
2026 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2028 ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2030 ((size_in_luma_pu_h) >
2031 (s->ps.sps->min_pu_width -
2032 x_right_pu) ? (s->ps.sps->min_pu_width -
2033 x_right_pu) : (size_in_luma_pu_h));
2035 for (i = 0; i < max; i += 2)
2037 ((s->ref->tab_mvf[(x_right_pu + i) +
2039 min_pu_width]).pred_flag == PF_INTRA);
/* Store four vectors of the value 128 at 16-byte steps into left and top
 * (the guarding condition, if any, is not visible here). */
2042 vec0 = (v16u8) __msa_ldi_b(128);
2044 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2046 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
/* Top-left corner sample: one row up and one column left of src. */
2051 left[-1] = src[(-1) + stride * (-1)];
/* Row directly above the block. */
2055 vec0 = LD_UB(src - stride);
/* Top-right samples when available: 16 more bytes from the row above. */
2058 if (cand_up_right) {
2059 vec0 = LD_UB(src - stride + 16);
2060 ST_UB(vec0, (top + 16));
2064 ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2066 for (i = 0; i < (16 - top_right_size); i += 4)
2067 ((((union unaligned_32 *) (top + 16 + top_right_size +
/* Left column: one sample per row at x = -1. */
2072 for (i = 0; i < 16; i++)
2073 left[i] = src[(-1) + stride * (i)];
/* Bottom-left samples when available: rows 16 .. 16 + bottom_left_size - 1. */
2074 if (cand_bottom_left) {
2075 for (i = 16; i < 16 + bottom_left_size; i++)
2076 left[i] = src[(-1) + stride * (i)];
2079 ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2081 for (i = 0; i < (16 - bottom_left_size); i += 4)
2082 ((((union unaligned_32 *) (left + 16 + bottom_left_size +
/* Constrained intra prediction path; size_max_x / size_max_y bound the
 * reference extent by the picture dimensions. */
2087 if (s->ps.pps->constrained_intra_pred_flag == 1) {
2088 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2091 x0 + ((2 * 16) << hshift) <
2092 s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2094 y0 + ((2 * 16) << vshift) <
2095 s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
/* Index of the last left-column sample that was read. */
2096 int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2097 if (!cand_up_right) {
2098 size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2099 16 : (s->ps.sps->width - x0) >> hshift;
2101 if (!cand_bottom_left) {
2102 size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2103 16 : (s->ps.sps->height - y0) >> vshift;
2105 if (cand_bottom_left || cand_left || cand_up_left) {
2108 !((s->ref->tab_mvf[(((x0 +
2109 ((-1) << hshift)) >> s->ps.sps->
2110 log2_min_pu_size)) + (((y0 +
2115 * min_pu_width]).pred_flag ==
2119 ((s->ref->tab_mvf[(((x0 +
2120 ((-1) << hshift)) >> s->ps.sps->
2121 log2_min_pu_size)) + (((y0 + ((j)
2126 * min_pu_width]).pred_flag == PF_INTRA)) {
2128 while (j < size_max_x
2130 !((s->ref->tab_mvf[(((x0 +
2131 ((j) << hshift)) >> s->ps.sps->
2132 log2_min_pu_size)) + (((y0 +
2138 * min_pu_width]).pred_flag ==
2141 for (i = j; i > (j) - (j + 1); i--)
2143 ((s->ref->tab_mvf[(((x0 +
2145 1) << hshift)) >> s->ps.sps->
2146 log2_min_pu_size)) + (((y0 +
2152 * min_pu_width]).pred_flag ==
2154 top[i - 1] = top[i];
2159 while (j < size_max_x
2161 !((s->ref->tab_mvf[(((x0 +
2162 ((j) << hshift)) >> s->ps.sps->
2163 log2_min_pu_size)) + (((y0 + ((-1)
2168 * min_pu_width]).pred_flag ==
2173 for (i = j; i > (j) - (j + 1); i--)
2175 ((s->ref->tab_mvf[(((x0 +
2178 s->ps.sps->log2_min_pu_size))
2182 s->ps.sps->log2_min_pu_size))
2184 min_pu_width]).pred_flag ==
2186 top[i - 1] = top[i];
2188 for (i = j; i > (j) - (j); i--)
2190 ((s->ref->tab_mvf[(((x0 +
2193 s->ps.sps->log2_min_pu_size))
2197 s->ps.sps->log2_min_pu_size))
2199 min_pu_width]).pred_flag ==
2201 top[i - 1] = top[i];
2207 if (cand_bottom_left || cand_left) {
/* Multiplying a byte by 0x01010101U replicates it into all four bytes of a
 * 32-bit word, used for 4-sample fills. */
2208 a = ((left[-1]) * 0x01010101U);
2209 for (i = 0; i < (0) + (size_max_y); i += 4)
2211 ((s->ref->tab_mvf[(((x0 +
2212 ((-1) << hshift)) >> s->ps.sps->
2213 log2_min_pu_size)) + (((y0 +
2218 * min_pu_width]).pred_flag ==
2220 ((((union unaligned_32 *) (&left[i]))->l) = (a));
2222 a = ((left[i + 3]) * 0x01010101U);
2225 vec0 = (v16u8) __msa_fill_b(left[-1]);
2229 if (!cand_bottom_left) {
/* Extend the left column downwards by repeating left[15]. */
2231 vec0 = (v16u8) __msa_fill_b(left[15]);
2233 ST_UB(vec0, (left + 16));
2235 if (x0 != 0 && y0 != 0) {
2236 a = ((left[size_max_y - 1]) * 0x01010101U);
2237 for (i = (size_max_y - 1);
2238 i > (size_max_y - 1) - (size_max_y); i -= 4)
2240 ((s->ref->tab_mvf[(((x0 +
2241 ((-1) << hshift)) >> s->ps.sps->
2242 log2_min_pu_size)) + (((y0 +
2248 * min_pu_width]).pred_flag ==
2250 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2252 a = ((left[i - 3]) * 0x01010101U);
2254 ((s->ref->tab_mvf[(((x0 +
2255 ((-1) << hshift)) >> s->ps.sps->
2256 log2_min_pu_size)) + (((y0 + ((-1)
2261 * min_pu_width]).pred_flag == PF_INTRA))
2263 } else if (x0 == 0) {
/* At the left picture edge the left column is zero-filled, 4 bytes at a
 * time. */
2265 uint32_t pix = ((0) * 0x01010101U);
2266 for (i = 0; i < (size_max_y); i += 4)
2267 ((((union unaligned_32 *) (left + i))->l) = (pix));
2270 a = ((left[size_max_y - 1]) * 0x01010101U);
2271 for (i = (size_max_y - 1);
2272 i > (size_max_y - 1) - (size_max_y); i -= 4)
2274 ((s->ref->tab_mvf[(((x0 +
2275 ((-1) << hshift)) >> s->ps.sps->
2276 log2_min_pu_size)) + (((y0 +
2282 * min_pu_width]).pred_flag ==
2284 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2286 a = ((left[i - 3]) * 0x01010101U);
2290 a = ((left[-1]) * 0x01010101U);
2291 for (i = 0; i < (0) + (size_max_x); i += 4)
2293 ((s->ref->tab_mvf[(((x0 +
2294 ((i) << hshift)) >> s->ps.sps->
2295 log2_min_pu_size)) + (((y0 + ((-1)
2300 * min_pu_width]).pred_flag ==
2302 ((((union unaligned_32 *) (&top[i]))->l) = (a));
2304 a = ((top[i + 3]) * 0x01010101U);
/* Substitution of unavailable neighbours: each branch below splats one known
 * sample across the missing ranges (parts of each branch are not visible). */
2309 if (!cand_bottom_left) {
2311 vec0 = (v16u8) __msa_fill_b(left[15]);
2313 ST_UB(vec0, (left + 16));
2315 } else if (cand_up_left) {
2316 vec0 = (v16u8) __msa_fill_b(left[-1]);
2318 ST_UB2(vec0, vec0, left, 16);
2321 } else if (cand_up) {
2324 vec0 = (v16u8) __msa_fill_b(left[-1]);
2326 ST_UB2(vec0, vec0, left, 16);
2330 } else if (cand_up_right) {
2331 vec0 = (v16u8) __msa_fill_b(top[16]);
2337 ST_UB2(vec0, vec0, left, 16);
/* Fill 32 bytes of both top and left with 128. */
2344 vec0 = (v16u8) __msa_ldi_b(128);
2346 ST_UB2(vec0, vec0, top, 16);
2347 ST_UB2(vec0, vec0, left, 16);
2352 vec0 = (v16u8) __msa_fill_b(left[16]);
2355 if (!cand_up_left) {
2359 vec0 = (v16u8) __msa_fill_b(left[-1]);
2362 if (!cand_up_right) {
/* Extend the top row to the right by repeating top[15]. */
2363 vec0 = (v16u8) __msa_fill_b(top[15]);
2364 ST_UB(vec0, (top + 16));
/* Reference smoothing: skipped when the SPS disables it; applied only for
 * luma or when chroma_format_idc == 3. */
2370 if (!s->ps.sps->intra_smoothing_disabled_flag
2371 && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2372 if (mode != INTRA_DC && 16 != 4) {
/* min_dist_vert_hor = min(|mode - 26|, |mode - 10|); it is compared against
 * table entry [4 - 3], i.e. the value 1. */
2373 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2374 int min_dist_vert_hor =
2375 (((((int) (mode - 26U)) >=
2376 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2377 ((((int) (mode - 10U)) >=
2378 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2379 ? ((((int) (mode - 10U)) >=
2380 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381 : ((((int) (mode - 26U)) >=
2382 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2383 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
/* 3-tap [1 2 1] / 4 filter with rounding; the last sample of each array is
 * copied unfiltered. */
2384 filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2385 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2386 for (i = 2 * 16 - 2; i >= 0; i--)
2387 filtered_left[i] = (left[i + 1] + 2 * left[i] +
2388 left[i - 1] + 2) >> 2;
2391 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2392 for (i = 2 * 16 - 2; i >= 0; i--)
2393 filtered_top[i] = (top[i + 1] + 2 * top[i] +
2394 top[i - 1] + 2) >> 2;
/* Switch the left reference to its filtered copy. */
2395 left = filtered_left;
/* Dispatch through s->hpc: table index 4 - 2 for planar and angular, log2
 * size 4 for DC (the mode selection itself is not visible here). */
2403 s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2404 (uint8_t *) left, stride);
2407 s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2408 (uint8_t *) left, stride, 4, c_idx);
2411 s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2412 (uint8_t *) left, stride, c_idx, mode);
2417 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2420 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2421 v8i16 res0, res1, res2, res3;
2422 v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2423 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2424 HEVCLocalContext *lc = s->HEVClc;
2426 int hshift = s->ps.sps->hshift[c_idx];
2427 int vshift = s->ps.sps->vshift[c_idx];
2428 int size_in_luma_h = 32 << hshift;
2429 int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2430 int size_in_luma_v = 32 << vshift;
2431 int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2432 int x = x0 >> hshift;
2433 int y = y0 >> vshift;
2434 int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2435 int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2438 s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2440 ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2441 uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2443 int min_pu_width = s->ps.sps->min_pu_width;
2445 enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2446 lc->tu.intra_pred_mode;
2448 uint8_t left_array[2 * 32 + 1];
2449 uint8_t filtered_left_array[2 * 32 + 1];
2450 uint8_t top_array[2 * 32 + 1];
2451 uint8_t filtered_top_array[2 * 32 + 1];
2453 uint8_t *left = left_array + 1;
2454 uint8_t *top = top_array + 1;
2455 uint8_t *filtered_left = filtered_left_array + 1;
2456 uint8_t *filtered_top = filtered_top_array + 1;
2457 int cand_bottom_left = lc->na.cand_bottom_left
2459 s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2460 (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2461 int cand_left = lc->na.cand_left;
2462 int cand_up_left = lc->na.cand_up_left;
2463 int cand_up = lc->na.cand_up;
2464 int cand_up_right = lc->na.cand_up_right
2466 s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2467 ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2469 int bottom_left_size =
2470 (((y0 + 2 * size_in_luma_v) >
2471 (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2472 2 * size_in_luma_v)) -
2473 (y0 + size_in_luma_v)) >> vshift;
2474 int top_right_size =
2475 (((x0 + 2 * size_in_luma_h) >
2476 (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2477 (x0 + size_in_luma_h)) >> hshift;
2479 if (s->ps.pps->constrained_intra_pred_flag == 1) {
2480 int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2481 int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2482 int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2483 int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2484 if (!size_in_luma_pu_h)
2485 size_in_luma_pu_h++;
2486 if (cand_bottom_left == 1 && on_pu_edge_x) {
2487 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2489 ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2491 ((size_in_luma_pu_v) >
2492 (s->ps.sps->min_pu_height -
2493 y_bottom_pu) ? (s->ps.sps->min_pu_height -
2494 y_bottom_pu) : (size_in_luma_pu_v));
2495 cand_bottom_left = 0;
2496 for (i = 0; i < max; i += 2)
2498 ((s->ref->tab_mvf[(x_left_pu) +
2500 i) * min_pu_width]).pred_flag ==
2503 if (cand_left == 1 && on_pu_edge_x) {
2504 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2505 int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2507 ((size_in_luma_pu_v) >
2508 (s->ps.sps->min_pu_height -
2509 y_left_pu) ? (s->ps.sps->min_pu_height -
2510 y_left_pu) : (size_in_luma_pu_v));
2512 for (i = 0; i < max; i += 2)
2514 ((s->ref->tab_mvf[(x_left_pu) +
2516 i) * min_pu_width]).pred_flag ==
2519 if (cand_up_left == 1) {
2520 int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2521 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2523 (s->ref->tab_mvf[(x_left_pu) +
2524 (y_top_pu) * min_pu_width]).pred_flag ==
2527 if (cand_up == 1 && on_pu_edge_y) {
2528 int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2529 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2531 ((size_in_luma_pu_h) >
2532 (s->ps.sps->min_pu_width -
2533 x_top_pu) ? (s->ps.sps->min_pu_width -
2534 x_top_pu) : (size_in_luma_pu_h));
2536 for (i = 0; i < max; i += 2)
2538 ((s->ref->tab_mvf[(x_top_pu + i) +
2540 min_pu_width]).pred_flag == PF_INTRA);
2542 if (cand_up_right == 1 && on_pu_edge_y) {
2543 int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2545 ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2547 ((size_in_luma_pu_h) >
2548 (s->ps.sps->min_pu_width -
2549 x_right_pu) ? (s->ps.sps->min_pu_width -
2550 x_right_pu) : (size_in_luma_pu_h));
2552 for (i = 0; i < max; i += 2)
2554 ((s->ref->tab_mvf[(x_right_pu + i) +
2556 min_pu_width]).pred_flag == PF_INTRA);
2558 vec0 = (v16u8) __msa_ldi_b(128);
2560 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2561 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2566 left[-1] = src[(-1) + stride * (-1)];
2570 LD_UB2(src - stride, 16, vec0, vec1);
2571 ST_UB2(vec0, vec1, top, 16);
2574 if (cand_up_right) {
2575 LD_UB2(src - stride + 32, 16, vec0, vec1);
2576 ST_UB2(vec0, vec1, (top + 32), 16);
2579 ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2581 for (i = 0; i < (32 - top_right_size); i += 4)
2582 ((((union unaligned_32 *) (top + 32 + top_right_size +
2587 for (i = 0; i < 32; i++)
2588 left[i] = src[(-1) + stride * (i)];
2589 if (cand_bottom_left) {
2590 for (i = 32; i < 32 + bottom_left_size; i++)
2591 left[i] = src[(-1) + stride * (i)];
2594 ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2596 for (i = 0; i < (32 - bottom_left_size); i += 4)
2597 ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2602 if (s->ps.pps->constrained_intra_pred_flag == 1) {
2603 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2606 x0 + ((2 * 32) << hshift) <
2607 s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2609 y0 + ((2 * 32) << vshift) <
2610 s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2611 int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2612 if (!cand_up_right) {
2613 size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2614 32 : (s->ps.sps->width - x0) >> hshift;
2616 if (!cand_bottom_left) {
2617 size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2618 32 : (s->ps.sps->height - y0) >> vshift;
2620 if (cand_bottom_left || cand_left || cand_up_left) {
2623 !((s->ref->tab_mvf[(((x0 +
2624 ((-1) << hshift)) >> s->ps.sps->
2625 log2_min_pu_size)) + (((y0 +
2630 * min_pu_width]).pred_flag ==
2634 ((s->ref->tab_mvf[(((x0 +
2635 ((-1) << hshift)) >> s->ps.sps->
2636 log2_min_pu_size)) + (((y0 + ((j)
2641 * min_pu_width]).pred_flag == PF_INTRA)) {
2643 while (j < size_max_x
2645 !((s->ref->tab_mvf[(((x0 +
2646 ((j) << hshift)) >> s->ps.sps->
2647 log2_min_pu_size)) + (((y0 +
2653 * min_pu_width]).pred_flag ==
2656 for (i = j; i > (j) - (j + 1); i--)
2658 ((s->ref->tab_mvf[(((x0 +
2660 1) << hshift)) >> s->ps.sps->
2661 log2_min_pu_size)) + (((y0 +
2667 * min_pu_width]).pred_flag ==
2669 top[i - 1] = top[i];
2674 while (j < size_max_x
2676 !((s->ref->tab_mvf[(((x0 +
2677 ((j) << hshift)) >> s->ps.sps->
2678 log2_min_pu_size)) + (((y0 + ((-1)
2683 * min_pu_width]).pred_flag ==
2688 for (i = j; i > (j) - (j + 1); i--)
2690 ((s->ref->tab_mvf[(((x0 +
2693 s->ps.sps->log2_min_pu_size))
2697 s->ps.sps->log2_min_pu_size))
2699 min_pu_width]).pred_flag ==
2701 top[i - 1] = top[i];
2703 for (i = j; i > (j) - (j); i--)
2705 ((s->ref->tab_mvf[(((x0 +
2708 s->ps.sps->log2_min_pu_size))
2712 s->ps.sps->log2_min_pu_size))
2714 min_pu_width]).pred_flag ==
2716 top[i - 1] = top[i];
2722 if (cand_bottom_left || cand_left) {
2723 a = ((left[-1]) * 0x01010101U);
2724 for (i = 0; i < (0) + (size_max_y); i += 4)
2726 ((s->ref->tab_mvf[(((x0 +
2727 ((-1) << hshift)) >> s->ps.sps->
2728 log2_min_pu_size)) + (((y0 +
2733 * min_pu_width]).pred_flag ==
2735 ((((union unaligned_32 *) (&left[i]))->l) = (a));
2737 a = ((left[i + 3]) * 0x01010101U);
2740 vec0 = (v16u8) __msa_fill_b(left[-1]);
2742 ST_UB2(vec0, vec0, left, 16);
2744 if (!cand_bottom_left) {
2745 vec0 = (v16u8) __msa_fill_b(left[31]);
2747 ST_UB2(vec0, vec0, (left + 32), 16);
2749 if (x0 != 0 && y0 != 0) {
2750 a = ((left[size_max_y - 1]) * 0x01010101U);
2751 for (i = (size_max_y - 1);
2752 i > (size_max_y - 1) - (size_max_y); i -= 4)
2754 ((s->ref->tab_mvf[(((x0 +
2755 ((-1) << hshift)) >> s->ps.sps->
2756 log2_min_pu_size)) + (((y0 +
2762 * min_pu_width]).pred_flag ==
2764 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2766 a = ((left[i - 3]) * 0x01010101U);
2768 ((s->ref->tab_mvf[(((x0 +
2769 ((-1) << hshift)) >> s->ps.sps->
2770 log2_min_pu_size)) + (((y0 + ((-1)
2775 * min_pu_width]).pred_flag == PF_INTRA))
2777 } else if (x0 == 0) {
2779 uint32_t pix = ((0) * 0x01010101U);
2780 for (i = 0; i < (size_max_y); i += 4)
2781 ((((union unaligned_32 *) (left + i))->l) = (pix));
2784 a = ((left[size_max_y - 1]) * 0x01010101U);
2785 for (i = (size_max_y - 1);
2786 i > (size_max_y - 1) - (size_max_y); i -= 4)
2788 ((s->ref->tab_mvf[(((x0 +
2789 ((-1) << hshift)) >> s->ps.sps->
2790 log2_min_pu_size)) + (((y0 +
2796 * min_pu_width]).pred_flag ==
2798 ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2800 a = ((left[i - 3]) * 0x01010101U);
2804 a = ((left[-1]) * 0x01010101U);
2805 for (i = 0; i < (0) + (size_max_x); i += 4)
2807 ((s->ref->tab_mvf[(((x0 +
2808 ((i) << hshift)) >> s->ps.sps->
2809 log2_min_pu_size)) + (((y0 + ((-1)
2814 * min_pu_width]).pred_flag ==
2816 ((((union unaligned_32 *) (&top[i]))->l) = (a));
2818 a = ((top[i + 3]) * 0x01010101U);
2823 if (!cand_bottom_left) {
2825 vec0 = (v16u8) __msa_fill_b(left[31]);
2827 ST_UB2(vec0, vec0, (left + 32), 16);
2828 } else if (cand_up_left) {
2829 vec0 = (v16u8) __msa_fill_b(left[-1]);
2831 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2834 } else if (cand_up) {
2837 vec0 = (v16u8) __msa_fill_b(left[-1]);
2839 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2843 } else if (cand_up_right) {
2844 vec0 = (v16u8) __msa_fill_b(top[32]);
2846 ST_UB2(vec0, vec0, top, 16);
2850 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2858 vec0 = (v16u8) __msa_ldi_b(128);
2860 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2861 ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2866 vec0 = (v16u8) __msa_fill_b(left[32]);
2868 ST_UB2(vec0, vec0, left, 16);
2870 if (!cand_up_left) {
2874 vec0 = (v16u8) __msa_fill_b(left[-1]);
2876 ST_UB2(vec0, vec0, top, 16);
2878 if (!cand_up_right) {
2879 vec0 = (v16u8) __msa_fill_b(top[31]);
2881 ST_UB2(vec0, vec0, (top + 32), 16);
2887 if (!s->ps.sps->intra_smoothing_disabled_flag
2888 && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2889 if (mode != INTRA_DC && 32 != 4) {
2890 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2891 int min_dist_vert_hor =
2892 (((((int) (mode - 26U)) >=
2893 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2894 ((((int) (mode - 10U)) >=
2895 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2896 ? ((((int) (mode - 10U)) >=
2897 0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898 : ((((int) (mode - 26U)) >=
2899 0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2900 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2901 int threshold = 1 << (8 - 5);
2902 if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2904 && ((top[-1] + top[63] - 2 * top[31]) >=
2905 0 ? (top[-1] + top[63] -
2906 2 * top[31]) : (-(top[-1] + top[63] -
2907 2 * top[31]))) < threshold
2908 && ((left[-1] + left[63] - 2 * left[31]) >=
2909 0 ? (left[-1] + left[63] -
2910 2 * left[31]) : (-(left[-1] + left[63] -
2911 2 * left[31]))) < threshold) {
2914 filtered_top[-1] = top[-1];
2915 filtered_top[63] = top[63];
2918 for (i = 0; i < 63; i++) {
2920 ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2923 tmp0 = __msa_fill_h(top[-1]);
2924 tmp1 = __msa_fill_h(top[63]);
2926 tmp2 = mul_val0 - 8;
2927 tmp3 = mul_val0 - 16;
2928 tmp4 = mul_val0 - 24;
2929 tmp5 = mul_val1 + 8;
2930 tmp6 = mul_val1 + 16;
2931 tmp7 = mul_val1 + 24;
2933 res0 = mul_val0 * tmp0;
2937 res0 += mul_val1 * tmp1;
2938 res1 += tmp5 * tmp1;
2939 res2 += tmp6 * tmp1;
2940 res3 += tmp7 * tmp1;
2942 res0 = __msa_srari_h(res0, 6);
2943 res1 = __msa_srari_h(res1, 6);
2944 res2 = __msa_srari_h(res2, 6);
2945 res3 = __msa_srari_h(res3, 6);
2947 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2948 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2950 ST_UB2(vec0, vec1, filtered_top, 16);
2952 res0 = mul_val0 - 32;
2953 tmp2 = mul_val0 - 40;
2954 tmp3 = mul_val0 - 48;
2955 tmp4 = mul_val0 - 56;
2956 res3 = mul_val1 + 32;
2957 tmp5 = mul_val1 + 40;
2958 tmp6 = mul_val1 + 48;
2959 tmp7 = mul_val1 + 56;
2964 res0 += res3 * tmp1;
2966 res1 += tmp5 * tmp1;
2967 res2 += tmp6 * tmp1;
2968 res3 += tmp7 * tmp1;
2970 res0 = __msa_srari_h(res0, 6);
2971 res1 = __msa_srari_h(res1, 6);
2972 res2 = __msa_srari_h(res2, 6);
2973 res3 = __msa_srari_h(res3, 6);
2975 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2976 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2978 ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2980 filtered_top[63] = top[63];
2982 tmp0 = __msa_fill_h(left[-1]);
2983 tmp1 = __msa_fill_h(left[63]);
2985 tmp2 = mul_val0 - 8;
2986 tmp3 = mul_val0 - 16;
2987 tmp4 = mul_val0 - 24;
2988 tmp5 = mul_val1 + 8;
2989 tmp6 = mul_val1 + 16;
2990 tmp7 = mul_val1 + 24;
2992 res0 = mul_val0 * tmp0;
2996 res0 += mul_val1 * tmp1;
2997 res1 += tmp5 * tmp1;
2998 res2 += tmp6 * tmp1;
2999 res3 += tmp7 * tmp1;
3001 res0 = __msa_srari_h(res0, 6);
3002 res1 = __msa_srari_h(res1, 6);
3003 res2 = __msa_srari_h(res2, 6);
3004 res3 = __msa_srari_h(res3, 6);
3006 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3007 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3009 ST_UB2(vec0, vec1, left, 16);
3011 res0 = mul_val0 - 32;
3012 tmp2 = mul_val0 - 40;
3013 tmp3 = mul_val0 - 48;
3014 tmp4 = mul_val0 - 56;
3015 res3 = mul_val1 + 32;
3016 tmp5 = mul_val1 + 40;
3017 tmp6 = mul_val1 + 48;
3018 tmp7 = mul_val1 + 56;
3023 res0 += res3 * tmp1;
3025 res1 += tmp5 * tmp1;
3026 res2 += tmp6 * tmp1;
3027 res3 += tmp7 * tmp1;
3029 res0 = __msa_srari_h(res0, 6);
3030 res1 = __msa_srari_h(res1, 6);
3031 res2 = __msa_srari_h(res2, 6);
3032 res3 = __msa_srari_h(res3, 6);
3034 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3035 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3037 ST_UB2(vec0, vec1, (left + 32), 16);
3043 filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3044 filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3045 for (i = 2 * 32 - 2; i >= 0; i--)
3046 filtered_left[i] = (left[i + 1] + 2 * left[i] +
3047 left[i - 1] + 2) >> 2;
3050 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3051 for (i = 2 * 32 - 2; i >= 0; i--)
3052 filtered_top[i] = (top[i + 1] + 2 * top[i] +
3053 top[i - 1] + 2) >> 2;
3054 left = filtered_left;
3063 s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3064 (uint8_t *) left, stride);
3067 s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3068 (uint8_t *) left, stride, 5, c_idx);
3071 s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3072 (uint8_t *) left, stride, c_idx, mode);