2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * Blackfin Video Color Space Converters Operations
6 * convert I420 YV12 to RGB in various formats,
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
31 The following calculation is used for the conversion:
33 r = clipz((y-oy)*cy + crv*(v-128))
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
35 b = clipz((y-oy)*cy + cbu*(u-128))
37 y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
44 1) use byteop16m to subtract quad bytes; it operates on U8 data,
45 so the offsets need to be renormalized to 8 bits.
47 2) scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
50 3) compute into the accumulators cy*yx0, cy*yx1
52 4) compute each of the linear equations
53 r = clipz((y-oy)*cy + crv*(v-128))
55 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
57 b = clipz((y-oy)*cy + cbu*(u-128))
59 reuse of the accumulators requires that we actually multiply
60 twice: once with an addition and a second time with a subtraction.
62 because of this we need to compute the equations in the order R B
63 then G saving the writes for B in the case of 24/32 bit color
66 api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
74 Where coeffs have the following layout in memory.
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78 coeffs is a pointer to oy.
80 the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81 replication is used to simplify the internal algorithms for the dual mac architecture
84 All routines are exported with _ff_bfin_ as a symbol prefix
86 rough performance gain compared against -O3:
88 2779809/1484290 187.28%
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation on the reference codes on Blackfin I guess it must have
93 to do with the memory system.
/* Function framing macros.
 *
 * DEFUN(fname, where, interface): the visible directives export
 * _ff_bfin_<fname> as a global symbol and mark it as a function (STT_FUNC).
 * Call sites pass the C prototype as the `interface` argument.
 * NOTE(review): other lines of this macro are not part of this listing.
 *
 * DEFUN_END(fname): sets the size of _ff_bfin_<fname> to the distance from
 * its label to the current location. */
105 #define DEFUN(fname,where,interface) \
107 .global _ff_bfin_ ## fname; \
108 .type _ff_bfin_ ## fname, STT_FUNC; \
112 #define DEFUN_END(fname) \
113 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
/* Coefficient-table geometry in bytes (the table entries are uint32_t):
 *   COEFF_LEN        - 11 entries of 4 bytes.
 *   COEFF_REL_CY_OFF - 4 entries; loaded into m0 and applied by the
 *                      [i1++m0] access that reads gmask.
 * NOTE(review): presumably i1 addresses the table as a circular buffer of
 * COEFF_LEN bytes, so stepping 4 entries past gmask lands on cy -- confirm
 * against the routine setup code, which is not visible here. */
118 #define COEFF_LEN 11*4
119 #define COEFF_REL_CY_OFF 4*4
/* yuv2rgb565_line: convert planar YUV samples to packed RGB565.
 *
 * Arguments (see prototype): Y, U, V source pointers, out destination,
 * dW, and coeffs (the coefficient table walked through i1).
 *
 * Each pass of the hardware loop (lc0 = p0 iterations) works on the 4 Y
 * bytes in r0 and the 2 U + 2 V bytes in r1.  The first half uses y1,y0
 * (r5) with u0/v0 (r6.l/r7.l); the second half uses y3,y2 (r4) with u1/v1
 * (r6.h/r7.h).  For each half the accumulators are seeded with cy*y, a
 * chroma term is added to form R, subtracted again, and the same is done
 * for B; G is formed last by adding both of its chroma terms.
 *
 * NOTE(review): prologue, epilogue, the loop-start label and the mask/pack
 * instructions are not part of this listing; the comments below describe
 * only the instructions shown. */
125 DEFUN(yuv2rgb565_line,MEM,
126 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
140 m0 = COEFF_REL_CY_OFF; // step applied by the [i1++m0] gmask load
144 r1.l = w[i2++]; // 2u
145 r1.h = w[i3++]; // 2v
148 lsetup (.L0565, .L1565) lc0 = p0; // hardware loop, p0 iterations
151 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
152 r0 -- used to load 4ys
153 r1 -- used to load 2us,2vs
162 rrrrrrrr gggggggg bbbbbbbb
169 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // luma halves into r4/r5; r3 = oc
170 (r7,r6) = byteop16m (r1:0, r3:2) (r); // chroma halves into r6 (u) and r7 (v)
171 r5 = r5 << 2 (v); // y1,y0
172 r4 = r4 << 2 (v); // y3,y2
173 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
174 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
176 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // acc = cy*y1, cy*y0; r1 = crv
178 /* R = Y+ crv*(Cr-128) */
179 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // add crv*v0 to both accumulators
180 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the crv term; r5 = rmask
181 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cbu
185 /* B = Y+ cbu*(Cb-128) */
186 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); // add cbu*u0 to both accumulators
187 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // undo the cbu term; r5 = bmask
188 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cgu
193 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
194 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // add cgu*u0; r1 = cgv
195 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // add cgv*v0
196 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // r5 = gmask; i1 then advances by m0
200 [p1++]=r3 || r1=[i1++]; // write 32 bits of output; r1 = cy
204 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // acc = cy*y3, cy*y2; r1 = crv
206 /* R = Y+ crv*(Cr-128) */
207 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); // add crv*v1 to both accumulators
208 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // undo the crv term; r5 = rmask
209 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cbu
213 /* B = Y+ cbu*(Cb-128) */
214 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); // add cbu*u1 to both accumulators
215 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // undo the cbu term; r5 = bmask
216 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cgu
221 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
222 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // add cgu*u1; r1 = cgv
223 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // add cgv*v1; r5 = gmask
224 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // r0 = next 4 Y bytes (32-bit load)
225 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
228 [p1++]=r3 || r1.h = w[i3++]; // write 32 bits of output; r1.h = next 2v
229 .L1565: r2=[i1++]; // loop end: r2 = oy for the next pass
236 DEFUN_END(yuv2rgb565_line)
/* yuv2rgb555_line: RGB555 counterpart of the RGB565 line converter.
 *
 * Arguments (see prototype): Y, U, V source pointers, out destination,
 * dW, and coeffs (the coefficient table walked through i1).
 *
 * Each pass of the hardware loop (lc0 = p0 iterations) works on the 4 Y
 * bytes in r0 and the 2 U + 2 V bytes in r1.  The first half uses y1,y0
 * (r5) with u0/v0 (r6.l/r7.l); the second half uses y3,y2 (r4) with u1/v1
 * (r6.h/r7.h).  R and B are each formed by adding a chroma term to the
 * cy*y accumulators and then subtracting it again; G is formed last.
 * Among the visible instructions the only difference from the 565 routine
 * is the final vector shift (<< 2 here).
 *
 * NOTE(review): prologue, epilogue, the loop-start label and the mask/pack
 * instructions are not part of this listing; the comments below describe
 * only the instructions shown. */
238 DEFUN(yuv2rgb555_line,MEM,
239 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
253 m0 = COEFF_REL_CY_OFF; // step applied by the [i1++m0] gmask load
257 r1.l = w[i2++]; // 2u
258 r1.h = w[i3++]; // 2v
261 lsetup (.L0555, .L1555) lc0 = p0; // hardware loop, p0 iterations
264 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
265 r0 -- used to load 4ys
266 r1 -- used to load 2us,2vs
275 rrrrrrrr gggggggg bbbbbbbb
283 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // luma halves into r4/r5; r3 = oc
284 (r7,r6) = byteop16m (r1:0, r3:2) (r); // chroma halves into r6 (u) and r7 (v)
285 r5 = r5 << 2 (v); // y1,y0
286 r4 = r4 << 2 (v); // y3,y2
287 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
288 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
290 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // acc = cy*y1, cy*y0; r1 = crv
292 /* R = Y+ crv*(Cr-128) */
293 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // add crv*v0 to both accumulators
294 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the crv term; r5 = rmask
295 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cbu
299 /* B = Y+ cbu*(Cb-128) */
300 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); // add cbu*u0 to both accumulators
301 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // undo the cbu term; r5 = bmask
302 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cgu
307 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
308 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // add cgu*u0; r1 = cgv
309 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // add cgv*v0
310 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // r5 = gmask; i1 then advances by m0
314 [p1++]=r3 || r1=[i1++]; // write 32 bits of output; r1 = cy
318 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // acc = cy*y3, cy*y2; r1 = crv
320 /* R = Y+ crv*(Cr-128) */
321 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); // add crv*v1 to both accumulators
322 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // undo the crv term; r5 = rmask
323 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cbu
327 /* B = Y+ cbu*(Cb-128) */
328 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); // add cbu*u1 to both accumulators
329 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // undo the cbu term; r5 = bmask
330 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cgu
335 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
336 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // add cgu*u1; r1 = cgv
337 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // add cgv*v1; r5 = gmask
338 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
339 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
342 [p1++]=r3 || r1.h=w[i3++]; // write 32 bits of output; r1.h = next 2v
344 .L1555: r2=[i1++]; // loop end: r2 = oy for the next pass
351 DEFUN_END(yuv2rgb555_line)
/* yuv2rgb24_line: convert planar YUV samples to byte-per-component RGB.
 *
 * Arguments (see prototype): Y, U, V source pointers, out destination,
 * dW, and coeffs (fetched from the stack at [fp+ARG_COEFF]).
 *
 * The arithmetic follows the same scheme as the 16-bit routines: the
 * accumulators are seeded with cy*y, R and B are each formed by adding a
 * chroma term and subtracting it again, and G is formed last.  Results are
 * written one byte at a time through p1 and p2.  The B result is parked in
 * r3 so that its byte stores can be issued after those for G.
 *
 * NOTE(review): prologue, epilogue, loop labels and some byte stores are
 * not part of this listing; p1 and p2 presumably address two adjacent
 * output pixels -- confirm against the setup code. */
353 DEFUN(yuv2rgb24_line,MEM,
354 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
366 r0 = [fp+ARG_COEFF]; // coeff buffer
370 m0 = COEFF_REL_CY_OFF; // step applied by the [i1++m0] gmask load
374 r1.l = w[i2++]; // 2u
375 r1.h = w[i3++]; // 2v
378 lsetup (.L0888, .L1888) lc0 = p0; // hardware loop, p0 iterations
381 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
382 r0 -- used to load 4ys
383 r1 -- used to load 2us,2vs
391 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // luma halves into r4/r5; r3 = oc
392 (r7,r6) = byteop16m (r1:0, r3:2) (r); // chroma halves into r6 (u) and r7 (v)
393 r5 = r5 << 2 (v); // y1,y0
394 r4 = r4 << 2 (v); // y3,y2
395 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
396 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
399 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // acc = cy*y1, cy*y0; r1 = crv
401 /* R = Y+ crv*(Cr-128) */
402 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // add crv*v0 to both accumulators
403 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // undo the crv term; r5 = rmask
404 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cbu
405 r2=r2>>16 || B[p1++]=r2; // store one byte via p1; bring the upper half of r2 down
408 /* B = Y+ cbu*(Cb-128) */
409 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); // add cbu*u0 to both accumulators
410 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // undo the cbu term; r5 = bmask
411 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // B held in r3 until after G; r1 = cgu
413 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
414 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // add cgu*u0; r1 = cgv
415 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); // add cgv*v0
416 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
418 r2=r2>>16 || B[p1++]=r2; // store one G byte via p1; bring the upper half down
421 r3=r3>>16 || B[p1++]=r3; // store one B byte via p1; bring the upper half down
422 B[p2++]=r3 || r1=[i1++]; // store the other B byte via p2; r1 = cy
427 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // acc = cy*y3, cy*y2; r1 = crv
429 /* R = Y+ crv*(Cr-128) */
430 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); // add crv*v1 to both accumulators
431 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // undo the crv term; r5 = rmask
432 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // r1 = cbu
433 r2=r2>>16 || B[p1++]=r2; // store one byte via p1; bring the upper half of r2 down
436 /* B = Y+ cbu*(Cb-128) */
437 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); // add cbu*u1 to both accumulators
438 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // undo the cbu term; r5 = bmask
439 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // B held in r3 until after G; r1 = cgu
441 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
442 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // add cgu*u1; r1 = cgv
443 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); // add cgv*v1
444 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
445 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
446 B[p2++]=r2 || r1.l = w[i2++]; // 2u
447 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
448 B[p2++]=r3 || r2=[i1++]; // oy
458 DEFUN_END(yuv2rgb24_line)
/* Byte offsets from fp at which the *toyv12 routines read their
 * stack-passed arguments (used as [fp + ARG_x]). */
464 #define ARG_height 28
465 #define ARG_lumStride 32
466 #define ARG_chromStride 36
467 #define ARG_srcStride 40
/* uyvytoyv12: convert packed UYVY to planar YV12, two source lines at a time.
 *
 * Arguments (see prototype): src, ydst, udst, vdst, width, height,
 * lumStride, chromStride, srcStride.  Arguments beyond those arriving in
 * registers are read from the stack via [fp + ARG_x].
 *
 * Structure: the outer hardware loop (lc1 = p4, noted as H/2) handles a
 * pair of lines; the inner loop (lc0 = p5, noted as W/4) reads 8 bytes from
 * each of the two source lines (i0 top, i1 bottom), writes 4 luma bytes to
 * each of the two luma lines (p0, p1) and one 16-bit word each to the U
 * (i2) and V (i3) outputs.  The source reads run one group ahead of the
 * stores, which is why srcStride is corrected by -8.
 *
 * NOTE(review): several instructions (pointer setup, the byte shifts ahead
 * of bytepack, the end-of-line pointer updates, loop labels and the return)
 * are not part of this listing.  byteop1p is the Blackfin quad 8-bit
 * add-and-average operation, so chroma is presumably averaged across the
 * two lines -- confirm against the ISA reference. */
469 DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
470 long width, long height,
471 long lumStride, long chromStride, long srcStride)):
473 [--sp] = (r7:4,p5:4); // save r4-r7 and p4-p5
475 p0 = r1; // Y top even
478 r2 = [fp + ARG_vdst]; // vdst argument from the stack
481 r1 = [fp + ARG_srcStride]; // srcStride argument from the stack
483 r1 += -8; // i0,i1 is pre read need to correct
486 i0 = r0; // uyvy_T even
487 i1 = r2; // uyvy_B odd
489 p2 = [fp + ARG_lumStride]; // lumStride argument from the stack
490 p1 = p0 + p2; // Y bot odd
492 p5 = [fp + ARG_width]; // width argument; becomes the inner loop count
493 p4 = [fp + ARG_height]; // height argument; becomes the outer loop count
498 r2 = [fp + ARG_chromStride]; // chromStride argument from the stack
503 /* I0,I1 - src input line pointers
504 * p0,p1 - luma output line pointers
509 lsetup (0f, 1f) lc1 = p4; // H/2
510 0: r0 = [i0++] || r2 = [i1++]; // first 4 bytes of the top and bottom lines
511 r1 = [i0++] || r3 = [i1++]; // next 4 bytes of the top and bottom lines
512 r4 = byteop1p(r1:0, r3:2); // combine top (r1:0) and bottom (r3:2) bytes
513 r5 = byteop1p(r1:0, r3:2) (r); // same with the register pairs taken in reverse order
514 lsetup (2f, 3f) lc0 = p5; // W/4
519 r0 = bytepack(r0, r1); // gather 4 luma bytes of the top line
520 r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
521 r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
522 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // pre-read the next group
523 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6.l = uu, r6.h = vv
524 r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
525 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
534 (r7:4,p5:4) = [sp++]; // restore r4-r7 and p4-p5
537 DEFUN_END(uyvytoyv12)
/* yuyvtoyv12: convert packed YUYV to planar YV12, two source lines at a time.
 *
 * Arguments (see prototype): src, ydst, udst, vdst, width, height,
 * lumStride, chromStride, srcStride.  Arguments beyond those arriving in
 * registers are read from the stack via [fp + ARG_x].
 *
 * Structure: the outer hardware loop (lc1 = p4, noted as H/2) handles a
 * pair of lines; the inner loop (lc0 = p5, noted as W/4) reads 8 bytes from
 * each of the two source lines (i0 top, i1 bottom), writes 4 luma bytes to
 * each of the two luma lines (p0, p1) and one 16-bit word each to the U
 * (i2) and V (i3) outputs.  Luma is gathered with bytepack before the
 * registers are shifted right by 8 to expose the chroma bytes.
 *
 * NOTE(review): several instructions (pointer setup, the shifts of r2/r3,
 * the end-of-line pointer updates, loop-end label and the return) are not
 * part of this listing.  byteop1p is the Blackfin quad 8-bit
 * add-and-average operation, so chroma is presumably averaged across the
 * two lines -- confirm against the ISA reference. */
539 DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
540 long width, long height,
541 long lumStride, long chromStride, long srcStride)):
543 [--sp] = (r7:4,p5:4); // save r4-r7 and p4-p5
545 p0 = r1; // Y top even
548 r2 = [fp + ARG_vdst]; // vdst argument from the stack
551 r1 = [fp + ARG_srcStride]; // srcStride argument from the stack
553 r1 += -8; // i0,i1 is pre read need to correct
556 i0 = r0; // yuyv_T even (top source line)
557 i1 = r2; // yuyv_B odd (bottom source line)
559 p2 = [fp + ARG_lumStride]; // lumStride argument from the stack
560 p1 = p0 + p2; // Y bot odd
562 p5 = [fp + ARG_width]; // width argument; becomes the inner loop count
563 p4 = [fp + ARG_height]; // height argument; becomes the outer loop count
568 r2 = [fp + ARG_chromStride]; // chromStride argument from the stack
573 /* I0,I1 - src input line pointers
574 * p0,p1 - luma output line pointers
579 lsetup (0f, 1f) lc1 = p4; // H/2
580 0: r0 = [i0++] || r2 = [i1++]; // first 4 bytes of the top and bottom lines
581 r1 = [i0++] || r3 = [i1++]; // next 4 bytes of the top and bottom lines
582 r4 = bytepack(r0, r1); // gather 4 luma bytes of the top line
583 r5 = bytepack(r2, r3); // gather 4 luma bytes of the bottom line
584 lsetup (2f, 3f) lc0 = p5; // W/4
585 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
586 r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
589 r4 = byteop1p(r1:0, r3:2); // combine top (r1:0) and bottom (r3:2) bytes
590 r5 = byteop1p(r1:0, r3:2) (r); // same with the register pairs taken in reverse order
591 r6 = pack(r5.l, r4.l);
592 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // pre-read the next group
593 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6.l = uu, r6.h = vv
594 r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
595 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
604 (r7:4,p5:4) = [sp++]; // restore r4-r7 and p4-p5
607 DEFUN_END(yuyvtoyv12)