]> git.sesse.net Git - ffmpeg/blob - libswscale/internal_bfin.S
Blackfin optimized YUV420 to RGB CSC Color Space Converters.
[ffmpeg] / libswscale / internal_bfin.S
1 /*
2  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3  *                    April 20, 2007
4  *
5  * Blackfin Video Color Space Converters Operations
6  *  convert I420 YV12 to RGB in various formats,
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25
26 /*
27     YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28     and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
29
30
31     The following calculation is used for the conversion:
32
33       r = clipz((y-oy)*cy  + crv*(v-128))
34       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35       b = clipz((y-oy)*cy  + cbu*(u-128))
36
37     y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
38
39
40     New factorization to eliminate the truncation error which was
41     occurring due to the byteop3p.
42
43
44   1) use byteop16m to subtract quad bytes; we use it in U8 mode,
45    so the offsets need to be renormalized to 8 bits.
46
47   2) scale operands up by a factor of 4 not 8 because Blackfin
48      multiplies include a shift.
49
50   3) compute into the accumulators cy*yx0, cy*yx1
51
52   4) compute each of the linear equations
53       r = clipz((y-oy)*cy  + crv*(v-128))
54
55       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
56
57       b = clipz((y-oy)*cy  + cbu*(u-128))
58
59      reuse of the accumulators requires that we actually multiply
60      twice, once with an addition and the second time with a subtraction.
61
62      because of this we need to compute the equations in the order R B
63      then G saving the writes for B in the case of 24/32 bit color
64      formats.
65
66     api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                        int dW, uint32_t *coeffs);
68
69         A          B
70         ---        ---
71         i2 = cb    i3 = cr
72         i1 = coeff i0 = y
73
74   Where coeffs have the following layout in memory.
75
76   uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77
78   coeffs is a pointer to oy.
79
80   the {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note the data
81   replication is used to simplify the internal algorithms for the dual mac architecture
82   of Blackfin.
83
84   All routines are exported with _ff_bfin_ as a symbol prefix
85
86   rough performance gain compared against -O3:
87
88   2779809/1484290 187.28%
89
90   which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91   c/pel for the optimized implementations. Not sure why there is such a
92   huge variation on the reference codes on Blackfin I guess it must have
93   to do with the memory system.
94
95 */
96
/* Section names: mL1 is .l1.text (presumably on-chip L1 instruction memory --
   confirm against the linker script), mL3 is the ordinary .text section.
   MEM is the section passed to DEFUN by the routines in this file. */
97 #define mL1 .l1.text
98 #define mL3 .text
99 #define MEM mL1
100
/* DEFUN(fname, where, interface)
     Switches to section `where`, declares _ff_bfin_<fname> as a global
     function symbol, aligns, and emits the symbol name itself; the caller
     appends the ':' that turns it into a label.
     `interface` (the C prototype) does not appear in the expansion -- it is
     documentation only. */
101 #define DEFUN(fname,where,interface) \
102         .section where;              \
103         .global _ff_bfin_ ## fname;  \
104         .type _ff_bfin_ ## fname, STT_FUNC; \
105         .align 8;                    \
106         _ff_bfin_ ## fname
107
/* DEFUN_END(fname)
     Emits the .size directive for _ff_bfin_<fname> (current location minus
     the symbol).  fname must be the same name that was given to DEFUN. */
108 #define DEFUN_END(fname) \
109         .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
110
111
112 .text
113
/* COEFF_LEN: byte length of the coefficient table (11 32-bit words); the
   routines load it into l1 so that i1 addresses the table as a circular
   buffer.
   COEFF_REL_CY_OFF: byte step (4 words) loaded into m0 and applied once per
   loop pass, right after the gmask word is read, so that the next read is
   the cy word again. */
114 #define COEFF_LEN        11*4
115 #define COEFF_REL_CY_OFF 4*4
116
/* fp-relative byte offsets from which the routines read their out, dW and
   coeffs arguments after `link 0`. */
117 #define ARG_OUT   20
118 #define ARG_W     24
119 #define ARG_COEFF 28
120
/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV 4:2:0 samples to packed 16-bit pixels
 * using the shift pattern >>3 / <<8 / <<3 and the three masks taken from the
 * coefficient table.
 *
 *   Y, U, V            arrive in r0, r1, r2 and are moved to i0, i2, i3
 *   out, dW, coeffs    are read from fp+ARG_OUT, fp+ARG_W, fp+ARG_COEFF
 *
 * One loop pass consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words
 * (two 16-bit pixels each).  The loop count is dW >> 2, so a remainder of
 * dW modulo 4 is not converted.  The Y/U/V loads for the following pass are
 * issued inside the loop body, so the last pass also reads one further
 * group of input bytes that it never converts.
 */
121 DEFUN(yuv2rgb565_line,MEM,
122    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
123         link 0;
124         [--sp] = (r7:4);       // save r4-r7; restored before rts
125         p1 = [fp+ARG_OUT];     // p1 = out (store pointer)
126         r3 = [fp+ARG_W];       // r3 = dW
127
128         i0 = r0;               // i0 = Y
129         i2 = r1;               // i2 = U (cb)
130         i3 = r2;               // i3 = V (cr)
131
132         r0 = [fp+ARG_COEFF];
133         i1 = r0;               // i1 walks the coefficient table
134         b1 = i1;               // circular-buffer base = start of the table
135         l1 = COEFF_LEN;        // circular-buffer length: i1 wraps after the last word
136         m0 = COEFF_REL_CY_OFF; // step used after gmask to land on cy again
137         p0 = r3;
138
139         r0   = [i0++];         // 4 Y bytes for the first pass
140         r1.l = w[i2++];        // 2u
141         r1.h = w[i3++];        // 2v
142         p0 = p0>>2;            // loop count: 4 pixels per pass
143
144         lsetup (.L0565, .L1565) lc0 = p0;
145
146         /*
147            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
148            r0 -- used to load 4ys
149            r1 -- used to load 2us,2vs
150            r4 -- y3,y2
151            r5 -- y1,y0
152            r6 -- u1,u0
153            r7 -- v1,v0
154         */
155                                                               r2=[i1++]; // oy
156 .L0565:
157         /*
158         rrrrrrrr gggggggg bbbbbbbb
159          5432109876543210
160                     bbbbb >>3
161               gggggggg    <<3
162          rrrrrrrr         <<8
163          rrrrrggggggbbbbb
164         */
165         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // luma with offset removed; r3 = oc
166         (r7,r6) = byteop16m (r1:0, r3:2) (r);                            // chroma with offset removed
167         r5 = r5 << 2 (v);                                                // y1,y0
168         r4 = r4 << 2 (v);                                                // y3,y2
169         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
170         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
            /* ---- first pixel pair: y1,y0 with u0,v0 ---- */
171         /* Y' = y*cy */
172         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
173
174         /* R = Y+ crv*(Cr-128) */
175         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
176                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // undo the add so a1/a0 hold Y' again; r5 = rmask
177         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
178         r2 = r2 >> 3 (v);
179         r3 = r2 & r5;                                                    // r3 starts the packed pixel pair
180
181         /* B = Y+ cbu*(Cb-128) */
182         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
183                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // undo the add; r5 = bmask
184         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
185         r2 = r2 << 8 (v);
186         r2 = r2 & r5;
187         r3 = r3 | r2;
188
189         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
190                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
191         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
192         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; i1 then steps by m0 to cy
193         r2 = r2 << 3 (v);
194         r2 = r2 & r5;
195         r3 = r3 | r2;
196         [p1++]=r3                                          || r1=[i1++]; // store 2 pixels; r1 = cy
197
            /* ---- second pixel pair: y3,y2 with u1,v1 ---- */
198         /* Y' = y*cy */
199
200         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
201
202         /* R = Y+ crv*(Cr-128) */
203         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
204                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // undo the add; r5 = rmask
205         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
206         r2 = r2 >> 3 (v);
207         r3 = r2 & r5;
208
209         /* B = Y+ cbu*(Cb-128) */
210         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
211                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // undo the add; r5 = bmask
212         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
213         r2 = r2 << 8 (v);
214         r2 = r2 & r5;
215         r3 = r3 | r2;
216
217         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
218                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
219         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask; i1 wraps to the start of the table
220         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4 Y bytes for the next pass
221         r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
222         r2 = r2 & r5;
223         r3 = r3 | r2;
224         [p1++]=r3                                          || r1.h = w[i3++];        // 2v
225 .L1565:                                                       r2=[i1++]; // oy
226
227         l1 = 0;                // clear the circular-buffer length set up for i1 above
228
229         (r7:4) = [sp++];
230         unlink;
231         rts;
232 DEFUN_END(yuv2rgb565_line)
233
/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV 4:2:0 samples to packed 16-bit pixels
 * using the shift pattern >>3 / <<7 / <<2 and the three masks taken from the
 * coefficient table.  Apart from the two shift amounts the instruction
 * sequence is the same as in the 565 routine.
 *
 *   Y, U, V            arrive in r0, r1, r2 and are moved to i0, i2, i3
 *   out, dW, coeffs    are read from fp+ARG_OUT, fp+ARG_W, fp+ARG_COEFF
 *
 * One loop pass consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit words
 * (two 16-bit pixels each).  The loop count is dW >> 2, so a remainder of
 * dW modulo 4 is not converted.  The Y/U/V loads for the following pass are
 * issued inside the loop body, so the last pass also reads one further
 * group of input bytes that it never converts.
 */
234 DEFUN(yuv2rgb555_line,MEM,
235    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
236         link 0;
237         [--sp] = (r7:4);       // save r4-r7; restored before rts
238         p1 = [fp+ARG_OUT];     // p1 = out (store pointer)
239         r3 = [fp+ARG_W];       // r3 = dW
240
241         i0 = r0;               // i0 = Y
242         i2 = r1;               // i2 = U (cb)
243         i3 = r2;               // i3 = V (cr)
244
245         r0 = [fp+ARG_COEFF];
246         i1 = r0;               // i1 walks the coefficient table
247         b1 = i1;               // circular-buffer base = start of the table
248         l1 = COEFF_LEN;        // circular-buffer length: i1 wraps after the last word
249         m0 = COEFF_REL_CY_OFF; // step used after gmask to land on cy again
250         p0 = r3;
251
252         r0   = [i0++];         // 4 Y bytes for the first pass
253         r1.l = w[i2++];        // 2u
254         r1.h = w[i3++];        // 2v
255         p0 = p0>>2;            // loop count: 4 pixels per pass
256
257         lsetup (.L0555, .L1555) lc0 = p0;
258
259         /*
260            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
261            r0 -- used to load 4ys
262            r1 -- used to load 2us,2vs
263            r4 -- y3,y2
264            r5 -- y1,y0
265            r6 -- u1,u0
266            r7 -- v1,v0
267         */
268                                                               r2=[i1++]; // oy
269 .L0555:
270         /*
271         rrrrrrrr gggggggg bbbbbbbb
272          5432109876543210
273                     bbbbb >>3
274                gggggggg   <<2
275           rrrrrrrr        <<7
276          xrrrrrgggggbbbbb
277         */
278
279         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // luma with offset removed; r3 = oc
280         (r7,r6) = byteop16m (r1:0, r3:2) (r);                            // chroma with offset removed
281         r5 = r5 << 2 (v);                                                // y1,y0
282         r4 = r4 << 2 (v);                                                // y3,y2
283         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
284         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
            /* ---- first pixel pair: y1,y0 with u0,v0 ---- */
285         /* Y' = y*cy */
286         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
287
288         /* R = Y+ crv*(Cr-128) */
289         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
290                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // undo the add so a1/a0 hold Y' again; r5 = rmask
291         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
292         r2 = r2 >> 3 (v);
293         r3 = r2 & r5;                                                    // r3 starts the packed pixel pair
294
295         /* B = Y+ cbu*(Cb-128) */
296         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
297                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // undo the add; r5 = bmask
298         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
299         r2 = r2 << 7 (v);
300         r2 = r2 & r5;
301         r3 = r3 | r2;
302
303         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
304                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
305         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
306         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; i1 then steps by m0 to cy
307         r2 = r2 << 2 (v);
308         r2 = r2 & r5;
309         r3 = r3 | r2;
310         [p1++]=r3                                          || r1=[i1++]; // store 2 pixels; r1 = cy
311
            /* ---- second pixel pair: y3,y2 with u1,v1 ---- */
312         /* Y' = y*cy */
313
314         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
315
316         /* R = Y+ crv*(Cr-128) */
317         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
318                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // undo the add; r5 = rmask
319         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
320         r2 = r2 >> 3 (v);
321         r3 = r2 & r5;
322
323         /* B = Y+ cbu*(Cb-128) */
324         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
325                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // undo the add; r5 = bmask
326         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
327         r2 = r2 << 7 (v);
328         r2 = r2 & r5;
329         r3 = r3 | r2;
330
331         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
332                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
333         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask; i1 wraps to the start of the table
334         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4 Y bytes for the next pass
335         r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
336         r2 = r2 & r5;
337         r3 = r3 | r2;
338         [p1++]=r3                                          || r1.h=w[i3++]; // 2v
339
340 .L1555:                                                       r2=[i1++]; // oy
341
342         l1 = 0;                // clear the circular-buffer length set up for i1 above
343
344         (r7:4) = [sp++];
345         unlink;
346         rts;
347 DEFUN_END(yuv2rgb555_line)
348
/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV 4:2:0 samples to 3-byte-per-pixel output.
 *
 *   Y, U, V            arrive in r0, r1, r2 and are moved to i0, i2, i3
 *   out, dW, coeffs    are read from fp+ARG_OUT, fp+ARG_W, fp+ARG_COEFF
 *
 * Two byte pointers are used: p1 = out and p2 = out + 3.  Every half pass
 * computes one channel for two pixels at a time (low and high 16-bit half
 * of the result register); the low-half byte is stored through p1 and,
 * after a >>16, the high-half byte through p2.  Channels are stored in the
 * order "R", G, "B" as labelled below, with the "B" result parked in r3
 * until G has been written.  After three channels both pointers skip the
 * 3 bytes the other pointer has just filled.
 *
 * One loop pass consumes 4 Y, 2 U and 2 V bytes and writes 12 output bytes.
 * The loop count is dW >> 2, so a remainder of dW modulo 4 is not
 * converted.  The Y/U/V loads for the following pass are issued inside the
 * loop body, so the last pass also reads one further group of input bytes
 * that it never converts.
 *
 * The rmask/bmask/gmask words are still loaded into r5, but only to step i1
 * through the shared coefficient table; r5 is not used for packing here.
 */
349 DEFUN(yuv2rgb24_line,MEM,
350    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
351         link 0;
352         [--sp] = (r7:4);       // save r4-r7; restored before rts
353         p1 = [fp+ARG_OUT];     // p1 = out: bytes of the low-half pixel
354         r3 = [fp+ARG_W];       // r3 = dW
355         p2 = p1;
356         p2 += 3;               // p2 = out + 3: bytes of the high-half pixel
357
358         i0 = r0;               // i0 = Y
359         i2 = r1;               // i2 = U (cb)
360         i3 = r2;               // i3 = V (cr)
361
362         r0 = [fp+ARG_COEFF]; // coeff buffer
363         i1 = r0;               // i1 walks the coefficient table
364         b1 = i1;               // circular-buffer base = start of the table
365         l1 = COEFF_LEN;        // circular-buffer length: i1 wraps after the last word
366         m0 = COEFF_REL_CY_OFF; // step used after gmask to land on cy again
367         p0 = r3;
368
369         r0   = [i0++];         // 4 Y bytes for the first pass
370         r1.l = w[i2++];        // 2u
371         r1.h = w[i3++];        // 2v
372         p0 = p0>>2;            // loop count: 4 pixels per pass
373
374         lsetup (.L0888, .L1888) lc0 = p0;
375
376         /*
377            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
378            r0 -- used to load 4ys
379            r1 -- used to load 2us,2vs
380            r4 -- y3,y2
381            r5 -- y1,y0
382            r6 -- u1,u0
383            r7 -- v1,v0
384         */
385                                                               r2=[i1++]; // oy
386 .L0888:
387         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // luma with offset removed; r3 = oc
388         (r7,r6) = byteop16m (r1:0, r3:2) (r);                            // chroma with offset removed
389         r5 = r5 << 2 (v);               // y1,y0
390         r4 = r4 << 2 (v);               // y3,y2
391         r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
392         r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
393
            /* ---- first pixel pair: y1,y0 with u0,v0 ---- */
394         /* Y' = y*cy */
395         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
396
397         /* R = Y+ crv*(Cr-128) */
398         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
399                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // undo the add so a1/a0 hold Y' again; rmask (unused)
400         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
401         r2=r2>>16 || B[p1++]=r2;        // low-half byte -> p1 (store pairs with the shift; see header)
402                      B[p2++]=r2;        // high-half byte -> p2
403
404         /* B = Y+ cbu*(Cb-128) */
405         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
406                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // undo the add; bmask (unused)
407         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // result kept in r3 until G is stored; r1 = cgu
408
409         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
410                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
411         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
412         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask (unused); i1 then steps by m0 past oy,oc,zero to cy
413
414         r2=r2>>16 || B[p1++]=r2;        // G bytes
415                      B[p2++]=r2;
416
417         r3=r3>>16 || B[p1++]=r3;        // B bytes saved in r3 above
418                      B[p2++]=r3                            || r1=[i1++]; // cy
419
420         p1+=3;                          // skip the 3 bytes written through p2
421         p2+=3;                          // skip the 3 bytes the next p1 stores will fill
            /* ---- second pixel pair: y3,y2 with u1,v1 ---- */
422         /* Y' = y*cy */
423         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
424
425         /* R = Y+ crv*(Cr-128) */
426         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
427                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // undo the add; rmask (unused)
428         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
429         r2=r2>>16 || B[p1++]=r2;
430         B[p2++]=r2;
431
432         /* B = Y+ cbu*(Cb-128) */
433         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
434                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // undo the add; bmask (unused)
435         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // result kept in r3 until G is stored; r1 = cgu
436
437         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
438                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
439         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
440         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask (unused); i1 wraps to the start of the table
441         r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // G bytes; 4 Y bytes for the next pass
442                      B[p2++]=r2 || r1.l = w[i2++]; // 2u
443         r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // B bytes; 2v
444                      B[p2++]=r3 || r2=[i1++];      // oy
445
446         p1+=3;
447 .L1888: p2+=3;
448
449         l1 = 0;                // clear the circular-buffer length set up for i1 above
450
451         (r7:4) = [sp++];
452         unlink;
453         rts;
        /* The name here must match the DEFUN above so that .size is applied to
           _ff_bfin_yuv2rgb24_line rather than to an undefined symbol. */
454 DEFUN_END(yuv2rgb24_line)