@***************************************************************************** @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion @***************************************************************************** @ Copyright (C) 2011 Sébastien Toque @ Rémi Denis-Courmont @ @ This program is free software; you can redistribute it and/or modify it @ under the terms of the GNU Lesser General Public License as published by @ the Free Software Foundation; either version 2.1 of the License, or @ (at your option) any later version. @ @ This program is distributed in the hope that it will be useful, @ but WITHOUT ANY WARRANTY; without even the implied warranty of @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @ GNU Lesser General Public License for more details. @ @ You should have received a copy of the GNU Lesser General Public License @ along with this program; if not, write to the Free Software Foundation, @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. 
@****************************************************************************/

	.syntax unified
	.fpu neon
	.text

/*
 * Register aliases.
 *
 * Several NEON aliases below deliberately share physical registers
 * (Qn is the pair D2n/D2n+1):
 *   Q9  = y1/y2            = red16_1
 *   Q10 = lumi2            = green16_1
 *   Q12 = u/v, out1l/out1h = red16_2   (red1 is out1h)
 *   Q13 = green1/blue1     = green16_2
 *   Q14 = out2l/out2h      = blue16_2  (red2 is out2h)
 *   Q15 = green2/blue2     = lumi1
 * The instruction order inside loop_col is what keeps a value from being
 * overwritten before its last use; do not reorder without re-checking.
 */

/* ARM */
#define O1	r0	/* output pointer, top row of the current row pair */
#define O2	r1	/* output pointer, bottom row of the current row pair */
#define WIDTH	r2	/* width in pixels, rounded up to a multiple of 16 */
#define HEIGHT	r3	/* rows left to convert */
#define Y1	r4	/* luma pointer, top row */
#define Y2	r5	/* luma pointer, bottom row */
#define U	r6	/* U plane pointer */
#define V	r7	/* V plane pointer */
#define YPITCH	r8	/* luma pitch in bytes */
#define OPAD	r10	/* output pitch minus the bytes written per row */
#define YPAD	r11	/* luma pitch minus the bytes read per row */
#define COUNT	ip	/* pixels left in the current row pair */
#define OPITCH	lr	/* output pitch in bytes */

/* NEON */
#define coefY	D0
#define coefRV	D1
#define coefGU	D2
#define coefGV	D3
#define coefBU	D4
#define Rc	Q3
#define Gc	Q4
#define Bc	Q5

#define u	D24
#define v	D25
#define y1	D18
#define y2	D19

#define chro_r	Q6
#define chro_g	Q7
#define chro_b	Q8

#define lumi1	Q15
#define lumi2	Q10

#define red16_1		Q9
#define green16_1	Q10
#define blue16_1	Q11
#define red16_2		Q12
#define green16_2	Q13
#define blue16_2	Q14

#define red1	D25
#define green1	D26
#define blue1	D27
#define red2	D29
#define green2	D30
#define blue2	D31

#define out1l	D24
#define out1h	D25
#define out2l	D28
#define out2h	D29

/*
 * 16-bit constant terms for the red, green and blue channels, in that
 * order. Each one is loaded and replicated into Rc, Gc and Bc. They are
 * kept in .text so that adr can reach them.
 */
coefficients:
	.short	-15872
	.short	4992
	.short	-18432

/*
 * i420_rv16_neon
 *
 * Converts planar Y/U/V with 2x2 subsampled chroma into 16 bits per pixel
 * RGB (5 bits red, 6 bits green, 5 bits blue, low byte stored first).
 *
 * In:
 *   r0 = address of two consecutive words: output pointer, output pitch
 *   r1 = address of four consecutive words: Y pointer, U pointer,
 *        V pointer, Y pitch
 *   r2 = width in pixels
 *   r3 = height in rows
 * Out:
 *   nothing is returned in a register; pixels are written through the
 *   output pointer.
 * Saved and restored: r4-r8, r10, r11, q4-q7.
 *
 * Notes:
 *  - The width is rounded up to a multiple of 16 and every inner iteration
 *    converts 16 pixels on two rows, so up to 15 pixels beyond the width
 *    are read and written on each row.
 *  - Rows are handled in pairs; the loop stops once the remaining height
 *    is zero or negative, so an odd height touches one extra row.
 *  - After each row pair the U and V pointers end up advanced by half of
 *    the Y pitch in total, i.e. the chroma pitch is taken to be half of
 *    the luma pitch.
 *  - The loads and stores carry :64 / :128 alignment qualifiers, so the
 *    pointers and pitches have to preserve 8 byte (U, V) and 16 byte
 *    (Y, output) alignment.
 */
	.align 2
	.global i420_rv16_neon
	.type	i420_rv16_neon, %function
i420_rv16_neon:
	push		{r4-r8,r10-r11,lr}
	vpush		{q4-q7}

	/* load arguments */
	ldmia		r0,	{O1, OPITCH}
	ldmia		r1,	{Y1, U, V, YPITCH}

	/* round the width to be a multiple of 16 */
	ands		OPAD,	WIDTH, #15
	sub		WIDTH,	WIDTH, OPAD
	addne		WIDTH,	WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8		coefY,	#74
	vmov.u8		coefRV,	#115
	vmov.u8		coefGU,	#14
	vmov.u8		coefGV,	#34
	vmov.u8		coefBU,	#135
	/* OPAD is only a scratch pointer here; it is recomputed below */
	adr		OPAD,	coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!		/* Rc */
	vld1.s16	{d8[], d9[]}, [OPAD]!		/* Gc */
	vld1.s16	{d10[], d11[]}, [OPAD]!		/* Bc */

	/* init padding */
	/* the flags set by this cmp are consumed by movsgt in loop_row */
	cmp		HEIGHT,	#0
	/* two output bytes per pixel, one luma byte per pixel */
	sub		OPAD,	OPITCH, WIDTH, lsl #1
	sub		YPAD,	YPITCH,	WIDTH

loop_row:
	/*
	 * Flags come from cmp/subs on HEIGHT. While rows remain (GT), COUNT is
	 * reloaded and the flags are replaced by those of WIDTH, so the LE
	 * exit below is taken when no rows remain or the width is not
	 * positive. The two add instructions do not touch the flags.
	 */
	movsgt		COUNT,	WIDTH
	add		O2,	O1,	OPITCH
	add		Y2,	Y1,	YPITCH
	/* exit if all rows have been processed */
	vpople		{q4-q7}
	pople		{r4-r8,r10-r11,pc}

loop_col:
	/* Common U & V */
	vld1.u8		{u},	[U,:64]!
	vld1.u8		{v},	[V,:64]!

	/* Y Top Row */
	/* vld2 de-interleaves: y1 = even pixels, y2 = odd pixels */
	vld2.u8		{y1,y2},	[Y1,:128]!

	/* chroma contribution, shared by both rows and both pixel columns */
	vmull.u8	Q14,	v,	coefRV
	vmull.u8	Q11,	u,	coefGU
	vmull.u8	Q13,	u,	coefBU
	vmlal.u8	Q11,	v,	coefGV
	vmull.u8	lumi2,	y2,	coefY
	vmull.u8	lumi1,	y1,	coefY
	vadd.s16	chro_r,	Rc,	Q14
	vadd.s16	chro_b,	Bc,	Q13
	vsub.s16	chro_g,	Gc,	Q11

	pld		[U]
	pld		[V]

	/* chrominance + luminance */
	vqadd.s16	red16_2,	lumi2,	chro_r
	vqadd.s16	green16_2,	lumi2,	chro_g
	vqadd.s16	blue16_2,	lumi2,	chro_b
	vqadd.s16	red16_1,	lumi1,	chro_r
	vqadd.s16	green16_1,	lumi1,	chro_g
	vqadd.s16	blue16_1,	lumi1,	chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	green2,	green16_2,	#6
	vqrshrun.s16	blue2,	blue16_2,	#6
	vqrshrun.s16	red2,	red16_2,	#6
	vqrshrun.s16	green1,	green16_1,	#6
	vqrshrun.s16	red1,	red16_1,	#6
	vqrshrun.s16	blue1,	blue16_1,	#6

	pld		[Y1]

	/* pack into RGB565 */
	/*
	 * high byte = red[7:3] : green[7:5]   (red already sits in out?h)
	 * low byte  = green[4:2] : blue[7:3]
	 */
	vshl.u8		out2l,	green2,	#3	// low 2a
	vsri.u8		out2h,	green2,	#5	// high 2
	vshl.u8		out1l,	green1,	#3	// low 1a
	vsri.u8		out1h,	green1,	#5	// high 1
	vsri.u8		out2l,	blue2,	#3	// low 2b
	vsri.u8		out1l,	blue1,	#3	// low 1b

	/* Y Bottom Row */
	vld2.u8		{y1,y2},	[Y2,:128]!

	/* Top Row output */
	/* vzip puts even and odd pixels back into picture order */
	vzip.u8		out1h,	out2h
	vmull.u8	lumi2,	y2,	coefY
	vzip.u8		out1l,	out2l
	vmull.u8	lumi1,	y1,	coefY
	vst2.u8		{out1l, out1h},	[O1,:128]!
	vst2.u8		{out2l, out2h},	[O1,:128]!

	/* chrominance + luminance */
	vqadd.s16	green16_2,	lumi2,	chro_g
	vqadd.s16	red16_2,	lumi2,	chro_r
	vqadd.s16	blue16_2,	lumi2,	chro_b
	vqadd.s16	red16_1,	lumi1,	chro_r
	vqadd.s16	green16_1,	lumi1,	chro_g
	vqadd.s16	blue16_1,	lumi1,	chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	green2,	green16_2,	#6
	vqrshrun.s16	blue2,	blue16_2,	#6
	vqrshrun.s16	red2,	red16_2,	#6
	vqrshrun.s16	green1,	green16_1,	#6
	vqrshrun.s16	red1,	red16_1,	#6
	vqrshrun.s16	blue1,	blue16_1,	#6

	/* prefetch the bottom luma row, the stream this half just read from */
	pld		[Y2]

	/* pack into RGB565 */
	vshl.u8		out2l,	green2,	#3	// low 2a
	vsri.u8		out2h,	green2,	#5	// high 2
	vshl.u8		out1l,	green1,	#3	// low 1a
	vsri.u8		out1h,	green1,	#5	// high 1
	vsri.u8		out2l,	blue2,	#3	// low 2b
	vsri.u8		out1l,	blue1,	#3	// low 1b

	/* Bottom Row output */
	vzip.u8		out1h,	out2h
	vzip.u8		out1l,	out2l
	vst2.u8		{out1l, out1h},	[O2,:128]!
	vst2.u8		{out2l, out2h},	[O2,:128]!

	/* next columns (x16) */
	subs		COUNT,	COUNT,	#16
	bgt		loop_col

	/* next rows (x2) */
	/* the flags set by this subs are consumed by movsgt in loop_row */
	subs		HEIGHT,	#2
	/* O2 and Y2 sit at the end of the bottom row; skip the padding */
	add		O1,	O2,	OPAD
	add		Y1,	Y2,	YPAD
	/* one chroma row serves two luma rows; step over half the luma padding */
	add		U,	U,	YPAD,	lsr #1
	add		V,	V,	YPAD,	lsr #1
	b		loop_row