2 * Fast JIT powered scaler for ARM
4 * Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License
8 * version 2.1 as published by the Free Software Foundation.
10 * This library is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28 #include "arm_jit_swscale.h"
29 #include "arm_colorconv.h"
31 /* Size of the CPU instruction cache; the generated code must never exceed it */
32 #define INSTRUCTIONS_CACHE_SIZE 32768
34 /* Supported output formats */
35 #define FMT_OMAPFB_YUV422 1
36 #define FMT_OMAPFB_YUV420 2
38 extern void __clear_cache (char *beg, char *end);
41 * API is similar to API from ffmpeg libswscale
/* Scaler context: filled in by sws_arm_jit_create_scaler_internal() and read by the scaling and free routines. */
43 typedef struct SwsContextArmJit {
/* When non-zero, the vertical-only YUV420 path uses yv12_to_yuv420_line_armv6 instead of yv12_to_yuv420_line_arm. */
51 int armv6_is_supported;
/*
 * Interpolation kinds returned by get_pix().
 * COPY_FIRST means only the first reported sample is used.
 * NOTE(review): the AVERAGE_a_b names presumably give the weights of the
 * first and second sample (a : b) - confirm. The visible code generator only
 * emits blending code for INTERPOLATE_AVERAGE_2_2.
 */
57 #define INTERPOLATE_COPY_FIRST 0
58 #define INTERPOLATE_AVERAGE_1_3 1
59 #define INTERPOLATE_AVERAGE_2_2 2
60 #define INTERPOLATE_AVERAGE_3_1 3
63 * Get two nearest pixels from the source image
65 * @todo get rid of the floating point math
/*
 * Map destination sample x (on an axis of dest_w samples) onto a source axis
 * of orig_w samples. Used for both columns and rows.
 *
 * The centre of destination sample x is projected to the source coordinate
 * offs = (x + 0.5) / dest_w * orig_w. pix1 and pix2 are the source samples on
 * either side of that point, and dist is how far offs lies past the centre
 * of pix1.
 *
 * quality - scaling quality level (NOTE(review): the statements that consume
 *           it are not visible in this listing)
 * orig_w  - number of samples on the source axis
 * dest_w  - number of samples on the destination axis
 * x       - destination sample index
 * p1, p2  - receive the indices of the two source samples
 *
 * Returns one of the INTERPOLATE_* codes telling the caller how to combine
 * the samples at *p1 and *p2.
 */
67 static inline int get_pix(int quality, int orig_w, int dest_w, int x, int *p1, int *p2)
69 double offs = ((double)x + 0.5) / (double)dest_w * (double)orig_w;
/* Nearest source samples below and above the projected position. */
71 int pix1 = floor(offs - 0.5);
72 int pix2 = ceil(offs - 0.5);
73 // Special boundary cases
76 return INTERPOLATE_COPY_FIRST;
/* Clamp both neighbours to the last source sample; no blending is needed. */
79 *p1 = *p2 = orig_w - 1;
80 return INTERPOLATE_COPY_FIRST;
/* Distance of the projected position from the centre of pix1. */
82 dist = offs - ((double)pix1 + 0.5);
/* Closer to pix1 than to the midpoint. */
85 if (dist > 0.125 && dist < 0.375) {
88 return INTERPOLATE_AVERAGE_3_1;
/* Closer to pix2 than to the midpoint. */
90 if (dist > 0.625 && dist < 0.875) {
93 return INTERPOLATE_AVERAGE_1_3;
/* Roughly halfway between the two samples. */
98 if (dist > 0.25 && dist < 0.75) {
101 return INTERPOLATE_AVERAGE_2_2;
107 return INTERPOLATE_COPY_FIRST;
110 return INTERPOLATE_COPY_FIRST;
/*
 * Emit "ldrb r<dstreg>, [r<basereg>, #<offset>]" at *cmdbuffer and advance
 * the write pointer by one instruction word. Callers assign the result back
 * to their own write pointer.
 *
 * NOTE(review): offset is ORed into the instruction word without masking or
 * range checking. The ARM LDRB immediate field is 12 bits wide, so an offset
 * outside 0..4095 would corrupt the other fields - confirm that callers
 * guarantee this range.
 */
114 static uint32_t *generate_arm_cmd_ldrb_r_r_offs(uint32_t *cmdbuffer, int dstreg, int basereg, int offset)
/* Textual disassembly of the emitted instruction (presumably debug-only; no guard is visible here). */
117 printf("ldrb r%d, [r%d, #%d]\n", dstreg, basereg, offset);
/* Base register goes in bits 16-19, destination register in bits 12-15, immediate offset in the low bits. */
119 *cmdbuffer++ = 0xE5D00000 | (basereg << 16) | (dstreg << 12) | (offset);
/*
 * Emit "add r<dstreg>, r<r1>, r<r2>, lsl #<r2_shift>" at *cmdbuffer and
 * advance the write pointer by one instruction word. Callers assign the
 * result back to their own write pointer.
 * In this file the shift is always one of 0, 8, 16 or 24.
 */
123 static uint32_t *generate_arm_cmd_add_r_r_r_lsl(uint32_t *cmdbuffer, int dstreg, int r1, int r2, int r2_shift)
/* Textual disassembly of the emitted instruction (presumably debug-only; no guard is visible here). */
126 printf("add r%d, r%d, r%d, lsl #%d\n", dstreg, r1, r2, r2_shift);
/* First operand in bits 16-19, destination in bits 12-15, shift amount from bit 7 up, second operand in the low bits. */
128 *cmdbuffer++ = 0xE0800000 | (r1 << 16) | (dstreg << 12) | (r2_shift << 7) | (r2);
/*
 * Emit "mov r<dstreg>, r<r>, lsr #<shift>" at *cmdbuffer and advance the
 * write pointer by one instruction word. Callers assign the result back to
 * their own write pointer.
 * In this file it is only used with shift 1, to halve a sum of two samples.
 */
132 static uint32_t *generate_arm_cmd_mov_r_r_lsr(uint32_t *cmdbuffer, int dstreg, int r, int shift)
/* Textual disassembly of the emitted instruction (presumably debug-only; no guard is visible here). */
135 printf("mov r%d, r%d, lsr #%d\n", dstreg, r, shift);
/* Destination in bits 12-15, shift amount from bit 7 up, source register in the low bits. */
137 *cmdbuffer++ = 0xE1A00020 | (dstreg << 12) | (shift << 7) | (r);
142 * Generation of 32-bit output scaled data
143 * @param quality - scaling quality level
144 * @param buf1reg - register that holds a pointer to the buffer with data for the first output byte
145 * @param buf2reg - register that holds a pointer to the buffer with data for the second output byte
146 * @param buf3reg - register that holds a pointer to the buffer with data for the third output byte
147 * @param buf4reg - register that holds a pointer to the buffer with data for the fourth output byte
/*
 * Emit the code that builds one 32-bit output word out of four scaled bytes
 * and stores it through r3 with post-increment.
 *
 * For each of the four output bytes k = 1..4:
 *   bufKreg - register holding the source pointer for that byte
 *   sizeK   - divisor applied to orig_w, dest_w and offsK before calling
 *             get_pix(); visible callers pass 1 for SRC_Y and 2 for SRC_U
 *   offsK   - destination position of that byte, in full-resolution units
 *
 * Registers used by the emitted code: r4 collects the result, r5/r6, r7/r8,
 * r9/r10 and r11/r12 hold the one or two source samples for bytes 1..4, and
 * r14 is scratch for averaging. Byte k ends up in bits 8*(k-1) .. 8*k-1 of r4.
 *
 * The locals type_u / type_y2 / type_v describe bytes 2, 3 and 4 whatever
 * plane the caller passes for them.
 *
 * NOTE(review): in the visible lines only INTERPOLATE_COPY_FIRST and
 * INTERPOLATE_AVERAGE_2_2 produce blending/packing code; confirm how the
 * other interpolation kinds are handled.
 */
149 static uint32_t *generate_32bit_scaled_data_write(
151 int quality, int orig_w, int dest_w,
152 int buf1reg, int size1, int offs1,
153 int buf2reg, int size2, int offs2,
154 int buf3reg, int size3, int offs3,
155 int buf4reg, int size4, int offs4)
158 int type_y1, type_y2, type_u, type_v;
159 // First stage: perform data loading
160 type_y1 = get_pix(quality, orig_w / size1, dest_w / size1, offs1 / size1, &p1, &p2);
161 if (type_y1 == INTERPOLATE_COPY_FIRST) {
162 // Special case, no interpolation is needed, so load this data
163 // directly into destination register
164 p = generate_arm_cmd_ldrb_r_r_offs(p, 4, buf1reg, p1);
/* Otherwise load both neighbours into r5/r6 for blending in the second stage. */
166 p = generate_arm_cmd_ldrb_r_r_offs(p, 5, buf1reg, p1);
167 p = generate_arm_cmd_ldrb_r_r_offs(p, 6, buf1reg, p2);
/* Bytes 2..4: the first sample is always loaded, the second only when blending is required. */
170 type_u = get_pix(quality, orig_w / size2, dest_w / size2, offs2 / size2, &p1, &p2);
171 p = generate_arm_cmd_ldrb_r_r_offs(p, 7, buf2reg, p1);
172 if (type_u != INTERPOLATE_COPY_FIRST) p = generate_arm_cmd_ldrb_r_r_offs(p, 8, buf2reg, p2);
174 type_y2 = get_pix(quality, orig_w / size3, dest_w / size3, offs3 / size3, &p1, &p2);
175 p = generate_arm_cmd_ldrb_r_r_offs(p, 9, buf3reg, p1);
176 if (type_y2 != INTERPOLATE_COPY_FIRST) p = generate_arm_cmd_ldrb_r_r_offs(p, 10, buf3reg, p2);
178 type_v = get_pix(quality, orig_w / size4, dest_w / size4, offs4 / size4, &p1, &p2);
179 p = generate_arm_cmd_ldrb_r_r_offs(p, 11, buf4reg, p1);
180 if (type_v != INTERPOLATE_COPY_FIRST) p = generate_arm_cmd_ldrb_r_r_offs(p, 12, buf4reg, p2);
181 // Second stage: perform data shuffling
/* Byte 1: r4 = (r5 + r6) >> 1 when averaging; in the copy case r4 was already loaded directly. */
182 if (type_y1 == INTERPOLATE_AVERAGE_2_2) {
183 p = generate_arm_cmd_add_r_r_r_lsl(p, 14, 5, 6, 0);
184 p = generate_arm_cmd_mov_r_r_lsr(p, 4, 14, 1);
/* Byte 2: add the sample (or the average of two samples) into r4 shifted left by 8. */
186 if (type_u == INTERPOLATE_COPY_FIRST) {
187 p = generate_arm_cmd_add_r_r_r_lsl(p, 4, 4, 7, 8);
188 } else if (type_u == INTERPOLATE_AVERAGE_2_2) {
189 p = generate_arm_cmd_add_r_r_r_lsl(p, 14, 7, 8, 0);
190 p = generate_arm_cmd_mov_r_r_lsr(p, 14, 14, 1);
191 p = generate_arm_cmd_add_r_r_r_lsl(p, 4, 4, 14, 8);
/* Byte 3: same scheme, shifted left by 16. */
193 if (type_y2 == INTERPOLATE_COPY_FIRST) {
194 p = generate_arm_cmd_add_r_r_r_lsl(p, 4, 4, 9, 16);
195 } else if (type_y2 == INTERPOLATE_AVERAGE_2_2) {
196 p = generate_arm_cmd_add_r_r_r_lsl(p, 14, 9, 10, 0);
197 p = generate_arm_cmd_mov_r_r_lsr(p, 14, 14, 1);
198 p = generate_arm_cmd_add_r_r_r_lsl(p, 4, 4, 14, 16);
/* Byte 4: same scheme, shifted left by 24. */
200 if (type_v == INTERPOLATE_COPY_FIRST) {
201 p = generate_arm_cmd_add_r_r_r_lsl(p, 4, 4, 11, 24);
202 } else if (type_v == INTERPOLATE_AVERAGE_2_2) {
203 p = generate_arm_cmd_add_r_r_r_lsl(p, 14, 11, 12, 0);
204 p = generate_arm_cmd_mov_r_r_lsr(p, 14, 14, 1);
205 p = generate_arm_cmd_add_r_r_r_lsl(p, 4, 4, 14, 24);
207 // Third stage: store data and advance output buffer pointer
208 *p++ = 0xE4834004; // str r4, [r3], #4
213 * Scaler code should assume:
217 * r3 - destination buffer
218 * r4 - result for storage into output buffer
219 * r5, r6 - source data for y1 calculation
220 * r7, r8 - source data for u calculation
221 * r9, r10 - source data for y2 calculation
222 * r11, r12 - source data for v calculation
223 * r14 (lr) - accumulator
225 * @param cmdbuffer - buffer for dynamically generated code
226 * @return - number of instructions generated
/*
 * Generate a routine that horizontally scales one line from width orig_w to
 * dest_w, emitting one 32-bit store per pair of destination pixels. The
 * routine is used for the FMT_OMAPFB_YUV422 output format.
 *
 * The generated routine saves r4-r11 and lr on entry and returns by popping
 * them with pc in place of lr.
 *
 * NOTE(review): no use of maxcmdcount is visible in this listing; confirm
 * that the amount of generated code is bounded elsewhere.
 */
228 static int generate_yuv420p_to_yuyv422_line_scaler(uint32_t *cmdbuffer, int maxcmdcount, int orig_w, int dest_w, int quality)
230 int i, p1, p2, cmdcount;
231 int type_y1, type_y2, type_u, type_v;
233 uint32_t *p = cmdbuffer;
235 *p++ = 0xE92D4FF0; // stmfd sp!, {r4-r11, lr} @ save all registers
237 // Process a pair of destination pixels per loop iteration (it should result in 32-bit value write)
238 for (i = 0; i < dest_w; i += 2) {
239 p = generate_32bit_scaled_data_write(
240 p, quality, orig_w, dest_w,
246 *p++ = 0xE8BD8FF0; // ldmfd sp!, {r4-r11, pc} @ restore all registers and return
/* Number of 32-bit instruction words emitted, including the prologue and epilogue. */
247 cmdcount = p - cmdbuffer;
250 printf("@ number of instructions = %d\n", cmdcount);
/*
 * Dump the whole code buffer (INSTRUCTIONS_CACHE_SIZE bytes, not just the
 * emitted part) to "cmdbuf.bin" (presumably debug-only; no guard is visible).
 * NOTE(review): the fopen() result is not checked in the visible lines.
 */
251 FILE *f = fopen("cmdbuf.bin", "w+");
252 fwrite(cmdbuffer, 1, INSTRUCTIONS_CACHE_SIZE, f);
/*
 * Generate a routine that horizontally scales one line from width orig_w to
 * dest_w for the FMT_OMAPFB_YUV420 output format. Each loop iteration emits
 * three 32-bit stores and runs only while 8 more destination pixels remain.
 *
 * The generated routine saves r4-r11 and lr on entry and returns by popping
 * them with pc in place of lr.
 *
 * NOTE(review): the statement that advances i and any handling of a trailing
 * group of fewer than 8 pixels are not visible in this listing. No use of
 * maxcmdcount is visible either.
 */
258 static int generate_yuv420p_to_yuv420_line_scaler(uint32_t *cmdbuffer, int maxcmdcount, int orig_w, int dest_w, int quality)
260 int i = 0, p1, p2, cmdcount;
261 int type_y1, type_y2, type_u, type_v;
263 uint32_t *p = cmdbuffer;
268 *p++ = 0xE92D4FF0; // stmfd sp!, {r4-r11, lr} @ save all registers
270 while (i + 8 <= dest_w) {
/* First output word of the group. */
271 p = generate_32bit_scaled_data_write(
272 p, quality, orig_w, dest_w,
276 SRC_Y, 1, i + 1 * 1);
/* Second output word of the group. */
277 p = generate_32bit_scaled_data_write(
278 p, quality, orig_w, dest_w,
282 SRC_U, 2, i + 2 * 2);
/* Third output word of the group. */
283 p = generate_32bit_scaled_data_write(
284 p, quality, orig_w, dest_w,
288 SRC_Y, 1, i + 6 * 1);
291 *p++ = 0xE8BD8FF0; // ldmfd sp!, {r4-r11, pc} @ restore all registers and return
/* Number of 32-bit instruction words emitted, including the prologue and epilogue. */
292 cmdcount = p - cmdbuffer;
295 printf("@ number of instructions = %d\n", cmdcount);
/*
 * Dump the whole code buffer (INSTRUCTIONS_CACHE_SIZE bytes, not just the
 * emitted part) to "cmdbuf.bin" (presumably debug-only; no guard is visible).
 * NOTE(review): the fopen() result is not checked in the visible lines.
 */
296 FILE *f = fopen("cmdbuf.bin", "w+");
297 fwrite(cmdbuffer, 1, INSTRUCTIONS_CACHE_SIZE, f);
304 /******************************************************************************/
/*
 * Build a scaler context: map an executable buffer, generate the horizontal
 * line scaler for the requested output format into it, build the per-row
 * lookup table and store everything in a freshly allocated context.
 *
 * fmt selects the code generator (FMT_OMAPFB_YUV422 or FMT_OMAPFB_YUV420).
 * The returned context is released with sws_arm_jit_free().
 *
 * NOTE(review): no check of the mmap() or malloc() results is visible in
 * this listing - confirm failures are handled.
 */
306 static struct SwsContextArmJit *sws_arm_jit_create_scaler_internal(int source_w, int source_h, int target_w, int target_h, int quality, int fmt)
/* Anonymous private mapping that is readable, writable and executable, so generated code can run in place. */
309 uint32_t *p = mmap(0, INSTRUCTIONS_CACHE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
/* The generators receive the buffer capacity in 32-bit instruction words. */
310 if (fmt == FMT_OMAPFB_YUV422) {
311 generate_yuv420p_to_yuyv422_line_scaler(p, INSTRUCTIONS_CACHE_SIZE / 4, source_w, target_w, quality);
312 } else if (fmt == FMT_OMAPFB_YUV420) {
313 generate_yuv420p_to_yuv420_line_scaler(p, INSTRUCTIONS_CACHE_SIZE / 4, source_w, target_w, quality);
/* One entry per output row; the scaling routines read linebuffer[i] as the source row for output row i. */
318 int *linebuffer = (int *)malloc(target_h * sizeof(int));
319 for (i = 0; i < target_h; i ++) {
/* Vertical mapping reuses get_pix() with a fixed quality of 1 (the store into linebuffer is not visible here). */
320 get_pix(1, source_h, target_h, i, &p1, &p2);
/* Synchronise the instruction cache with the freshly written code before it is executed. */
324 __clear_cache((char *)p, (char *)p + INSTRUCTIONS_CACHE_SIZE);
326 SwsContextArmJit *context = (SwsContextArmJit *)malloc(sizeof(SwsContextArmJit));
327 memset(context, 0, sizeof(SwsContextArmJit));
328 context->source_w = source_w;
329 context->source_h = source_h;
330 context->target_w = target_w;
331 context->target_h = target_h;
332 context->codebuffer = p;
333 context->linebuffer = linebuffer;
/* ARMv6 routines are off by default; the *_armv6 constructor switches them on afterwards. */
335 context->armv6_is_supported = 0;
/* Create a scaler that produces FMT_OMAPFB_YUV422 output; release it with sws_arm_jit_free(). */
339 struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv422_scaler(int source_w, int source_h, int target_w, int target_h, int quality)
341 return sws_arm_jit_create_scaler_internal(source_w, source_h, target_w, target_h, quality, FMT_OMAPFB_YUV422);
/* Create a scaler that produces FMT_OMAPFB_YUV420 output; release it with sws_arm_jit_free(). */
344 struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv420_scaler(int source_w, int source_h, int target_w, int target_h, int quality)
346 return sws_arm_jit_create_scaler_internal(source_w, source_h, target_w, target_h, quality, FMT_OMAPFB_YUV420);
/*
 * Same as sws_arm_jit_create_omapfb_yuv420_scaler(), but additionally marks
 * the context as ARMv6-capable so the vertical-only path can use
 * yv12_to_yuv420_line_armv6.
 */
349 struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv420_scaler_armv6(int source_w, int source_h, int target_w, int target_h, int quality)
351 struct SwsContextArmJit *s = sws_arm_jit_create_scaler_internal(source_w, source_h, target_w, target_h, quality, FMT_OMAPFB_YUV420);
/* Only touch the flag when creation returned a context. */
352 if (s) s->armv6_is_supported = 1;
/*
 * Release a scaler context: unmap the generated-code buffer and free the
 * row lookup table. A NULL context is accepted and ignored.
 * NOTE(review): the release of the context structure itself is not visible
 * in this listing - confirm it is freed.
 */
356 void sws_arm_jit_free(SwsContextArmJit *context)
358 if (!context) return;
/* The size must match the INSTRUCTIONS_CACHE_SIZE used for mmap() at creation. */
359 munmap(context->codebuffer, INSTRUCTIONS_CACHE_SIZE);
360 free(context->linebuffer);
/*
 * Vertical-only path, taken by sws_arm_jit_scale_internal() when the source
 * and target widths are equal. It does not call the JIT-generated code;
 * every output row i is converted from source row linebuffer[i] by a
 * prebuilt line routine.
 *
 * src[0]/srcStride[0] is the luma plane. src[1] and src[2] are the chroma
 * planes, indexed with j / 2. dst[0]/dstStride[0] is the packed output.
 */
364 static int sws_arm_jit_vscaleonly_internal(SwsContextArmJit *context, uint8_t* src[], int srcStride[], uint8_t* dst[], int dstStride[])
368 if (context->fmt == FMT_OMAPFB_YUV420) {
/* Default to the generic ARM line routine; switch to the ARMv6 one when the context allows it. */
369 void (*yv12_to_yuv420_line)(uint16_t *dst, const uint16_t *src_y, const uint8_t *src_c, int w) =
370 yv12_to_yuv420_line_arm;
371 if (context->armv6_is_supported) yv12_to_yuv420_line = yv12_to_yuv420_line_armv6;
373 for (i = 0; i < context->target_h; i++) {
374 j = context->linebuffer[i];
/*
 * Each output line takes its chroma from either src[2] or src[1].
 * NOTE(review): the condition choosing between the two calls below is not
 * visible in this listing.
 */
376 yv12_to_yuv420_line((uint16_t *)(dst[0] + i * dstStride[0]),
377 src[0] + j * srcStride[0], src[2] + (j / 2) * srcStride[2], context->target_w);
379 yv12_to_yuv420_line((uint16_t *)(dst[0] + i * dstStride[0]),
380 src[0] + j * srcStride[0], src[1] + (j / 2) * srcStride[1], context->target_w);
384 } else if (context->fmt == FMT_OMAPFB_YUV422) {
/* The YUV422 output takes both chroma planes on every line. */
385 void (*yv12_to_yuy2_line)(uint16_t *dst, const uint16_t *src_y, const uint8_t *src_u, const uint8_t *src_v, int w) =
386 yv12_to_yuy2_line_arm;
387 for (i = 0; i < context->target_h; i++) {
388 j = context->linebuffer[i];
390 dst[0] + i * dstStride[0],
391 src[0] + j * srcStride[0],
392 src[1] + (j / 2) * srcStride[1],
393 src[2] + (j / 2) * srcStride[2],
/*
 * Scale a whole frame. When source and target widths are equal the work is
 * handed to sws_arm_jit_vscaleonly_internal(). Otherwise every output row i
 * is produced from source row linebuffer[i], with the arguments visible
 * below passed to the JIT-generated line scaler in context->codebuffer.
 *
 * src[0] is the luma plane, src[1]/src[2] the chroma planes (indexed with
 * j / 2), and dst[0] the packed output.
 */
401 static int sws_arm_jit_scale_internal(SwsContextArmJit *context, uint8_t* src[], int srcStride[], uint8_t* dst[], int dstStride[])
/* View the generated code buffer as a function taking three source pointers and an output pointer. */
404 void (*scale_line)(uint8_t *y, uint8_t *u, uint8_t *v, uint8_t *out) =
405 (void (*)(uint8_t *, uint8_t *, uint8_t *, uint8_t *))context->codebuffer;
/* No horizontal scaling needed: use the prebuilt line routines instead of the JIT code. */
407 if (context->source_w == context->target_w)
408 return sws_arm_jit_vscaleonly_internal(context, src, srcStride, dst, dstStride);
410 if (context->fmt == FMT_OMAPFB_YUV422) {
411 for (i = 0; i < context->target_h; i++) {
412 j = context->linebuffer[i];
414 src[0] + j * srcStride[0],
415 src[1] + (j / 2) * srcStride[1],
416 src[2] + (j / 2) * srcStride[2],
417 dst[0] + i * dstStride[0]);
420 } else if (context->fmt == FMT_OMAPFB_YUV420) {
421 for (i = 0; i < context->target_h; i++) {
422 j = context->linebuffer[i];
424 src[0] + j * srcStride[0],
/* Odd output rows take chroma from src[2], even rows from src[1]. */
425 (i & 1) ? (src[2] + (j / 2) * srcStride[2]) : (src[1] + (j / 2) * srcStride[1]),
427 dst[0] + i * dstStride[0]);
/*
 * Public entry point; per the file's own note the API is similar to
 * libswscale's.
 *
 * Only whole-frame calls are accepted: y must be 0 and h must equal the
 * source height stored in the context, otherwise nothing is done and 0 is
 * returned. On success the result of sws_arm_jit_scale_internal() is
 * returned.
 */
434 int sws_arm_jit_scale(SwsContextArmJit *context, uint8_t* src[], int srcStride[], int y, int h, uint8_t* dst[], int dstStride[])
436 if (y != 0 || h != context->source_h) return 0; // Slices are not supported yet
437 return sws_arm_jit_scale_internal(context, src, srcStride, dst, dstStride);