/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/cpu.h"
#include "libavcodec/hpeldsp.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "dsputil_altivec.h"
35 /* next one assumes that ((line_size % 16) == 0) */
36 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
38 register vector unsigned char pixelsv1, pixelsv2;
39 register vector unsigned char pixelsv1B, pixelsv2B;
40 register vector unsigned char pixelsv1C, pixelsv2C;
41 register vector unsigned char pixelsv1D, pixelsv2D;
43 register vector unsigned char perm = vec_lvsl(0, pixels);
45 register ptrdiff_t line_size_2 = line_size << 1;
46 register ptrdiff_t line_size_3 = line_size + line_size_2;
47 register ptrdiff_t line_size_4 = line_size << 2;
49 // hand-unrolling the loop by 4 gains about 15%
50 // mininum execution time goes from 74 to 60 cycles
51 // it's faster than -funroll-loops, but using
52 // -funroll-loops w/ this is bad - 74 cycles again.
53 // all this is on a 7450, tuning for the 7450
54 for (i = 0; i < h; i += 4) {
55 pixelsv1 = vec_ld( 0, pixels);
56 pixelsv2 = vec_ld(15, pixels);
57 pixelsv1B = vec_ld(line_size, pixels);
58 pixelsv2B = vec_ld(15 + line_size, pixels);
59 pixelsv1C = vec_ld(line_size_2, pixels);
60 pixelsv2C = vec_ld(15 + line_size_2, pixels);
61 pixelsv1D = vec_ld(line_size_3, pixels);
62 pixelsv2D = vec_ld(15 + line_size_3, pixels);
63 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
64 0, (unsigned char*)block);
65 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
66 line_size, (unsigned char*)block);
67 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
68 line_size_2, (unsigned char*)block);
69 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
70 line_size_3, (unsigned char*)block);
76 /* next one assumes that ((line_size % 16) == 0) */
77 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
78 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
80 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
81 register vector unsigned char perm = vec_lvsl(0, pixels);
84 for (i = 0; i < h; i++) {
85 pixelsv1 = vec_ld( 0, pixels);
86 pixelsv2 = vec_ld(16,pixels);
87 blockv = vec_ld(0, block);
88 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
89 blockv = vec_avg(blockv,pixelsv);
90 vec_st(blockv, 0, (unsigned char*)block);
96 /* next one assumes that ((line_size % 8) == 0) */
97 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
99 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
102 for (i = 0; i < h; i++) {
103 /* block is 8 bytes-aligned, so we're either in the
104 left block (16 bytes-aligned) or in the right block (not) */
105 int rightside = ((unsigned long)block & 0x0000000F);
107 blockv = vec_ld(0, block);
108 pixelsv1 = vec_ld( 0, pixels);
109 pixelsv2 = vec_ld(16, pixels);
110 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
113 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
115 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
118 blockv = vec_avg(blockv, pixelsv);
120 vec_st(blockv, 0, block);
127 /* next one assumes that ((line_size % 8) == 0) */
128 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
131 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
132 register vector unsigned char blockv, temp1, temp2;
133 register vector unsigned short pixelssum1, pixelssum2, temp3;
134 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
135 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
137 temp1 = vec_ld(0, pixels);
138 temp2 = vec_ld(16, pixels);
139 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
140 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
143 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
145 pixelsv1 = vec_mergeh(vczero, pixelsv1);
146 pixelsv2 = vec_mergeh(vczero, pixelsv2);
147 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
148 (vector unsigned short)pixelsv2);
149 pixelssum1 = vec_add(pixelssum1, vctwo);
151 for (i = 0; i < h ; i++) {
152 int rightside = ((unsigned long)block & 0x0000000F);
153 blockv = vec_ld(0, block);
155 temp1 = vec_ld(line_size, pixels);
156 temp2 = vec_ld(line_size + 16, pixels);
157 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
158 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
161 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
164 pixelsv1 = vec_mergeh(vczero, pixelsv1);
165 pixelsv2 = vec_mergeh(vczero, pixelsv2);
166 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
167 (vector unsigned short)pixelsv2);
168 temp3 = vec_add(pixelssum1, pixelssum2);
169 temp3 = vec_sra(temp3, vctwo);
170 pixelssum1 = vec_add(pixelssum2, vctwo);
171 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
174 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
176 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
179 vec_st(blockv, 0, block);
186 /* next one assumes that ((line_size % 8) == 0) */
187 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
190 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
191 register vector unsigned char blockv, temp1, temp2;
192 register vector unsigned short pixelssum1, pixelssum2, temp3;
193 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
194 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
195 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
197 temp1 = vec_ld(0, pixels);
198 temp2 = vec_ld(16, pixels);
199 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
200 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
203 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
205 pixelsv1 = vec_mergeh(vczero, pixelsv1);
206 pixelsv2 = vec_mergeh(vczero, pixelsv2);
207 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
208 (vector unsigned short)pixelsv2);
209 pixelssum1 = vec_add(pixelssum1, vcone);
211 for (i = 0; i < h ; i++) {
212 int rightside = ((unsigned long)block & 0x0000000F);
213 blockv = vec_ld(0, block);
215 temp1 = vec_ld(line_size, pixels);
216 temp2 = vec_ld(line_size + 16, pixels);
217 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
218 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
221 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
224 pixelsv1 = vec_mergeh(vczero, pixelsv1);
225 pixelsv2 = vec_mergeh(vczero, pixelsv2);
226 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
227 (vector unsigned short)pixelsv2);
228 temp3 = vec_add(pixelssum1, pixelssum2);
229 temp3 = vec_sra(temp3, vctwo);
230 pixelssum1 = vec_add(pixelssum2, vcone);
231 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
234 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
236 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
239 vec_st(blockv, 0, block);
246 /* next one assumes that ((line_size % 16) == 0) */
247 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
250 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
251 register vector unsigned char blockv, temp1, temp2;
252 register vector unsigned short temp3, temp4,
253 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
254 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
255 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
257 temp1 = vec_ld(0, pixels);
258 temp2 = vec_ld(16, pixels);
259 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
260 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
263 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
265 pixelsv3 = vec_mergel(vczero, pixelsv1);
266 pixelsv4 = vec_mergel(vczero, pixelsv2);
267 pixelsv1 = vec_mergeh(vczero, pixelsv1);
268 pixelsv2 = vec_mergeh(vczero, pixelsv2);
269 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
270 (vector unsigned short)pixelsv4);
271 pixelssum3 = vec_add(pixelssum3, vctwo);
272 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
273 (vector unsigned short)pixelsv2);
274 pixelssum1 = vec_add(pixelssum1, vctwo);
276 for (i = 0; i < h ; i++) {
277 blockv = vec_ld(0, block);
279 temp1 = vec_ld(line_size, pixels);
280 temp2 = vec_ld(line_size + 16, pixels);
281 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
282 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
285 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
288 pixelsv3 = vec_mergel(vczero, pixelsv1);
289 pixelsv4 = vec_mergel(vczero, pixelsv2);
290 pixelsv1 = vec_mergeh(vczero, pixelsv1);
291 pixelsv2 = vec_mergeh(vczero, pixelsv2);
293 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
294 (vector unsigned short)pixelsv4);
295 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
296 (vector unsigned short)pixelsv2);
297 temp4 = vec_add(pixelssum3, pixelssum4);
298 temp4 = vec_sra(temp4, vctwo);
299 temp3 = vec_add(pixelssum1, pixelssum2);
300 temp3 = vec_sra(temp3, vctwo);
302 pixelssum3 = vec_add(pixelssum4, vctwo);
303 pixelssum1 = vec_add(pixelssum2, vctwo);
305 blockv = vec_packsu(temp3, temp4);
307 vec_st(blockv, 0, block);
314 /* next one assumes that ((line_size % 16) == 0) */
315 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
318 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
319 register vector unsigned char blockv, temp1, temp2;
320 register vector unsigned short temp3, temp4,
321 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
322 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
323 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
324 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
326 temp1 = vec_ld(0, pixels);
327 temp2 = vec_ld(16, pixels);
328 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
329 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
332 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
334 pixelsv3 = vec_mergel(vczero, pixelsv1);
335 pixelsv4 = vec_mergel(vczero, pixelsv2);
336 pixelsv1 = vec_mergeh(vczero, pixelsv1);
337 pixelsv2 = vec_mergeh(vczero, pixelsv2);
338 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
339 (vector unsigned short)pixelsv4);
340 pixelssum3 = vec_add(pixelssum3, vcone);
341 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
342 (vector unsigned short)pixelsv2);
343 pixelssum1 = vec_add(pixelssum1, vcone);
345 for (i = 0; i < h ; i++) {
346 blockv = vec_ld(0, block);
348 temp1 = vec_ld(line_size, pixels);
349 temp2 = vec_ld(line_size + 16, pixels);
350 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
351 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
354 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
357 pixelsv3 = vec_mergel(vczero, pixelsv1);
358 pixelsv4 = vec_mergel(vczero, pixelsv2);
359 pixelsv1 = vec_mergeh(vczero, pixelsv1);
360 pixelsv2 = vec_mergeh(vczero, pixelsv2);
362 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
363 (vector unsigned short)pixelsv4);
364 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
365 (vector unsigned short)pixelsv2);
366 temp4 = vec_add(pixelssum3, pixelssum4);
367 temp4 = vec_sra(temp4, vctwo);
368 temp3 = vec_add(pixelssum1, pixelssum2);
369 temp3 = vec_sra(temp3, vctwo);
371 pixelssum3 = vec_add(pixelssum4, vcone);
372 pixelssum1 = vec_add(pixelssum2, vcone);
374 blockv = vec_packsu(temp3, temp4);
376 vec_st(blockv, 0, block);
383 /* next one assumes that ((line_size % 8) == 0) */
384 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
387 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
388 register vector unsigned char blockv, temp1, temp2, blocktemp;
389 register vector unsigned short pixelssum1, pixelssum2, temp3;
391 register const vector unsigned char vczero = (const vector unsigned char)
393 register const vector unsigned short vctwo = (const vector unsigned short)
396 temp1 = vec_ld(0, pixels);
397 temp2 = vec_ld(16, pixels);
398 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
399 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
402 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
404 pixelsv1 = vec_mergeh(vczero, pixelsv1);
405 pixelsv2 = vec_mergeh(vczero, pixelsv2);
406 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
407 (vector unsigned short)pixelsv2);
408 pixelssum1 = vec_add(pixelssum1, vctwo);
410 for (i = 0; i < h ; i++) {
411 int rightside = ((unsigned long)block & 0x0000000F);
412 blockv = vec_ld(0, block);
414 temp1 = vec_ld(line_size, pixels);
415 temp2 = vec_ld(line_size + 16, pixels);
416 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
417 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
420 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
423 pixelsv1 = vec_mergeh(vczero, pixelsv1);
424 pixelsv2 = vec_mergeh(vczero, pixelsv2);
425 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
426 (vector unsigned short)pixelsv2);
427 temp3 = vec_add(pixelssum1, pixelssum2);
428 temp3 = vec_sra(temp3, vctwo);
429 pixelssum1 = vec_add(pixelssum2, vctwo);
430 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
433 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
435 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
438 blockv = vec_avg(blocktemp, blockv);
439 vec_st(blockv, 0, block);
445 #endif /* HAVE_ALTIVEC */
447 void ff_hpeldsp_init_ppc(HpelDSPContext* c, int flags)
450 int mm_flags = av_get_cpu_flags();
452 if (mm_flags & AV_CPU_FLAG_ALTIVEC) {
453 c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
454 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
455 c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
456 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
457 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
458 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
459 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
460 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
461 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
463 #endif /* HAVE_ALTIVEC */