/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
#if HAVE_VSX
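/* Two builds of ff_put_pixels16_altivec() follow: on VSX-capable CPUs,
 * vec_vsx_ld()/vec_vsx_st() handle misaligned rows directly; the plain
 * AltiVec fallback below the #else emulates misaligned loads with a
 * vec_lvsl() permute vector and vec_perm(). */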
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1;
    register vector unsigned char pixelsv1B;
    register vector unsigned char pixelsv1C;
    register vector unsigned char pixelsv1D;

    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_vsx_ld( 0, pixels);
        pixelsv1B = vec_vsx_ld(line_size, pixels);
        pixelsv1C = vec_vsx_ld(line_size_2, pixels);
        pixelsv1D = vec_vsx_ld(line_size_3, pixels);
        vec_vsx_st(pixelsv1, 0, (unsigned char*)block);
        vec_vsx_st(pixelsv1B, line_size, (unsigned char*)block);
        vec_vsx_st(pixelsv1C, line_size_2, (unsigned char*)block);
        vec_vsx_st(pixelsv1D, line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}
#else
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}
#endif /* HAVE_VSX */
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
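/* op_avg() is a scalar SWAR rounding average over four packed bytes: for
 * each byte it computes (a + b + 1) >> 1 via the identity
 * avg(a,b) = (a | b) - ((a ^ b) >> 1); the 0xFEFEFEFE mask clears the bits
 * that would otherwise shift across byte boundaries. */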
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        blockv   = vec_ld(0, block);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, perm);
        blockv   = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
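/* Rows are only 8 bytes wide, but AltiVec stores are 16 bytes: the code
 * loads the full 16-byte destination vector, merges the new 8 pixels into
 * whichever half "block" points at, and stores the whole vector back. */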
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv   = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0, 1, s0, s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, block);

        pixels += line_size;
        block  += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
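/* Half-pel interpolation in both directions: each output pixel is the
 * rounded average of a 2x2 neighbourhood, dst[x] = (A + B + C + D + 2) >> 2.
 * The previous row's pair sums (plus the rounding bias) are carried in
 * pixelssum1 across iterations, so each source row is widened to 16-bit
 * lanes and summed only once. */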
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
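/* Same interpolation as above but without rounding: the bias added to the
 * running row sums is 1 (vcone) instead of 2, giving
 * dst[x] = (A + B + C + D + 1) >> 2. */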
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 16) == 0) */
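/* 16-pixel-wide variant: the widened 16-bit row sums no longer fit in one
 * vector, so vec_mergeh()/vec_mergel() split each row into low and high
 * 8-pixel halves that are interpolated in parallel and re-packed with
 * vec_packsu(); the full 16-byte result is stored directly. */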
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);
    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 16) == 0) */
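/* 16-pixel-wide no-rounding variant: identical structure to the function
 * above, with the +1 bias (vcone) in the running sums instead of +2. */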
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);
    for (i = 0; i < h; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
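/* Averaging version of the 8-pixel half-pel interpolator: the
 * (A + B + C + D + 2) >> 2 result is computed exactly as in
 * put_pixels8_xy2_altivec() and then vec_avg()'ed with the pixels already
 * in the destination block. */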
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                                 vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                                 vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);
    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
#endif /* HAVE_ALTIVEC */
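
/* The hpeldsp function tables are indexed [size][xy]: the first index
 * selects the block width (0 = 16 pixels, 1 = 8 pixels), the second the
 * half-pel position (0 = full pel, 1 = horizontal, 2 = vertical, 3 = both). */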
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif /* HAVE_ALTIVEC */
}