2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
28 #include "libavutil/ppc/util_altivec.h"
30 #include "libavcodec/hpeldsp.h"
32 #include "hpeldsp_altivec.h"
35 /* next one assumes that ((line_size % 16) == 0) */
/* put_pixels 16xN: plain copy of h rows of 16 bytes from pixels to block.
 * Source rows are fetched with unaligned loads; destination rows are
 * written with VEC_ST.  NOTE(review): the loop advances i by 4, so h is
 * assumed to be a multiple of 4 -- confirm against callers. */
36 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
38 register vector unsigned char pixelsv1;
39 register vector unsigned char pixelsv1B;
40 register vector unsigned char pixelsv1C;
41 register vector unsigned char pixelsv1D;
/* Row offsets 2*, 3* and 4*line_size, precomputed for the unrolled loop. */
44 register ptrdiff_t line_size_2 = line_size << 1;
45 register ptrdiff_t line_size_3 = line_size + line_size_2;
46 register ptrdiff_t line_size_4 = line_size << 2;
48 // hand-unrolling the loop by 4 gains about 15%
49 // mininum execution time goes from 74 to 60 cycles
50 // it's faster than -funroll-loops, but using
51 // -funroll-loops w/ this is bad - 74 cycles again.
52 // all this is on a 7450, tuning for the 7450
53 for (i = 0; i < h; i += 4) {
/* Load four consecutive source rows... */
54 pixelsv1 = unaligned_load( 0, pixels);
55 pixelsv1B = unaligned_load(line_size, pixels);
56 pixelsv1C = unaligned_load(line_size_2, pixels);
57 pixelsv1D = unaligned_load(line_size_3, pixels);
/* ...and store them to the corresponding destination rows. */
58 VEC_ST(pixelsv1, 0, (unsigned char*)block);
59 VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
60 VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
61 VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
67 /* next one assumes that ((line_size % 16) == 0) */
/* Scalar byte-wise rounding average:  (a|b) - (((a^b) & 0xFE..) >> 1)
 * equals (a + b + 1) >> 1 on each packed byte.  NOTE(review): op_avg is
 * not referenced in the AltiVec function below, which uses vec_avg. */
68 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/* avg_pixels 16xN: block = rounded average of block and pixels, one
 * 16-byte row per iteration; vec_avg computes (a + b + 1) >> 1 per byte. */
69 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
71 register vector unsigned char pixelsv, blockv;
74 for (i = 0; i < h; i++) {
75 blockv = vec_ld(0, block);
76 pixelsv = VEC_LD( 0, pixels);
77 blockv = vec_avg(blockv,pixelsv);
78 vec_st(blockv, 0, (unsigned char*)block);
84 /* next one assumes that ((line_size % 8) == 0) */
/* avg_pixels 8xN: average 8 source bytes per row into an 8-byte-aligned
 * destination.  The full 16-byte vector containing the destination is
 * loaded, the 8 source bytes are spliced into the matching half, and the
 * whole vector is averaged and stored back; the untouched half is
 * averaged with itself, i.e. preserved. */
85 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
87 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
90 for (i = 0; i < h; i++) {
91 /* block is 8 bytes-aligned, so we're either in the
92 left block (16 bytes-aligned) or in the right block (not) */
93 int rightside = ((unsigned long)block & 0x0000000F);
95 blockv = vec_ld(0, block);
96 pixelsv = VEC_LD( 0, pixels);
/* NOTE(review): the two permutes below are alternatives -- source bytes
 * into the high or the low half -- presumably selected by rightside;
 * confirm the selection logic. */
99 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
101 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
104 blockv = vec_avg(blockv, pixelsv);
106 vec_st(blockv, 0, block);
113 /* next one assumes that ((line_size % 8) == 0) */
/* put_pixels 8xN with half-pel interpolation in both x and y (rounding):
 *   dst[x] = (p[x] + p[x+1] + n[x] + n[x+1] + 2) >> 2
 * where n is the next source row.  Horizontal pair-sums are kept as
 * unsigned shorts; pixelssum1 carries the previous row's pair-sum plus
 * the rounding constant 2 from one iteration to the next. */
114 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
117 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
118 register vector unsigned char blockv;
119 register vector unsigned short pixelssum1, pixelssum2, temp3;
120 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
121 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
/* Prologue: pair-sum of row 0 (bytes at x and x+1, zero-extended to
 * shorts by merging with vczero), plus the rounding constant. */
123 pixelsv1 = VEC_LD(0, pixels);
124 pixelsv2 = VEC_LD(1, pixels);
125 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
126 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
128 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
129 (vector unsigned short)pixelsv2);
130 pixelssum1 = vec_add(pixelssum1, vctwo);
132 for (i = 0; i < h ; i++) {
133 int rightside = ((unsigned long)block & 0x0000000F);
134 blockv = vec_ld(0, block);
/* Pair-sum of the next source row. */
136 pixelsv1 = unaligned_load(line_size, pixels);
137 pixelsv2 = unaligned_load(line_size+1, pixels);
138 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
139 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
140 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
141 (vector unsigned short)pixelsv2);
/* (prev + cur + 2) >> 2; then carry cur + 2 into the next iteration.
 * vctwo doubles as the shift count for vec_sra. */
142 temp3 = vec_add(pixelssum1, pixelssum2);
143 temp3 = vec_sra(temp3, vctwo);
144 pixelssum1 = vec_add(pixelssum2, vctwo);
/* Saturating pack of the 8 result shorts back to bytes (low half). */
145 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
/* NOTE(review): the two permutes are alternatives, presumably selected
 * by rightside -- confirm. */
148 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
150 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
153 vec_st(blockv, 0, block);
160 /* next one assumes that ((line_size % 8) == 0) */
/* put_no_rnd_pixels 8xN, half-pel in x and y, no-rounding variant:
 *   dst[x] = (p[x] + p[x+1] + n[x] + n[x+1] + 1) >> 2
 * Identical to put_pixels8_xy2_altivec except the carried rounding
 * constant is 1 (vcone) instead of 2; vctwo is only the shift count. */
161 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
164 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
165 register vector unsigned char blockv;
166 register vector unsigned short pixelssum1, pixelssum2, temp3;
167 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
168 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
169 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
/* Prologue: pair-sum of row 0, zero-extended to shorts, plus 1. */
171 pixelsv1 = VEC_LD(0, pixels);
172 pixelsv2 = VEC_LD(1, pixels);
173 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
174 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
175 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
176 (vector unsigned short)pixelsv2);
177 pixelssum1 = vec_add(pixelssum1, vcone);
179 for (i = 0; i < h ; i++) {
180 int rightside = ((unsigned long)block & 0x0000000F);
181 blockv = vec_ld(0, block);
/* Pair-sum of the next source row. */
183 pixelsv1 = unaligned_load(line_size, pixels);
184 pixelsv2 = unaligned_load(line_size+1, pixels);
185 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
186 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
187 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
188 (vector unsigned short)pixelsv2);
/* (prev + cur + 1) >> 2; carry cur + 1 forward. */
189 temp3 = vec_add(pixelssum1, pixelssum2);
190 temp3 = vec_sra(temp3, vctwo);
191 pixelssum1 = vec_add(pixelssum2, vcone);
192 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
/* NOTE(review): the two permutes are alternatives, presumably selected
 * by rightside -- confirm. */
195 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
197 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
200 vec_st(blockv, 0, block);
207 /* next one assumes that ((line_size % 16) == 0) */
/* put_pixels 16xN, half-pel in x and y, with rounding.  Same scheme as
 * the 8-pixel version, but both halves of a 16-byte row are processed:
 * VEC_MERGEH zero-extends bytes 0-7 and VEC_MERGEL bytes 8-15, so two
 * pair-sum accumulators are carried across rows (pixelssum1 for the low
 * bytes, pixelssum3 for the high ones) and vec_packsu reassembles the
 * full 16-byte result. */
208 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
211 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
212 register vector unsigned char blockv;
213 register vector unsigned short temp3, temp4,
214 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
215 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
216 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
/* Prologue: pair-sums of row 0 for both halves, plus rounding 2. */
218 pixelsv1 = VEC_LD(0, pixels);
219 pixelsv2 = VEC_LD(1, pixels);
220 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
221 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
222 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
223 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
224 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
225 (vector unsigned short)pixelsv4);
226 pixelssum3 = vec_add(pixelssum3, vctwo);
227 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
228 (vector unsigned short)pixelsv2);
229 pixelssum1 = vec_add(pixelssum1, vctwo);
231 for (i = 0; i < h ; i++) {
232 blockv = vec_ld(0, block);
/* Pair-sums of the next source row, both halves. */
234 pixelsv1 = unaligned_load(line_size, pixels);
235 pixelsv2 = unaligned_load(line_size+1, pixels);
237 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
238 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
239 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
240 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
241 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
242 (vector unsigned short)pixelsv4);
243 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
244 (vector unsigned short)pixelsv2);
/* (prev + cur + 2) >> 2 for the high half (temp4) and low half (temp3). */
245 temp4 = vec_add(pixelssum3, pixelssum4);
246 temp4 = vec_sra(temp4, vctwo);
247 temp3 = vec_add(pixelssum1, pixelssum2);
248 temp3 = vec_sra(temp3, vctwo);
/* Carry current-row sums (+2) into the next iteration. */
250 pixelssum3 = vec_add(pixelssum4, vctwo);
251 pixelssum1 = vec_add(pixelssum2, vctwo);
/* Saturating pack of both 8-short halves back into 16 bytes. */
253 blockv = vec_packsu(temp3, temp4);
255 vec_st(blockv, 0, block);
262 /* next one assumes that ((line_size % 16) == 0) */
/* put_no_rnd_pixels 16xN, half-pel in x and y, no-rounding variant:
 * identical to put_pixels16_xy2_altivec except the carried rounding
 * constant is 1 (vcone); vctwo serves only as the shift count.  The
 * full 16-byte result is written with VEC_ST (no destination load). */
263 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
266 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
267 register vector unsigned char blockv;
268 register vector unsigned short temp3, temp4,
269 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
270 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
271 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
272 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
/* Prologue: pair-sums of row 0 for both halves, plus rounding 1. */
274 pixelsv1 = VEC_LD(0, pixels);
275 pixelsv2 = VEC_LD(1, pixels);
276 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
277 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
278 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
279 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
280 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
281 (vector unsigned short)pixelsv4);
282 pixelssum3 = vec_add(pixelssum3, vcone);
283 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
284 (vector unsigned short)pixelsv2);
285 pixelssum1 = vec_add(pixelssum1, vcone);
287 for (i = 0; i < h ; i++) {
/* Pair-sums of the next source row, both halves. */
288 pixelsv1 = unaligned_load(line_size, pixels);
289 pixelsv2 = unaligned_load(line_size+1, pixels);
291 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
292 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
293 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
294 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
295 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
296 (vector unsigned short)pixelsv4);
297 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
298 (vector unsigned short)pixelsv2);
/* (prev + cur + 1) >> 2 for the high (temp4) and low (temp3) halves. */
299 temp4 = vec_add(pixelssum3, pixelssum4);
300 temp4 = vec_sra(temp4, vctwo);
301 temp3 = vec_add(pixelssum1, pixelssum2);
302 temp3 = vec_sra(temp3, vctwo);
/* Carry current-row sums (+1) into the next iteration. */
304 pixelssum3 = vec_add(pixelssum4, vcone);
305 pixelssum1 = vec_add(pixelssum2, vcone);
/* Pack both halves and store the full 16-byte row. */
307 blockv = vec_packsu(temp3, temp4);
309 VEC_ST(blockv, 0, block);
316 /* next one assumes that ((line_size % 8) == 0) */
/* avg_pixels 8xN, half-pel in x and y, with rounding: computes the same
 * interpolation as put_pixels8_xy2_altivec -- (p[x] + p[x+1] + n[x] +
 * n[x+1] + 2) >> 2 -- splices it into blocktemp, then averages with the
 * existing destination via vec_avg before storing. */
317 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
320 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
321 register vector unsigned char blockv, blocktemp;
322 register vector unsigned short pixelssum1, pixelssum2, temp3;
324 register const vector unsigned char vczero = (const vector unsigned char)
326 register const vector unsigned short vctwo = (const vector unsigned short)
/* Prologue: pair-sum of row 0, zero-extended to shorts, plus 2. */
329 pixelsv1 = VEC_LD(0, pixels);
330 pixelsv2 = VEC_LD(1, pixels);
331 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
332 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
333 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
334 (vector unsigned short)pixelsv2);
335 pixelssum1 = vec_add(pixelssum1, vctwo);
337 for (i = 0; i < h ; i++) {
338 int rightside = ((unsigned long)block & 0x0000000F);
339 blockv = vec_ld(0, block);
/* Pair-sum of the next source row. */
341 pixelsv1 = unaligned_load(line_size, pixels);
342 pixelsv2 = unaligned_load(line_size+1, pixels);
344 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
345 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
346 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
347 (vector unsigned short)pixelsv2);
/* (prev + cur + 2) >> 2; carry cur + 2 forward. */
348 temp3 = vec_add(pixelssum1, pixelssum2);
349 temp3 = vec_sra(temp3, vctwo);
350 pixelssum1 = vec_add(pixelssum2, vctwo);
351 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
/* NOTE(review): the two permutes are alternatives, presumably selected
 * by rightside -- confirm. */
354 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
356 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
/* Rounded average of the interpolated row with the existing block. */
359 blockv = vec_avg(blocktemp, blockv);
360 vec_st(blockv, 0, block);
366 #endif /* HAVE_ALTIVEC */
/* Install the AltiVec hpeldsp implementations into the HpelDSPContext
 * function-pointer tables.  Table layout as used here: first index 0 =
 * 16-pixel-wide, 1 = 8-pixel-wide; second index 0 = no interpolation,
 * 3 = half-pel in both x and y (xy2). */
368 av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
/* Skip when the running CPU lacks AltiVec support. */
371     if (!PPC_ALTIVEC(av_get_cpu_flags()))
374     c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
375     c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
376     c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
378     c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
379     c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
380     c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
/* The plain 16-wide copy is exact, so it also serves as the no-rnd put. */
382     c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
383     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
384     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
385 #endif /* HAVE_ALTIVEC */