1 /*****************************************************************************
2 * merge.c : Merge (line blending) routines for the VLC deinterlacer
3 *****************************************************************************
4 * Copyright (C) 2011 the VideoLAN team
7 * Author: Sam Hocevar <sam@zoy.org> (generic C routine)
8 * Sigmund Augdal Helberg <sigmunau@videolan.org> (MMXEXT, 3DNow, SSE2)
9 * Eric Petit <eric.petit@lapsus.org> (Altivec)
* Rémi Denis-Courmont <remi@remlab.net> (ARM NEON)
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25 *****************************************************************************/
36 #ifdef CAN_COMPILE_MMXEXT
44 /*****************************************************************************
45 * Merge (line blending) routines
46 *****************************************************************************/
/**
 * Blend two source lines into the destination, byte by byte:
 * dest[i] = (s1[i] + s2[i]) >> 1 (truncating average).
 *
 * Generic C fallback used when no SIMD variant is available, and as the
 * prologue/epilogue helper for the SIMD paths.
 *
 * \param _p_dest destination line, i_bytes bytes written
 * \param _p_s1   first source line, i_bytes bytes read
 * \param _p_s2   second source line, i_bytes bytes read
 * \param i_bytes number of bytes to blend; any value, including 0..7
 */
void MergeGeneric( void *_p_dest, const void *_p_s1,
                   const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t *p_end = p_dest + i_bytes;

    /* Unrolled main loop, 8 bytes per iteration.  Guarded so we never
     * compute p_end - 8 when i_bytes < 8: that would form a pointer
     * before the start of the buffer, which is undefined behaviour. */
    if( i_bytes >= 8 )
    {
        uint8_t *p_end8 = p_end - 8;

        while( p_dest < p_end8 )
        {
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
            *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        }
    }

    /* Scalar tail for the remaining 0..8 bytes. */
    while( p_dest < p_end )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
}
#if defined(CAN_COMPILE_MMXEXT)
/* MMXEXT variant of the line-blending routine: averages the two source
 * lines into the destination, 8 bytes per MMX register in the main loop,
 * then falls back to scalar C for the trailing bytes. */
void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    /* Stop 8 bytes before the end so the 8-byte asm step cannot overrun.
     * NOTE(review): this forms p_dest + i_bytes - 8, which is out of
     * bounds when i_bytes < 8 — callers presumably pass whole picture
     * lines; verify. */
    uint8_t* p_end = p_dest + i_bytes - 8;
    while( p_dest < p_end )
        /* Load 8 source bytes into mm1, blend, store 8 bytes to *p_dest.
         * NOTE(review): the averaging opcode between the two movq
         * instructions is not visible in this view — presumably pavgb;
         * confirm against the upstream file. */
        __asm__ __volatile__( "movq %2,%%mm1;"
                              "movq %%mm1, %0" :"=m" (*p_dest):
    /* Scalar tail: blend the remaining bytes one at a time. */
    while( p_dest < p_end )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
#if defined(CAN_COMPILE_3DNOW)
/* 3DNow! variant of the line-blending routine: 8 bytes per MMX register
 * in the main loop, scalar C for the tail.  The MMX state is cleared
 * afterwards by End3DNow() (femms), defined at the bottom of this file. */
void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    /* Stop 8 bytes before the end so the 8-byte asm step cannot overrun.
     * NOTE(review): out-of-bounds pointer if i_bytes < 8 — see
     * MergeMMXEXT above; verify caller guarantees. */
    uint8_t* p_end = p_dest + i_bytes - 8;
    while( p_dest < p_end )
        /* Load 8 bytes into mm1, blend, store 8 bytes.  NOTE(review):
         * the averaging opcode is not visible in this view — presumably
         * pavgusb (the 3DNow! packed byte average); confirm upstream. */
        __asm__ __volatile__( "movq %2,%%mm1;"
                              "movq %%mm1, %0" :"=m" (*p_dest):
    /* Scalar tail: blend the remaining bytes one at a time. */
    while( p_dest < p_end )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
#if defined(CAN_COMPILE_SSE)
/* SSE2 variant of the line-blending routine: averages the two source
 * lines 16 bytes at a time through an XMM register, after a scalar
 * prologue that walks p_s1 up to 16-byte alignment.
 * NOTE(review): the guard macro is CAN_COMPILE_SSE although the routine
 * is the SSE2 path (movdqu) — confirm the macro name is intentional. */
void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
    uint8_t* p_dest = (uint8_t*)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    /* Scalar prologue until the first source pointer is 16-byte aligned.
     * NOTE(review): this view does not show i_bytes being reduced here —
     * verify the prologue's byte count is accounted for elsewhere. */
    while( (uintptr_t)p_s1 % 16 )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    /* Stop 16 bytes before the end so the XMM step cannot overrun. */
    p_end = p_dest + i_bytes - 16;
    while( p_dest < p_end )
        /* Load 16 source bytes into xmm1, blend, store 16 bytes.
         * NOTE(review): the averaging opcode between the two movdqu
         * instructions is not visible in this view — presumably pavgb
         * on xmm registers; confirm against the upstream file. */
        __asm__ __volatile__( "movdqu %2,%%xmm1;"
                              "movdqu %%xmm1, %0" :"=m" (*p_dest):
    /* Scalar tail: blend the remaining bytes one at a time. */
    while( p_dest < p_end )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
#ifdef CAN_COMPILE_C_ALTIVEC
/* AltiVec variant: averages the two source lines into the destination
 * with vec_avg, 16 bytes per vector.  NOTE(review): per the AltiVec PIM,
 * vec_avg computes the *rounded* average (a+b+1)>>1 while the scalar
 * code truncates (a+b)>>1, so output may differ by one LSB — confirm
 * that this is acceptable for the deinterlacer. */
void MergeAltivec( void *_p_dest, const void *_p_s1,
                   const void *_p_s2, size_t i_bytes )
    uint8_t *p_dest = (uint8_t *)_p_dest;
    uint8_t *p_s1 = (uint8_t *)_p_s1;
    uint8_t *p_s2 = (uint8_t *)_p_s2;
    /* Stop 15 bytes before the end so a 16-byte vector store cannot
     * overrun. */
    uint8_t *p_end = p_dest + i_bytes - 15;

    /* Use C until the first 16-bytes aligned destination pixel */
    while( (uintptr_t)p_dest & 0xF )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;

    /* NOTE(review): casting a pointer to int truncates on LP64 targets;
     * (uintptr_t) would be the correct type.  Harmless in practice since
     * only the low 4 bits are tested, but worth cleaning up. */
    if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
        /* Unaligned source */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        /* Permute masks used to realign the unaligned source streams
         * from pairs of aligned 16-byte loads. */
        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        while( p_dest < p_end )
            /* Fetch the next aligned chunk of each source and splice it
             * with the previous one into 16 contiguous bytes. */
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v = vec_perm( s1oldv, s1newv, perm1v );
            s2v = vec_perm( s2oldv, s2newv, perm2v );
            /* Rounded byte average of the two lines, stored to the
             * 16-byte aligned destination. */
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

    /* Aligned sources: straight 16-byte loads, no permute needed. */
        vector unsigned char s1v, s2v, destv;

        while( p_dest < p_end )
            s1v = vec_ld( 0, p_s1 );
            s2v = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

    /* Scalar epilogue for the trailing (up to 15) bytes. */
    while( p_dest < p_end )
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
/* ARM NEON variant: blends 64 bytes per asm iteration using vhadd.u8,
 * the truncating halving add — this matches the scalar (a+b)>>1 exactly.
 * The destination is first brought to 16-byte alignment via the generic
 * C routine, and leftovers after the vector loop are likewise handed to
 * MergeGeneric(). */
void MergeNEON (void *restrict out, const void *in1,
                const void *in2, size_t n)
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;
    /* Bytes needed to bring the destination up to 16-byte alignment.
     * NOTE(review): the declaration of outp and the post-prologue
     * adjustments of outp/n are not visible in this view — verify
     * against the upstream file. */
    size_t mis = ((uintptr_t)outp) & 15;
    /* Scalar prologue up to the first aligned destination byte. */
    MergeGeneric (outp, in1p, in2p, mis);
    /* End of the vector loop: largest multiple of 16 bytes. */
    uint8_t *end = outp + (n & ~15);
    /* Unaligned sources: plain vld1 loads, but stores can still assert
     * 128-bit alignment (:128) because the destination was aligned. */
    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        "vld1.u8 {q0-q1}, [%[in1]]!\n"
        "vld1.u8 {q2-q3}, [%[in2]]!\n"
        "vhadd.u8 q4, q0, q2\n"
        "vld1.u8 {q6-q7}, [%[in1]]!\n"
        "vhadd.u8 q5, q1, q3\n"
        "vld1.u8 {q8-q9}, [%[in2]]!\n"
        "vhadd.u8 q10, q6, q8\n"
        "vhadd.u8 q11, q7, q9\n"
        "vst1.u8 {q4-q5}, [%[out],:128]!\n"
        "vst1.u8 {q10-q11}, [%[out],:128]!\n"
        : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "memory");
    /* Aligned sources: same schedule with :128 alignment hints on the
     * loads as well, allowing faster accesses. */
        "vld1.u8 {q0-q1}, [%[in1],:128]!\n"
        "vld1.u8 {q2-q3}, [%[in2],:128]!\n"
        "vhadd.u8 q4, q0, q2\n"
        "vld1.u8 {q6-q7}, [%[in1],:128]!\n"
        "vhadd.u8 q5, q1, q3\n"
        "vld1.u8 {q8-q9}, [%[in2],:128]!\n"
        "vhadd.u8 q10, q6, q8\n"
        "vhadd.u8 q11, q7, q9\n"
        "vst1.u8 {q4-q5}, [%[out],:128]!\n"
        "vst1.u8 {q10-q11}, [%[out],:128]!\n"
        : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "memory");
    /* Scalar epilogue for the remaining bytes.  NOTE(review): n appears
     * unadjusted in this view — presumably masked (n &= 15) before this
     * call; confirm upstream. */
    MergeGeneric (outp, in1p, in2p, n);
298 /*****************************************************************************
300 *****************************************************************************/
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
    /* emms clears the x87/MMX tag word, ending an MMX sequence so that
     * later floating-point code can use the FPU again.  NOTE(review):
     * the enclosing function's header line is not visible in this view. */
    __asm__ __volatile__( "emms" :: );
#if defined(CAN_COMPILE_3DNOW)
/* End a 3DNow! sequence: femms is AMD's fast variant of emms, clearing
 * the MMX/x87 state after Merge3DNow() has run. */
void End3DNow( void )
    __asm__ __volatile__( "femms" :: );