1 /*****************************************************************************
2 * merge.c : Merge (line blending) routines for the VLC deinterlacer
3 *****************************************************************************
4 * Copyright (C) 2011 the VideoLAN team
7 * Author: Sam Hocevar <sam@zoy.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
/* Include merge.h first: it handles including config.h, because the
   config is already needed in the header. */
#include "merge.h"

#include <stddef.h>
#include <stdint.h>
#ifdef CAN_COMPILE_MMXEXT
#   include "mmx.h"
#endif

#ifdef HAVE_ALTIVEC_H
#   include <altivec.h>
#endif
38 /*****************************************************************************
39 * Merge (line blending) routines
40 *****************************************************************************/
void MergeGeneric( void *_p_dest, const void *_p_s1,
                   const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t *p_end = p_dest + i_bytes - 8;

    /* Unrolled by 8: average eight byte pairs per iteration. */
    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    /* Handle the remaining (i_bytes % 8) bytes one at a time. */
    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
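/* Note on rounding: the C path truncates, (a + b) >> 1, and so does NEON's
 * vhadd.u8, while pavgb, pavgusb and vec_avg round up, (a + b + 1) >> 1.
 * The SIMD variants may thus differ from the C path by one LSB per byte,
 * which is harmless for line blending. */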
#if defined(CAN_COMPILE_MMXEXT)
void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                  size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t *p_end = p_dest + i_bytes - 8;

    /* Average 8 bytes per iteration with the MMXEXT pavgb instruction. */
    while( p_dest < p_end )
    {
        __asm__ __volatile__( "movq %2,%%mm1;"
                              "pavgb %1, %%mm1;"
                              "movq %%mm1, %0"
                              : "=m" (*p_dest)
                              : "m" (*p_s1), "m" (*p_s2)
                              : "mm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
#if defined(CAN_COMPILE_3DNOW)
void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                 size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t *p_end = p_dest + i_bytes - 8;

    /* Same layout as the MMXEXT version, using 3DNow!'s byte average
     * instruction pavgusb. */
    while( p_dest < p_end )
    {
        __asm__ __volatile__( "movq %2,%%mm1;"
                              "pavgusb %1, %%mm1;"
                              "movq %%mm1, %0"
                              : "=m" (*p_dest)
                              : "m" (*p_s1), "m" (*p_s2)
                              : "mm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    p_end += 8;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
#if defined(CAN_COMPILE_SSE)
void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1 = (const uint8_t *)_p_s1;
    const uint8_t *p_s2 = (const uint8_t *)_p_s2;
    uint8_t *p_end;

    /* Use C until p_s1 is 16-byte aligned: pavgb takes *p_s1 as a direct
     * memory operand, which must be aligned; p_s2 and p_dest go through
     * movdqu and may remain unaligned. */
    while( (uintptr_t)p_s1 % 16 )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        i_bytes--; /* keep the total byte count consistent */
    }

    p_end = p_dest + i_bytes - 16;

    while( p_dest < p_end )
    {
        __asm__ __volatile__( "movdqu %2,%%xmm1;"
                              "pavgb %1, %%xmm1;"
                              "movdqu %%xmm1, %0"
                              : "=m" (*p_dest)
                              : "m" (*p_s1), "m" (*p_s2)
                              : "xmm1" );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    p_end += 16;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
#ifdef CAN_COMPILE_C_ALTIVEC
void MergeAltivec( void *_p_dest, const void *_p_s1,
                   const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    uint8_t *p_s1 = (uint8_t *)_p_s1;
    uint8_t *p_s2 = (uint8_t *)_p_s2;
    uint8_t *p_end = p_dest + i_bytes - 15;

    /* Use C until the first 16-bytes aligned destination pixel. */
    while( (uintptr_t)p_dest & 0xF )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    if( ( (uintptr_t)p_s1 & 0xF ) | ( (uintptr_t)p_s2 & 0xF ) )
    {
        /* Unaligned source: vec_ld() ignores the low address bits, so load
         * the two 16-byte blocks around each position and realign with
         * vec_perm(), using the shift pattern from vec_lvsl(). */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        while( p_dest < p_end )
        {
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v = vec_perm( s1oldv, s1newv, perm1v );
            s2v = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1 += 16;
            p_s2 += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source */
        vector unsigned char s1v, s2v, destv;

        while( p_dest < p_end )
        {
            s1v = vec_ld( 0, p_s1 );
            s2v = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1 += 16;
            p_s2 += 16;
            p_dest += 16;
        }
    }

    /* Handle the tail in C. */
    p_end += 15;

    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
#endif
#ifdef CAN_COMPILE_ARM
void MergeNEON (void *restrict out, const void *in1,
                const void *in2, size_t n)
{
    uint8_t *outp = out;
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;
    size_t mis = ((uintptr_t)outp) & 15;

    /* Align the destination with the C routine first. */
    if (mis)
    {
        MergeGeneric (outp, in1p, in2p, mis);
        outp += mis;
        in1p += mis;
        in2p += mis;
        n -= mis;
    }

    uint8_t *end = outp + (n & ~15);

    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        /* Unaligned sources: 64 bytes per iteration, plain loads,
         * 128-bit aligned stores. */
        while (outp < end)
            asm volatile (
                "vld1.u8  {q0-q1}, [%[in1]]!\n"
                "vld1.u8  {q2-q3}, [%[in2]]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1]]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2]]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");
    else
        /* Aligned sources: same kernel with :128 alignment hints on the
         * loads as well. */
        while (outp < end)
            asm volatile (
                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");

    /* Copy the remaining (n % 16) bytes with the C routine. */
    n &= 15;
    if (n)
        MergeGeneric (outp, in1p, in2p, n);
}
#endif
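/* A minimal sketch of how a caller might select a variant at run time.
 * The vlc_CPU()/CPU_CAPABILITY_* names are assumptions here; the actual
 * selection lives in the deinterlacer's setup code, not in this file.
 *
 *     pf_merge = MergeGeneric;
 * #ifdef CAN_COMPILE_MMXEXT
 *     if( vlc_CPU() & CPU_CAPABILITY_MMXEXT )
 *         pf_merge = MergeMMXEXT;
 * #endif
 * #ifdef CAN_COMPILE_SSE
 *     if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
 *         pf_merge = MergeSSE2;
 * #endif
 */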
292 /*****************************************************************************
294 *****************************************************************************/
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
void EndMMX( void )
{
    /* Leave MMX state: reset the x87 tag word before any FPU code runs. */
    __asm__ __volatile__( "emms" :: );
}
#endif
#if defined(CAN_COMPILE_3DNOW)
void End3DNow( void )
{
    /* femms: the faster 3DNow! variant of emms. */
    __asm__ __volatile__( "femms" :: );
}
#endif