;******************************************************************************
;* x86-optimized input routines; does shuffling of packed
;* YUV formats into individual planes, and converts RGB
;* into YUV planes also.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *unused0, const uint8_t *unused1,
;                     const uint8_t *src, int w);
; and
; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0,
;                      const uint8_t *src, const uint8_t *unused1, int w);
;-----------------------------------------------------------------------------
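
; For reference, the byte layout of the packed inputs handled here:
;   YUYV: Y0 U0 Y1 V0  Y2 U1 Y3 V1 ...  (luma in even bytes)
;   UYVY: U0 Y0 V0 Y1  U1 Y2 V1 Y3 ...  (luma in odd bytes)
;   NV12: U0 V0 U1 V1 ...               (interleaved chroma plane)
;   NV21: V0 U0 V1 U1 ...               (ditto, with U/V swapped)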

; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
    mov%1          m0, [srcq+wq*2]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
    pand           m0, m2                 ; (word) { Y0, Y1, ..., Y7 }
    pand           m1, m2                 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
    psrlw          m0, 8                  ; (word) { Y0, Y1, ..., Y7 }
    psrlw          m1, 8                  ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
    packuswb       m0, m1                 ; (byte) { Y0, ..., Y15 }
    mova    [dstq+wq], m0
    add            wq, mmsize
    jl .loop_%1
%endmacro
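
; In scalar terms the loop above is just a strided copy; a C sketch of the
; same operation (illustrative only, the helper name is made up):
;
;   static void yuyv_to_y_c(uint8_t *dst, const uint8_t *src, int w)
;   {
;       int i;
;       for (i = 0; i < w; i++)
;           dst[i] = src[2 * i];       // YUYV: Y in even bytes
;   }                                  // UYVY: src[2 * i + 1] instead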

; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. YUYV+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
    movsxd         wq, wd
%endif
    add          dstq, wq
%if mmsize == 16 && %0 == 2
    test         srcq, 15
%endif
    lea          srcq, [srcq+wq*2]
%ifidn %2, yuyv
    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
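    ; (pcmpeqb of a register against itself gives all-ones; the word-wise
    ; shift then clears every high byte, yielding the 0x00ff byte-select
    ; mask without loading a constant from memory)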
%endif ; yuyv
%if mmsize == 16 && %0 == 2
    jnz .loop_u_start
    neg            wq
    LOOP_YUYV_TO_Y  a, %2
.loop_u_start:
    neg            wq
    LOOP_YUYV_TO_Y  u, %2
%else ; mmsize == 8
    neg            wq
    LOOP_YUYV_TO_Y  a, %2
%endif ; mmsize == 8/16
%endmacro
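
; All the *_FN macros here share the same dispatch idiom: the destination
; and source pointers are pre-advanced by w (times the bytes per sample),
; w is negated, and the loops index [ptr+wq] while counting wq up towards
; zero, so the jl inside each loop doubles as the termination test.  The
; "test srcq, 15" sets ZF before the flag-neutral lea/pcmpeqb/psrlw, and
; "jnz .loop_u_start" then routes unaligned sources to the movu loop.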

; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
    psrlw          m0, 8                  ; (word) { U0, V0, ..., U3, V3 }
    psrlw          m1, 8                  ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
    vpand          m0, m2, [srcq+wq*4]        ; (word) { U0, V0, ..., U3, V3 }
    vpand          m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
    pand           m0, m2                 ; (word) { U0, V0, ..., U3, V3 }
    pand           m1, m2                 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
    packuswb       m0, m1                 ; (byte) { U0, V0, ..., U7, V7 }
    pand           m1, m0, m2             ; (word) { U0, U1, ..., U7 }
    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
    packuswb       m1, m0                 ; (byte) { U0, ..., U7, V0, ..., V7 }
    movh   [dstUq+wq], m1
    movhps [dstVq+wq], m1
%else ; mmsize == 8
    packuswb       m1, m1                 ; (byte) { U0, ..., U3 }
    packuswb       m0, m0                 ; (byte) { V0, ..., V3 }
    movh   [dstUq+wq], m1
    movh   [dstVq+wq], m0
%endif ; mmsize == 8/16
    add            wq, mmsize / 2
    jl .loop_%1
%endmacro
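
; In scalar terms the loop above deinterleaves chroma; a C sketch of the
; YUYV case (illustrative only; w counts chroma samples, i.e. pixels / 2):
;
;   static void yuyv_to_uv_c(uint8_t *dstU, uint8_t *dstV,
;                            const uint8_t *src, int w)
;   {
;       int i;
;       for (i = 0; i < w; i++) {
;           dstU[i] = src[4 * i + 1]; // U in bytes 1, 5, 9, ...
;           dstV[i] = src[4 * i + 3]; // V in bytes 3, 7, 11, ...
;       }                             // UYVY: bytes 0 and 2 instead
;   }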

; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. UYVY+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
    movsxd         wq, dword r5m
%else ; x86-32
    mov            wq, r5m
%endif
    add         dstUq, wq
    add         dstVq, wq
%if mmsize == 16 && %0 == 2
    test         srcq, 15
%endif
    lea          srcq, [srcq+wq*4]
    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
    ; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
    jnz .loop_u_start
    neg            wq
    LOOP_YUYV_TO_UV a, %2
.loop_u_start:
    neg            wq
    LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
    neg            wq
    LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro

; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
    mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
    pand           m2, m0, m5             ; (word) { U0, U1, ..., U7 }
    pand           m3, m1, m5             ; (word) { U8, U9, ..., U15 }
    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
    psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
    packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
    packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
    mova   [dstUq+wq], m2
    mova   [dstVq+wq], m0
%else ; nv21
    mova   [dstVq+wq], m2
    mova   [dstUq+wq], m0
%endif ; nv12/nv21
    add            wq, mmsize
    jl .loop_%1
%endmacro
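
; Scalar C sketch of the NV12 case above (illustrative only; src points at
; the interleaved chroma plane, w counts chroma samples per row):
;
;   static void nv12_to_uv_c(uint8_t *dstU, uint8_t *dstV,
;                            const uint8_t *src, int w)
;   {
;       int i;
;       for (i = 0; i < w; i++) {
;           dstU[i] = src[2 * i];
;           dstV[i] = src[2 * i + 1];
;       }                              // NV21: swap the two stores
;   }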

; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
    movsxd         wq, dword r5m
%else ; x86-32
    mov            wq, r5m
%endif
    add         dstUq, wq
    add         dstVq, wq
%if mmsize == 16
    test         srcq, 15
%endif
    lea          srcq, [srcq+wq*2]
    pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
    psrlw          m5, 8                  ; (word) { 0x00ff } x 8
%if mmsize == 16
    jnz .loop_u_start
    neg            wq
    LOOP_NVXX_TO_UV a, %2
.loop_u_start:
    neg            wq
    LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
    neg            wq
    LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN  0, yuyv
YUYV_TO_Y_FN  0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif

INIT_XMM sse2
YUYV_TO_Y_FN  3, yuyv
YUYV_TO_Y_FN  2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21

INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21