/*
 * Copyright (C) 2013 Andrea Mazzoleni
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
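
/*
 * The full source pulls uint8_t, size_t, the __aligned() macro and the
 * CONFIG_* build switches from the library's internal headers; the two
 * standard includes below cover only what this excerpt itself needs.
 */
#include <stdint.h>
#include <stddef.h>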
#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
static const struct gfzconst16 {
	uint8_t poly[16];	/* low byte of the GF(2^8) polynomial 0x11d */
	uint8_t half[16];	/* 2^-1 in GF(2^8) */
	uint8_t low7[16];	/* mask for the low 7 bits of each byte */
} gfzconst16 __aligned(64) =
{
	{
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
	},
	{
		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e,
		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e
	},
	{
		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
	}
};
#endif
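
/*
 * The constants above drive the GF(2^8) arithmetic (polynomial 0x11d):
 * "poly" is xored in when a multiply by 2 overflows, "half" is the field
 * value of 2^-1 and is xored in when a divide by 2 drops a set bit 0,
 * and "low7" clears the bit that psrlw/vpsrlw leaks across byte lanes.
 * A scalar sketch of the two per-byte operations the SIMD code performs
 * (illustrative helpers only, not part of the library):
 */
#if 0
static uint8_t gf_mul2(uint8_t a)
{
	/* multiply by 2: shift left and reduce with the polynomial */
	return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static uint8_t gf_div2(uint8_t a)
{
	/* multiply by 2^-1: shift right and add 2^-1 back if bit 0 was set */
	return (uint8_t)((a >> 1) ^ ((a & 0x01) ? 0x8e : 0));
}
#endif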
#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/*
 * GENz (triple parity with powers of 2^-1) SSE2 implementation
 */
void raid_genz_sse2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p, *q, *r;
	int d, l;
	size_t i;

	/* the three parity buffers follow the nd data buffers in vv */
	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* xmm7 = polynomial, xmm3 = 2^-1, xmm6 = mask of the low 7 bits */
	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("movdqa %0,%%xmm6" : : "m" (gfzconst16.low7[0]));

	for (i = 0; i < size; i += 16) {
		/* start the p/q/r accumulators from the last data disk */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %xmm0,%xmm1");
		asm volatile ("movdqa %xmm0,%xmm2");
		for (d = l - 1; d >= 0; --d) {
			/* q (xmm1): multiply by 2 in GF(2^8) */
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pcmpgtb %xmm1,%xmm4");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pxor %xmm4,%xmm1");

			/* r (xmm2): multiply by 2^-1 in GF(2^8) */
			asm volatile ("movdqa %xmm2,%xmm4");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("psllw $7,%xmm4");
			asm volatile ("psrlw $1,%xmm2");
			asm volatile ("pcmpgtb %xmm4,%xmm5");
			asm volatile ("pand %xmm6,%xmm2");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pxor %xmm5,%xmm2");

			/* accumulate the data of disk d into p, q and r */
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
		}

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
	}

	/* make the non-temporal stores visible before returning */
	asm volatile ("sfence" : : : "memory");
}
#endif
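
/*
 * Scalar reference of what the loop above computes for every byte i
 * (illustrative only, not part of the library). With d_0..d_l the data
 * disks, evaluated with Horner's rule from the last disk down:
 *
 *   p[i] = d_l[i] ^ ... ^ d_1[i] ^ d_0[i]
 *   q[i] = 2^l * d_l[i] ^ ... ^ 2 * d_1[i] ^ d_0[i]
 *   r[i] = 2^-l * d_l[i] ^ ... ^ 2^-1 * d_1[i] ^ d_0[i]
 */
#if 0
static void raid_genz_ref(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	uint8_t *r = v[nd + 2];
	int l = nd - 1;
	size_t i;
	int d;

	for (i = 0; i < size; ++i) {
		uint8_t vp = v[l][i];
		uint8_t vq = vp;
		uint8_t vr = vp;

		for (d = l - 1; d >= 0; --d) {
			/* gf_mul2/gf_div2 are the sketch helpers above */
			vq = gf_mul2(vq) ^ v[d][i];
			vr = gf_div2(vr) ^ v[d][i];
			vp ^= v[d][i];
		}
		p[i] = vp;
		q[i] = vq;
		r[i] = vr;
	}
}
#endif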
#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
/*
 * GENz (triple parity with powers of 2^-1) SSE2 implementation
 *
 * Note that it uses 16 xmm registers, so x64 is required.
 */
void raid_genz_sse2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p, *q, *r;
	int d, l;
	size_t i;

	/* the three parity buffers follow the nd data buffers in vv */
	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* xmm7 = polynomial, xmm3 = 2^-1, xmm11 = mask of the low 7 bits */
	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("movdqa %0,%%xmm11" : : "m" (gfzconst16.low7[0]));

	for (i = 0; i < size; i += 32) {
		/* process two 16 byte blocks per iteration */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm8" : : "m" (v[l][i + 16]));
		asm volatile ("movdqa %xmm0,%xmm1");
		asm volatile ("movdqa %xmm8,%xmm9");
		asm volatile ("movdqa %xmm0,%xmm2");
		asm volatile ("movdqa %xmm8,%xmm10");
		for (d = l - 1; d >= 0; --d) {
			/* q (xmm1/xmm9) * 2 and r (xmm2/xmm10) * 2^-1, interleaved */
			asm volatile ("movdqa %xmm2,%xmm6");
			asm volatile ("movdqa %xmm10,%xmm14");
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pxor %xmm12,%xmm12");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pxor %xmm13,%xmm13");
			asm volatile ("psllw $7,%xmm6");
			asm volatile ("psllw $7,%xmm14");
			asm volatile ("psrlw $1,%xmm2");
			asm volatile ("psrlw $1,%xmm10");
			asm volatile ("pcmpgtb %xmm1,%xmm4");
			asm volatile ("pcmpgtb %xmm9,%xmm12");
			asm volatile ("pcmpgtb %xmm6,%xmm5");
			asm volatile ("pcmpgtb %xmm14,%xmm13");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("paddb %xmm9,%xmm9");
			asm volatile ("pand %xmm11,%xmm2");
			asm volatile ("pand %xmm11,%xmm10");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm12");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pand %xmm3,%xmm13");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm12,%xmm9");
			asm volatile ("pxor %xmm5,%xmm2");
			asm volatile ("pxor %xmm13,%xmm10");

			/* accumulate the data of disk d into p, q and r */
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
			asm volatile ("pxor %xmm12,%xmm8");
			asm volatile ("pxor %xmm12,%xmm9");
			asm volatile ("pxor %xmm12,%xmm10");
		}

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
	}

	/* make the non-temporal stores visible before returning */
	asm volatile ("sfence" : : : "memory");
}
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GENz (triple parity with powers of 2^-1) AVX2 implementation
 *
 * Note that it uses 16 ymm registers, so x64 is required.
 */
void raid_genz_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p, *q, *r;
	int d, l;
	size_t i;

	/* the three parity buffers follow the nd data buffers in vv */
	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* ymm7 = polynomial, ymm3 = 2^-1, ymm11 = mask of the low 7 bits */
	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfzconst16.low7[0]));

	/* ymm15 is kept at zero for the sign-bit compares */
	asm volatile ("vpxor %ymm15,%ymm15,%ymm15");

	for (i = 0; i < size; i += 64) {
		/* process two 32 byte blocks per iteration */
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm8" : : "m" (v[l][i + 32]));
		asm volatile ("vmovdqa %ymm0,%ymm1");
		asm volatile ("vmovdqa %ymm8,%ymm9");
		asm volatile ("vmovdqa %ymm0,%ymm2");
		asm volatile ("vmovdqa %ymm8,%ymm10");
		for (d = l - 1; d >= 0; --d) {
			/* q (ymm1/ymm9) * 2 and r (ymm2/ymm10) * 2^-1, interleaved */
			asm volatile ("vpsllw $7,%ymm2,%ymm6");
			asm volatile ("vpsllw $7,%ymm10,%ymm14");
			asm volatile ("vpsrlw $1,%ymm2,%ymm2");
			asm volatile ("vpsrlw $1,%ymm10,%ymm10");
			asm volatile ("vpcmpgtb %ymm1,%ymm15,%ymm4");
			asm volatile ("vpcmpgtb %ymm9,%ymm15,%ymm12");
			asm volatile ("vpcmpgtb %ymm6,%ymm15,%ymm5");
			asm volatile ("vpcmpgtb %ymm14,%ymm15,%ymm13");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
			asm volatile ("vpand %ymm11,%ymm2,%ymm2");
			asm volatile ("vpand %ymm11,%ymm10,%ymm10");
			asm volatile ("vpand %ymm7,%ymm4,%ymm4");
			asm volatile ("vpand %ymm7,%ymm12,%ymm12");
			asm volatile ("vpand %ymm3,%ymm5,%ymm5");
			asm volatile ("vpand %ymm3,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
			asm volatile ("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm13,%ymm10,%ymm10");

			/* accumulate the data of disk d into p, q and r */
			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
			asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
			asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
		}

		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
	}

	/* make the non-temporal stores visible and leave the AVX state clean */
	asm volatile ("sfence" : : : "memory");
	asm volatile ("vzeroupper");
}
#endif
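
/*
 * Example call (illustrative only, not part of the library): computing
 * the three parities for ND data buffers with the SSE2 kernel. It
 * assumes a CONFIG_X86/CONFIG_SSE2 build, buffers aligned to at least
 * 16 bytes (32 for the AVX2 kernel) and a size that is a multiple of
 * the kernel's block (16, 32 or 64 bytes respectively).
 */
#if 0
#include <stdlib.h>
#include <string.h>

static void example(void)
{
	enum { ND = 4, SIZE = 4096 };
	void *vv[ND + 3];
	int j;

	/* ND data buffers followed by the p, q and r parity buffers */
	for (j = 0; j < ND + 3; ++j)
		vv[j] = aligned_alloc(64, SIZE);

	for (j = 0; j < ND; ++j)
		memset(vv[j], j + 1, SIZE); /* arbitrary data */

	raid_genz_sse2(ND, SIZE, vv);

	/* vv[ND], vv[ND + 1] and vv[ND + 2] now hold p, q and r */

	for (j = 0; j < ND + 3; ++j)
		free(vv[j]);
}
#endif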