/*
 * Copyright (C) 2013 Andrea Mazzoleni
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include "internal.h"
#include "gf.h"

/*
 * For x86 optimizations see:
 *
 * Software optimization resources
 * http://www.agner.org/optimize/
 *
 * x86, x64 Instruction Latency, Memory Latency and CPUID dumps
 * http://users.atw.hu/instlatx64/
 */

#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/*
 * GEN1 (RAID5 with xor) SSE2 implementation
 *
 * Intentionally processes no more than 64 bytes per iteration, because 64
 * is the typical cache block size; processing 128 bytes doesn't increase
 * performance, and in some cases it even decreases it.
 */
void raid_gen1_sse2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];

	raid_sse_begin();

	for (i = 0; i < size; i += 64) {
		/* start from the last data disk */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm1" : : "m" (v[l][i + 16]));
		asm volatile ("movdqa %0,%%xmm2" : : "m" (v[l][i + 32]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (v[l][i + 48]));
		/* xor in all the other data disks */
		for (d = l - 1; d >= 0; --d) {
			asm volatile ("pxor %0,%%xmm0" : : "m" (v[d][i]));
			asm volatile ("pxor %0,%%xmm1" : : "m" (v[d][i + 16]));
			asm volatile ("pxor %0,%%xmm2" : : "m" (v[d][i + 32]));
			asm volatile ("pxor %0,%%xmm3" : : "m" (v[d][i + 48]));
		}
		/* store the parity with non-temporal writes */
		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (p[i + 32]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (p[i + 48]));
	}

	raid_sse_end();
}
#endif
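
/*
 * Scalar reference for GEN1, a documentation-only sketch (ours, not part
 * of the original file): it computes the same parity as the vectorized
 * code above, one byte at a time, walking the disks in the same order.
 */
#if 0 /* documentation only */
static void raid_gen1_ref(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd]; /* the parity buffer follows the nd data buffers */
	size_t i;
	int d;

	for (i = 0; i < size; ++i) {
		uint8_t x = v[nd - 1][i];

		for (d = nd - 2; d >= 0; --d)
			x ^= v[d][i];
		p[i] = x;
	}
}
#endif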

#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
/*
 * GEN1 (RAID5 with xor) AVX2 implementation
 *
 * Intentionally processes no more than 64 bytes per iteration, because 64
 * is the typical cache block size; processing 128 bytes doesn't increase
 * performance, and in some cases it even decreases it.
 */
void raid_gen1_avx2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];

	raid_avx_begin();

	for (i = 0; i < size; i += 64) {
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm1" : : "m" (v[l][i + 32]));
		for (d = l - 1; d >= 0; --d) {
			asm volatile ("vpxor %0,%%ymm0,%%ymm0" : : "m" (v[d][i]));
			asm volatile ("vpxor %0,%%ymm1,%%ymm1" : : "m" (v[d][i + 32]));
		}
		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (p[i + 32]));
	}

	raid_avx_end();
}
#endif

#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/*
 * GF(2^8) constants used by the parity code: "poly" is the low byte 0x1d
 * of the generator polynomial 0x11d, and "low4" is the mask used to split
 * each byte into nibbles for the pshufb table lookups.
 */
static const struct gfconst16 {
	uint8_t poly[16];
	uint8_t low4[16];
} gfconst16 __aligned(32) = {
	{
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
	},
	{
		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	}
};
#endif
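
/*
 * How these constants are used, as a scalar sketch (ours, documentation
 * only): multiplying by 2 in GF(2^8) is a left shift, plus a conditional
 * xor with the low byte 0x1d of the generator polynomial 0x11d when the
 * shifted-out bit was set. The pcmpgtb/paddb/pand/pxor sequences in the
 * GEN2+ functions below compute exactly this, 16 or 32 bytes at a time.
 */
#if 0 /* documentation only */
static inline uint8_t gf_mul2(uint8_t x)
{
	return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1d : 0));
}
#endif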

#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/*
 * GEN2 (RAID6 with powers of 2) SSE2 implementation
 */
void raid_gen2_sse2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];

	raid_sse_begin();

	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));

	for (i = 0; i < size; i += 32) {
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm1" : : "m" (v[l][i + 16]));
		asm volatile ("movdqa %xmm0,%xmm2");
		asm volatile ("movdqa %xmm1,%xmm3");
		for (d = l - 1; d >= 0; --d) {
			/* multiply q (xmm2/xmm3) by 2 in GF(2^8): mask the bytes
			 * with the top bit set, double every byte, and xor the
			 * polynomial into the bytes that overflowed */
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pcmpgtb %xmm2,%xmm4");
			asm volatile ("pcmpgtb %xmm3,%xmm5");
			asm volatile ("paddb %xmm2,%xmm2");
			asm volatile ("paddb %xmm3,%xmm3");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm5");
			asm volatile ("pxor %xmm4,%xmm2");
			asm volatile ("pxor %xmm5,%xmm3");

			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm5" : : "m" (v[d][i + 16]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm5,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
			asm volatile ("pxor %xmm5,%xmm3");
		}
		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (q[i + 16]));
	}

	raid_sse_end();
}
#endif
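
/*
 * Scalar reference for GEN2 (ours, documentation only): the loop above is
 * a Horner evaluation of Q, doubling the running value before each disk,
 * so that Q = sum over d of 2^d * D[d] in GF(2^8), with P the plain xor.
 * It uses the hypothetical gf_mul2() helper sketched earlier.
 */
#if 0 /* documentation only */
static void raid_gen2_ref(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	size_t i;
	int d;

	for (i = 0; i < size; ++i) {
		uint8_t pv = v[nd - 1][i];
		uint8_t qv = pv;

		for (d = nd - 2; d >= 0; --d) {
			qv = gf_mul2(qv);
			pv ^= v[d][i];
			qv ^= v[d][i];
		}
		p[i] = pv;
		q[i] = qv;
	}
}
#endif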

#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
/*
 * GEN2 (RAID6 with powers of 2) AVX2 implementation
 */
void raid_gen2_avx2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];

	raid_avx_begin();

	asm volatile ("vbroadcasti128 %0, %%ymm7" : : "m" (gfconst16.poly[0]));
	asm volatile ("vpxor %ymm6,%ymm6,%ymm6");

	for (i = 0; i < size; i += 64) {
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm1" : : "m" (v[l][i + 32]));
		asm volatile ("vmovdqa %ymm0,%ymm2");
		asm volatile ("vmovdqa %ymm1,%ymm3");
		for (d = l - 1; d >= 0; --d) {
			/* multiply q (ymm2/ymm3) by 2 in GF(2^8) */
			asm volatile ("vpcmpgtb %ymm2,%ymm6,%ymm4");
			asm volatile ("vpcmpgtb %ymm3,%ymm6,%ymm5");
			asm volatile ("vpaddb %ymm2,%ymm2,%ymm2");
			asm volatile ("vpaddb %ymm3,%ymm3,%ymm3");
			asm volatile ("vpand %ymm7,%ymm4,%ymm4");
			asm volatile ("vpand %ymm7,%ymm5,%ymm5");
			asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm5,%ymm3,%ymm3");

			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
			asm volatile ("vmovdqa %0,%%ymm5" : : "m" (v[d][i + 32]));
			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm5,%ymm3,%ymm3");
		}
		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (p[i + 32]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm3,%0" : "=m" (q[i + 32]));
	}

	raid_avx_end();
}
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
/*
 * GEN2 (RAID6 with powers of 2) SSE2 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen2_sse2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];

	raid_sse_begin();

	asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.poly[0]));

	for (i = 0; i < size; i += 64) {
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm1" : : "m" (v[l][i + 16]));
		asm volatile ("movdqa %0,%%xmm2" : : "m" (v[l][i + 32]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (v[l][i + 48]));
		asm volatile ("movdqa %xmm0,%xmm4");
		asm volatile ("movdqa %xmm1,%xmm5");
		asm volatile ("movdqa %xmm2,%xmm6");
		asm volatile ("movdqa %xmm3,%xmm7");
		for (d = l - 1; d >= 0; --d) {
			/* multiply the four q strips by 2 in GF(2^8) */
			asm volatile ("pxor %xmm8,%xmm8");
			asm volatile ("pxor %xmm9,%xmm9");
			asm volatile ("pxor %xmm10,%xmm10");
			asm volatile ("pxor %xmm11,%xmm11");
			asm volatile ("pcmpgtb %xmm4,%xmm8");
			asm volatile ("pcmpgtb %xmm5,%xmm9");
			asm volatile ("pcmpgtb %xmm6,%xmm10");
			asm volatile ("pcmpgtb %xmm7,%xmm11");
			asm volatile ("paddb %xmm4,%xmm4");
			asm volatile ("paddb %xmm5,%xmm5");
			asm volatile ("paddb %xmm6,%xmm6");
			asm volatile ("paddb %xmm7,%xmm7");
			asm volatile ("pand %xmm15,%xmm8");
			asm volatile ("pand %xmm15,%xmm9");
			asm volatile ("pand %xmm15,%xmm10");
			asm volatile ("pand %xmm15,%xmm11");
			asm volatile ("pxor %xmm8,%xmm4");
			asm volatile ("pxor %xmm9,%xmm5");
			asm volatile ("pxor %xmm10,%xmm6");
			asm volatile ("pxor %xmm11,%xmm7");

			asm volatile ("movdqa %0,%%xmm8" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm9" : : "m" (v[d][i + 16]));
			asm volatile ("movdqa %0,%%xmm10" : : "m" (v[d][i + 32]));
			asm volatile ("movdqa %0,%%xmm11" : : "m" (v[d][i + 48]));
			asm volatile ("pxor %xmm8,%xmm0");
			asm volatile ("pxor %xmm9,%xmm1");
			asm volatile ("pxor %xmm10,%xmm2");
			asm volatile ("pxor %xmm11,%xmm3");
			asm volatile ("pxor %xmm8,%xmm4");
			asm volatile ("pxor %xmm9,%xmm5");
			asm volatile ("pxor %xmm10,%xmm6");
			asm volatile ("pxor %xmm11,%xmm7");
		}
		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (p[i + 32]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (p[i + 48]));
		asm volatile ("movntdq %%xmm4,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm5,%0" : "=m" (q[i + 16]));
		asm volatile ("movntdq %%xmm6,%0" : "=m" (q[i + 32]));
		asm volatile ("movntdq %%xmm7,%0" : "=m" (q[i + 48]));
	}

	raid_sse_end();
}
#endif

#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * GEN3 (triple parity with Cauchy matrix) SSSE3 implementation
 */
void raid_gen3_ssse3(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 3; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 16) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));

		asm volatile ("movdqa %xmm4,%xmm0");
		asm volatile ("movdqa %xmm4,%xmm1");

		/* split the data in low and high nibbles */
		asm volatile ("movdqa %xmm4,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");

		/* lookup the multiplication tables for the disk coefficient */
		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm6");
		asm volatile ("pxor %xmm6,%xmm2");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));

			/* multiply q by 2 in GF(2^8) */
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pcmpgtb %xmm1,%xmm5");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pxor %xmm5,%xmm1");

			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");

			asm volatile ("movdqa %xmm4,%xmm5");
			asm volatile ("psrlw $4,%xmm5");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm5");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pxor %xmm6,%xmm2");
			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("pshufb %xmm5,%xmm6");
			asm volatile ("pxor %xmm6,%xmm2");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));

		asm volatile ("pxor %xmm5,%xmm5");
		asm volatile ("pcmpgtb %xmm1,%xmm5");
		asm volatile ("paddb %xmm1,%xmm1");
		asm volatile ("pand %xmm3,%xmm5");
		asm volatile ("pxor %xmm5,%xmm1");

		asm volatile ("pxor %xmm4,%xmm0");
		asm volatile ("pxor %xmm4,%xmm1");
		asm volatile ("pxor %xmm4,%xmm2");

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
	}

	raid_sse_end();
}
#endif
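
/*
 * What the pshufb pairs above compute, as a scalar sketch (ours,
 * documentation only): each gfgenpshufb[d][k] entry is assumed to hold
 * two 16-byte tables, the products of the disk coefficient with every
 * low nibble and with every high nibble, so a full GF(2^8) multiply is
 * two table lookups and a xor.
 */
#if 0 /* documentation only */
static inline uint8_t gf_mul_nibble(const uint8_t lo[16],
	const uint8_t hi[16], uint8_t x)
{
	return lo[x & 0x0f] ^ hi[x >> 4];
}
#endif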

#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
/*
 * GEN3 (triple parity with Cauchy matrix) SSSE3 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen3_ssse3ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 3; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm11" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 32) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm12" : : "m" (v[l][i + 16]));

		asm volatile ("movdqa %xmm4,%xmm0");
		asm volatile ("movdqa %xmm4,%xmm1");
		asm volatile ("movdqa %xmm12,%xmm8");
		asm volatile ("movdqa %xmm12,%xmm9");

		asm volatile ("movdqa %xmm4,%xmm5");
		asm volatile ("movdqa %xmm12,%xmm13");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("psrlw $4,%xmm13");
		asm volatile ("pand %xmm11,%xmm4");
		asm volatile ("pand %xmm11,%xmm12");
		asm volatile ("pand %xmm11,%xmm5");
		asm volatile ("pand %xmm11,%xmm13");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("movdqa %xmm2,%xmm10");
		asm volatile ("movdqa %xmm7,%xmm15");
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm12,%xmm10");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pshufb %xmm13,%xmm15");
		asm volatile ("pxor %xmm7,%xmm2");
		asm volatile ("pxor %xmm15,%xmm10");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));

			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pxor %xmm13,%xmm13");
			asm volatile ("pcmpgtb %xmm1,%xmm5");
			asm volatile ("pcmpgtb %xmm9,%xmm13");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("paddb %xmm9,%xmm9");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pand %xmm3,%xmm13");
			asm volatile ("pxor %xmm5,%xmm1");
			asm volatile ("pxor %xmm13,%xmm9");

			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm12,%xmm8");
			asm volatile ("pxor %xmm12,%xmm9");

			asm volatile ("movdqa %xmm4,%xmm5");
			asm volatile ("movdqa %xmm12,%xmm13");
			asm volatile ("psrlw $4,%xmm5");
			asm volatile ("psrlw $4,%xmm13");
			asm volatile ("pand %xmm11,%xmm4");
			asm volatile ("pand %xmm11,%xmm12");
			asm volatile ("pand %xmm11,%xmm5");
			asm volatile ("pand %xmm11,%xmm13");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("movdqa %xmm6,%xmm14");
			asm volatile ("movdqa %xmm7,%xmm15");
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm12,%xmm14");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pshufb %xmm13,%xmm15");
			asm volatile ("pxor %xmm6,%xmm2");
			asm volatile ("pxor %xmm14,%xmm10");
			asm volatile ("pxor %xmm7,%xmm2");
			asm volatile ("pxor %xmm15,%xmm10");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
		asm volatile ("movdqa %0,%%xmm12" : : "m" (v[0][i + 16]));

		asm volatile ("pxor %xmm5,%xmm5");
		asm volatile ("pxor %xmm13,%xmm13");
		asm volatile ("pcmpgtb %xmm1,%xmm5");
		asm volatile ("pcmpgtb %xmm9,%xmm13");
		asm volatile ("paddb %xmm1,%xmm1");
		asm volatile ("paddb %xmm9,%xmm9");
		asm volatile ("pand %xmm3,%xmm5");
		asm volatile ("pand %xmm3,%xmm13");
		asm volatile ("pxor %xmm5,%xmm1");
		asm volatile ("pxor %xmm13,%xmm9");

		asm volatile ("pxor %xmm4,%xmm0");
		asm volatile ("pxor %xmm4,%xmm1");
		asm volatile ("pxor %xmm4,%xmm2");
		asm volatile ("pxor %xmm12,%xmm8");
		asm volatile ("pxor %xmm12,%xmm9");
		asm volatile ("pxor %xmm12,%xmm10");

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
	}

	raid_sse_end();
}
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GEN3 (triple parity with Cauchy matrix) AVX2 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen3_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 3; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_avx_begin();

	/* generic case with at least two data disks */
	asm volatile ("vbroadcasti128 %0, %%ymm3" : : "m" (gfconst16.poly[0]));
	asm volatile ("vbroadcasti128 %0, %%ymm11" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 64) {
		/* last disk without the multiplication by two */
		asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[l][i + 32]));

		asm volatile ("vmovdqa %ymm4,%ymm0");
		asm volatile ("vmovdqa %ymm4,%ymm1");
		asm volatile ("vmovdqa %ymm12,%ymm8");
		asm volatile ("vmovdqa %ymm12,%ymm9");

		asm volatile ("vpsrlw $4,%ymm4,%ymm5");
		asm volatile ("vpsrlw $4,%ymm12,%ymm13");
		asm volatile ("vpand %ymm11,%ymm4,%ymm4");
		asm volatile ("vpand %ymm11,%ymm12,%ymm12");
		asm volatile ("vpand %ymm11,%ymm5,%ymm5");
		asm volatile ("vpand %ymm11,%ymm13,%ymm13");

		asm volatile ("vbroadcasti128 %0,%%ymm10" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("vpshufb %ymm4,%ymm10,%ymm2");
		asm volatile ("vpshufb %ymm12,%ymm10,%ymm10");
		asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
		asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
		asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
		asm volatile ("vpxor %ymm15,%ymm10,%ymm10");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
			asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));

			asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
			asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
			asm volatile ("vpand %ymm3,%ymm5,%ymm5");
			asm volatile ("vpand %ymm3,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm13,%ymm9,%ymm9");

			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");

			asm volatile ("vpsrlw $4,%ymm4,%ymm5");
			asm volatile ("vpsrlw $4,%ymm12,%ymm13");
			asm volatile ("vpand %ymm11,%ymm4,%ymm4");
			asm volatile ("vpand %ymm11,%ymm12,%ymm12");
			asm volatile ("vpand %ymm11,%ymm5,%ymm5");
			asm volatile ("vpand %ymm11,%ymm13,%ymm13");

			asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("vpshufb %ymm4,%ymm14,%ymm6");
			asm volatile ("vpshufb %ymm12,%ymm14,%ymm14");
			asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
			asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
			asm volatile ("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm14,%ymm10,%ymm10");
			asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm15,%ymm10,%ymm10");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[0][i]));
		asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[0][i + 32]));

		asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
		asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
		asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
		asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
		asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
		asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
		asm volatile ("vpand %ymm3,%ymm5,%ymm5");
		asm volatile ("vpand %ymm3,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
		asm volatile ("vpxor %ymm13,%ymm9,%ymm9");

		asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
		asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
		asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
		asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
		asm volatile ("vpxor %ymm12,%ymm10,%ymm10");

		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
	}

	raid_avx_end();
}
#endif

#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * GEN4 (quad parity with Cauchy matrix) SSSE3 implementation
 */
void raid_gen4_ssse3(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 4; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	for (i = 0; i < size; i += 16) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));

		asm volatile ("movdqa %xmm4,%xmm0");
		asm volatile ("movdqa %xmm4,%xmm1");

		asm volatile ("movdqa %xmm4,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm2");

		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("pshufb %xmm4,%xmm3");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm3");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));

			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pcmpgtb %xmm1,%xmm5");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm7,%xmm5");
			asm volatile ("pxor %xmm5,%xmm1");

			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));

			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");

			asm volatile ("movdqa %xmm4,%xmm5");
			asm volatile ("psrlw $4,%xmm5");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm5");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm2");
			asm volatile ("pxor %xmm7,%xmm2");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm3");
			asm volatile ("pxor %xmm7,%xmm3");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));

		asm volatile ("pxor %xmm5,%xmm5");
		asm volatile ("pcmpgtb %xmm1,%xmm5");
		asm volatile ("paddb %xmm1,%xmm1");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pxor %xmm5,%xmm1");

		asm volatile ("pxor %xmm4,%xmm0");
		asm volatile ("pxor %xmm4,%xmm1");
		asm volatile ("pxor %xmm4,%xmm2");
		asm volatile ("pxor %xmm4,%xmm3");

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
	}

	raid_sse_end();
}
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
/*
 * GEN4 (quad parity with Cauchy matrix) SSSE3 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen4_ssse3ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 4; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	for (i = 0; i < size; i += 32) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm12" : : "m" (v[l][i + 16]));

		asm volatile ("movdqa %xmm4,%xmm0");
		asm volatile ("movdqa %xmm4,%xmm1");
		asm volatile ("movdqa %xmm12,%xmm8");
		asm volatile ("movdqa %xmm12,%xmm9");

		asm volatile ("movdqa %xmm4,%xmm5");
		asm volatile ("movdqa %xmm12,%xmm13");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("psrlw $4,%xmm13");
		asm volatile ("pand %xmm15,%xmm4");
		asm volatile ("pand %xmm15,%xmm12");
		asm volatile ("pand %xmm15,%xmm5");
		asm volatile ("pand %xmm15,%xmm13");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("movdqa %xmm2,%xmm10");
		asm volatile ("movdqa %xmm7,%xmm15");
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm12,%xmm10");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pshufb %xmm13,%xmm15");
		asm volatile ("pxor %xmm7,%xmm2");
		asm volatile ("pxor %xmm15,%xmm10");

		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("movdqa %xmm3,%xmm11");
		asm volatile ("movdqa %xmm7,%xmm15");
		asm volatile ("pshufb %xmm4,%xmm3");
		asm volatile ("pshufb %xmm12,%xmm11");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pshufb %xmm13,%xmm15");
		asm volatile ("pxor %xmm7,%xmm3");
		asm volatile ("pxor %xmm15,%xmm11");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
			asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));

			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pxor %xmm13,%xmm13");
			asm volatile ("pcmpgtb %xmm1,%xmm5");
			asm volatile ("pcmpgtb %xmm9,%xmm13");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("paddb %xmm9,%xmm9");
			asm volatile ("pand %xmm7,%xmm5");
			asm volatile ("pand %xmm7,%xmm13");
			asm volatile ("pxor %xmm5,%xmm1");
			asm volatile ("pxor %xmm13,%xmm9");

			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm12,%xmm8");
			asm volatile ("pxor %xmm12,%xmm9");

			asm volatile ("movdqa %xmm4,%xmm5");
			asm volatile ("movdqa %xmm12,%xmm13");
			asm volatile ("psrlw $4,%xmm5");
			asm volatile ("psrlw $4,%xmm13");
			asm volatile ("pand %xmm15,%xmm4");
			asm volatile ("pand %xmm15,%xmm12");
			asm volatile ("pand %xmm15,%xmm5");
			asm volatile ("pand %xmm15,%xmm13");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("movdqa %xmm6,%xmm14");
			asm volatile ("movdqa %xmm7,%xmm15");
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm12,%xmm14");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pshufb %xmm13,%xmm15");
			asm volatile ("pxor %xmm6,%xmm2");
			asm volatile ("pxor %xmm14,%xmm10");
			asm volatile ("pxor %xmm7,%xmm2");
			asm volatile ("pxor %xmm15,%xmm10");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("movdqa %xmm6,%xmm14");
			asm volatile ("movdqa %xmm7,%xmm15");
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm12,%xmm14");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pshufb %xmm13,%xmm15");
			asm volatile ("pxor %xmm6,%xmm3");
			asm volatile ("pxor %xmm14,%xmm11");
			asm volatile ("pxor %xmm7,%xmm3");
			asm volatile ("pxor %xmm15,%xmm11");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
		asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
		asm volatile ("movdqa %0,%%xmm12" : : "m" (v[0][i + 16]));

		asm volatile ("pxor %xmm5,%xmm5");
		asm volatile ("pxor %xmm13,%xmm13");
		asm volatile ("pcmpgtb %xmm1,%xmm5");
		asm volatile ("pcmpgtb %xmm9,%xmm13");
		asm volatile ("paddb %xmm1,%xmm1");
		asm volatile ("paddb %xmm9,%xmm9");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pand %xmm7,%xmm13");
		asm volatile ("pxor %xmm5,%xmm1");
		asm volatile ("pxor %xmm13,%xmm9");

		asm volatile ("pxor %xmm4,%xmm0");
		asm volatile ("pxor %xmm4,%xmm1");
		asm volatile ("pxor %xmm4,%xmm2");
		asm volatile ("pxor %xmm4,%xmm3");
		asm volatile ("pxor %xmm12,%xmm8");
		asm volatile ("pxor %xmm12,%xmm9");
		asm volatile ("pxor %xmm12,%xmm10");
		asm volatile ("pxor %xmm12,%xmm11");

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
		asm volatile ("movntdq %%xmm11,%0" : "=m" (s[i + 16]));
	}

	raid_sse_end();
}
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GEN4 (quad parity with Cauchy matrix) AVX2 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen4_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 4; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_avx_begin();

	/* generic case with at least two data disks */
	for (i = 0; i < size; i += 64) {
		/* last disk without the multiplication by two */
		asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
		asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[l][i + 32]));

		asm volatile ("vmovdqa %ymm4,%ymm0");
		asm volatile ("vmovdqa %ymm4,%ymm1");
		asm volatile ("vmovdqa %ymm12,%ymm8");
		asm volatile ("vmovdqa %ymm12,%ymm9");

		asm volatile ("vpsrlw $4,%ymm4,%ymm5");
		asm volatile ("vpsrlw $4,%ymm12,%ymm13");
		asm volatile ("vpand %ymm15,%ymm4,%ymm4");
		asm volatile ("vpand %ymm15,%ymm12,%ymm12");
		asm volatile ("vpand %ymm15,%ymm5,%ymm5");
		asm volatile ("vpand %ymm15,%ymm13,%ymm13");

		asm volatile ("vbroadcasti128 %0,%%ymm10" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("vpshufb %ymm4,%ymm10,%ymm2");
		asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
		asm volatile ("vpshufb %ymm12,%ymm10,%ymm10");
		asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
		asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
		asm volatile ("vpxor %ymm15,%ymm10,%ymm10");

		asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("vpshufb %ymm4,%ymm11,%ymm3");
		asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
		asm volatile ("vpshufb %ymm12,%ymm11,%ymm11");
		asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
		asm volatile ("vpxor %ymm7,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm15,%ymm11,%ymm11");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.poly[0]));
			asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
			asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));

			asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
			asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
			asm volatile ("vpand %ymm7,%ymm5,%ymm5");
			asm volatile ("vpand %ymm7,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm13,%ymm9,%ymm9");

			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");

			asm volatile ("vpsrlw $4,%ymm4,%ymm5");
			asm volatile ("vpsrlw $4,%ymm12,%ymm13");
			asm volatile ("vpand %ymm15,%ymm4,%ymm4");
			asm volatile ("vpand %ymm15,%ymm12,%ymm12");
			asm volatile ("vpand %ymm15,%ymm5,%ymm5");
			asm volatile ("vpand %ymm15,%ymm13,%ymm13");

			asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("vpshufb %ymm4,%ymm14,%ymm6");
			asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
			asm volatile ("vpshufb %ymm12,%ymm14,%ymm14");
			asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
			asm volatile ("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm14,%ymm10,%ymm10");
			asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm15,%ymm10,%ymm10");

			asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("vpshufb %ymm4,%ymm14,%ymm6");
			asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
			asm volatile ("vpshufb %ymm12,%ymm14,%ymm14");
			asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
			asm volatile ("vpxor %ymm6,%ymm3,%ymm3");
			asm volatile ("vpxor %ymm14,%ymm11,%ymm11");
			asm volatile ("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile ("vpxor %ymm15,%ymm11,%ymm11");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.poly[0]));
		asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
		asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[0][i]));
		asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[0][i + 32]));

		asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
		asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
		asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
		asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
		asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
		asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
		asm volatile ("vpand %ymm7,%ymm5,%ymm5");
		asm volatile ("vpand %ymm7,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
		asm volatile ("vpxor %ymm13,%ymm9,%ymm9");

		asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
		asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
		asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile ("vpxor %ymm4,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
		asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
		asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile ("vpxor %ymm12,%ymm11,%ymm11");

		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
		asm volatile ("vmovntdq %%ymm3,%0" : "=m" (s[i]));
		asm volatile ("vmovntdq %%ymm11,%0" : "=m" (s[i + 32]));
	}

	raid_avx_end();
}
#endif

#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * GEN5 (penta parity with Cauchy matrix) SSSE3 implementation
 */
void raid_gen5_ssse3(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	uint8_t *t;
	int d, l;
	size_t i;
	/* the p parity is accumulated in memory (pd), because the 32-bit
	 * ABI doesn't have enough xmm registers for five parities plus
	 * temporaries */
	uint8_t buffer[16+16];
	uint8_t *pd = __align_ptr(buffer, 16);

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];
	t = v[nd + 4];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 5; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	for (i = 0; i < size; i += 16) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));

		asm volatile ("movdqa %xmm4,%xmm0");
		asm volatile ("movdqa %%xmm4,%0" : "=m" (pd[0]));

		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
		asm volatile ("movdqa %xmm4,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");

		asm volatile ("movdqa %0,%%xmm1" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("pshufb %xmm4,%xmm1");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm1");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm2");

		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][2][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][2][1][0]));
		asm volatile ("pshufb %xmm4,%xmm3");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm3");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));

			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pcmpgtb %xmm0,%xmm5");
			asm volatile ("paddb %xmm0,%xmm0");
			asm volatile ("pand %xmm7,%xmm5");
			asm volatile ("pxor %xmm5,%xmm0");

			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm6");
			asm volatile ("movdqa %%xmm6,%0" : "=m" (pd[0]));

			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
			asm volatile ("movdqa %xmm4,%xmm5");
			asm volatile ("psrlw $4,%xmm5");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm5");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm1");
			asm volatile ("pxor %xmm7,%xmm1");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm2");
			asm volatile ("pxor %xmm7,%xmm2");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][2][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][2][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm3");
			asm volatile ("pxor %xmm7,%xmm3");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
		asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));

		asm volatile ("pxor %xmm5,%xmm5");
		asm volatile ("pcmpgtb %xmm0,%xmm5");
		asm volatile ("paddb %xmm0,%xmm0");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pxor %xmm5,%xmm0");

		asm volatile ("pxor %xmm4,%xmm0");
		asm volatile ("pxor %xmm4,%xmm1");
		asm volatile ("pxor %xmm4,%xmm2");
		asm volatile ("pxor %xmm4,%xmm3");
		asm volatile ("pxor %xmm4,%xmm6");

		asm volatile ("movntdq %%xmm6,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm0,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (s[i]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (t[i]));
	}

	raid_sse_end();
}
#endif
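
/*
 * Why pd exists in the 32-bit GEN5/GEN6 variants, as a sketch (ours,
 * documentation only): x86-32 exposes only xmm0-xmm7, too few for five
 * or six parity accumulators plus temporaries, so one or two parities
 * are kept in an aligned stack slot and folded through memory. The
 * movdqa/pxor/movdqa triple above is the vector form of this loop:
 */
#if 0 /* documentation only */
static inline void p_accumulate(uint8_t *pd, const uint8_t *data, size_t n)
{
	size_t i;

	for (i = 0; i < n; ++i)
		pd[i] ^= data[i];
}
#endif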

#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
/*
 * GEN5 (penta parity with Cauchy matrix) SSSE3 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen5_ssse3ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	uint8_t *t;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];
	t = v[nd + 4];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 5; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	asm volatile ("movdqa %0,%%xmm14" : : "m" (gfconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 16) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm10" : : "m" (v[l][i]));

		asm volatile ("movdqa %xmm10,%xmm0");
		asm volatile ("movdqa %xmm10,%xmm1");

		asm volatile ("movdqa %xmm10,%xmm11");
		asm volatile ("psrlw $4,%xmm11");
		asm volatile ("pand %xmm15,%xmm10");
		asm volatile ("pand %xmm15,%xmm11");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("pshufb %xmm10,%xmm2");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm2");

		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("pshufb %xmm10,%xmm3");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm3");

		asm volatile ("movdqa %0,%%xmm4" : : "m" (gfgenpshufb[l][2][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][2][1][0]));
		asm volatile ("pshufb %xmm10,%xmm4");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm4");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm10" : : "m" (v[d][i]));

			asm volatile ("pxor %xmm11,%xmm11");
			asm volatile ("pcmpgtb %xmm1,%xmm11");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm14,%xmm11");
			asm volatile ("pxor %xmm11,%xmm1");

			asm volatile ("pxor %xmm10,%xmm0");
			asm volatile ("pxor %xmm10,%xmm1");

			asm volatile ("movdqa %xmm10,%xmm11");
			asm volatile ("psrlw $4,%xmm11");
			asm volatile ("pand %xmm15,%xmm10");
			asm volatile ("pand %xmm15,%xmm11");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm2");
			asm volatile ("pxor %xmm13,%xmm2");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm3");
			asm volatile ("pxor %xmm13,%xmm3");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][2][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][2][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm4");
			asm volatile ("pxor %xmm13,%xmm4");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm10" : : "m" (v[0][i]));

		asm volatile ("pxor %xmm11,%xmm11");
		asm volatile ("pcmpgtb %xmm1,%xmm11");
		asm volatile ("paddb %xmm1,%xmm1");
		asm volatile ("pand %xmm14,%xmm11");
		asm volatile ("pxor %xmm11,%xmm1");

		asm volatile ("pxor %xmm10,%xmm0");
		asm volatile ("pxor %xmm10,%xmm1");
		asm volatile ("pxor %xmm10,%xmm2");
		asm volatile ("pxor %xmm10,%xmm3");
		asm volatile ("pxor %xmm10,%xmm4");

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
		asm volatile ("movntdq %%xmm4,%0" : "=m" (t[i]));
	}

	raid_sse_end();
}
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GEN5 (penta parity with Cauchy matrix) AVX2 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 */
void raid_gen5_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	uint8_t *t;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];
	t = v[nd + 4];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 5; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_avx_begin();

	/* generic case with at least two data disks */
	asm volatile ("vpxor %ymm8,%ymm8,%ymm8");
	asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfconst16.poly[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 32) {
		/* last disk without the multiplication by two */
		asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[l][i]));

		asm volatile ("vmovdqa %ymm10,%ymm0");
		asm volatile ("vmovdqa %ymm10,%ymm1");

		asm volatile ("vpsrlw $4,%ymm10,%ymm11");
		asm volatile ("vpand %ymm15,%ymm10,%ymm10");
		asm volatile ("vpand %ymm15,%ymm11,%ymm11");

		asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm2,%ymm2");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm2,%ymm2");

		asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm3,%ymm3");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm3,%ymm3");

		asm volatile ("vbroadcasti128 %0,%%ymm4" : : "m" (gfgenpshufb[l][2][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][2][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm4,%ymm4");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm4,%ymm4");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[d][i]));

			asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpand %ymm14,%ymm11,%ymm11");
			asm volatile ("vpxor %ymm11,%ymm1,%ymm1");

			asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm10,%ymm1,%ymm1");

			asm volatile ("vpsrlw $4,%ymm10,%ymm11");
			asm volatile ("vpand %ymm15,%ymm10,%ymm10");
			asm volatile ("vpand %ymm15,%ymm11,%ymm11");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm13,%ymm2,%ymm2");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm3,%ymm3");
			asm volatile ("vpxor %ymm13,%ymm3,%ymm3");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][2][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][2][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm4,%ymm4");
			asm volatile ("vpxor %ymm13,%ymm4,%ymm4");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[0][i]));

		asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
		asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
		asm volatile ("vpand %ymm14,%ymm11,%ymm11");
		asm volatile ("vpxor %ymm11,%ymm1,%ymm1");

		asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
		asm volatile ("vpxor %ymm10,%ymm1,%ymm1");
		asm volatile ("vpxor %ymm10,%ymm2,%ymm2");
		asm volatile ("vpxor %ymm10,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm10,%ymm4,%ymm4");

		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm3,%0" : "=m" (s[i]));
		asm volatile ("vmovntdq %%ymm4,%0" : "=m" (t[i]));
	}

	raid_avx_end();
}
#endif

#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * GEN6 (hexa parity with Cauchy matrix) SSSE3 implementation
 */
void raid_gen6_ssse3(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	uint8_t *t;
	uint8_t *u;
	int d, l;
	size_t i;
	/* the p and q parities are accumulated in memory (pd[0] and pd[16]),
	 * because the 32-bit ABI doesn't have enough xmm registers for six
	 * parities plus temporaries */
	uint8_t buffer[2*16+16];
	uint8_t *pd = __align_ptr(buffer, 16);

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];
	t = v[nd + 4];
	u = v[nd + 5];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 6; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_sse_begin();

	/* generic case with at least two data disks */
	for (i = 0; i < size; i += 16) {
		/* last disk without the multiplication by two */
		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));

		asm volatile ("movdqa %%xmm4,%0" : "=m" (pd[0]));
		asm volatile ("movdqa %%xmm4,%0" : "=m" (pd[16]));

		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
		asm volatile ("movdqa %xmm4,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");

		asm volatile ("movdqa %0,%%xmm0" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("pshufb %xmm4,%xmm0");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm0");

		asm volatile ("movdqa %0,%%xmm1" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("pshufb %xmm4,%xmm1");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm1");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][2][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][2][1][0]));
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm2");

		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][3][0][0]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][3][1][0]));
		asm volatile ("pshufb %xmm4,%xmm3");
		asm volatile ("pshufb %xmm5,%xmm7");
		asm volatile ("pxor %xmm7,%xmm3");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm5" : : "m" (pd[0]));
			asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[16]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));

			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pcmpgtb %xmm6,%xmm4");
			asm volatile ("paddb %xmm6,%xmm6");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pxor %xmm4,%xmm6");

			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));

			asm volatile ("pxor %xmm4,%xmm5");
			asm volatile ("pxor %xmm4,%xmm6");
			asm volatile ("movdqa %%xmm5,%0" : "=m" (pd[0]));
			asm volatile ("movdqa %%xmm6,%0" : "=m" (pd[16]));

			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
			asm volatile ("movdqa %xmm4,%xmm5");
			asm volatile ("psrlw $4,%xmm5");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm5");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm0");
			asm volatile ("pxor %xmm7,%xmm0");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm1");
			asm volatile ("pxor %xmm7,%xmm1");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][2][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][2][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm2");
			asm volatile ("pxor %xmm7,%xmm2");

			asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][3][0][0]));
			asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][3][1][0]));
			asm volatile ("pshufb %xmm4,%xmm6");
			asm volatile ("pshufb %xmm5,%xmm7");
			asm volatile ("pxor %xmm6,%xmm3");
			asm volatile ("pxor %xmm7,%xmm3");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm5" : : "m" (pd[0]));
		asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[16]));
		asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));

		asm volatile ("pxor %xmm4,%xmm4");
		asm volatile ("pcmpgtb %xmm6,%xmm4");
		asm volatile ("paddb %xmm6,%xmm6");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pxor %xmm4,%xmm6");

		asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
		asm volatile ("pxor %xmm4,%xmm0");
		asm volatile ("pxor %xmm4,%xmm1");
		asm volatile ("pxor %xmm4,%xmm2");
		asm volatile ("pxor %xmm4,%xmm3");
		asm volatile ("pxor %xmm4,%xmm5");
		asm volatile ("pxor %xmm4,%xmm6");

		asm volatile ("movntdq %%xmm5,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm6,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm0,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (s[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (t[i]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (u[i]));
	}

	raid_sse_end();
}
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
/*
 * GEN6 (hexa parity with Cauchy matrix) SSSE3 implementation
 *
 * Note that it uses all the 16 SSE registers, so it requires x64.
 */
void raid_gen6_ssse3ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	uint8_t *t;
	uint8_t *u;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];
	t = v[nd + 4];
	u = v[nd + 5];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 6; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_asm_begin();

	/* generic case with at least two data disks */
	asm volatile ("movdqa %0,%%xmm14" : : "m" (gfconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 16) {
		/* last disk without the by two multiplication */
		asm volatile ("movdqa %0,%%xmm10" : : "m" (v[l][i]));

		asm volatile ("movdqa %xmm10,%xmm0");
		asm volatile ("movdqa %xmm10,%xmm1");

		asm volatile ("movdqa %xmm10,%xmm11");
		asm volatile ("psrlw $4,%xmm11");
		asm volatile ("pand %xmm15,%xmm10");
		asm volatile ("pand %xmm15,%xmm11");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("pshufb %xmm10,%xmm2");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm2");

		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("pshufb %xmm10,%xmm3");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm3");

		asm volatile ("movdqa %0,%%xmm4" : : "m" (gfgenpshufb[l][2][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][2][1][0]));
		asm volatile ("pshufb %xmm10,%xmm4");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm4");

		asm volatile ("movdqa %0,%%xmm5" : : "m" (gfgenpshufb[l][3][0][0]));
		asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][3][1][0]));
		asm volatile ("pshufb %xmm10,%xmm5");
		asm volatile ("pshufb %xmm11,%xmm13");
		asm volatile ("pxor %xmm13,%xmm5");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("movdqa %0,%%xmm10" : : "m" (v[d][i]));

			asm volatile ("pxor %xmm11,%xmm11");
			asm volatile ("pcmpgtb %xmm1,%xmm11");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm14,%xmm11");
			asm volatile ("pxor %xmm11,%xmm1");

			asm volatile ("pxor %xmm10,%xmm0");
			asm volatile ("pxor %xmm10,%xmm1");

			asm volatile ("movdqa %xmm10,%xmm11");
			asm volatile ("psrlw $4,%xmm11");
			asm volatile ("pand %xmm15,%xmm10");
			asm volatile ("pand %xmm15,%xmm11");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm2");
			asm volatile ("pxor %xmm13,%xmm2");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm3");
			asm volatile ("pxor %xmm13,%xmm3");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][2][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][2][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm4");
			asm volatile ("pxor %xmm13,%xmm4");

			asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][3][0][0]));
			asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][3][1][0]));
			asm volatile ("pshufb %xmm10,%xmm12");
			asm volatile ("pshufb %xmm11,%xmm13");
			asm volatile ("pxor %xmm12,%xmm5");
			asm volatile ("pxor %xmm13,%xmm5");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("movdqa %0,%%xmm10" : : "m" (v[0][i]));

		asm volatile ("pxor %xmm11,%xmm11");
		asm volatile ("pcmpgtb %xmm1,%xmm11");
		asm volatile ("paddb %xmm1,%xmm1");
		asm volatile ("pand %xmm14,%xmm11");
		asm volatile ("pxor %xmm11,%xmm1");

		asm volatile ("pxor %xmm10,%xmm0");
		asm volatile ("pxor %xmm10,%xmm1");
		asm volatile ("pxor %xmm10,%xmm2");
		asm volatile ("pxor %xmm10,%xmm3");
		asm volatile ("pxor %xmm10,%xmm4");
		asm volatile ("pxor %xmm10,%xmm5");

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
		asm volatile ("movntdq %%xmm4,%0" : "=m" (t[i]));
		asm volatile ("movntdq %%xmm5,%0" : "=m" (u[i]));
	}

	raid_asm_end();
}
#endif
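/*
 * Illustrative sketch (not part of the original code): how each pair of
 * pshufb lookups above multiplies 16 bytes by a constant in GF(2^8).
 * Every multiplication constant c has two 16-entry tables, one indexed
 * by the low nibble and one by the high nibble of each byte, and the
 * two partial products are combined with a xor. This mirrors the
 * gfgenpshufb[][] table layout, but any pair of low/high nibble
 * product tables works the same way.
 */
static inline uint8_t gf_mul_pshufb_sketch(const uint8_t tbl_lo[16],
	const uint8_t tbl_hi[16], uint8_t x)
{
	/* tbl_lo[n] = c * n and tbl_hi[n] = c * (n << 4) in GF(2^8) */
	return tbl_lo[x & 0x0f] ^ tbl_hi[x >> 4];
}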
#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GEN6 (hexa parity with Cauchy matrix) AVX2 implementation
 *
 * Note that it uses all the 16 AVX registers, so it requires x64.
 */
void raid_gen6_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	uint8_t *s;
	uint8_t *t;
	uint8_t *u;
	int d, l;
	size_t i;

	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];
	s = v[nd + 3];
	t = v[nd + 4];
	u = v[nd + 5];

	/* special case with only one data disk */
	if (l == 0) {
		for (i = 0; i < 6; ++i)
			memcpy(v[1 + i], v[0], size);
		return;
	}

	raid_asm_begin();

	/* generic case with at least two data disks */
	asm volatile ("vpxor %ymm8,%ymm8,%ymm8");
	asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfconst16.poly[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 32) {
		/* last disk without the by two multiplication */
		asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[l][i]));

		asm volatile ("vmovdqa %ymm10,%ymm0");
		asm volatile ("vmovdqa %ymm10,%ymm1");

		asm volatile ("vpsrlw $4,%ymm10,%ymm11");
		asm volatile ("vpand %ymm15,%ymm10,%ymm10");
		asm volatile ("vpand %ymm15,%ymm11,%ymm11");

		asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfgenpshufb[l][0][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][0][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm2,%ymm2");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm2,%ymm2");

		asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfgenpshufb[l][1][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][1][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm3,%ymm3");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm3,%ymm3");

		asm volatile ("vbroadcasti128 %0,%%ymm4" : : "m" (gfgenpshufb[l][2][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][2][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm4,%ymm4");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm4,%ymm4");

		asm volatile ("vbroadcasti128 %0,%%ymm5" : : "m" (gfgenpshufb[l][3][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][3][1][0]));
		asm volatile ("vpshufb %ymm10,%ymm5,%ymm5");
		asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
		asm volatile ("vpxor %ymm13,%ymm5,%ymm5");

		/* intermediate disks */
		for (d = l - 1; d > 0; --d) {
			asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[d][i]));

			asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpand %ymm14,%ymm11,%ymm11");
			asm volatile ("vpxor %ymm11,%ymm1,%ymm1");

			asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm10,%ymm1,%ymm1");

			asm volatile ("vpsrlw $4,%ymm10,%ymm11");
			asm volatile ("vpand %ymm15,%ymm10,%ymm10");
			asm volatile ("vpand %ymm15,%ymm11,%ymm11");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][0][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][0][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm13,%ymm2,%ymm2");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][1][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][1][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm3,%ymm3");
			asm volatile ("vpxor %ymm13,%ymm3,%ymm3");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][2][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][2][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm4,%ymm4");
			asm volatile ("vpxor %ymm13,%ymm4,%ymm4");

			asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][3][0][0]));
			asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][3][1][0]));
			asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
			asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm12,%ymm5,%ymm5");
			asm volatile ("vpxor %ymm13,%ymm5,%ymm5");
		}

		/* first disk with all coefficients at 1 */
		asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[0][i]));

		asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
		asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
		asm volatile ("vpand %ymm14,%ymm11,%ymm11");
		asm volatile ("vpxor %ymm11,%ymm1,%ymm1");

		asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
		asm volatile ("vpxor %ymm10,%ymm1,%ymm1");
		asm volatile ("vpxor %ymm10,%ymm2,%ymm2");
		asm volatile ("vpxor %ymm10,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm10,%ymm4,%ymm4");
		asm volatile ("vpxor %ymm10,%ymm5,%ymm5");

		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm3,%0" : "=m" (s[i]));
		asm volatile ("vmovntdq %%ymm4,%0" : "=m" (t[i]));
		asm volatile ("vmovntdq %%ymm5,%0" : "=m" (u[i]));
	}

	raid_asm_end();
}
#endif
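/*
 * Illustrative sketch (not part of the original code): the semantics of
 * the vpshufb instruction used above. It shuffles each 128-bit lane of
 * a ymm register independently, which is why the AVX2 code loads every
 * 16-byte lookup table with vbroadcasti128, duplicating it into both
 * lanes.
 */
static void vpshufb_sketch(uint8_t out[32], const uint8_t tbl[32],
	const uint8_t idx[32])
{
	int lane, b;

	for (lane = 0; lane < 32; lane += 16)
		for (b = 0; b < 16; ++b) {
			uint8_t s = idx[lane + b];

			/* a set high bit selects zero; otherwise the low
			 * nibble indexes within the same 16-byte lane */
			out[lane + b] = (s & 0x80) ? 0 : tbl[lane + (s & 0x0f)];
		}
}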
#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * RAID recovering for one disk SSSE3 implementation
 */
void raid_rec1_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	size_t i;
	uint8_t *p;
	uint8_t *pa;
	uint8_t G;
	uint8_t V;

	(void)nr; /* unused, it's always 1 */

	/* if it's RAID5, use the faster function */
	if (ip[0] == 0) {
		raid_rec1of1(id, nd, size, vv);
		return;
	}

	/* setup the coefficients matrix */
	G = A(ip[0], id[0]);

	/* invert it to solve the system of linear equations */
	V = inv(G);

	/* compute delta parity */
	raid_delta_gen(1, id, ip, nd, size, vv);

	p = v[nd + ip[0]];
	pa = v[id[0]];

	raid_asm_begin();

	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
	asm volatile ("movdqa %0,%%xmm4" : : "m" (gfmulpshufb[V][0][0]));
	asm volatile ("movdqa %0,%%xmm5" : : "m" (gfmulpshufb[V][1][0]));

	for (i = 0; i < size; i += 16) {
		asm volatile ("movdqa %0,%%xmm0" : : "m" (p[i]));
		asm volatile ("movdqa %0,%%xmm1" : : "m" (pa[i]));
		asm volatile ("movdqa %xmm4,%xmm2");
		asm volatile ("movdqa %xmm5,%xmm3");
		asm volatile ("pxor %xmm0,%xmm1");
		asm volatile ("movdqa %xmm1,%xmm0");
		asm volatile ("psrlw $4,%xmm1");
		asm volatile ("pand %xmm7,%xmm0");
		asm volatile ("pand %xmm7,%xmm1");
		asm volatile ("pshufb %xmm0,%xmm2");
		asm volatile ("pshufb %xmm1,%xmm3");
		asm volatile ("pxor %xmm3,%xmm2");
		asm volatile ("movdqa %%xmm2,%0" : "=m" (pa[i]));
	}

	raid_asm_end();
}
#endif
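/*
 * Illustrative sketch (not part of the original code): the scalar logic
 * of the loop above. After raid_delta_gen() has rebuilt into pa[] the
 * parity of the surviving disks only, the lost data is the delta from
 * the real parity multiplied by the inverted coefficient V.
 * gf_mul_sketch() is a plain GF(2^8) multiplication written here for
 * reference; it is not part of the library API.
 */
static uint8_t gf_mul_sketch(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		a = (uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00);
		b >>= 1;
	}

	return r;
}

static void rec1_sketch(uint8_t *pa, const uint8_t *p, uint8_t V, size_t size)
{
	size_t i;

	for (i = 0; i < size; ++i)
		pa[i] = gf_mul_sketch(V, p[i] ^ pa[i]);
}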
#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * RAID recovering for two disks SSSE3 implementation
 */
void raid_rec2_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	const int N = 2;
	size_t i;
	uint8_t *p[N];
	uint8_t *pa[N];
	uint8_t G[N * N];
	uint8_t V[N * N];
	int j, k;

	(void)nr; /* unused, it's always 2 */

	/* setup the coefficients matrix */
	for (j = 0; j < N; ++j)
		for (k = 0; k < N; ++k)
			G[j * N + k] = A(ip[j], id[k]);

	/* invert it to solve the system of linear equations */
	raid_invert(G, V, N);

	/* compute delta parity */
	raid_delta_gen(N, id, ip, nd, size, vv);

	for (j = 0; j < N; ++j) {
		p[j] = v[nd + ip[j]];
		pa[j] = v[id[j]];
	}

	raid_asm_begin();

	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 16) {
		asm volatile ("movdqa %0,%%xmm0" : : "m" (p[0][i]));
		asm volatile ("movdqa %0,%%xmm2" : : "m" (pa[0][i]));
		asm volatile ("movdqa %0,%%xmm1" : : "m" (p[1][i]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (pa[1][i]));
		asm volatile ("pxor %xmm2,%xmm0");
		asm volatile ("pxor %xmm3,%xmm1");

		asm volatile ("pxor %xmm6,%xmm6");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[0]][0][0]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[0]][1][0]));
		asm volatile ("movdqa %xmm0,%xmm4");
		asm volatile ("movdqa %xmm0,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm3");
		asm volatile ("pxor %xmm2,%xmm6");
		asm volatile ("pxor %xmm3,%xmm6");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[1]][0][0]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[1]][1][0]));
		asm volatile ("movdqa %xmm1,%xmm4");
		asm volatile ("movdqa %xmm1,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm3");
		asm volatile ("pxor %xmm2,%xmm6");
		asm volatile ("pxor %xmm3,%xmm6");

		asm volatile ("movdqa %%xmm6,%0" : "=m" (pa[0][i]));

		asm volatile ("pxor %xmm6,%xmm6");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[2]][0][0]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[2]][1][0]));
		asm volatile ("movdqa %xmm0,%xmm4");
		asm volatile ("movdqa %xmm0,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm3");
		asm volatile ("pxor %xmm2,%xmm6");
		asm volatile ("pxor %xmm3,%xmm6");

		asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[3]][0][0]));
		asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[3]][1][0]));
		asm volatile ("movdqa %xmm1,%xmm4");
		asm volatile ("movdqa %xmm1,%xmm5");
		asm volatile ("psrlw $4,%xmm5");
		asm volatile ("pand %xmm7,%xmm4");
		asm volatile ("pand %xmm7,%xmm5");
		asm volatile ("pshufb %xmm4,%xmm2");
		asm volatile ("pshufb %xmm5,%xmm3");
		asm volatile ("pxor %xmm2,%xmm6");
		asm volatile ("pxor %xmm3,%xmm6");

		asm volatile ("movdqa %%xmm6,%0" : "=m" (pa[1][i]));
	}

	raid_asm_end();
}
#endif
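/*
 * Illustrative sketch (not part of the original code): the per-byte
 * computation of the loop above. With d[j] = p[j][i] ^ pa[j][i] being
 * the delta between the real parity and the parity of the surviving
 * disks, the two lost data bytes come out by applying the inverted 2x2
 * coefficients matrix V. gf_mul_sketch() is the helper defined
 * earlier, not the library API.
 */
static void rec2_byte_sketch(uint8_t out[2], const uint8_t d[2],
	const uint8_t V[4])
{
	out[0] = gf_mul_sketch(V[0], d[0]) ^ gf_mul_sketch(V[1], d[1]);
	out[1] = gf_mul_sketch(V[2], d[0]) ^ gf_mul_sketch(V[3], d[1]);
}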
#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
/*
 * RAID recovering SSSE3 implementation
 */
void raid_recX_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	int N = nr;
	uint8_t *p[RAID_PARITY_MAX];
	uint8_t *pa[RAID_PARITY_MAX];
	uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
	uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
	uint8_t buffer[RAID_PARITY_MAX*16+16];
	uint8_t *pd = __align_ptr(buffer, 16);
	size_t i;
	int j, k;

	/* setup the coefficients matrix */
	for (j = 0; j < N; ++j)
		for (k = 0; k < N; ++k)
			G[j * N + k] = A(ip[j], id[k]);

	/* invert it to solve the system of linear equations */
	raid_invert(G, V, N);

	/* compute delta parity */
	raid_delta_gen(N, id, ip, nd, size, vv);

	for (j = 0; j < N; ++j) {
		p[j] = v[nd + ip[j]];
		pa[j] = v[id[j]];
	}

	raid_asm_begin();

	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 16) {
		/* delta */
		for (j = 0; j < N; ++j) {
			asm volatile ("movdqa %0,%%xmm0" : : "m" (p[j][i]));
			asm volatile ("movdqa %0,%%xmm1" : : "m" (pa[j][i]));
			asm volatile ("pxor %xmm1,%xmm0");
			asm volatile ("movdqa %%xmm0,%0" : "=m" (pd[j*16]));
		}

		/* reconstruct */
		for (j = 0; j < N; ++j) {
			asm volatile ("pxor %xmm0,%xmm0");
			asm volatile ("pxor %xmm1,%xmm1");

			for (k = 0; k < N; ++k) {
				uint8_t m = V[j * N + k];

				asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[m][0][0]));
				asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[m][1][0]));
				asm volatile ("movdqa %0,%%xmm4" : : "m" (pd[k*16]));
				asm volatile ("movdqa %xmm4,%xmm5");
				asm volatile ("psrlw $4,%xmm5");
				asm volatile ("pand %xmm7,%xmm4");
				asm volatile ("pand %xmm7,%xmm5");
				asm volatile ("pshufb %xmm4,%xmm2");
				asm volatile ("pshufb %xmm5,%xmm3");
				asm volatile ("pxor %xmm2,%xmm0");
				asm volatile ("pxor %xmm3,%xmm1");
			}

			asm volatile ("pxor %xmm1,%xmm0");
			asm volatile ("movdqa %%xmm0,%0" : "=m" (pa[j][i]));
		}
	}

	raid_asm_end();
}
#endif
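/*
 * Illustrative sketch (not part of the original code): the structure of
 * the loop above in plain C, for a single byte offset i. The deltas are
 * staged in the aligned pd[] buffer once per block so the N x N
 * reconstruction can reuse them instead of reloading p[]/pa[] N times.
 * gf_mul_sketch() is the helper defined earlier, not the library API.
 */
static void recX_byte_sketch(int N, uint8_t **p, uint8_t **pa,
	const uint8_t *V, size_t i)
{
	uint8_t d[RAID_PARITY_MAX];
	int j, k;

	/* delta */
	for (j = 0; j < N; ++j)
		d[j] = p[j][i] ^ pa[j][i];

	/* reconstruct */
	for (j = 0; j < N; ++j) {
		uint8_t x = 0;

		for (k = 0; k < N; ++k)
			x ^= gf_mul_sketch(V[j * N + k], d[k]);

		pa[j][i] = x;
	}
}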
#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
/*
 * RAID recovering for one disk AVX2 implementation
 */
void raid_rec1_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	size_t i;
	uint8_t *p;
	uint8_t *pa;
	uint8_t G;
	uint8_t V;

	(void)nr; /* unused, it's always 1 */

	/* if it's RAID5, use the faster function */
	if (ip[0] == 0) {
		raid_rec1of1(id, nd, size, vv);
		return;
	}

	/* setup the coefficients matrix */
	G = A(ip[0], id[0]);

	/* invert it to solve the system of linear equations */
	V = inv(G);

	/* compute delta parity */
	raid_delta_gen(1, id, ip, nd, size, vv);

	p = v[nd + ip[0]];
	pa = v[id[0]];

	raid_asm_begin();

	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.low4[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm4" : : "m" (gfmulpshufb[V][0][0]));
	asm volatile ("vbroadcasti128 %0,%%ymm5" : : "m" (gfmulpshufb[V][1][0]));

	for (i = 0; i < size; i += 32) {
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[i]));
		asm volatile ("vmovdqa %0,%%ymm1" : : "m" (pa[i]));
		asm volatile ("vpxor %ymm1,%ymm0,%ymm0");
		asm volatile ("vpsrlw $4,%ymm0,%ymm1");
		asm volatile ("vpand %ymm7,%ymm0,%ymm0");
		asm volatile ("vpand %ymm7,%ymm1,%ymm1");
		asm volatile ("vpshufb %ymm0,%ymm4,%ymm2");
		asm volatile ("vpshufb %ymm1,%ymm5,%ymm3");
		asm volatile ("vpxor %ymm3,%ymm2,%ymm2");
		asm volatile ("vmovdqa %%ymm2,%0" : "=m" (pa[i]));
	}

	raid_asm_end();
}
#endif
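#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
/*
 * Illustrative usage sketch (not part of the original code): how one of
 * the recovering functions above is typically invoked. The vv[] vector
 * holds the nd data blocks followed by the parity blocks, id[] lists
 * the indexes of the data blocks to recover, and ip[] the indexes of
 * the parity blocks to use for the recovery. The index values below
 * are only an example.
 */
static void rec_usage_sketch(void **vv, size_t size)
{
	int id[1] = { 2 }; /* recover the third data block... */
	int ip[1] = { 1 }; /* ...using the 'q' parity */

	raid_rec1_avx2(1, id, ip, 8, size, vv); /* with 8 data disks */
}
#endif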
#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
/*
 * RAID recovering for two disks AVX2 implementation
 */
void raid_rec2_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	const int N = 2;
	size_t i;
	uint8_t *p[N];
	uint8_t *pa[N];
	uint8_t G[N * N];
	uint8_t V[N * N];
	int j, k;

	(void)nr; /* unused, it's always 2 */

	/* setup the coefficients matrix */
	for (j = 0; j < N; ++j)
		for (k = 0; k < N; ++k)
			G[j * N + k] = A(ip[j], id[k]);

	/* invert it to solve the system of linear equations */
	raid_invert(G, V, N);

	/* compute delta parity */
	raid_delta_gen(N, id, ip, nd, size, vv);

	for (j = 0; j < N; ++j) {
		p[j] = v[nd + ip[j]];
		pa[j] = v[id[j]];
	}

	raid_asm_begin();

	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 32) {
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[0][i]));
		asm volatile ("vmovdqa %0,%%ymm2" : : "m" (pa[0][i]));
		asm volatile ("vmovdqa %0,%%ymm1" : : "m" (p[1][i]));
		asm volatile ("vmovdqa %0,%%ymm3" : : "m" (pa[1][i]));
		asm volatile ("vpxor %ymm2,%ymm0,%ymm0");
		asm volatile ("vpxor %ymm3,%ymm1,%ymm1");

		asm volatile ("vpxor %ymm6,%ymm6,%ymm6");

		asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[0]][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[0]][1][0]));
		asm volatile ("vpsrlw $4,%ymm0,%ymm5");
		asm volatile ("vpand %ymm7,%ymm0,%ymm4");
		asm volatile ("vpand %ymm7,%ymm5,%ymm5");
		asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
		asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
		asm volatile ("vpxor %ymm3,%ymm6,%ymm6");

		asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[1]][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[1]][1][0]));
		asm volatile ("vpsrlw $4,%ymm1,%ymm5");
		asm volatile ("vpand %ymm7,%ymm1,%ymm4");
		asm volatile ("vpand %ymm7,%ymm5,%ymm5");
		asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
		asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
		asm volatile ("vpxor %ymm3,%ymm6,%ymm6");

		asm volatile ("vmovdqa %%ymm6,%0" : "=m" (pa[0][i]));

		asm volatile ("vpxor %ymm6,%ymm6,%ymm6");

		asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[2]][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[2]][1][0]));
		asm volatile ("vpsrlw $4,%ymm0,%ymm5");
		asm volatile ("vpand %ymm7,%ymm0,%ymm4");
		asm volatile ("vpand %ymm7,%ymm5,%ymm5");
		asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
		asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
		asm volatile ("vpxor %ymm3,%ymm6,%ymm6");

		asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[3]][0][0]));
		asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[3]][1][0]));
		asm volatile ("vpsrlw $4,%ymm1,%ymm5");
		asm volatile ("vpand %ymm7,%ymm1,%ymm4");
		asm volatile ("vpand %ymm7,%ymm5,%ymm5");
		asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
		asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
		asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
		asm volatile ("vpxor %ymm3,%ymm6,%ymm6");

		asm volatile ("vmovdqa %%ymm6,%0" : "=m" (pa[1][i]));
	}

	raid_asm_end();
}
#endif
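/*
 * Illustrative sketch (not part of the original code): the result that
 * the raid_invert() call above produces for the N = 2 case, computed
 * here by adjugate and determinant under the assumption that the
 * matrix is invertible. gf_mul_sketch() is the helper defined earlier;
 * gf_inv_sketch() inverts by exponentiation, since a^254 = a^-1 in
 * GF(2^8) and negation is the identity.
 */
static uint8_t gf_inv_sketch(uint8_t a)
{
	uint8_t r = 1;
	int e;

	for (e = 0; e < 254; ++e)
		r = gf_mul_sketch(r, a);

	return r;
}

static void invert2x2_sketch(const uint8_t G[4], uint8_t V[4])
{
	/* det = G00*G11 + G01*G10, with + being xor in GF(2^8) */
	uint8_t det = gf_mul_sketch(G[0], G[3]) ^ gf_mul_sketch(G[1], G[2]);
	uint8_t idet = gf_inv_sketch(det);

	V[0] = gf_mul_sketch(G[3], idet);
	V[1] = gf_mul_sketch(G[1], idet);
	V[2] = gf_mul_sketch(G[2], idet);
	V[3] = gf_mul_sketch(G[0], idet);
}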
#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
/*
 * RAID recovering AVX2 implementation
 */
void raid_recX_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	int N = nr;
	uint8_t *p[RAID_PARITY_MAX];
	uint8_t *pa[RAID_PARITY_MAX];
	uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
	uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
	uint8_t buffer[RAID_PARITY_MAX*32+32];
	uint8_t *pd = __align_ptr(buffer, 32);
	size_t i;
	int j, k;

	/* setup the coefficients matrix */
	for (j = 0; j < N; ++j)
		for (k = 0; k < N; ++k)
			G[j * N + k] = A(ip[j], id[k]);

	/* invert it to solve the system of linear equations */
	raid_invert(G, V, N);

	/* compute delta parity */
	raid_delta_gen(N, id, ip, nd, size, vv);

	for (j = 0; j < N; ++j) {
		p[j] = v[nd + ip[j]];
		pa[j] = v[id[j]];
	}

	raid_asm_begin();

	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.low4[0]));

	for (i = 0; i < size; i += 32) {
		/* delta */
		for (j = 0; j < N; ++j) {
			asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[j][i]));
			asm volatile ("vmovdqa %0,%%ymm1" : : "m" (pa[j][i]));
			asm volatile ("vpxor %ymm1,%ymm0,%ymm0");
			asm volatile ("vmovdqa %%ymm0,%0" : "=m" (pd[j*32]));
		}

		/* reconstruct */
		for (j = 0; j < N; ++j) {
			asm volatile ("vpxor %ymm0,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm1,%ymm1,%ymm1");

			for (k = 0; k < N; ++k) {
				uint8_t m = V[j * N + k];

				asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[m][0][0]));
				asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[m][1][0]));
				asm volatile ("vmovdqa %0,%%ymm4" : : "m" (pd[k*32]));
				asm volatile ("vpsrlw $4,%ymm4,%ymm5");
				asm volatile ("vpand %ymm7,%ymm4,%ymm4");
				asm volatile ("vpand %ymm7,%ymm5,%ymm5");
				asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
				asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
				asm volatile ("vpxor %ymm2,%ymm0,%ymm0");
				asm volatile ("vpxor %ymm3,%ymm1,%ymm1");
			}

			asm volatile ("vpxor %ymm1,%ymm0,%ymm0");
			asm volatile ("vmovdqa %%ymm0,%0" : "=m" (pa[j][i]));
		}
	}

	raid_asm_end();
}
#endif