/*
 * Copyright (C) 2013 Andrea Mazzoleni
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
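
/*
 * The full source pulls uint8_t, size_t, the __aligned() macro and the
 * CONFIG_* build switches from the library's internal headers; the two
 * standard includes below cover only what this excerpt itself needs.
 */
#include <stdint.h>
#include <stddef.h>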
#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
static const struct gfzconst16 {
	uint8_t poly[16];	/* low byte of the GF(2^8) polynomial 0x11d */
	uint8_t half[16];	/* 2^-1 in GF(2^8) */
	uint8_t low7[16];	/* mask for the low 7 bits of each byte */
} gfzconst16 __aligned(64) =
{
	{
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
	},
	{
		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e,
		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e
	},
	{
		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
	}
};
#endif
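
/*
 * The constants above drive the GF(2^8) arithmetic (polynomial 0x11d):
 * "poly" is xored in when a multiply by 2 overflows, "half" is the field
 * value of 2^-1 and is xored in when a divide by 2 drops a set bit 0,
 * and "low7" clears the bit that psrlw/vpsrlw leaks across byte lanes.
 * A scalar sketch of the two per-byte operations the SIMD code performs
 * (illustrative helpers only, not part of the library):
 */
#if 0
static uint8_t gf_mul2(uint8_t a)
{
	/* multiply by 2: shift left and reduce with the polynomial */
	return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static uint8_t gf_div2(uint8_t a)
{
	/* multiply by 2^-1: shift right and add 2^-1 back if bit 0 was set */
	return (uint8_t)((a >> 1) ^ ((a & 0x01) ? 0x8e : 0));
}
#endif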
#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/*
 * GENz (triple parity with powers of 2^-1) SSE2 implementation
 */
void raid_genz_sse2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p, *q, *r;
	int d, l;
	size_t i;

	/* the three parity buffers follow the nd data buffers in vv */
	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* xmm7 = polynomial, xmm3 = 2^-1, xmm6 = mask of the low 7 bits */
	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("movdqa %0,%%xmm6" : : "m" (gfzconst16.low7[0]));

	for (i = 0; i < size; i += 16) {
		/* start the p/q/r accumulators from the last data disk */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %xmm0,%xmm1");
		asm volatile ("movdqa %xmm0,%xmm2");
		for (d = l - 1; d >= 0; --d) {
			/* q (xmm1): multiply by 2 in GF(2^8) */
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pcmpgtb %xmm1,%xmm4");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pxor %xmm4,%xmm1");

			/* r (xmm2): multiply by 2^-1 in GF(2^8) */
			asm volatile ("movdqa %xmm2,%xmm4");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("psllw $7,%xmm4");
			asm volatile ("psrlw $1,%xmm2");
			asm volatile ("pcmpgtb %xmm4,%xmm5");
			asm volatile ("pand %xmm6,%xmm2");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pxor %xmm5,%xmm2");

			/* accumulate the data of disk d into p, q and r */
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
		}

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
	}

	/* make the non-temporal stores visible before returning */
	asm volatile ("sfence" : : : "memory");
}
#endif
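
/*
 * Scalar reference of what the loop above computes for every byte i
 * (illustrative only, not part of the library). With d_0..d_l the data
 * disks, evaluated with Horner's rule from the last disk down:
 *
 *   p[i] = d_l[i] ^ ... ^ d_1[i] ^ d_0[i]
 *   q[i] = 2^l * d_l[i] ^ ... ^ 2 * d_1[i] ^ d_0[i]
 *   r[i] = 2^-l * d_l[i] ^ ... ^ 2^-1 * d_1[i] ^ d_0[i]
 */
#if 0
static void raid_genz_ref(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	uint8_t *r = v[nd + 2];
	int l = nd - 1;
	size_t i;
	int d;

	for (i = 0; i < size; ++i) {
		uint8_t vp = v[l][i];
		uint8_t vq = vp;
		uint8_t vr = vp;

		for (d = l - 1; d >= 0; --d) {
			/* gf_mul2/gf_div2 are the sketch helpers above */
			vq = gf_mul2(vq) ^ v[d][i];
			vr = gf_div2(vr) ^ v[d][i];
			vp ^= v[d][i];
		}
		p[i] = vp;
		q[i] = vq;
		r[i] = vr;
	}
}
#endif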
#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
/*
 * GENz (triple parity with powers of 2^-1) SSE2 implementation
 *
 * Note that it uses 16 xmm registers, so x64 is required.
 */
void raid_genz_sse2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p, *q, *r;
	int d, l;
	size_t i;

	/* the three parity buffers follow the nd data buffers in vv */
	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* xmm7 = polynomial, xmm3 = 2^-1, xmm11 = mask of the low 7 bits */
	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("movdqa %0,%%xmm11" : : "m" (gfzconst16.low7[0]));

	for (i = 0; i < size; i += 32) {
		/* process two 16 byte blocks per iteration */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm8" : : "m" (v[l][i + 16]));
		asm volatile ("movdqa %xmm0,%xmm1");
		asm volatile ("movdqa %xmm8,%xmm9");
		asm volatile ("movdqa %xmm0,%xmm2");
		asm volatile ("movdqa %xmm8,%xmm10");
		for (d = l - 1; d >= 0; --d) {
			/* q (xmm1/xmm9) * 2 and r (xmm2/xmm10) * 2^-1, interleaved */
			asm volatile ("movdqa %xmm2,%xmm6");
			asm volatile ("movdqa %xmm10,%xmm14");
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pxor %xmm12,%xmm12");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pxor %xmm13,%xmm13");
			asm volatile ("psllw $7,%xmm6");
			asm volatile ("psllw $7,%xmm14");
			asm volatile ("psrlw $1,%xmm2");
			asm volatile ("psrlw $1,%xmm10");
			asm volatile ("pcmpgtb %xmm1,%xmm4");
			asm volatile ("pcmpgtb %xmm9,%xmm12");
			asm volatile ("pcmpgtb %xmm6,%xmm5");
			asm volatile ("pcmpgtb %xmm14,%xmm13");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("paddb %xmm9,%xmm9");
			asm volatile ("pand %xmm11,%xmm2");
			asm volatile ("pand %xmm11,%xmm10");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm12");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pand %xmm3,%xmm13");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm12,%xmm9");
			asm volatile ("pxor %xmm5,%xmm2");
			asm volatile ("pxor %xmm13,%xmm10");

			/* accumulate the data of disk d into p, q and r */
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
			asm volatile ("pxor %xmm12,%xmm8");
			asm volatile ("pxor %xmm12,%xmm9");
			asm volatile ("pxor %xmm12,%xmm10");
		}

		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
	}

	/* make the non-temporal stores visible before returning */
	asm volatile ("sfence" : : : "memory");
}
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GENz (triple parity with powers of 2^-1) AVX2 implementation
 *
 * Note that it uses 16 ymm registers, so x64 is required.
 */
void raid_genz_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p, *q, *r;
	int d, l;
	size_t i;

	/* the three parity buffers follow the nd data buffers in vv */
	l = nd - 1;
	p = v[nd];
	q = v[nd + 1];
	r = v[nd + 2];

	/* ymm7 = polynomial, ymm3 = 2^-1, ymm11 = mask of the low 7 bits */
	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfzconst16.low7[0]));

	/* ymm15 is kept at zero for the sign-bit compares */
	asm volatile ("vpxor %ymm15,%ymm15,%ymm15");

	for (i = 0; i < size; i += 64) {
		/* process two 32 byte blocks per iteration */
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm8" : : "m" (v[l][i + 32]));
		asm volatile ("vmovdqa %ymm0,%ymm1");
		asm volatile ("vmovdqa %ymm8,%ymm9");
		asm volatile ("vmovdqa %ymm0,%ymm2");
		asm volatile ("vmovdqa %ymm8,%ymm10");
		for (d = l - 1; d >= 0; --d) {
			/* q (ymm1/ymm9) * 2 and r (ymm2/ymm10) * 2^-1, interleaved */
			asm volatile ("vpsllw $7,%ymm2,%ymm6");
			asm volatile ("vpsllw $7,%ymm10,%ymm14");
			asm volatile ("vpsrlw $1,%ymm2,%ymm2");
			asm volatile ("vpsrlw $1,%ymm10,%ymm10");
			asm volatile ("vpcmpgtb %ymm1,%ymm15,%ymm4");
			asm volatile ("vpcmpgtb %ymm9,%ymm15,%ymm12");
			asm volatile ("vpcmpgtb %ymm6,%ymm15,%ymm5");
			asm volatile ("vpcmpgtb %ymm14,%ymm15,%ymm13");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
			asm volatile ("vpand %ymm11,%ymm2,%ymm2");
			asm volatile ("vpand %ymm11,%ymm10,%ymm10");
			asm volatile ("vpand %ymm7,%ymm4,%ymm4");
			asm volatile ("vpand %ymm7,%ymm12,%ymm12");
			asm volatile ("vpand %ymm3,%ymm5,%ymm5");
			asm volatile ("vpand %ymm3,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
			asm volatile ("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm13,%ymm10,%ymm10");

			/* accumulate the data of disk d into p, q and r */
			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
			asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
			asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
		}

		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
	}

	/* make the non-temporal stores visible and leave the AVX state clean */
	asm volatile ("sfence" : : : "memory");
	asm volatile ("vzeroupper");
}
#endif
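
/*
 * Example call (illustrative only, not part of the library): computing
 * the three parities for ND data buffers with the SSE2 kernel. It
 * assumes a CONFIG_X86/CONFIG_SSE2 build, buffers aligned to at least
 * 16 bytes (32 for the AVX2 kernel) and a size that is a multiple of
 * the kernel's block (16, 32 or 64 bytes respectively).
 */
#if 0
#include <stdlib.h>
#include <string.h>

static void example(void)
{
	enum { ND = 4, SIZE = 4096 };
	void *vv[ND + 3];
	int j;

	/* ND data buffers followed by the p, q and r parity buffers */
	for (j = 0; j < ND + 3; ++j)
		vv[j] = aligned_alloc(64, SIZE);

	for (j = 0; j < ND; ++j)
		memset(vv[j], j + 1, SIZE); /* arbitrary data */

	raid_genz_sse2(ND, SIZE, vv);

	/* vv[ND], vv[ND + 1] and vv[ND + 2] now hold p, q and r */

	for (j = 0; j < ND + 3; ++j)
		free(vv[j]);
}
#endif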