git.sesse.net Git - bcachefs-tools-debian/blob - c_src/raid/raid.c

   1 /*
   2  * Copyright (C) 2013 Andrea Mazzoleni
   3  *
   4  * This program is free software: you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation, either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  */
  14
  15 #include "internal.h"
  16 #include "gf.h"
  17
  18 /*
  19  * This is a RAID implementation working in the Galois Field GF(2^8) with
  20  * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), and
  21  * supporting up to six parity levels.
  22  *
  23  * For RAID5 and RAID6 it works as as described in the H. Peter Anvin's
  24  * paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
  25  * complete explanation.
  26  *
  27  * To support triple parity, it was first evaluated and then dropped, an
  28  * extension of the same approach, with additional parity coefficients set
  29  * as powers of 2^-1, with equations:
  30  *
  31  * P = sum(Di)
  32  * Q = sum(2^i * Di)
  33  * R = sum(2^-i * Di) with 0<=i<N
  34  *
  35  * This approach works well for triple parity and it's very efficient,
  36  * because we can implement very fast parallel multiplications and
  37  * divisions by 2 in GF(2^8).
  38  *
  39  * It's also similar at the approach used by ZFS RAIDZ3, with the
  40  * difference that ZFS uses powers of 4 instead of 2^-1.
  41  *
  42  * Unfortunately it doesn't work beyond triple parity, because whatever
  43  * value we choose to generate the power coefficients to compute other
  44  * parities, the resulting equations are not solvable for some
  45  * combinations of missing disks.
  46  *
  47  * This is expected, because the Vandermonde matrix used to compute the
  48  * parity has no guarantee to have all submatrices not singular
  49  * [2, Chap 11, Problem 7] and this is a requirement to have
  50  * a MDS (Maximum Distance Separable) code [2, Chap 11, Theorem 8].
  51  *
  52  * To overcome this limitation, we use a Cauchy matrix [3][4] to compute
  53  * the parity. A Cauchy matrix has the property to have all the square
  54  * submatrices not singular, resulting in always solvable equations,
  55  * for any combination of missing disks.
  56  *
  57  * The problem of this approach is that it requires the use of
  58  * generic multiplications, and not only by 2 or 2^-1, potentially
  59  * affecting badly the performance.
  60  *
  61  * Hopefully there is a method to implement parallel multiplications
  62  * using SSSE3 or AVX2 instructions [1][5]. Method competitive with the
  63  * computation of triple parity using power coefficients.
  64  *
  65  * Another important property of the Cauchy matrix is that we can setup
  66  * the first two rows with coeffients equal at the RAID5 and RAID6 approach
  67  * decribed, resulting in a compatible extension, and requiring SSSE3
  68  * or AVX2 instructions only if triple parity or beyond is used.
  69  *
  70  * The matrix is also adjusted, multipling each row by a constant factor
  71  * to make the first column of all 1, to optimize the computation for
  72  * the first disk.
  73  *
  74  * This results in the matrix A[row,col] defined as:
  75  *
  76  * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
  77  * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
  78  * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
  79  * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
  80  * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
  81  * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
  82  *
  83  * This matrix supports 6 level of parity, one for each row, for up to 251
  84  * data disks, one for each column, with all the 377,342,351,231 square
  85  * submatrices not singular, verified also with brute-force.
  86  *
  87  * This matrix can be extended to support any number of parities, just
  88  * adding additional rows, and removing one column for each new row.
  89  * (see mktables.c for more details in how the matrix is generated)
  90  *
  91  * In details, parity is computed as:
  92  *
  93  * P = sum(Di)
  94  * Q = sum(2^i *  Di)
  95  * R = sum(A[2,i] * Di)
  96  * S = sum(A[3,i] * Di)
  97  * T = sum(A[4,i] * Di)
  98  * U = sum(A[5,i] * Di) with 0<=i<N
  99  *
 100  * To recover from a failure of six disks at indexes x,y,z,h,v,w,
 101  * with 0<=x<y<z<h<v<w<N, we compute the parity of the available N-6
 102  * disks as:
 103  *
 104  * Pa = sum(Di)
 105  * Qa = sum(2^i * Di)
 106  * Ra = sum(A[2,i] * Di)
 107  * Sa = sum(A[3,i] * Di)
 108  * Ta = sum(A[4,i] * Di)
 109  * Ua = sum(A[5,i] * Di) with 0<=i<N,i!=x,i!=y,i!=z,i!=h,i!=v,i!=w.
 110  *
 111  * And if we define:
 112  *
 113  * Pd = Pa + P
 114  * Qd = Qa + Q
 115  * Rd = Ra + R
 116  * Sd = Sa + S
 117  * Td = Ta + T
 118  * Ud = Ua + U
 119  *
 120  * we can sum these two sets of equations, obtaining:
 121  *
 122  * Pd =          Dx +          Dy +          Dz +          Dh +          Dv +          Dw
 123  * Qd =    2^x * Dx +    2^y * Dy +    2^z * Dz +    2^h * Dh +    2^v * Dv +    2^w * Dw
 124  * Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
 125  * Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
 126  * Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
 127  * Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
 128  *
 129  * A linear system always solvable because the coefficients matrix is
 130  * always not singular due the properties of the matrix A[].
 131  *
 132  * Resulting speed in x64, with 8 data disks, using a stripe of 256 KiB,
 133  * for a Core i5-4670K Haswell Quad-Core 3.4GHz is:
 134  *
 135  *             int8   int32   int64    sse2   ssse3    avx2
 136  *   gen1             13339   25438   45438           50588
 137  *   gen2              4115    6514   21840           32201
 138  *   gen3       814                           10154   18613
 139  *   gen4       620                            7569   14229
 140  *   gen5       496                            5149   10051
 141  *   gen6       413                            4239    8190
 142  *
 143  * Values are in MiB/s of data processed by a single thread, not counting
 144  * generated parity.
 145  *
 146  * You can replicate these results in your machine using the
 147  * "raid/test/speedtest.c" program.
 148  *
 149  * For comparison, the triple parity computation using the power
 150  * coeffients "1,2,2^-1" is only a little faster than the one based on
 151  * the Cauchy matrix if SSSE3 or AVX2 is present.
 152  *
 153  *             int8   int32   int64    sse2   ssse3    avx2
 154  *   genz              2337    2874   10920           18944
 155  *
 156  * In conclusion, the use of power coefficients, and specifically powers
 157  * of 1,2,2^-1, is the best option to implement triple parity in CPUs
 158  * without SSSE3 and AVX2.
 159  * But if a modern CPU with SSSE3 or AVX2 is available, the Cauchy
 160  * matrix is the best option because it provides a fast and general
 161  * approach working for any number of parities.
 162  *
 163  * References:
 164  * [1] Anvin, "The mathematics of RAID-6", 2004
 165  * [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
 166  * [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
 167  * [4] Roth, "Introduction to Coding Theory", 2006
 168  * [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
 169  */
 170
 171 /**
 172  * Generator matrix currently used.
 173  */
 174 const uint8_t (*raid_gfgen)[256];
 175
 176 void raid_mode(int mode)
 177 {
 178         if (mode == RAID_MODE_VANDERMONDE) {
 179                 raid_gen_ptr[2] = raid_genz_ptr;
 180                 raid_gfgen = gfvandermonde;
 181         } else {
 182                 raid_gen_ptr[2] = raid_gen3_ptr;
 183                 raid_gfgen = gfcauchy;
 184         }
 185 }
 186
 187 /**
 188  * Buffer filled with 0 used in recovering.
 189  */
 190 static void *raid_zero_block;
 191
 192 void raid_zero(void *zero)
 193 {
 194         raid_zero_block = zero;
 195 }
 196
 197 /*
 198  * Forwarders for parity computation.
 199  *
 200  * These functions compute the parity blocks from the provided data.
 201  *
 202  * The number of parities to compute is implicit in the position in the
 203  * forwarder vector. Position at index #i, computes (#i+1) parities.
 204  *
 205  * All these functions give the guarantee that parities are written
 206  * in order. First parity P, then parity Q, and so on.
 207  * This allows to specify the same memory buffer for multiple parities
 208  * knowning that you'll get the latest written one.
 209  * This characteristic is used by the raid_delta_gen() function to
 210  * avoid to damage unused parities in recovering.
 211  *
 212  * @nd Number of data blocks
 213  * @size Size of the blocks pointed by @v. It must be a multipler of 64.
 214  * @v Vector of pointers to the blocks of data and parity.
 215  *   It has (@nd + #parities) elements. The starting elements are the blocks
 216  *   for data, following with the parity blocks.
 217  *   Each block has @size bytes.
 218  */
 219 void (*raid_gen_ptr[RAID_PARITY_MAX])(int nd, size_t size, void **vv);
 220 void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
 221 void (*raid_genz_ptr)(int nd, size_t size, void **vv);
 222
 223 void raid_gen(int nd, int np, size_t size, void **v)
 224 {
 225         /* enforce limit on size */
 226         BUG_ON(size % 64 != 0);
 227
 228         /* enforce limit on number of failures */
 229         BUG_ON(np < 1);
 230         BUG_ON(np > RAID_PARITY_MAX);
 231
 232         raid_gen_ptr[np - 1](nd, size, v);
 233 }
 234
 235 /**
 236  * Inverts the square matrix M of size nxn into V.
 237  *
 238  * This is not a general matrix inversion because we assume the matrix M
 239  * to have all the square submatrix not singular.
 240  * We use Gauss elimination to invert.
 241  *
 242  * @M Matrix to invert with @n rows and @n columns.
 243  * @V Destination matrix where the result is put.
 244  * @n Number of rows and columns of the matrix.
 245  */
 246 void raid_invert(uint8_t *M, uint8_t *V, int n)
 247 {
 248         int i, j, k;
 249
 250         /* set the identity matrix in V */
 251         for (i = 0; i < n; ++i)
 252                 for (j = 0; j < n; ++j)
 253                         V[i * n + j] = i == j;
 254
 255         /* for each element in the diagonal */
 256         for (k = 0; k < n; ++k) {
 257                 uint8_t f;
 258
 259                 /* the diagonal element cannot be 0 because */
 260                 /* we are inverting matrices with all the square */
 261                 /* submatrices not singular */
 262                 BUG_ON(M[k * n + k] == 0);
 263
 264                 /* make the diagonal element to be 1 */
 265                 f = inv(M[k * n + k]);
 266                 for (j = 0; j < n; ++j) {
 267                         M[k * n + j] = mul(f, M[k * n + j]);
 268                         V[k * n + j] = mul(f, V[k * n + j]);
 269                 }
 270
 271                 /* make all the elements over and under the diagonal */
 272                 /* to be zero */
 273                 for (i = 0; i < n; ++i) {
 274                         if (i == k)
 275                                 continue;
 276                         f = M[i * n + k];
 277                         for (j = 0; j < n; ++j) {
 278                                 M[i * n + j] ^= mul(f, M[k * n + j]);
 279                                 V[i * n + j] ^= mul(f, V[k * n + j]);
 280                         }
 281                 }
 282         }
 283 }
 284
 285 /**
 286  * Computes the parity without the missing data blocks
 287  * and store it in the buffers of such data blocks.
 288  *
 289  * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
 290  */
 291 void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
 292 {
 293         void *p[RAID_PARITY_MAX];
 294         void *pa[RAID_PARITY_MAX];
 295         int i, j;
 296         int np;
 297         void *latest;
 298
 299         /* total number of parities we are going to process */
 300         /* they are both the used and the unused ones */
 301         np = ip[nr - 1] + 1;
 302
 303         /* latest missing data block */
 304         latest = v[id[nr - 1]];
 305
 306         /* setup pointers for delta computation */
 307         for (i = 0, j = 0; i < np; ++i) {
 308                 /* keep a copy of the original parity vector */
 309                 p[i] = v[nd + i];
 310
 311                 if (ip[j] == i) {
 312                         /*
 313                          * Set used parities to point to the missing
 314                          * data blocks.
 315                          *
 316                          * The related data blocks are instead set
 317                          * to point to the "zero" buffer.
 318                          */
 319
 320                         /* the latest parity to use ends the for loop and */
 321                         /* then it cannot happen to process more of them */
 322                         BUG_ON(j >= nr);
 323
 324                         /* buffer for missing data blocks */
 325                         pa[j] = v[id[j]];
 326
 327                         /* set at zero the missing data blocks */
 328                         v[id[j]] = raid_zero_block;
 329
 330                         /* compute the parity over the missing data blocks */
 331                         v[nd + i] = pa[j];
 332
 333                         /* check for the next used entry */
 334                         ++j;
 335                 } else {
 336                         /*
 337                          * Unused parities are going to be rewritten with
 338                          * not significative data, becase we don't have
 339                          * functions able to compute only a subset of
 340                          * parities.
 341                          *
 342                          * To avoid this, we reuse parity buffers,
 343                          * assuming that all the parity functions write
 344                          * parities in order.
 345                          *
 346                          * We assign the unused parity block to the same
 347                          * block of the latest used parity that we know it
 348                          * will be written.
 349                          *
 350                          * This means that this block will be written
 351                          * multiple times and only the latest write will
 352                          * contain the correct data.
 353                          */
 354                         v[nd + i] = latest;
 355                 }
 356         }
 357
 358         /* all the parities have to be processed */
 359         BUG_ON(j != nr);
 360
 361         /* recompute the parity, note that np may be smaller than the */
 362         /* total number of parities available */
 363         raid_gen(nd, np, size, v);
 364
 365         /* restore data buffers as before */
 366         for (j = 0; j < nr; ++j)
 367                 v[id[j]] = pa[j];
 368
 369         /* restore parity buffers as before */
 370         for (i = 0; i < np; ++i)
 371                 v[nd + i] = p[i];
 372 }
 373
 374 /**
 375  * Recover failure of one data block for PAR1.
 376  *
 377  * Starting from the equation:
 378  *
 379  * Pd = Dx
 380  *
 381  * and solving we get:
 382  *
 383  * Dx = Pd
 384  */
 385 void raid_rec1of1(int *id, int nd, size_t size, void **v)
 386 {
 387         void *p;
 388         void *pa;
 389
 390         /* for PAR1 we can directly compute the missing block */
 391         /* and we don't need to use the zero buffer */
 392         p = v[nd];
 393         pa = v[id[0]];
 394
 395         /* use the parity as missing data block */
 396         v[id[0]] = p;
 397
 398         /* compute the parity over the missing data block */
 399         v[nd] = pa;
 400
 401         /* compute */
 402         raid_gen(nd, 1, size, v);
 403
 404         /* restore as before */
 405         v[id[0]] = pa;
 406         v[nd] = p;
 407 }
 408
 409 /**
 410  * Recover failure of two data blocks for PAR2.
 411  *
 412  * Starting from the equations:
 413  *
 414  * Pd = Dx + Dy
 415  * Qd = 2^id[0] * Dx + 2^id[1] * Dy
 416  *
 417  * and solving we get:
 418  *
 419  *               1                     2^(-id[0])
 420  * Dy = ------------------- * Pd + ------------------- * Qd
 421  *      2^(id[1]-id[0]) + 1        2^(id[1]-id[0]) + 1
 422  *
 423  * Dx = Dy + Pd
 424  *
 425  * with conditions:
 426  *
 427  * 2^id[0] != 0
 428  * 2^(id[1]-id[0]) + 1 != 0
 429  *
 430  * That are always satisfied for any 0<=id[0]<id[1]<255.
 431  */
 432 void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
 433 {
 434         uint8_t **v = (uint8_t **)vv;
 435         size_t i;
 436         uint8_t *p;
 437         uint8_t *pa;
 438         uint8_t *q;
 439         uint8_t *qa;
 440         const uint8_t *T[2];
 441
 442         /* get multiplication tables */
 443         T[0] = table(inv(pow2(id[1] - id[0]) ^ 1));
 444         T[1] = table(inv(pow2(id[0]) ^ pow2(id[1])));
 445
 446         /* compute delta parity */
 447         raid_delta_gen(2, id, ip, nd, size, vv);
 448
 449         p = v[nd];
 450         q = v[nd + 1];
 451         pa = v[id[0]];
 452         qa = v[id[1]];
 453
 454         for (i = 0; i < size; ++i) {
 455                 /* delta */
 456                 uint8_t Pd = p[i] ^ pa[i];
 457                 uint8_t Qd = q[i] ^ qa[i];
 458
 459                 /* reconstruct */
 460                 uint8_t Dy = T[0][Pd] ^ T[1][Qd];
 461                 uint8_t Dx = Pd ^ Dy;
 462
 463                 /* set */
 464                 pa[i] = Dx;
 465                 qa[i] = Dy;
 466         }
 467 }
 468
 469 /*
 470  * Forwarders for data recovery.
 471  *
 472  * These functions recover data blocks using the specified parity
 473  * to recompute the missing data.
 474  *
 475  * Note that the format of vectors @id/@ip is different than raid_rec().
 476  * For example, in the vector @ip the first parity is represented with the
 477  * value 0 and not @nd.
 478  *
 479  * @nr Number of failed data blocks to recover.
 480  * @id[] Vector of @nr indexes of the data blocks to recover.
 481  *   The indexes start from 0. They must be in order.
 482  * @ip[] Vector of @nr indexes of the parity blocks to use in the recovering.
 483  *   The indexes start from 0. They must be in order.
 484  * @nd Number of data blocks.
 485  * @np Number of parity blocks.
 486  * @size Size of the blocks pointed by @v. It must be a multipler of 64.
 487  * @v Vector of pointers to the blocks of data and parity.
 488  *   It has (@nd + @np) elements. The starting elements are the blocks
 489  *   for data, following with the parity blocks.
 490  *   Each block has @size bytes.
 491  */
 492 void (*raid_rec_ptr[RAID_PARITY_MAX])(
 493         int nr, int *id, int *ip, int nd, size_t size, void **vv);
 494
 495 void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
 496 {
 497         int nrd; /* number of data blocks to recover */
 498         int nrp; /* number of parity blocks to recover */
 499
 500         /* enforce limit on size */
 501         BUG_ON(size % 64 != 0);
 502
 503         /* enforce limit on number of failures */
 504         BUG_ON(nr > np);
 505         BUG_ON(np > RAID_PARITY_MAX);
 506
 507         /* enforce order in index vector */
 508         BUG_ON(nr >= 2 && ir[0] >= ir[1]);
 509         BUG_ON(nr >= 3 && ir[1] >= ir[2]);
 510         BUG_ON(nr >= 4 && ir[2] >= ir[3]);
 511         BUG_ON(nr >= 5 && ir[3] >= ir[4]);
 512         BUG_ON(nr >= 6 && ir[4] >= ir[5]);
 513
 514         /* enforce limit on index vector */
 515         BUG_ON(nr > 0 && ir[nr-1] >= nd + np);
 516
 517         /* count the number of data blocks to recover */
 518         nrd = 0;
 519         while (nrd < nr && ir[nrd] < nd)
 520                 ++nrd;
 521
 522         /* all the remaining are parity */
 523         nrp = nr - nrd;
 524
 525         /* enforce limit on number of failures */
 526         BUG_ON(nrd > nd);
 527         BUG_ON(nrp > np);
 528
 529         /* if failed data is present */
 530         if (nrd != 0) {
 531                 int ip[RAID_PARITY_MAX];
 532                 int i, j, k;
 533
 534                 /* setup the vector of parities to use */
 535                 for (i = 0, j = 0, k = 0; i < np; ++i) {
 536                         if (j < nrp && ir[nrd + j] == nd + i) {
 537                                 /* this parity has to be recovered */
 538                                 ++j;
 539                         } else {
 540                                 /* this parity is used for recovering */
 541                                 ip[k] = i;
 542                                 ++k;
 543                         }
 544                 }
 545
 546                 /* recover the nrd data blocks specified in ir[], */
 547                 /* using the first nrd parity in ip[] for recovering */
 548                 raid_rec_ptr[nrd - 1](nrd, ir, ip, nd, size, v);
 549         }
 550
 551         /* recompute all the parities up to the last bad one */
 552         if (nrp != 0)
 553                 raid_gen(nd, ir[nr - 1] - nd + 1, size, v);
 554 }
 555
 556 void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v)
 557 {
 558         /* enforce limit on size */
 559         BUG_ON(size % 64 != 0);
 560
 561         /* enforce limit on number of failures */
 562         BUG_ON(nr > nd);
 563         BUG_ON(nr > RAID_PARITY_MAX);
 564
 565         /* enforce order in index vector for data */
 566         BUG_ON(nr >= 2 && id[0] >= id[1]);
 567         BUG_ON(nr >= 3 && id[1] >= id[2]);
 568         BUG_ON(nr >= 4 && id[2] >= id[3]);
 569         BUG_ON(nr >= 5 && id[3] >= id[4]);
 570         BUG_ON(nr >= 6 && id[4] >= id[5]);
 571
 572         /* enforce limit on index vector for data */
 573         BUG_ON(nr > 0 && id[nr-1] >= nd);
 574
 575         /* enforce order in index vector for parity */
 576         BUG_ON(nr >= 2 && ip[0] >= ip[1]);
 577         BUG_ON(nr >= 3 && ip[1] >= ip[2]);
 578         BUG_ON(nr >= 4 && ip[2] >= ip[3]);
 579         BUG_ON(nr >= 5 && ip[3] >= ip[4]);
 580         BUG_ON(nr >= 6 && ip[4] >= ip[5]);
 581
 582         /* if failed data is present */
 583         if (nr != 0)
 584                 raid_rec_ptr[nr - 1](nr, id, ip, nd, size, v);
 585 }
 586