git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/migrate.c

   1 /*
   2  * Code for moving data off a device.
   3  */
   4
   5 #include "bcachefs.h"
   6 #include "btree_update.h"
   7 #include "buckets.h"
   8 #include "extents.h"
   9 #include "io.h"
  10 #include "journal.h"
  11 #include "keylist.h"
  12 #include "migrate.h"
  13 #include "move.h"
  14 #include "super-io.h"
  15
  16 static int issue_migration_move(struct bch_dev *ca,
  17                                 struct moving_context *ctxt,
  18                                 struct bch_devs_mask *devs,
  19                                 struct bkey_s_c k)
  20 {
  21         struct bch_fs *c = ca->fs;
  22         struct disk_reservation res;
  23         const struct bch_extent_ptr *ptr;
  24         int ret;
  25
  26         if (bch2_disk_reservation_get(c, &res, k.k->size, 0))
  27                 return -ENOSPC;
  28
  29         extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
  30                 if (ptr->dev == ca->dev_idx)
  31                         goto found;
  32
  33         BUG();
  34 found:
  35         /* XXX: we need to be doing something with the disk reservation */
  36
  37         ret = bch2_data_move(c, ctxt, devs, k, ptr);
  38         if (ret)
  39                 bch2_disk_reservation_put(c, &res);
  40         return ret;
  41 }
  42
  43 #define MAX_DATA_OFF_ITER       10
  44
  45 /*
  46  * This moves only the data off, leaving the meta-data (if any) in place.
  47  * It walks the key space, and for any key with a valid pointer to the
  48  * relevant device, it copies it elsewhere, updating the key to point to
  49  * the copy.
   50  * The meta-data is moved off by bch2_move_metadata_off_device().
   51  *
   52  * Note: If the number of data replicas desired is > 1, ideally any
   53  * new copy would not be placed on a device that already holds a
   54  * copy (if there are enough devices).
   55  * This is _not_ currently implemented.  The multiple replicas can
   56  * land on the same device even if there are others available.
  57  */
  58
  59 int bch2_move_data_off_device(struct bch_dev *ca)
  60 {
  61         struct moving_context ctxt;
  62         struct bch_fs *c = ca->fs;
  63         unsigned pass = 0;
  64         u64 seen_key_count;
  65         int ret = 0;
  66
  67         BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
  68
  69         if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
  70                 return 0;
  71
  72         mutex_lock(&c->replicas_gc_lock);
  73         bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
  74
  75         bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
  76         __set_bit(ca->dev_idx, ctxt.avoid.d);
  77
  78         /*
  79          * In theory, only one pass should be necessary as we've
  80          * quiesced all writes before calling this.
  81          *
  82          * However, in practice, more than one pass may be necessary:
  83          * - Some move fails due to an error. We can can find this out
  84          *   from the moving_context.
  85          * - Some key swap failed because some of the pointers in the
  86          *   key in the tree changed due to caching behavior, btree gc
  87          *   pruning stale pointers, or tiering (if the device being
  88          *   removed is in tier 0).  A smarter bkey_cmpxchg would
  89          *   handle these cases.
  90          *
  91          * Thus this scans the tree one more time than strictly necessary,
  92          * but that can be viewed as a verification pass.
  93          */
  94
  95         do {
  96                 struct btree_iter iter;
  97                 struct bkey_s_c k;
  98
  99                 seen_key_count = 0;
 100                 atomic_set(&ctxt.error_count, 0);
 101                 atomic_set(&ctxt.error_flags, 0);
 102
 103                 bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
 104                                      BTREE_ITER_PREFETCH);
 105
 106                 while (!bch2_move_ctxt_wait(&ctxt) &&
 107                        (k = bch2_btree_iter_peek(&iter)).k &&
 108                        !(ret = btree_iter_err(k))) {
 109                         if (!bkey_extent_is_data(k.k) ||
 110                             !bch2_extent_has_device(bkey_s_c_to_extent(k),
 111                                                    ca->dev_idx))
 112                                 goto next;
 113
 114                         ret = issue_migration_move(ca, &ctxt, NULL, k);
 115                         if (ret == -ENOMEM) {
 116                                 bch2_btree_iter_unlock(&iter);
 117
 118                                 /*
 119                                  * memory allocation failure, wait for some IO
 120                                  * to finish
 121                                  */
 122                                 bch2_move_ctxt_wait_for_io(&ctxt);
 123                                 continue;
 124                         }
 125                         if (ret == -ENOSPC)
 126                                 break;
 127                         BUG_ON(ret);
 128
 129                         seen_key_count++;
 130                         continue;
 131 next:
 132                         if (bkey_extent_is_data(k.k)) {
 133                                 ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
 134                                                             BCH_DATA_USER);
 135                                 if (ret)
 136                                         break;
 137                         }
 138                         bch2_btree_iter_advance_pos(&iter);
 139                         bch2_btree_iter_cond_resched(&iter);
 140
 141                 }
 142                 bch2_btree_iter_unlock(&iter);
 143                 bch2_move_ctxt_exit(&ctxt);
 144
 145                 if (ret)
 146                         goto err;
 147         } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
 148
 149         if (seen_key_count) {
 150                 pr_err("Unable to migrate all data in %d iterations.",
 151                        MAX_DATA_OFF_ITER);
 152                 ret = -1;
 153                 goto err;
 154         }
 155
 156 err:
 157         bch2_replicas_gc_end(c, ret);
 158         mutex_unlock(&c->replicas_gc_lock);
 159         return ret;
 160 }
 161
 162 /*
 163  * This walks the btree, and for any node on the relevant device it moves the
 164  * node elsewhere.
 165  */
 166 static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
 167                                enum btree_id id)
 168 {
 169         struct btree_iter iter;
 170         struct closure cl;
 171         struct btree *b;
 172         int ret;
 173
 174         BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
 175
 176         closure_init_stack(&cl);
 177
 178         for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
 179                 struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
 180
 181                 if (!bch2_extent_has_device(e, ca->dev_idx))
 182                         continue;
 183
 184                 ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
 185                 if (ret) {
 186                         bch2_btree_iter_unlock(&iter);
 187                         return ret;
 188                 }
 189
 190                 bch2_btree_iter_set_locks_want(&iter, 0);
 191         }
 192         ret = bch2_btree_iter_unlock(&iter);
 193         if (ret)
 194                 return ret; /* btree IO error */
 195
 196         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
 197                 for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
 198                         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
 199
 200                         BUG_ON(bch2_extent_has_device(e, ca->dev_idx));
 201                 }
 202                 bch2_btree_iter_unlock(&iter);
 203         }
 204
 205         return 0;
 206 }
 207
 208 /*
 209  * This moves only the meta-data off, leaving the data (if any) in place.
  210  * The data is moved off by bch2_move_data_off_device(), if desired,
  211  * which is then called first.
 212  *
 213  * Before calling this, allocation of buckets to the device must have
 214  * been disabled, as else we'll continue to write meta-data to the device
 215  * when new buckets are picked for meta-data writes.
 216  * In addition, the copying gc and allocator threads for the device
 217  * must have been stopped.  The allocator thread is the only thread
 218  * that writes prio/gen information.
 219  *
 220  * Meta-data consists of:
 221  * - Btree nodes
 222  * - Prio/gen information
 223  * - Journal entries
 224  * - Superblock
 225  *
 226  * This has to move the btree nodes and the journal only:
 227  * - prio/gen information is not written once the allocator thread is stopped.
 228  *   also, as the prio/gen information is per-device it is not moved.
 229  * - the superblock will be written by the caller once after everything
 230  *   is stopped.
 231  *
 232  * Note that currently there is no way to stop btree node and journal
 233  * meta-data writes to a device without moving the meta-data because
 234  * once a bucket is open for a btree node, unless a replacement btree
 235  * node is allocated (and the tree updated), the bucket will continue
 236  * to be written with updates.  Similarly for the journal (it gets
 237  * written until filled).
 238  *
 239  * This routine leaves the data (if any) in place.  Whether the data
 240  * should be moved off is a decision independent of whether the meta
 241  * data should be moved off and stopped:
 242  *
 243  * - For device removal, both data and meta-data are moved off, in
 244  *   that order.
 245  *
 246  * - However, for turning a device read-only without removing it, only
 247  *   meta-data is moved off since that's the only way to prevent it
 248  *   from being written.  Data is left in the device, but no new data
 249  *   is written.
 250  */
 251
 252 int bch2_move_metadata_off_device(struct bch_dev *ca)
 253 {
 254         struct bch_fs *c = ca->fs;
 255         unsigned i;
 256         int ret = 0;
 257
 258         BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
 259
 260         if (!(bch2_dev_has_data(c, ca) &
 261               ((1 << BCH_DATA_JOURNAL)|
 262                (1 << BCH_DATA_BTREE))))
 263                 return 0;
 264
 265         mutex_lock(&c->replicas_gc_lock);
 266         bch2_replicas_gc_start(c,
 267                                (1 << BCH_DATA_JOURNAL)|
 268                                (1 << BCH_DATA_BTREE));
 269
 270         /* 1st, Move the btree nodes off the device */
 271
 272         for (i = 0; i < BTREE_ID_NR; i++) {
 273                 ret = bch2_move_btree_off(c, ca, i);
 274                 if (ret)
 275                         goto err;
 276         }
 277
 278         /* There are no prios/gens to move -- they are already in the device. */
 279
 280         /* 2nd. Move the journal off the device */
 281
 282         ret = bch2_journal_move(ca);
 283         if (ret)
 284                 goto err;
 285
 286 err:
 287         bch2_replicas_gc_end(c, ret);
 288         mutex_unlock(&c->replicas_gc_lock);
 289         return ret;
 290 }
 291
 292 /*
 293  * Flagging data bad when forcibly removing a device after failing to
 294  * migrate the data off the device.
 295  */
 296
 297 static int bch2_flag_key_bad(struct btree_iter *iter,
 298                             struct bch_dev *ca,
 299                             struct bkey_s_c_extent orig)
 300 {
 301         BKEY_PADDED(key) tmp;
 302         struct bkey_s_extent e;
 303         struct bch_extent_ptr *ptr;
 304         struct bch_fs *c = ca->fs;
 305
 306         bkey_reassemble(&tmp.key, orig.s_c);
 307         e = bkey_i_to_s_extent(&tmp.key);
 308
 309         extent_for_each_ptr_backwards(e, ptr)
 310                 if (ptr->dev == ca->dev_idx)
 311                         bch2_extent_drop_ptr(e, ptr);
 312
 313         /*
 314          * If the new extent no longer has any pointers, bch2_extent_normalize()
 315          * will do the appropriate thing with it (turning it into a
 316          * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
 317          */
 318         bch2_extent_normalize(c, e.s);
 319
 320         return bch2_btree_insert_at(c, NULL, NULL, NULL,
 321                                    BTREE_INSERT_ATOMIC,
 322                                    BTREE_INSERT_ENTRY(iter, &tmp.key));
 323 }
 324
 325 /*
  326  * This doesn't actually move any data -- for every key that contains a
  327  * pointer to a device that is being forcibly removed, it drops the
  328  * pointers to that device.  If the key has other valid pointers they are
  329  * kept; if none remain, bch2_extent_normalize() turns the key into an
  330  * error key (or a discard if it was a cached extent).
  331  *
  332  * This is only called if bch2_move_data_off_device() above failed, meaning
  333  * that we've already tried to move the data MAX_DATA_OFF_ITER times and
  334  * are not likely to succeed if we try again.
 335  */
 336 int bch2_flag_data_bad(struct bch_dev *ca)
 337 {
 338         struct bch_fs *c = ca->fs;
 339         struct bkey_s_c k;
 340         struct bkey_s_c_extent e;
 341         struct btree_iter iter;
 342         int ret = 0;
 343
 344         mutex_lock(&c->replicas_gc_lock);
 345         bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
 346
 347         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
 348                              POS_MIN, BTREE_ITER_PREFETCH);
 349
 350         while ((k = bch2_btree_iter_peek(&iter)).k &&
 351                !(ret = btree_iter_err(k))) {
 352                 if (!bkey_extent_is_data(k.k))
 353                         goto advance;
 354
 355                 e = bkey_s_c_to_extent(k);
 356                 if (!bch2_extent_has_device(e, ca->dev_idx))
 357                         goto advance;
 358
 359                 ret = bch2_flag_key_bad(&iter, ca, e);
 360
 361                 /*
 362                  * don't want to leave ret == -EINTR, since if we raced and
 363                  * something else overwrote the key we could spuriously return
 364                  * -EINTR below:
 365                  */
 366                 if (ret == -EINTR)
 367                         ret = 0;
 368                 if (ret)
 369                         break;
 370
 371                 /*
 372                  * If the replica we're dropping was dirty and there is an
 373                  * additional cached replica, the cached replica will now be
 374                  * considered dirty - upon inserting the new version of the key,
 375                  * the bucket accounting will be updated to reflect the fact
 376                  * that the cached data is now dirty and everything works out as
 377                  * if by magic without us having to do anything.
 378                  *
 379                  * The one thing we need to be concerned with here is there's a
 380                  * race between when we drop any stale pointers from the key
 381                  * we're about to insert, and when the key actually gets
 382                  * inserted and the cached data is marked as dirty - we could
 383                  * end up trying to insert a key with a pointer that should be
 384                  * dirty, but points to stale data.
 385                  *
 386                  * If that happens the insert code just bails out and doesn't do
 387                  * the insert - however, it doesn't return an error. Hence we
 388                  * need to always recheck the current key before advancing to
 389                  * the next:
 390                  */
 391                 continue;
 392 advance:
 393                 if (bkey_extent_is_data(k.k)) {
 394                         ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
 395                                                     BCH_DATA_USER);
 396                         if (ret)
 397                                 break;
 398                 }
 399                 bch2_btree_iter_advance_pos(&iter);
 400         }
 401
 402         bch2_btree_iter_unlock(&iter);
 403
 404         bch2_replicas_gc_end(c, ret);
 405         mutex_unlock(&c->replicas_gc_lock);
 406
 407         return ret;
 408 }