#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"

#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>

#define DEF_BTREE_ID(kwd, val, name) name,

const char * const bch2_btree_ids[] = {
	DEFINE_BCH_BTREE_IDS()
	NULL
};

#undef DEF_BTREE_ID

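/*
 * Recalculate the number of cached nodes held in reserve: a base of 16, plus
 * 8 more for each btree whose root sits above the leaf level:
 */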
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->level) * 8;

	c->btree_cache_reserve = reserve;
}

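/* Number of cached nodes the shrinker may free once the reserve is met: */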
#define mca_can_free(c)						\
	max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)

static void __mca_data_free(struct bch_fs *c, struct btree *b)
{
	EBUG_ON(btree_node_write_in_flight(b));

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
	bch2_btree_keys_free(b);
}

static void mca_data_free(struct bch_fs *c, struct btree *b)
{
	__mca_data_free(c, b);
	c->btree_cache_used--;
	list_move(&b->list, &c->btree_cache_freed);
}

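/*
 * Nodes are hashed by the first bch_extent_ptr in their key - the same value
 * PTR_HASH() looks up, and the value that gets zeroed to mark a node as
 * unhashed:
 */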
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, key.v),
	.key_len	= sizeof(struct bch_extent_ptr),
};

static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		goto err;

	if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
		goto err;

	c->btree_cache_used++;
	list_move(&b->list, &c->btree_cache_freeable);
	return;
err:
	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
	list_move(&b->list, &c->btree_cache_freed);
}

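/*
 * Allocate both the struct btree and its data; if the data allocation fails
 * the node is parked on btree_cache_freed (with b->data NULL) rather than
 * freed outright:
 */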
static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_extent_init(&b->key);
	six_lock_init(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);

	mca_data_alloc(c, b, gfp);
	return b->data ? b : NULL;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
{
	rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
			       bch_btree_cache_params);

	/* Cause future lookups for this node to fail: */
	bkey_i_to_extent(&b->key)->v._data[0] = 0;
}

int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b)
{
	return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->level	= level;
	b->btree_id	= id;

	mutex_lock(&c->btree_cache_lock);
	ret = __bch2_btree_node_hash_insert(c, b);
	if (!ret)
		list_add(&b->list, &c->btree_cache);
	mutex_unlock(&c->btree_cache_lock);

	return ret;
}

static inline struct btree *mca_find(struct bch_fs *c,
				     const struct bkey_i *k)
{
	return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
				      bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	int ret = 0;

	lockdep_assert_held(&c->btree_cache_lock);

	if (!six_trylock_intent(&b->lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->lock))
		goto out_unlock_intent;

	if (btree_node_noevict(b))
		goto out_unlock;

	if (!btree_node_may_write(b))
		goto out_unlock;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b) ||
	    btree_node_read_in_flight(b)) {
		if (!flush)
			goto out_unlock;

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (verify_btree_ondisk(c))
			bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}
out:
	if (PTR_HASH(&b->key))
		trace_btree_node_reap(c, b, ret);
	return ret;
out_unlock:
	six_unlock_write(&b->lock);
out_unlock_intent:
	six_unlock_intent(&b->lock);
	ret = -ENOMEM;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

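/*
 * Shrinker callbacks. The shrinker works in units of btree node pages:
 * sc->nr_to_scan is scaled down to whole nodes, and the count of freed nodes
 * is scaled back up by btree_pages(c) on return:
 */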
static unsigned long bch2_mca_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache_shrink);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i;

	if (btree_shrinker_disabled(c))
		return SHRINK_STOP;

	if (c->btree_cache_alloc_lock)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->btree_cache_lock);
	else if (!mutex_trylock(&c->btree_cache_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= btree_pages(c);
	can_free = mca_can_free(c);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		touched++;

		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !btree_node_reclaim(c, b)) {
			mca_data_free(c, b);
			six_unlock_write(&b->lock);
			six_unlock_intent(&b->lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &c->btree_cache, list) {
		touched++;

		if (freed >= nr) {
			/* Save position */
			if (&t->list != &c->btree_cache)
				list_move_tail(&c->btree_cache, &t->list);
			break;
		}

		if (!btree_node_accessed(b) &&
		    !btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under btree_cache_lock */
			freed++;
			if (&t->list != &c->btree_cache)
				list_move_tail(&c->btree_cache, &t->list);

			mca_data_free(c, b);
			mutex_unlock(&c->btree_cache_lock);

			bch2_btree_node_hash_remove(c, b);
			six_unlock_write(&b->lock);
			six_unlock_intent(&b->lock);

			if (freed >= nr)
				goto out;

			if (sc->gfp_mask & __GFP_IO)
				mutex_lock(&c->btree_cache_lock);
			else if (!mutex_trylock(&c->btree_cache_lock))
				goto out;
			goto restart;
		} else
			clear_btree_node_accessed(b);
	}

	mutex_unlock(&c->btree_cache_lock);
out:
	return (unsigned long) freed * btree_pages(c);
}

static unsigned long bch2_mca_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache_shrink);

	if (btree_shrinker_disabled(c))
		return 0;

	if (c->btree_cache_alloc_lock)
		return 0;

	return mca_can_free(c) * btree_pages(c);
}

void bch2_fs_btree_exit(struct bch_fs *c)
{
	struct btree *b;
	unsigned i;

	if (c->btree_cache_shrink.list.next)
		unregister_shrinker(&c->btree_cache_shrink);

	mutex_lock(&c->btree_cache_lock);

#ifdef CONFIG_BCACHEFS_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	kvpfree(c->verify_ondisk, btree_bytes(c));
#endif

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			list_add(&c->btree_roots[i].b->list, &c->btree_cache);

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty(b);

		mca_data_free(c, b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		kfree(b);
	}

	mutex_unlock(&c->btree_cache_lock);

	if (c->btree_cache_table_init_done)
		rhashtable_destroy(&c->btree_cache_table);
}

int bch2_fs_btree_init(struct bch_fs *c)
{
	unsigned i;
	int ret;

	ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
	if (ret)
		return ret;

	c->btree_cache_table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < c->btree_cache_reserve; i++)
		if (!mca_bucket_alloc(c, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHEFS_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
	if (!c->verify_ondisk)
		return -ENOMEM;

	c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
	if (!c->verify_data)
		return -ENOMEM;

	list_del_init(&c->verify_data->list);
#endif

	c->btree_cache_shrink.count_objects = bch2_mca_count;
	c->btree_cache_shrink.scan_objects = bch2_mca_scan;
	c->btree_cache_shrink.seeks = 4;
	c->btree_cache_shrink.batch = btree_pages(c) * 2;
	register_shrinker(&c->btree_cache_shrink);

	return 0;
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_node_cannibalize_unlock(struct bch_fs *c)
{
	if (c->btree_cache_alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		c->btree_cache_alloc_lock = NULL;
		closure_wake_up(&c->mca_wait);
	}
}

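/*
 * Take the open coded cannibalize lock described above: try the cmpxchg once,
 * and if it fails and a closure was supplied, go on the mca_wait waitlist and
 * try once more before giving up:
 */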
int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct task_struct *old;

	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&c->mca_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&c->mca_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}

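/*
 * Steal a cached node: first look for one that can be reclaimed without
 * flushing, then fall back to writing out dirty nodes and reclaiming those:
 */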
static struct btree *mca_cannibalize(struct bch_fs *c)
{
	struct btree *b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &c->btree_cache, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

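/*
 * Allocate memory for a new btree node - checking the freeable and freed
 * lists first, then the allocator, and finally (if this thread holds the
 * cannibalize lock) evicting another cached node:
 */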
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b;
	u64 start_time = local_clock();

	mutex_lock(&c->btree_cache_lock);

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!btree_node_reclaim(c, b))
			goto out_unlock;

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!btree_node_reclaim(c, b)) {
			mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
			if (b->data)
				goto out_unlock;

			six_unlock_write(&b->lock);
			six_unlock_intent(&b->lock);
			goto err;
		}

	b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!six_trylock_intent(&b->lock));
	BUG_ON(!six_trylock_write(&b->lock));
out_unlock:
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_write_in_flight(b));

	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache_lock);
out:
	b->flags	= 0;
	b->written	= 0;
	b->nsets	= 0;
	b->sib_u64s[0]	= 0;
	b->sib_u64s[1]	= 0;
	b->whiteout_u64s = 0;
	b->uncompacted_whiteout_u64s = 0;
	bch2_btree_keys_init(b, &c->expensive_debug_checks);

	bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);

	return b;
err:
	/* Try to cannibalize another cached btree node: */
	if (c->btree_cache_alloc_lock == current) {
		b = mca_cannibalize(c);
		list_del_init(&b->list);
		mutex_unlock(&c->btree_cache_lock);

		bch2_btree_node_hash_remove(c, b);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&c->btree_cache_lock);
	return ERR_PTR(-ENOMEM);
}

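/*
 * Returns NULL if we raced with another thread filling the same node, in
 * which case the node is now in the cache and the caller should retry the
 * lookup:
 */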
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
						   struct btree_iter *iter,
						   const struct bkey_i *k,
						   unsigned level,
						   enum six_lock_type lock_type)
{
	struct btree *b;

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	BUG_ON(!btree_node_locked(iter, level + 1));

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		bkey_i_to_extent(&b->key)->v._data[0] = 0;

		mutex_lock(&c->btree_cache_lock);
		list_add(&b->list, &c->btree_cache_freeable);
		mutex_unlock(&c->btree_cache_lock);

		six_unlock_write(&b->lock);
		six_unlock_intent(&b->lock);
		return NULL;
	}

	/*
	 * If the btree node wasn't cached, we can't drop our lock on
	 * the parent until after it's added to the cache - because
	 * otherwise we could race with a btree_split() freeing the node
	 * we're trying to lock.
	 *
	 * But the deadlock described below doesn't exist in this case,
	 * so it's safe to not drop the parent lock until here:
	 */
	if (btree_node_read_locked(iter, level + 1))
		btree_node_unlock(iter, level + 1);

	bch2_btree_node_read(c, b, true);
	six_unlock_write(&b->lock);

	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->lock);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * @lock_type.
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type)
{
	struct btree *b;
	struct bset_tree *t;

	BUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	rcu_read_lock();
	b = mca_find(c, k);
	rcu_read_unlock();

	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(c, iter, k, level, lock_type);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it:
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and PTR_HASH() is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(iter, level + 1))
			btree_node_unlock(iter, level + 1);

		if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
			return ERR_PTR(-EINTR);

		if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
			     b->level != level ||
			     race_fault())) {
			six_unlock_type(&b->lock, lock_type);
			if (bch2_btree_node_relock(iter, level + 1))
				goto retry;

			return ERR_PTR(-EINTR);
		}
	}

	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(!b->written);
	EBUG_ON(b->btree_id != iter->btree_id ||
		BTREE_NODE_LEVEL(b->data) != level ||
		bkey_cmp(b->data->max_key, k->k.p));

	return b;
}

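/*
 * Get the sibling of @b in the given direction, via the parent node's
 * iterator; returns -EINTR if the required locks couldn't be taken without
 * dropping others:
 */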
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
					  struct btree_iter *iter,
					  struct btree *b,
					  enum btree_node_sibling sib)
{
	struct btree *parent;
	struct btree_node_iter node_iter;
	struct bkey_packed *k;
	BKEY_PADDED(k) tmp;
	struct btree *ret;
	unsigned level = b->level;

	parent = iter->nodes[level + 1];
	if (!parent)
		return NULL;

	if (!bch2_btree_node_relock(iter, level + 1)) {
		bch2_btree_iter_set_locks_want(iter, level + 2);
		return ERR_PTR(-EINTR);
	}

	node_iter = iter->node_iters[parent->level];

	k = bch2_btree_node_iter_peek_all(&node_iter, parent);
	BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));

	do {
		k = sib == btree_prev_sib
			? bch2_btree_node_iter_prev_all(&node_iter, parent)
			: (bch2_btree_node_iter_advance(&node_iter, parent),
			   bch2_btree_node_iter_peek_all(&node_iter, parent));
		if (!k)
			return NULL;
	} while (bkey_deleted(k));

	bch2_bkey_unpack(parent, &tmp.k, k);

	ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);

	if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
		btree_node_unlock(iter, level);
		ret = bch2_btree_node_get(c, iter, &tmp.k, level,
					  SIX_LOCK_intent);
	}

	if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) {
		six_unlock_intent(&ret->lock);
		ret = ERR_PTR(-EINTR);
	}

	return ret;
}

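/*
 * Warm the cache: allocate and hash a node for @k and kick off the read, but
 * drop both locks without waiting for the read to complete:
 */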
void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
			      unsigned level, enum btree_id btree_id)
{
	struct btree *b;

	BUG_ON(level >= BTREE_MAX_DEPTH);

	rcu_read_lock();
	b = mca_find(c, k);
	rcu_read_unlock();

	if (b)
		return;

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(c, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		bkey_i_to_extent(&b->key)->v._data[0] = 0;

		mutex_lock(&c->btree_cache_lock);
		list_add(&b->list, &c->btree_cache_freeable);
		mutex_unlock(&c->btree_cache_lock);
		goto out;
	}

	bch2_btree_node_read(c, b, false);
out:
	six_unlock_write(&b->lock);
	six_unlock_intent(&b->lock);
}

int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
			  char *buf, size_t len)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;
	char ptrs[100];

	memset(&stats, 0, sizeof(stats));

	bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
			 bkey_i_to_s_c(&b->key));
	bch2_btree_keys_stats(b, &stats);

	return scnprintf(buf, len,
			 "l %u %llu:%llu - %llu:%llu:\n"
			 "    ptrs: %s\n"
			 "    format: u64s %u fields %u %u %u %u %u\n"
			 "    unpack fn len: %u\n"
			 "    bytes used %zu/%zu (%zu%% full)\n"
			 "    sib u64s: %u, %u (merge threshold %zu)\n"
			 "    nr packed keys %u\n"
			 "    nr unpacked keys %u\n"
			 "    floats %zu\n"
			 "    failed unpacked %zu\n"
			 "    failed prev %zu\n"
			 "    failed overflow %zu\n",
			 b->level,
			 b->data->min_key.inode,
			 b->data->min_key.offset,
			 b->data->max_key.inode,
			 b->data->max_key.offset,
			 ptrs,
			 f->key_u64s,
			 f->bits_per_field[0],
			 f->bits_per_field[1],
			 f->bits_per_field[2],
			 f->bits_per_field[3],
			 f->bits_per_field[4],
			 b->unpack_fn_len,
			 b->nr.live_u64s * sizeof(u64),
			 btree_bytes(c) - sizeof(struct btree_node),
			 b->nr.live_u64s * 100 / btree_max_u64s(c),
			 b->sib_u64s[0],
			 b->sib_u64s[1],
			 BTREE_FOREGROUND_MERGE_THRESHOLD(c),
			 b->nr.packed_keys,
			 b->nr.unpacked_keys,
			 stats.floats,
			 stats.failed_unpacked,
			 stats.failed_prev,
			 stats.failed_overflow);
}