1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "replicas.h"
16
17 #include <linux/prefetch.h>
18 #include <trace/events/bcachefs.h>
19
20 static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
21 static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long);
22 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *);
23 static void btree_iter_copy(struct btree_iter *, struct btree_iter *);
24
25 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
26 {
27         EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
28
29         /* Are we iterating over keys in all snapshots? */
30         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
31                 p = bpos_successor(p);
32         } else {
33                 p = bpos_nosnap_successor(p);
34                 p.snapshot = iter->snapshot;
35         }
36
37         return p;
38 }
39
40 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
41 {
42         EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
43
44         /* Are we iterating over keys in all snapshots? */
45         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
46                 p = bpos_predecessor(p);
47         } else {
48                 p = bpos_nosnap_predecessor(p);
49                 p.snapshot = iter->snapshot;
50         }
51
52         return p;
53 }
54
55 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
56 {
57         return l < BTREE_MAX_DEPTH &&
58                 (unsigned long) iter->l[l].b >= 128;
59 }
60
61 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
62 {
63         struct bpos pos = iter->pos;
64
65         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
66             bkey_cmp(pos, POS_MAX))
67                 pos = bkey_successor(iter, pos);
68         return pos;
69 }
70
71 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
72                                               struct btree *b)
73 {
74         return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
75 }
76
77 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
78                                              struct btree *b)
79 {
80         return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
81 }
82
83 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
84                                           struct btree *b)
85 {
86         return iter->btree_id == b->c.btree_id &&
87                 !btree_iter_pos_before_node(iter, b) &&
88                 !btree_iter_pos_after_node(iter, b);
89 }
90
91 /* Btree node locking: */
92
93 void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
94 {
95         bch2_btree_node_unlock_write_inlined(b, iter);
96 }
97
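/*
 * Take a write lock on a node we already have intent locked. Read locks
 * held on this node by other iterators in the transaction are subtracted
 * from the lock's reader count while we wait for the write lock, and
 * added back afterwards:
 */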
98 void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
99 {
100         struct btree_iter *linked;
101         unsigned readers = 0;
102
103         EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
104
105         trans_for_each_iter(iter->trans, linked)
106                 if (linked->l[b->c.level].b == b &&
107                     btree_node_read_locked(linked, b->c.level))
108                         readers++;
109
110         /*
111          * Must drop our read locks before calling six_lock_write() -
112          * six_unlock() won't do wakeups until the reader count
113          * goes to 0, and it's safe because we have the node intent
114          * locked:
115          */
116         atomic64_sub(__SIX_VAL(read_lock, readers),
117                      &b->c.lock.state.counter);
118         btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
119         atomic64_add(__SIX_VAL(read_lock, readers),
120                      &b->c.lock.state.counter);
121 }
122
123 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
124 {
125         struct btree *b = btree_iter_node(iter, level);
126         int want = __btree_lock_want(iter, level);
127
128         if (!is_btree_node(iter, level))
129                 return false;
130
131         if (race_fault())
132                 return false;
133
134         if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
135             (btree_node_lock_seq_matches(iter, b, level) &&
136              btree_node_lock_increment(iter->trans, b, level, want))) {
137                 mark_btree_node_locked(iter, level, want);
138                 return true;
139         } else {
140                 return false;
141         }
142 }
143
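/*
 * Try to get an intent lock on a node the iterator points to: either by
 * upgrading a read lock we already hold, relocking as intent if the lock
 * sequence number still matches, or taking a reference on an intent lock
 * already held by another iterator in the transaction:
 */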
144 static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
145 {
146         struct btree *b = iter->l[level].b;
147
148         EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
149
150         if (!is_btree_node(iter, level))
151                 return false;
152
153         if (btree_node_intent_locked(iter, level))
154                 return true;
155
156         if (race_fault())
157                 return false;
158
159         if (btree_node_locked(iter, level)
160             ? six_lock_tryupgrade(&b->c.lock)
161             : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
162                 goto success;
163
164         if (btree_node_lock_seq_matches(iter, b, level) &&
165             btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
166                 btree_node_unlock(iter, level);
167                 goto success;
168         }
169
170         return false;
171 success:
172         mark_btree_node_intent_locked(iter, level);
173         return true;
174 }
175
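/*
 * Relock (or, if @upgrade is set, upgrade to intent locks) the nodes from
 * iter->level up to iter->locks_want. On failure, the offending node and
 * everything below it is unlocked, forcing the next traversal to walk
 * back up to that node:
 */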
176 static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade,
177                                         unsigned long trace_ip)
178 {
179         unsigned l = iter->level;
180         int fail_idx = -1;
181
182         do {
183                 if (!btree_iter_node(iter, l))
184                         break;
185
186                 if (!(upgrade
187                       ? bch2_btree_node_upgrade(iter, l)
188                       : bch2_btree_node_relock(iter, l))) {
189                         (upgrade
190                          ? trace_node_upgrade_fail
191                          : trace_node_relock_fail)(iter->trans->ip, trace_ip,
192                                         btree_iter_type(iter) == BTREE_ITER_CACHED,
193                                         iter->btree_id, &iter->real_pos,
194                                         l, iter->l[l].lock_seq,
195                                         is_btree_node(iter, l)
196                                         ? 0
197                                         : (unsigned long) iter->l[l].b,
198                                         is_btree_node(iter, l)
199                                         ? iter->l[l].b->c.lock.state.seq
200                                         : 0);
201                         fail_idx = l;
202                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
203                 }
204
205                 l++;
206         } while (l < iter->locks_want);
207
208         /*
209          * When we fail to get a lock, we have to ensure that any child nodes
210          * can't be relocked so that bch2_btree_iter_traverse() has to walk back up to
211          * the node that we failed to relock:
212          */
213         while (fail_idx >= 0) {
214                 btree_node_unlock(iter, fail_idx);
215                 iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
216                 --fail_idx;
217         }
218
219         if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
220                 iter->uptodate = BTREE_ITER_NEED_PEEK;
221
222         bch2_btree_trans_verify_locks(iter->trans);
223
224         return iter->uptodate < BTREE_ITER_NEED_RELOCK;
225 }
226
227 static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
228                                   enum btree_iter_type type)
229 {
230         return  type != BTREE_ITER_CACHED
231                 ? container_of(_b, struct btree, c)->key.k.p
232                 : container_of(_b, struct bkey_cached, c)->key.pos;
233 }
234
235 /* Slowpath: */
236 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
237                             unsigned level, struct btree_iter *iter,
238                             enum six_lock_type type,
239                             six_lock_should_sleep_fn should_sleep_fn, void *p,
240                             unsigned long ip)
241 {
242         struct btree_trans *trans = iter->trans;
243         struct btree_iter *linked, *deadlock_iter = NULL;
244         u64 start_time = local_clock();
245         unsigned reason = 9;
246         bool ret;
247
248         /* Check if it's safe to block: */
249         trans_for_each_iter(trans, linked) {
250                 if (!linked->nodes_locked)
251                         continue;
252
253                 /*
254                  * Can't block taking an intent lock if we have _any_ nodes read
255                  * locked:
256                  *
257                  * - Our read lock blocks another thread with an intent lock on
258                  *   the same node from getting a write lock, and thus from
259                  *   dropping its intent lock
260                  *
261                  * - And the other thread may have multiple nodes intent locked:
262                  *   both the node we want to intent lock, and the node we
263                  *   already have read locked - deadlock:
264                  */
265                 if (type == SIX_LOCK_intent &&
266                     linked->nodes_locked != linked->nodes_intent_locked) {
267                         deadlock_iter = linked;
268                         reason = 1;
269                 }
270
271                 if (linked->btree_id != iter->btree_id) {
272                         if (linked->btree_id > iter->btree_id) {
273                                 deadlock_iter = linked;
274                                 reason = 3;
275                         }
276                         continue;
277                 }
278
279                 /*
280          * Within the same btree, cached iterators come before
281          * non-cached iterators:
282                  */
283                 if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
284                         if (btree_iter_is_cached(iter)) {
285                                 deadlock_iter = linked;
286                                 reason = 4;
287                         }
288                         continue;
289                 }
290
291                 /*
292                  * Interior nodes must be locked before their descendants: if
293          * another iterator holds locks on possible descendants of the node
294                  * we're about to lock, it must have the ancestors locked too:
295                  */
296                 if (level > __fls(linked->nodes_locked)) {
297                         deadlock_iter = linked;
298                         reason = 5;
299                 }
300
301                 /* Must lock btree nodes in key order: */
302                 if (btree_node_locked(linked, level) &&
303                     bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
304                                                  btree_iter_type(linked))) <= 0) {
305                         deadlock_iter = linked;
306                         reason = 7;
307                         BUG_ON(trans->in_traverse_all);
308                 }
309         }
310
311         if (unlikely(deadlock_iter)) {
312                 trace_trans_restart_would_deadlock(trans->ip, ip,
313                                 trans->in_traverse_all, reason,
314                                 deadlock_iter->btree_id,
315                                 btree_iter_type(deadlock_iter),
316                                 &deadlock_iter->real_pos,
317                                 iter->btree_id,
318                                 btree_iter_type(iter),
319                                 &pos);
320                 btree_trans_restart(trans);
321                 return false;
322         }
323
324         if (six_trylock_type(&b->c.lock, type))
325                 return true;
326
327 #ifdef CONFIG_BCACHEFS_DEBUG
328         trans->locking_iter_idx = iter->idx;
329         trans->locking_pos      = pos;
330         trans->locking_btree_id = iter->btree_id;
331         trans->locking_level    = level;
332         trans->locking          = b;
333 #endif
334
335         ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
336
337 #ifdef CONFIG_BCACHEFS_DEBUG
338         trans->locking = NULL;
339 #endif
340         if (ret)
341                 bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
342                                        start_time);
343         return ret;
344 }
345
346 /* Btree iterator locking: */
347
348 #ifdef CONFIG_BCACHEFS_DEBUG
349 static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
350 {
351         unsigned l;
352
353         if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
354                 BUG_ON(iter->nodes_locked);
355                 return;
356         }
357
358         for (l = 0; btree_iter_node(iter, l); l++) {
359                 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
360                     !btree_node_locked(iter, l))
361                         continue;
362
363                 BUG_ON(btree_lock_want(iter, l) !=
364                        btree_node_locked_type(iter, l));
365         }
366 }
367
368 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
369 {
370         struct btree_iter *iter;
371
372         trans_for_each_iter(trans, iter)
373                 bch2_btree_iter_verify_locks(iter);
374 }
375 #else
376 static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
377 #endif
378
379 /*
380  * Only for btree_cache.c - only relocks intent locks
381  */
382 bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
383 {
384         unsigned l;
385
386         for (l = iter->level;
387              l < iter->locks_want && btree_iter_node(iter, l);
388              l++) {
389                 if (!bch2_btree_node_relock(iter, l)) {
390                         trace_node_relock_fail(iter->trans->ip, _RET_IP_,
391                                         btree_iter_type(iter) == BTREE_ITER_CACHED,
392                                         iter->btree_id, &iter->real_pos,
393                                         l, iter->l[l].lock_seq,
394                                         is_btree_node(iter, l)
395                                         ? 0
396                                         : (unsigned long) iter->l[l].b,
397                                         is_btree_node(iter, l)
398                                         ? iter->l[l].b->c.lock.state.seq
399                                         : 0);
400                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
401                         btree_trans_restart(iter->trans);
402                         return false;
403                 }
404         }
405
406         return true;
407 }
408
409 __flatten
410 bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
411 {
412         bool ret = btree_iter_get_locks(iter, false, trace_ip);
413
414         if (!ret)
415                 btree_trans_restart(iter->trans);
416         return ret;
417 }
418
419 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
420                                unsigned new_locks_want)
421 {
422         struct btree_iter *linked;
423
424         EBUG_ON(iter->locks_want >= new_locks_want);
425
426         iter->locks_want = new_locks_want;
427
428         if (btree_iter_get_locks(iter, true, _THIS_IP_))
429                 return true;
430
431         /*
432          * XXX: this is ugly - we'd prefer to not be mucking with other
433          * iterators in the btree_trans here.
434          *
435          * On failure to upgrade the iterator, setting iter->locks_want and
436          * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
437          * get the locks we want on transaction restart.
438          *
439          * But if this iterator was a clone, on transaction restart what we did
440          * to this iterator isn't going to be preserved.
441          *
442          * Possibly we could add an iterator field for the parent iterator when
443          * an iterator is a copy - for now, we'll just upgrade any other
444          * iterators with the same btree id.
445          *
446          * The code below used to be needed to ensure ancestor nodes get locked
447          * before interior nodes - now that's handled by
448          * bch2_btree_iter_traverse_all().
449          */
450         trans_for_each_iter(iter->trans, linked)
451                 if (linked != iter &&
452                     btree_iter_type(linked) == btree_iter_type(iter) &&
453                     linked->btree_id == iter->btree_id &&
454                     linked->locks_want < new_locks_want) {
455                         linked->locks_want = new_locks_want;
456                         btree_iter_get_locks(linked, true, _THIS_IP_);
457                 }
458
459         if (iter->should_be_locked)
460                 btree_trans_restart(iter->trans);
461         return false;
462 }
463
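/*
 * Drop locks on levels we no longer want locked (>= new_locks_want): locks
 * above the iterator's current level are dropped entirely, while at the
 * current level an intent lock is downgraded to a read lock:
 */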
464 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
465                                  unsigned new_locks_want)
466 {
467         unsigned l;
468
469         EBUG_ON(iter->locks_want < new_locks_want);
470
471         iter->locks_want = new_locks_want;
472
473         while (iter->nodes_locked &&
474                (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
475                 if (l > iter->level) {
476                         btree_node_unlock(iter, l);
477                 } else {
478                         if (btree_node_intent_locked(iter, l)) {
479                                 six_lock_downgrade(&iter->l[l].b->c.lock);
480                                 iter->nodes_intent_locked ^= 1 << l;
481                         }
482                         break;
483                 }
484         }
485
486         bch2_btree_trans_verify_locks(iter->trans);
487 }
488
489 void bch2_trans_downgrade(struct btree_trans *trans)
490 {
491         struct btree_iter *iter;
492
493         trans_for_each_iter(trans, iter)
494                 bch2_btree_iter_downgrade(iter);
495 }
496
497 /* Btree transaction locking: */
498
499 static inline bool btree_iter_should_be_locked(struct btree_iter *iter)
500 {
501         return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
502                 iter->should_be_locked;
503 }
504
505 bool bch2_trans_relock(struct btree_trans *trans)
506 {
507         struct btree_iter *iter;
508
509         if (unlikely(trans->restarted))
510                 return false;
511
512         trans_for_each_iter(trans, iter)
513                 if (btree_iter_should_be_locked(iter) &&
514                     !bch2_btree_iter_relock(iter, _RET_IP_)) {
515                         trace_trans_restart_relock(trans->ip, _RET_IP_,
516                                         iter->btree_id, &iter->real_pos);
517                         BUG_ON(!trans->restarted);
518                         return false;
519                 }
520         return true;
521 }
522
523 void bch2_trans_unlock(struct btree_trans *trans)
524 {
525         struct btree_iter *iter;
526
527         trans_for_each_iter(trans, iter)
528                 __bch2_btree_iter_unlock(iter);
529
530         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
531 }
532
533 /* Btree iterator: */
534
535 #ifdef CONFIG_BCACHEFS_DEBUG
536
537 static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
538 {
539         struct bkey_cached *ck;
540         bool locked = btree_node_locked(iter, 0);
541
542         if (!bch2_btree_node_relock(iter, 0))
543                 return;
544
545         ck = (void *) iter->l[0].b;
546         BUG_ON(ck->key.btree_id != iter->btree_id ||
547                bkey_cmp(ck->key.pos, iter->pos));
548
549         if (!locked)
550                 btree_node_unlock(iter, 0);
551 }
552
553 static void bch2_btree_iter_verify_level(struct btree_iter *iter,
554                                          unsigned level)
555 {
556         struct btree_iter_level *l;
557         struct btree_node_iter tmp;
558         bool locked;
559         struct bkey_packed *p, *k;
560         char buf1[100], buf2[100], buf3[100];
561         const char *msg;
562
563         if (!bch2_debug_check_iterators)
564                 return;
565
566         l       = &iter->l[level];
567         tmp     = l->iter;
568         locked  = btree_node_locked(iter, level);
569
570         if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
571                 if (!level)
572                         bch2_btree_iter_verify_cached(iter);
573                 return;
574         }
575
576         BUG_ON(iter->level < iter->min_depth);
577
578         if (!btree_iter_node(iter, level))
579                 return;
580
581         if (!bch2_btree_node_relock(iter, level))
582                 return;
583
584         BUG_ON(!btree_iter_pos_in_node(iter, l->b));
585
586         /*
587          * node iterators don't use leaf node iterator:
588          */
589         if (btree_iter_type(iter) == BTREE_ITER_NODES &&
590             level <= iter->min_depth)
591                 goto unlock;
592
593         bch2_btree_node_iter_verify(&l->iter, l->b);
594
595         /*
596          * For interior nodes, the iterator will have skipped past
597          * deleted keys:
598          *
599          * For extents, the iterator may have skipped past deleted keys (but not
600          * whiteouts)
601          */
602         p = level || btree_node_type_is_extents(iter->btree_id)
603                 ? bch2_btree_node_iter_prev(&tmp, l->b)
604                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
605         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
606
607         if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
608                 msg = "before";
609                 goto err;
610         }
611
612         if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
613                 msg = "after";
614                 goto err;
615         }
616 unlock:
617         if (!locked)
618                 btree_node_unlock(iter, level);
619         return;
620 err:
621         strcpy(buf2, "(none)");
622         strcpy(buf3, "(none)");
623
624         bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
625
626         if (p) {
627                 struct bkey uk = bkey_unpack_key(l->b, p);
628                 bch2_bkey_to_text(&PBUF(buf2), &uk);
629         }
630
631         if (k) {
632                 struct bkey uk = bkey_unpack_key(l->b, k);
633                 bch2_bkey_to_text(&PBUF(buf3), &uk);
634         }
635
636         panic("iterator should be %s key at level %u:\n"
637               "iter pos %s\n"
638               "prev key %s\n"
639               "cur  key %s\n",
640               msg, level, buf1, buf2, buf3);
641 }
642
643 static void bch2_btree_iter_verify(struct btree_iter *iter)
644 {
645         struct btree_trans *trans = iter->trans;
646         struct bch_fs *c = trans->c;
647         enum btree_iter_type type = btree_iter_type(iter);
648         unsigned i;
649
650         EBUG_ON(iter->btree_id >= BTREE_ID_NR);
651
652         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
653                iter->pos.snapshot != iter->snapshot);
654
655         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
656                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
657
658         BUG_ON(type == BTREE_ITER_NODES &&
659                !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
660
661         BUG_ON(type != BTREE_ITER_NODES &&
662                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
663                !btree_type_has_snapshots(iter->btree_id));
664
665         for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) {
666                 if (!iter->l[i].b) {
667                         BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
668                         break;
669                 }
670
671                 bch2_btree_iter_verify_level(iter, i);
672         }
673
674         bch2_btree_iter_verify_locks(iter);
675 }
676
677 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
678 {
679         enum btree_iter_type type = btree_iter_type(iter);
680
681         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
682                iter->pos.snapshot != iter->snapshot);
683
684         BUG_ON((type == BTREE_ITER_KEYS ||
685                 type == BTREE_ITER_CACHED) &&
686                (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
687                 bkey_cmp(iter->pos, iter->k.p) > 0));
688 }
689
690 void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
691 {
692         struct btree_iter *iter;
693
694         if (!bch2_debug_check_iterators)
695                 return;
696
697         trans_for_each_iter_with_node(trans, b, iter)
698                 bch2_btree_iter_verify_level(iter, b->c.level);
699 }
700
701 #else
702
703 static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
704 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
705 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
706
707 #endif
708
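/*
 * Point the node iterator's entry for bset @t at key @k, adding an entry
 * for the bset if the iterator doesn't already have one:
 */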
709 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
710                                         struct btree *b,
711                                         struct bset_tree *t,
712                                         struct bkey_packed *k)
713 {
714         struct btree_node_iter_set *set;
715
716         btree_node_iter_for_each(iter, set)
717                 if (set->end == t->end_offset) {
718                         set->k = __btree_node_key_to_offset(b, k);
719                         bch2_btree_node_iter_sort(iter, b);
720                         return;
721                 }
722
723         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
724 }
725
726 static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
727                                                struct btree *b,
728                                                struct bkey_packed *where)
729 {
730         struct btree_iter_level *l = &iter->l[b->c.level];
731
732         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
733                 return;
734
735         if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
736                 bch2_btree_node_iter_advance(&l->iter, l->b);
737
738         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
739 }
740
741 void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
742                                       struct btree *b,
743                                       struct bkey_packed *where)
744 {
745         struct btree_iter *linked;
746
747         trans_for_each_iter_with_node(iter->trans, b, linked) {
748                 __bch2_btree_iter_fix_key_modified(linked, b, where);
749                 bch2_btree_iter_verify_level(linked, b->c.level);
750         }
751 }
752
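/*
 * A key in bset @t was changed at @where (@clobber_u64s u64s overwritten
 * by @new_u64s u64s): fix up @node_iter's offsets so that it still points
 * at the same position in the node:
 */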
753 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
754                                       struct btree *b,
755                                       struct btree_node_iter *node_iter,
756                                       struct bset_tree *t,
757                                       struct bkey_packed *where,
758                                       unsigned clobber_u64s,
759                                       unsigned new_u64s)
760 {
761         const struct bkey_packed *end = btree_bkey_last(b, t);
762         struct btree_node_iter_set *set;
763         unsigned offset = __btree_node_key_to_offset(b, where);
764         int shift = new_u64s - clobber_u64s;
765         unsigned old_end = t->end_offset - shift;
766         unsigned orig_iter_pos = node_iter->data[0].k;
767         bool iter_current_key_modified =
768                 orig_iter_pos >= offset &&
769                 orig_iter_pos <= offset + clobber_u64s;
770
771         btree_node_iter_for_each(node_iter, set)
772                 if (set->end == old_end)
773                         goto found;
774
775         /* didn't find the bset in the iterator - might have to re-add it: */
776         if (new_u64s &&
777             bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
778                 bch2_btree_node_iter_push(node_iter, b, where, end);
779                 goto fixup_done;
780         } else {
781                 /* Iterator is after key that changed */
782                 return;
783         }
784 found:
785         set->end = t->end_offset;
786
787         /* Iterator hasn't gotten to the key that changed yet: */
788         if (set->k < offset)
789                 return;
790
791         if (new_u64s &&
792             bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
793                 set->k = offset;
794         } else if (set->k < offset + clobber_u64s) {
795                 set->k = offset + new_u64s;
796                 if (set->k == set->end)
797                         bch2_btree_node_iter_set_drop(node_iter, set);
798         } else {
799                 /* Iterator is after key that changed */
800                 set->k = (int) set->k + shift;
801                 return;
802         }
803
804         bch2_btree_node_iter_sort(node_iter, b);
805 fixup_done:
806         if (node_iter->data[0].k != orig_iter_pos)
807                 iter_current_key_modified = true;
808
809         /*
810          * When a new key is added, and the node iterator now points to that
811          * key, the iterator might have skipped past deleted keys that should
812          * come after the key the iterator now points to. We have to rewind to
813          * before those deleted keys - otherwise
814          * bch2_btree_node_iter_prev_all() breaks:
815          */
816         if (!bch2_btree_node_iter_end(node_iter) &&
817             iter_current_key_modified &&
818             (b->c.level ||
819              btree_node_type_is_extents(iter->btree_id))) {
820                 struct bset_tree *t;
821                 struct bkey_packed *k, *k2, *p;
822
823                 k = bch2_btree_node_iter_peek_all(node_iter, b);
824
825                 for_each_bset(b, t) {
826                         bool set_pos = false;
827
828                         if (node_iter->data[0].end == t->end_offset)
829                                 continue;
830
831                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
832
833                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
834                                bkey_iter_cmp(b, k, p) < 0) {
835                                 k2 = p;
836                                 set_pos = true;
837                         }
838
839                         if (set_pos)
840                                 btree_node_iter_set_set_pos(node_iter,
841                                                             b, t, k2);
842                 }
843         }
844
845         if (!b->c.level &&
846             node_iter == &iter->l[0].iter &&
847             iter_current_key_modified)
848                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
849 }
850
851 void bch2_btree_node_iter_fix(struct btree_iter *iter,
852                               struct btree *b,
853                               struct btree_node_iter *node_iter,
854                               struct bkey_packed *where,
855                               unsigned clobber_u64s,
856                               unsigned new_u64s)
857 {
858         struct bset_tree *t = bch2_bkey_to_bset(b, where);
859         struct btree_iter *linked;
860
861         if (node_iter != &iter->l[b->c.level].iter) {
862                 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
863                                            where, clobber_u64s, new_u64s);
864
865                 if (bch2_debug_check_iterators)
866                         bch2_btree_node_iter_verify(node_iter, b);
867         }
868
869         trans_for_each_iter_with_node(iter->trans, b, linked) {
870                 __bch2_btree_node_iter_fix(linked, b,
871                                            &linked->l[b->c.level].iter, t,
872                                            where, clobber_u64s, new_u64s);
873                 bch2_btree_iter_verify_level(linked, b->c.level);
874         }
875 }
876
877 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
878                                                   struct btree_iter_level *l,
879                                                   struct bkey *u,
880                                                   struct bkey_packed *k)
881 {
882         struct bkey_s_c ret;
883
884         if (unlikely(!k)) {
885                 /*
886                  * signal to bch2_btree_iter_peek_slot() that we're currently at
887                  * a hole
888                  */
889                 u->type = KEY_TYPE_deleted;
890                 return bkey_s_c_null;
891         }
892
893         ret = bkey_disassemble(l->b, k, u);
894
895         /*
896          * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
897          * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
898          * being overwritten but doesn't change k->size. But this is ok, because
899          * those keys are never written out, we just have to avoid a spurious
900          * assertion here:
901          */
902         if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
903                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
904
905         return ret;
906 }
907
908 /* peek_all() doesn't skip deleted keys */
909 static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
910                                                         struct btree_iter_level *l)
911 {
912         return __btree_iter_unpack(iter, l, &iter->k,
913                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
914 }
915
916 static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
917                                                     struct btree_iter_level *l)
918 {
919         struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
920                         bch2_btree_node_iter_peek(&l->iter, l->b));
921
922         iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
923         return k;
924 }
925
926 static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
927                                                     struct btree_iter_level *l)
928 {
929         struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
930                         bch2_btree_node_iter_prev(&l->iter, l->b));
931
932         iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
933         return k;
934 }
935
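/*
 * Advance @l's node iterator until it points at a key >= iter->real_pos,
 * advancing at most @max_advance keys (if positive); returns false if
 * that limit was hit:
 */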
936 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
937                                              struct btree_iter_level *l,
938                                              int max_advance)
939 {
940         struct bkey_packed *k;
941         int nr_advanced = 0;
942
943         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
944                bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
945                 if (max_advance > 0 && nr_advanced >= max_advance)
946                         return false;
947
948                 bch2_btree_node_iter_advance(&l->iter, l->b);
949                 nr_advanced++;
950         }
951
952         return true;
953 }
954
955 /*
956  * Verify that iterator for parent node points to child node:
957  */
958 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
959 {
960         struct btree_iter_level *l;
961         unsigned plevel;
962         bool parent_locked;
963         struct bkey_packed *k;
964
965         if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
966                 return;
967
968         plevel = b->c.level + 1;
969         if (!btree_iter_node(iter, plevel))
970                 return;
971
972         parent_locked = btree_node_locked(iter, plevel);
973
974         if (!bch2_btree_node_relock(iter, plevel))
975                 return;
976
977         l = &iter->l[plevel];
978         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
979         if (!k ||
980             bkey_deleted(k) ||
981             bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
982                 char buf1[100];
983                 char buf2[100];
984                 char buf3[100];
985                 char buf4[100];
986                 struct bkey uk = bkey_unpack_key(b, k);
987
988                 bch2_dump_btree_node(iter->trans->c, l->b);
989                 bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
990                 bch2_bkey_to_text(&PBUF(buf2), &uk);
991                 bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
992                 bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
993                 panic("parent iter doesn't point to new node:\n"
994                       "iter pos %s %s\n"
995                       "iter key %s\n"
996                       "new node %s-%s\n",
997                       bch2_btree_ids[iter->btree_id], buf1,
998                       buf2, buf3, buf4);
999         }
1000
1001         if (!parent_locked)
1002                 btree_node_unlock(iter, b->c.level + 1);
1003 }
1004
1005 static inline void __btree_iter_init(struct btree_iter *iter,
1006                                      unsigned level)
1007 {
1008         struct btree_iter_level *l = &iter->l[level];
1009
1010         bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
1011
1012         /*
1013          * Iterators to interior nodes should always be pointed at the first
1014          * non-whiteout:
1015          */
1016         if (level)
1017                 bch2_btree_node_iter_peek(&l->iter, l->b);
1018
1019         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1020 }
1021
1022 static inline void btree_iter_node_set(struct btree_iter *iter,
1023                                        struct btree *b)
1024 {
1025         BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
1026
1027         btree_iter_verify_new_node(iter, b);
1028
1029         EBUG_ON(!btree_iter_pos_in_node(iter, b));
1030         EBUG_ON(b->c.lock.state.seq & 1);
1031
1032         iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
1033         iter->l[b->c.level].b = b;
1034         __btree_iter_init(iter, b->c.level);
1035 }
1036
1037 /*
1038  * A btree node is being replaced - update the iterator to point to the new
1039  * node:
1040  */
1041 void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
1042 {
1043         enum btree_node_locked_type t;
1044         struct btree_iter *linked;
1045
1046         trans_for_each_iter(iter->trans, linked)
1047                 if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
1048                     btree_iter_pos_in_node(linked, b)) {
1049                         /*
1050                          * bch2_btree_iter_node_drop() has already been called -
1051                          * the old node we're replacing has already been
1052                          * unlocked and the pointer invalidated
1053                          */
1054                         BUG_ON(btree_node_locked(linked, b->c.level));
1055
1056                         t = btree_lock_want(linked, b->c.level);
1057                         if (t != BTREE_NODE_UNLOCKED) {
1058                                 six_lock_increment(&b->c.lock, t);
1059                                 mark_btree_node_locked(linked, b->c.level, t);
1060                         }
1061
1062                         btree_iter_node_set(linked, b);
1063                 }
1064 }
1065
1066 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
1067 {
1068         struct btree_iter *linked;
1069         unsigned level = b->c.level;
1070
1071         trans_for_each_iter(iter->trans, linked)
1072                 if (linked->l[level].b == b) {
1073                         btree_node_unlock(linked, level);
1074                         linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
1075                 }
1076 }
1077
1078 /*
1079  * A btree node has been modified in such a way as to invalidate iterators - fix
1080  * them:
1081  */
1082 void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
1083 {
1084         struct btree_iter *linked;
1085
1086         trans_for_each_iter_with_node(iter->trans, b, linked)
1087                 __btree_iter_init(linked, b->c.level);
1088 }
1089
1090 static int lock_root_check_fn(struct six_lock *lock, void *p)
1091 {
1092         struct btree *b = container_of(lock, struct btree, c.lock);
1093         struct btree **rootp = p;
1094
1095         return b == *rootp ? 0 : -1;
1096 }
1097
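/*
 * Lock the root node of the btree: the root can be freed and replaced
 * while we're waiting for the lock, so after taking it we check that it's
 * still the current root and retry if not:
 */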
1098 static inline int btree_iter_lock_root(struct btree_trans *trans,
1099                                        struct btree_iter *iter,
1100                                        unsigned depth_want,
1101                                        unsigned long trace_ip)
1102 {
1103         struct bch_fs *c = trans->c;
1104         struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
1105         enum six_lock_type lock_type;
1106         unsigned i;
1107
1108         EBUG_ON(iter->nodes_locked);
1109
1110         while (1) {
1111                 b = READ_ONCE(*rootp);
1112                 iter->level = READ_ONCE(b->c.level);
1113
1114                 if (unlikely(iter->level < depth_want)) {
1115                         /*
1116                          * the root is at a lower depth than the depth we want:
1117                          * got to the end of the btree, or we're walking nodes
1118                          * greater than some depth and there are no nodes >=
1119                          * that depth
1120                          */
1121                         iter->level = depth_want;
1122                         for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
1123                                 iter->l[i].b = NULL;
1124                         return 1;
1125                 }
1126
1127                 lock_type = __btree_lock_want(iter, iter->level);
1128                 if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level,
1129                                               iter, lock_type,
1130                                               lock_root_check_fn, rootp,
1131                                               trace_ip))) {
1132                         if (trans->restarted)
1133                                 return -EINTR;
1134                         continue;
1135                 }
1136
1137                 if (likely(b == READ_ONCE(*rootp) &&
1138                            b->c.level == iter->level &&
1139                            !race_fault())) {
1140                         for (i = 0; i < iter->level; i++)
1141                                 iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
1142                         iter->l[iter->level].b = b;
1143                         for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
1144                                 iter->l[i].b = NULL;
1145
1146                         mark_btree_node_locked(iter, iter->level, lock_type);
1147                         btree_iter_node_set(iter, b);
1148                         return 0;
1149                 }
1150
1151                 six_unlock_type(&b->c.lock, lock_type);
1152         }
1153 }
1154
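/*
 * Start reads for the next few children of the node at iter->level, so
 * that walking forward through the btree doesn't have to wait on I/O for
 * every child node:
 */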
1155 noinline
1156 static int btree_iter_prefetch(struct btree_iter *iter)
1157 {
1158         struct bch_fs *c = iter->trans->c;
1159         struct btree_iter_level *l = &iter->l[iter->level];
1160         struct btree_node_iter node_iter = l->iter;
1161         struct bkey_packed *k;
1162         struct bkey_buf tmp;
1163         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
1164                 ? (iter->level > 1 ? 0 :  2)
1165                 : (iter->level > 1 ? 1 : 16);
1166         bool was_locked = btree_node_locked(iter, iter->level);
1167         int ret = 0;
1168
1169         bch2_bkey_buf_init(&tmp);
1170
1171         while (nr && !ret) {
1172                 if (!bch2_btree_node_relock(iter, iter->level))
1173                         break;
1174
1175                 bch2_btree_node_iter_advance(&node_iter, l->b);
1176                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
1177                 if (!k)
1178                         break;
1179
1180                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
1181                 ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
1182                                                iter->level - 1);
1183         }
1184
1185         if (!was_locked)
1186                 btree_node_unlock(iter, iter->level);
1187
1188         bch2_bkey_buf_exit(&tmp, c);
1189         return ret;
1190 }
1191
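/*
 * Stash a pointer to the in-memory child node in the parent's
 * btree_ptr_v2 key, so that a later lookup of this child can potentially
 * skip the btree node cache hash table:
 */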
1192 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
1193                                             unsigned plevel, struct btree *b)
1194 {
1195         struct btree_iter_level *l = &iter->l[plevel];
1196         bool locked = btree_node_locked(iter, plevel);
1197         struct bkey_packed *k;
1198         struct bch_btree_ptr_v2 *bp;
1199
1200         if (!bch2_btree_node_relock(iter, plevel))
1201                 return;
1202
1203         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1204         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
1205
1206         bp = (void *) bkeyp_val(&l->b->format, k);
1207         bp->mem_ptr = (unsigned long)b;
1208
1209         if (!locked)
1210                 btree_node_unlock(iter, plevel);
1211 }
1212
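/*
 * Descend one level: unpack the key the parent iterator currently points
 * at, get and lock the child node it points to, and make that the
 * iterator's current node:
 */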
1213 static __always_inline int btree_iter_down(struct btree_trans *trans,
1214                                            struct btree_iter *iter,
1215                                            unsigned long trace_ip)
1216 {
1217         struct bch_fs *c = trans->c;
1218         struct btree_iter_level *l = &iter->l[iter->level];
1219         struct btree *b;
1220         unsigned level = iter->level - 1;
1221         enum six_lock_type lock_type = __btree_lock_want(iter, level);
1222         struct bkey_buf tmp;
1223         int ret;
1224
1225         EBUG_ON(!btree_node_locked(iter, iter->level));
1226
1227         bch2_bkey_buf_init(&tmp);
1228         bch2_bkey_buf_unpack(&tmp, c, l->b,
1229                          bch2_btree_node_iter_peek(&l->iter, l->b));
1230
1231         b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip);
1232         ret = PTR_ERR_OR_ZERO(b);
1233         if (unlikely(ret))
1234                 goto err;
1235
1236         mark_btree_node_locked(iter, level, lock_type);
1237         btree_iter_node_set(iter, b);
1238
1239         if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
1240             unlikely(b != btree_node_mem_ptr(tmp.k)))
1241                 btree_node_mem_ptr_set(iter, level + 1, b);
1242
1243         if (iter->flags & BTREE_ITER_PREFETCH)
1244                 ret = btree_iter_prefetch(iter);
1245
1246         if (btree_node_read_locked(iter, level + 1))
1247                 btree_node_unlock(iter, level + 1);
1248         iter->level = level;
1249
1250         bch2_btree_iter_verify_locks(iter);
1251 err:
1252         bch2_bkey_buf_exit(&tmp, c);
1253         return ret;
1254 }
1255
1256 static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
1257
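/*
 * Traverse all of the transaction's iterators from scratch, in the
 * correct lock order, after dropping all locks:
 */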
1258 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
1259                                      unsigned long trace_ip)
1260 {
1261         struct bch_fs *c = trans->c;
1262         struct btree_iter *iter;
1263         u8 sorted[BTREE_ITER_MAX];
1264         int i, nr_sorted = 0;
1265
1266         if (trans->in_traverse_all)
1267                 return -EINTR;
1268
1269         trans->in_traverse_all = true;
1270 retry_all:
1271         trans->restarted = false;
1272
1273         nr_sorted = 0;
1274
1275         trans_for_each_iter(trans, iter) {
1276                 sorted[nr_sorted++] = iter->idx;
1277                 iter->should_be_locked = false;
1278         }
1279
1280 #define btree_iter_cmp_by_idx(_l, _r)                           \
1281                 btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
1282
1283         bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
1284 #undef btree_iter_cmp_by_idx
1285
1286         for (i = nr_sorted - 2; i >= 0; --i) {
1287                 struct btree_iter *iter1 = trans->iters + sorted[i];
1288                 struct btree_iter *iter2 = trans->iters + sorted[i + 1];
1289
1290                 if (iter1->btree_id == iter2->btree_id &&
1291                     iter1->locks_want < iter2->locks_want)
1292                         __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
1293                 else if (!iter1->locks_want && iter2->locks_want)
1294                         __bch2_btree_iter_upgrade(iter1, 1);
1295         }
1296
1297         bch2_trans_unlock(trans);
1298         cond_resched();
1299
1300         if (unlikely(ret == -ENOMEM)) {
1301                 struct closure cl;
1302
1303                 closure_init_stack(&cl);
1304
1305                 do {
1306                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1307                         closure_sync(&cl);
1308                 } while (ret);
1309         }
1310
1311         if (unlikely(ret == -EIO)) {
1312                 trans->error = true;
1313                 goto out;
1314         }
1315
1316         BUG_ON(ret && ret != -EINTR);
1317
1318         /* Now, redo traversals in correct order: */
1319         for (i = 0; i < nr_sorted; i++) {
1320                 unsigned idx = sorted[i];
1321
1322                 /*
1323                  * successfully traversing one iterator can cause another to be
1324                  * unlinked, in btree_key_cache_fill()
1325                  */
1326                 if (!(trans->iters_linked & (1ULL << idx)))
1327                         continue;
1328
1329                 ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
1330                 if (ret)
1331                         goto retry_all;
1332         }
1333 out:
1334         bch2_btree_cache_cannibalize_unlock(c);
1335
1336         trans->in_traverse_all = false;
1337
1338         trace_trans_traverse_all(trans->ip, trace_ip);
1339         return ret;
1340 }
1341
1342 static int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1343 {
1344         return __btree_iter_traverse_all(trans, 0, _RET_IP_);
1345 }
1346
1347 static inline bool btree_iter_good_node(struct btree_iter *iter,
1348                                         unsigned l, int check_pos)
1349 {
1350         if (!is_btree_node(iter, l) ||
1351             !bch2_btree_node_relock(iter, l))
1352                 return false;
1353
1354         if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
1355                 return false;
1356         if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
1357                 return false;
1358         return true;
1359 }
1360
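/*
 * Walk up from iter->level until we find a node that can be relocked and
 * that (depending on @check_pos) still covers the search position;
 * returns the resulting level:
 */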
1361 static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
1362                                                      int check_pos)
1363 {
1364         unsigned l = iter->level;
1365
1366         while (btree_iter_node(iter, l) &&
1367                !btree_iter_good_node(iter, l, check_pos)) {
1368                 btree_node_unlock(iter, l);
1369                 iter->l[l].b = BTREE_ITER_NO_NODE_UP;
1370                 l++;
1371         }
1372
1373         return l;
1374 }
1375
1376 /*
1377  * This is the main state machine for walking down the btree - walks down to a
1378  * specified depth
1379  *
1380  * Returns 0 on success, -EIO on error (error reading in a btree node).
1381  *
1382  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1383  * stashed in the iterator and returned from bch2_trans_exit().
1384  */
1385 static int btree_iter_traverse_one(struct btree_iter *iter,
1386                                    unsigned long trace_ip)
1387 {
1388         struct btree_trans *trans = iter->trans;
1389         unsigned l, depth_want = iter->level;
1390         int ret = 0;
1391
1392         /*
1393          * Ensure we obey iter->should_be_locked: if it's set, we can't unlock
1394          * and re-traverse the iterator without a transaction restart:
1395          */
1396         if (iter->should_be_locked) {
1397                 ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR;
1398                 goto out;
1399         }
1400
1401         if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
1402                 ret = bch2_btree_iter_traverse_cached(iter);
1403                 goto out;
1404         }
1405
1406         if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1407                 goto out;
1408
1409         iter->level = btree_iter_up_until_good_node(iter, 0);
1410
1411         /* If we need intent locks, take them too: */
1412         for (l = iter->level + 1;
1413              l < iter->locks_want && btree_iter_node(iter, l);
1414              l++)
1415                 if (!bch2_btree_node_relock(iter, l))
1416                         while (iter->level <= l) {
1417                                 btree_node_unlock(iter, iter->level);
1418                                 iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
1419                                 iter->level++;
1420                         }
1421
1422         /*
1423          * Note: iter->nodes[iter->level] may be temporarily NULL here - that
1424          * would indicate to other code that we got to the end of the btree,
1425          * here it indicates that relocking the root failed - it's critical that
1426          * btree_iter_lock_root() comes next and that it can't fail
1427          */
1428         while (iter->level > depth_want) {
1429                 ret = btree_iter_node(iter, iter->level)
1430                         ? btree_iter_down(trans, iter, trace_ip)
1431                         : btree_iter_lock_root(trans, iter, depth_want, trace_ip);
1432                 if (unlikely(ret)) {
1433                         if (ret == 1) {
1434                                 /*
1435                                  * Got to the end of the btree (in
1436                                  * BTREE_ITER_NODES mode)
1437                                  */
1438                                 ret = 0;
1439                                 goto out;
1440                         }
1441
1442                         __bch2_btree_iter_unlock(iter);
1443                         iter->level = depth_want;
1444
1445                         if (ret == -EIO) {
1446                                 iter->flags |= BTREE_ITER_ERROR;
1447                                 iter->l[iter->level].b =
1448                                         BTREE_ITER_NO_NODE_ERROR;
1449                         } else {
1450                                 iter->l[iter->level].b =
1451                                         BTREE_ITER_NO_NODE_DOWN;
1452                         }
1453                         goto out;
1454                 }
1455         }
1456
1457         iter->uptodate = BTREE_ITER_NEED_PEEK;
1458 out:
1459         BUG_ON((ret == -EINTR) != !!trans->restarted);
1460         trace_iter_traverse(trans->ip, trace_ip,
1461                             btree_iter_type(iter) == BTREE_ITER_CACHED,
1462                             iter->btree_id, &iter->real_pos, ret);
1463         bch2_btree_iter_verify(iter);
1464         return ret;
1465 }
1466
1467 static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1468 {
1469         struct btree_trans *trans = iter->trans;
1470         int ret;
1471
1472         ret =   bch2_trans_cond_resched(trans) ?:
1473                 btree_iter_traverse_one(iter, _RET_IP_);
1474         if (unlikely(ret) && hweight64(trans->iters_linked) == 1) {
1475                 ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
1476                 BUG_ON(ret == -EINTR);
1477         }
1478
1479         return ret;
1480 }
1481
1482 /*
1483  * Note:
1484  * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
1485  * for internal btree iterator users
1486  *
1487  * bch2_btree_iter_traverse() sets iter->real_pos to iter->pos,
1488  * btree_iter_traverse() does not:
1489  */
1490 static inline int __must_check
1491 btree_iter_traverse(struct btree_iter *iter)
1492 {
1493         return iter->uptodate >= BTREE_ITER_NEED_RELOCK
1494                 ? __bch2_btree_iter_traverse(iter)
1495                 : 0;
1496 }
1497
1498 int __must_check
1499 bch2_btree_iter_traverse(struct btree_iter *iter)
1500 {
1501         int ret;
1502
1503         btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
1504
1505         ret = btree_iter_traverse(iter);
1506         if (ret)
1507                 return ret;
1508
1509         iter->should_be_locked = true;
1510         return 0;
1511 }
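
/*
 * Example usage (sketch, not taken from this file): positioning an iterator
 * without peeking a key, e.g. before an update that only needs the leaf node
 * locked. An -EINTR return means the transaction was restarted and the caller
 * must retry the whole transaction; pos is a caller-supplied position:
 *
 *      bch2_btree_iter_set_pos(iter, pos);
 *
 *      ret = bch2_btree_iter_traverse(iter);
 *      if (ret)
 *              return ret;
 */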
1512
1513 /* Iterate across nodes (leaf and interior nodes) */
1514
1515 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1516 {
1517         struct btree *b;
1518         int ret;
1519
1520         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1521         bch2_btree_iter_verify(iter);
1522
1523         ret = btree_iter_traverse(iter);
1524         if (ret)
1525                 return NULL;
1526
1527         b = btree_iter_node(iter, iter->level);
1528         if (!b)
1529                 return NULL;
1530
1531         BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
1532
1533         iter->pos = iter->real_pos = b->key.k.p;
1534
1535         bch2_btree_iter_verify(iter);
1536         iter->should_be_locked = true;
1537
1538         return b;
1539 }
1540
1541 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1542 {
1543         struct btree *b;
1544         int ret;
1545
1546         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1547         bch2_btree_iter_verify(iter);
1548
1549         /* already got to end? */
1550         if (!btree_iter_node(iter, iter->level))
1551                 return NULL;
1552
1553         bch2_trans_cond_resched(iter->trans);
1554
1555         btree_node_unlock(iter, iter->level);
1556         iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
1557         iter->level++;
1558
1559         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1560         ret = btree_iter_traverse(iter);
1561         if (ret)
1562                 return NULL;
1563
1564         /* got to end? */
1565         b = btree_iter_node(iter, iter->level);
1566         if (!b)
1567                 return NULL;
1568
1569         if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
1570                 /*
1571                  * Haven't gotten to the end of the parent node: go back down to
1572                  * the next child node
1573                  */
1574                 btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
1575
1576                 /* Unlock to avoid screwing up our lock invariants: */
1577                 btree_node_unlock(iter, iter->level);
1578
1579                 iter->level = iter->min_depth;
1580                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1581                 bch2_btree_iter_verify(iter);
1582
1583                 ret = btree_iter_traverse(iter);
1584                 if (ret)
1585                         return NULL;
1586
1587                 b = iter->l[iter->level].b;
1588         }
1589
1590         iter->pos = iter->real_pos = b->key.k.p;
1591
1592         bch2_btree_iter_verify(iter);
1593         iter->should_be_locked = true;
1594
1595         return b;
1596 }
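
/*
 * Example usage (sketch): walking every node in a btree with a
 * BTREE_ITER_NODES iterator, such as one from bch2_trans_get_node_iter()
 * below. -EINTR/restart handling is omitted for brevity; process_node() is a
 * hypothetical caller callback:
 *
 *      for (b = bch2_btree_iter_peek_node(iter);
 *           b;
 *           b = bch2_btree_iter_next_node(iter))
 *              process_node(b);
 */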
1597
1598 /* Iterate across keys (in leaf nodes only) */
1599
1600 static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
1601 {
1602 #ifdef CONFIG_BCACHEFS_DEBUG
1603         struct bpos old_pos = iter->real_pos;
1604 #endif
1605         int cmp = bpos_cmp(new_pos, iter->real_pos);
1606         unsigned l = iter->level;
1607
1608         EBUG_ON(iter->trans->restarted);
1609
1610         if (!cmp)
1611                 goto out;
1612
1613         iter->real_pos = new_pos;
1614         iter->should_be_locked = false;
1615
1616         if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
1617                 btree_node_unlock(iter, 0);
1618                 iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
1619                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1620                 return;
1621         }
1622
1623         l = btree_iter_up_until_good_node(iter, cmp);
1624
1625         if (btree_iter_node(iter, l)) {
1626                 /*
1627                  * We might have to skip over many keys, or just a few: try
1628                  * advancing the node iterator, and if we have to skip over too
1629                  * many keys just reinit it (or if we're rewinding, since that
1630                  * is expensive).
1631                  */
1632                 if (cmp < 0 ||
1633                     !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
1634                         __btree_iter_init(iter, l);
1635
1636                 /* Don't leave it locked if we're not supposed to: */
1637                 if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
1638                         btree_node_unlock(iter, l);
1639         }
1640 out:
1641         if (l != iter->level)
1642                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1643         else
1644                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1645
1646         bch2_btree_iter_verify(iter);
1647 #ifdef CONFIG_BCACHEFS_DEBUG
1648         trace_iter_set_search_pos(iter->trans->ip, _RET_IP_,
1649                                   iter->btree_id,
1650                                   &old_pos, &new_pos, l);
1651 #endif
1652 }
1653
1654 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1655 {
1656         struct bpos pos = iter->k.p;
1657         bool ret = bpos_cmp(pos, SPOS_MAX) != 0;
1658
1659         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1660                 pos = bkey_successor(iter, pos);
1661         bch2_btree_iter_set_pos(iter, pos);
1662         return ret;
1663 }
1664
1665 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1666 {
1667         struct bpos pos = bkey_start_pos(&iter->k);
1668         bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1669                     ? bpos_cmp(pos, POS_MIN)
1670                     : bkey_cmp(pos, POS_MIN)) != 0;
1671
1672         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1673                 pos = bkey_predecessor(iter, pos);
1674         bch2_btree_iter_set_pos(iter, pos);
1675         return ret;
1676 }
1677
1678 static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
1679 {
1680         struct bpos next_pos = iter->l[0].b->key.k.p;
1681         bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0;
1682
1683         /*
1684          * Typically, we don't want to modify iter->pos here, since that
1685          * indicates where we searched from - unless we got to the end of the
1686          * btree, in which case we want iter->pos to reflect that:
1687          */
1688         if (ret)
1689                 btree_iter_set_search_pos(iter, bpos_successor(next_pos));
1690         else
1691                 bch2_btree_iter_set_pos(iter, SPOS_MAX);
1692
1693         return ret;
1694 }
1695
1696 static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
1697 {
1698         struct bpos next_pos = iter->l[0].b->data->min_key;
1699         bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
1700
1701         if (ret)
1702                 btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
1703         else
1704                 bch2_btree_iter_set_pos(iter, POS_MIN);
1705
1706         return ret;
1707 }
1708
1709 static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter,
1710                                                       struct bpos pos)
1711 {
1712         struct btree_insert_entry *i;
1713
1714         if (!(iter->flags & BTREE_ITER_WITH_UPDATES))
1715                 return NULL;
1716
1717         trans_for_each_update(iter->trans, i)
1718                 if ((cmp_int(iter->btree_id,    i->iter->btree_id) ?:
1719                      bkey_cmp(pos,              i->k->k.p)) <= 0) {
1720                         if (iter->btree_id ==   i->iter->btree_id)
1721                                 return i->k;
1722                         break;
1723                 }
1724
1725         return NULL;
1726 }
1727
1728 /**
1729  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
1730  * current position
1731  */
1732 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1733 {
1734         struct bpos search_key = btree_iter_search_key(iter);
1735         struct bkey_i *next_update;
1736         struct bkey_s_c k;
1737         int ret;
1738
1739         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1740         bch2_btree_iter_verify(iter);
1741         bch2_btree_iter_verify_entry_exit(iter);
1742 start:
1743         next_update = btree_trans_peek_updates(iter, search_key);
1744         btree_iter_set_search_pos(iter, search_key);
1745
1746         while (1) {
1747                 ret = btree_iter_traverse(iter);
1748                 if (unlikely(ret))
1749                         return bkey_s_c_err(ret);
1750
1751                 k = btree_iter_level_peek(iter, &iter->l[0]);
1752
1753                 if (next_update &&
1754                     bpos_cmp(next_update->k.p, iter->real_pos) <= 0) {
1755                         iter->k = next_update->k;
1756                         k = bkey_i_to_s_c(next_update);
1757                 }
1758
1759                 if (likely(k.k)) {
1760                         if (bkey_deleted(k.k)) {
1761                                 search_key = bkey_successor(iter, k.k->p);
1762                                 goto start;
1763                         }
1764
1765                         break;
1766                 }
1767
1768                 if (!btree_iter_set_pos_to_next_leaf(iter))
1769                         return bkey_s_c_null;
1770         }
1771
1772         /*
1773          * iter->pos should be monotonically increasing, and always be equal to
1774          * the key we just returned - except extents can straddle iter->pos:
1775          */
1776         if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1777                 iter->pos = k.k->p;
1778         else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1779                 iter->pos = bkey_start_pos(k.k);
1780
1781         bch2_btree_iter_verify_entry_exit(iter);
1782         bch2_btree_iter_verify(iter);
1783         iter->should_be_locked = true;
1784         return k;
1785 }
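
/*
 * Example usage (sketch): iterating forwards over all keys from the
 * iterator's current position. Transaction restart (-EINTR) handling is
 * omitted; process_key() is a hypothetical caller callback:
 *
 *      for (k = bch2_btree_iter_peek(iter);
 *           k.k && !(ret = bkey_err(k));
 *           k = bch2_btree_iter_next(iter))
 *              process_key(k);
 */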
1786
1787 /**
1788  * bch2_btree_iter_next: returns first key greater than iterator's current
1789  * position
1790  */
1791 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1792 {
1793         if (!bch2_btree_iter_advance(iter))
1794                 return bkey_s_c_null;
1795
1796         return bch2_btree_iter_peek(iter);
1797 }
1798
1799 /**
1800  * bch2_btree_iter_peek_prev: returns first key less than or equal to
1801  * iterator's current position
1802  */
1803 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
1804 {
1805         struct btree_iter_level *l = &iter->l[0];
1806         struct bkey_s_c k;
1807         int ret;
1808
1809         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1810         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
1811         bch2_btree_iter_verify(iter);
1812         bch2_btree_iter_verify_entry_exit(iter);
1813
1814         btree_iter_set_search_pos(iter, iter->pos);
1815
1816         while (1) {
1817                 ret = btree_iter_traverse(iter);
1818                 if (unlikely(ret)) {
1819                         k = bkey_s_c_err(ret);
1820                         goto no_key;
1821                 }
1822
1823                 k = btree_iter_level_peek(iter, l);
1824                 if (!k.k ||
1825                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
1826                      ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
1827                      : bkey_cmp(k.k->p, iter->pos) > 0))
1828                         k = btree_iter_level_prev(iter, l);
1829
1830                 if (likely(k.k))
1831                         break;
1832
1833                 if (!btree_iter_set_pos_to_prev_leaf(iter)) {
1834                         k = bkey_s_c_null;
1835                         goto no_key;
1836                 }
1837         }
1838
1839         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1840
1841         /* Extents can straddle iter->pos: */
1842         if (bkey_cmp(k.k->p, iter->pos) < 0)
1843                 iter->pos = k.k->p;
1844 out:
1845         bch2_btree_iter_verify_entry_exit(iter);
1846         bch2_btree_iter_verify(iter);
1847         iter->should_be_locked = true;
1848         return k;
1849 no_key:
1850         /*
1851          * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
1852          * then we errored going to the previous leaf - make sure it's
1853          * consistent with iter->pos:
1854          */
1855         bkey_init(&iter->k);
1856         iter->k.p = iter->pos;
1857         goto out;
1858 }
1859
1860 /**
1861  * bch2_btree_iter_prev: returns first key less than iterator's current
1862  * position
1863  */
1864 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1865 {
1866         if (!bch2_btree_iter_rewind(iter))
1867                 return bkey_s_c_null;
1868
1869         return bch2_btree_iter_peek_prev(iter);
1870 }
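
/*
 * Example usage (sketch): the backwards counterpart of the forward loop
 * above; again, restart handling is omitted and process_key() is a
 * hypothetical callback:
 *
 *      for (k = bch2_btree_iter_peek_prev(iter);
 *           k.k && !(ret = bkey_err(k));
 *           k = bch2_btree_iter_prev(iter))
 *              process_key(k);
 */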
1871
1872 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1873 {
1874         struct bpos search_key;
1875         struct bkey_s_c k;
1876         int ret;
1877
1878         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS &&
1879                 btree_iter_type(iter) != BTREE_ITER_CACHED);
1880         bch2_btree_iter_verify(iter);
1881         bch2_btree_iter_verify_entry_exit(iter);
1882
1883         /* extents can't span inode numbers: */
1884         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
1885             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
1886                 if (iter->pos.inode == KEY_INODE_MAX)
1887                         return bkey_s_c_null;
1888
1889                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
1890         }
1891
1892         search_key = btree_iter_search_key(iter);
1893         btree_iter_set_search_pos(iter, search_key);
1894
1895         ret = btree_iter_traverse(iter);
1896         if (unlikely(ret))
1897                 return bkey_s_c_err(ret);
1898
1899         if (btree_iter_type(iter) == BTREE_ITER_CACHED ||
1900             !(iter->flags & BTREE_ITER_IS_EXTENTS)) {
1901                 struct bkey_i *next_update;
1902                 struct bkey_cached *ck;
1903
1904                 switch (btree_iter_type(iter)) {
1905                 case BTREE_ITER_KEYS:
1906                         k = btree_iter_level_peek_all(iter, &iter->l[0]);
1907                         EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0);
1908                         break;
1909                 case BTREE_ITER_CACHED:
1910                         ck = (void *) iter->l[0].b;
1911                         EBUG_ON(iter->btree_id != ck->key.btree_id ||
1912                                 bkey_cmp(iter->pos, ck->key.pos));
1913                         BUG_ON(!ck->valid);
1914
1915                         k = bkey_i_to_s_c(ck->k);
1916                         break;
1917                 case BTREE_ITER_NODES:
1918                         BUG();
1919                 }
1920
1921                 next_update = btree_trans_peek_updates(iter, search_key);
1922                 if (next_update &&
1923                     (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) {
1924                         iter->k = next_update->k;
1925                         k = bkey_i_to_s_c(next_update);
1926                 }
1927         } else {
1928                 if ((iter->flags & BTREE_ITER_INTENT)) {
1929                         struct btree_iter *child =
1930                                 btree_iter_child_alloc(iter, _THIS_IP_);
1931
1932                         btree_iter_copy(child, iter);
1933                         k = bch2_btree_iter_peek(child);
1934
1935                         if (k.k && !bkey_err(k))
1936                                 iter->k = child->k;
1937                 } else {
1938                         struct bpos pos = iter->pos;
1939
1940                         k = bch2_btree_iter_peek(iter);
1941                         iter->pos = pos;
1942                 }
1943
1944                 if (unlikely(bkey_err(k)))
1945                         return k;
1946         }
1947
1948         if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
1949                 if (!k.k ||
1950                     ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
1951                      ? bpos_cmp(iter->pos, k.k->p)
1952                      : bkey_cmp(iter->pos, k.k->p))) {
1953                         bkey_init(&iter->k);
1954                         iter->k.p = iter->pos;
1955                         k = (struct bkey_s_c) { &iter->k, NULL };
1956                 }
1957         } else {
1958                 struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX;
1959
1960                 if (bkey_cmp(iter->pos, next) < 0) {
1961                         bkey_init(&iter->k);
1962                         iter->k.p = iter->pos;
1963                         bch2_key_resize(&iter->k,
1964                                         min_t(u64, KEY_SIZE_MAX,
1965                                               (next.inode == iter->pos.inode
1966                                                ? next.offset
1967                                                : KEY_OFFSET_MAX) -
1968                                               iter->pos.offset));
1969
1970                         k = (struct bkey_s_c) { &iter->k, NULL };
1971                         EBUG_ON(!k.k->size);
1972                 }
1973         }
1974
1975         bch2_btree_iter_verify_entry_exit(iter);
1976         bch2_btree_iter_verify(iter);
1977         iter->should_be_locked = true;
1978
1979         return k;
1980 }
1981
1982 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1983 {
1984         if (!bch2_btree_iter_advance(iter))
1985                 return bkey_s_c_null;
1986
1987         return bch2_btree_iter_peek_slot(iter);
1988 }
1989
1990 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
1991 {
1992         if (!bch2_btree_iter_rewind(iter))
1993                 return bkey_s_c_null;
1994
1995         return bch2_btree_iter_peek_slot(iter);
1996 }
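
/*
 * Slot iteration, sketched: unlike bch2_btree_iter_peek(), the _slot variants
 * return a key for every position - if nothing exists at iter->pos they
 * synthesize a zero-size deleted key (or, for extents, a hole extending to
 * the next real key). A minimal loop, with end_pos and process_slot() being
 * hypothetical caller-supplied values and restart handling omitted:
 *
 *      for (k = bch2_btree_iter_peek_slot(iter);
 *           k.k && !bkey_err(k) && bkey_cmp(iter->pos, end_pos) < 0;
 *           k = bch2_btree_iter_next_slot(iter))
 *              process_slot(k);
 */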
1997
1998 static inline void bch2_btree_iter_init(struct btree_trans *trans,
1999                         struct btree_iter *iter, enum btree_id btree_id)
2000 {
2001         struct bch_fs *c = trans->c;
2002         unsigned i;
2003
2004         iter->trans                     = trans;
2005         iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
2006         iter->btree_id                  = btree_id;
2007         iter->real_pos                  = POS_MIN;
2008         iter->level                     = 0;
2009         iter->min_depth                 = 0;
2010         iter->locks_want                = 0;
2011         iter->nodes_locked              = 0;
2012         iter->nodes_intent_locked       = 0;
2013         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
2014                 iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
2015
2016         prefetch(c->btree_roots[btree_id].b);
2017 }
2018
2019 /* new transactional stuff: */
2020
2021 static void btree_iter_child_free(struct btree_iter *iter)
2022 {
2023         struct btree_iter *child = btree_iter_child(iter);
2024
2025         if (child) {
2026                 bch2_trans_iter_free(iter->trans, child);
2027                 iter->child_idx = U8_MAX;
2028         }
2029 }
2030
2031 static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter,
2032                                                  unsigned long ip)
2033 {
2034         struct btree_trans *trans = iter->trans;
2035         struct btree_iter *child = btree_iter_child(iter);
2036
2037         if (!child) {
2038                 child = btree_trans_iter_alloc(trans);
2039                 child->ip_allocated     = ip;
2040                 iter->child_idx         = child->idx;
2041
2042                 trans->iters_live       |= 1ULL << child->idx;
2043                 trans->iters_touched    |= 1ULL << child->idx;
2044         }
2045
2046         return child;
2047 }
2048
2049 static inline void __bch2_trans_iter_free(struct btree_trans *trans,
2050                                           unsigned idx)
2051 {
2052         btree_iter_child_free(&trans->iters[idx]);
2053
2054         __bch2_btree_iter_unlock(&trans->iters[idx]);
2055         trans->iters_linked             &= ~(1ULL << idx);
2056         trans->iters_live               &= ~(1ULL << idx);
2057         trans->iters_touched            &= ~(1ULL << idx);
2058 }
2059
2060 int bch2_trans_iter_put(struct btree_trans *trans,
2061                         struct btree_iter *iter)
2062 {
2063         int ret;
2064
2065         if (IS_ERR_OR_NULL(iter))
2066                 return 0;
2067
2068         BUG_ON(trans->iters + iter->idx != iter);
2069         BUG_ON(!btree_iter_live(trans, iter));
2070
2071         ret = btree_iter_err(iter);
2072
2073         if (!(trans->iters_touched & (1ULL << iter->idx)) &&
2074             !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
2075                 __bch2_trans_iter_free(trans, iter->idx);
2076
2077         trans->iters_live       &= ~(1ULL << iter->idx);
2078         return ret;
2079 }
2080
2081 int bch2_trans_iter_free(struct btree_trans *trans,
2082                          struct btree_iter *iter)
2083 {
2084         if (IS_ERR_OR_NULL(iter))
2085                 return 0;
2086
2087         set_btree_iter_dontneed(trans, iter);
2088
2089         return bch2_trans_iter_put(trans, iter);
2090 }
2091
2092 noinline __cold
2093 static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
2094 {
2096         struct btree_iter *iter;
2097         struct btree_insert_entry *i;
2098         char buf[100];
2099
2100         trans_for_each_iter(trans, iter)
2101                 printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
2102                        bch2_btree_ids[iter->btree_id],
2103                        (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf),
2104                        btree_iter_live(trans, iter) ? " live" : "",
2105                        (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
2106                        iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
2107                        (void *) iter->ip_allocated);
2108
2109         trans_for_each_update(trans, i) {
2110                 char buf[300];
2111
2112                 bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
2113                 printk(KERN_ERR "update: btree %s %s\n",
2114                        bch2_btree_ids[i->iter->btree_id], buf);
2115         }
2116         panic("trans iter overflow\n");
2117 }
2118
2119 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
2120 {
2121         struct btree_iter *iter;
2122         unsigned idx;
2123
2124         if (unlikely(trans->iters_linked ==
2125                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
2126                 btree_trans_iter_alloc_fail(trans);
2127
2128         idx = __ffs64(~trans->iters_linked);
2129         iter = &trans->iters[idx];
2130
2131         iter->trans             = trans;
2132         iter->idx               = idx;
2133         iter->child_idx         = U8_MAX;
2134         iter->flags             = 0;
2135         iter->nodes_locked      = 0;
2136         iter->nodes_intent_locked = 0;
2137         trans->iters_linked     |= 1ULL << idx;
2138         return iter;
2139 }
2140
2141 static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
2142 {
2143         unsigned i;
2144
2145         __bch2_btree_iter_unlock(dst);
2146         btree_iter_child_free(dst);
2147
2148         memcpy(&dst->flags, &src->flags,
2149                sizeof(struct btree_iter) - offsetof(struct btree_iter, flags));
2150
2151         for (i = 0; i < BTREE_MAX_DEPTH; i++)
2152                 if (btree_node_locked(dst, i))
2153                         six_lock_increment(&dst->l[i].b->c.lock,
2154                                            __btree_lock_want(dst, i));
2155
2156         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2157         dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
2158 }
2159
2160 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
2161                                          unsigned btree_id, struct bpos pos,
2162                                          unsigned locks_want,
2163                                          unsigned depth,
2164                                          unsigned flags)
2165 {
2166         struct btree_iter *iter, *best = NULL;
2167         struct bpos real_pos, pos_min = POS_MIN;
2168
2169         EBUG_ON(trans->restarted);
2170
2171         if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
2172             btree_node_type_is_extents(btree_id) &&
2173             !(flags & BTREE_ITER_NOT_EXTENTS) &&
2174             !(flags & BTREE_ITER_ALL_SNAPSHOTS))
2175                 flags |= BTREE_ITER_IS_EXTENTS;
2176
2177         if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
2178             !btree_type_has_snapshots(btree_id))
2179                 flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
2180
2181         if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
2182                 pos.snapshot = btree_type_has_snapshots(btree_id)
2183                         ? U32_MAX : 0;
2184
2185         real_pos = pos;
2186
2187         if ((flags & BTREE_ITER_IS_EXTENTS) &&
2188             bkey_cmp(pos, POS_MAX))
2189                 real_pos = bpos_nosnap_successor(pos);
2190
2191         trans_for_each_iter(trans, iter) {
2192                 if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
2193                         continue;
2194
2195                 if (iter->btree_id != btree_id)
2196                         continue;
2197
2198                 if (best) {
2199                         int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
2200                                            bpos_diff(iter->real_pos, real_pos));
2201
2202                         if (cmp < 0 ||
2203                             ((cmp == 0 && btree_iter_keep(trans, iter))))
2204                                 continue;
2205                 }
2206
2207                 best = iter;
2208         }
2209
2210         if (!best) {
2211                 iter = btree_trans_iter_alloc(trans);
2212                 bch2_btree_iter_init(trans, iter, btree_id);
2213         } else if (btree_iter_keep(trans, best)) {
2214                 iter = btree_trans_iter_alloc(trans);
2215                 btree_iter_copy(iter, best);
2216         } else {
2217                 iter = best;
2218         }
2219
2220         trans->iters_live       |= 1ULL << iter->idx;
2221         trans->iters_touched    |= 1ULL << iter->idx;
2222
2223         iter->flags = flags;
2224
2225         iter->snapshot = pos.snapshot;
2226
2227         /*
2228          * If the iterator has locks_want greater than requested, we explicitly
2229          * do not downgrade it here - on a transaction restart caused by a btree
2230          * node split needing to upgrade locks, we might be putting/getting the
2231          * iterator again. Downgrading iterators only happens via an explicit
2232          * bch2_trans_downgrade().
2233          */
2234
2235         locks_want = min(locks_want, BTREE_MAX_DEPTH);
2236         if (locks_want > iter->locks_want) {
2237                 iter->locks_want = locks_want;
2238                 btree_iter_get_locks(iter, true, _THIS_IP_);
2239         }
2240
2241         while (iter->level != depth) {
2242                 btree_node_unlock(iter, iter->level);
2243                 iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
2244                 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
2245                 if (iter->level < depth)
2246                         iter->level++;
2247                 else
2248                         iter->level--;
2249         }
2250
2251         iter->min_depth = depth;
2252
2253         bch2_btree_iter_set_pos(iter, pos);
2254         btree_iter_set_search_pos(iter, real_pos);
2255
2256         trace_trans_get_iter(_RET_IP_, trans->ip,
2257                              btree_id,
2258                              &real_pos, locks_want, iter->uptodate,
2259                              best ? &best->real_pos     : &pos_min,
2260                              best ? best->locks_want    : U8_MAX,
2261                              best ? best->uptodate      : U8_MAX);
2262
2263         return iter;
2264 }
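
/*
 * Example usage (sketch): getting and putting an iterator within a
 * transaction. Callers typically reach this via the bch2_trans_get_iter()
 * wrapper rather than calling it directly; btree_id and pos are
 * caller-supplied, and default locks_want/depth/flags are used here:
 *
 *      iter = __bch2_trans_get_iter(trans, btree_id, pos, 0, 0, 0);
 *      k = bch2_btree_iter_peek(iter);
 *      ...
 *      bch2_trans_iter_put(trans, iter);
 */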
2265
2266 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
2267                                             enum btree_id btree_id,
2268                                             struct bpos pos,
2269                                             unsigned locks_want,
2270                                             unsigned depth,
2271                                             unsigned flags)
2272 {
2273         struct btree_iter *iter =
2274                 __bch2_trans_get_iter(trans, btree_id, pos,
2275                                       locks_want, depth,
2276                                       BTREE_ITER_NODES|
2277                                       BTREE_ITER_NOT_EXTENTS|
2278                                       BTREE_ITER_ALL_SNAPSHOTS|
2279                                       flags);
2280
2281         BUG_ON(bkey_cmp(iter->pos, pos));
2282         BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
2283         BUG_ON(iter->level      != depth);
2284         BUG_ON(iter->min_depth  != depth);
2285         iter->ip_allocated = _RET_IP_;
2286
2287         return iter;
2288 }
2289
2290 struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
2291                                         struct btree_iter *src)
2292 {
2293         struct btree_iter *iter;
2294
2295         iter = btree_trans_iter_alloc(trans);
2296         btree_iter_copy(iter, src);
2297
2298         trans->iters_live |= 1ULL << iter->idx;
2299         /*
2300          * We don't need to preserve this iter since it's cheap to copy it
2301          * again - this will cause trans_iter_put() to free it right away:
2302          */
2303         set_btree_iter_dontneed(trans, iter);
2304
2305         return iter;
2306 }
2307
2308 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2309 {
2310         size_t new_top = trans->mem_top + size;
2311         void *p;
2312
2313         if (new_top > trans->mem_bytes) {
2314                 size_t old_bytes = trans->mem_bytes;
2315                 size_t new_bytes = roundup_pow_of_two(new_top);
2316                 void *new_mem;
2317
2318                 WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2319
2320                 new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2321                 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2322                         new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2323                         new_bytes = BTREE_TRANS_MEM_MAX;
2324                         kfree(trans->mem);
2325                 }
2326
2327                 if (!new_mem)
2328                         return ERR_PTR(-ENOMEM);
2329
2330                 trans->mem = new_mem;
2331                 trans->mem_bytes = new_bytes;
2332
2333                 if (old_bytes) {
2334                         trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
2335                         btree_trans_restart(trans);
2336                         return ERR_PTR(-EINTR);
2337                 }
2338         }
2339
2340         p = trans->mem + trans->mem_top;
2341         trans->mem_top += size;
2342         memset(p, 0, size);
2343         return p;
2344 }
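
/*
 * Example usage (sketch): bch2_trans_kmalloc() allocations have no
 * corresponding free; they live until the transaction is reset. Note that
 * growing an already-populated buffer restarts the transaction and returns
 * -EINTR, which the caller must handle like any other restart:
 *
 *      struct bkey_i *new_key = bch2_trans_kmalloc(trans, sizeof(*new_key));
 *      if (IS_ERR(new_key))
 *              return PTR_ERR(new_key);
 *      bkey_init(&new_key->k);
 */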
2345
2346 inline void bch2_trans_unlink_iters(struct btree_trans *trans)
2347 {
2348         u64 iters = trans->iters_linked &
2349                 ~trans->iters_touched &
2350                 ~trans->iters_live;
2351
2352         while (iters) {
2353                 unsigned idx = __ffs64(iters);
2354
2355                 iters &= ~(1ULL << idx);
2356                 __bch2_trans_iter_free(trans, idx);
2357         }
2358 }
2359
2360 /**
2361  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2362  * @trans: transaction to reset
2363  *
2364  * While iterating over nodes or updating nodes, an attempt to lock a btree
2365  * node may return EINTR when the trylock fails. When this occurs
2366  * bch2_trans_begin() should be called and the transaction retried.
2367  */
2368 void bch2_trans_begin(struct btree_trans *trans)
2369 {
2370         struct btree_iter *iter;
2371
2372         trans_for_each_iter(trans, iter)
2373                 iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
2374                                  BTREE_ITER_SET_POS_AFTER_COMMIT);
2375
2376         /*
2377          * XXX: we shouldn't be doing this if the transaction was restarted, but
2378          * currently we still overflow transaction iterators if we do that
2379          */
2380         bch2_trans_unlink_iters(trans);
2381         trans->iters_touched &= trans->iters_live;
2382
2383         trans->extra_journal_res        = 0;
2384         trans->nr_updates               = 0;
2385         trans->mem_top                  = 0;
2386
2387         trans->hooks                    = NULL;
2388         trans->extra_journal_entries    = NULL;
2389         trans->extra_journal_entry_u64s = 0;
2390
2391         if (trans->fs_usage_deltas) {
2392                 trans->fs_usage_deltas->used = 0;
2393                 memset(&trans->fs_usage_deltas->memset_start, 0,
2394                        (void *) &trans->fs_usage_deltas->memset_end -
2395                        (void *) &trans->fs_usage_deltas->memset_start);
2396         }
2397
2398         bch2_trans_cond_resched(trans);
2399
2400         if (trans->restarted)
2401                 bch2_btree_iter_traverse_all(trans);
2402
2403         trans->restarted = false;
2404 }
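
/*
 * Example usage (sketch): the usual retry loop around a transaction, where
 * do_something() is a hypothetical helper returning -EINTR when the
 * transaction was restarted:
 *
 *      bch2_trans_init(&trans, c, 0, 0);
 *
 *      do {
 *              bch2_trans_begin(&trans);
 *              ret = do_something(&trans);
 *      } while (ret == -EINTR);
 *
 *      bch2_trans_exit(&trans);
 */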
2405
2406 static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
2407 {
2408         size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
2409         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2410         void *p = NULL;
2411
2412         BUG_ON(trans->used_mempool);
2413
2414 #ifdef __KERNEL__
2415         p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
2416 #endif
2417         if (!p)
2418                 p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
2419
2420         trans->iters            = p; p += iters_bytes;
2421         trans->updates          = p; p += updates_bytes;
2422 }
2423
2424 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
2425                      unsigned expected_nr_iters,
2426                      size_t expected_mem_bytes)
2427         __acquires(&c->btree_trans_barrier)
2428 {
2429         memset(trans, 0, sizeof(*trans));
2430         trans->c                = c;
2431         trans->ip               = _RET_IP_;
2432
2433         /*
2434          * reallocating iterators currently completely breaks
2435          * bch2_trans_iter_put(), so we always allocate the max:
2436          */
2437         bch2_trans_alloc_iters(trans, c);
2438
2439         if (expected_mem_bytes) {
2440                 trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
2441                 trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
2442
2443                 if (unlikely(!trans->mem)) {
2444                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2445                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2446                 }
2447         }
2448
2449         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2450
2451 #ifdef CONFIG_BCACHEFS_DEBUG
2452         trans->pid = current->pid;
2453         mutex_lock(&c->btree_trans_lock);
2454         list_add(&trans->list, &c->btree_trans_list);
2455         mutex_unlock(&c->btree_trans_lock);
2456 #endif
2457 }
2458
2459 int bch2_trans_exit(struct btree_trans *trans)
2460         __releases(&c->btree_trans_barrier)
2461 {
2462         struct bch_fs *c = trans->c;
2463
2464         bch2_trans_unlock(trans);
2465
2466 #ifdef CONFIG_BCACHEFS_DEBUG
2467         if (trans->iters_live) {
2468                 struct btree_iter *iter;
2469
2470                 trans_for_each_iter(trans, iter)
2471                         btree_iter_child_free(iter);
2472         }
2473
2474         if (trans->iters_live) {
2475                 struct btree_iter *iter;
2476
2477                 bch_err(c, "btree iterators leaked!");
2478                 trans_for_each_iter(trans, iter)
2479                         if (btree_iter_live(trans, iter))
2480                                 printk(KERN_ERR "  btree %s allocated at %pS\n",
2481                                        bch2_btree_ids[iter->btree_id],
2482                                        (void *) iter->ip_allocated);
2483                 /* Be noisy about this: */
2484                 bch2_fatal_error(c);
2485         }
2486
2487         mutex_lock(&trans->c->btree_trans_lock);
2488         list_del(&trans->list);
2489         mutex_unlock(&trans->c->btree_trans_lock);
2490 #endif
2491
2492         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2493
2494         bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
2495
2496         if (trans->fs_usage_deltas) {
2497                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2498                     REPLICAS_DELTA_LIST_MAX)
2499                         mempool_free(trans->fs_usage_deltas,
2500                                      &trans->c->replicas_delta_pool);
2501                 else
2502                         kfree(trans->fs_usage_deltas);
2503         }
2504
2505         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2506                 mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
2507         else
2508                 kfree(trans->mem);
2509
2510 #ifdef __KERNEL__
2511         /*
2512          * Userspace doesn't have a real percpu implementation:
2513          */
2514         trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
2515 #endif
2516
2517         if (trans->iters)
2518                 mempool_free(trans->iters, &trans->c->btree_iters_pool);
2519
2520         trans->mem      = (void *) 0x1;
2521         trans->iters    = (void *) 0x1;
2522
2523         return trans->error ? -EIO : 0;
2524 }
2525
2526 static void __maybe_unused
2527 bch2_btree_iter_node_to_text(struct printbuf *out,
2528                              struct btree_bkey_cached_common *_b,
2529                              enum btree_iter_type type)
2530 {
2531         pr_buf(out, "    l=%u %s:",
2532                _b->level, bch2_btree_ids[_b->btree_id]);
2533         bch2_bpos_to_text(out, btree_node_pos(_b, type));
2534 }
2535
2536 #ifdef CONFIG_BCACHEFS_DEBUG
2537 static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
2538 {
2539         struct btree_iter *iter;
2540
2541         trans_for_each_iter(trans, iter)
2542                 if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
2543                     iter->nodes_locked)
2544                         return true;
2545         return false;
2546 }
2547 #endif
2548
2549 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
2550 {
2551 #ifdef CONFIG_BCACHEFS_DEBUG
2552         struct btree_trans *trans;
2553         struct btree_iter *iter;
2554         struct btree *b;
2555         unsigned l;
2556
2557         mutex_lock(&c->btree_trans_lock);
2558         list_for_each_entry(trans, &c->btree_trans_list, list) {
2559                 if (!trans_has_btree_nodes_locked(trans))
2560                         continue;
2561
2562                 pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
2563
2564                 trans_for_each_iter(trans, iter) {
2565                         if (!iter->nodes_locked)
2566                                 continue;
2567
2568                         pr_buf(out, "  iter %u %c %s:",
2569                                iter->idx,
2570                                btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
2571                                bch2_btree_ids[iter->btree_id]);
2572                         bch2_bpos_to_text(out, iter->pos);
2573                         pr_buf(out, "\n");
2574
2575                         for (l = 0; l < BTREE_MAX_DEPTH; l++) {
2576                                 if (btree_node_locked(iter, l)) {
2577                                         pr_buf(out, "    %s l=%u ",
2578                                                btree_node_intent_locked(iter, l) ? "i" : "r", l);
2579                                         bch2_btree_iter_node_to_text(out,
2580                                                         (void *) iter->l[l].b,
2581                                                         btree_iter_type(iter));
2582                                         pr_buf(out, "\n");
2583                                 }
2584                         }
2585                 }
2586
2587                 b = READ_ONCE(trans->locking);
2588                 if (b) {
2589                         iter = &trans->iters[trans->locking_iter_idx];
2590                         pr_buf(out, "  locking iter %u %c l=%u %s:",
2591                                trans->locking_iter_idx,
2592                                btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
2593                                trans->locking_level,
2594                                bch2_btree_ids[trans->locking_btree_id]);
2595                         bch2_bpos_to_text(out, trans->locking_pos);
2596
2597                         pr_buf(out, " node ");
2598                         bch2_btree_iter_node_to_text(out,
2599                                         (void *) b,
2600                                         btree_iter_type(iter));
2601                         pr_buf(out, "\n");
2602                 }
2603         }
2604         mutex_unlock(&c->btree_trans_lock);
2605 #endif
2606 }
2607
2608 void bch2_fs_btree_iter_exit(struct bch_fs *c)
2609 {
2610         mempool_exit(&c->btree_trans_mem_pool);
2611         mempool_exit(&c->btree_iters_pool);
2612         cleanup_srcu_struct(&c->btree_trans_barrier);
2613 }
2614
2615 int bch2_fs_btree_iter_init(struct bch_fs *c)
2616 {
2617         unsigned nr = BTREE_ITER_MAX;
2618
2619         INIT_LIST_HEAD(&c->btree_trans_list);
2620         mutex_init(&c->btree_trans_lock);
2621
2622         return  init_srcu_struct(&c->btree_trans_barrier) ?:
2623                 mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
2624                         sizeof(struct btree_iter) * nr +
2625                         sizeof(struct btree_insert_entry) * nr) ?:
2626                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
2627                                           BTREE_TRANS_MEM_MAX);
2628 }