1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "btree_cache.h"
6 #include "btree_iter.h"
7 #include "btree_key_cache.h"
8 #include "btree_locking.h"
9 #include "btree_update.h"
10 #include "debug.h"
11 #include "extents.h"
12 #include "journal.h"
13
14 #include <linux/prefetch.h>
15 #include <trace/events/bcachefs.h>
16
17 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
18 {
19         return l < BTREE_MAX_DEPTH &&
20                 (unsigned long) iter->l[l].b >= 128;
21 }
22
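/*
 * Extents are indexed by their end position, so an extent iterator searching
 * at iter->pos looks up the successor of pos - that way it finds the extent
 * that covers pos rather than one that ends exactly at pos:
 */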
23 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
24 {
25         struct bpos pos = iter->pos;
26
27         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
28             bkey_cmp(pos, POS_MAX))
29                 pos = bkey_successor(pos);
30         return pos;
31 }
32
33 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
34                                               struct btree *b)
35 {
36         return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
37 }
38
39 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
40                                              struct btree *b)
41 {
42         return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
43 }
44
45 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
46                                           struct btree *b)
47 {
48         return iter->btree_id == b->c.btree_id &&
49                 !btree_iter_pos_before_node(iter, b) &&
50                 !btree_iter_pos_after_node(iter, b);
51 }
52
53 /* Btree node locking: */
54
55 void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
56 {
57         bch2_btree_node_unlock_write_inlined(b, iter);
58 }
59
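/*
 * Take a write lock on a node the iterator already has intent locked: read
 * locks held on the same node by other iterators in this transaction would
 * block the write lock, so their reader counts are temporarily dropped from
 * the lock's state and restored afterwards (see the comment below):
 */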
60 void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
61 {
62         struct btree_iter *linked;
63         unsigned readers = 0;
64
65         EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
66
67         trans_for_each_iter(iter->trans, linked)
68                 if (linked->l[b->c.level].b == b &&
69                     btree_node_read_locked(linked, b->c.level))
70                         readers++;
71
72         /*
73          * Must drop our read locks before calling six_lock_write() -
74          * six_unlock() won't do wakeups until the reader count
75          * goes to 0, and it's safe because we have the node intent
76          * locked:
77          */
78         atomic64_sub(__SIX_VAL(read_lock, readers),
79                      &b->c.lock.state.counter);
80         btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
81         atomic64_add(__SIX_VAL(read_lock, readers),
82                      &b->c.lock.state.counter);
83 }
84
85 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
86 {
87         struct btree *b = btree_iter_node(iter, level);
88         int want = __btree_lock_want(iter, level);
89
90         if (!is_btree_node(iter, level))
91                 return false;
92
93         if (race_fault())
94                 return false;
95
96         if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
97             (btree_node_lock_seq_matches(iter, b, level) &&
98              btree_node_lock_increment(iter->trans, b, level, want))) {
99                 mark_btree_node_locked(iter, level, want);
100                 return true;
101         } else {
102                 return false;
103         }
104 }
105
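/*
 * Try to turn this iterator's lock on the node at @level into an intent lock,
 * either by upgrading a held read lock in place or by retaking the lock if
 * the lock sequence number still matches:
 */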
106 static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
107 {
108         struct btree *b = iter->l[level].b;
109
110         EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
111
112         if (!is_btree_node(iter, level))
113                 return false;
114
115         if (btree_node_intent_locked(iter, level))
116                 return true;
117
118         if (race_fault())
119                 return false;
120
121         if (btree_node_locked(iter, level)
122             ? six_lock_tryupgrade(&b->c.lock)
123             : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
124                 goto success;
125
126         if (btree_node_lock_seq_matches(iter, b, level) &&
127             btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
128                 btree_node_unlock(iter, level);
129                 goto success;
130         }
131
132         return false;
133 success:
134         mark_btree_node_intent_locked(iter, level);
135         return true;
136 }
137
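/*
 * Starting at iter->level, relock (or upgrade, if @upgrade) every node this
 * iterator wants locked; on failure, locks below the failed level are dropped
 * so that traversal restarts from that level (see the comment below):
 */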
138 static inline bool btree_iter_get_locks(struct btree_iter *iter,
139                                         bool upgrade, bool trace)
140 {
141         unsigned l = iter->level;
142         int fail_idx = -1;
143
144         do {
145                 if (!btree_iter_node(iter, l))
146                         break;
147
148                 if (!(upgrade
149                       ? bch2_btree_node_upgrade(iter, l)
150                       : bch2_btree_node_relock(iter, l))) {
151                         if (trace)
152                                 (upgrade
153                                  ? trace_node_upgrade_fail
154                                  : trace_node_relock_fail)(l, iter->l[l].lock_seq,
155                                                 is_btree_node(iter, l)
156                                                 ? 0
157                                                 : (unsigned long) iter->l[l].b,
158                                                 is_btree_node(iter, l)
159                                                 ? iter->l[l].b->c.lock.state.seq
160                                                 : 0);
161
162                         fail_idx = l;
163                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
164                 }
165
166                 l++;
167         } while (l < iter->locks_want);
168
169         /*
170          * When we fail to get a lock, we have to ensure that any child nodes
171          * can't be relocked so bch2_btree_iter_traverse has to walk back up to
172          * the node that we failed to relock:
173          */
174         while (fail_idx >= 0) {
175                 btree_node_unlock(iter, fail_idx);
176                 iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
177                 --fail_idx;
178         }
179
180         if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
181                 iter->uptodate = BTREE_ITER_NEED_PEEK;
182
183         bch2_btree_trans_verify_locks(iter->trans);
184
185         return iter->uptodate < BTREE_ITER_NEED_RELOCK;
186 }
187
188 /* Slowpath: */
189 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
190                             unsigned level, struct btree_iter *iter,
191                             enum six_lock_type type,
192                             six_lock_should_sleep_fn should_sleep_fn,
193                             void *p)
194 {
195         struct btree_trans *trans = iter->trans;
196         struct btree_iter *linked;
197         u64 start_time = local_clock();
198         bool ret = true;
199
200         /* Check if it's safe to block: */
201         trans_for_each_iter(trans, linked) {
202                 if (!linked->nodes_locked)
203                         continue;
204
205                 /*
206                  * Can't block taking an intent lock if we have _any_ nodes read
207                  * locked:
208                  *
209                  * - Our read lock blocks another thread with an intent lock on
210                  *   the same node from getting a write lock, and thus from
211                  *   dropping its intent lock
212                  *
213                  * - And the other thread may have multiple nodes intent locked:
214                  *   both the node we want to intent lock, and the node we
215                  *   already have read locked - deadlock:
216                  */
217                 if (type == SIX_LOCK_intent &&
218                     linked->nodes_locked != linked->nodes_intent_locked) {
219                         if (!(trans->nounlock)) {
220                                 linked->locks_want = max_t(unsigned,
221                                                 linked->locks_want,
222                                                 __fls(linked->nodes_locked) + 1);
223                                 if (!btree_iter_get_locks(linked, true, false))
224                                         ret = false;
225                         } else {
226                                 ret = false;
227                         }
228                 }
229
230                 /*
231                  * Interior nodes must be locked before their descendants: if
232                  * another iterator has possible descendants of the node we're
233                  * about to lock already locked, it must have the ancestors locked too:
234                  */
235                 if (linked->btree_id == iter->btree_id &&
236                     level > __fls(linked->nodes_locked)) {
237                         if (!(trans->nounlock)) {
238                                 linked->locks_want =
239                                         max(level + 1, max_t(unsigned,
240                                             linked->locks_want,
241                                             iter->locks_want));
242                                 if (!btree_iter_get_locks(linked, true, false))
243                                         ret = false;
244                         } else {
245                                 ret = false;
246                         }
247                 }
248
249                 /* Must lock btree nodes in key order: */
250                 if ((cmp_int(iter->btree_id, linked->btree_id) ?:
251                      -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
252                         ret = false;
253
254                 if (iter->btree_id == linked->btree_id &&
255                     btree_node_locked(linked, level) &&
256                     bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0)
257                         ret = false;
258
259                 /*
260                  * Recheck if this is a node we already have locked - since one
261                  * of the get_locks() calls might've successfully
262                  * upgraded/relocked it:
263                  */
264                 if (linked->l[level].b == b &&
265                     btree_node_locked_type(linked, level) >= type) {
266                         six_lock_increment(&b->c.lock, type);
267                         return true;
268                 }
269         }
270
271         if (unlikely(!ret)) {
272                 trace_trans_restart_would_deadlock(iter->trans->ip);
273                 return false;
274         }
275
276         if (six_trylock_type(&b->c.lock, type))
277                 return true;
278
279         if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
280                 return false;
281
282         bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
283                                start_time);
284         return true;
285 }
286
287 /* Btree iterator locking: */
288
289 #ifdef CONFIG_BCACHEFS_DEBUG
290 static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
291 {
292         unsigned l;
293
294         if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
295                 BUG_ON(iter->nodes_locked);
296                 return;
297         }
298
299         for (l = 0; is_btree_node(iter, l); l++) {
300                 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
301                     !btree_node_locked(iter, l))
302                         continue;
303
304                 BUG_ON(btree_lock_want(iter, l) !=
305                        btree_node_locked_type(iter, l));
306         }
307 }
308
309 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
310 {
311         struct btree_iter *iter;
312
313         trans_for_each_iter_all(trans, iter)
314                 bch2_btree_iter_verify_locks(iter);
315 }
316 #else
317 static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
318 #endif
319
320 __flatten
321 bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
322 {
323         return btree_iter_get_locks(iter, false, trace);
324 }
325
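/*
 * Raise the number of levels this iterator wants locked and try to take the
 * additional intent locks; on failure, locks_want is also raised on other
 * iterators on the same btree so ancestor locks get taken in the right order
 * (see the comment below):
 */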
326 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
327                                unsigned new_locks_want)
328 {
329         struct btree_iter *linked;
330
331         EBUG_ON(iter->locks_want >= new_locks_want);
332
333         iter->locks_want = new_locks_want;
334
335         if (btree_iter_get_locks(iter, true, true))
336                 return true;
337
338         /*
339          * Ancestor nodes must be locked before child nodes, so set locks_want
340          * on iterators that might lock ancestors before us to avoid getting
341          * -EINTR later:
342          */
343         trans_for_each_iter(iter->trans, linked)
344                 if (linked != iter &&
345                     linked->btree_id == iter->btree_id &&
346                     linked->locks_want < new_locks_want) {
347                         linked->locks_want = new_locks_want;
348                         btree_iter_get_locks(linked, true, false);
349                 }
350
351         return false;
352 }
353
354 bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
355                                         unsigned new_locks_want)
356 {
357         unsigned l = iter->level;
358
359         EBUG_ON(iter->locks_want >= new_locks_want);
360
361         iter->locks_want = new_locks_want;
362
363         do {
364                 if (!btree_iter_node(iter, l))
365                         break;
366
367                 if (!bch2_btree_node_upgrade(iter, l)) {
368                         iter->locks_want = l;
369                         return false;
370                 }
371
372                 l++;
373         } while (l < iter->locks_want);
374
375         return true;
376 }
377
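/*
 * Drop locks on levels this iterator no longer needs locked, and downgrade an
 * intent lock at the iterator's own level back to a read lock:
 */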
378 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
379                                  unsigned downgrade_to)
380 {
381         unsigned l, new_locks_want = downgrade_to ?:
382                 (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
383
384         if (iter->locks_want < downgrade_to) {
385                 iter->locks_want = new_locks_want;
386
387                 while (iter->nodes_locked &&
388                        (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
389                         if (l > iter->level) {
390                                 btree_node_unlock(iter, l);
391                         } else {
392                                 if (btree_node_intent_locked(iter, l)) {
393                                         six_lock_downgrade(&iter->l[l].b->c.lock);
394                                         iter->nodes_intent_locked ^= 1 << l;
395                                 }
396                                 break;
397                         }
398                 }
399         }
400
401         bch2_btree_trans_verify_locks(iter->trans);
402 }
403
404 void bch2_trans_downgrade(struct btree_trans *trans)
405 {
406         struct btree_iter *iter;
407
408         trans_for_each_iter(trans, iter)
409                 bch2_btree_iter_downgrade(iter);
410 }
411
412 /* Btree transaction locking: */
413
414 bool bch2_trans_relock(struct btree_trans *trans)
415 {
416         struct btree_iter *iter;
417         bool ret = true;
418
419         trans_for_each_iter(trans, iter)
420                 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
421                         ret &= bch2_btree_iter_relock(iter, true);
422
423         return ret;
424 }
425
426 void bch2_trans_unlock(struct btree_trans *trans)
427 {
428         struct btree_iter *iter;
429
430         trans_for_each_iter(trans, iter)
431                 __bch2_btree_iter_unlock(iter);
432 }
433
434 /* Btree iterator: */
435
436 #ifdef CONFIG_BCACHEFS_DEBUG
437
438 static void bch2_btree_iter_verify_level(struct btree_iter *iter,
439                                          unsigned level)
440 {
441         struct bpos pos = btree_iter_search_key(iter);
442         struct btree_iter_level *l = &iter->l[level];
443         struct btree_node_iter tmp = l->iter;
444         bool locked = btree_node_locked(iter, level);
445         struct bkey_packed *p, *k;
446         char buf1[100], buf2[100];
447         const char *msg;
448
449         if (!debug_check_iterators(iter->trans->c))
450                 return;
451
452         BUG_ON(iter->level < iter->min_depth);
453
454         if (!btree_iter_node(iter, level))
455                 return;
456
457         if (!bch2_btree_node_relock(iter, level))
458                 return;
459
460         /*
461          * Ideally this invariant would always be true, and hopefully in the
462          * future it will be, but for now set_pos_same_leaf() breaks it:
463          */
464         BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
465                !btree_iter_pos_in_node(iter, l->b));
466
467         /*
468          * node iterators don't use leaf node iterator:
469          */
470         if (btree_iter_type(iter) == BTREE_ITER_NODES &&
471             level <= iter->min_depth)
472                 goto unlock;
473
474         bch2_btree_node_iter_verify(&l->iter, l->b);
475
476         /*
477          * For interior nodes, the iterator will have skipped past
478          * deleted keys:
479          *
480          * For extents, the iterator may have skipped past deleted keys (but not
481          * whiteouts)
482          */
483         p = level || btree_node_type_is_extents(iter->btree_id)
484                 ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard)
485                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
486         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
487
488         if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
489                 msg = "before";
490                 goto err;
491         }
492
493         if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
494                 msg = "after";
495                 goto err;
496         }
497 unlock:
498         if (!locked)
499                 btree_node_unlock(iter, level);
500         return;
501 err:
502         strcpy(buf1, "(none)");
503         strcpy(buf2, "(none)");
504
505         if (p) {
506                 struct bkey uk = bkey_unpack_key(l->b, p);
507                 bch2_bkey_to_text(&PBUF(buf1), &uk);
508         }
509
510         if (k) {
511                 struct bkey uk = bkey_unpack_key(l->b, k);
512                 bch2_bkey_to_text(&PBUF(buf2), &uk);
513         }
514
515         panic("iterator should be %s key at level %u:\n"
516               "iter pos %s %llu:%llu\n"
517               "prev key %s\n"
518               "cur  key %s\n",
519               msg, level,
520               iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
521               iter->pos.inode, iter->pos.offset,
522               buf1, buf2);
523 }
524
525 static void bch2_btree_iter_verify(struct btree_iter *iter)
526 {
527         unsigned i;
528
529         bch2_btree_trans_verify_locks(iter->trans);
530
531         for (i = 0; i < BTREE_MAX_DEPTH; i++)
532                 bch2_btree_iter_verify_level(iter, i);
533 }
534
535 void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
536 {
537         struct btree_iter *iter;
538
539         if (!debug_check_iterators(trans->c))
540                 return;
541
542         trans_for_each_iter_with_node(trans, b, iter)
543                 bch2_btree_iter_verify_level(iter, b->c.level);
544 }
545
546 #else
547
548 static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
549 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
550
551 #endif
552
553 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
554                                         struct btree *b,
555                                         struct bset_tree *t,
556                                         struct bkey_packed *k)
557 {
558         struct btree_node_iter_set *set;
559
560         btree_node_iter_for_each(iter, set)
561                 if (set->end == t->end_offset) {
562                         set->k = __btree_node_key_to_offset(b, k);
563                         bch2_btree_node_iter_sort(iter, b);
564                         return;
565                 }
566
567         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
568 }
569
570 static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
571                                                struct btree *b,
572                                                struct bkey_packed *where)
573 {
574         struct btree_iter_level *l = &iter->l[b->c.level];
575         struct bpos pos = btree_iter_search_key(iter);
576
577         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
578                 return;
579
580         if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
581                 bch2_btree_node_iter_advance(&l->iter, l->b);
582
583         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
584 }
585
586 void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
587                                       struct btree *b,
588                                       struct bkey_packed *where)
589 {
590         struct btree_iter *linked;
591
592         trans_for_each_iter_with_node(iter->trans, b, linked) {
593                 __bch2_btree_iter_fix_key_modified(linked, b, where);
594                 bch2_btree_iter_verify_level(linked, b->c.level);
595         }
596 }
597
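/*
 * A key was inserted, overwritten or deleted at @where in bset @t
 * (@clobber_u64s old u64s replaced by @new_u64s new ones): fix up @node_iter
 * so that it still points at the right position:
 */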
598 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
599                                       struct btree *b,
600                                       struct btree_node_iter *node_iter,
601                                       struct bset_tree *t,
602                                       struct bkey_packed *where,
603                                       unsigned clobber_u64s,
604                                       unsigned new_u64s)
605 {
606         const struct bkey_packed *end = btree_bkey_last(b, t);
607         struct btree_node_iter_set *set;
608         unsigned offset = __btree_node_key_to_offset(b, where);
609         int shift = new_u64s - clobber_u64s;
610         unsigned old_end = t->end_offset - shift;
611         unsigned orig_iter_pos = node_iter->data[0].k;
612         bool iter_current_key_modified =
613                 orig_iter_pos >= offset &&
614                 orig_iter_pos <= offset + clobber_u64s;
615         struct bpos iter_pos = btree_iter_search_key(iter);
616
617         btree_node_iter_for_each(node_iter, set)
618                 if (set->end == old_end)
619                         goto found;
620
621         /* didn't find the bset in the iterator - might have to re-add it: */
622         if (new_u64s &&
623             bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
624                 bch2_btree_node_iter_push(node_iter, b, where, end);
625                 goto fixup_done;
626         } else {
627                 /* Iterator is after key that changed */
628                 return;
629         }
630 found:
631         set->end = t->end_offset;
632
633         /* Iterator hasn't gotten to the key that changed yet: */
634         if (set->k < offset)
635                 return;
636
637         if (new_u64s &&
638             bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
639                 set->k = offset;
640         } else if (set->k < offset + clobber_u64s) {
641                 set->k = offset + new_u64s;
642                 if (set->k == set->end)
643                         bch2_btree_node_iter_set_drop(node_iter, set);
644         } else {
645                 /* Iterator is after key that changed */
646                 set->k = (int) set->k + shift;
647                 return;
648         }
649
650         bch2_btree_node_iter_sort(node_iter, b);
651 fixup_done:
652         if (node_iter->data[0].k != orig_iter_pos)
653                 iter_current_key_modified = true;
654
655         /*
656          * When a new key is added, and the node iterator now points to that
657          * key, the iterator might have skipped past deleted keys that should
658          * come after the key the iterator now points to. We have to rewind to
659          * before those deleted keys - otherwise
660          * bch2_btree_node_iter_prev_all() breaks:
661          */
662         if (!bch2_btree_node_iter_end(node_iter) &&
663             iter_current_key_modified &&
664             (b->c.level ||
665              btree_node_type_is_extents(iter->btree_id))) {
666                 struct bset_tree *t;
667                 struct bkey_packed *k, *k2, *p;
668
669                 k = bch2_btree_node_iter_peek_all(node_iter, b);
670
671                 for_each_bset(b, t) {
672                         bool set_pos = false;
673
674                         if (node_iter->data[0].end == t->end_offset)
675                                 continue;
676
677                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
678
679                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
680                                bkey_iter_cmp(b, k, p) < 0) {
681                                 k2 = p;
682                                 set_pos = true;
683                         }
684
685                         if (set_pos)
686                                 btree_node_iter_set_set_pos(node_iter,
687                                                             b, t, k2);
688                 }
689         }
690
691         if (!b->c.level &&
692             node_iter == &iter->l[0].iter &&
693             iter_current_key_modified)
694                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
695 }
696
697 void bch2_btree_node_iter_fix(struct btree_iter *iter,
698                               struct btree *b,
699                               struct btree_node_iter *node_iter,
700                               struct bkey_packed *where,
701                               unsigned clobber_u64s,
702                               unsigned new_u64s)
703 {
704         struct bset_tree *t = bch2_bkey_to_bset(b, where);
705         struct btree_iter *linked;
706
707         if (node_iter != &iter->l[b->c.level].iter) {
708                 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
709                                            where, clobber_u64s, new_u64s);
710
711                 if (debug_check_iterators(iter->trans->c))
712                         bch2_btree_node_iter_verify(node_iter, b);
713         }
714
715         trans_for_each_iter_with_node(iter->trans, b, linked) {
716                 __bch2_btree_node_iter_fix(linked, b,
717                                            &linked->l[b->c.level].iter, t,
718                                            where, clobber_u64s, new_u64s);
719                 bch2_btree_iter_verify_level(linked, b->c.level);
720         }
721 }
722
723 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
724                                                   struct btree_iter_level *l,
725                                                   struct bkey *u,
726                                                   struct bkey_packed *k)
727 {
728         struct bkey_s_c ret;
729
730         if (unlikely(!k)) {
731                 /*
732                  * signal to bch2_btree_iter_peek_slot() that we're currently at
733                  * a hole
734                  */
735                 u->type = KEY_TYPE_deleted;
736                 return bkey_s_c_null;
737         }
738
739         ret = bkey_disassemble(l->b, k, u);
740
741         if (debug_check_bkeys(iter->trans->c))
742                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
743
744         return ret;
745 }
746
747 /* peek_all() doesn't skip deleted keys */
748 static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
749                                                     struct btree_iter_level *l,
750                                                     struct bkey *u)
751 {
752         return __btree_iter_unpack(iter, l, u,
753                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
754 }
755
756 static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
757                                                 struct btree_iter_level *l)
758 {
759         return __btree_iter_unpack(iter, l, &iter->k,
760                         bch2_btree_node_iter_peek(&l->iter, l->b));
761 }
762
763 static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
764                                                 struct btree_iter_level *l)
765 {
766         return __btree_iter_unpack(iter, l, &iter->k,
767                         bch2_btree_node_iter_prev(&l->iter, l->b));
768 }
769
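/*
 * Advance the node iterator past keys that sort before the iterator's search
 * position; gives up and returns false if more than @max_advance keys (when
 * positive) would have to be skipped:
 */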
770 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
771                                              struct btree_iter_level *l,
772                                              int max_advance)
773 {
774         struct bpos pos = btree_iter_search_key(iter);
775         struct bkey_packed *k;
776         int nr_advanced = 0;
777
778         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
779                bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
780                 if (max_advance > 0 && nr_advanced >= max_advance)
781                         return false;
782
783                 bch2_btree_node_iter_advance(&l->iter, l->b);
784                 nr_advanced++;
785         }
786
787         return true;
788 }
789
790 /*
791  * Verify that iterator for parent node points to child node:
792  */
793 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
794 {
795         struct btree_iter_level *l;
796         unsigned plevel;
797         bool parent_locked;
798         struct bkey_packed *k;
799
800         if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
801                 return;
802
803         plevel = b->c.level + 1;
804         if (!btree_iter_node(iter, plevel))
805                 return;
806
807         parent_locked = btree_node_locked(iter, plevel);
808
809         if (!bch2_btree_node_relock(iter, plevel))
810                 return;
811
812         l = &iter->l[plevel];
813         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
814         if (!k ||
815             bkey_deleted(k) ||
816             bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
817                 char buf[100];
818                 struct bkey uk = bkey_unpack_key(b, k);
819
820                 bch2_bkey_to_text(&PBUF(buf), &uk);
821                 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
822                       buf, b->key.k.p.inode, b->key.k.p.offset);
823         }
824
825         if (!parent_locked)
826                 btree_node_unlock(iter, b->c.level + 1);
827 }
828
829 static inline void __btree_iter_init(struct btree_iter *iter,
830                                      unsigned level)
831 {
832         struct bpos pos = btree_iter_search_key(iter);
833         struct btree_iter_level *l = &iter->l[level];
834
835         bch2_btree_node_iter_init(&l->iter, l->b, &pos);
836
837         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
838 }
839
840 static inline void btree_iter_node_set(struct btree_iter *iter,
841                                        struct btree *b)
842 {
843         BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
844
845         btree_iter_verify_new_node(iter, b);
846
847         EBUG_ON(!btree_iter_pos_in_node(iter, b));
848         EBUG_ON(b->c.lock.state.seq & 1);
849
850         iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
851         iter->l[b->c.level].b = b;
852         __btree_iter_init(iter, b->c.level);
853 }
854
855 /*
856  * A btree node is being replaced - update the iterator to point to the new
857  * node:
858  */
859 void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
860 {
861         enum btree_node_locked_type t;
862         struct btree_iter *linked;
863
864         trans_for_each_iter(iter->trans, linked)
865                 if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
866                     btree_iter_pos_in_node(linked, b)) {
867                         /*
868                          * bch2_btree_iter_node_drop() has already been called -
869                          * the old node we're replacing has already been
870                          * unlocked and the pointer invalidated
871                          */
872                         BUG_ON(btree_node_locked(linked, b->c.level));
873
874                         t = btree_lock_want(linked, b->c.level);
875                         if (t != BTREE_NODE_UNLOCKED) {
876                                 six_lock_increment(&b->c.lock, t);
877                                 mark_btree_node_locked(linked, b->c.level, t);
878                         }
879
880                         btree_iter_node_set(linked, b);
881                 }
882 }
883
884 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
885 {
886         struct btree_iter *linked;
887         unsigned level = b->c.level;
888
889         trans_for_each_iter(iter->trans, linked)
890                 if (linked->l[level].b == b) {
891                         __btree_node_unlock(linked, level);
892                         linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
893                 }
894 }
895
896 /*
897  * A btree node has been modified in such a way as to invalidate iterators - fix
898  * them:
899  */
900 void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
901 {
902         struct btree_iter *linked;
903
904         trans_for_each_iter_with_node(iter->trans, b, linked)
905                 __btree_iter_init(linked, b->c.level);
906 }
907
908 static int lock_root_check_fn(struct six_lock *lock, void *p)
909 {
910         struct btree *b = container_of(lock, struct btree, c.lock);
911         struct btree **rootp = p;
912
913         return b == *rootp ? 0 : -1;
914 }
915
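/*
 * Lock the root of the btree this iterator is for: the root can be freed or
 * change level while we wait for the lock, so after locking we recheck that
 * we actually locked the current root, and retry if not:
 */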
916 static inline int btree_iter_lock_root(struct btree_iter *iter,
917                                        unsigned depth_want)
918 {
919         struct bch_fs *c = iter->trans->c;
920         struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
921         enum six_lock_type lock_type;
922         unsigned i;
923
924         EBUG_ON(iter->nodes_locked);
925
926         while (1) {
927                 b = READ_ONCE(*rootp);
928                 iter->level = READ_ONCE(b->c.level);
929
930                 if (unlikely(iter->level < depth_want)) {
931                         /*
932                          * the root is at a lower depth than the depth we want:
933                          * got to the end of the btree, or we're walking nodes
934                          * greater than some depth and there are no nodes >=
935                          * that depth
936                          */
937                         iter->level = depth_want;
938                         for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
939                                 iter->l[i].b = NULL;
940                         return 1;
941                 }
942
943                 lock_type = __btree_lock_want(iter, iter->level);
944                 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
945                                               iter, lock_type,
946                                               lock_root_check_fn, rootp)))
947                         return -EINTR;
948
949                 if (likely(b == READ_ONCE(*rootp) &&
950                            b->c.level == iter->level &&
951                            !race_fault())) {
952                         for (i = 0; i < iter->level; i++)
953                                 iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
954                         iter->l[iter->level].b = b;
955                         for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
956                                 iter->l[i].b = NULL;
957
958                         mark_btree_node_locked(iter, iter->level, lock_type);
959                         btree_iter_node_set(iter, b);
960                         return 0;
961                 }
962
963                 six_unlock_type(&b->c.lock, lock_type);
964         }
965 }
966
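/*
 * Prefetch a few of the child nodes we're likely to descend into next; how
 * many depends on the level and on whether the filesystem has finished
 * starting up:
 */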
967 noinline
968 static void btree_iter_prefetch(struct btree_iter *iter)
969 {
970         struct bch_fs *c = iter->trans->c;
971         struct btree_iter_level *l = &iter->l[iter->level];
972         struct btree_node_iter node_iter = l->iter;
973         struct bkey_packed *k;
974         BKEY_PADDED(k) tmp;
975         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
976                 ? (iter->level > 1 ? 0 :  2)
977                 : (iter->level > 1 ? 1 : 16);
978         bool was_locked = btree_node_locked(iter, iter->level);
979
980         while (nr) {
981                 if (!bch2_btree_node_relock(iter, iter->level))
982                         return;
983
984                 bch2_btree_node_iter_advance(&node_iter, l->b);
985                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
986                 if (!k)
987                         break;
988
989                 bch2_bkey_unpack(l->b, &tmp.k, k);
990                 bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
991         }
992
993         if (!was_locked)
994                 btree_node_unlock(iter, iter->level);
995 }
996
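/*
 * Stash a pointer to the in-memory btree node in the btree_ptr_v2 key in the
 * parent node, so that future lookups of this child can skip the hash table:
 */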
997 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
998                                             unsigned plevel, struct btree *b)
999 {
1000         struct btree_iter_level *l = &iter->l[plevel];
1001         bool locked = btree_node_locked(iter, plevel);
1002         struct bkey_packed *k;
1003         struct bch_btree_ptr_v2 *bp;
1004
1005         if (!bch2_btree_node_relock(iter, plevel))
1006                 return;
1007
1008         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1009         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
1010
1011         bp = (void *) bkeyp_val(&l->b->format, k);
1012         bp->mem_ptr = (unsigned long)b;
1013
1014         if (!locked)
1015                 btree_node_unlock(iter, plevel);
1016 }
1017
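/*
 * Descend one level: unpack the child pointer the node iterator points at,
 * get and lock that child node, and make it the iterator's current node:
 */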
1018 static __always_inline int btree_iter_down(struct btree_iter *iter)
1019 {
1020         struct bch_fs *c = iter->trans->c;
1021         struct btree_iter_level *l = &iter->l[iter->level];
1022         struct btree *b;
1023         unsigned level = iter->level - 1;
1024         enum six_lock_type lock_type = __btree_lock_want(iter, level);
1025         BKEY_PADDED(k) tmp;
1026
1027         EBUG_ON(!btree_node_locked(iter, iter->level));
1028
1029         bch2_bkey_unpack(l->b, &tmp.k,
1030                          bch2_btree_node_iter_peek(&l->iter, l->b));
1031
1032         b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
1033         if (unlikely(IS_ERR(b)))
1034                 return PTR_ERR(b);
1035
1036         mark_btree_node_locked(iter, level, lock_type);
1037         btree_iter_node_set(iter, b);
1038
1039         if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
1040             unlikely(b != btree_node_mem_ptr(&tmp.k)))
1041                 btree_node_mem_ptr_set(iter, level + 1, b);
1042
1043         if (iter->flags & BTREE_ITER_PREFETCH)
1044                 btree_iter_prefetch(iter);
1045
1046         iter->level = level;
1047
1048         return 0;
1049 }
1050
1051 static void btree_iter_up(struct btree_iter *iter)
1052 {
1053         btree_node_unlock(iter, iter->level++);
1054 }
1055
1056 static int btree_iter_traverse_one(struct btree_iter *);
1057
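/*
 * Traverse every iterator in the transaction, in btree/key order, after
 * unlocking everything - used when a single traversal fails (e.g. because it
 * would have deadlocked) and we have to retry from a consistent state:
 */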
1058 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
1059 {
1060         struct bch_fs *c = trans->c;
1061         struct btree_iter *iter;
1062         u8 sorted[BTREE_ITER_MAX];
1063         unsigned i, nr_sorted = 0;
1064
1065         if (trans->in_traverse_all)
1066                 return -EINTR;
1067
1068         trans->in_traverse_all = true;
1069 retry_all:
1070         nr_sorted = 0;
1071
1072         trans_for_each_iter(trans, iter)
1073                 sorted[nr_sorted++] = iter->idx;
1074
1075 #define btree_iter_cmp_by_idx(_l, _r)                           \
1076                 btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
1077
1078         bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
1079 #undef btree_iter_cmp_by_idx
1080         bch2_trans_unlock(trans);
1081
1082         if (unlikely(ret == -ENOMEM)) {
1083                 struct closure cl;
1084
1085                 closure_init_stack(&cl);
1086
1087                 do {
1088                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1089                         closure_sync(&cl);
1090                 } while (ret);
1091         }
1092
1093         if (unlikely(ret == -EIO)) {
1094                 trans->error = true;
1095                 goto out;
1096         }
1097
1098         BUG_ON(ret && ret != -EINTR);
1099
1100         /* Now, redo traversals in correct order: */
1101         for (i = 0; i < nr_sorted; i++) {
1102                 unsigned idx = sorted[i];
1103
1104                 /*
1105                  * successfully traversing one iterator can cause another to be
1106                  * unlinked, in btree_key_cache_fill()
1107                  */
1108                 if (!(trans->iters_linked & (1ULL << idx)))
1109                         continue;
1110
1111                 ret = btree_iter_traverse_one(&trans->iters[idx]);
1112                 if (ret)
1113                         goto retry_all;
1114         }
1115
1116         if (hweight64(trans->iters_live) > 1)
1117                 ret = -EINTR;
1118         else
1119                 trans_for_each_iter(trans, iter)
1120                         if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
1121                                 ret = -EINTR;
1122                                 break;
1123                         }
1124 out:
1125         bch2_btree_cache_cannibalize_unlock(c);
1126
1127         trans->in_traverse_all = false;
1128         return ret;
1129 }
1130
1131 int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1132 {
1133         return __btree_iter_traverse_all(trans, 0);
1134 }
1135
1136 static inline bool btree_iter_good_node(struct btree_iter *iter,
1137                                         unsigned l, int check_pos)
1138 {
1139         if (!is_btree_node(iter, l) ||
1140             !bch2_btree_node_relock(iter, l))
1141                 return false;
1142
1143         if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
1144                 return false;
1145         if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
1146                 return false;
1147         return true;
1148 }
1149
1150 static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
1151                                                      int check_pos)
1152 {
1153         unsigned l = iter->level;
1154
1155         while (btree_iter_node(iter, l) &&
1156                !btree_iter_good_node(iter, l, check_pos)) {
1157                 btree_node_unlock(iter, l);
1158                 iter->l[l].b = BTREE_ITER_NO_NODE_UP;
1159                 l++;
1160         }
1161
1162         return l;
1163 }
1164
1165 /*
1166  * This is the main state machine for walking down the btree - walks down to a
1167  * specified depth
1168  *
1169  * Returns 0 on success, -EIO on error (error reading in a btree node).
1170  *
1171  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1172  * stashed in the iterator and returned from bch2_trans_exit().
1173  */
1174 static int btree_iter_traverse_one(struct btree_iter *iter)
1175 {
1176         unsigned depth_want = iter->level;
1177
1178         /*
1179          * if we need interior nodes locked, call btree_iter_relock() to make
1180          * sure we walk back up enough that we lock them:
1181          */
1182         if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
1183             iter->locks_want > 1)
1184                 bch2_btree_iter_relock(iter, false);
1185
1186         if (btree_iter_type(iter) == BTREE_ITER_CACHED)
1187                 return bch2_btree_iter_traverse_cached(iter);
1188
1189         if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
1190                 return 0;
1191
1192         if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1193                 return 0;
1194
1195         /*
1196          * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1197          * here unnecessary
1198          */
1199         iter->level = btree_iter_up_until_good_node(iter, 0);
1200
1201         /*
1202          * If we've got a btree node locked (i.e. we aren't about to relock the
1203          * root) - advance its node iterator if necessary:
1204          *
1205          * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1206          */
1207         if (btree_iter_node(iter, iter->level)) {
1208                 BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));
1209
1210                 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1211         }
1212
1213         /*
1214          * Note: iter->nodes[iter->level] may be temporarily NULL here - that
1215          * would indicate to other code that we got to the end of the btree,
1216          * here it indicates that relocking the root failed - it's critical that
1217          * btree_iter_lock_root() comes next and that it can't fail
1218          */
1219         while (iter->level > depth_want) {
1220                 int ret = btree_iter_node(iter, iter->level)
1221                         ? btree_iter_down(iter)
1222                         : btree_iter_lock_root(iter, depth_want);
1223                 if (unlikely(ret)) {
1224                         if (ret == 1)
1225                                 return 0;
1226
1227                         iter->level = depth_want;
1228
1229                         if (ret == -EIO) {
1230                                 iter->flags |= BTREE_ITER_ERROR;
1231                                 iter->l[iter->level].b =
1232                                         BTREE_ITER_NO_NODE_ERROR;
1233                         } else {
1234                                 iter->l[iter->level].b =
1235                                         BTREE_ITER_NO_NODE_DOWN;
1236                         }
1237                         return ret;
1238                 }
1239         }
1240
1241         iter->uptodate = BTREE_ITER_NEED_PEEK;
1242
1243         bch2_btree_iter_verify(iter);
1244         return 0;
1245 }
1246
1247 int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1248 {
1249         struct btree_trans *trans = iter->trans;
1250         int ret;
1251
1252         ret =   bch2_trans_cond_resched(trans) ?:
1253                 btree_iter_traverse_one(iter);
1254         if (unlikely(ret))
1255                 ret = __btree_iter_traverse_all(trans, ret);
1256
1257         return ret;
1258 }
1259
1260 static inline void bch2_btree_iter_checks(struct btree_iter *iter,
1261                                           enum btree_iter_type type)
1262 {
1263         EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1264         EBUG_ON(btree_iter_type(iter) != type);
1265
1266         BUG_ON(type == BTREE_ITER_KEYS &&
1267                (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
1268                 bkey_cmp(iter->pos, iter->k.p) > 0));
1269
1270         bch2_btree_iter_verify_locks(iter);
1271         bch2_btree_iter_verify_level(iter, iter->level);
1272 }
1273
1274 /* Iterate across nodes (leaf and interior nodes) */
1275
1276 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1277 {
1278         struct btree *b;
1279         int ret;
1280
1281         bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1282
1283         if (iter->uptodate == BTREE_ITER_UPTODATE)
1284                 return iter->l[iter->level].b;
1285
1286         ret = bch2_btree_iter_traverse(iter);
1287         if (ret)
1288                 return NULL;
1289
1290         b = btree_iter_node(iter, iter->level);
1291         if (!b)
1292                 return NULL;
1293
1294         BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1295
1296         iter->pos = b->key.k.p;
1297         iter->uptodate = BTREE_ITER_UPTODATE;
1298
1299         bch2_btree_iter_verify(iter);
1300
1301         return b;
1302 }
1303
1304 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1305 {
1306         struct btree *b;
1307         int ret;
1308
1309         bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1310
1311         /* already got to end? */
1312         if (!btree_iter_node(iter, iter->level))
1313                 return NULL;
1314
1315         bch2_trans_cond_resched(iter->trans);
1316
1317         btree_iter_up(iter);
1318
1319         if (!bch2_btree_node_relock(iter, iter->level))
1320                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1321
1322         ret = bch2_btree_iter_traverse(iter);
1323         if (ret)
1324                 return NULL;
1325
1326         /* got to end? */
1327         b = btree_iter_node(iter, iter->level);
1328         if (!b)
1329                 return NULL;
1330
1331         if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1332                 /*
1333                  * Haven't gotten to the end of the parent node: go back down to
1334                  * the next child node
1335                  */
1336
1337                 /*
1338                  * We don't really want to be unlocking here except we can't
1339                  * directly tell btree_iter_traverse() "traverse to this level"
1340                  * except by setting iter->level, so we have to unlock so we
1341                  * don't screw up our lock invariants:
1342                  */
1343                 if (btree_node_read_locked(iter, iter->level))
1344                         btree_node_unlock(iter, iter->level);
1345
1346                 iter->pos       = bkey_successor(iter->pos);
1347                 iter->level     = iter->min_depth;
1348
1349                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1350                 ret = bch2_btree_iter_traverse(iter);
1351                 if (ret)
1352                         return NULL;
1353
1354                 b = iter->l[iter->level].b;
1355         }
1356
1357         iter->pos = b->key.k.p;
1358         iter->uptodate = BTREE_ITER_UPTODATE;
1359
1360         bch2_btree_iter_verify(iter);
1361
1362         return b;
1363 }
1364
1365 /* Iterate across keys (in leaf nodes only) */
1366
1367 void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1368 {
1369         struct btree_iter_level *l = &iter->l[0];
1370
1371         EBUG_ON(iter->level != 0);
1372         EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1373         EBUG_ON(!btree_node_locked(iter, 0));
1374         EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1375
1376         bkey_init(&iter->k);
1377         iter->k.p = iter->pos = new_pos;
1378         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1379
1380         btree_iter_advance_to_pos(iter, l, -1);
1381
1382         /*
1383          * XXX:
1384          * keeping a node locked that's outside (even just outside) iter->pos
1385          * breaks __bch2_btree_node_lock(). This seems to only affect
1386          * bch2_btree_node_get_sibling so for now it's fixed there, but we
1387          * should try to get rid of this corner case.
1388          *
1389          * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK)
1390          */
1391
1392         if (bch2_btree_node_iter_end(&l->iter) &&
1393             btree_iter_pos_after_node(iter, l->b))
1394                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1395 }
1396
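/*
 * The iterator's position moved by @cmp (the sign of new pos vs. old pos):
 * walk up to a node that still covers the new position, reposition the node
 * iterator, and mark the iterator as needing a peek or a full traverse:
 */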
1397 static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
1398 {
1399         unsigned l = iter->level;
1400
1401         if (!cmp)
1402                 goto out;
1403
1404         if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
1405                 btree_node_unlock(iter, 0);
1406                 iter->l[0].b = BTREE_ITER_NO_NODE_UP;
1407                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1408                 return;
1409         }
1410
1411         l = btree_iter_up_until_good_node(iter, cmp);
1412
1413         if (btree_iter_node(iter, l)) {
1414                 /*
1415                  * We might have to skip over many keys, or just a few: try
1416                  * advancing the node iterator, and if we have to skip over too
1417                  * many keys just reinit it (or if we're rewinding, since that
1418                  * is expensive).
1419                  */
1420                 if (cmp < 0 ||
1421                     !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
1422                         __btree_iter_init(iter, l);
1423
1424                 /* Don't leave it locked if we're not supposed to: */
1425                 if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
1426                         btree_node_unlock(iter, l);
1427         }
1428 out:
1429         if (l != iter->level)
1430                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1431         else
1432                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1433 }
1434
1435 void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
1436                                bool strictly_greater)
1437 {
1438         struct bpos old = btree_iter_search_key(iter);
1439         int cmp;
1440
1441         iter->flags &= ~BTREE_ITER_IS_EXTENTS;
1442         iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
1443
1444         bkey_init(&iter->k);
1445         iter->k.p = iter->pos = new_pos;
1446
1447         cmp = bkey_cmp(btree_iter_search_key(iter), old);
1448
1449         btree_iter_pos_changed(iter, cmp);
1450 }
1451
1452 void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1453 {
1454         int cmp = bkey_cmp(new_pos, iter->pos);
1455
1456         bkey_init(&iter->k);
1457         iter->k.p = iter->pos = new_pos;
1458
1459         btree_iter_pos_changed(iter, cmp);
1460 }
1461
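/*
 * Helpers for when peek/prev runs off the end of the current leaf: move the
 * iterator's position just past the end (or just before the start) of the
 * leaf and mark it for re-traversal:
 */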
1462 static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
1463 {
1464         struct btree_iter_level *l = &iter->l[0];
1465         bool ret;
1466
1467         bkey_init(&iter->k);
1468         iter->k.p = iter->pos = l->b->key.k.p;
1469
1470         ret = bkey_cmp(iter->pos, POS_MAX) != 0;
1471         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1472                 iter->k.p = iter->pos = bkey_successor(iter->pos);
1473
1474         btree_iter_pos_changed(iter, 1);
1475         return ret;
1476 }
1477
1478 static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
1479 {
1480         struct btree_iter_level *l = &iter->l[0];
1481         bool ret;
1482
1483         bkey_init(&iter->k);
1484         iter->k.p = iter->pos = l->b->data->min_key;
1485         iter->uptodate  = BTREE_ITER_NEED_TRAVERSE;
1486
1487         ret = bkey_cmp(iter->pos, POS_MIN) != 0;
1488         if (ret) {
1489                 iter->k.p = iter->pos = bkey_predecessor(iter->pos);
1490
1491                 if (iter->flags & BTREE_ITER_IS_EXTENTS)
1492                         iter->k.p = iter->pos = bkey_predecessor(iter->pos);
1493         }
1494
1495         btree_iter_pos_changed(iter, -1);
1496         return ret;
1497 }
1498
1499 /**
1500  * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key
1501  * it currently points to
1502  */
1503 static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1504 {
1505         struct btree_iter_level *l = &iter->l[0];
1506         struct bkey_s_c ret = { .k = &iter->k };
1507
1508         if (!bkey_deleted(&iter->k)) {
1509                 struct bkey_packed *_k =
1510                         __bch2_btree_node_iter_peek_all(&l->iter, l->b);
1511
1512                 ret.v = bkeyp_val(&l->b->format, _k);
1513
1514                 if (debug_check_iterators(iter->trans->c)) {
1515                         struct bkey k = bkey_unpack_key(l->b, _k);
1516
1517                         BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
1518                 }
1519
1520                 if (debug_check_bkeys(iter->trans->c))
1521                         bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1522         }
1523
1524         return ret;
1525 }
1526
1527 /**
1528  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
1529  * current position
1530  */
1531 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1532 {
1533         struct btree_iter_level *l = &iter->l[0];
1534         struct bkey_s_c k;
1535         int ret;
1536
1537         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1538
1539         if (iter->uptodate == BTREE_ITER_UPTODATE &&
1540             !bkey_deleted(&iter->k))
1541                 return btree_iter_peek_uptodate(iter);
1542
1543         while (1) {
1544                 ret = bch2_btree_iter_traverse(iter);
1545                 if (unlikely(ret))
1546                         return bkey_s_c_err(ret);
1547
1548                 k = __btree_iter_peek(iter, l);
1549                 if (likely(k.k))
1550                         break;
1551
1552                 if (!btree_iter_set_pos_to_next_leaf(iter))
1553                         return bkey_s_c_null;
1554         }
1555
1556         /*
1557          * iter->pos should always be equal to the key we just
1558          * returned - except extents can straddle iter->pos:
1559          */
1560         if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1561             bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1562                 iter->pos = bkey_start_pos(k.k);
1563
1564         iter->uptodate = BTREE_ITER_UPTODATE;
1565
1566         bch2_btree_iter_verify_level(iter, 0);
1567         return k;
1568 }
1569
1570 /**
1571  * bch2_btree_iter_next: returns first key greater than iterator's current
1572  * position
1573  */
1574 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1575 {
1576         if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
1577                 return bkey_s_c_null;
1578
1579         bch2_btree_iter_set_pos(iter,
1580                 (iter->flags & BTREE_ITER_IS_EXTENTS)
1581                 ? iter->k.p
1582                 : bkey_successor(iter->k.p));
1583
1584         return bch2_btree_iter_peek(iter);
1585 }
1586
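/*
 * Find the first pending update in this transaction (in the updates2 list)
 * for the iterator's btree at or after its search key, if any.
 */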
1587 static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
1588 {
1589         struct bpos pos = btree_iter_search_key(iter);
1590         struct btree_trans *trans = iter->trans;
1591         struct btree_insert_entry *i;
1592
1593         trans_for_each_update2(trans, i)
1594                 if ((cmp_int(iter->btree_id,    i->iter->btree_id) ?:
1595                      bkey_cmp(pos,              i->k->k.p)) <= 0)
1596                         break;
1597
1598         return i < trans->updates2 + trans->nr_updates2 &&
1599                 iter->btree_id == i->iter->btree_id
1600                 ? bkey_i_to_s_c(i->k)
1601                 : bkey_s_c_null;
1602 }
1603
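/*
 * Peek at both the btree and the transaction's pending updates, returning
 * whichever key sorts first; on a tie the pending update wins, so
 * uncommitted changes are visible through the iterator.
 */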
1604 static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
1605 {
1606         struct btree_iter_level *l = &iter->l[0];
1607         struct bkey_s_c k = __btree_iter_peek(iter, l);
1608         struct bkey_s_c u = __btree_trans_updates_peek(iter);
1609
1610         if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0))
1611                 return k;
1612         if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) {
1613                 iter->k = *u.k;
1614                 return u;
1615         }
1616         return bkey_s_c_null;
1617 }
1618
1619 struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
1620 {
1621         struct bkey_s_c k;
1622         int ret;
1623
1624         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1625
1626         while (1) {
1627                 ret = bch2_btree_iter_traverse(iter);
1628                 if (unlikely(ret))
1629                         return bkey_s_c_err(ret);
1630
1631                 k = __bch2_btree_iter_peek_with_updates(iter);
1632
1633                 if (k.k && bkey_deleted(k.k)) {
1634                         bch2_btree_iter_set_pos(iter,
1635                                 (iter->flags & BTREE_ITER_IS_EXTENTS)
1636                                 ? iter->k.p
1637                                 : bkey_successor(iter->k.p));
1638                         continue;
1639                 }
1640
1641                 if (likely(k.k))
1642                         break;
1643
1644                 if (!btree_iter_set_pos_to_next_leaf(iter))
1645                         return bkey_s_c_null;
1646         }
1647
1648         /*
1649          * iter->pos should always be equal to the key we just
1650          * returned - except extents can straddle iter->pos:
1651          */
1652         if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1653             bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1654                 iter->pos = bkey_start_pos(k.k);
1655
1656         iter->uptodate = BTREE_ITER_UPTODATE;
1657         return k;
1658 }
1659
1660 struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
1661 {
1662         if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
1663                 return bkey_s_c_null;
1664
1665         bch2_btree_iter_set_pos(iter,
1666                 (iter->flags & BTREE_ITER_IS_EXTENTS)
1667                 ? iter->k.p
1668                 : bkey_successor(iter->k.p));
1669
1670         return bch2_btree_iter_peek_with_updates(iter);
1671 }
1672
1673 /**
1674  * bch2_btree_iter_peek_prev: returns first key less than or equal to
1675  * iterator's current position
1676  */
1677 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
1678 {
1679         struct bpos pos = iter->pos;
1680         struct btree_iter_level *l = &iter->l[0];
1681         struct bkey_s_c k;
1682         int ret;
1683
1684         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1685
1686         if (iter->uptodate == BTREE_ITER_UPTODATE &&
1687             !bkey_deleted(&iter->k))
1688                 return btree_iter_peek_uptodate(iter);
1689
1690         while (1) {
1691                 ret = bch2_btree_iter_traverse(iter);
1692                 if (unlikely(ret))
1693                         return bkey_s_c_err(ret);
1694
1695                 k = __btree_iter_peek(iter, l);
1696                 if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0)
1697                         k = __btree_iter_prev(iter, l);
1698
1699                 if (likely(k.k))
1700                         break;
1701
1702                 if (!btree_iter_set_pos_to_prev_leaf(iter))
1703                         return bkey_s_c_null;
1704         }
1705
1706         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
1707         iter->pos       = bkey_start_pos(k.k);
1708         iter->uptodate  = BTREE_ITER_UPTODATE;
1709         return k;
1710 }
1711
1712 /**
1713  * bch2_btree_iter_prev: returns first key less than iterator's current
1714  * position
1715  */
1716 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1717 {
1718         struct bpos pos = bkey_start_pos(&iter->k);
1719
1720         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1721
1722         if (unlikely(!bkey_cmp(pos, POS_MIN)))
1723                 return bkey_s_c_null;
1724
1725         bch2_btree_iter_set_pos(iter, bkey_predecessor(pos));
1726
1727         return bch2_btree_iter_peek_prev(iter);
1728 }
1729
1730 static inline struct bkey_s_c
1731 __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1732 {
1733         struct btree_iter_level *l = &iter->l[0];
1734         struct btree_node_iter node_iter;
1735         struct bkey_s_c k;
1736         struct bkey n;
1737         int ret;
1738
1739         /* keys & holes can't span inode numbers: */
1740         if (iter->pos.offset == KEY_OFFSET_MAX) {
1741                 if (iter->pos.inode == KEY_INODE_MAX)
1742                         return bkey_s_c_null;
1743
1744                 bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
1745
1746                 ret = bch2_btree_iter_traverse(iter);
1747                 if (unlikely(ret))
1748                         return bkey_s_c_err(ret);
1749         }
1750
1751         /*
1752          * iterator is now at the correct position for inserting at iter->pos,
1753                  * but we need to keep iterating until we find the first non-whiteout so
1754          * we know how big a hole we have, if any:
1755          */
1756
1757         node_iter = l->iter;
1758         k = __btree_iter_unpack(iter, l, &iter->k,
1759                 bch2_btree_node_iter_peek(&node_iter, l->b));
1760
1761         if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
1762                 /*
1763                  * We're not setting iter->uptodate because the node iterator
1764                  * doesn't necessarily point at the key we're returning:
1765                  */
1766
1767                 EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
1768                 bch2_btree_iter_verify_level(iter, 0);
1769                 return k;
1770         }
1771
1772         /* hole */
1773
1774         if (!k.k)
1775                 k.k = &l->b->key.k;
1776
1777         bkey_init(&n);
1778         n.p = iter->pos;
1779         bch2_key_resize(&n,
1780                         min_t(u64, KEY_SIZE_MAX,
1781                               (k.k->p.inode == n.p.inode
1782                                ? bkey_start_offset(k.k)
1783                                : KEY_OFFSET_MAX) -
1784                               n.p.offset));
1785
1786         EBUG_ON(!n.size);
1787
1788         iter->k = n;
1789         iter->uptodate = BTREE_ITER_UPTODATE;
1790
1791         bch2_btree_iter_verify_level(iter, 0);
1792         return (struct bkey_s_c) { &iter->k, NULL };
1793 }
1794
1795 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1796 {
1797         struct btree_iter_level *l = &iter->l[0];
1798         struct bkey_s_c k;
1799         int ret;
1800
1801         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1802
1803         if (iter->uptodate == BTREE_ITER_UPTODATE)
1804                 return btree_iter_peek_uptodate(iter);
1805
1806         ret = bch2_btree_iter_traverse(iter);
1807         if (unlikely(ret))
1808                 return bkey_s_c_err(ret);
1809
1810         if (iter->flags & BTREE_ITER_IS_EXTENTS)
1811                 return __bch2_btree_iter_peek_slot_extents(iter);
1812
1813         k = __btree_iter_peek_all(iter, l, &iter->k);
1814
1815         EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
1816
1817         if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
1818                 /* hole */
1819                 bkey_init(&iter->k);
1820                 iter->k.p = iter->pos;
1821                 k = (struct bkey_s_c) { &iter->k, NULL };
1822         }
1823
1824         iter->uptodate = BTREE_ITER_UPTODATE;
1825         bch2_btree_iter_verify_level(iter, 0);
1826         return k;
1827 }
1828
1829 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1830 {
1831         if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
1832                 return bkey_s_c_null;
1833
1834         bch2_btree_iter_set_pos(iter,
1835                 (iter->flags & BTREE_ITER_IS_EXTENTS)
1836                 ? iter->k.p
1837                 : bkey_successor(iter->k.p));
1838
1839         return bch2_btree_iter_peek_slot(iter);
1840 }
1841
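/*
 * Return the key a BTREE_ITER_CACHED iterator points to, taken from the
 * btree key cache rather than from the btree itself.
 */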
1842 struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
1843 {
1844         struct bkey_cached *ck;
1845         int ret;
1846
1847         bch2_btree_iter_checks(iter, BTREE_ITER_CACHED);
1848
1849         ret = bch2_btree_iter_traverse(iter);
1850         if (unlikely(ret))
1851                 return bkey_s_c_err(ret);
1852
1853         ck = (void *) iter->l[0].b;
1854
1855         EBUG_ON(iter->btree_id != ck->key.btree_id ||
1856                 bkey_cmp(iter->pos, ck->key.pos));
1857         BUG_ON(!ck->valid);
1858
1859         return bkey_i_to_s_c(ck->k);
1860 }
1861
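/* Initialize a freshly allocated iterator for the given btree and position: */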
1862 static inline void bch2_btree_iter_init(struct btree_trans *trans,
1863                         struct btree_iter *iter, enum btree_id btree_id,
1864                         struct bpos pos, unsigned flags)
1865 {
1866         struct bch_fs *c = trans->c;
1867         unsigned i;
1868
1869         if (btree_node_type_is_extents(btree_id) &&
1870             !(flags & BTREE_ITER_NODES))
1871                 flags |= BTREE_ITER_IS_EXTENTS;
1872
1873         iter->trans                     = trans;
1874         iter->pos                       = pos;
1875         bkey_init(&iter->k);
1876         iter->k.p                       = pos;
1877         iter->flags                     = flags;
1878         iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1879         iter->btree_id                  = btree_id;
1880         iter->level                     = 0;
1881         iter->min_depth                 = 0;
1882         iter->locks_want                = flags & BTREE_ITER_INTENT ? 1 : 0;
1883         iter->nodes_locked              = 0;
1884         iter->nodes_intent_locked       = 0;
1885         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1886                 iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
1887
1888         prefetch(c->btree_roots[btree_id].b);
1889 }
1890
1891 /* new transactional stuff: */
1892
1893 static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1894                                           unsigned idx)
1895 {
1896         __bch2_btree_iter_unlock(&trans->iters[idx]);
1897         trans->iters_linked             &= ~(1ULL << idx);
1898         trans->iters_live               &= ~(1ULL << idx);
1899         trans->iters_touched            &= ~(1ULL << idx);
1900 }
1901
1902 int bch2_trans_iter_put(struct btree_trans *trans,
1903                         struct btree_iter *iter)
1904 {
1905         int ret;
1906
1907         if (IS_ERR_OR_NULL(iter))
1908                 return 0;
1909
1910         BUG_ON(trans->iters + iter->idx != iter);
1911
1912         ret = btree_iter_err(iter);
1913
1914         if (!(trans->iters_touched & (1ULL << iter->idx)) &&
1915             !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
1916                 __bch2_trans_iter_free(trans, iter->idx);
1917
1918         trans->iters_live       &= ~(1ULL << iter->idx);
1919         return ret;
1920 }
1921
1922 int bch2_trans_iter_free(struct btree_trans *trans,
1923                          struct btree_iter *iter)
1924 {
1925         if (IS_ERR_OR_NULL(iter))
1926                 return 0;
1927
1928         trans->iters_touched &= ~(1ULL << iter->idx);
1929
1930         return bch2_trans_iter_put(trans, iter);
1931 }
1932
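/*
 * Grow the arrays backing the transaction's iterators and updates, falling
 * back to the btree_iters_pool mempool if the allocation fails.  Returns
 * -EINTR (forcing a transaction restart) if live iterators were reallocated
 * out from under their users.
 */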
1933 static int bch2_trans_realloc_iters(struct btree_trans *trans,
1934                                     unsigned new_size)
1935 {
1936         void *p, *new_iters, *new_updates, *new_updates2;
1937         size_t iters_bytes;
1938         size_t updates_bytes;
1939
1940         new_size = roundup_pow_of_two(new_size);
1941
1942         BUG_ON(new_size > BTREE_ITER_MAX);
1943
1944         if (new_size <= trans->size)
1945                 return 0;
1946
1947         BUG_ON(trans->used_mempool);
1948
1949         bch2_trans_unlock(trans);
1950
1951         iters_bytes     = sizeof(struct btree_iter) * new_size;
1952         updates_bytes   = sizeof(struct btree_insert_entry) * new_size;
1953
1954         p = kmalloc(iters_bytes +
1955                     updates_bytes +
1956                     updates_bytes, GFP_NOFS);
1957         if (p)
1958                 goto success;
1959
1960         p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
1961         new_size = BTREE_ITER_MAX;
1962
1963         trans->used_mempool = true;
1964 success:
1965         new_iters       = p; p += iters_bytes;
1966         new_updates     = p; p += updates_bytes;
1967         new_updates2    = p; p += updates_bytes;
1968
1969         memcpy(new_iters, trans->iters,
1970                sizeof(struct btree_iter) * trans->nr_iters);
1971         memcpy(new_updates, trans->updates,
1972                sizeof(struct btree_insert_entry) * trans->nr_updates);
1973         memcpy(new_updates2, trans->updates2,
1974                sizeof(struct btree_insert_entry) * trans->nr_updates2);
1975
1976         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
1977                 memset(trans->iters, POISON_FREE,
1978                        sizeof(struct btree_iter) * trans->nr_iters +
1979                        sizeof(struct btree_insert_entry) * trans->nr_iters);
1980
1981         if (trans->iters != trans->iters_onstack)
1982                 kfree(trans->iters);
1983
1984         trans->iters            = new_iters;
1985         trans->updates          = new_updates;
1986         trans->updates2         = new_updates2;
1987         trans->size             = new_size;
1988
1989         if (trans->iters_live) {
1990                 trace_trans_restart_iters_realloced(trans->ip, trans->size);
1991                 return -EINTR;
1992         }
1993
1994         return 0;
1995 }
1996
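/*
 * Allocate a slot in the transaction's iterator array, growing the array if
 * necessary; panics if the transaction would exceed BTREE_ITER_MAX iterators.
 */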
1997 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
1998 {
1999         unsigned idx = __ffs64(~trans->iters_linked);
2000
2001         if (idx < trans->nr_iters)
2002                 goto got_slot;
2003
2004         if (trans->nr_iters == trans->size) {
2005                 int ret;
2006
2007                 if (trans->nr_iters >= BTREE_ITER_MAX) {
2008                         struct btree_iter *iter;
2009
2010                         trans_for_each_iter(trans, iter) {
2011                                 pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
2012                                        bch2_btree_ids[iter->btree_id],
2013                                        iter->pos.inode,
2014                                        iter->pos.offset,
2015                                        (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
2016                                        (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
2017                                        iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
2018                                        (void *) iter->ip_allocated);
2019                         }
2020
2021                         panic("trans iter overflow\n");
2022                 }
2023
2024                 ret = bch2_trans_realloc_iters(trans, trans->size * 2);
2025                 if (ret)
2026                         return ERR_PTR(ret);
2027         }
2028
2029         idx = trans->nr_iters++;
2030         BUG_ON(trans->nr_iters > trans->size);
2031
2032         trans->iters[idx].idx = idx;
2033 got_slot:
2034         BUG_ON(trans->iters_linked & (1ULL << idx));
2035         trans->iters_linked |= 1ULL << idx;
2036         trans->iters[idx].flags = 0;
2037         return &trans->iters[idx];
2038 }
2039
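/*
 * Copy an iterator, taking an extra reference on each btree node lock it
 * holds so that source and destination can be unlocked independently.
 */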
2040 static inline void btree_iter_copy(struct btree_iter *dst,
2041                                    struct btree_iter *src)
2042 {
2043         unsigned i, idx = dst->idx;
2044
2045         *dst = *src;
2046         dst->idx = idx;
2047         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2048
2049         for (i = 0; i < BTREE_MAX_DEPTH; i++)
2050                 if (btree_node_locked(dst, i))
2051                         six_lock_increment(&dst->l[i].b->c.lock,
2052                                            __btree_lock_want(dst, i));
2053
2054         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2055         dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
2056 }
2057
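/* Distance between two positions, used below to pick the closest reusable iterator: */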
2058 static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
2059 {
2060         if (bkey_cmp(l, r) > 0)
2061                 swap(l, r);
2062
2063         return POS(r.inode - l.inode, r.offset - l.offset);
2064 }
2065
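/*
 * Get an iterator for the given btree and position, preferring to reuse the
 * existing iterator whose position is closest; a copy is made if the best
 * candidate is still live or must be kept until commit.
 */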
2066 static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
2067                                                  unsigned btree_id, struct bpos pos,
2068                                                  unsigned flags)
2069 {
2070         struct btree_iter *iter, *best = NULL;
2071
2072         BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
2073
2074         trans_for_each_iter(trans, iter) {
2075                 if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
2076                         continue;
2077
2078                 if (iter->btree_id != btree_id)
2079                         continue;
2080
2081                 if (best &&
2082                     bkey_cmp(bpos_diff(best->pos, pos),
2083                              bpos_diff(iter->pos, pos)) < 0)
2084                         continue;
2085
2086                 best = iter;
2087         }
2088
2089         if (!best) {
2090                 iter = btree_trans_iter_alloc(trans);
2091                 if (IS_ERR(iter))
2092                         return iter;
2093
2094                 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
2095         } else if ((trans->iters_live & (1ULL << best->idx)) ||
2096                    (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
2097                 iter = btree_trans_iter_alloc(trans);
2098                 if (IS_ERR(iter))
2099                         return iter;
2100
2101                 btree_iter_copy(iter, best);
2102         } else {
2103                 iter = best;
2104         }
2105
2106         iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2107         iter->flags &= ~BTREE_ITER_USER_FLAGS;
2108         iter->flags |= flags & BTREE_ITER_USER_FLAGS;
2109
2110         if (iter->flags & BTREE_ITER_INTENT)
2111                 bch2_btree_iter_upgrade(iter, 1);
2112         else
2113                 bch2_btree_iter_downgrade(iter);
2114
2115         BUG_ON(iter->btree_id != btree_id);
2116         BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
2117         BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
2118         BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT);
2119         BUG_ON(trans->iters_live & (1ULL << iter->idx));
2120
2121         trans->iters_live       |= 1ULL << iter->idx;
2122         trans->iters_touched    |= 1ULL << iter->idx;
2123
2124         return iter;
2125 }
2126
2127 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
2128                                          enum btree_id btree_id,
2129                                          struct bpos pos, unsigned flags)
2130 {
2131         struct btree_iter *iter =
2132                 __btree_trans_get_iter(trans, btree_id, pos, flags);
2133
2134         if (!IS_ERR(iter))
2135                 __bch2_btree_iter_set_pos(iter, pos,
2136                         btree_node_type_is_extents(btree_id));
2137         return iter;
2138 }
2139
2140 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
2141                                             enum btree_id btree_id,
2142                                             struct bpos pos,
2143                                             unsigned locks_want,
2144                                             unsigned depth,
2145                                             unsigned flags)
2146 {
2147         struct btree_iter *iter =
2148                 __btree_trans_get_iter(trans, btree_id, pos,
2149                                        flags|BTREE_ITER_NODES);
2150         unsigned i;
2151
2152         BUG_ON(IS_ERR(iter));
2153         BUG_ON(bkey_cmp(iter->pos, pos));
2154
2155         iter->locks_want = locks_want;
2156         iter->level     = depth;
2157         iter->min_depth = depth;
2158
2159         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
2160                 iter->l[i].b            = NULL;
2161         iter->l[iter->level].b          = BTREE_ITER_NO_NODE_INIT;
2162
2163         return iter;
2164 }
2165
2166 struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
2167                                         struct btree_iter *src)
2168 {
2169         struct btree_iter *iter;
2170
2171         iter = btree_trans_iter_alloc(trans);
2172         if (IS_ERR(iter))
2173                 return iter;
2174
2175         btree_iter_copy(iter, src);
2176
2177         trans->iters_live |= 1ULL << iter->idx;
2178         /*
2179          * We don't need to preserve this iter since it's cheap to copy it
2180          * again - this will cause trans_iter_put() to free it right away:
2181          */
2182         trans->iters_touched &= ~(1ULL << iter->idx);
2183
2184         return iter;
2185 }
2186
2187 static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
2188 {
2189         if (size > trans->mem_bytes) {
2190                 size_t old_bytes = trans->mem_bytes;
2191                 size_t new_bytes = roundup_pow_of_two(size);
2192                 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2193
2194                 if (!new_mem)
2195                         return -ENOMEM;
2196
2197                 trans->mem = new_mem;
2198                 trans->mem_bytes = new_bytes;
2199
2200                 if (old_bytes) {
2201                         trace_trans_restart_mem_realloced(trans->ip, new_bytes);
2202                         return -EINTR;
2203                 }
2204         }
2205
2206         return 0;
2207 }
2208
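/*
 * Bump allocator for scratch memory that lives until the transaction is
 * reset; may return ERR_PTR(-EINTR) if the buffer had to be reallocated,
 * forcing a transaction restart.
 */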
2209 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2210 {
2211         void *p;
2212         int ret;
2213
2214         ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
2215         if (ret)
2216                 return ERR_PTR(ret);
2217
2218         p = trans->mem + trans->mem_top;
2219         trans->mem_top += size;
2220         return p;
2221 }
2222
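/* Free all iterators that are neither live nor touched: */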
2223 inline void bch2_trans_unlink_iters(struct btree_trans *trans)
2224 {
2225         u64 iters = trans->iters_linked &
2226                 ~trans->iters_touched &
2227                 ~trans->iters_live;
2228
2229         while (iters) {
2230                 unsigned idx = __ffs64(iters);
2231
2232                 iters &= ~(1ULL << idx);
2233                 __bch2_trans_iter_free(trans, idx);
2234         }
2235 }
2236
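/*
 * Reset a transaction after a commit or restart: drop pending updates and
 * scratch memory, free unneeded iterators, and (unless TRANS_RESET_NOTRAVERSE
 * is set) retraverse the iterators that remain.
 */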
2237 void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
2238 {
2239         struct btree_iter *iter;
2240
2241         trans_for_each_iter(trans, iter)
2242                 iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
2243                                  BTREE_ITER_SET_POS_AFTER_COMMIT);
2244
2245         bch2_trans_unlink_iters(trans);
2246
2247         trans->iters_touched &= trans->iters_live;
2248
2249         trans->need_reset               = 0;
2250         trans->nr_updates               = 0;
2251         trans->nr_updates2              = 0;
2252         trans->mem_top                  = 0;
2253
2254         trans->extra_journal_entries    = NULL;
2255         trans->extra_journal_entry_u64s = 0;
2256
2257         if (trans->fs_usage_deltas) {
2258                 trans->fs_usage_deltas->used = 0;
2259                 memset(&trans->fs_usage_deltas->memset_start, 0,
2260                        (void *) &trans->fs_usage_deltas->memset_end -
2261                        (void *) &trans->fs_usage_deltas->memset_start);
2262         }
2263
2264         if (!(flags & TRANS_RESET_NOTRAVERSE))
2265                 bch2_btree_iter_traverse_all(trans);
2266 }
2267
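/*
 * Minimal usage sketch (error handling omitted; the btree id, position and
 * flags below are purely illustrative):
 *
 *	struct btree_trans trans;
 *	struct btree_iter *iter;
 *	struct bkey_s_c k;
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 *	iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inum, 0), 0);
 *	k = bch2_btree_iter_peek(iter);
 *	bch2_trans_iter_put(&trans, iter);
 *	bch2_trans_exit(&trans);
 */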
2268 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
2269                      unsigned expected_nr_iters,
2270                      size_t expected_mem_bytes)
2271 {
2272         memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
2273
2274         /*
2275          * reallocating iterators currently completely breaks
2276          * bch2_trans_iter_put():
2277          */
2278         expected_nr_iters = BTREE_ITER_MAX;
2279
2280         trans->c                = c;
2281         trans->ip               = _RET_IP_;
2282         trans->size             = ARRAY_SIZE(trans->iters_onstack);
2283         trans->iters            = trans->iters_onstack;
2284         trans->updates          = trans->updates_onstack;
2285         trans->updates2         = trans->updates2_onstack;
2286         trans->fs_usage_deltas  = NULL;
2287
2288         if (expected_nr_iters > trans->size)
2289                 bch2_trans_realloc_iters(trans, expected_nr_iters);
2290
2291         if (expected_mem_bytes)
2292                 bch2_trans_preload_mem(trans, expected_mem_bytes);
2293
2294 #ifdef CONFIG_BCACHEFS_DEBUG
2295         trans->pid = current->pid;
2296         mutex_lock(&c->btree_trans_lock);
2297         list_add(&trans->list, &c->btree_trans_list);
2298         mutex_unlock(&c->btree_trans_lock);
2299 #endif
2300 }
2301
2302 int bch2_trans_exit(struct btree_trans *trans)
2303 {
2304         bch2_trans_unlock(trans);
2305
2306 #ifdef CONFIG_BCACHEFS_DEBUG
2307         mutex_lock(&trans->c->btree_trans_lock);
2308         list_del(&trans->list);
2309         mutex_unlock(&trans->c->btree_trans_lock);
2310 #endif
2311
2312         bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
2313
2314         kfree(trans->fs_usage_deltas);
2315         kfree(trans->mem);
2316         if (trans->used_mempool)
2317                 mempool_free(trans->iters, &trans->c->btree_iters_pool);
2318         else if (trans->iters != trans->iters_onstack)
2319                 kfree(trans->iters);
2320         trans->mem      = (void *) 0x1;
2321         trans->iters    = (void *) 0x1;
2322
2323         return trans->error ? -EIO : 0;
2324 }
2325
2326 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
2327 {
2328 #ifdef CONFIG_BCACHEFS_DEBUG
2329         struct btree_trans *trans;
2330         struct btree_iter *iter;
2331         struct btree *b;
2332         unsigned l;
2333
2334         mutex_lock(&c->btree_trans_lock);
2335         list_for_each_entry(trans, &c->btree_trans_list, list) {
2336                 pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
2337
2338                 trans_for_each_iter(trans, iter) {
2339                         if (!iter->nodes_locked)
2340                                 continue;
2341
2342                         pr_buf(out, "  iter %u %s:",
2343                                iter->idx,
2344                                bch2_btree_ids[iter->btree_id]);
2345                         bch2_bpos_to_text(out, iter->pos);
2346                         pr_buf(out, "\n");
2347
2348                         for (l = 0; l < BTREE_MAX_DEPTH; l++) {
2349                                 if (btree_node_locked(iter, l)) {
2350                                         b = iter->l[l].b;
2351
2352                                         pr_buf(out, "    %px %s l=%u ",
2353                                                b, btree_node_intent_locked(iter, l) ? "i" : "r", l);
2354                                         bch2_bpos_to_text(out, b->key.k.p);
2355                                         pr_buf(out, "\n");
2356                                 }
2357                         }
2358                 }
2359
2360                 b = READ_ONCE(trans->locking);
2361                 if (b) {
2362                         pr_buf(out, "  locking iter %u l=%u %s:",
2363                                trans->locking_iter_idx,
2364                                trans->locking_level,
2365                                bch2_btree_ids[trans->locking_btree_id]);
2366                         bch2_bpos_to_text(out, trans->locking_pos);
2367
2368                         pr_buf(out, " node %px l=%u %s:",
2369                                b, b->c.level,
2370                                bch2_btree_ids[b->c.btree_id]);
2371                         bch2_bpos_to_text(out, b->key.k.p);
2372                         pr_buf(out, "\n");
2373                 }
2374         }
2375         mutex_unlock(&c->btree_trans_lock);
2376 #endif
2377 }
2378
2379 void bch2_fs_btree_iter_exit(struct bch_fs *c)
2380 {
2381         mempool_exit(&c->btree_iters_pool);
2382 }
2383
2384 int bch2_fs_btree_iter_init(struct bch_fs *c)
2385 {
2386         unsigned nr = BTREE_ITER_MAX;
2387
2388         INIT_LIST_HEAD(&c->btree_trans_list);
2389         mutex_init(&c->btree_trans_lock);
2390
2391         return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
2392                         sizeof(struct btree_iter) * nr +
2393                         sizeof(struct btree_insert_entry) * nr +
2394                         sizeof(struct btree_insert_entry) * nr);
2395 }