1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18
19 #include <linux/prandom.h>
20 #include <linux/prefetch.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_trans_verify_sorted(struct btree_trans *);
24 inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
25
26 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
27 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
28                                        struct btree_path *);
29
30 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
31 {
32 #ifdef CONFIG_BCACHEFS_DEBUG
33         return iter->ip_allocated;
34 #else
35         return 0;
36 #endif
37 }
38
39 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
40
41 /*
42  * Unlocks before scheduling
43  * Note: does not revalidate iterator
44  */
45 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
46 {
47         if (need_resched() || race_fault()) {
48                 bch2_trans_unlock(trans);
49                 schedule();
50                 return bch2_trans_relock(trans);
51         } else {
52                 return 0;
53         }
54 }
55
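/*
 * Compare a path against a (btree_id, cached, pos, level) tuple: paths sort
 * by btree id, then cached vs. non-cached, then position, with higher levels
 * (closer to the root) ordered first at the same position.
 */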
56 static inline int __btree_path_cmp(const struct btree_path *l,
57                                    enum btree_id        r_btree_id,
58                                    bool                 r_cached,
59                                    struct bpos          r_pos,
60                                    unsigned             r_level)
61 {
62         /*
63          * Must match lock ordering as defined by __bch2_btree_node_lock:
64          */
65         return   cmp_int(l->btree_id,   r_btree_id) ?:
66                  cmp_int((int) l->cached,       (int) r_cached) ?:
67                  bpos_cmp(l->pos,       r_pos) ?:
68                 -cmp_int(l->level,      r_level);
69 }
70
71 static inline int btree_path_cmp(const struct btree_path *l,
72                                  const struct btree_path *r)
73 {
74         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
75 }
76
77 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
78 {
79         /* Are we iterating over keys in all snapshots? */
80         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
81                 p = bpos_successor(p);
82         } else {
83                 p = bpos_nosnap_successor(p);
84                 p.snapshot = iter->snapshot;
85         }
86
87         return p;
88 }
89
90 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
91 {
92         /* Are we iterating over keys in all snapshots? */
93         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
94                 p = bpos_predecessor(p);
95         } else {
96                 p = bpos_nosnap_predecessor(p);
97                 p.snapshot = iter->snapshot;
98         }
99
100         return p;
101 }
102
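/*
 * Position to search from when (re)traversing this iterator: extent keys are
 * indexed by their end position, so for extents iterators we search from just
 * after iter->pos (unless we're already at POS_MAX).
 */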
103 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
104 {
105         struct bpos pos = iter->pos;
106
107         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
108             bkey_cmp(pos, POS_MAX))
109                 pos = bkey_successor(iter, pos);
110         return pos;
111 }
112
113 static inline bool btree_path_pos_before_node(struct btree_path *path,
114                                               struct btree *b)
115 {
116         return bpos_cmp(path->pos, b->data->min_key) < 0;
117 }
118
119 static inline bool btree_path_pos_after_node(struct btree_path *path,
120                                              struct btree *b)
121 {
122         return bpos_cmp(b->key.k.p, path->pos) < 0;
123 }
124
125 static inline bool btree_path_pos_in_node(struct btree_path *path,
126                                           struct btree *b)
127 {
128         return path->btree_id == b->c.btree_id &&
129                 !btree_path_pos_before_node(path, b) &&
130                 !btree_path_pos_after_node(path, b);
131 }
132
133 /* Btree iterator: */
134
135 #ifdef CONFIG_BCACHEFS_DEBUG
136
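/*
 * Verify that a cached path's key cache entry matches the path's btree id and
 * position (relocking level 0 if needed).
 */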
137 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
138                                           struct btree_path *path)
139 {
140         struct bkey_cached *ck;
141         bool locked = btree_node_locked(path, 0);
142
143         if (!bch2_btree_node_relock(trans, path, 0))
144                 return;
145
146         ck = (void *) path->l[0].b;
147         BUG_ON(ck->key.btree_id != path->btree_id ||
148                bkey_cmp(ck->key.pos, path->pos));
149
150         if (!locked)
151                 btree_node_unlock(trans, path, 0);
152 }
153
154 static void bch2_btree_path_verify_level(struct btree_trans *trans,
155                                 struct btree_path *path, unsigned level)
156 {
157         struct btree_path_level *l;
158         struct btree_node_iter tmp;
159         bool locked;
160         struct bkey_packed *p, *k;
161         struct printbuf buf1 = PRINTBUF;
162         struct printbuf buf2 = PRINTBUF;
163         struct printbuf buf3 = PRINTBUF;
164         const char *msg;
165
166         if (!bch2_debug_check_iterators)
167                 return;
168
169         l       = &path->l[level];
170         tmp     = l->iter;
171         locked  = btree_node_locked(path, level);
172
173         if (path->cached) {
174                 if (!level)
175                         bch2_btree_path_verify_cached(trans, path);
176                 return;
177         }
178
179         if (!btree_path_node(path, level))
180                 return;
181
182         if (!bch2_btree_node_relock_notrace(trans, path, level))
183                 return;
184
185         BUG_ON(!btree_path_pos_in_node(path, l->b));
186
187         bch2_btree_node_iter_verify(&l->iter, l->b);
188
189         /*
190          * For interior nodes, the iterator will have skipped past deleted keys:
191          */
192         p = level
193                 ? bch2_btree_node_iter_prev(&tmp, l->b)
194                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
195         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
196
197         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
198                 msg = "before";
199                 goto err;
200         }
201
202         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
203                 msg = "after";
204                 goto err;
205         }
206
207         if (!locked)
208                 btree_node_unlock(trans, path, level);
209         return;
210 err:
211         bch2_bpos_to_text(&buf1, path->pos);
212
213         if (p) {
214                 struct bkey uk = bkey_unpack_key(l->b, p);
215                 bch2_bkey_to_text(&buf2, &uk);
216         } else {
217                 prt_printf(&buf2, "(none)");
218         }
219
220         if (k) {
221                 struct bkey uk = bkey_unpack_key(l->b, k);
222                 bch2_bkey_to_text(&buf3, &uk);
223         } else {
224                 prt_printf(&buf3, "(none)");
225         }
226
227         panic("path should be %s key at level %u:\n"
228               "path pos %s\n"
229               "prev key %s\n"
230               "cur  key %s\n",
231               msg, level, buf1.buf, buf2.buf, buf3.buf);
232 }
233
234 static void bch2_btree_path_verify(struct btree_trans *trans,
235                                    struct btree_path *path)
236 {
237         struct bch_fs *c = trans->c;
238         unsigned i;
239
240         EBUG_ON(path->btree_id >= BTREE_ID_NR);
241
242         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
243                 if (!path->l[i].b) {
244                         BUG_ON(!path->cached &&
245                                c->btree_roots[path->btree_id].b->c.level > i);
246                         break;
247                 }
248
249                 bch2_btree_path_verify_level(trans, path, i);
250         }
251
252         bch2_btree_path_verify_locks(path);
253 }
254
255 void bch2_trans_verify_paths(struct btree_trans *trans)
256 {
257         struct btree_path *path;
258
259         trans_for_each_path(trans, path)
260                 bch2_btree_path_verify(trans, path);
261 }
262
263 static void bch2_btree_iter_verify(struct btree_iter *iter)
264 {
265         struct btree_trans *trans = iter->trans;
266
267         BUG_ON(iter->btree_id >= BTREE_ID_NR);
268
269         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
270
271         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
272                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
273
274         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
275                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
276                !btree_type_has_snapshots(iter->btree_id));
277
278         if (iter->update_path)
279                 bch2_btree_path_verify(trans, iter->update_path);
280         bch2_btree_path_verify(trans, iter->path);
281 }
282
283 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
284 {
285         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
286                !iter->pos.snapshot);
287
288         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
289                iter->pos.snapshot != iter->snapshot);
290
291         BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
292                bkey_cmp(iter->pos, iter->k.p) > 0);
293 }
294
295 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
296 {
297         struct btree_trans *trans = iter->trans;
298         struct btree_iter copy;
299         struct bkey_s_c prev;
300         int ret = 0;
301
302         if (!bch2_debug_check_iterators)
303                 return 0;
304
305         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
306                 return 0;
307
308         if (bkey_err(k) || !k.k)
309                 return 0;
310
311         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
312                                           iter->snapshot,
313                                           k.k->p.snapshot));
314
315         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
316                              BTREE_ITER_NOPRESERVE|
317                              BTREE_ITER_ALL_SNAPSHOTS);
318         prev = bch2_btree_iter_prev(&copy);
319         if (!prev.k)
320                 goto out;
321
322         ret = bkey_err(prev);
323         if (ret)
324                 goto out;
325
326         if (!bkey_cmp(prev.k->p, k.k->p) &&
327             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
328                                       prev.k->p.snapshot) > 0) {
329                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
330
331                 bch2_bkey_to_text(&buf1, k.k);
332                 bch2_bkey_to_text(&buf2, prev.k);
333
334                 panic("iter snap %u\n"
335                       "k    %s\n"
336                       "prev %s\n",
337                       iter->snapshot,
338                       buf1.buf, buf2.buf);
339         }
340 out:
341         bch2_trans_iter_exit(trans, &copy);
342         return ret;
343 }
344
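/*
 * Assert that some locked, should_be_locked path in this transaction covers
 * @pos in btree @id: a leaf node containing the position for btree paths, or
 * an exact position match for key cache paths. Panics (after dumping the
 * transaction's paths) if not.
 */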
345 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
346                             struct bpos pos, bool key_cache)
347 {
348         struct btree_path *path;
349         unsigned idx;
350         struct printbuf buf = PRINTBUF;
351
352         trans_for_each_path_inorder(trans, path, idx) {
353                 int cmp = cmp_int(path->btree_id, id) ?:
354                         cmp_int(path->cached, key_cache);
355
356                 if (cmp > 0)
357                         break;
358                 if (cmp < 0)
359                         continue;
360
361                 if (!btree_node_locked(path, 0) ||
362                     !path->should_be_locked)
363                         continue;
364
365                 if (!key_cache) {
366                         if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
367                             bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
368                                 return;
369                 } else {
370                         if (!bkey_cmp(pos, path->pos))
371                                 return;
372                 }
373         }
374
375         bch2_dump_trans_paths_updates(trans);
376         bch2_bpos_to_text(&buf, pos);
377
378         panic("not locked: %s %s%s\n",
379               bch2_btree_ids[id], buf.buf,
380               key_cache ? " cached" : "");
381 }
382
383 #else
384
385 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
386                                                 struct btree_path *path, unsigned l) {}
387 static inline void bch2_btree_path_verify(struct btree_trans *trans,
388                                           struct btree_path *path) {}
389 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
390 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
391 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
392
393 #endif
394
395 /* Btree path: fixups after btree updates */
396
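/*
 * Repoint the node iterator's entry for bset @t at key @k and re-sort the
 * iterator; if the iterator has no entry for @t, push a new one.
 */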
397 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
398                                         struct btree *b,
399                                         struct bset_tree *t,
400                                         struct bkey_packed *k)
401 {
402         struct btree_node_iter_set *set;
403
404         btree_node_iter_for_each(iter, set)
405                 if (set->end == t->end_offset) {
406                         set->k = __btree_node_key_to_offset(b, k);
407                         bch2_btree_node_iter_sort(iter, b);
408                         return;
409                 }
410
411         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
412 }
413
414 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
415                                                struct btree *b,
416                                                struct bkey_packed *where)
417 {
418         struct btree_path_level *l = &path->l[b->c.level];
419
420         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
421                 return;
422
423         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
424                 bch2_btree_node_iter_advance(&l->iter, l->b);
425 }
426
427 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
428                                       struct btree *b,
429                                       struct bkey_packed *where)
430 {
431         struct btree_path *path;
432
433         trans_for_each_path_with_node(trans, b, path) {
434                 __bch2_btree_path_fix_key_modified(path, b, where);
435                 bch2_btree_path_verify_level(trans, path, b->c.level);
436         }
437 }
438
439 static void __bch2_btree_node_iter_fix(struct btree_path *path,
440                                        struct btree *b,
441                                        struct btree_node_iter *node_iter,
442                                        struct bset_tree *t,
443                                        struct bkey_packed *where,
444                                        unsigned clobber_u64s,
445                                        unsigned new_u64s)
446 {
447         const struct bkey_packed *end = btree_bkey_last(b, t);
448         struct btree_node_iter_set *set;
449         unsigned offset = __btree_node_key_to_offset(b, where);
450         int shift = new_u64s - clobber_u64s;
451         unsigned old_end = t->end_offset - shift;
452         unsigned orig_iter_pos = node_iter->data[0].k;
453         bool iter_current_key_modified =
454                 orig_iter_pos >= offset &&
455                 orig_iter_pos <= offset + clobber_u64s;
456
457         btree_node_iter_for_each(node_iter, set)
458                 if (set->end == old_end)
459                         goto found;
460
 461         /* didn't find the bset in the iterator - might have to re-add it: */
462         if (new_u64s &&
463             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
464                 bch2_btree_node_iter_push(node_iter, b, where, end);
465                 goto fixup_done;
466         } else {
467                 /* Iterator is after key that changed */
468                 return;
469         }
470 found:
471         set->end = t->end_offset;
472
473         /* Iterator hasn't gotten to the key that changed yet: */
474         if (set->k < offset)
475                 return;
476
477         if (new_u64s &&
478             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
479                 set->k = offset;
480         } else if (set->k < offset + clobber_u64s) {
481                 set->k = offset + new_u64s;
482                 if (set->k == set->end)
483                         bch2_btree_node_iter_set_drop(node_iter, set);
484         } else {
485                 /* Iterator is after key that changed */
486                 set->k = (int) set->k + shift;
487                 return;
488         }
489
490         bch2_btree_node_iter_sort(node_iter, b);
491 fixup_done:
492         if (node_iter->data[0].k != orig_iter_pos)
493                 iter_current_key_modified = true;
494
495         /*
496          * When a new key is added, and the node iterator now points to that
497          * key, the iterator might have skipped past deleted keys that should
498          * come after the key the iterator now points to. We have to rewind to
499          * before those deleted keys - otherwise
500          * bch2_btree_node_iter_prev_all() breaks:
501          */
502         if (!bch2_btree_node_iter_end(node_iter) &&
503             iter_current_key_modified &&
504             b->c.level) {
505                 struct bset_tree *t;
506                 struct bkey_packed *k, *k2, *p;
507
508                 k = bch2_btree_node_iter_peek_all(node_iter, b);
509
510                 for_each_bset(b, t) {
511                         bool set_pos = false;
512
513                         if (node_iter->data[0].end == t->end_offset)
514                                 continue;
515
516                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
517
518                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
519                                bkey_iter_cmp(b, k, p) < 0) {
520                                 k2 = p;
521                                 set_pos = true;
522                         }
523
524                         if (set_pos)
525                                 btree_node_iter_set_set_pos(node_iter,
526                                                             b, t, k2);
527                 }
528         }
529 }
530
531 void bch2_btree_node_iter_fix(struct btree_trans *trans,
532                               struct btree_path *path,
533                               struct btree *b,
534                               struct btree_node_iter *node_iter,
535                               struct bkey_packed *where,
536                               unsigned clobber_u64s,
537                               unsigned new_u64s)
538 {
539         struct bset_tree *t = bch2_bkey_to_bset(b, where);
540         struct btree_path *linked;
541
542         if (node_iter != &path->l[b->c.level].iter) {
543                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
544                                            where, clobber_u64s, new_u64s);
545
546                 if (bch2_debug_check_iterators)
547                         bch2_btree_node_iter_verify(node_iter, b);
548         }
549
550         trans_for_each_path_with_node(trans, b, linked) {
551                 __bch2_btree_node_iter_fix(linked, b,
552                                            &linked->l[b->c.level].iter, t,
553                                            where, clobber_u64s, new_u64s);
554                 bch2_btree_path_verify_level(trans, linked, b->c.level);
555         }
556 }
557
558 /* Btree path level: pointer to a particular btree node and node iter */
559
560 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
561                                                   struct btree_path_level *l,
562                                                   struct bkey *u,
563                                                   struct bkey_packed *k)
564 {
565         if (unlikely(!k)) {
566                 /*
567                  * signal to bch2_btree_iter_peek_slot() that we're currently at
568                  * a hole
569                  */
570                 u->type = KEY_TYPE_deleted;
571                 return bkey_s_c_null;
572         }
573
574         return bkey_disassemble(l->b, k, u);
575 }
576
577 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
578                                                         struct btree_path_level *l,
579                                                         struct bkey *u)
580 {
581         return __btree_iter_unpack(c, l, u,
582                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
583 }
584
585 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
586                                                     struct btree_path *path,
587                                                     struct btree_path_level *l,
588                                                     struct bkey *u)
589 {
590         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
591                         bch2_btree_node_iter_peek(&l->iter, l->b));
592
593         path->pos = k.k ? k.k->p : l->b->key.k.p;
594         bch2_btree_path_verify_level(trans, path, l - path->l);
595         return k;
596 }
597
598 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
599                                                     struct btree_path *path,
600                                                     struct btree_path_level *l,
601                                                     struct bkey *u)
602 {
603         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
604                         bch2_btree_node_iter_prev(&l->iter, l->b));
605
606         path->pos = k.k ? k.k->p : l->b->data->min_key;
607         bch2_btree_path_verify_level(trans, path, l - path->l);
608         return k;
609 }
610
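/*
 * Advance the node iterator past keys that compare before path->pos; returns
 * false if more than @max_advance keys would have to be skipped, in which
 * case the caller reinitializes the iterator instead.
 */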
611 static inline bool btree_path_advance_to_pos(struct btree_path *path,
612                                              struct btree_path_level *l,
613                                              int max_advance)
614 {
615         struct bkey_packed *k;
616         int nr_advanced = 0;
617
618         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
619                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
620                 if (max_advance > 0 && nr_advanced >= max_advance)
621                         return false;
622
623                 bch2_btree_node_iter_advance(&l->iter, l->b);
624                 nr_advanced++;
625         }
626
627         return true;
628 }
629
630 static inline void __btree_path_level_init(struct btree_path *path,
631                                            unsigned level)
632 {
633         struct btree_path_level *l = &path->l[level];
634
635         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
636
637         /*
 638          * Iterators to interior nodes should always be pointed at the first
 639          * non-whiteout key:
640          */
641         if (level)
642                 bch2_btree_node_iter_peek(&l->iter, l->b);
643 }
644
645 inline void bch2_btree_path_level_init(struct btree_trans *trans,
646                                        struct btree_path *path,
647                                        struct btree *b)
648 {
649         BUG_ON(path->cached);
650
651         EBUG_ON(!btree_path_pos_in_node(path, b));
652         EBUG_ON(b->c.lock.state.seq & 1);
653
654         path->l[b->c.level].lock_seq = b->c.lock.state.seq;
655         path->l[b->c.level].b = b;
656         __btree_path_level_init(path, b->c.level);
657 }
658
659 /* Btree path: fixups after btree node updates: */
660
661 /*
662  * A btree node is being replaced - update the iterator to point to the new
663  * node:
664  */
665 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
666 {
667         struct btree_path *path;
668
669         trans_for_each_path(trans, path)
670                 if (path->uptodate == BTREE_ITER_UPTODATE &&
671                     !path->cached &&
672                     btree_path_pos_in_node(path, b)) {
673                         enum btree_node_locked_type t =
674                                 btree_lock_want(path, b->c.level);
675
676                         if (t != BTREE_NODE_UNLOCKED) {
677                                 btree_node_unlock(trans, path, b->c.level);
678                                 six_lock_increment(&b->c.lock, t);
679                                 mark_btree_node_locked(trans, path, b->c.level, t);
680                         }
681
682                         bch2_btree_path_level_init(trans, path, b);
683                 }
684 }
685
686 /*
687  * A btree node has been modified in such a way as to invalidate iterators - fix
688  * them:
689  */
690 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
691 {
692         struct btree_path *path;
693
694         trans_for_each_path_with_node(trans, b, path)
695                 __btree_path_level_init(path, b->c.level);
696 }
697
698 /* Btree path: traverse, set_pos: */
699
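/*
 * Lock the root node of the btree for this path, re-reading the root pointer
 * and retrying if it changed under us. Returns 1 if the root is shallower
 * than the depth we want (we've hit the end of the btree), 0 on success, or a
 * transaction restart error.
 */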
700 static inline int btree_path_lock_root(struct btree_trans *trans,
701                                        struct btree_path *path,
702                                        unsigned depth_want,
703                                        unsigned long trace_ip)
704 {
705         struct bch_fs *c = trans->c;
706         struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
707         enum six_lock_type lock_type;
708         unsigned i;
709         int ret;
710
711         EBUG_ON(path->nodes_locked);
712
713         while (1) {
714                 b = READ_ONCE(*rootp);
715                 path->level = READ_ONCE(b->c.level);
716
717                 if (unlikely(path->level < depth_want)) {
718                         /*
719                          * the root is at a lower depth than the depth we want:
720                          * got to the end of the btree, or we're walking nodes
721                          * greater than some depth and there are no nodes >=
722                          * that depth
723                          */
724                         path->level = depth_want;
725                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
726                                 path->l[i].b = NULL;
727                         return 1;
728                 }
729
730                 lock_type = __btree_lock_want(path, path->level);
731                 ret = btree_node_lock(trans, path, &b->c,
732                                       path->level, lock_type, trace_ip);
733                 if (unlikely(ret)) {
734                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
735                                 continue;
736                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
737                                 return ret;
738                         BUG();
739                 }
740
741                 if (likely(b == READ_ONCE(*rootp) &&
742                            b->c.level == path->level &&
743                            !race_fault())) {
744                         for (i = 0; i < path->level; i++)
745                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
746                         path->l[path->level].b = b;
747                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
748                                 path->l[i].b = NULL;
749
750                         mark_btree_node_locked(trans, path, path->level, lock_type);
751                         bch2_btree_path_level_init(trans, path, b);
752                         return 0;
753                 }
754
755                 six_unlock_type(&b->c.lock, lock_type);
756         }
757 }
758
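/*
 * Prefetch the next few children of the node at the current level, so
 * sequential btree walks don't stall on synchronous reads; how many depends
 * on the level and on whether the filesystem has finished starting up.
 */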
759 noinline
760 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
761 {
762         struct bch_fs *c = trans->c;
763         struct btree_path_level *l = path_l(path);
764         struct btree_node_iter node_iter = l->iter;
765         struct bkey_packed *k;
766         struct bkey_buf tmp;
767         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
768                 ? (path->level > 1 ? 0 :  2)
769                 : (path->level > 1 ? 1 : 16);
770         bool was_locked = btree_node_locked(path, path->level);
771         int ret = 0;
772
773         bch2_bkey_buf_init(&tmp);
774
775         while (nr && !ret) {
776                 if (!bch2_btree_node_relock(trans, path, path->level))
777                         break;
778
779                 bch2_btree_node_iter_advance(&node_iter, l->b);
780                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
781                 if (!k)
782                         break;
783
784                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
785                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
786                                                path->level - 1);
787         }
788
789         if (!was_locked)
790                 btree_node_unlock(trans, path, path->level);
791
792         bch2_bkey_buf_exit(&tmp, c);
793         return ret;
794 }
795
796 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
797                                  struct btree_and_journal_iter *jiter)
798 {
799         struct bch_fs *c = trans->c;
800         struct bkey_s_c k;
801         struct bkey_buf tmp;
802         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
803                 ? (path->level > 1 ? 0 :  2)
804                 : (path->level > 1 ? 1 : 16);
805         bool was_locked = btree_node_locked(path, path->level);
806         int ret = 0;
807
808         bch2_bkey_buf_init(&tmp);
809
810         while (nr && !ret) {
811                 if (!bch2_btree_node_relock(trans, path, path->level))
812                         break;
813
814                 bch2_btree_and_journal_iter_advance(jiter);
815                 k = bch2_btree_and_journal_iter_peek(jiter);
816                 if (!k.k)
817                         break;
818
819                 bch2_bkey_buf_reassemble(&tmp, c, k);
820                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
821                                                path->level - 1);
822         }
823
824         if (!was_locked)
825                 btree_node_unlock(trans, path, path->level);
826
827         bch2_bkey_buf_exit(&tmp, c);
828         return ret;
829 }
830
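/*
 * Stash a pointer to the in-memory btree node in the parent's btree_ptr_v2
 * key (bp->mem_ptr), as a hint so later traversals can find the node without
 * a full lookup.
 */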
831 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
832                                             struct btree_path *path,
833                                             unsigned plevel, struct btree *b)
834 {
835         struct btree_path_level *l = &path->l[plevel];
836         bool locked = btree_node_locked(path, plevel);
837         struct bkey_packed *k;
838         struct bch_btree_ptr_v2 *bp;
839
840         if (!bch2_btree_node_relock(trans, path, plevel))
841                 return;
842
843         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
844         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
845
846         bp = (void *) bkeyp_val(&l->b->format, k);
847         bp->mem_ptr = (unsigned long)b;
848
849         if (!locked)
850                 btree_node_unlock(trans, path, plevel);
851 }
852
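/*
 * Like peeking the node iterator, but with keys from the journal overlaid -
 * used before journal replay has finished, so traversal sees updates that
 * haven't yet been written to the btree.
 */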
853 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
854                                                      struct btree_path *path,
855                                                      unsigned flags,
856                                                      struct bkey_buf *out)
857 {
858         struct bch_fs *c = trans->c;
859         struct btree_path_level *l = path_l(path);
860         struct btree_and_journal_iter jiter;
861         struct bkey_s_c k;
862         int ret = 0;
863
864         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
865
866         k = bch2_btree_and_journal_iter_peek(&jiter);
867
868         bch2_bkey_buf_reassemble(out, c, k);
869
870         if (flags & BTREE_ITER_PREFETCH)
871                 ret = btree_path_prefetch_j(trans, path, &jiter);
872
873         bch2_btree_and_journal_iter_exit(&jiter);
874         return ret;
875 }
876
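/*
 * Descend one level: peek the child pointer at the current level (merging in
 * journal keys if replay hasn't finished), get and lock the child node, and
 * initialize the level below.
 */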
877 static __always_inline int btree_path_down(struct btree_trans *trans,
878                                            struct btree_path *path,
879                                            unsigned flags,
880                                            unsigned long trace_ip)
881 {
882         struct bch_fs *c = trans->c;
883         struct btree_path_level *l = path_l(path);
884         struct btree *b;
885         unsigned level = path->level - 1;
886         enum six_lock_type lock_type = __btree_lock_want(path, level);
887         bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
888         struct bkey_buf tmp;
889         int ret;
890
891         EBUG_ON(!btree_node_locked(path, path->level));
892
893         bch2_bkey_buf_init(&tmp);
894
895         if (unlikely(!replay_done)) {
896                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
897                 if (ret)
898                         goto err;
899         } else {
900                 bch2_bkey_buf_unpack(&tmp, c, l->b,
901                                  bch2_btree_node_iter_peek(&l->iter, l->b));
902
903                 if (flags & BTREE_ITER_PREFETCH) {
904                         ret = btree_path_prefetch(trans, path);
905                         if (ret)
906                                 goto err;
907                 }
908         }
909
910         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
911         ret = PTR_ERR_OR_ZERO(b);
912         if (unlikely(ret))
913                 goto err;
914
915         if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
916             unlikely(b != btree_node_mem_ptr(tmp.k)))
917                 btree_node_mem_ptr_set(trans, path, level + 1, b);
918
919         if (btree_node_read_locked(path, level + 1))
920                 btree_node_unlock(trans, path, level + 1);
921
922         mark_btree_node_locked(trans, path, level, lock_type);
923         path->level = level;
924         bch2_btree_path_level_init(trans, path, b);
925
926         bch2_btree_path_verify_locks(path);
927 err:
928         bch2_bkey_buf_exit(&tmp, c);
929         return ret;
930 }
931
932 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
933                                    unsigned, unsigned long);
934
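/*
 * Re-traverse every path in the transaction, in sorted (lock) order, retrying
 * from the top if a traversal restarts. If a btree node allocation failed
 * previously, take the btree cache cannibalize lock first so that allocation
 * can succeed by reclaiming from the cache.
 */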
935 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
936 {
937         struct bch_fs *c = trans->c;
938         struct btree_path *path;
939         unsigned long trace_ip = _RET_IP_;
940         int ret = 0;
941
942         if (trans->in_traverse_all)
943                 return -BCH_ERR_transaction_restart_in_traverse_all;
944
945         trans->in_traverse_all = true;
946 retry_all:
947         trans->restarted = 0;
948         trans->traverse_all_idx = U8_MAX;
949
950         trans_for_each_path(trans, path)
951                 path->should_be_locked = false;
952
953         btree_trans_verify_sorted(trans);
954
955         bch2_trans_unlock(trans);
956         cond_resched();
957
958         if (unlikely(trans->memory_allocation_failure)) {
959                 struct closure cl;
960
961                 closure_init_stack(&cl);
962
963                 do {
964                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
965                         closure_sync(&cl);
966                 } while (ret);
967         }
968
969         /* Now, redo traversals in correct order: */
970         trans->traverse_all_idx = 0;
971         while (trans->traverse_all_idx < trans->nr_sorted) {
972                 path = trans->paths + trans->sorted[trans->traverse_all_idx];
973
974                 /*
975                  * Traversing a path can cause another path to be added at about
976                  * the same position:
977                  */
978                 if (path->uptodate) {
979                         ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
980                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
981                             ret == -ENOMEM)
982                                 goto retry_all;
983                         if (ret)
984                                 goto err;
985                         BUG_ON(path->uptodate);
986                 } else {
987                         trans->traverse_all_idx++;
988                 }
989         }
990
991         /*
992          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
993          * and relock(), relock() won't relock since path->should_be_locked
994          * isn't set yet, which is all fine
995          */
996         trans_for_each_path(trans, path)
997                 BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
998 err:
999         bch2_btree_cache_cannibalize_unlock(c);
1000
1001         trans->in_traverse_all = false;
1002
1003         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1004         return ret;
1005 }
1006
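/*
 * Returns true if the node at level @l is still usable for this path: it must
 * be a valid btree node that we can relock, and path->pos must still be
 * within it (in the direction indicated by @check_pos).
 */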
1007 static inline bool btree_path_good_node(struct btree_trans *trans,
1008                                         struct btree_path *path,
1009                                         unsigned l, int check_pos)
1010 {
1011         if (!is_btree_node(path, l) ||
1012             !bch2_btree_node_relock(trans, path, l))
1013                 return false;
1014
1015         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1016                 return false;
1017         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1018                 return false;
1019         return true;
1020 }
1021
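/*
 * Drop the path down to @new_level, unlocking levels above it that we no
 * longer want locked, and mark the path as needing to be re-traversed.
 */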
1022 static void btree_path_set_level_down(struct btree_trans *trans,
1023                                       struct btree_path *path,
1024                                       unsigned new_level)
1025 {
1026         unsigned l;
1027
1028         path->level = new_level;
1029
1030         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1031                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1032                         btree_node_unlock(trans, path, l);
1033
1034         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1035         bch2_btree_path_verify(trans, path);
1036 }
1037
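/*
 * Walk up from path->level until we find a node that's still good (lockable,
 * with path->pos inside it), dropping levels that aren't; then relock any
 * extra levels needed for locks_want, backing up further if that fails.
 * Returns the level we ended up at.
 */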
1038 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1039                                                      struct btree_path *path,
1040                                                      int check_pos)
1041 {
1042         unsigned i, l = path->level;
1043 again:
1044         while (btree_path_node(path, l) &&
1045                !btree_path_good_node(trans, path, l, check_pos))
1046                 __btree_path_set_level_up(trans, path, l++);
1047
1048         /* If we need intent locks, take them too: */
1049         for (i = l + 1;
1050              i < path->locks_want && btree_path_node(path, i);
1051              i++)
1052                 if (!bch2_btree_node_relock(trans, path, i)) {
1053                         while (l <= i)
1054                                 __btree_path_set_level_up(trans, path, l++);
1055                         goto again;
1056                 }
1057
1058         return l;
1059 }
1060
1061 /*
1062  * This is the main state machine for walking down the btree - walks down to a
1063  * specified depth
1064  *
1065  * Returns 0 on success, -EIO on error (error reading in a btree node).
1066  *
1067  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1068  * stashed in the iterator and returned from bch2_trans_exit().
1069  */
1070 static int btree_path_traverse_one(struct btree_trans *trans,
1071                                    struct btree_path *path,
1072                                    unsigned flags,
1073                                    unsigned long trace_ip)
1074 {
1075         unsigned depth_want = path->level;
1076         int ret = trans->restarted;
1077
1078         if (unlikely(ret))
1079                 goto out;
1080
1081         /*
1082          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1083          * and re-traverse the path without a transaction restart:
1084          */
1085         if (path->should_be_locked) {
1086                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1087                 goto out;
1088         }
1089
1090         if (path->cached) {
1091                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1092                 goto out;
1093         }
1094
1095         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1096                 goto out;
1097
1098         path->level = btree_path_up_until_good_node(trans, path, 0);
1099
1100         EBUG_ON(btree_path_node(path, path->level) &&
1101                 !btree_node_locked(path, path->level));
1102
1103         /*
1104          * Note: path->l[path->level].b may be temporarily NULL here - that
1105          * would indicate to other code that we got to the end of the btree,
1106          * here it indicates that relocking the root failed - it's critical that
1107          * btree_path_lock_root() comes next and that it can't fail
1108          */
1109         while (path->level > depth_want) {
1110                 ret = btree_path_node(path, path->level)
1111                         ? btree_path_down(trans, path, flags, trace_ip)
1112                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1113                 if (unlikely(ret)) {
1114                         if (ret == 1) {
1115                                 /*
1116                                  * No nodes at this level - got to the end of
1117                                  * the btree:
1118                                  */
1119                                 ret = 0;
1120                                 goto out;
1121                         }
1122
1123                         __bch2_btree_path_unlock(trans, path);
1124                         path->level = depth_want;
1125                         path->l[path->level].b = ERR_PTR(ret);
1126                         goto out;
1127                 }
1128         }
1129
1130         path->uptodate = BTREE_ITER_UPTODATE;
1131 out:
1132         BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
1133         bch2_btree_path_verify(trans, path);
1134         return ret;
1135 }
1136
1137 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
1138                                           struct btree_path *path, unsigned flags)
1139 {
1140         if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1141                 unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
1142                 u64 mask = ~(~0ULL << restart_probability_bits);
1143
1144                 if ((prandom_u32() & mask) == mask) {
1145                         trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
1146                         return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
1147                 }
1148         }
1149
1150         if (path->uptodate < BTREE_ITER_NEED_RELOCK)
1151                 return 0;
1152
1153         return  bch2_trans_cond_resched(trans) ?:
1154                 btree_path_traverse_one(trans, path, flags, _RET_IP_);
1155 }
1156
1157 static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1158                             struct btree_path *src)
1159 {
1160         unsigned i, offset = offsetof(struct btree_path, pos);
1161
1162         memcpy((void *) dst + offset,
1163                (void *) src + offset,
1164                sizeof(struct btree_path) - offset);
1165
1166         for (i = 0; i < BTREE_MAX_DEPTH; i++)
1167                 if (btree_node_locked(dst, i))
1168                         six_lock_increment(&dst->l[i].b->c.lock,
1169                                            __btree_lock_want(dst, i));
1170
1171         bch2_btree_path_check_sort(trans, dst, 0);
1172 }
1173
1174 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1175                                            bool intent)
1176 {
1177         struct btree_path *new = btree_path_alloc(trans, src);
1178
1179         btree_path_copy(trans, new, src);
1180         __btree_path_get(new, intent);
1181         return new;
1182 }
1183
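/*
 * Get a version of @path that's safe to modify: if other references exist, or
 * the path is marked preserve, clone it first so the original keeps its
 * position.
 */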
1184 inline struct btree_path * __must_check
1185 bch2_btree_path_make_mut(struct btree_trans *trans,
1186                          struct btree_path *path, bool intent,
1187                          unsigned long ip)
1188 {
1189         if (path->ref > 1 || path->preserve) {
1190                 __btree_path_put(path, intent);
1191                 path = btree_path_clone(trans, path, intent);
1192                 path->preserve = false;
1193 #ifdef CONFIG_BCACHEFS_DEBUG
1194                 path->ip_allocated = ip;
1195 #endif
1196                 btree_trans_verify_sorted(trans);
1197         }
1198
1199         path->should_be_locked = false;
1200         return path;
1201 }
1202
1203 struct btree_path * __must_check
1204 bch2_btree_path_set_pos(struct btree_trans *trans,
1205                    struct btree_path *path, struct bpos new_pos,
1206                    bool intent, unsigned long ip)
1207 {
1208         int cmp = bpos_cmp(new_pos, path->pos);
1209         unsigned l = path->level;
1210
1211         EBUG_ON(trans->restarted);
1212         EBUG_ON(!path->ref);
1213
1214         if (!cmp)
1215                 return path;
1216
1217         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1218
1219         path->pos = new_pos;
1220
1221         bch2_btree_path_check_sort(trans, path, cmp);
1222
1223         if (unlikely(path->cached)) {
1224                 btree_node_unlock(trans, path, 0);
1225                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1226                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1227                 goto out;
1228         }
1229
1230         l = btree_path_up_until_good_node(trans, path, cmp);
1231
1232         if (btree_path_node(path, l)) {
1233                 BUG_ON(!btree_node_locked(path, l));
1234                 /*
1235                  * We might have to skip over many keys, or just a few: try
1236                  * advancing the node iterator, and if we have to skip over too
1237                  * many keys just reinit it (or if we're rewinding, since that
1238                  * is expensive).
1239                  */
1240                 if (cmp < 0 ||
1241                     !btree_path_advance_to_pos(path, &path->l[l], 8))
1242                         __btree_path_level_init(path, l);
1243         }
1244
1245         if (l != path->level) {
1246                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1247                 __bch2_btree_path_unlock(trans, path);
1248         }
1249 out:
1250         bch2_btree_path_verify(trans, path);
1251         return path;
1252 }
1253
1254 /* Btree path: main interface: */
1255
1256 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1257 {
1258         struct btree_path *sib;
1259
1260         sib = prev_btree_path(trans, path);
1261         if (sib && !btree_path_cmp(sib, path))
1262                 return sib;
1263
1264         sib = next_btree_path(trans, path);
1265         if (sib && !btree_path_cmp(sib, path))
1266                 return sib;
1267
1268         return NULL;
1269 }
1270
1271 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1272 {
1273         struct btree_path *sib;
1274
1275         sib = prev_btree_path(trans, path);
1276         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1277                 return sib;
1278
1279         sib = next_btree_path(trans, path);
1280         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1281                 return sib;
1282
1283         return NULL;
1284 }
1285
1286 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1287 {
1288         __bch2_btree_path_unlock(trans, path);
1289         btree_path_list_remove(trans, path);
1290         trans->paths_allocated &= ~(1ULL << path->idx);
1291 }
1292
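/*
 * Drop a reference to @path. When the last reference goes away the path is
 * freed, unless it's preserving a position or locks that no duplicate path
 * can take over; if there is a duplicate, it inherits the preserve and
 * should_be_locked flags.
 */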
1293 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1294 {
1295         struct btree_path *dup;
1296
1297         EBUG_ON(trans->paths + path->idx != path);
1298         EBUG_ON(!path->ref);
1299
1300         if (!__btree_path_put(path, intent))
1301                 return;
1302
1303         dup = path->preserve
1304                 ? have_path_at_pos(trans, path)
1305                 : have_node_at_pos(trans, path);
1306
1307         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1308                 return;
1309
1310         if (path->should_be_locked &&
1311             !trans->restarted &&
1312             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1313                 return;
1314
1315         if (dup) {
1316                 dup->preserve           |= path->preserve;
1317                 dup->should_be_locked   |= path->should_be_locked;
1318         }
1319
1320         __bch2_path_free(trans, path);
1321 }
1322
1323 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1324 {
1325         struct btree_insert_entry *i;
1326
1327         prt_printf(buf, "transaction updates for %s journal seq %llu",
1328                trans->fn, trans->journal_res.seq);
1329         prt_newline(buf);
1330         printbuf_indent_add(buf, 2);
1331
1332         trans_for_each_update(trans, i) {
1333                 struct bkey_s_c old = { &i->old_k, i->old_v };
1334
1335                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1336                        bch2_btree_ids[i->btree_id],
1337                        i->cached,
1338                        (void *) i->ip_allocated);
1339                 prt_newline(buf);
1340
1341                 prt_printf(buf, "  old ");
1342                 bch2_bkey_val_to_text(buf, trans->c, old);
1343                 prt_newline(buf);
1344
1345                 prt_printf(buf, "  new ");
1346                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1347                 prt_newline(buf);
1348         }
1349
1350         printbuf_indent_sub(buf, 2);
1351 }
1352
1353 noinline __cold
1354 void bch2_dump_trans_updates(struct btree_trans *trans)
1355 {
1356         struct printbuf buf = PRINTBUF;
1357
1358         bch2_trans_updates_to_text(&buf, trans);
1359         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1360         printbuf_exit(&buf);
1361 }
1362
1363 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1364 {
1365         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1366                    path->idx, path->ref, path->intent_ref,
1367                    path->preserve ? 'P' : ' ',
1368                    path->should_be_locked ? 'S' : ' ',
1369                    bch2_btree_ids[path->btree_id],
1370                    path->level);
1371         bch2_bpos_to_text(out, path->pos);
1372
1373         prt_printf(out, " locks %u", path->nodes_locked);
1374 #ifdef CONFIG_BCACHEFS_DEBUG
1375         prt_printf(out, " %pS", (void *) path->ip_allocated);
1376 #endif
1377         prt_newline(out);
1378 }
1379
1380 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1381 {
1382         struct btree_path *path;
1383         unsigned idx;
1384
1385         trans_for_each_path_inorder(trans, path, idx)
1386                 bch2_btree_path_to_text(out, path);
1387 }
1388
1389 noinline __cold
1390 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1391 {
1392         struct printbuf buf = PRINTBUF;
1393
1394         bch2_trans_paths_to_text(&buf, trans);
1395         bch2_trans_updates_to_text(&buf, trans);
1396
1397         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1398         printbuf_exit(&buf);
1399 }
1400
1401 noinline
1402 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1403 {
1404         struct btree_transaction_stats *s = btree_trans_stats(trans);
1405         struct printbuf buf = PRINTBUF;
1406
1407         bch2_trans_paths_to_text(&buf, trans);
1408
1409         if (!buf.allocation_failure) {
1410                 mutex_lock(&s->lock);
1411                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1412                         s->nr_max_paths = trans->nr_max_paths =
1413                                 hweight64(trans->paths_allocated);
1414                         swap(s->max_paths_text, buf.buf);
1415                 }
1416                 mutex_unlock(&s->lock);
1417         }
1418
1419         printbuf_exit(&buf);
1420 }
1421
1422 static noinline void btree_path_overflow(struct btree_trans *trans)
1423 {
1424         bch2_dump_trans_paths_updates(trans);
1425         panic("trans path overflow\n");
1426 }
1427
1428 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1429                                                   struct btree_path *pos)
1430 {
1431         struct btree_path *path;
1432         unsigned idx;
1433
1434         if (unlikely(trans->paths_allocated ==
1435                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1436                 btree_path_overflow(trans);
1437
1438         idx = __ffs64(~trans->paths_allocated);
1439         trans->paths_allocated |= 1ULL << idx;
1440
1441         if (unlikely(idx > trans->nr_max_paths))
1442                 bch2_trans_update_max_paths(trans);
1443
1444         path = &trans->paths[idx];
1445
1446         path->idx               = idx;
1447         path->ref               = 0;
1448         path->intent_ref        = 0;
1449         path->nodes_locked      = 0;
1450
1451         btree_path_list_add(trans, pos, path);
1452         return path;
1453 }
1454
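/*
 * Get a path for @pos in @btree_id: reuse and reposition an existing path
 * with the same btree id, level and cached-ness if we have one, otherwise
 * allocate a new one. The returned path holds a reference; drop it with
 * bch2_path_put().
 *
 * Rough usage sketch (illustrative only - not taken from a real caller, and
 * error handling is elided):
 *
 *	struct btree_path *path;
 *	struct bkey u;
 *	struct bkey_s_c k;
 *
 *	path = bch2_path_get(trans, BTREE_ID_extents, pos, 0, 0, 0, _THIS_IP_);
 *	if (!bch2_btree_path_traverse(trans, path, 0))
 *		k = bch2_btree_path_peek_slot(path, &u);
 *	bch2_path_put(trans, path, false);
 */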
1455 struct btree_path *bch2_path_get(struct btree_trans *trans,
1456                                  enum btree_id btree_id, struct bpos pos,
1457                                  unsigned locks_want, unsigned level,
1458                                  unsigned flags, unsigned long ip)
1459 {
1460         struct btree_path *path, *path_pos = NULL;
1461         bool cached = flags & BTREE_ITER_CACHED;
1462         bool intent = flags & BTREE_ITER_INTENT;
1463         int i;
1464
1465         BUG_ON(trans->restarted);
1466         btree_trans_verify_sorted(trans);
1467         bch2_trans_verify_locks(trans);
1468
1469         trans_for_each_path_inorder(trans, path, i) {
1470                 if (__btree_path_cmp(path,
1471                                      btree_id,
1472                                      cached,
1473                                      pos,
1474                                      level) > 0)
1475                         break;
1476
1477                 path_pos = path;
1478         }
1479
1480         if (path_pos &&
1481             path_pos->cached    == cached &&
1482             path_pos->btree_id  == btree_id &&
1483             path_pos->level     == level) {
1484                 __btree_path_get(path_pos, intent);
1485                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1486         } else {
1487                 path = btree_path_alloc(trans, path_pos);
1488                 path_pos = NULL;
1489
1490                 __btree_path_get(path, intent);
1491                 path->pos                       = pos;
1492                 path->btree_id                  = btree_id;
1493                 path->cached                    = cached;
1494                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1495                 path->should_be_locked          = false;
1496                 path->level                     = level;
1497                 path->locks_want                = locks_want;
1498                 path->nodes_locked              = 0;
1499                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1500                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1501 #ifdef CONFIG_BCACHEFS_DEBUG
1502                 path->ip_allocated              = ip;
1503 #endif
1504                 btree_trans_verify_sorted(trans);
1505         }
1506
1507         if (!(flags & BTREE_ITER_NOPRESERVE))
1508                 path->preserve = true;
1509
1510         if (path->intent_ref)
1511                 locks_want = max(locks_want, level + 1);
1512
1513         /*
1514          * If the path has locks_want greater than requested, we don't downgrade
1515          * it here - on transaction restart, when a btree node split needs to
1516          * upgrade locks, we might be putting/getting the iterator again.
1517          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1518          * a successful transaction commit.
1519          */
1520
1521         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1522         if (locks_want > path->locks_want)
1523                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1524
1525         return path;
1526 }
1527
1528 inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1529 {
1530
1531         struct bkey_s_c k;
1532
1533         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1534         EBUG_ON(!btree_node_locked(path, path->level));
1535
1536         if (!path->cached) {
1537                 struct btree_path_level *l = path_l(path);
1538                 struct bkey_packed *_k;
1539
1540                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1541                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1542
1543                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
1544
1545                 if (!k.k || bpos_cmp(path->pos, k.k->p))
1546                         goto hole;
1547         } else {
1548                 struct bkey_cached *ck = (void *) path->l[0].b;
1549
1550                 EBUG_ON(ck &&
1551                         (path->btree_id != ck->key.btree_id ||
1552                          bkey_cmp(path->pos, ck->key.pos)));
1553                 EBUG_ON(!ck || !ck->valid);
1554
1555                 *u = ck->k->k;
1556                 k = bkey_i_to_s_c(ck->k);
1557         }
1558
1559         return k;
1560 hole:
1561         bkey_init(u);
1562         u->p = path->pos;
1563         return (struct bkey_s_c) { u, NULL };
1564 }
1565
1566 /* Btree iterators: */
1567
1568 int __must_check
1569 __bch2_btree_iter_traverse(struct btree_iter *iter)
1570 {
1571         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1572 }
1573
1574 int __must_check
1575 bch2_btree_iter_traverse(struct btree_iter *iter)
1576 {
1577         int ret;
1578
1579         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1580                                         btree_iter_search_key(iter),
1581                                         iter->flags & BTREE_ITER_INTENT,
1582                                         btree_iter_ip_allocated(iter));
1583
1584         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1585         if (ret)
1586                 return ret;
1587
1588         btree_path_set_should_be_locked(iter->path);
1589         return 0;
1590 }
1591
1592 /* Iterate across nodes (leaf and interior nodes) */
1593
1594 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1595 {
1596         struct btree_trans *trans = iter->trans;
1597         struct btree *b = NULL;
1598         int ret;
1599
1600         EBUG_ON(iter->path->cached);
1601         bch2_btree_iter_verify(iter);
1602
1603         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1604         if (ret)
1605                 goto err;
1606
1607         b = btree_path_node(iter->path, iter->path->level);
1608         if (!b)
1609                 goto out;
1610
1611         BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
1612
1613         bkey_init(&iter->k);
1614         iter->k.p = iter->pos = b->key.k.p;
1615
1616         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1617                                         iter->flags & BTREE_ITER_INTENT,
1618                                         btree_iter_ip_allocated(iter));
1619         btree_path_set_should_be_locked(iter->path);
1620 out:
1621         bch2_btree_iter_verify_entry_exit(iter);
1622         bch2_btree_iter_verify(iter);
1623
1624         return b;
1625 err:
1626         b = ERR_PTR(ret);
1627         goto out;
1628 }
1629
1630 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1631 {
1632         struct btree_trans *trans = iter->trans;
1633         struct btree_path *path = iter->path;
1634         struct btree *b = NULL;
1635         int ret;
1636
1637         BUG_ON(trans->restarted);
1638         EBUG_ON(iter->path->cached);
1639         bch2_btree_iter_verify(iter);
1640
1641         /* already at end? */
1642         if (!btree_path_node(path, path->level))
1643                 return NULL;
1644
1645         /* got to end? */
1646         if (!btree_path_node(path, path->level + 1)) {
1647                 btree_path_set_level_up(trans, path);
1648                 return NULL;
1649         }
1650
1651         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1652                 __bch2_btree_path_unlock(trans, path);
1653                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1654                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1655                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1656                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1657                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1658                 goto err;
1659         }
1660
1661         b = btree_path_node(path, path->level + 1);
1662
1663         if (!bpos_cmp(iter->pos, b->key.k.p)) {
1664                 __btree_path_set_level_up(trans, path, path->level++);
1665         } else {
1666                 /*
1667                  * Haven't gotten to the end of the parent node: go back down to
1668                  * the next child node
1669                  */
1670                 path = iter->path =
1671                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1672                                            iter->flags & BTREE_ITER_INTENT,
1673                                            btree_iter_ip_allocated(iter));
1674
1675                 btree_path_set_level_down(trans, path, iter->min_depth);
1676
1677                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1678                 if (ret)
1679                         goto err;
1680
1681                 b = path->l[path->level].b;
1682         }
1683
1684         bkey_init(&iter->k);
1685         iter->k.p = iter->pos = b->key.k.p;
1686
1687         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1688                                         iter->flags & BTREE_ITER_INTENT,
1689                                         btree_iter_ip_allocated(iter));
1690         btree_path_set_should_be_locked(iter->path);
1691         BUG_ON(iter->path->uptodate);
1692 out:
1693         bch2_btree_iter_verify_entry_exit(iter);
1694         bch2_btree_iter_verify(iter);
1695
1696         return b;
1697 err:
1698         b = ERR_PTR(ret);
1699         goto out;
1700 }
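
/*
 * Illustrative sketch, not compiled code: one way a caller might walk btree
 * nodes with the node iterators above. btree_id and the loop body are
 * placeholders; real users typically go through the for_each_btree_node()
 * helper in btree_iter.h rather than open coding this:
 *
 *	struct btree_iter iter;
 *	struct btree *b;
 *	int ret;
 *
 *	bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, 0, 0);
 *	for (b = bch2_btree_iter_peek_node(&iter);
 *	     !IS_ERR_OR_NULL(b);
 *	     b = bch2_btree_iter_next_node(&iter)) {
 *		// ... use b ...
 *	}
 *	ret = PTR_ERR_OR_ZERO(b);	// may be a transaction restart error
 *	bch2_trans_iter_exit(trans, &iter);
 */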
1701
1702 /* Iterate across keys (in leaf nodes only) */
1703
1704 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1705 {
1706         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1707                 struct bpos pos = iter->k.p;
1708                 bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1709                             ? bpos_cmp(pos, SPOS_MAX)
1710                             : bkey_cmp(pos, SPOS_MAX)) != 0;
1711
1712                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1713                         pos = bkey_successor(iter, pos);
1714                 bch2_btree_iter_set_pos(iter, pos);
1715                 return ret;
1716         } else {
1717                 if (!btree_path_node(iter->path, iter->path->level))
1718                         return true;
1719
1720                 iter->advanced = true;
1721                 return false;
1722         }
1723 }
1724
1725 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1726 {
1727         struct bpos pos = bkey_start_pos(&iter->k);
1728         bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1729                     ? bpos_cmp(pos, POS_MIN)
1730                     : bkey_cmp(pos, POS_MIN)) != 0;
1731
1732         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1733                 pos = bkey_predecessor(iter, pos);
1734         bch2_btree_iter_set_pos(iter, pos);
1735         return ret;
1736 }
1737
1738 static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1739                                                       enum btree_id btree_id,
1740                                                       struct bpos pos)
1741 {
1742         struct btree_insert_entry *i;
1743         struct bkey_i *ret = NULL;
1744
1745         trans_for_each_update(trans, i) {
1746                 if (i->btree_id < btree_id)
1747                         continue;
1748                 if (i->btree_id > btree_id)
1749                         break;
1750                 if (bpos_cmp(i->k->k.p, pos) < 0)
1751                         continue;
1752                 if (i->key_cache_already_flushed)
1753                         continue;
1754                 if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
1755                         ret = i->k;
1756         }
1757
1758         return ret;
1759 }
1760
1761 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1762                                        struct btree_iter *iter,
1763                                        struct bpos start_pos,
1764                                        struct bpos end_pos)
1765 {
1766         struct bkey_i *k;
1767
1768         if (bpos_cmp(start_pos, iter->journal_pos) < 0)
1769                 iter->journal_idx = 0;
1770
1771         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0,
1772                                         start_pos, end_pos,
1773                                         &iter->journal_idx);
1774
1775         iter->journal_pos = k ? k->k.p : end_pos;
1776         return k;
1777 }
1778
1779 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
1780                                             struct btree_iter *iter,
1781                                             struct bpos pos)
1782 {
1783         return bch2_btree_journal_peek(trans, iter, pos, pos);
1784 }
1785
1786 static noinline
1787 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1788                                          struct btree_iter *iter,
1789                                          struct bkey_s_c k)
1790 {
1791         struct bkey_i *next_journal =
1792                 bch2_btree_journal_peek(trans, iter, iter->path->pos,
1793                                 k.k ? k.k->p : iter->path->l[0].b->key.k.p);
1794
1795         if (next_journal) {
1796                 iter->k = next_journal->k;
1797                 k = bkey_i_to_s_c(next_journal);
1798         }
1799
1800         return k;
1801 }
1802
1803 /*
1804  * Checks btree key cache for key at iter->pos and returns it if present, or
1805  * bkey_s_c_null:
1806  */
1807 static noinline
1808 struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1809 {
1810         struct btree_trans *trans = iter->trans;
1811         struct bch_fs *c = trans->c;
1812         struct bkey u;
1813         int ret;
1814
1815         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1816                 return bkey_s_c_null;
1817
1818         if (!iter->key_cache_path)
1819                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1820                                                      iter->flags & BTREE_ITER_INTENT, 0,
1821                                                      iter->flags|BTREE_ITER_CACHED,
1822                                                      _THIS_IP_);
1823
1824         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1825                                         iter->flags & BTREE_ITER_INTENT,
1826                                         btree_iter_ip_allocated(iter));
1827
1828         ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
1829         if (unlikely(ret))
1830                 return bkey_s_c_err(ret);
1831
1832         btree_path_set_should_be_locked(iter->key_cache_path);
1833
1834         return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1835 }
1836
1837 static noinline
1838 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1839 {
1840         struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
1841         int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
1842
1843         return err ? bkey_s_c_err(err) : ret;
1844 }
1845
1846 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1847 {
1848         struct btree_trans *trans = iter->trans;
1849         struct bkey_i *next_update;
1850         struct bkey_s_c k, k2;
1851         int ret;
1852
1853         EBUG_ON(iter->path->cached);
1854         bch2_btree_iter_verify(iter);
1855
1856         while (1) {
1857                 struct btree_path_level *l;
1858
1859                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1860                                         iter->flags & BTREE_ITER_INTENT,
1861                                         btree_iter_ip_allocated(iter));
1862
1863                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1864                 if (unlikely(ret)) {
1865                         /* ensure that iter->k is consistent with iter->pos: */
1866                         bch2_btree_iter_set_pos(iter, iter->pos);
1867                         k = bkey_s_c_err(ret);
1868                         goto out;
1869                 }
1870
1871                 l = path_l(iter->path);
1872
1873                 if (unlikely(!l->b)) {
1874                         /* No btree nodes at requested level: */
1875                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1876                         k = bkey_s_c_null;
1877                         goto out;
1878                 }
1879
1880                 btree_path_set_should_be_locked(iter->path);
1881
1882                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
1883
1884                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1885                     k.k &&
1886                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1887                         k = k2;
1888                         ret = bkey_err(k);
1889                         if (ret) {
1890                                 bch2_btree_iter_set_pos(iter, iter->pos);
1891                                 goto out;
1892                         }
1893                 }
1894
1895                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
1896                         k = btree_trans_peek_journal(trans, iter, k);
1897
1898                 next_update = iter->flags & BTREE_ITER_WITH_UPDATES
1899                         ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
1900                         : NULL;
1901                 if (next_update &&
1902                     bpos_cmp(next_update->k.p,
1903                              k.k ? k.k->p : l->b->key.k.p) <= 0) {
1904                         iter->k = next_update->k;
1905                         k = bkey_i_to_s_c(next_update);
1906                 }
1907
1908                 if (k.k && bkey_deleted(k.k)) {
1909                         /*
1910                          * If we've got a whiteout, and it's after the search
1911                          * key, advance the search key to the whiteout instead
1912                          * of just after the whiteout - it might be a btree
1913                          * whiteout, with a real key at the same position, since
1914                          * in the btree, deleted keys sort before non-deleted keys.
1915                          */
1916                         search_key = bpos_cmp(search_key, k.k->p)
1917                                 ? k.k->p
1918                                 : bpos_successor(k.k->p);
1919                         continue;
1920                 }
1921
1922                 if (likely(k.k)) {
1923                         break;
1924                 } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
1925                         /* Advance to next leaf node: */
1926                         search_key = bpos_successor(l->b->key.k.p);
1927                 } else {
1928                         /* End of btree: */
1929                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1930                         k = bkey_s_c_null;
1931                         goto out;
1932                 }
1933         }
1934 out:
1935         bch2_btree_iter_verify(iter);
1936
1937         return k;
1938 }
1939
1940 /**
1941  * bch2_btree_iter_peek_upto: returns first key greater than or equal to
1942  * iterator's current position, but no greater than @end
1943  */
1944 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
1945 {
1946         struct btree_trans *trans = iter->trans;
1947         struct bpos search_key = btree_iter_search_key(iter);
1948         struct bkey_s_c k;
1949         struct bpos iter_pos;
1950         int ret;
1951
1952         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
1953
1954         if (iter->update_path) {
1955                 bch2_path_put(trans, iter->update_path,
1956                               iter->flags & BTREE_ITER_INTENT);
1957                 iter->update_path = NULL;
1958         }
1959
1960         bch2_btree_iter_verify_entry_exit(iter);
1961
1962         while (1) {
1963                 k = __bch2_btree_iter_peek(iter, search_key);
1964                 if (!k.k || bkey_err(k))
1965                         goto out_no_locked;
1966
1967                 /*
1968                  * iter->pos should be monotonically increasing, and always be
1969                  * equal to the key we just returned - except extents can
1970                  * straddle iter->pos:
1971                  */
1972                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1973                         iter_pos = k.k->p;
1974                 else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1975                         iter_pos = bkey_start_pos(k.k);
1976                 else
1977                         iter_pos = iter->pos;
1978
1979                 if (bkey_cmp(iter_pos, end) > 0) {
1980                         bch2_btree_iter_set_pos(iter, end);
1981                         k = bkey_s_c_null;
1982                         goto out_no_locked;
1983                 }
1984
1985                 if (iter->update_path &&
1986                     bkey_cmp(iter->update_path->pos, k.k->p)) {
1987                         bch2_path_put(trans, iter->update_path,
1988                                       iter->flags & BTREE_ITER_INTENT);
1989                         iter->update_path = NULL;
1990                 }
1991
1992                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
1993                     (iter->flags & BTREE_ITER_INTENT) &&
1994                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
1995                     !iter->update_path) {
1996                         struct bpos pos = k.k->p;
1997
1998                         if (pos.snapshot < iter->snapshot) {
1999                                 search_key = bpos_successor(k.k->p);
2000                                 continue;
2001                         }
2002
2003                         pos.snapshot = iter->snapshot;
2004
2005                         /*
2006                          * advance, same as on exit for iter->path, but only up
2007                          * to snapshot
2008                          */
2009                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2010                         iter->update_path = iter->path;
2011
2012                         iter->update_path = bch2_btree_path_set_pos(trans,
2013                                                 iter->update_path, pos,
2014                                                 iter->flags & BTREE_ITER_INTENT,
2015                                                 _THIS_IP_);
2016                 }
2017
2018                 /*
2019                  * We can never have a key in a leaf node at POS_MAX, so
2020                  * we don't have to check these successor() calls:
2021                  */
2022                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2023                     !bch2_snapshot_is_ancestor(trans->c,
2024                                                iter->snapshot,
2025                                                k.k->p.snapshot)) {
2026                         search_key = bpos_successor(k.k->p);
2027                         continue;
2028                 }
2029
2030                 if (bkey_whiteout(k.k) &&
2031                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2032                         search_key = bkey_successor(iter, k.k->p);
2033                         continue;
2034                 }
2035
2036                 break;
2037         }
2038
2039         iter->pos = iter_pos;
2040
2041         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2042                                 iter->flags & BTREE_ITER_INTENT,
2043                                 btree_iter_ip_allocated(iter));
2044
2045         btree_path_set_should_be_locked(iter->path);
2046 out_no_locked:
2047         if (iter->update_path) {
2048                 if (iter->update_path->uptodate &&
2049                     (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
2050                         k = bkey_s_c_err(ret);
2051                 else
2052                         btree_path_set_should_be_locked(iter->update_path);
2053         }
2054
2055         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2056                 iter->pos.snapshot = iter->snapshot;
2057
2058         ret = bch2_btree_iter_verify_ret(iter, k);
2059         if (unlikely(ret)) {
2060                 bch2_btree_iter_set_pos(iter, iter->pos);
2061                 k = bkey_s_c_err(ret);
2062         }
2063
2064         bch2_btree_iter_verify_entry_exit(iter);
2065
2066         return k;
2067 }
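
/*
 * Illustrative sketch, not compiled code: walking a key range with
 * bch2_btree_iter_peek_upto(). The btree id, positions and loop body are
 * placeholders; real users typically go through the for_each_btree_key*()
 * helpers in btree_iter.h rather than open coding this:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, start, 0);
 *	while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
 *		ret = bkey_err(k);
 *		if (ret)
 *			break;		// includes transaction restart errors
 *		// ... use k ...
 *		bch2_btree_iter_advance(&iter);
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */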
2068
2069 /**
2070  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2071  * to iterator's current position, returning keys from every level of the btree.
2072  * For keys at different levels of the btree that compare equal, the key from
2073  * the lower level (leaf) is returned first.
2074  */
2075 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2076 {
2077         struct btree_trans *trans = iter->trans;
2078         struct bkey_s_c k;
2079         int ret;
2080
2081         EBUG_ON(iter->path->cached);
2082         bch2_btree_iter_verify(iter);
2083         BUG_ON(iter->path->level < iter->min_depth);
2084         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2085         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2086
2087         while (1) {
2088                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2089                                         iter->flags & BTREE_ITER_INTENT,
2090                                         btree_iter_ip_allocated(iter));
2091
2092                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2093                 if (unlikely(ret)) {
2094                         /* ensure that iter->k is consistent with iter->pos: */
2095                         bch2_btree_iter_set_pos(iter, iter->pos);
2096                         k = bkey_s_c_err(ret);
2097                         goto out_no_locked;
2098                 }
2099
2100                 /* Already at end? */
2101                 if (!btree_path_node(iter->path, iter->path->level)) {
2102                         k = bkey_s_c_null;
2103                         goto out_no_locked;
2104                 }
2105
2106                 k = btree_path_level_peek_all(trans->c,
2107                                 &iter->path->l[iter->path->level], &iter->k);
2108
2109                 /* Check if we should go up to the parent node: */
2110                 if (!k.k ||
2111                     (iter->advanced &&
2112                      !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) {
2113                         iter->pos = path_l(iter->path)->b->key.k.p;
2114                         btree_path_set_level_up(trans, iter->path);
2115                         iter->advanced = false;
2116                         continue;
2117                 }
2118
2119                 /*
2120                  * Check if we should go back down to a leaf:
2121                  * If we're not in a leaf node, we only return the current key
2122                  * if it exactly matches iter->pos - otherwise we first have to
2123                  * go back to the leaf:
2124                  */
2125                 if (iter->path->level != iter->min_depth &&
2126                     (iter->advanced ||
2127                      !k.k ||
2128                      bpos_cmp(iter->pos, k.k->p))) {
2129                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2130                         iter->pos = bpos_successor(iter->pos);
2131                         iter->advanced = false;
2132                         continue;
2133                 }
2134
2135                 /* Check if we should go to the next key: */
2136                 if (iter->path->level == iter->min_depth &&
2137                     iter->advanced &&
2138                     k.k &&
2139                     !bpos_cmp(iter->pos, k.k->p)) {
2140                         iter->pos = bpos_successor(iter->pos);
2141                         iter->advanced = false;
2142                         continue;
2143                 }
2144
2145                 if (iter->advanced &&
2146                     iter->path->level == iter->min_depth &&
2147                     bpos_cmp(k.k->p, iter->pos))
2148                         iter->advanced = false;
2149
2150                 BUG_ON(iter->advanced);
2151                 BUG_ON(!k.k);
2152                 break;
2153         }
2154
2155         iter->pos = k.k->p;
2156         btree_path_set_should_be_locked(iter->path);
2157 out_no_locked:
2158         bch2_btree_iter_verify(iter);
2159
2160         return k;
2161 }
2162
2163 /**
2164  * bch2_btree_iter_next: returns first key greater than iterator's current
2165  * position
2166  */
2167 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2168 {
2169         if (!bch2_btree_iter_advance(iter))
2170                 return bkey_s_c_null;
2171
2172         return bch2_btree_iter_peek(iter);
2173 }
2174
2175 /**
2176  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2177  * iterator's current position
2178  */
2179 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2180 {
2181         struct btree_trans *trans = iter->trans;
2182         struct bpos search_key = iter->pos;
2183         struct btree_path *saved_path = NULL;
2184         struct bkey_s_c k;
2185         struct bkey saved_k;
2186         const struct bch_val *saved_v;
2187         int ret;
2188
2189         EBUG_ON(iter->path->cached || iter->path->level);
2190         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2191
2192         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2193                 return bkey_s_c_err(-EIO);
2194
2195         bch2_btree_iter_verify(iter);
2196         bch2_btree_iter_verify_entry_exit(iter);
2197
2198         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2199                 search_key.snapshot = U32_MAX;
2200
2201         while (1) {
2202                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2203                                                 iter->flags & BTREE_ITER_INTENT,
2204                                                 btree_iter_ip_allocated(iter));
2205
2206                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2207                 if (unlikely(ret)) {
2208                         /* ensure that iter->k is consistent with iter->pos: */
2209                         bch2_btree_iter_set_pos(iter, iter->pos);
2210                         k = bkey_s_c_err(ret);
2211                         goto out_no_locked;
2212                 }
2213
2214                 k = btree_path_level_peek(trans, iter->path,
2215                                           &iter->path->l[0], &iter->k);
2216                 if (!k.k ||
2217                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2218                      ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
2219                      : bpos_cmp(k.k->p, search_key) > 0))
2220                         k = btree_path_level_prev(trans, iter->path,
2221                                                   &iter->path->l[0], &iter->k);
2222
2223                 bch2_btree_path_check_sort(trans, iter->path, 0);
2224
2225                 if (likely(k.k)) {
2226                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2227                                 if (k.k->p.snapshot == iter->snapshot)
2228                                         goto got_key;
2229
2230                                 /*
2231                                  * If we have a saved candidate, and we're no
2232                                  * longer at the same _key_ (not pos), return
2233                                  * that candidate
2234                                  */
2235                                 if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
2236                                         bch2_path_put(trans, iter->path,
2237                                                       iter->flags & BTREE_ITER_INTENT);
2238                                         iter->path = saved_path;
2239                                         saved_path = NULL;
2240                                         iter->k = saved_k;
2241                                         k.v     = saved_v;
2242                                         goto got_key;
2243                                 }
2244
2245                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2246                                                               iter->snapshot,
2247                                                               k.k->p.snapshot)) {
2248                                         if (saved_path)
2249                                                 bch2_path_put(trans, saved_path,
2250                                                       iter->flags & BTREE_ITER_INTENT);
2251                                         saved_path = btree_path_clone(trans, iter->path,
2252                                                                 iter->flags & BTREE_ITER_INTENT);
2253                                         saved_k = *k.k;
2254                                         saved_v = k.v;
2255                                 }
2256
2257                                 search_key = bpos_predecessor(k.k->p);
2258                                 continue;
2259                         }
2260 got_key:
2261                         if (bkey_whiteout(k.k) &&
2262                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2263                                 search_key = bkey_predecessor(iter, k.k->p);
2264                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2265                                         search_key.snapshot = U32_MAX;
2266                                 continue;
2267                         }
2268
2269                         break;
2270                 } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
2271                         /* Advance to previous leaf node: */
2272                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2273                 } else {
2274                         /* Start of btree: */
2275                         bch2_btree_iter_set_pos(iter, POS_MIN);
2276                         k = bkey_s_c_null;
2277                         goto out_no_locked;
2278                 }
2279         }
2280
2281         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
2282
2283         /* Extents can straddle iter->pos: */
2284         if (bkey_cmp(k.k->p, iter->pos) < 0)
2285                 iter->pos = k.k->p;
2286
2287         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2288                 iter->pos.snapshot = iter->snapshot;
2289
2290         btree_path_set_should_be_locked(iter->path);
2291 out_no_locked:
2292         if (saved_path)
2293                 bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2294
2295         bch2_btree_iter_verify_entry_exit(iter);
2296         bch2_btree_iter_verify(iter);
2297
2298         return k;
2299 }
2300
2301 /**
2302  * bch2_btree_iter_prev: returns first key less than iterator's current
2303  * position
2304  */
2305 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2306 {
2307         if (!bch2_btree_iter_rewind(iter))
2308                 return bkey_s_c_null;
2309
2310         return bch2_btree_iter_peek_prev(iter);
2311 }
2312
2313 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2314 {
2315         struct btree_trans *trans = iter->trans;
2316         struct bpos search_key;
2317         struct bkey_s_c k;
2318         int ret;
2319
2320         bch2_btree_iter_verify(iter);
2321         bch2_btree_iter_verify_entry_exit(iter);
2322         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2323         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2324
2325         /* extents can't span inode numbers: */
2326         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2327             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2328                 if (iter->pos.inode == KEY_INODE_MAX)
2329                         return bkey_s_c_null;
2330
2331                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2332         }
2333
2334         search_key = btree_iter_search_key(iter);
2335         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2336                                         iter->flags & BTREE_ITER_INTENT,
2337                                         btree_iter_ip_allocated(iter));
2338
2339         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2340         if (unlikely(ret)) {
2341                 k = bkey_s_c_err(ret);
2342                 goto out_no_locked;
2343         }
2344
2345         if ((iter->flags & BTREE_ITER_CACHED) ||
2346             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2347                 struct bkey_i *next_update;
2348
2349                 if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
2350                     (next_update = btree_trans_peek_updates(trans,
2351                                                 iter->btree_id, search_key)) &&
2352                     !bpos_cmp(next_update->k.p, iter->pos)) {
2353                         iter->k = next_update->k;
2354                         k = bkey_i_to_s_c(next_update);
2355                         goto out;
2356                 }
2357
2358                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2359                     (next_update = bch2_btree_journal_peek_slot(trans,
2360                                         iter, iter->pos))) {
2361                         iter->k = next_update->k;
2362                         k = bkey_i_to_s_c(next_update);
2363                         goto out;
2364                 }
2365
2366                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2367                     (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
2368                         if (!bkey_err(k))
2369                                 iter->k = *k.k;
2370                         /* We're not returning a key from iter->path: */
2371                         goto out_no_locked;
2372                 }
2373
2374                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2375         } else {
2376                 struct bpos next;
2377
2378                 EBUG_ON(iter->path->level);
2379
2380                 if (iter->flags & BTREE_ITER_INTENT) {
2381                         struct btree_iter iter2;
2382                         struct bpos end = iter->pos;
2383
2384                         if (iter->flags & BTREE_ITER_IS_EXTENTS)
2385                                 end.offset = U64_MAX;
2386
2387                         bch2_trans_copy_iter(&iter2, iter);
2388                         k = bch2_btree_iter_peek_upto(&iter2, end);
2389
2390                         if (k.k && !bkey_err(k)) {
2391                                 iter->k = iter2.k;
2392                                 k.k = &iter->k;
2393                         }
2394                         bch2_trans_iter_exit(trans, &iter2);
2395                 } else {
2396                         struct bpos pos = iter->pos;
2397
2398                         k = bch2_btree_iter_peek(iter);
2399                         if (unlikely(bkey_err(k)))
2400                                 bch2_btree_iter_set_pos(iter, pos);
2401                         else
2402                                 iter->pos = pos;
2403                 }
2404
2405                 if (unlikely(bkey_err(k)))
2406                         goto out_no_locked;
2407
2408                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2409
2410                 if (bkey_cmp(iter->pos, next) < 0) {
2411                         bkey_init(&iter->k);
2412                         iter->k.p = iter->pos;
2413
2414                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2415                                 bch2_key_resize(&iter->k,
2416                                                 min_t(u64, KEY_SIZE_MAX,
2417                                                       (next.inode == iter->pos.inode
2418                                                        ? next.offset
2419                                                        : KEY_OFFSET_MAX) -
2420                                                       iter->pos.offset));
2421                                 EBUG_ON(!iter->k.size);
2422                         }
2423
2424                         k = (struct bkey_s_c) { &iter->k, NULL };
2425                 }
2426         }
2427 out:
2428         btree_path_set_should_be_locked(iter->path);
2429 out_no_locked:
2430         bch2_btree_iter_verify_entry_exit(iter);
2431         bch2_btree_iter_verify(iter);
2432         ret = bch2_btree_iter_verify_ret(iter, k);
2433         if (unlikely(ret))
2434                 return bkey_s_c_err(ret);
2435
2436         return k;
2437 }
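
/*
 * Illustrative sketch, not compiled code: a point lookup with
 * bch2_btree_iter_peek_slot(). Unlike peek, peek_slot always returns a key at
 * the iterator's position - a synthesized deleted key if nothing is present.
 * The btree id and pos are placeholders:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	if (!ret && bkey_deleted(k.k))
 *		ret = -ENOENT;		// hole at pos
 *	// ... on success, use k ...
 *	bch2_trans_iter_exit(trans, &iter);
 */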
2438
2439 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2440 {
2441         if (!bch2_btree_iter_advance(iter))
2442                 return bkey_s_c_null;
2443
2444         return bch2_btree_iter_peek_slot(iter);
2445 }
2446
2447 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2448 {
2449         if (!bch2_btree_iter_rewind(iter))
2450                 return bkey_s_c_null;
2451
2452         return bch2_btree_iter_peek_slot(iter);
2453 }
2454
2455 /* new transactional stuff: */
2456
2457 static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
2458                                                 struct btree_path *path)
2459 {
2460         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2461         EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2462         EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
2463 }
2464
2465 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2466 {
2467 #ifdef CONFIG_BCACHEFS_DEBUG
2468         unsigned i;
2469
2470         for (i = 0; i < trans->nr_sorted; i++)
2471                 btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
2472 #endif
2473 }
2474
2475 static void btree_trans_verify_sorted(struct btree_trans *trans)
2476 {
2477 #ifdef CONFIG_BCACHEFS_DEBUG
2478         struct btree_path *path, *prev = NULL;
2479         unsigned i;
2480
2481         if (!bch2_debug_check_iterators)
2482                 return;
2483
2484         trans_for_each_path_inorder(trans, path, i) {
2485                 if (prev && btree_path_cmp(prev, path) > 0) {
2486                         bch2_dump_trans_paths_updates(trans);
2487                         panic("trans paths out of order!\n");
2488                 }
2489                 prev = path;
2490         }
2491 #endif
2492 }
2493
2494 static inline void btree_path_swap(struct btree_trans *trans,
2495                                    struct btree_path *l, struct btree_path *r)
2496 {
2497         swap(l->sorted_idx, r->sorted_idx);
2498         swap(trans->sorted[l->sorted_idx],
2499              trans->sorted[r->sorted_idx]);
2500
2501         btree_path_verify_sorted_ref(trans, l);
2502         btree_path_verify_sorted_ref(trans, r);
2503 }
2504
2505 inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
2506                                        int cmp)
2507 {
2508         struct btree_path *n;
2509
2510         if (cmp <= 0) {
2511                 n = prev_btree_path(trans, path);
2512                 if (n && btree_path_cmp(n, path) > 0) {
2513                         do {
2514                                 btree_path_swap(trans, n, path);
2515                                 n = prev_btree_path(trans, path);
2516                         } while (n && btree_path_cmp(n, path) > 0);
2517
2518                         goto out;
2519                 }
2520         }
2521
2522         if (cmp >= 0) {
2523                 n = next_btree_path(trans, path);
2524                 if (n && btree_path_cmp(path, n) > 0) {
2525                         do {
2526                                 btree_path_swap(trans, path, n);
2527                                 n = next_btree_path(trans, path);
2528                         } while (n && btree_path_cmp(path, n) > 0);
2529                 }
2530         }
2531 out:
2532         btree_trans_verify_sorted(trans);
2533 }
2534
2535 static inline void btree_path_list_remove(struct btree_trans *trans,
2536                                           struct btree_path *path)
2537 {
2538         unsigned i;
2539
2540         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2541
2542         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2543
2544         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2545                 trans->paths[trans->sorted[i]].sorted_idx = i;
2546
2547         path->sorted_idx = U8_MAX;
2548
2549         btree_trans_verify_sorted_refs(trans);
2550 }
2551
2552 static inline void btree_path_list_add(struct btree_trans *trans,
2553                                        struct btree_path *pos,
2554                                        struct btree_path *path)
2555 {
2556         unsigned i;
2557
2558         btree_trans_verify_sorted_refs(trans);
2559
2560         path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2561
2562         if (trans->in_traverse_all &&
2563             trans->traverse_all_idx != U8_MAX &&
2564             trans->traverse_all_idx >= path->sorted_idx)
2565                 trans->traverse_all_idx++;
2566
2567         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2568
2569         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2570                 trans->paths[trans->sorted[i]].sorted_idx = i;
2571
2572         btree_trans_verify_sorted_refs(trans);
2573 }
2574
2575 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2576 {
2577         if (iter->path)
2578                 bch2_path_put(trans, iter->path,
2579                               iter->flags & BTREE_ITER_INTENT);
2580         if (iter->update_path)
2581                 bch2_path_put(trans, iter->update_path,
2582                               iter->flags & BTREE_ITER_INTENT);
2583         if (iter->key_cache_path)
2584                 bch2_path_put(trans, iter->key_cache_path,
2585                               iter->flags & BTREE_ITER_INTENT);
2586         iter->path = NULL;
2587         iter->update_path = NULL;
2588         iter->key_cache_path = NULL;
2589 }
2590
2591 static inline void __bch2_trans_iter_init(struct btree_trans *trans,
2592                                           struct btree_iter *iter,
2593                                           unsigned btree_id, struct bpos pos,
2594                                           unsigned locks_want,
2595                                           unsigned depth,
2596                                           unsigned flags,
2597                                           unsigned long ip)
2598 {
2599         if (trans->restarted)
2600                 panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n",
2601                       bch2_err_str(trans->restarted),
2602                       (void *) trans->last_restarted_ip);
2603
2604         if (flags & BTREE_ITER_ALL_LEVELS)
2605                 flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
2606
2607         if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
2608             btree_node_type_is_extents(btree_id))
2609                 flags |= BTREE_ITER_IS_EXTENTS;
2610
2611         if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
2612             !btree_type_has_snapshots(btree_id))
2613                 flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
2614
2615         if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
2616             btree_type_has_snapshots(btree_id))
2617                 flags |= BTREE_ITER_FILTER_SNAPSHOTS;
2618
2619         if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
2620                 flags |= BTREE_ITER_WITH_JOURNAL;
2621
2622         iter->trans     = trans;
2623         iter->path      = NULL;
2624         iter->update_path = NULL;
2625         iter->key_cache_path = NULL;
2626         iter->btree_id  = btree_id;
2627         iter->min_depth = depth;
2628         iter->flags     = flags;
2629         iter->snapshot  = pos.snapshot;
2630         iter->pos       = pos;
2631         iter->k.type    = KEY_TYPE_deleted;
2632         iter->k.p       = pos;
2633         iter->k.size    = 0;
2634         iter->journal_idx = 0;
2635         iter->journal_pos = POS_MIN;
2636 #ifdef CONFIG_BCACHEFS_DEBUG
2637         iter->ip_allocated = ip;
2638 #endif
2639
2640         iter->path = bch2_path_get(trans, btree_id, iter->pos,
2641                                    locks_want, depth, flags, ip);
2642 }
2643
2644 void bch2_trans_iter_init(struct btree_trans *trans,
2645                           struct btree_iter *iter,
2646                           unsigned btree_id, struct bpos pos,
2647                           unsigned flags)
2648 {
2649         if (!btree_id_cached(trans->c, btree_id)) {
2650                 flags &= ~BTREE_ITER_CACHED;
2651                 flags &= ~BTREE_ITER_WITH_KEY_CACHE;
2652         } else if (!(flags & BTREE_ITER_CACHED))
2653                 flags |= BTREE_ITER_WITH_KEY_CACHE;
2654
2655         __bch2_trans_iter_init(trans, iter, btree_id, pos,
2656                                0, 0, flags, _RET_IP_);
2657 }
2658
2659 void bch2_trans_node_iter_init(struct btree_trans *trans,
2660                                struct btree_iter *iter,
2661                                enum btree_id btree_id,
2662                                struct bpos pos,
2663                                unsigned locks_want,
2664                                unsigned depth,
2665                                unsigned flags)
2666 {
2667         __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
2668                                BTREE_ITER_NOT_EXTENTS|
2669                                __BTREE_ITER_ALL_SNAPSHOTS|
2670                                BTREE_ITER_ALL_SNAPSHOTS|
2671                                flags, _RET_IP_);
2672         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2673         BUG_ON(iter->path->level        != depth);
2674         BUG_ON(iter->min_depth          != depth);
2675 }
2676
2677 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2678 {
2679         *dst = *src;
2680         if (src->path)
2681                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2682         if (src->update_path)
2683                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2684         dst->key_cache_path = NULL;
2685 }
2686
2687 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2688 {
2689         unsigned new_top = trans->mem_top + size;
2690         size_t old_bytes = trans->mem_bytes;
2691         size_t new_bytes = roundup_pow_of_two(new_top);
2692         void *new_mem;
2693         void *p;
2694
2695         trans->mem_max = max(trans->mem_max, new_top);
2696
2697         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2698
2699         new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2700         if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2701                 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2702                 new_bytes = BTREE_TRANS_MEM_MAX;
2703                 kfree(trans->mem);
2704         }
2705
2706         if (!new_mem)
2707                 return ERR_PTR(-ENOMEM);
2708
2709         trans->mem = new_mem;
2710         trans->mem_bytes = new_bytes;
2711
2712         if (old_bytes) {
2713                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2714                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2715         }
2716
2717         p = trans->mem + trans->mem_top;
2718         trans->mem_top += size;
2719         memset(p, 0, size);
2720         return p;
2721 }
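
/*
 * Illustrative sketch, not compiled code: allocating from the transaction's
 * bump allocator. Callers normally use the bch2_trans_kmalloc() wrapper in
 * btree_iter.h, which falls back to __bch2_trans_kmalloc() when the
 * preallocated buffer is exhausted. The key point is that growing trans->mem
 * mid transaction returns a transaction restart error, which must propagate
 * like any other restart:
 *
 *	struct bkey_i *new_k = bch2_trans_kmalloc(trans, sizeof(*new_k));
 *	if (IS_ERR(new_k))
 *		return PTR_ERR(new_k);	// -ENOMEM or a restart error
 */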
2722
2723 /**
2724  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2725  * @trans: transaction to reset
2726  *
2727  * While iterating over nodes or updating nodes, an attempt to lock a btree node
2728  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2729  * occurs, bch2_trans_begin() should be called and the transaction retried.
2730  */
2731 u32 bch2_trans_begin(struct btree_trans *trans)
2732 {
2733         struct btree_path *path;
2734
2735         bch2_trans_reset_updates(trans);
2736
2737         trans->restart_count++;
2738         trans->mem_top                  = 0;
2739
2740         if (trans->fs_usage_deltas) {
2741                 trans->fs_usage_deltas->used = 0;
2742                 memset((void *) trans->fs_usage_deltas +
2743                        offsetof(struct replicas_delta_list, memset_start), 0,
2744                        (void *) &trans->fs_usage_deltas->memset_end -
2745                        (void *) &trans->fs_usage_deltas->memset_start);
2746         }
2747
2748         trans_for_each_path(trans, path) {
2749                 path->should_be_locked = false;
2750
2751                 /*
2752                  * If the transaction wasn't restarted, we're presuming to be
2753                  * doing something new: don't keep iterators except the ones that
2754                  * are in use - except for the subvolumes btree:
2755                  */
2756                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2757                         path->preserve = false;
2758
2759                 /*
2760                  * XXX: we probably shouldn't be doing this if the transaction
2761                  * was restarted, but currently we still overflow transaction
2762                  * iterators if we do that
2763                  */
2764                 if (!path->ref && !path->preserve)
2765                         __bch2_path_free(trans, path);
2766                 else
2767                         path->preserve = false;
2768         }
2769
2770         if (!trans->restarted &&
2771             (need_resched() ||
2772              ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2773                 bch2_trans_unlock(trans);
2774                 cond_resched();
2775                 bch2_trans_relock(trans);
2776         }
2777
2778         trans->last_restarted_ip = _RET_IP_;
2779         if (trans->restarted)
2780                 bch2_btree_path_traverse_all(trans);
2781
2782         trans->last_begin_time = ktime_get_ns();
2783         return trans->restart_count;
2784 }
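
/*
 * Illustrative sketch, not compiled code: the retry loop described above.
 * do_stuff() is a placeholder for the caller's transaction body; helpers such
 * as lockrestart_do() in btree_update.h wrap this pattern:
 *
 *	int ret;
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_stuff(trans) ?:
 *		      bch2_trans_commit(trans, NULL, NULL, 0);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */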
2785
2786 void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
2787 {
2788         if (trans_was_restarted(trans, restart_count))
2789                 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
2790                       trans->restart_count, restart_count,
2791                       (void *) trans->last_restarted_ip);
2792 }
2793
2794 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2795 {
2796         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2797         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2798         void *p = NULL;
2799
2800         BUG_ON(trans->used_mempool);
2801
2802 #ifdef __KERNEL__
2803         p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
2804 #endif
2805         if (!p)
2806                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2807
2808         trans->paths            = p; p += paths_bytes;
2809         trans->updates          = p; p += updates_bytes;
2810 }
2811
2812 static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c,
2813                                         const char *fn)
2814 {
2815         unsigned i;
2816
2817         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++)
2818                 if (!c->btree_transaction_fns[i] ||
2819                     c->btree_transaction_fns[i] == fn) {
2820                         c->btree_transaction_fns[i] = fn;
2821                         return i;
2822                 }
2823
2824         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2825         return i;
2826 }
2827
2828 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn)
2829         __acquires(&c->btree_trans_barrier)
2830 {
2831         struct btree_transaction_stats *s;
2832         struct btree_trans *pos;
2833
2834         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
2835
2836         memset(trans, 0, sizeof(*trans));
2837         trans->c                = c;
2838         trans->fn               = fn;
2839         trans->last_begin_time  = ktime_get_ns();
2840         trans->fn_idx           = bch2_trans_get_fn_idx(trans, c, fn);
2841         trans->locking_wait.task = current;
2842         closure_init_stack(&trans->ref);
2843
2844         bch2_trans_alloc_paths(trans, c);
2845
2846         s = btree_trans_stats(trans);
2847         if (s) {
2848                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2849
2850                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2851
2852                 if (unlikely(!trans->mem)) {
2853                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2854                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2855                 } else {
2856                         trans->mem_bytes = expected_mem_bytes;
2857                 }
2858
2859                 trans->nr_max_paths = s->nr_max_paths;
2860         }
2861
2862         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2863
2864         mutex_lock(&c->btree_trans_lock);
2865         list_for_each_entry(pos, &c->btree_trans_list, list) {
2866                 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
2867                         list_add_tail(&trans->list, &pos->list);
2868                         goto list_add_done;
2869                 }
2870         }
2871         list_add_tail(&trans->list, &c->btree_trans_list);
2872 list_add_done:
2873         mutex_unlock(&c->btree_trans_lock);
2874 }
2875
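/*
 * Debug-build check run at transaction exit: any path still holding a
 * reference at this point was leaked by the transaction named by trans->fn,
 * so report each offender and flag a fatal error.
 */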
2876 static void check_btree_paths_leaked(struct btree_trans *trans)
2877 {
2878 #ifdef CONFIG_BCACHEFS_DEBUG
2879         struct bch_fs *c = trans->c;
2880         struct btree_path *path;
2881
2882         trans_for_each_path(trans, path)
2883                 if (path->ref)
2884                         goto leaked;
2885         return;
2886 leaked:
2887         bch_err(c, "btree paths leaked from %s!", trans->fn);
2888         trans_for_each_path(trans, path)
2889                 if (path->ref)
2890                         printk(KERN_ERR "  btree %s %pS\n",
2891                                bch2_btree_ids[path->btree_id],
2892                                (void *) path->ip_allocated);
2893         /* Be noisy about this: */
2894         bch2_fatal_error(c);
2895 #endif
2896 }
2897
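/*
 * Tear down a transaction set up by __bch2_trans_init(): unlock, drop the
 * path references still held by pending updates, check for leaked paths,
 * remove the transaction from the debug list, drop the SRCU read lock,
 * release journal reservations, and return the preallocated memory and path
 * buffers to their pools (or to the per-cpu buffer on kernel builds). The
 * poison pointers stored at the end turn use-after-exit bugs into immediate
 * crashes rather than silent reuse of freed memory.
 */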
2898 void bch2_trans_exit(struct btree_trans *trans)
2899         __releases(&c->btree_trans_barrier)
2900 {
2901         struct btree_insert_entry *i;
2902         struct bch_fs *c = trans->c;
2903         struct btree_transaction_stats *s = btree_trans_stats(trans);
2904
2905         bch2_trans_unlock(trans);
2906
2907         closure_sync(&trans->ref);
2908
2909         if (s)
2910                 s->max_mem = max(s->max_mem, trans->mem_max);
2911
2912         trans_for_each_update(trans, i)
2913                 __btree_path_put(i->path, true);
2914         trans->nr_updates               = 0;
2915
2916         check_btree_paths_leaked(trans);
2917
2918         mutex_lock(&c->btree_trans_lock);
2919         list_del(&trans->list);
2920         mutex_unlock(&c->btree_trans_lock);
2921
2922         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2923
2924         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
2925
2926         kfree(trans->extra_journal_entries.data);
2927
2928         if (trans->fs_usage_deltas) {
2929                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2930                     REPLICAS_DELTA_LIST_MAX)
2931                         mempool_free(trans->fs_usage_deltas,
2932                                      &c->replicas_delta_pool);
2933                 else
2934                         kfree(trans->fs_usage_deltas);
2935         }
2936
2937         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2938                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
2939         else
2940                 kfree(trans->mem);
2941
2942 #ifdef __KERNEL__
2943         /*
2944          * Userspace doesn't have a real percpu implementation:
2945          */
2946         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
2947 #endif
2948
2949         if (trans->paths)
2950                 mempool_free(trans->paths, &c->btree_paths_pool);
2951
2952         trans->mem      = (void *) 0x1;
2953         trans->paths    = (void *) 0x1;
2954 }
2955
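/*
 * Print a one line description of a btree node or key cache entry: address,
 * cached flag, level, btree id and position, plus the read/intent/write
 * lock counts and the pid of the current lock owner, if any.
 */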
2956 static void __maybe_unused
2957 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
2958                                       struct btree_bkey_cached_common *b)
2959 {
2960         struct six_lock_count c = six_lock_counts(&b->lock);
2961         struct task_struct *owner;
2962         pid_t pid;
2963
2964         rcu_read_lock();
2965         owner = READ_ONCE(b->lock.owner);
2966         pid = owner ? owner->pid : 0;
2967         rcu_read_unlock();
2968
2969         prt_tab(out);
2970         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
2971                    b->level, bch2_btree_ids[b->btree_id]);
2972         bch2_bpos_to_text(out, btree_node_pos(b));
2973
2974         prt_tab(out);
2975         prt_printf(out, " locks %u:%u:%u held by pid %u",
2976                    c.n[0], c.n[1], c.n[2], pid);
2977 }
2978
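/*
 * Print a transaction's held and wanted btree node locks, one path per
 * line; used by the debug code that dumps in-flight transactions.
 */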
2979 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
2980 {
2981         struct btree_path *path;
2982         struct btree_bkey_cached_common *b;
2983         static char lock_types[] = { 'r', 'i', 'w' };
2984         unsigned l;
2985
2986         if (!out->nr_tabstops) {
2987                 printbuf_tabstop_push(out, 16);
2988                 printbuf_tabstop_push(out, 32);
2989         }
2990
2991         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
2992
2993         trans_for_each_path(trans, path) {
2994                 if (!path->nodes_locked)
2995                         continue;
2996
2997                 prt_printf(out, "  path %u %c l=%u %s:",
2998                        path->idx,
2999                        path->cached ? 'c' : 'b',
3000                        path->level,
3001                        bch2_btree_ids[path->btree_id]);
3002                 bch2_bpos_to_text(out, path->pos);
3003                 prt_newline(out);
3004
3005                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3006                         if (btree_node_locked(path, l) &&
3007                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3008                                 prt_printf(out, "    %c l=%u ",
3009                                            lock_types[btree_node_locked_type(path, l)], l);
3010                                 bch2_btree_bkey_cached_common_to_text(out, b);
3011                                 prt_newline(out);
3012                         }
3013                 }
3014         }
3015
3016         b = READ_ONCE(trans->locking);
3017         if (b) {
3018                 prt_str(out, "  want");
3019                 prt_newline(out);
3020                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3021                 bch2_btree_bkey_cached_common_to_text(out, b);
3022                 prt_newline(out);
3023         }
3024 }
3025
3026 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3027 {
3028         if (c->btree_trans_barrier_initialized)
3029                 cleanup_srcu_struct(&c->btree_trans_barrier);
3030         mempool_exit(&c->btree_trans_mem_pool);
3031         mempool_exit(&c->btree_paths_pool);
3032 }
3033
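/*
 * Filesystem-wide setup for the btree iterator code: per-function stats
 * locks, the debug transaction list, the btree_trans_barrier SRCU struct,
 * and the mempools backing bch2_trans_alloc_paths() and the transaction
 * memory buffer. The paths mempool is sized for exactly the layout
 * bch2_trans_alloc_paths() assumes: BTREE_ITER_MAX paths plus
 * BTREE_ITER_MAX insert entries.
 */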
3034 int bch2_fs_btree_iter_init(struct bch_fs *c)
3035 {
3036         unsigned i, nr = BTREE_ITER_MAX;
3037         int ret;
3038
3039         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
3040                 mutex_init(&c->btree_transaction_stats[i].lock);
3041
3042         INIT_LIST_HEAD(&c->btree_trans_list);
3043         mutex_init(&c->btree_trans_lock);
3044
3045         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3046                         sizeof(struct btree_path) * nr +
3047                         sizeof(struct btree_insert_entry) * nr) ?:
3048                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3049                                           BTREE_TRANS_MEM_MAX) ?:
3050                 init_srcu_struct(&c->btree_trans_barrier);
3051         if (!ret)
3052                 c->btree_trans_barrier_initialized = true;
3053         return ret;
3054 }