libbcachefs/btree_iter.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18
19 #include <linux/prandom.h>
20 #include <linux/prefetch.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_trans_verify_sorted(struct btree_trans *);
24 inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
25
26 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
27 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
28                                        struct btree_path *);
29
30 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
31 {
32 #ifdef CONFIG_BCACHEFS_DEBUG
33         return iter->ip_allocated;
34 #else
35         return 0;
36 #endif
37 }
38
39 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
40
41 /*
42  * Unlocks before scheduling
43  * Note: does not revalidate iterator
44  */
45 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
46 {
47         if (need_resched() || race_fault()) {
48                 bch2_trans_unlock(trans);
49                 schedule();
50                 return bch2_trans_relock(trans);
51         } else {
52                 return 0;
53         }
54 }
55
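/*
 * Three-way comparator for btree paths: orders by btree id, then uncached
 * before cached, then position, then level (descending). This is the order
 * the transaction's sorted list of paths is kept in.
 */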
56 static inline int __btree_path_cmp(const struct btree_path *l,
57                                    enum btree_id        r_btree_id,
58                                    bool                 r_cached,
59                                    struct bpos          r_pos,
60                                    unsigned             r_level)
61 {
62         /*
63          * Must match lock ordering as defined by __bch2_btree_node_lock:
64          */
65         return   cmp_int(l->btree_id,   r_btree_id) ?:
66                  cmp_int((int) l->cached,       (int) r_cached) ?:
67                  bpos_cmp(l->pos,       r_pos) ?:
68                 -cmp_int(l->level,      r_level);
69 }
70
71 static inline int btree_path_cmp(const struct btree_path *l,
72                                  const struct btree_path *r)
73 {
74         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
75 }
76
77 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
78 {
79         /* Are we iterating over keys in all snapshots? */
80         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
81                 p = bpos_successor(p);
82         } else {
83                 p = bpos_nosnap_successor(p);
84                 p.snapshot = iter->snapshot;
85         }
86
87         return p;
88 }
89
90 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
91 {
92         /* Are we iterating over keys in all snapshots? */
93         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
94                 p = bpos_predecessor(p);
95         } else {
96                 p = bpos_nosnap_predecessor(p);
97                 p.snapshot = iter->snapshot;
98         }
99
100         return p;
101 }
102
103 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
104 {
105         struct bpos pos = iter->pos;
106
107         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
108             bkey_cmp(pos, POS_MAX))
109                 pos = bkey_successor(iter, pos);
110         return pos;
111 }
112
113 static inline bool btree_path_pos_before_node(struct btree_path *path,
114                                               struct btree *b)
115 {
116         return bpos_cmp(path->pos, b->data->min_key) < 0;
117 }
118
119 static inline bool btree_path_pos_after_node(struct btree_path *path,
120                                              struct btree *b)
121 {
122         return bpos_cmp(b->key.k.p, path->pos) < 0;
123 }
124
125 static inline bool btree_path_pos_in_node(struct btree_path *path,
126                                           struct btree *b)
127 {
128         return path->btree_id == b->c.btree_id &&
129                 !btree_path_pos_before_node(path, b) &&
130                 !btree_path_pos_after_node(path, b);
131 }
132
133 /* Btree iterator: */
134
135 #ifdef CONFIG_BCACHEFS_DEBUG
136
137 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
138                                           struct btree_path *path)
139 {
140         struct bkey_cached *ck;
141         bool locked = btree_node_locked(path, 0);
142
143         if (!bch2_btree_node_relock(trans, path, 0))
144                 return;
145
146         ck = (void *) path->l[0].b;
147         BUG_ON(ck->key.btree_id != path->btree_id ||
148                bkey_cmp(ck->key.pos, path->pos));
149
150         if (!locked)
151                 btree_node_unlock(trans, path, 0);
152 }
153
154 static void bch2_btree_path_verify_level(struct btree_trans *trans,
155                                 struct btree_path *path, unsigned level)
156 {
157         struct btree_path_level *l;
158         struct btree_node_iter tmp;
159         bool locked;
160         struct bkey_packed *p, *k;
161         struct printbuf buf1 = PRINTBUF;
162         struct printbuf buf2 = PRINTBUF;
163         struct printbuf buf3 = PRINTBUF;
164         const char *msg;
165
166         if (!bch2_debug_check_iterators)
167                 return;
168
169         l       = &path->l[level];
170         tmp     = l->iter;
171         locked  = btree_node_locked(path, level);
172
173         if (path->cached) {
174                 if (!level)
175                         bch2_btree_path_verify_cached(trans, path);
176                 return;
177         }
178
179         if (!btree_path_node(path, level))
180                 return;
181
182         if (!bch2_btree_node_relock_notrace(trans, path, level))
183                 return;
184
185         BUG_ON(!btree_path_pos_in_node(path, l->b));
186
187         bch2_btree_node_iter_verify(&l->iter, l->b);
188
189         /*
190          * For interior nodes, the iterator will have skipped past deleted keys:
191          */
192         p = level
193                 ? bch2_btree_node_iter_prev(&tmp, l->b)
194                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
195         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
196
197         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
198                 msg = "before";
199                 goto err;
200         }
201
202         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
203                 msg = "after";
204                 goto err;
205         }
206
207         if (!locked)
208                 btree_node_unlock(trans, path, level);
209         return;
210 err:
211         bch2_bpos_to_text(&buf1, path->pos);
212
213         if (p) {
214                 struct bkey uk = bkey_unpack_key(l->b, p);
215                 bch2_bkey_to_text(&buf2, &uk);
216         } else {
217                 prt_printf(&buf2, "(none)");
218         }
219
220         if (k) {
221                 struct bkey uk = bkey_unpack_key(l->b, k);
222                 bch2_bkey_to_text(&buf3, &uk);
223         } else {
224                 prt_printf(&buf3, "(none)");
225         }
226
227         panic("path should be %s key at level %u:\n"
228               "path pos %s\n"
229               "prev key %s\n"
230               "cur  key %s\n",
231               msg, level, buf1.buf, buf2.buf, buf3.buf);
232 }
233
234 static void bch2_btree_path_verify(struct btree_trans *trans,
235                                    struct btree_path *path)
236 {
237         struct bch_fs *c = trans->c;
238         unsigned i;
239
240         EBUG_ON(path->btree_id >= BTREE_ID_NR);
241
242         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
243                 if (!path->l[i].b) {
244                         BUG_ON(!path->cached &&
245                                c->btree_roots[path->btree_id].b->c.level > i);
246                         break;
247                 }
248
249                 bch2_btree_path_verify_level(trans, path, i);
250         }
251
252         bch2_btree_path_verify_locks(path);
253 }
254
255 void bch2_trans_verify_paths(struct btree_trans *trans)
256 {
257         struct btree_path *path;
258
259         trans_for_each_path(trans, path)
260                 bch2_btree_path_verify(trans, path);
261 }
262
263 static void bch2_btree_iter_verify(struct btree_iter *iter)
264 {
265         struct btree_trans *trans = iter->trans;
266
267         BUG_ON(iter->btree_id >= BTREE_ID_NR);
268
269         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
270
271         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
272                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
273
274         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
275                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
276                !btree_type_has_snapshots(iter->btree_id));
277
278         if (iter->update_path)
279                 bch2_btree_path_verify(trans, iter->update_path);
280         bch2_btree_path_verify(trans, iter->path);
281 }
282
283 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
284 {
285         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
286                !iter->pos.snapshot);
287
288         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
289                iter->pos.snapshot != iter->snapshot);
290
291         BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
292                bkey_cmp(iter->pos, iter->k.p) > 0);
293 }
294
295 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
296 {
297         struct btree_trans *trans = iter->trans;
298         struct btree_iter copy;
299         struct bkey_s_c prev;
300         int ret = 0;
301
302         if (!bch2_debug_check_iterators)
303                 return 0;
304
305         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
306                 return 0;
307
308         if (bkey_err(k) || !k.k)
309                 return 0;
310
311         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
312                                           iter->snapshot,
313                                           k.k->p.snapshot));
314
315         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
316                              BTREE_ITER_NOPRESERVE|
317                              BTREE_ITER_ALL_SNAPSHOTS);
318         prev = bch2_btree_iter_prev(&copy);
319         if (!prev.k)
320                 goto out;
321
322         ret = bkey_err(prev);
323         if (ret)
324                 goto out;
325
326         if (!bkey_cmp(prev.k->p, k.k->p) &&
327             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
328                                       prev.k->p.snapshot) > 0) {
329                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
330
331                 bch2_bkey_to_text(&buf1, k.k);
332                 bch2_bkey_to_text(&buf2, prev.k);
333
334                 panic("iter snap %u\n"
335                       "k    %s\n"
336                       "prev %s\n",
337                       iter->snapshot,
338                       buf1.buf, buf2.buf);
339         }
340 out:
341         bch2_trans_iter_exit(trans, &copy);
342         return ret;
343 }
344
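/*
 * Assert that some path in the transaction holds a lock covering @pos in
 * btree @id (in the key cache, if @key_cache): if none does, dump the
 * transaction's paths and updates and panic.
 */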
345 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
346                             struct bpos pos, bool key_cache)
347 {
348         struct btree_path *path;
349         unsigned idx;
350         struct printbuf buf = PRINTBUF;
351
352         trans_for_each_path_inorder(trans, path, idx) {
353                 int cmp = cmp_int(path->btree_id, id) ?:
354                         cmp_int(path->cached, key_cache);
355
356                 if (cmp > 0)
357                         break;
358                 if (cmp < 0)
359                         continue;
360
361                 if (!btree_node_locked(path, 0) ||
362                     !path->should_be_locked)
363                         continue;
364
365                 if (!key_cache) {
366                         if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
367                             bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
368                                 return;
369                 } else {
370                         if (!bkey_cmp(pos, path->pos))
371                                 return;
372                 }
373         }
374
375         bch2_dump_trans_paths_updates(trans);
376         bch2_bpos_to_text(&buf, pos);
377
378         panic("not locked: %s %s%s\n",
379               bch2_btree_ids[id], buf.buf,
380               key_cache ? " cached" : "");
381 }
382
383 #else
384
385 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
386                                                 struct btree_path *path, unsigned l) {}
387 static inline void bch2_btree_path_verify(struct btree_trans *trans,
388                                           struct btree_path *path) {}
389 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
390 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
391 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
392
393 #endif
394
395 /* Btree path: fixups after btree updates */
396
397 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
398                                         struct btree *b,
399                                         struct bset_tree *t,
400                                         struct bkey_packed *k)
401 {
402         struct btree_node_iter_set *set;
403
404         btree_node_iter_for_each(iter, set)
405                 if (set->end == t->end_offset) {
406                         set->k = __btree_node_key_to_offset(b, k);
407                         bch2_btree_node_iter_sort(iter, b);
408                         return;
409                 }
410
411         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
412 }
413
414 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
415                                                struct btree *b,
416                                                struct bkey_packed *where)
417 {
418         struct btree_path_level *l = &path->l[b->c.level];
419
420         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
421                 return;
422
423         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
424                 bch2_btree_node_iter_advance(&l->iter, l->b);
425 }
426
427 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
428                                       struct btree *b,
429                                       struct bkey_packed *where)
430 {
431         struct btree_path *path;
432
433         trans_for_each_path_with_node(trans, b, path) {
434                 __bch2_btree_path_fix_key_modified(path, b, where);
435                 bch2_btree_path_verify_level(trans, path, b->c.level);
436         }
437 }
438
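/*
 * A key in @b was changed - clobber_u64s u64s at @where were replaced by
 * new_u64s u64s - fix up @node_iter so that it still points at the correct
 * position within the modified bset.
 */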
439 static void __bch2_btree_node_iter_fix(struct btree_path *path,
440                                        struct btree *b,
441                                        struct btree_node_iter *node_iter,
442                                        struct bset_tree *t,
443                                        struct bkey_packed *where,
444                                        unsigned clobber_u64s,
445                                        unsigned new_u64s)
446 {
447         const struct bkey_packed *end = btree_bkey_last(b, t);
448         struct btree_node_iter_set *set;
449         unsigned offset = __btree_node_key_to_offset(b, where);
450         int shift = new_u64s - clobber_u64s;
451         unsigned old_end = t->end_offset - shift;
452         unsigned orig_iter_pos = node_iter->data[0].k;
453         bool iter_current_key_modified =
454                 orig_iter_pos >= offset &&
455                 orig_iter_pos <= offset + clobber_u64s;
456
457         btree_node_iter_for_each(node_iter, set)
458                 if (set->end == old_end)
459                         goto found;
460
461         /* didn't find the bset in the iterator - might have to re-add it: */
462         if (new_u64s &&
463             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
464                 bch2_btree_node_iter_push(node_iter, b, where, end);
465                 goto fixup_done;
466         } else {
467                 /* Iterator is after key that changed */
468                 return;
469         }
470 found:
471         set->end = t->end_offset;
472
473         /* Iterator hasn't gotten to the key that changed yet: */
474         if (set->k < offset)
475                 return;
476
477         if (new_u64s &&
478             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
479                 set->k = offset;
480         } else if (set->k < offset + clobber_u64s) {
481                 set->k = offset + new_u64s;
482                 if (set->k == set->end)
483                         bch2_btree_node_iter_set_drop(node_iter, set);
484         } else {
485                 /* Iterator is after key that changed */
486                 set->k = (int) set->k + shift;
487                 return;
488         }
489
490         bch2_btree_node_iter_sort(node_iter, b);
491 fixup_done:
492         if (node_iter->data[0].k != orig_iter_pos)
493                 iter_current_key_modified = true;
494
495         /*
496          * When a new key is added, and the node iterator now points to that
497          * key, the iterator might have skipped past deleted keys that should
498          * come after the key the iterator now points to. We have to rewind to
499          * before those deleted keys - otherwise
500          * bch2_btree_node_iter_prev_all() breaks:
501          */
502         if (!bch2_btree_node_iter_end(node_iter) &&
503             iter_current_key_modified &&
504             b->c.level) {
505                 struct bset_tree *t;
506                 struct bkey_packed *k, *k2, *p;
507
508                 k = bch2_btree_node_iter_peek_all(node_iter, b);
509
510                 for_each_bset(b, t) {
511                         bool set_pos = false;
512
513                         if (node_iter->data[0].end == t->end_offset)
514                                 continue;
515
516                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
517
518                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
519                                bkey_iter_cmp(b, k, p) < 0) {
520                                 k2 = p;
521                                 set_pos = true;
522                         }
523
524                         if (set_pos)
525                                 btree_node_iter_set_set_pos(node_iter,
526                                                             b, t, k2);
527                 }
528         }
529 }
530
531 void bch2_btree_node_iter_fix(struct btree_trans *trans,
532                               struct btree_path *path,
533                               struct btree *b,
534                               struct btree_node_iter *node_iter,
535                               struct bkey_packed *where,
536                               unsigned clobber_u64s,
537                               unsigned new_u64s)
538 {
539         struct bset_tree *t = bch2_bkey_to_bset(b, where);
540         struct btree_path *linked;
541
542         if (node_iter != &path->l[b->c.level].iter) {
543                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
544                                            where, clobber_u64s, new_u64s);
545
546                 if (bch2_debug_check_iterators)
547                         bch2_btree_node_iter_verify(node_iter, b);
548         }
549
550         trans_for_each_path_with_node(trans, b, linked) {
551                 __bch2_btree_node_iter_fix(linked, b,
552                                            &linked->l[b->c.level].iter, t,
553                                            where, clobber_u64s, new_u64s);
554                 bch2_btree_path_verify_level(trans, linked, b->c.level);
555         }
556 }
557
558 /* Btree path level: pointer to a particular btree node and node iter */
559
560 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
561                                                   struct btree_path_level *l,
562                                                   struct bkey *u,
563                                                   struct bkey_packed *k)
564 {
565         if (unlikely(!k)) {
566                 /*
567                  * signal to bch2_btree_iter_peek_slot() that we're currently at
568                  * a hole
569                  */
570                 u->type = KEY_TYPE_deleted;
571                 return bkey_s_c_null;
572         }
573
574         return bkey_disassemble(l->b, k, u);
575 }
576
577 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
578                                                         struct btree_path_level *l,
579                                                         struct bkey *u)
580 {
581         return __btree_iter_unpack(c, l, u,
582                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
583 }
584
585 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
586                                                     struct btree_path *path,
587                                                     struct btree_path_level *l,
588                                                     struct bkey *u)
589 {
590         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
591                         bch2_btree_node_iter_peek(&l->iter, l->b));
592
593         path->pos = k.k ? k.k->p : l->b->key.k.p;
594         bch2_btree_path_verify_level(trans, path, l - path->l);
595         return k;
596 }
597
598 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
599                                                     struct btree_path *path,
600                                                     struct btree_path_level *l,
601                                                     struct bkey *u)
602 {
603         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
604                         bch2_btree_node_iter_prev(&l->iter, l->b));
605
606         path->pos = k.k ? k.k->p : l->b->data->min_key;
607         bch2_btree_path_verify_level(trans, path, l - path->l);
608         return k;
609 }
610
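/*
 * Advance the node iterator until it points at the first key >= path->pos,
 * taking at most max_advance steps (if max_advance > 0): returns false if the
 * limit was hit, in which case the caller reinitializes the iterator instead.
 */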
611 static inline bool btree_path_advance_to_pos(struct btree_path *path,
612                                              struct btree_path_level *l,
613                                              int max_advance)
614 {
615         struct bkey_packed *k;
616         int nr_advanced = 0;
617
618         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
619                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
620                 if (max_advance > 0 && nr_advanced >= max_advance)
621                         return false;
622
623                 bch2_btree_node_iter_advance(&l->iter, l->b);
624                 nr_advanced++;
625         }
626
627         return true;
628 }
629
630 static inline void __btree_path_level_init(struct btree_path *path,
631                                            unsigned level)
632 {
633         struct btree_path_level *l = &path->l[level];
634
635         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
636
637         /*
638          * Iterators to interior nodes should always be pointed at the first
639          * non-whiteout:
640          */
641         if (level)
642                 bch2_btree_node_iter_peek(&l->iter, l->b);
643 }
644
645 inline void bch2_btree_path_level_init(struct btree_trans *trans,
646                                        struct btree_path *path,
647                                        struct btree *b)
648 {
649         BUG_ON(path->cached);
650
651         EBUG_ON(!btree_path_pos_in_node(path, b));
652         EBUG_ON(b->c.lock.state.seq & 1);
653
654         path->l[b->c.level].lock_seq = b->c.lock.state.seq;
655         path->l[b->c.level].b = b;
656         __btree_path_level_init(path, b->c.level);
657 }
658
659 /* Btree path: fixups after btree node updates: */
660
661 /*
662  * A btree node is being replaced - update the iterator to point to the new
663  * node:
664  */
665 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
666 {
667         struct btree_path *path;
668
669         trans_for_each_path(trans, path)
670                 if (path->uptodate == BTREE_ITER_UPTODATE &&
671                     !path->cached &&
672                     btree_path_pos_in_node(path, b)) {
673                         enum btree_node_locked_type t =
674                                 btree_lock_want(path, b->c.level);
675
676                         if (t != BTREE_NODE_UNLOCKED) {
677                                 btree_node_unlock(trans, path, b->c.level);
678                                 six_lock_increment(&b->c.lock, t);
679                                 mark_btree_node_locked(trans, path, b->c.level, t);
680                         }
681
682                         bch2_btree_path_level_init(trans, path, b);
683                 }
684 }
685
686 /*
687  * A btree node has been modified in such a way as to invalidate iterators - fix
688  * them:
689  */
690 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
691 {
692         struct btree_path *path;
693
694         trans_for_each_path_with_node(trans, b, path)
695                 __btree_path_level_init(path, b->c.level);
696 }
697
698 /* Btree path: traverse, set_pos: */
699
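/*
 * Lock the root of the btree this path is for: since the root can be freed
 * and replaced while we hold no locks, read the root pointer, lock it, then
 * recheck that it's still the current root before using it. Returns 1 if the
 * btree is shallower than depth_want, i.e. there are no nodes at the level
 * we're looking for.
 */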
700 static inline int btree_path_lock_root(struct btree_trans *trans,
701                                        struct btree_path *path,
702                                        unsigned depth_want,
703                                        unsigned long trace_ip)
704 {
705         struct bch_fs *c = trans->c;
706         struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
707         enum six_lock_type lock_type;
708         unsigned i;
709         int ret;
710
711         EBUG_ON(path->nodes_locked);
712
713         while (1) {
714                 b = READ_ONCE(*rootp);
715                 path->level = READ_ONCE(b->c.level);
716
717                 if (unlikely(path->level < depth_want)) {
718                         /*
719                          * the root is at a lower depth than the depth we want:
720                          * got to the end of the btree, or we're walking nodes
721                          * greater than some depth and there are no nodes >=
722                          * that depth
723                          */
724                         path->level = depth_want;
725                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
726                                 path->l[i].b = NULL;
727                         return 1;
728                 }
729
730                 lock_type = __btree_lock_want(path, path->level);
731                 ret = btree_node_lock(trans, path, &b->c,
732                                       path->level, lock_type, trace_ip);
733                 if (unlikely(ret)) {
734                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
735                                 continue;
736                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
737                                 return ret;
738                         BUG();
739                 }
740
741                 if (likely(b == READ_ONCE(*rootp) &&
742                            b->c.level == path->level &&
743                            !race_fault())) {
744                         for (i = 0; i < path->level; i++)
745                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
746                         path->l[path->level].b = b;
747                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
748                                 path->l[i].b = NULL;
749
750                         mark_btree_node_locked(trans, path, path->level, lock_type);
751                         bch2_btree_path_level_init(trans, path, b);
752                         return 0;
753                 }
754
755                 six_unlock_type(&b->c.lock, lock_type);
756         }
757 }
758
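/*
 * Prefetch the child nodes pointed to by the next few keys in this node, so
 * that the reads are already in flight by the time btree_path_down() needs
 * them; how many are prefetched depends on the level and on whether the
 * filesystem has finished starting up.
 */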
759 noinline
760 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
761 {
762         struct bch_fs *c = trans->c;
763         struct btree_path_level *l = path_l(path);
764         struct btree_node_iter node_iter = l->iter;
765         struct bkey_packed *k;
766         struct bkey_buf tmp;
767         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
768                 ? (path->level > 1 ? 0 :  2)
769                 : (path->level > 1 ? 1 : 16);
770         bool was_locked = btree_node_locked(path, path->level);
771         int ret = 0;
772
773         bch2_bkey_buf_init(&tmp);
774
775         while (nr-- && !ret) {
776                 if (!bch2_btree_node_relock(trans, path, path->level))
777                         break;
778
779                 bch2_btree_node_iter_advance(&node_iter, l->b);
780                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
781                 if (!k)
782                         break;
783
784                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
785                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
786                                                path->level - 1);
787         }
788
789         if (!was_locked)
790                 btree_node_unlock(trans, path, path->level);
791
792         bch2_bkey_buf_exit(&tmp, c);
793         return ret;
794 }
795
796 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
797                                  struct btree_and_journal_iter *jiter)
798 {
799         struct bch_fs *c = trans->c;
800         struct bkey_s_c k;
801         struct bkey_buf tmp;
802         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
803                 ? (path->level > 1 ? 0 :  2)
804                 : (path->level > 1 ? 1 : 16);
805         bool was_locked = btree_node_locked(path, path->level);
806         int ret = 0;
807
808         bch2_bkey_buf_init(&tmp);
809
810         while (nr-- && !ret) {
811                 if (!bch2_btree_node_relock(trans, path, path->level))
812                         break;
813
814                 bch2_btree_and_journal_iter_advance(jiter);
815                 k = bch2_btree_and_journal_iter_peek(jiter);
816                 if (!k.k)
817                         break;
818
819                 bch2_bkey_buf_reassemble(&tmp, c, k);
820                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
821                                                path->level - 1);
822         }
823
824         if (!was_locked)
825                 btree_node_unlock(trans, path, path->level);
826
827         bch2_bkey_buf_exit(&tmp, c);
828         return ret;
829 }
830
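/*
 * Stash the in-memory address of a child node in the btree_ptr_v2 key that
 * points to it, so that later traversals can find the cached node without a
 * full lookup.
 */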
831 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
832                                             struct btree_path *path,
833                                             unsigned plevel, struct btree *b)
834 {
835         struct btree_path_level *l = &path->l[plevel];
836         bool locked = btree_node_locked(path, plevel);
837         struct bkey_packed *k;
838         struct bch_btree_ptr_v2 *bp;
839
840         if (!bch2_btree_node_relock(trans, path, plevel))
841                 return;
842
843         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
844         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
845
846         bp = (void *) bkeyp_val(&l->b->format, k);
847         bp->mem_ptr = (unsigned long)b;
848
849         if (!locked)
850                 btree_node_unlock(trans, path, plevel);
851 }
852
853 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
854                                                      struct btree_path *path,
855                                                      unsigned flags,
856                                                      struct bkey_buf *out)
857 {
858         struct bch_fs *c = trans->c;
859         struct btree_path_level *l = path_l(path);
860         struct btree_and_journal_iter jiter;
861         struct bkey_s_c k;
862         int ret = 0;
863
864         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
865
866         k = bch2_btree_and_journal_iter_peek(&jiter);
867
868         bch2_bkey_buf_reassemble(out, c, k);
869
870         if (flags & BTREE_ITER_PREFETCH)
871                 ret = btree_path_prefetch_j(trans, path, &jiter);
872
873         bch2_btree_and_journal_iter_exit(&jiter);
874         return ret;
875 }
876
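/*
 * Descend one level: peek the key for the child node at the current position,
 * get and lock that node, and make it the new path->level. Until journal
 * replay has finished, keys still in the journal are overlaid via a
 * btree_and_journal_iter.
 */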
877 static __always_inline int btree_path_down(struct btree_trans *trans,
878                                            struct btree_path *path,
879                                            unsigned flags,
880                                            unsigned long trace_ip)
881 {
882         struct bch_fs *c = trans->c;
883         struct btree_path_level *l = path_l(path);
884         struct btree *b;
885         unsigned level = path->level - 1;
886         enum six_lock_type lock_type = __btree_lock_want(path, level);
887         bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
888         struct bkey_buf tmp;
889         int ret;
890
891         EBUG_ON(!btree_node_locked(path, path->level));
892
893         bch2_bkey_buf_init(&tmp);
894
895         if (unlikely(!replay_done)) {
896                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
897                 if (ret)
898                         goto err;
899         } else {
900                 bch2_bkey_buf_unpack(&tmp, c, l->b,
901                                  bch2_btree_node_iter_peek(&l->iter, l->b));
902
903                 if (flags & BTREE_ITER_PREFETCH) {
904                         ret = btree_path_prefetch(trans, path);
905                         if (ret)
906                                 goto err;
907                 }
908         }
909
910         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
911         ret = PTR_ERR_OR_ZERO(b);
912         if (unlikely(ret))
913                 goto err;
914
915         if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
916             unlikely(b != btree_node_mem_ptr(tmp.k)))
917                 btree_node_mem_ptr_set(trans, path, level + 1, b);
918
919         if (btree_node_read_locked(path, level + 1))
920                 btree_node_unlock(trans, path, level + 1);
921
922         mark_btree_node_locked(trans, path, level, lock_type);
923         path->level = level;
924         bch2_btree_path_level_init(trans, path, b);
925
926         bch2_btree_path_verify_locks(path);
927 err:
928         bch2_bkey_buf_exit(&tmp, c);
929         return ret;
930 }
931
932 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
933                                    unsigned, unsigned long);
934
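/*
 * Traverse every path in the transaction: all locks are dropped first, then
 * the traversals are redone in sorted order so that locks are acquired in a
 * consistent order.
 */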
935 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
936 {
937         struct bch_fs *c = trans->c;
938         struct btree_path *path;
939         unsigned long trace_ip = _RET_IP_;
940         int ret = 0;
941
942         if (trans->in_traverse_all)
943                 return -BCH_ERR_transaction_restart_in_traverse_all;
944
945         trans->in_traverse_all = true;
946 retry_all:
947         trans->restarted = 0;
948         trans->traverse_all_idx = U8_MAX;
949
950         trans_for_each_path(trans, path)
951                 path->should_be_locked = false;
952
953         btree_trans_verify_sorted(trans);
954
955         bch2_trans_unlock(trans);
956         cond_resched();
957
958         if (unlikely(trans->memory_allocation_failure)) {
959                 struct closure cl;
960
961                 closure_init_stack(&cl);
962
963                 do {
964                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
965                         closure_sync(&cl);
966                 } while (ret);
967         }
968
969         /* Now, redo traversals in correct order: */
970         trans->traverse_all_idx = 0;
971         while (trans->traverse_all_idx < trans->nr_sorted) {
972                 path = trans->paths + trans->sorted[trans->traverse_all_idx];
973
974                 /*
975                  * Traversing a path can cause another path to be added at about
976                  * the same position:
977                  */
978                 if (path->uptodate) {
979                         ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
980                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
981                             ret == -ENOMEM)
982                                 goto retry_all;
983                         if (ret)
984                                 goto err;
985                         BUG_ON(path->uptodate);
986                 } else {
987                         trans->traverse_all_idx++;
988                 }
989         }
990
991         /*
992          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
993          * and relock(), relock() won't relock since path->should_be_locked
994          * isn't set yet, which is all fine
995          */
996         trans_for_each_path(trans, path)
997                 BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
998 err:
999         bch2_btree_cache_cannibalize_unlock(c);
1000
1001         trans->in_traverse_all = false;
1002
1003         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1004         return ret;
1005 }
1006
1007 static inline bool btree_path_good_node(struct btree_trans *trans,
1008                                         struct btree_path *path,
1009                                         unsigned l, int check_pos)
1010 {
1011         if (!is_btree_node(path, l) ||
1012             !bch2_btree_node_relock(trans, path, l))
1013                 return false;
1014
1015         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1016                 return false;
1017         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1018                 return false;
1019         return true;
1020 }
1021
1022 static void btree_path_set_level_down(struct btree_trans *trans,
1023                                       struct btree_path *path,
1024                                       unsigned new_level)
1025 {
1026         unsigned l;
1027
1028         path->level = new_level;
1029
1030         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1031                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1032                         btree_node_unlock(trans, path, l);
1033
1034         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1035         bch2_btree_path_verify(trans, path);
1036 }
1037
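/*
 * Walk up the path until we're at a node that is locked (or can be relocked)
 * and still contains path->pos, relocking any additional levels required by
 * locks_want; returns the level we ended up at.
 */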
1038 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1039                                                      struct btree_path *path,
1040                                                      int check_pos)
1041 {
1042         unsigned i, l = path->level;
1043 again:
1044         while (btree_path_node(path, l) &&
1045                !btree_path_good_node(trans, path, l, check_pos))
1046                 __btree_path_set_level_up(trans, path, l++);
1047
1048         /* If we need intent locks, take them too: */
1049         for (i = l + 1;
1050              i < path->locks_want && btree_path_node(path, i);
1051              i++)
1052                 if (!bch2_btree_node_relock(trans, path, i)) {
1053                         while (l <= i)
1054                                 __btree_path_set_level_up(trans, path, l++);
1055                         goto again;
1056                 }
1057
1058         return l;
1059 }
1060
1061 /*
1062  * This is the main state machine for walking down the btree - walks down to a
1063  * specified depth
1064  *
1065  * Returns 0 on success, -EIO on error (error reading in a btree node).
1066  *
1067  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1068  * stashed in the iterator and returned from bch2_trans_exit().
1069  */
1070 static int btree_path_traverse_one(struct btree_trans *trans,
1071                                    struct btree_path *path,
1072                                    unsigned flags,
1073                                    unsigned long trace_ip)
1074 {
1075         unsigned depth_want = path->level;
1076         int ret = trans->restarted;
1077
1078         if (unlikely(ret))
1079                 goto out;
1080
1081         /*
1082          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1083          * and re-traverse the path without a transaction restart:
1084          */
1085         if (path->should_be_locked) {
1086                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1087                 goto out;
1088         }
1089
1090         if (path->cached) {
1091                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1092                 goto out;
1093         }
1094
1095         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1096                 goto out;
1097
1098         path->level = btree_path_up_until_good_node(trans, path, 0);
1099
1100         EBUG_ON(btree_path_node(path, path->level) &&
1101                 !btree_node_locked(path, path->level));
1102
1103         /*
1104          * Note: path->l[path->level].b may be temporarily NULL here - normally
1105          * that would indicate to other code that we got to the end of the btree;
1106          * here it indicates that relocking the root failed - it's critical that
1107          * btree_path_lock_root() comes next and that it can't fail
1108          */
1109         while (path->level > depth_want) {
1110                 ret = btree_path_node(path, path->level)
1111                         ? btree_path_down(trans, path, flags, trace_ip)
1112                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1113                 if (unlikely(ret)) {
1114                         if (ret == 1) {
1115                                 /*
1116                                  * No nodes at this level - got to the end of
1117                                  * the btree:
1118                                  */
1119                                 ret = 0;
1120                                 goto out;
1121                         }
1122
1123                         __bch2_btree_path_unlock(trans, path);
1124                         path->level = depth_want;
1125                         path->l[path->level].b = ERR_PTR(ret);
1126                         goto out;
1127                 }
1128         }
1129
1130         path->uptodate = BTREE_ITER_UPTODATE;
1131 out:
1132         BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
1133         bch2_btree_path_verify(trans, path);
1134         return ret;
1135 }
1136
1137 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
1138                                           struct btree_path *path, unsigned flags)
1139 {
1140         if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1141                 unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
1142                 u64 mask = ~(~0ULL << restart_probability_bits);
1143
1144                 if ((prandom_u32() & mask) == mask) {
1145                         trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
1146                         return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
1147                 }
1148         }
1149
1150         if (path->uptodate < BTREE_ITER_NEED_RELOCK)
1151                 return 0;
1152
1153         return  bch2_trans_cond_resched(trans) ?:
1154                 btree_path_traverse_one(trans, path, flags, _RET_IP_);
1155 }
1156
1157 static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1158                             struct btree_path *src)
1159 {
1160         unsigned i, offset = offsetof(struct btree_path, pos);
1161
1162         memcpy((void *) dst + offset,
1163                (void *) src + offset,
1164                sizeof(struct btree_path) - offset);
1165
1166         for (i = 0; i < BTREE_MAX_DEPTH; i++)
1167                 if (btree_node_locked(dst, i))
1168                         six_lock_increment(&dst->l[i].b->c.lock,
1169                                            __btree_lock_want(dst, i));
1170
1171         bch2_btree_path_check_sort(trans, dst, 0);
1172 }
1173
1174 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1175                                            bool intent)
1176 {
1177         struct btree_path *new = btree_path_alloc(trans, src);
1178
1179         btree_path_copy(trans, new, src);
1180         __btree_path_get(new, intent);
1181         return new;
1182 }
1183
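/*
 * Get a version of @path that is safe to modify: if other code holds a
 * reference to it, or it's flagged to be preserved for re-use, clone it first
 * so the other users' copy is left untouched.
 */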
1184 inline struct btree_path * __must_check
1185 bch2_btree_path_make_mut(struct btree_trans *trans,
1186                          struct btree_path *path, bool intent,
1187                          unsigned long ip)
1188 {
1189         if (path->ref > 1 || path->preserve) {
1190                 __btree_path_put(path, intent);
1191                 path = btree_path_clone(trans, path, intent);
1192                 path->preserve = false;
1193 #ifdef CONFIG_BCACHEFS_DEBUG
1194                 path->ip_allocated = ip;
1195 #endif
1196                 btree_trans_verify_sorted(trans);
1197         }
1198
1199         path->should_be_locked = false;
1200         return path;
1201 }
1202
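/*
 * Move a path to a new position: walk up until we're at a node that contains
 * the new position, then either advance the node iterator or reinitialize it;
 * if we had to walk up above path->level, the path is marked as needing to be
 * traversed again.
 */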
1203 struct btree_path * __must_check
1204 bch2_btree_path_set_pos(struct btree_trans *trans,
1205                    struct btree_path *path, struct bpos new_pos,
1206                    bool intent, unsigned long ip)
1207 {
1208         int cmp = bpos_cmp(new_pos, path->pos);
1209         unsigned l = path->level;
1210
1211         EBUG_ON(trans->restarted);
1212         EBUG_ON(!path->ref);
1213
1214         if (!cmp)
1215                 return path;
1216
1217         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1218
1219         path->pos = new_pos;
1220
1221         bch2_btree_path_check_sort(trans, path, cmp);
1222
1223         if (unlikely(path->cached)) {
1224                 btree_node_unlock(trans, path, 0);
1225                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1226                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1227                 goto out;
1228         }
1229
1230         l = btree_path_up_until_good_node(trans, path, cmp);
1231
1232         if (btree_path_node(path, l)) {
1233                 BUG_ON(!btree_node_locked(path, l));
1234                 /*
1235                  * We might have to skip over many keys, or just a few: try
1236                  * advancing the node iterator, and if we have to skip over too
1237                  * many keys just reinit it (or if we're rewinding, since that
1238                  * is expensive).
1239                  */
1240                 if (cmp < 0 ||
1241                     !btree_path_advance_to_pos(path, &path->l[l], 8))
1242                         __btree_path_level_init(path, l);
1243         }
1244
1245         if (l != path->level) {
1246                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1247                 __bch2_btree_path_unlock(trans, path);
1248         }
1249 out:
1250         bch2_btree_path_verify(trans, path);
1251         return path;
1252 }
1253
1254 /* Btree path: main interface: */
1255
1256 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1257 {
1258         struct btree_path *sib;
1259
1260         sib = prev_btree_path(trans, path);
1261         if (sib && !btree_path_cmp(sib, path))
1262                 return sib;
1263
1264         sib = next_btree_path(trans, path);
1265         if (sib && !btree_path_cmp(sib, path))
1266                 return sib;
1267
1268         return NULL;
1269 }
1270
1271 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1272 {
1273         struct btree_path *sib;
1274
1275         sib = prev_btree_path(trans, path);
1276         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1277                 return sib;
1278
1279         sib = next_btree_path(trans, path);
1280         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1281                 return sib;
1282
1283         return NULL;
1284 }
1285
1286 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1287 {
1288         __bch2_btree_path_unlock(trans, path);
1289         btree_path_list_remove(trans, path);
1290         trans->paths_allocated &= ~(1ULL << path->idx);
1291 }
1292
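/*
 * Drop a reference to a path; when the last reference is dropped, the path
 * may be freed, with its preserve/should_be_locked state handed off to a
 * duplicate path at the same position if one exists.
 */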
1293 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1294 {
1295         struct btree_path *dup;
1296
1297         EBUG_ON(trans->paths + path->idx != path);
1298         EBUG_ON(!path->ref);
1299
1300         if (!__btree_path_put(path, intent))
1301                 return;
1302
1303         dup = path->preserve
1304                 ? have_path_at_pos(trans, path)
1305                 : have_node_at_pos(trans, path);
1306
1307         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1308                 return;
1309
1310         if (path->should_be_locked &&
1311             !trans->restarted &&
1312             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1313                 return;
1314
1315         if (dup) {
1316                 dup->preserve           |= path->preserve;
1317                 dup->should_be_locked   |= path->should_be_locked;
1318         }
1319
1320         __bch2_path_free(trans, path);
1321 }
1322
1323 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1324                                  bool intent)
1325 {
1326         EBUG_ON(trans->paths + path->idx != path);
1327         EBUG_ON(!path->ref);
1328
1329         if (!__btree_path_put(path, intent))
1330                 return;
1331
1332         __bch2_path_free(trans, path);
1333 }
1334
1335 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1336 {
1337         struct btree_insert_entry *i;
1338
1339         prt_printf(buf, "transaction updates for %s journal seq %llu",
1340                trans->fn, trans->journal_res.seq);
1341         prt_newline(buf);
1342         printbuf_indent_add(buf, 2);
1343
1344         trans_for_each_update(trans, i) {
1345                 struct bkey_s_c old = { &i->old_k, i->old_v };
1346
1347                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1348                        bch2_btree_ids[i->btree_id],
1349                        i->cached,
1350                        (void *) i->ip_allocated);
1351                 prt_newline(buf);
1352
1353                 prt_printf(buf, "  old ");
1354                 bch2_bkey_val_to_text(buf, trans->c, old);
1355                 prt_newline(buf);
1356
1357                 prt_printf(buf, "  new ");
1358                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1359                 prt_newline(buf);
1360         }
1361
1362         printbuf_indent_sub(buf, 2);
1363 }
1364
1365 noinline __cold
1366 void bch2_dump_trans_updates(struct btree_trans *trans)
1367 {
1368         struct printbuf buf = PRINTBUF;
1369
1370         bch2_trans_updates_to_text(&buf, trans);
1371         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1372         printbuf_exit(&buf);
1373 }
1374
1375 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1376 {
1377         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1378                    path->idx, path->ref, path->intent_ref,
1379                    path->preserve ? 'P' : ' ',
1380                    path->should_be_locked ? 'S' : ' ',
1381                    bch2_btree_ids[path->btree_id],
1382                    path->level);
1383         bch2_bpos_to_text(out, path->pos);
1384
1385         prt_printf(out, " locks %u", path->nodes_locked);
1386 #ifdef CONFIG_BCACHEFS_DEBUG
1387         prt_printf(out, " %pS", (void *) path->ip_allocated);
1388 #endif
1389         prt_newline(out);
1390 }
1391
1392 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1393 {
1394         struct btree_path *path;
1395         unsigned idx;
1396
1397         trans_for_each_path_inorder(trans, path, idx)
1398                 bch2_btree_path_to_text(out, path);
1399 }
1400
1401 noinline __cold
1402 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1403 {
1404         struct printbuf buf = PRINTBUF;
1405
1406         bch2_trans_paths_to_text(&buf, trans);
1407         bch2_trans_updates_to_text(&buf, trans);
1408
1409         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1410         printbuf_exit(&buf);
1411 }
1412
1413 noinline
1414 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1415 {
1416         struct btree_transaction_stats *s = btree_trans_stats(trans);
1417         struct printbuf buf = PRINTBUF;
1418
1419         bch2_trans_paths_to_text(&buf, trans);
1420
1421         if (!buf.allocation_failure) {
1422                 mutex_lock(&s->lock);
1423                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1424                         s->nr_max_paths = trans->nr_max_paths =
1425                                 hweight64(trans->paths_allocated);
1426                         swap(s->max_paths_text, buf.buf);
1427                 }
1428                 mutex_unlock(&s->lock);
1429         }
1430
1431         printbuf_exit(&buf);
1432 }
1433
1434 static noinline void btree_path_overflow(struct btree_trans *trans)
1435 {
1436         bch2_dump_trans_paths_updates(trans);
1437         panic("trans path overflow\n");
1438 }
1439
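/*
 * Allocate a new path from the transaction's fixed-size array, using the
 * paths_allocated bitmap to find a free slot, and insert it into the sorted
 * path list after @pos.
 */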
1440 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1441                                                   struct btree_path *pos)
1442 {
1443         struct btree_path *path;
1444         unsigned idx;
1445
1446         if (unlikely(trans->paths_allocated ==
1447                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1448                 btree_path_overflow(trans);
1449
1450         idx = __ffs64(~trans->paths_allocated);
1451         trans->paths_allocated |= 1ULL << idx;
1452
1453         if (unlikely(idx > trans->nr_max_paths))
1454                 bch2_trans_update_max_paths(trans);
1455
1456         path = &trans->paths[idx];
1457
1458         path->idx               = idx;
1459         path->ref               = 0;
1460         path->intent_ref        = 0;
1461         path->nodes_locked      = 0;
1462
1463         btree_path_list_add(trans, pos, path);
1464         return path;
1465 }
1466
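/*
 * Get a path iterating over the given btree at the given position: an
 * existing path at the same position is re-used if possible, otherwise a new
 * one is allocated.
 */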
1467 struct btree_path *bch2_path_get(struct btree_trans *trans,
1468                                  enum btree_id btree_id, struct bpos pos,
1469                                  unsigned locks_want, unsigned level,
1470                                  unsigned flags, unsigned long ip)
1471 {
1472         struct btree_path *path, *path_pos = NULL;
1473         bool cached = flags & BTREE_ITER_CACHED;
1474         bool intent = flags & BTREE_ITER_INTENT;
1475         int i;
1476
1477         BUG_ON(trans->restarted);
1478         btree_trans_verify_sorted(trans);
1479         bch2_trans_verify_locks(trans);
1480
1481         trans_for_each_path_inorder(trans, path, i) {
1482                 if (__btree_path_cmp(path,
1483                                      btree_id,
1484                                      cached,
1485                                      pos,
1486                                      level) > 0)
1487                         break;
1488
1489                 path_pos = path;
1490         }
1491
1492         if (path_pos &&
1493             path_pos->cached    == cached &&
1494             path_pos->btree_id  == btree_id &&
1495             path_pos->level     == level) {
1496                 __btree_path_get(path_pos, intent);
1497                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1498         } else {
1499                 path = btree_path_alloc(trans, path_pos);
1500                 path_pos = NULL;
1501
1502                 __btree_path_get(path, intent);
1503                 path->pos                       = pos;
1504                 path->btree_id                  = btree_id;
1505                 path->cached                    = cached;
1506                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1507                 path->should_be_locked          = false;
1508                 path->level                     = level;
1509                 path->locks_want                = locks_want;
1510                 path->nodes_locked              = 0;
1511                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1512                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1513 #ifdef CONFIG_BCACHEFS_DEBUG
1514                 path->ip_allocated              = ip;
1515 #endif
1516                 btree_trans_verify_sorted(trans);
1517         }
1518
1519         if (!(flags & BTREE_ITER_NOPRESERVE))
1520                 path->preserve = true;
1521
1522         if (path->intent_ref)
1523                 locks_want = max(locks_want, level + 1);
1524
1525         /*
1526          * If the path has locks_want greater than requested, we don't downgrade
1527          * it here - on a transaction restart caused by a btree node split needing
1528          * to upgrade locks, we might be putting/getting the iterator again.
1529          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1530          * a successful transaction commit.
1531          */
1532
1533         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1534         if (locks_want > path->locks_want)
1535                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1536
1537         return path;
1538 }
1539
1540 inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1541 {
1542
1543         struct btree_path_level *l = path_l(path);
1544         struct bkey_packed *_k;
1545         struct bkey_s_c k;
1546
1547         if (unlikely(!l->b))
1548                 return bkey_s_c_null;
1549
1550         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1551         EBUG_ON(!btree_node_locked(path, path->level));
1552
1553         if (!path->cached) {
1554                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1555                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1556
1557                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
1558
1559                 if (!k.k || bpos_cmp(path->pos, k.k->p))
1560                         goto hole;
1561         } else {
1562                 struct bkey_cached *ck = (void *) path->l[0].b;
1563
1564                 EBUG_ON(ck &&
1565                         (path->btree_id != ck->key.btree_id ||
1566                          bkey_cmp(path->pos, ck->key.pos)));
1567                 EBUG_ON(!ck || !ck->valid);
1568
1569                 *u = ck->k->k;
1570                 k = bkey_i_to_s_c(ck->k);
1571         }
1572
1573         return k;
1574 hole:
1575         bkey_init(u);
1576         u->p = path->pos;
1577         return (struct bkey_s_c) { u, NULL };
1578 }
1579
1580 /* Btree iterators: */
1581
1582 int __must_check
1583 __bch2_btree_iter_traverse(struct btree_iter *iter)
1584 {
1585         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1586 }
1587
1588 int __must_check
1589 bch2_btree_iter_traverse(struct btree_iter *iter)
1590 {
1591         int ret;
1592
1593         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1594                                         btree_iter_search_key(iter),
1595                                         iter->flags & BTREE_ITER_INTENT,
1596                                         btree_iter_ip_allocated(iter));
1597
1598         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1599         if (ret)
1600                 return ret;
1601
1602         btree_path_set_should_be_locked(iter->path);
1603         return 0;
1604 }
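
/*
 * Example (sketch): bch2_btree_iter_traverse() is handy when the caller needs
 * the iterator's path traversed and locked at its current position without
 * peeking a key - e.g. before queueing an update at exactly that position.
 * The btree ID, position and @new_key below are illustrative placeholders,
 * and transaction restart handling is left to the caller's retry loop:
 *
 *	struct btree_iter iter;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
 *			     BTREE_ITER_INTENT);
 *	ret = bch2_btree_iter_traverse(&iter) ?:
 *	      bch2_trans_update(trans, &iter, new_key, 0);
 *	bch2_trans_iter_exit(trans, &iter);
 */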
1605
1606 /* Iterate across nodes (leaf and interior nodes) */
1607
1608 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1609 {
1610         struct btree_trans *trans = iter->trans;
1611         struct btree *b = NULL;
1612         int ret;
1613
1614         EBUG_ON(iter->path->cached);
1615         bch2_btree_iter_verify(iter);
1616
1617         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1618         if (ret)
1619                 goto err;
1620
1621         b = btree_path_node(iter->path, iter->path->level);
1622         if (!b)
1623                 goto out;
1624
1625         BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
1626
1627         bkey_init(&iter->k);
1628         iter->k.p = iter->pos = b->key.k.p;
1629
1630         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1631                                         iter->flags & BTREE_ITER_INTENT,
1632                                         btree_iter_ip_allocated(iter));
1633         btree_path_set_should_be_locked(iter->path);
1634 out:
1635         bch2_btree_iter_verify_entry_exit(iter);
1636         bch2_btree_iter_verify(iter);
1637
1638         return b;
1639 err:
1640         b = ERR_PTR(ret);
1641         goto out;
1642 }
1643
1644 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1645 {
1646         struct btree_trans *trans = iter->trans;
1647         struct btree_path *path = iter->path;
1648         struct btree *b = NULL;
1649         int ret;
1650
1651         BUG_ON(trans->restarted);
1652         EBUG_ON(iter->path->cached);
1653         bch2_btree_iter_verify(iter);
1654
1655         /* already at end? */
1656         if (!btree_path_node(path, path->level))
1657                 return NULL;
1658
1659         /* got to end? */
1660         if (!btree_path_node(path, path->level + 1)) {
1661                 btree_path_set_level_up(trans, path);
1662                 return NULL;
1663         }
1664
1665         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1666                 __bch2_btree_path_unlock(trans, path);
1667                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1668                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1669                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1670                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1671                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1672                 goto err;
1673         }
1674
1675         b = btree_path_node(path, path->level + 1);
1676
1677         if (!bpos_cmp(iter->pos, b->key.k.p)) {
1678                 __btree_path_set_level_up(trans, path, path->level++);
1679         } else {
1680                 /*
1681                  * Haven't gotten to the end of the parent node: go back down to
1682                  * the next child node
1683                  */
1684                 path = iter->path =
1685                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1686                                            iter->flags & BTREE_ITER_INTENT,
1687                                            btree_iter_ip_allocated(iter));
1688
1689                 btree_path_set_level_down(trans, path, iter->min_depth);
1690
1691                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1692                 if (ret)
1693                         goto err;
1694
1695                 b = path->l[path->level].b;
1696         }
1697
1698         bkey_init(&iter->k);
1699         iter->k.p = iter->pos = b->key.k.p;
1700
1701         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1702                                         iter->flags & BTREE_ITER_INTENT,
1703                                         btree_iter_ip_allocated(iter));
1704         btree_path_set_should_be_locked(iter->path);
1705         BUG_ON(iter->path->uptodate);
1706 out:
1707         bch2_btree_iter_verify_entry_exit(iter);
1708         bch2_btree_iter_verify(iter);
1709
1710         return b;
1711 err:
1712         b = ERR_PTR(ret);
1713         goto out;
1714 }
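
/*
 * Example (sketch): walking every node of a btree at a given level with a
 * node iterator. @btree_id and @level are whatever the caller wants to scan;
 * error handling is minimal and a transaction restart would have to be
 * retried by the caller:
 *
 *	struct btree_iter iter;
 *	struct btree *b;
 *	int ret = 0;
 *
 *	bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
 *				  0, level, BTREE_ITER_PREFETCH);
 *
 *	for (b = bch2_btree_iter_peek_node(&iter);
 *	     b && !(ret = PTR_ERR_OR_ZERO(b));
 *	     b = bch2_btree_iter_next_node(&iter)) {
 *		... inspect b ...
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */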
1715
1716 /* Iterate across keys (in leaf nodes only) */
1717
1718 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1719 {
1720         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1721                 struct bpos pos = iter->k.p;
1722                 bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1723                             ? bpos_cmp(pos, SPOS_MAX)
1724                             : bkey_cmp(pos, SPOS_MAX)) != 0;
1725
1726                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1727                         pos = bkey_successor(iter, pos);
1728                 bch2_btree_iter_set_pos(iter, pos);
1729                 return ret;
1730         } else {
1731                 if (!btree_path_node(iter->path, iter->path->level))
1732                         return true;
1733
1734                 iter->advanced = true;
1735                 return false;
1736         }
1737 }
1738
1739 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1740 {
1741         struct bpos pos = bkey_start_pos(&iter->k);
1742         bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1743                     ? bpos_cmp(pos, POS_MIN)
1744                     : bkey_cmp(pos, POS_MIN)) != 0;
1745
1746         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1747                 pos = bkey_predecessor(iter, pos);
1748         bch2_btree_iter_set_pos(iter, pos);
1749         return ret;
1750 }
1751
1752 static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1753                                                       enum btree_id btree_id,
1754                                                       struct bpos pos)
1755 {
1756         struct btree_insert_entry *i;
1757         struct bkey_i *ret = NULL;
1758
1759         trans_for_each_update(trans, i) {
1760                 if (i->btree_id < btree_id)
1761                         continue;
1762                 if (i->btree_id > btree_id)
1763                         break;
1764                 if (bpos_cmp(i->k->k.p, pos) < 0)
1765                         continue;
1766                 if (i->key_cache_already_flushed)
1767                         continue;
1768                 if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
1769                         ret = i->k;
1770         }
1771
1772         return ret;
1773 }
1774
1775 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1776                                        struct btree_iter *iter,
1777                                        struct bpos start_pos,
1778                                        struct bpos end_pos)
1779 {
1780         struct bkey_i *k;
1781
1782         if (bpos_cmp(start_pos, iter->journal_pos) < 0)
1783                 iter->journal_idx = 0;
1784
1785         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0,
1786                                         start_pos, end_pos,
1787                                         &iter->journal_idx);
1788
1789         iter->journal_pos = k ? k->k.p : end_pos;
1790         return k;
1791 }
1792
1793 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
1794                                             struct btree_iter *iter,
1795                                             struct bpos pos)
1796 {
1797         return bch2_btree_journal_peek(trans, iter, pos, pos);
1798 }
1799
1800 static noinline
1801 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1802                                          struct btree_iter *iter,
1803                                          struct bkey_s_c k)
1804 {
1805         struct bkey_i *next_journal =
1806                 bch2_btree_journal_peek(trans, iter, iter->path->pos,
1807                                 k.k ? k.k->p : iter->path->l[0].b->key.k.p);
1808
1809         if (next_journal) {
1810                 iter->k = next_journal->k;
1811                 k = bkey_i_to_s_c(next_journal);
1812         }
1813
1814         return k;
1815 }
1816
1817 /*
1818  * Checks btree key cache for key at iter->pos and returns it if present, or
1819  * bkey_s_c_null:
1820  */
1821 static noinline
1822 struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1823 {
1824         struct btree_trans *trans = iter->trans;
1825         struct bch_fs *c = trans->c;
1826         struct bkey u;
1827         int ret;
1828
1829         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1830                 return bkey_s_c_null;
1831
1832         if (!iter->key_cache_path)
1833                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1834                                                      iter->flags & BTREE_ITER_INTENT, 0,
1835                                                      iter->flags|BTREE_ITER_CACHED,
1836                                                      _THIS_IP_);
1837
1838         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1839                                         iter->flags & BTREE_ITER_INTENT,
1840                                         btree_iter_ip_allocated(iter));
1841
1842         ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
1843         if (unlikely(ret))
1844                 return bkey_s_c_err(ret);
1845
1846         btree_path_set_should_be_locked(iter->key_cache_path);
1847
1848         return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1849 }
1850
1851 static noinline
1852 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1853 {
1854         struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
1855         int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
1856
1857         return err ? bkey_s_c_err(err) : ret;
1858 }
1859
1860 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1861 {
1862         struct btree_trans *trans = iter->trans;
1863         struct bkey_i *next_update;
1864         struct bkey_s_c k, k2;
1865         int ret;
1866
1867         EBUG_ON(iter->path->cached);
1868         bch2_btree_iter_verify(iter);
1869
1870         while (1) {
1871                 struct btree_path_level *l;
1872
1873                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1874                                         iter->flags & BTREE_ITER_INTENT,
1875                                         btree_iter_ip_allocated(iter));
1876
1877                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1878                 if (unlikely(ret)) {
1879                         /* ensure that iter->k is consistent with iter->pos: */
1880                         bch2_btree_iter_set_pos(iter, iter->pos);
1881                         k = bkey_s_c_err(ret);
1882                         goto out;
1883                 }
1884
1885                 l = path_l(iter->path);
1886
1887                 if (unlikely(!l->b)) {
1888                         /* No btree nodes at requested level: */
1889                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1890                         k = bkey_s_c_null;
1891                         goto out;
1892                 }
1893
1894                 btree_path_set_should_be_locked(iter->path);
1895
1896                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
1897
1898                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1899                     k.k &&
1900                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1901                         k = k2;
1902                         ret = bkey_err(k);
1903                         if (ret) {
1904                                 bch2_btree_iter_set_pos(iter, iter->pos);
1905                                 goto out;
1906                         }
1907                 }
1908
1909                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
1910                         k = btree_trans_peek_journal(trans, iter, k);
1911
1912                 next_update = iter->flags & BTREE_ITER_WITH_UPDATES
1913                         ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
1914                         : NULL;
1915                 if (next_update &&
1916                     bpos_cmp(next_update->k.p,
1917                              k.k ? k.k->p : l->b->key.k.p) <= 0) {
1918                         iter->k = next_update->k;
1919                         k = bkey_i_to_s_c(next_update);
1920                 }
1921
1922                 if (k.k && bkey_deleted(k.k)) {
1923                         /*
1924                          * If we've got a whiteout, and it's after the search
1925                          * key, advance the search key to the whiteout instead
1926                          * of just after the whiteout - it might be a btree
1927                          * whiteout, with a real key at the same position, since
1928                          * in the btree, deleted keys sort before non-deleted.
1929                          */
1930                         search_key = bpos_cmp(search_key, k.k->p)
1931                                 ? k.k->p
1932                                 : bpos_successor(k.k->p);
1933                         continue;
1934                 }
1935
1936                 if (likely(k.k)) {
1937                         break;
1938                 } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
1939                         /* Advance to next leaf node: */
1940                         search_key = bpos_successor(l->b->key.k.p);
1941                 } else {
1942                         /* End of btree: */
1943                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1944                         k = bkey_s_c_null;
1945                         goto out;
1946                 }
1947         }
1948 out:
1949         bch2_btree_iter_verify(iter);
1950
1951         return k;
1952 }
1953
1954 /**
1955  * bch2_btree_iter_peek_upto: returns first key greater than or equal to
1956  * iterator's current position, up to @end
1957  */
1958 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
1959 {
1960         struct btree_trans *trans = iter->trans;
1961         struct bpos search_key = btree_iter_search_key(iter);
1962         struct bkey_s_c k;
1963         struct bpos iter_pos;
1964         int ret;
1965
1966         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
1967
1968         if (iter->update_path) {
1969                 bch2_path_put_nokeep(trans, iter->update_path,
1970                                      iter->flags & BTREE_ITER_INTENT);
1971                 iter->update_path = NULL;
1972         }
1973
1974         bch2_btree_iter_verify_entry_exit(iter);
1975
1976         while (1) {
1977                 k = __bch2_btree_iter_peek(iter, search_key);
1978                 if (!k.k || bkey_err(k))
1979                         goto out_no_locked;
1980
1981                 /*
1982                  * iter->pos should be monotonically increasing, and always be
1983                  * equal to the key we just returned - except extents can
1984                  * straddle iter->pos:
1985                  */
1986                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1987                         iter_pos = k.k->p;
1988                 else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1989                         iter_pos = bkey_start_pos(k.k);
1990                 else
1991                         iter_pos = iter->pos;
1992
1993                 if (bkey_cmp(iter_pos, end) > 0) {
1994                         bch2_btree_iter_set_pos(iter, end);
1995                         k = bkey_s_c_null;
1996                         goto out_no_locked;
1997                 }
1998
1999                 if (iter->update_path &&
2000                     bkey_cmp(iter->update_path->pos, k.k->p)) {
2001                         bch2_path_put_nokeep(trans, iter->update_path,
2002                                              iter->flags & BTREE_ITER_INTENT);
2003                         iter->update_path = NULL;
2004                 }
2005
2006                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2007                     (iter->flags & BTREE_ITER_INTENT) &&
2008                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2009                     !iter->update_path) {
2010                         struct bpos pos = k.k->p;
2011
2012                         if (pos.snapshot < iter->snapshot) {
2013                                 search_key = bpos_successor(k.k->p);
2014                                 continue;
2015                         }
2016
2017                         pos.snapshot = iter->snapshot;
2018
2019                         /*
2020                          * advance, same as on exit for iter->path, but only up
2021                          * to snapshot
2022                          */
2023                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2024                         iter->update_path = iter->path;
2025
2026                         iter->update_path = bch2_btree_path_set_pos(trans,
2027                                                 iter->update_path, pos,
2028                                                 iter->flags & BTREE_ITER_INTENT,
2029                                                 _THIS_IP_);
2030                 }
2031
2032                 /*
2033                  * We can never have a key in a leaf node at POS_MAX, so
2034                  * we don't have to check these successor() calls:
2035                  */
2036                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2037                     !bch2_snapshot_is_ancestor(trans->c,
2038                                                iter->snapshot,
2039                                                k.k->p.snapshot)) {
2040                         search_key = bpos_successor(k.k->p);
2041                         continue;
2042                 }
2043
2044                 if (bkey_whiteout(k.k) &&
2045                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2046                         search_key = bkey_successor(iter, k.k->p);
2047                         continue;
2048                 }
2049
2050                 break;
2051         }
2052
2053         iter->pos = iter_pos;
2054
2055         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2056                                 iter->flags & BTREE_ITER_INTENT,
2057                                 btree_iter_ip_allocated(iter));
2058
2059         btree_path_set_should_be_locked(iter->path);
2060 out_no_locked:
2061         if (iter->update_path) {
2062                 if (iter->update_path->uptodate &&
2063                     (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
2064                         k = bkey_s_c_err(ret);
2065                 else
2066                         btree_path_set_should_be_locked(iter->update_path);
2067         }
2068
2069         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2070                 iter->pos.snapshot = iter->snapshot;
2071
2072         ret = bch2_btree_iter_verify_ret(iter, k);
2073         if (unlikely(ret)) {
2074                 bch2_btree_iter_set_pos(iter, iter->pos);
2075                 k = bkey_s_c_err(ret);
2076         }
2077
2078         bch2_btree_iter_verify_entry_exit(iter);
2079
2080         return k;
2081 }
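
/*
 * Example (sketch): a minimal caller-side scan over a key range using
 * bch2_btree_iter_peek_upto() and bch2_btree_iter_advance(). The btree ID,
 * positions and flags are illustrative; a real caller would also restart the
 * loop on BCH_ERR_transaction_restart (see bch2_trans_begin() further down):
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 *			     POS(inum, 0), BTREE_ITER_PREFETCH);
 *
 *	while ((k = bch2_btree_iter_peek_upto(&iter, POS(inum, U64_MAX))).k &&
 *	       !(ret = bkey_err(k))) {
 *		... use k ...
 *		bch2_btree_iter_advance(&iter);
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */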
2082
2083 /**
2084  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2085  * to iterator's current position, returning keys from every level of the btree.
2086  * For keys at different levels of the btree that compare equal, the key from
2087  * the lower level (leaf) is returned first.
2088  */
2089 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2090 {
2091         struct btree_trans *trans = iter->trans;
2092         struct bkey_s_c k;
2093         int ret;
2094
2095         EBUG_ON(iter->path->cached);
2096         bch2_btree_iter_verify(iter);
2097         BUG_ON(iter->path->level < iter->min_depth);
2098         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2099         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2100
2101         while (1) {
2102                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2103                                         iter->flags & BTREE_ITER_INTENT,
2104                                         btree_iter_ip_allocated(iter));
2105
2106                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2107                 if (unlikely(ret)) {
2108                         /* ensure that iter->k is consistent with iter->pos: */
2109                         bch2_btree_iter_set_pos(iter, iter->pos);
2110                         k = bkey_s_c_err(ret);
2111                         goto out_no_locked;
2112                 }
2113
2114                 /* Already at end? */
2115                 if (!btree_path_node(iter->path, iter->path->level)) {
2116                         k = bkey_s_c_null;
2117                         goto out_no_locked;
2118                 }
2119
2120                 k = btree_path_level_peek_all(trans->c,
2121                                 &iter->path->l[iter->path->level], &iter->k);
2122
2123                 /* Check if we should go up to the parent node: */
2124                 if (!k.k ||
2125                     (iter->advanced &&
2126                      !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) {
2127                         iter->pos = path_l(iter->path)->b->key.k.p;
2128                         btree_path_set_level_up(trans, iter->path);
2129                         iter->advanced = false;
2130                         continue;
2131                 }
2132
2133                 /*
2134                  * Check if we should go back down to a leaf:
2135                  * If we're not in a leaf node, we only return the current key
2136                  * if it exactly matches iter->pos - otherwise we first have to
2137                  * go back to the leaf:
2138                  */
2139                 if (iter->path->level != iter->min_depth &&
2140                     (iter->advanced ||
2141                      !k.k ||
2142                      bpos_cmp(iter->pos, k.k->p))) {
2143                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2144                         iter->pos = bpos_successor(iter->pos);
2145                         iter->advanced = false;
2146                         continue;
2147                 }
2148
2149                 /* Check if we should go to the next key: */
2150                 if (iter->path->level == iter->min_depth &&
2151                     iter->advanced &&
2152                     k.k &&
2153                     !bpos_cmp(iter->pos, k.k->p)) {
2154                         iter->pos = bpos_successor(iter->pos);
2155                         iter->advanced = false;
2156                         continue;
2157                 }
2158
2159                 if (iter->advanced &&
2160                     iter->path->level == iter->min_depth &&
2161                     bpos_cmp(k.k->p, iter->pos))
2162                         iter->advanced = false;
2163
2164                 BUG_ON(iter->advanced);
2165                 BUG_ON(!k.k);
2166                 break;
2167         }
2168
2169         iter->pos = k.k->p;
2170         btree_path_set_should_be_locked(iter->path);
2171 out_no_locked:
2172         bch2_btree_iter_verify(iter);
2173
2174         return k;
2175 }
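
/*
 * Example (sketch): scanning a btree including interior nodes with
 * BTREE_ITER_ALL_LEVELS (which also implies BTREE_ITER_ALL_SNAPSHOTS, see
 * __bch2_trans_iter_init() below). The btree ID and flags are illustrative
 * and restart handling is omitted:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
 *			     BTREE_ITER_ALL_LEVELS|
 *			     BTREE_ITER_PREFETCH);
 *
 *	while ((k = bch2_btree_iter_peek_all_levels(&iter)).k &&
 *	       !(ret = bkey_err(k))) {
 *		... use k, from whichever level it was found at ...
 *		bch2_btree_iter_advance(&iter);
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */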
2176
2177 /**
2178  * bch2_btree_iter_next: returns first key greater than iterator's current
2179  * position
2180  */
2181 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2182 {
2183         if (!bch2_btree_iter_advance(iter))
2184                 return bkey_s_c_null;
2185
2186         return bch2_btree_iter_peek(iter);
2187 }
2188
2189 /**
2190  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2191  * iterator's current position
2192  */
2193 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2194 {
2195         struct btree_trans *trans = iter->trans;
2196         struct bpos search_key = iter->pos;
2197         struct btree_path *saved_path = NULL;
2198         struct bkey_s_c k;
2199         struct bkey saved_k;
2200         const struct bch_val *saved_v;
2201         int ret;
2202
2203         EBUG_ON(iter->path->cached || iter->path->level);
2204         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2205
2206         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2207                 return bkey_s_c_err(-EIO);
2208
2209         bch2_btree_iter_verify(iter);
2210         bch2_btree_iter_verify_entry_exit(iter);
2211
2212         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2213                 search_key.snapshot = U32_MAX;
2214
2215         while (1) {
2216                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2217                                                 iter->flags & BTREE_ITER_INTENT,
2218                                                 btree_iter_ip_allocated(iter));
2219
2220                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2221                 if (unlikely(ret)) {
2222                         /* ensure that iter->k is consistent with iter->pos: */
2223                         bch2_btree_iter_set_pos(iter, iter->pos);
2224                         k = bkey_s_c_err(ret);
2225                         goto out_no_locked;
2226                 }
2227
2228                 k = btree_path_level_peek(trans, iter->path,
2229                                           &iter->path->l[0], &iter->k);
2230                 if (!k.k ||
2231                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2232                      ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
2233                      : bpos_cmp(k.k->p, search_key) > 0))
2234                         k = btree_path_level_prev(trans, iter->path,
2235                                                   &iter->path->l[0], &iter->k);
2236
2237                 bch2_btree_path_check_sort(trans, iter->path, 0);
2238
2239                 if (likely(k.k)) {
2240                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2241                                 if (k.k->p.snapshot == iter->snapshot)
2242                                         goto got_key;
2243
2244                                 /*
2245                                  * If we have a saved candidate, and we're no
2246                                  * longer at the same _key_ (not pos), return
2247                                  * that candidate
2248                                  */
2249                                 if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
2250                                         bch2_path_put_nokeep(trans, iter->path,
2251                                                       iter->flags & BTREE_ITER_INTENT);
2252                                         iter->path = saved_path;
2253                                         saved_path = NULL;
2254                                         iter->k = saved_k;
2255                                         k.v     = saved_v;
2256                                         goto got_key;
2257                                 }
2258
2259                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2260                                                               iter->snapshot,
2261                                                               k.k->p.snapshot)) {
2262                                         if (saved_path)
2263                                                 bch2_path_put_nokeep(trans, saved_path,
2264                                                       iter->flags & BTREE_ITER_INTENT);
2265                                         saved_path = btree_path_clone(trans, iter->path,
2266                                                                 iter->flags & BTREE_ITER_INTENT);
2267                                         saved_k = *k.k;
2268                                         saved_v = k.v;
2269                                 }
2270
2271                                 search_key = bpos_predecessor(k.k->p);
2272                                 continue;
2273                         }
2274 got_key:
2275                         if (bkey_whiteout(k.k) &&
2276                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2277                                 search_key = bkey_predecessor(iter, k.k->p);
2278                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2279                                         search_key.snapshot = U32_MAX;
2280                                 continue;
2281                         }
2282
2283                         break;
2284                 } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
2285                         /* Advance to previous leaf node: */
2286                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2287                 } else {
2288                         /* Start of btree: */
2289                         bch2_btree_iter_set_pos(iter, POS_MIN);
2290                         k = bkey_s_c_null;
2291                         goto out_no_locked;
2292                 }
2293         }
2294
2295         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
2296
2297         /* Extents can straddle iter->pos: */
2298         if (bkey_cmp(k.k->p, iter->pos) < 0)
2299                 iter->pos = k.k->p;
2300
2301         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2302                 iter->pos.snapshot = iter->snapshot;
2303
2304         btree_path_set_should_be_locked(iter->path);
2305 out_no_locked:
2306         if (saved_path)
2307                 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2308
2309         bch2_btree_iter_verify_entry_exit(iter);
2310         bch2_btree_iter_verify(iter);
2311
2312         return k;
2313 }
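
/*
 * Example (sketch): walking a btree backwards from a starting position with
 * bch2_btree_iter_peek_prev() and bch2_btree_iter_prev(). As the checks at
 * the top of peek_prev note, this path doesn't support BTREE_ITER_WITH_UPDATES
 * and errors out before journal replay has finished. Btree ID and position
 * are illustrative:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, start_pos, 0);
 *
 *	for (k = bch2_btree_iter_peek_prev(&iter);
 *	     k.k && !(ret = bkey_err(k));
 *	     k = bch2_btree_iter_prev(&iter)) {
 *		... use k, positions are non-increasing ...
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */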
2314
2315 /**
2316  * bch2_btree_iter_prev: returns first key less than iterator's current
2317  * position
2318  */
2319 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2320 {
2321         if (!bch2_btree_iter_rewind(iter))
2322                 return bkey_s_c_null;
2323
2324         return bch2_btree_iter_peek_prev(iter);
2325 }
2326
2327 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2328 {
2329         struct btree_trans *trans = iter->trans;
2330         struct bpos search_key;
2331         struct bkey_s_c k;
2332         int ret;
2333
2334         bch2_btree_iter_verify(iter);
2335         bch2_btree_iter_verify_entry_exit(iter);
2336         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2337         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2338
2339         /* extents can't span inode numbers: */
2340         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2341             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2342                 if (iter->pos.inode == KEY_INODE_MAX)
2343                         return bkey_s_c_null;
2344
2345                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2346         }
2347
2348         search_key = btree_iter_search_key(iter);
2349         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2350                                         iter->flags & BTREE_ITER_INTENT,
2351                                         btree_iter_ip_allocated(iter));
2352
2353         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2354         if (unlikely(ret)) {
2355                 k = bkey_s_c_err(ret);
2356                 goto out_no_locked;
2357         }
2358
2359         if ((iter->flags & BTREE_ITER_CACHED) ||
2360             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2361                 struct bkey_i *next_update;
2362
2363                 if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
2364                     (next_update = btree_trans_peek_updates(trans,
2365                                                 iter->btree_id, search_key)) &&
2366                     !bpos_cmp(next_update->k.p, iter->pos)) {
2367                         iter->k = next_update->k;
2368                         k = bkey_i_to_s_c(next_update);
2369                         goto out;
2370                 }
2371
2372                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2373                     (next_update = bch2_btree_journal_peek_slot(trans,
2374                                         iter, iter->pos))) {
2375                         iter->k = next_update->k;
2376                         k = bkey_i_to_s_c(next_update);
2377                         goto out;
2378                 }
2379
2380                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2381                     (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
2382                         if (!bkey_err(k))
2383                                 iter->k = *k.k;
2384                         /* We're not returning a key from iter->path: */
2385                         goto out_no_locked;
2386                 }
2387
2388                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2389                 if (unlikely(!k.k))
2390                         goto out_no_locked;
2391         } else {
2392                 struct bpos next;
2393
2394                 EBUG_ON(iter->path->level);
2395
2396                 if (iter->flags & BTREE_ITER_INTENT) {
2397                         struct btree_iter iter2;
2398                         struct bpos end = iter->pos;
2399
2400                         if (iter->flags & BTREE_ITER_IS_EXTENTS)
2401                                 end.offset = U64_MAX;
2402
2403                         bch2_trans_copy_iter(&iter2, iter);
2404                         k = bch2_btree_iter_peek_upto(&iter2, end);
2405
2406                         if (k.k && !bkey_err(k)) {
2407                                 iter->k = iter2.k;
2408                                 k.k = &iter->k;
2409                         }
2410                         bch2_trans_iter_exit(trans, &iter2);
2411                 } else {
2412                         struct bpos pos = iter->pos;
2413
2414                         k = bch2_btree_iter_peek(iter);
2415                         if (unlikely(bkey_err(k)))
2416                                 bch2_btree_iter_set_pos(iter, pos);
2417                         else
2418                                 iter->pos = pos;
2419                 }
2420
2421                 if (unlikely(bkey_err(k)))
2422                         goto out_no_locked;
2423
2424                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2425
2426                 if (bkey_cmp(iter->pos, next) < 0) {
2427                         bkey_init(&iter->k);
2428                         iter->k.p = iter->pos;
2429
2430                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2431                                 bch2_key_resize(&iter->k,
2432                                                 min_t(u64, KEY_SIZE_MAX,
2433                                                       (next.inode == iter->pos.inode
2434                                                        ? next.offset
2435                                                        : KEY_OFFSET_MAX) -
2436                                                       iter->pos.offset));
2437                                 EBUG_ON(!iter->k.size);
2438                         }
2439
2440                         k = (struct bkey_s_c) { &iter->k, NULL };
2441                 }
2442         }
2443 out:
2444         btree_path_set_should_be_locked(iter->path);
2445 out_no_locked:
2446         bch2_btree_iter_verify_entry_exit(iter);
2447         bch2_btree_iter_verify(iter);
2448         ret = bch2_btree_iter_verify_ret(iter, k);
2449         if (unlikely(ret))
2450                 return bkey_s_c_err(ret);
2451
2452         return k;
2453 }
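
/*
 * Example (sketch): a point lookup with slot semantics - if nothing is stored
 * at the position, the key that comes back is a deleted "hole" key covering
 * the position rather than the next real key. The btree ID and the
 * BTREE_ITER_CACHED flag are illustrative:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
 *			     BTREE_ITER_CACHED);
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	if (!ret && k.k && !bkey_deleted(k.k)) {
 *		... something is stored at pos ...
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */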
2454
2455 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2456 {
2457         if (!bch2_btree_iter_advance(iter))
2458                 return bkey_s_c_null;
2459
2460         return bch2_btree_iter_peek_slot(iter);
2461 }
2462
2463 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2464 {
2465         if (!bch2_btree_iter_rewind(iter))
2466                 return bkey_s_c_null;
2467
2468         return bch2_btree_iter_peek_slot(iter);
2469 }
2470
2471 /* new transactional stuff: */
2472
2473 static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
2474                                                 struct btree_path *path)
2475 {
2476         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2477         EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2478         EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
2479 }
2480
2481 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2482 {
2483 #ifdef CONFIG_BCACHEFS_DEBUG
2484         unsigned i;
2485
2486         for (i = 0; i < trans->nr_sorted; i++)
2487                 btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
2488 #endif
2489 }
2490
2491 static void btree_trans_verify_sorted(struct btree_trans *trans)
2492 {
2493 #ifdef CONFIG_BCACHEFS_DEBUG
2494         struct btree_path *path, *prev = NULL;
2495         unsigned i;
2496
2497         if (!bch2_debug_check_iterators)
2498                 return;
2499
2500         trans_for_each_path_inorder(trans, path, i) {
2501                 if (prev && btree_path_cmp(prev, path) > 0) {
2502                         bch2_dump_trans_paths_updates(trans);
2503                         panic("trans paths out of order!\n");
2504                 }
2505                 prev = path;
2506         }
2507 #endif
2508 }
2509
2510 static inline void btree_path_swap(struct btree_trans *trans,
2511                                    struct btree_path *l, struct btree_path *r)
2512 {
2513         swap(l->sorted_idx, r->sorted_idx);
2514         swap(trans->sorted[l->sorted_idx],
2515              trans->sorted[r->sorted_idx]);
2516
2517         btree_path_verify_sorted_ref(trans, l);
2518         btree_path_verify_sorted_ref(trans, r);
2519 }
2520
2521 inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
2522                                        int cmp)
2523 {
2524         struct btree_path *n;
2525
2526         if (cmp <= 0) {
2527                 n = prev_btree_path(trans, path);
2528                 if (n && btree_path_cmp(n, path) > 0) {
2529                         do {
2530                                 btree_path_swap(trans, n, path);
2531                                 n = prev_btree_path(trans, path);
2532                         } while (n && btree_path_cmp(n, path) > 0);
2533
2534                         goto out;
2535                 }
2536         }
2537
2538         if (cmp >= 0) {
2539                 n = next_btree_path(trans, path);
2540                 if (n && btree_path_cmp(path, n) > 0) {
2541                         do {
2542                                 btree_path_swap(trans, path, n);
2543                                 n = next_btree_path(trans, path);
2544                         } while (n && btree_path_cmp(path, n) > 0);
2545                 }
2546         }
2547 out:
2548         btree_trans_verify_sorted(trans);
2549 }
2550
2551 static inline void btree_path_list_remove(struct btree_trans *trans,
2552                                           struct btree_path *path)
2553 {
2554         unsigned i;
2555
2556         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2557
2558         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2559
2560         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2561                 trans->paths[trans->sorted[i]].sorted_idx = i;
2562
2563         path->sorted_idx = U8_MAX;
2564
2565         btree_trans_verify_sorted_refs(trans);
2566 }
2567
2568 static inline void btree_path_list_add(struct btree_trans *trans,
2569                                        struct btree_path *pos,
2570                                        struct btree_path *path)
2571 {
2572         unsigned i;
2573
2574         btree_trans_verify_sorted_refs(trans);
2575
2576         path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2577
2578         if (trans->in_traverse_all &&
2579             trans->traverse_all_idx != U8_MAX &&
2580             trans->traverse_all_idx >= path->sorted_idx)
2581                 trans->traverse_all_idx++;
2582
2583         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2584
2585         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2586                 trans->paths[trans->sorted[i]].sorted_idx = i;
2587
2588         btree_trans_verify_sorted_refs(trans);
2589 }
2590
2591 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2592 {
2593         if (iter->path)
2594                 bch2_path_put(trans, iter->path,
2595                               iter->flags & BTREE_ITER_INTENT);
2596         if (iter->update_path)
2597                 bch2_path_put_nokeep(trans, iter->update_path,
2598                               iter->flags & BTREE_ITER_INTENT);
2599         if (iter->key_cache_path)
2600                 bch2_path_put(trans, iter->key_cache_path,
2601                               iter->flags & BTREE_ITER_INTENT);
2602         iter->path = NULL;
2603         iter->update_path = NULL;
2604         iter->key_cache_path = NULL;
2605 }
2606
2607 static inline void __bch2_trans_iter_init(struct btree_trans *trans,
2608                                           struct btree_iter *iter,
2609                                           unsigned btree_id, struct bpos pos,
2610                                           unsigned locks_want,
2611                                           unsigned depth,
2612                                           unsigned flags,
2613                                           unsigned long ip)
2614 {
2615         if (trans->restarted)
2616                 panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n",
2617                       bch2_err_str(trans->restarted),
2618                       (void *) trans->last_restarted_ip);
2619
2620         if (flags & BTREE_ITER_ALL_LEVELS)
2621                 flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
2622
2623         if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
2624             btree_node_type_is_extents(btree_id))
2625                 flags |= BTREE_ITER_IS_EXTENTS;
2626
2627         if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
2628             !btree_type_has_snapshots(btree_id))
2629                 flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
2630
2631         if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
2632             btree_type_has_snapshots(btree_id))
2633                 flags |= BTREE_ITER_FILTER_SNAPSHOTS;
2634
2635         if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
2636                 flags |= BTREE_ITER_WITH_JOURNAL;
2637
2638         iter->trans     = trans;
2639         iter->path      = NULL;
2640         iter->update_path = NULL;
2641         iter->key_cache_path = NULL;
2642         iter->btree_id  = btree_id;
2643         iter->min_depth = depth;
2644         iter->flags     = flags;
2645         iter->snapshot  = pos.snapshot;
2646         iter->pos       = pos;
2647         iter->k.type    = KEY_TYPE_deleted;
2648         iter->k.p       = pos;
2649         iter->k.size    = 0;
2650         iter->journal_idx = 0;
2651         iter->journal_pos = POS_MIN;
2652 #ifdef CONFIG_BCACHEFS_DEBUG
2653         iter->ip_allocated = ip;
2654 #endif
2655
2656         iter->path = bch2_path_get(trans, btree_id, iter->pos,
2657                                    locks_want, depth, flags, ip);
2658 }
2659
2660 void bch2_trans_iter_init(struct btree_trans *trans,
2661                           struct btree_iter *iter,
2662                           unsigned btree_id, struct bpos pos,
2663                           unsigned flags)
2664 {
2665         if (!btree_id_cached(trans->c, btree_id)) {
2666                 flags &= ~BTREE_ITER_CACHED;
2667                 flags &= ~BTREE_ITER_WITH_KEY_CACHE;
2668         } else if (!(flags & BTREE_ITER_CACHED))
2669                 flags |= BTREE_ITER_WITH_KEY_CACHE;
2670
2671         __bch2_trans_iter_init(trans, iter, btree_id, pos,
2672                                0, 0, flags, _RET_IP_);
2673 }
2674
2675 void bch2_trans_node_iter_init(struct btree_trans *trans,
2676                                struct btree_iter *iter,
2677                                enum btree_id btree_id,
2678                                struct bpos pos,
2679                                unsigned locks_want,
2680                                unsigned depth,
2681                                unsigned flags)
2682 {
2683         __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
2684                                BTREE_ITER_NOT_EXTENTS|
2685                                __BTREE_ITER_ALL_SNAPSHOTS|
2686                                BTREE_ITER_ALL_SNAPSHOTS|
2687                                flags, _RET_IP_);
2688         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2689         BUG_ON(iter->path->level        != depth);
2690         BUG_ON(iter->min_depth          != depth);
2691 }
2692
2693 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2694 {
2695         *dst = *src;
2696         if (src->path)
2697                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2698         if (src->update_path)
2699                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2700         dst->key_cache_path = NULL;
2701 }
2702
2703 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2704 {
2705         unsigned new_top = trans->mem_top + size;
2706         size_t old_bytes = trans->mem_bytes;
2707         size_t new_bytes = roundup_pow_of_two(new_top);
2708         void *new_mem;
2709         void *p;
2710
2711         trans->mem_max = max(trans->mem_max, new_top);
2712
2713         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2714
2715         new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2716         if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2717                 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2718                 new_bytes = BTREE_TRANS_MEM_MAX;
2719                 kfree(trans->mem);
2720         }
2721
2722         if (!new_mem)
2723                 return ERR_PTR(-ENOMEM);
2724
2725         trans->mem = new_mem;
2726         trans->mem_bytes = new_bytes;
2727
2728         if (old_bytes) {
2729                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2730                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2731         }
2732
2733         p = trans->mem + trans->mem_top;
2734         trans->mem_top += size;
2735         memset(p, 0, size);
2736         return p;
2737 }
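
/*
 * Example (sketch): allocating transaction-lifetime memory, typically for a
 * mutable copy of a key that has to stay valid until the transaction commits.
 * Callers normally go through the bch2_trans_kmalloc() wrapper declared in
 * the header rather than __bch2_trans_kmalloc() directly; either way the
 * result may be an ERR_PTR - including a transaction restart when the buffer
 * had to be reallocated - so the retry loop must handle it:
 *
 *	struct bkey_i *n;
 *
 *	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
 *	ret = PTR_ERR_OR_ZERO(n);
 *	if (ret)
 *		goto err;
 *	bkey_reassemble(n, k);
 *	... modify n, then pass it to bch2_trans_update() ...
 */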
2738
2739 /**
2740  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2741  * @trans: transaction to reset
2742  *
2743  * While iterating over nodes or updating nodes, an attempt to lock a btree node
2744  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2745  * occurs, bch2_trans_begin() should be called and the transaction retried.
2746  */
2747 u32 bch2_trans_begin(struct btree_trans *trans)
2748 {
2749         struct btree_path *path;
2750
2751         bch2_trans_reset_updates(trans);
2752
2753         trans->restart_count++;
2754         trans->mem_top                  = 0;
2755
2756         if (trans->fs_usage_deltas) {
2757                 trans->fs_usage_deltas->used = 0;
2758                 memset((void *) trans->fs_usage_deltas +
2759                        offsetof(struct replicas_delta_list, memset_start), 0,
2760                        (void *) &trans->fs_usage_deltas->memset_end -
2761                        (void *) &trans->fs_usage_deltas->memset_start);
2762         }
2763
2764         trans_for_each_path(trans, path) {
2765                 path->should_be_locked = false;
2766
2767                 /*
2768                  * If the transaction wasn't restarted, we're presuming to be
2769                  * doing something new: don't keep iterators except the ones that
2770                  * are in use - except for the subvolumes btree:
2771                  */
2772                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2773                         path->preserve = false;
2774
2775                 /*
2776                  * XXX: we probably shouldn't be doing this if the transaction
2777                  * was restarted, but currently we still overflow transaction
2778                  * iterators if we do that
2779                  */
2780                 if (!path->ref && !path->preserve)
2781                         __bch2_path_free(trans, path);
2782                 else
2783                         path->preserve = false;
2784         }
2785
2786         if (!trans->restarted &&
2787             (need_resched() ||
2788              local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2789                 bch2_trans_unlock(trans);
2790                 cond_resched();
2791                 bch2_trans_relock(trans);
2792         }
2793
2794         trans->last_restarted_ip = _RET_IP_;
2795         if (trans->restarted)
2796                 bch2_btree_path_traverse_all(trans);
2797
2798         trans->last_begin_time = local_clock();
2799         return trans->restart_count;
2800 }
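
/*
 * Example (sketch): the retry loop described in the comment above, where
 * do_transactional_work() stands in for whatever iterator/update work the
 * caller does within a single transaction attempt:
 *
 *	do {
 *		bch2_trans_begin(trans);
 *
 *		ret =   do_transactional_work(trans) ?:
 *			bch2_trans_commit(trans, NULL, NULL, 0);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *
 * The return value of bch2_trans_begin() is the transaction's restart count,
 * which can later be checked with bch2_trans_verify_not_restarted().
 */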
2801
2802 void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
2803 {
2804         if (trans_was_restarted(trans, restart_count))
2805                 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
2806                       trans->restart_count, restart_count,
2807                       (void *) trans->last_restarted_ip);
2808 }
2809
2810 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2811 {
2812         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2813         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2814         void *p = NULL;
2815
2816         BUG_ON(trans->used_mempool);
2817
2818 #ifdef __KERNEL__
2819         p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
2820 #endif
2821         if (!p)
2822                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2823
2824         trans->paths            = p; p += paths_bytes;
2825         trans->updates          = p; p += updates_bytes;
2826 }
2827
2828 static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c,
2829                                         const char *fn)
2830 {
2831         unsigned i;
2832
2833         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++)
2834                 if (!c->btree_transaction_fns[i] ||
2835                     c->btree_transaction_fns[i] == fn) {
2836                         c->btree_transaction_fns[i] = fn;
2837                         return i;
2838                 }
2839
2840         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2841         return i;
2842 }
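
/*
 * Editor's note: this records the transaction's calling function in a small
 * fixed-size registry so per-callsite statistics can be looked up via
 * trans->fn_idx; once the table is full, the warning fires and an
 * out-of-range index is returned, which btree_trans_stats() is presumably
 * expected to treat as "no stats available".
 */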
2843
2844 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn)
2845         __acquires(&c->btree_trans_barrier)
2846 {
2847         struct btree_transaction_stats *s;
2848         struct btree_trans *pos;
2849
2850         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
2851
2852         memset(trans, 0, sizeof(*trans));
2853         trans->c                = c;
2854         trans->fn               = fn;
2855         trans->last_begin_time  = local_clock();
2856         trans->fn_idx           = bch2_trans_get_fn_idx(trans, c, fn);
2857         trans->locking_wait.task = current;
2858         closure_init_stack(&trans->ref);
2859
2860         bch2_trans_alloc_paths(trans, c);
2861
2862         s = btree_trans_stats(trans);
2863         if (s) {
2864                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2865
2866                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2867
2868                 if (unlikely(!trans->mem)) {
2869                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2870                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2871                 } else {
2872                         trans->mem_bytes = expected_mem_bytes;
2873                 }
2874
2875                 trans->nr_max_paths = s->nr_max_paths;
2876         }
2877
2878         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2879
2880         mutex_lock(&c->btree_trans_lock);
2881         list_for_each_entry(pos, &c->btree_trans_list, list) {
2882                 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
2883                         list_add_tail(&trans->list, &pos->list);
2884                         goto list_add_done;
2885                 }
2886         }
2887         list_add_tail(&trans->list, &c->btree_trans_list);
2888 list_add_done:
2889         mutex_unlock(&c->btree_trans_lock);
2890 }
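
/*
 * Editor's sketch: callers normally reach this through the bch2_trans_init()
 * wrapper, which supplies __func__ as fn, and pair it with bch2_trans_exit();
 * the wrapper's exact signature is assumed here, not defined in this file:
 *
 *        struct btree_trans trans;
 *
 *        bch2_trans_init(&trans, c, 0, 0);
 *        ...
 *        bch2_trans_exit(&trans);
 *
 * The pid-ordered insertion into c->btree_trans_list above keeps the list of
 * live transactions sorted by pid.
 */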
2891
2892 static void check_btree_paths_leaked(struct btree_trans *trans)
2893 {
2894 #ifdef CONFIG_BCACHEFS_DEBUG
2895         struct bch_fs *c = trans->c;
2896         struct btree_path *path;
2897
2898         trans_for_each_path(trans, path)
2899                 if (path->ref)
2900                         goto leaked;
2901         return;
2902 leaked:
2903         bch_err(c, "btree paths leaked from %s!", trans->fn);
2904         trans_for_each_path(trans, path)
2905                 if (path->ref)
2906                         printk(KERN_ERR "  btree %s %pS\n",
2907                                bch2_btree_ids[path->btree_id],
2908                                (void *) path->ip_allocated);
2909         /* Be noisy about this: */
2910         bch2_fatal_error(c);
2911 #endif
2912 }
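
/*
 * Editor's note: a leak reported here means some btree_path still holds a
 * reference at bch2_trans_exit() time - typically an iterator set up with
 * bch2_trans_iter_init() that was never passed to bch2_trans_iter_exit().
 * The ip_allocated printed above points at the allocating callsite (debug
 * builds only, like this whole check).
 */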
2913
2914 void bch2_trans_exit(struct btree_trans *trans)
2915         __releases(&c->btree_trans_barrier)
2916 {
2917         struct btree_insert_entry *i;
2918         struct bch_fs *c = trans->c;
2919         struct btree_transaction_stats *s = btree_trans_stats(trans);
2920
2921         bch2_trans_unlock(trans);
2922
2923         closure_sync(&trans->ref);
2924
2925         if (s)
2926                 s->max_mem = max(s->max_mem, trans->mem_max);
2927
2928         trans_for_each_update(trans, i)
2929                 __btree_path_put(i->path, true);
2930         trans->nr_updates               = 0;
2931
2932         check_btree_paths_leaked(trans);
2933
2934         mutex_lock(&c->btree_trans_lock);
2935         list_del(&trans->list);
2936         mutex_unlock(&c->btree_trans_lock);
2937
2938         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2939
2940         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
2941
2942         kfree(trans->extra_journal_entries.data);
2943
2944         if (trans->fs_usage_deltas) {
2945                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2946                     REPLICAS_DELTA_LIST_MAX)
2947                         mempool_free(trans->fs_usage_deltas,
2948                                      &c->replicas_delta_pool);
2949                 else
2950                         kfree(trans->fs_usage_deltas);
2951         }
2952
2953         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2954                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
2955         else
2956                 kfree(trans->mem);
2957
2958 #ifdef __KERNEL__
2959         /*
2960          * Userspace doesn't have a real percpu implementation:
2961          */
2962         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
2963 #endif
2964
2965         if (trans->paths)
2966                 mempool_free(trans->paths, &c->btree_paths_pool);
2967
2968         trans->mem      = (void *) 0x1;
2969         trans->paths    = (void *) 0x1;
2970 }
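
/*
 * Editor's note: trans->mem and trans->paths are poisoned with 0x1 rather
 * than NULL, presumably so that any use after bch2_trans_exit() faults
 * immediately instead of looking like an ordinary "not yet allocated" state.
 */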
2971
2972 static void __maybe_unused
2973 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
2974                                       struct btree_bkey_cached_common *b)
2975 {
2976         struct six_lock_count c = six_lock_counts(&b->lock);
2977         struct task_struct *owner;
2978         pid_t pid;
2979
2980         rcu_read_lock();
2981         owner = READ_ONCE(b->lock.owner);
2982         pid = owner ? owner->pid : 0;
2983         rcu_read_unlock();
2984
2985         prt_tab(out);
2986         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
2987                    b->level, bch2_btree_ids[b->btree_id]);
2988         bch2_bpos_to_text(out, btree_node_pos(b));
2989
2990         prt_tab(out);
2991         prt_printf(out, " locks %u:%u:%u held by pid %u",
2992                    c.n[0], c.n[1], c.n[2], pid);
2993 }
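
/*
 * Editor's note: the three counts printed above are the read, intent and
 * write holds on the node's six lock, in that order - the same ordering as
 * the 'r'/'i'/'w' lock_types table in bch2_btree_trans_to_text() below.
 */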
2994
2995 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
2996 {
2997         struct btree_path *path;
2998         struct btree_bkey_cached_common *b;
2999         static char lock_types[] = { 'r', 'i', 'w' };
3000         unsigned l;
3001
3002         if (!out->nr_tabstops) {
3003                 printbuf_tabstop_push(out, 16);
3004                 printbuf_tabstop_push(out, 32);
3005         }
3006
3007         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3008
3009         trans_for_each_path(trans, path) {
3010                 if (!path->nodes_locked)
3011                         continue;
3012
3013                 prt_printf(out, "  path %u %c l=%u %s:",
3014                        path->idx,
3015                        path->cached ? 'c' : 'b',
3016                        path->level,
3017                        bch2_btree_ids[path->btree_id]);
3018                 bch2_bpos_to_text(out, path->pos);
3019                 prt_newline(out);
3020
3021                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3022                         if (btree_node_locked(path, l) &&
3023                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3024                                 prt_printf(out, "    %c l=%u ",
3025                                            lock_types[btree_node_locked_type(path, l)], l);
3026                                 bch2_btree_bkey_cached_common_to_text(out, b);
3027                                 prt_newline(out);
3028                         }
3029                 }
3030         }
3031
3032         b = READ_ONCE(trans->locking);
3033         if (b) {
3034                 prt_str(out, "  want");
3035                 prt_newline(out);
3036                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3037                 bch2_btree_bkey_cached_common_to_text(out, b);
3038                 prt_newline(out);
3039         }
3040 }
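
/*
 * Editor's note: the output is one header line per transaction (pid and
 * function name), one line per path that currently holds node locks, and an
 * indented line per locked level giving the lock type held ('r'/'i'/'w') and
 * the node it is held on; if the transaction is blocked on a lock, the node
 * it wants is printed under "want".
 */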
3041
3042 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3043 {
3044         struct btree_transaction_stats *s;
3045
3046         for (s = c->btree_transaction_stats;
3047              s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3048              s++)
3049                 kfree(s->max_paths_text);
3050
3051         if (c->btree_trans_barrier_initialized)
3052                 cleanup_srcu_struct(&c->btree_trans_barrier);
3053         mempool_exit(&c->btree_trans_mem_pool);
3054         mempool_exit(&c->btree_paths_pool);
3055 }
3056
3057 int bch2_fs_btree_iter_init(struct bch_fs *c)
3058 {
3059         unsigned i, nr = BTREE_ITER_MAX;
3060         int ret;
3061
3062         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
3063                 mutex_init(&c->btree_transaction_stats[i].lock);
3064
3065         INIT_LIST_HEAD(&c->btree_trans_list);
3066         mutex_init(&c->btree_trans_lock);
3067
3068         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3069                         sizeof(struct btree_path) * nr +
3070                         sizeof(struct btree_insert_entry) * nr) ?:
3071                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3072                                           BTREE_TRANS_MEM_MAX) ?:
3073                 init_srcu_struct(&c->btree_trans_barrier);
3074         if (!ret)
3075                 c->btree_trans_barrier_initialized = true;
3076         return ret;
3077 }
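
/*
 * Editor's note: the btree_paths_pool element size set up here must stay in
 * sync with the paths_bytes + updates_bytes layout consumed by
 * bch2_trans_alloc_paths() above; both are sized for BTREE_ITER_MAX entries.
 */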