1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18
19 #include <linux/prandom.h>
20 #include <linux/prefetch.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_trans_verify_sorted(struct btree_trans *);
24 inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
25
26 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
27 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
28                                        struct btree_path *);
29
30 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
31 {
32 #ifdef CONFIG_BCACHEFS_DEBUG
33         return iter->ip_allocated;
34 #else
35         return 0;
36 #endif
37 }
38
39 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
40
41 /*
42  * Unlocks before scheduling
43  * Note: does not revalidate iterator
44  */
45 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
46 {
47         if (need_resched() || race_fault()) {
48                 bch2_trans_unlock(trans);
49                 schedule();
50                 return bch2_trans_relock(trans);
51         } else {
52                 return 0;
53         }
54 }
55
56 static inline int __btree_path_cmp(const struct btree_path *l,
57                                    enum btree_id        r_btree_id,
58                                    bool                 r_cached,
59                                    struct bpos          r_pos,
60                                    unsigned             r_level)
61 {
62         /*
63          * Must match lock ordering as defined by __bch2_btree_node_lock:
64          */
65         return   cmp_int(l->btree_id,   r_btree_id) ?:
66                  cmp_int((int) l->cached,       (int) r_cached) ?:
67                  bpos_cmp(l->pos,       r_pos) ?:
68                 -cmp_int(l->level,      r_level);
69 }
70
71 static inline int btree_path_cmp(const struct btree_path *l,
72                                  const struct btree_path *r)
73 {
74         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
75 }
76
77 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
78 {
79         /* Are we iterating over keys in all snapshots? */
80         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
81                 p = bpos_successor(p);
82         } else {
83                 p = bpos_nosnap_successor(p);
84                 p.snapshot = iter->snapshot;
85         }
86
87         return p;
88 }
89
90 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
91 {
92         /* Are we iterating over keys in all snapshots? */
93         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
94                 p = bpos_predecessor(p);
95         } else {
96                 p = bpos_nosnap_predecessor(p);
97                 p.snapshot = iter->snapshot;
98         }
99
100         return p;
101 }
102
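/*
 * For extents iterators, keys are indexed by their end position: to find the
 * extent covering iter->pos we search from the successor of pos, so that
 * extents ending exactly at pos are skipped:
 */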
103 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
104 {
105         struct bpos pos = iter->pos;
106
107         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
108             bkey_cmp(pos, POS_MAX))
109                 pos = bkey_successor(iter, pos);
110         return pos;
111 }
112
113 static inline bool btree_path_pos_before_node(struct btree_path *path,
114                                               struct btree *b)
115 {
116         return bpos_cmp(path->pos, b->data->min_key) < 0;
117 }
118
119 static inline bool btree_path_pos_after_node(struct btree_path *path,
120                                              struct btree *b)
121 {
122         return bpos_cmp(b->key.k.p, path->pos) < 0;
123 }
124
125 static inline bool btree_path_pos_in_node(struct btree_path *path,
126                                           struct btree *b)
127 {
128         return path->btree_id == b->c.btree_id &&
129                 !btree_path_pos_before_node(path, b) &&
130                 !btree_path_pos_after_node(path, b);
131 }
132
133 /* Btree iterator: */
134
135 #ifdef CONFIG_BCACHEFS_DEBUG
136
137 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
138                                           struct btree_path *path)
139 {
140         struct bkey_cached *ck;
141         bool locked = btree_node_locked(path, 0);
142
143         if (!bch2_btree_node_relock(trans, path, 0))
144                 return;
145
146         ck = (void *) path->l[0].b;
147         BUG_ON(ck->key.btree_id != path->btree_id ||
148                bkey_cmp(ck->key.pos, path->pos));
149
150         if (!locked)
151                 btree_node_unlock(trans, path, 0);
152 }
153
154 static void bch2_btree_path_verify_level(struct btree_trans *trans,
155                                 struct btree_path *path, unsigned level)
156 {
157         struct btree_path_level *l;
158         struct btree_node_iter tmp;
159         bool locked;
160         struct bkey_packed *p, *k;
161         struct printbuf buf1 = PRINTBUF;
162         struct printbuf buf2 = PRINTBUF;
163         struct printbuf buf3 = PRINTBUF;
164         const char *msg;
165
166         if (!bch2_debug_check_iterators)
167                 return;
168
169         l       = &path->l[level];
170         tmp     = l->iter;
171         locked  = btree_node_locked(path, level);
172
173         if (path->cached) {
174                 if (!level)
175                         bch2_btree_path_verify_cached(trans, path);
176                 return;
177         }
178
179         if (!btree_path_node(path, level))
180                 return;
181
182         if (!bch2_btree_node_relock_notrace(trans, path, level))
183                 return;
184
185         BUG_ON(!btree_path_pos_in_node(path, l->b));
186
187         bch2_btree_node_iter_verify(&l->iter, l->b);
188
189         /*
190          * For interior nodes, the iterator will have skipped past deleted keys:
191          */
192         p = level
193                 ? bch2_btree_node_iter_prev(&tmp, l->b)
194                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
195         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
196
197         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
198                 msg = "before";
199                 goto err;
200         }
201
202         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
203                 msg = "after";
204                 goto err;
205         }
206
207         if (!locked)
208                 btree_node_unlock(trans, path, level);
209         return;
210 err:
211         bch2_bpos_to_text(&buf1, path->pos);
212
213         if (p) {
214                 struct bkey uk = bkey_unpack_key(l->b, p);
215                 bch2_bkey_to_text(&buf2, &uk);
216         } else {
217                 prt_printf(&buf2, "(none)");
218         }
219
220         if (k) {
221                 struct bkey uk = bkey_unpack_key(l->b, k);
222                 bch2_bkey_to_text(&buf3, &uk);
223         } else {
224                 prt_printf(&buf3, "(none)");
225         }
226
227         panic("path should be %s key at level %u:\n"
228               "path pos %s\n"
229               "prev key %s\n"
230               "cur  key %s\n",
231               msg, level, buf1.buf, buf2.buf, buf3.buf);
232 }
233
234 static void bch2_btree_path_verify(struct btree_trans *trans,
235                                    struct btree_path *path)
236 {
237         struct bch_fs *c = trans->c;
238         unsigned i;
239
240         EBUG_ON(path->btree_id >= BTREE_ID_NR);
241
242         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
243                 if (!path->l[i].b) {
244                         BUG_ON(!path->cached &&
245                                c->btree_roots[path->btree_id].b->c.level > i);
246                         break;
247                 }
248
249                 bch2_btree_path_verify_level(trans, path, i);
250         }
251
252         bch2_btree_path_verify_locks(path);
253 }
254
255 void bch2_trans_verify_paths(struct btree_trans *trans)
256 {
257         struct btree_path *path;
258
259         trans_for_each_path(trans, path)
260                 bch2_btree_path_verify(trans, path);
261 }
262
263 static void bch2_btree_iter_verify(struct btree_iter *iter)
264 {
265         struct btree_trans *trans = iter->trans;
266
267         BUG_ON(iter->btree_id >= BTREE_ID_NR);
268
269         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
270
271         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
272                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
273
274         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
275                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
276                !btree_type_has_snapshots(iter->btree_id));
277
278         if (iter->update_path)
279                 bch2_btree_path_verify(trans, iter->update_path);
280         bch2_btree_path_verify(trans, iter->path);
281 }
282
283 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
284 {
285         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
286                !iter->pos.snapshot);
287
288         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
289                iter->pos.snapshot != iter->snapshot);
290
291         BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
292                bkey_cmp(iter->pos, iter->k.p) > 0);
293 }
294
295 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
296 {
297         struct btree_trans *trans = iter->trans;
298         struct btree_iter copy;
299         struct bkey_s_c prev;
300         int ret = 0;
301
302         if (!bch2_debug_check_iterators)
303                 return 0;
304
305         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
306                 return 0;
307
308         if (bkey_err(k) || !k.k)
309                 return 0;
310
311         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
312                                           iter->snapshot,
313                                           k.k->p.snapshot));
314
315         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
316                              BTREE_ITER_NOPRESERVE|
317                              BTREE_ITER_ALL_SNAPSHOTS);
318         prev = bch2_btree_iter_prev(&copy);
319         if (!prev.k)
320                 goto out;
321
322         ret = bkey_err(prev);
323         if (ret)
324                 goto out;
325
326         if (!bkey_cmp(prev.k->p, k.k->p) &&
327             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
328                                       prev.k->p.snapshot) > 0) {
329                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
330
331                 bch2_bkey_to_text(&buf1, k.k);
332                 bch2_bkey_to_text(&buf2, prev.k);
333
334                 panic("iter snap %u\n"
335                       "k    %s\n"
336                       "prev %s\n",
337                       iter->snapshot,
338                       buf1.buf, buf2.buf);
339         }
340 out:
341         bch2_trans_iter_exit(trans, &copy);
342         return ret;
343 }
344
345 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
346                             struct bpos pos, bool key_cache)
347 {
348         struct btree_path *path;
349         unsigned idx;
350         struct printbuf buf = PRINTBUF;
351
352         trans_for_each_path_inorder(trans, path, idx) {
353                 int cmp = cmp_int(path->btree_id, id) ?:
354                         cmp_int(path->cached, key_cache);
355
356                 if (cmp > 0)
357                         break;
358                 if (cmp < 0)
359                         continue;
360
361                 if (!btree_node_locked(path, 0) ||
362                     !path->should_be_locked)
363                         continue;
364
365                 if (!key_cache) {
366                         if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
367                             bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
368                                 return;
369                 } else {
370                         if (!bkey_cmp(pos, path->pos))
371                                 return;
372                 }
373         }
374
375         bch2_dump_trans_paths_updates(trans);
376         bch2_bpos_to_text(&buf, pos);
377
378         panic("not locked: %s %s%s\n",
379               bch2_btree_ids[id], buf.buf,
380               key_cache ? " cached" : "");
381 }
382
383 #else
384
385 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
386                                                 struct btree_path *path, unsigned l) {}
387 static inline void bch2_btree_path_verify(struct btree_trans *trans,
388                                           struct btree_path *path) {}
389 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
390 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
391 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
392
393 #endif
394
395 /* Btree path: fixups after btree updates */
396
397 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
398                                         struct btree *b,
399                                         struct bset_tree *t,
400                                         struct bkey_packed *k)
401 {
402         struct btree_node_iter_set *set;
403
404         btree_node_iter_for_each(iter, set)
405                 if (set->end == t->end_offset) {
406                         set->k = __btree_node_key_to_offset(b, k);
407                         bch2_btree_node_iter_sort(iter, b);
408                         return;
409                 }
410
411         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
412 }
413
414 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
415                                                struct btree *b,
416                                                struct bkey_packed *where)
417 {
418         struct btree_path_level *l = &path->l[b->c.level];
419
420         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
421                 return;
422
423         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
424                 bch2_btree_node_iter_advance(&l->iter, l->b);
425 }
426
427 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
428                                       struct btree *b,
429                                       struct bkey_packed *where)
430 {
431         struct btree_path *path;
432
433         trans_for_each_path_with_node(trans, b, path) {
434                 __bch2_btree_path_fix_key_modified(path, b, where);
435                 bch2_btree_path_verify_level(trans, path, b->c.level);
436         }
437 }
438
439 static void __bch2_btree_node_iter_fix(struct btree_path *path,
440                                        struct btree *b,
441                                        struct btree_node_iter *node_iter,
442                                        struct bset_tree *t,
443                                        struct bkey_packed *where,
444                                        unsigned clobber_u64s,
445                                        unsigned new_u64s)
446 {
447         const struct bkey_packed *end = btree_bkey_last(b, t);
448         struct btree_node_iter_set *set;
449         unsigned offset = __btree_node_key_to_offset(b, where);
450         int shift = new_u64s - clobber_u64s;
451         unsigned old_end = t->end_offset - shift;
452         unsigned orig_iter_pos = node_iter->data[0].k;
453         bool iter_current_key_modified =
454                 orig_iter_pos >= offset &&
455                 orig_iter_pos <= offset + clobber_u64s;
456
457         btree_node_iter_for_each(node_iter, set)
458                 if (set->end == old_end)
459                         goto found;
460
461         /* didn't find the bset in the iterator - might have to re-add it: */
462         if (new_u64s &&
463             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
464                 bch2_btree_node_iter_push(node_iter, b, where, end);
465                 goto fixup_done;
466         } else {
467                 /* Iterator is after key that changed */
468                 return;
469         }
470 found:
471         set->end = t->end_offset;
472
473         /* Iterator hasn't gotten to the key that changed yet: */
474         if (set->k < offset)
475                 return;
476
477         if (new_u64s &&
478             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
479                 set->k = offset;
480         } else if (set->k < offset + clobber_u64s) {
481                 set->k = offset + new_u64s;
482                 if (set->k == set->end)
483                         bch2_btree_node_iter_set_drop(node_iter, set);
484         } else {
485                 /* Iterator is after key that changed */
486                 set->k = (int) set->k + shift;
487                 return;
488         }
489
490         bch2_btree_node_iter_sort(node_iter, b);
491 fixup_done:
492         if (node_iter->data[0].k != orig_iter_pos)
493                 iter_current_key_modified = true;
494
495         /*
496          * When a new key is added, and the node iterator now points to that
497          * key, the iterator might have skipped past deleted keys that should
498          * come after the key the iterator now points to. We have to rewind to
499          * before those deleted keys - otherwise
500          * bch2_btree_node_iter_prev_all() breaks:
501          */
502         if (!bch2_btree_node_iter_end(node_iter) &&
503             iter_current_key_modified &&
504             b->c.level) {
505                 struct bset_tree *t;
506                 struct bkey_packed *k, *k2, *p;
507
508                 k = bch2_btree_node_iter_peek_all(node_iter, b);
509
510                 for_each_bset(b, t) {
511                         bool set_pos = false;
512
513                         if (node_iter->data[0].end == t->end_offset)
514                                 continue;
515
516                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
517
518                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
519                                bkey_iter_cmp(b, k, p) < 0) {
520                                 k2 = p;
521                                 set_pos = true;
522                         }
523
524                         if (set_pos)
525                                 btree_node_iter_set_set_pos(node_iter,
526                                                             b, t, k2);
527                 }
528         }
529 }
530
531 void bch2_btree_node_iter_fix(struct btree_trans *trans,
532                               struct btree_path *path,
533                               struct btree *b,
534                               struct btree_node_iter *node_iter,
535                               struct bkey_packed *where,
536                               unsigned clobber_u64s,
537                               unsigned new_u64s)
538 {
539         struct bset_tree *t = bch2_bkey_to_bset(b, where);
540         struct btree_path *linked;
541
542         if (node_iter != &path->l[b->c.level].iter) {
543                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
544                                            where, clobber_u64s, new_u64s);
545
546                 if (bch2_debug_check_iterators)
547                         bch2_btree_node_iter_verify(node_iter, b);
548         }
549
550         trans_for_each_path_with_node(trans, b, linked) {
551                 __bch2_btree_node_iter_fix(linked, b,
552                                            &linked->l[b->c.level].iter, t,
553                                            where, clobber_u64s, new_u64s);
554                 bch2_btree_path_verify_level(trans, linked, b->c.level);
555         }
556 }
557
558 /* Btree path level: pointer to a particular btree node and node iter */
559
560 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
561                                                   struct btree_path_level *l,
562                                                   struct bkey *u,
563                                                   struct bkey_packed *k)
564 {
565         if (unlikely(!k)) {
566                 /*
567                  * signal to bch2_btree_iter_peek_slot() that we're currently at
568                  * a hole
569                  */
570                 u->type = KEY_TYPE_deleted;
571                 return bkey_s_c_null;
572         }
573
574         return bkey_disassemble(l->b, k, u);
575 }
576
577 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
578                                                         struct btree_path_level *l,
579                                                         struct bkey *u)
580 {
581         return __btree_iter_unpack(c, l, u,
582                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
583 }
584
585 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
586                                                     struct btree_path *path,
587                                                     struct btree_path_level *l,
588                                                     struct bkey *u)
589 {
590         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
591                         bch2_btree_node_iter_peek(&l->iter, l->b));
592
593         path->pos = k.k ? k.k->p : l->b->key.k.p;
594         bch2_btree_path_verify_level(trans, path, l - path->l);
595         return k;
596 }
597
598 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
599                                                     struct btree_path *path,
600                                                     struct btree_path_level *l,
601                                                     struct bkey *u)
602 {
603         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
604                         bch2_btree_node_iter_prev(&l->iter, l->b));
605
606         path->pos = k.k ? k.k->p : l->b->data->min_key;
607         bch2_btree_path_verify_level(trans, path, l - path->l);
608         return k;
609 }
610
611 static inline bool btree_path_advance_to_pos(struct btree_path *path,
612                                              struct btree_path_level *l,
613                                              int max_advance)
614 {
615         struct bkey_packed *k;
616         int nr_advanced = 0;
617
618         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
619                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
620                 if (max_advance > 0 && nr_advanced >= max_advance)
621                         return false;
622
623                 bch2_btree_node_iter_advance(&l->iter, l->b);
624                 nr_advanced++;
625         }
626
627         return true;
628 }
629
630 static inline void __btree_path_level_init(struct btree_path *path,
631                                            unsigned level)
632 {
633         struct btree_path_level *l = &path->l[level];
634
635         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
636
637         /*
638          * Iterators to interior nodes should always be pointed at the
639          * first non-whiteout:
640          */
641         if (level)
642                 bch2_btree_node_iter_peek(&l->iter, l->b);
643 }
644
645 inline void bch2_btree_path_level_init(struct btree_trans *trans,
646                                        struct btree_path *path,
647                                        struct btree *b)
648 {
649         BUG_ON(path->cached);
650
651         EBUG_ON(!btree_path_pos_in_node(path, b));
652         EBUG_ON(b->c.lock.state.seq & 1);
653
654         path->l[b->c.level].lock_seq = b->c.lock.state.seq;
655         path->l[b->c.level].b = b;
656         __btree_path_level_init(path, b->c.level);
657 }
658
659 /* Btree path: fixups after btree node updates: */
660
661 /*
662  * A btree node is being replaced - update the iterator to point to the new
663  * node:
664  */
665 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
666 {
667         struct btree_path *path;
668
669         trans_for_each_path(trans, path)
670                 if (path->uptodate == BTREE_ITER_UPTODATE &&
671                     !path->cached &&
672                     btree_path_pos_in_node(path, b)) {
673                         enum btree_node_locked_type t =
674                                 btree_lock_want(path, b->c.level);
675
676                         if (t != BTREE_NODE_UNLOCKED) {
677                                 btree_node_unlock(trans, path, b->c.level);
678                                 six_lock_increment(&b->c.lock, t);
679                                 mark_btree_node_locked(trans, path, b->c.level, t);
680                         }
681
682                         bch2_btree_path_level_init(trans, path, b);
683                 }
684 }
685
686 /*
687  * A btree node has been modified in such a way as to invalidate iterators - fix
688  * them:
689  */
690 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
691 {
692         struct btree_path *path;
693
694         trans_for_each_path_with_node(trans, b, path)
695                 __btree_path_level_init(path, b->c.level);
696 }
697
698 /* Btree path: traverse, set_pos: */
699
700 static inline int btree_path_lock_root(struct btree_trans *trans,
701                                        struct btree_path *path,
702                                        unsigned depth_want,
703                                        unsigned long trace_ip)
704 {
705         struct bch_fs *c = trans->c;
706         struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
707         enum six_lock_type lock_type;
708         unsigned i;
709         int ret;
710
711         EBUG_ON(path->nodes_locked);
712
713         while (1) {
714                 b = READ_ONCE(*rootp);
715                 path->level = READ_ONCE(b->c.level);
716
717                 if (unlikely(path->level < depth_want)) {
718                         /*
719                          * the root is at a lower depth than the depth we want:
720                          * got to the end of the btree, or we're walking nodes
721                          * greater than some depth and there are no nodes >=
722                          * that depth
723                          */
724                         path->level = depth_want;
725                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
726                                 path->l[i].b = NULL;
727                         return 1;
728                 }
729
730                 lock_type = __btree_lock_want(path, path->level);
731                 ret = btree_node_lock(trans, path, &b->c,
732                                       path->level, lock_type, trace_ip);
733                 if (unlikely(ret)) {
734                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
735                                 continue;
736                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
737                                 return ret;
738                         BUG();
739                 }
740
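                /*
                 * Verify the root didn't change while we were waiting for the
                 * lock; if it did, unlock and retry:
                 */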
741                 if (likely(b == READ_ONCE(*rootp) &&
742                            b->c.level == path->level &&
743                            !race_fault())) {
744                         for (i = 0; i < path->level; i++)
745                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
746                         path->l[path->level].b = b;
747                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
748                                 path->l[i].b = NULL;
749
750                         mark_btree_node_locked(trans, path, path->level, lock_type);
751                         bch2_btree_path_level_init(trans, path, b);
752                         return 0;
753                 }
754
755                 six_unlock_type(&b->c.lock, lock_type);
756         }
757 }
758
759 noinline
760 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
761 {
762         struct bch_fs *c = trans->c;
763         struct btree_path_level *l = path_l(path);
764         struct btree_node_iter node_iter = l->iter;
765         struct bkey_packed *k;
766         struct bkey_buf tmp;
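        /*
         * Prefetch heuristic: prefetch at most a couple of child nodes in
         * normal operation, but many more while the filesystem is still
         * starting up; levels above 1 get little or no prefetching:
         */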
767         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
768                 ? (path->level > 1 ? 0 :  2)
769                 : (path->level > 1 ? 1 : 16);
770         bool was_locked = btree_node_locked(path, path->level);
771         int ret = 0;
772
773         bch2_bkey_buf_init(&tmp);
774
775         while (nr && !ret) {
776                 if (!bch2_btree_node_relock(trans, path, path->level))
777                         break;
778
779                 bch2_btree_node_iter_advance(&node_iter, l->b);
780                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
781                 if (!k)
782                         break;
783
784                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
785                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
786                                                path->level - 1);
787         }
788
789         if (!was_locked)
790                 btree_node_unlock(trans, path, path->level);
791
792         bch2_bkey_buf_exit(&tmp, c);
793         return ret;
794 }
795
796 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
797                                  struct btree_and_journal_iter *jiter)
798 {
799         struct bch_fs *c = trans->c;
800         struct bkey_s_c k;
801         struct bkey_buf tmp;
802         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
803                 ? (path->level > 1 ? 0 :  2)
804                 : (path->level > 1 ? 1 : 16);
805         bool was_locked = btree_node_locked(path, path->level);
806         int ret = 0;
807
808         bch2_bkey_buf_init(&tmp);
809
810         while (nr && !ret) {
811                 if (!bch2_btree_node_relock(trans, path, path->level))
812                         break;
813
814                 bch2_btree_and_journal_iter_advance(jiter);
815                 k = bch2_btree_and_journal_iter_peek(jiter);
816                 if (!k.k)
817                         break;
818
819                 bch2_bkey_buf_reassemble(&tmp, c, k);
820                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
821                                                path->level - 1);
822         }
823
824         if (!was_locked)
825                 btree_node_unlock(trans, path, path->level);
826
827         bch2_bkey_buf_exit(&tmp, c);
828         return ret;
829 }
830
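/*
 * btree_ptr_v2 keys have a mem_ptr field caching the address of the in-memory
 * btree node; update it in the parent node's key to point at @b so the cached
 * pointer stays accurate:
 */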
831 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
832                                             struct btree_path *path,
833                                             unsigned plevel, struct btree *b)
834 {
835         struct btree_path_level *l = &path->l[plevel];
836         bool locked = btree_node_locked(path, plevel);
837         struct bkey_packed *k;
838         struct bch_btree_ptr_v2 *bp;
839
840         if (!bch2_btree_node_relock(trans, path, plevel))
841                 return;
842
843         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
844         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
845
846         bp = (void *) bkeyp_val(&l->b->format, k);
847         bp->mem_ptr = (unsigned long)b;
848
849         if (!locked)
850                 btree_node_unlock(trans, path, plevel);
851 }
852
853 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
854                                                      struct btree_path *path,
855                                                      unsigned flags,
856                                                      struct bkey_buf *out)
857 {
858         struct bch_fs *c = trans->c;
859         struct btree_path_level *l = path_l(path);
860         struct btree_and_journal_iter jiter;
861         struct bkey_s_c k;
862         int ret = 0;
863
864         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
865
866         k = bch2_btree_and_journal_iter_peek(&jiter);
867
868         bch2_bkey_buf_reassemble(out, c, k);
869
870         if (flags & BTREE_ITER_PREFETCH)
871                 ret = btree_path_prefetch_j(trans, path, &jiter);
872
873         bch2_btree_and_journal_iter_exit(&jiter);
874         return ret;
875 }
876
877 static __always_inline int btree_path_down(struct btree_trans *trans,
878                                            struct btree_path *path,
879                                            unsigned flags,
880                                            unsigned long trace_ip)
881 {
882         struct bch_fs *c = trans->c;
883         struct btree_path_level *l = path_l(path);
884         struct btree *b;
885         unsigned level = path->level - 1;
886         enum six_lock_type lock_type = __btree_lock_want(path, level);
887         bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
888         struct bkey_buf tmp;
889         int ret;
890
891         EBUG_ON(!btree_node_locked(path, path->level));
892
893         bch2_bkey_buf_init(&tmp);
894
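        /*
         * Until journal replay has finished, node pointers may only exist in
         * the journal - iterate over both the btree node and the journal keys:
         */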
895         if (unlikely(!replay_done)) {
896                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
897                 if (ret)
898                         goto err;
899         } else {
900                 bch2_bkey_buf_unpack(&tmp, c, l->b,
901                                  bch2_btree_node_iter_peek(&l->iter, l->b));
902
903                 if (flags & BTREE_ITER_PREFETCH) {
904                         ret = btree_path_prefetch(trans, path);
905                         if (ret)
906                                 goto err;
907                 }
908         }
909
910         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
911         ret = PTR_ERR_OR_ZERO(b);
912         if (unlikely(ret))
913                 goto err;
914
915         if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
916             unlikely(b != btree_node_mem_ptr(tmp.k)))
917                 btree_node_mem_ptr_set(trans, path, level + 1, b);
918
919         if (btree_node_read_locked(path, level + 1))
920                 btree_node_unlock(trans, path, level + 1);
921
922         mark_btree_node_locked(trans, path, level, lock_type);
923         path->level = level;
924         bch2_btree_path_level_init(trans, path, b);
925
926         bch2_btree_path_verify_locks(path);
927 err:
928         bch2_bkey_buf_exit(&tmp, c);
929         return ret;
930 }
931
932 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
933                                    unsigned, unsigned long);
934
935 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
936 {
937         struct bch_fs *c = trans->c;
938         struct btree_path *path;
939         unsigned long trace_ip = _RET_IP_;
940         int ret = 0;
941
942         if (trans->in_traverse_all)
943                 return -BCH_ERR_transaction_restart_in_traverse_all;
944
945         trans->in_traverse_all = true;
946 retry_all:
947         trans->restarted = 0;
948         trans->traverse_all_idx = U8_MAX;
949
950         trans_for_each_path(trans, path)
951                 path->should_be_locked = false;
952
953         btree_trans_verify_sorted(trans);
954
955         bch2_trans_unlock(trans);
956         cond_resched();
957
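        /*
         * If a previous traversal failed because we couldn't allocate a btree
         * node, take the cannibalize lock so that this time we're guaranteed
         * to be able to reclaim a node from the btree cache:
         */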
958         if (unlikely(trans->memory_allocation_failure)) {
959                 struct closure cl;
960
961                 closure_init_stack(&cl);
962
963                 do {
964                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
965                         closure_sync(&cl);
966                 } while (ret);
967         }
968
969         /* Now, redo traversals in correct order: */
970         trans->traverse_all_idx = 0;
971         while (trans->traverse_all_idx < trans->nr_sorted) {
972                 path = trans->paths + trans->sorted[trans->traverse_all_idx];
973
974                 /*
975                  * Traversing a path can cause another path to be added at about
976                  * the same position:
977                  */
978                 if (path->uptodate) {
979                         ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
980                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
981                             ret == -ENOMEM)
982                                 goto retry_all;
983                         if (ret)
984                                 goto err;
985                         BUG_ON(path->uptodate);
986                 } else {
987                         trans->traverse_all_idx++;
988                 }
989         }
990
991         /*
992          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
993          * and relock(), relock() won't relock since path->should_be_locked
994          * isn't set yet, which is all fine
995          */
996         trans_for_each_path(trans, path)
997                 BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
998 err:
999         bch2_btree_cache_cannibalize_unlock(c);
1000
1001         trans->in_traverse_all = false;
1002
1003         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1004         return ret;
1005 }
1006
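/*
 * check_pos indicates which direction the iterator position moved: if it
 * moved backwards (< 0) the node is only rejected when the position is now
 * before it; if forwards (> 0), only when the position is now after it:
 */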
1007 static inline bool btree_path_good_node(struct btree_trans *trans,
1008                                         struct btree_path *path,
1009                                         unsigned l, int check_pos)
1010 {
1011         if (!is_btree_node(path, l) ||
1012             !bch2_btree_node_relock(trans, path, l))
1013                 return false;
1014
1015         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1016                 return false;
1017         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1018                 return false;
1019         return true;
1020 }
1021
1022 static void btree_path_set_level_down(struct btree_trans *trans,
1023                                       struct btree_path *path,
1024                                       unsigned new_level)
1025 {
1026         unsigned l;
1027
1028         path->level = new_level;
1029
1030         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1031                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1032                         btree_node_unlock(trans, path, l);
1033
1034         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1035         bch2_btree_path_verify(trans, path);
1036 }
1037
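/*
 * Walk up the path, dropping levels whose nodes can no longer be relocked or
 * no longer contain the search position; also relock any additional levels
 * required by locks_want:
 */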
1038 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1039                                                      struct btree_path *path,
1040                                                      int check_pos)
1041 {
1042         unsigned i, l = path->level;
1043 again:
1044         while (btree_path_node(path, l) &&
1045                !btree_path_good_node(trans, path, l, check_pos))
1046                 __btree_path_set_level_up(trans, path, l++);
1047
1048         /* If we need intent locks, take them too: */
1049         for (i = l + 1;
1050              i < path->locks_want && btree_path_node(path, i);
1051              i++)
1052                 if (!bch2_btree_node_relock(trans, path, i)) {
1053                         while (l <= i)
1054                                 __btree_path_set_level_up(trans, path, l++);
1055                         goto again;
1056                 }
1057
1058         return l;
1059 }
1060
1061 /*
1062  * This is the main state machine for walking down the btree - walks down to a
1063  * specified depth
1064  *
1065  * Returns 0 on success, -EIO on error (error reading in a btree node).
1066  *
1067  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1068  * stashed in the iterator and returned from bch2_trans_exit().
1069  */
1070 static int btree_path_traverse_one(struct btree_trans *trans,
1071                                    struct btree_path *path,
1072                                    unsigned flags,
1073                                    unsigned long trace_ip)
1074 {
1075         unsigned depth_want = path->level;
1076         int ret = trans->restarted;
1077
1078         if (unlikely(ret))
1079                 goto out;
1080
1081         /*
1082          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1083          * and re-traverse the path without a transaction restart:
1084          */
1085         if (path->should_be_locked) {
1086                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1087                 goto out;
1088         }
1089
1090         if (path->cached) {
1091                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1092                 goto out;
1093         }
1094
1095         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1096                 goto out;
1097
1098         path->level = btree_path_up_until_good_node(trans, path, 0);
1099
1100         EBUG_ON(btree_path_node(path, path->level) &&
1101                 !btree_node_locked(path, path->level));
1102
1103         /*
1104          * Note: path->l[path->level].b may be temporarily NULL here - that
1105          * would indicate to other code that we got to the end of the btree,
1106          * here it indicates that relocking the root failed - it's critical that
1107          * btree_path_lock_root() comes next and that it can't fail
1108          */
1109         while (path->level > depth_want) {
1110                 ret = btree_path_node(path, path->level)
1111                         ? btree_path_down(trans, path, flags, trace_ip)
1112                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1113                 if (unlikely(ret)) {
1114                         if (ret == 1) {
1115                                 /*
1116                                  * No nodes at this level - got to the end of
1117                                  * the btree:
1118                                  */
1119                                 ret = 0;
1120                                 goto out;
1121                         }
1122
1123                         __bch2_btree_path_unlock(trans, path);
1124                         path->level = depth_want;
1125                         path->l[path->level].b = ERR_PTR(ret);
1126                         goto out;
1127                 }
1128         }
1129
1130         path->uptodate = BTREE_ITER_UPTODATE;
1131 out:
1132         BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
1133         bch2_btree_path_verify(trans, path);
1134         return ret;
1135 }
1136
1137 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
1138                                           struct btree_path *path, unsigned flags)
1139 {
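        /*
         * Fault injection (currently compiled out via the if (0 &&)):
         * randomly force a transaction restart, with the probability of
         * injection dropping sharply as trans->restart_count grows:
         */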
1140         if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1141                 unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
1142                 u64 mask = ~(~0ULL << restart_probability_bits);
1143
1144                 if ((prandom_u32() & mask) == mask) {
1145                         trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
1146                         return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
1147                 }
1148         }
1149
1150         if (path->uptodate < BTREE_ITER_NEED_RELOCK)
1151                 return 0;
1152
1153         return  bch2_trans_cond_resched(trans) ?:
1154                 btree_path_traverse_one(trans, path, flags, _RET_IP_);
1155 }
1156
1157 static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1158                             struct btree_path *src)
1159 {
1160         unsigned i, offset = offsetof(struct btree_path, pos);
1161
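        /*
         * Copy everything from pos onwards; the fields before pos (the path's
         * index and reference counts) belong to dst and must be preserved:
         */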
1162         memcpy((void *) dst + offset,
1163                (void *) src + offset,
1164                sizeof(struct btree_path) - offset);
1165
1166         for (i = 0; i < BTREE_MAX_DEPTH; i++)
1167                 if (btree_node_locked(dst, i))
1168                         six_lock_increment(&dst->l[i].b->c.lock,
1169                                            __btree_lock_want(dst, i));
1170
1171         bch2_btree_path_check_sort(trans, dst, 0);
1172 }
1173
1174 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1175                                            bool intent)
1176 {
1177         struct btree_path *new = btree_path_alloc(trans, src);
1178
1179         btree_path_copy(trans, new, src);
1180         __btree_path_get(new, intent);
1181         return new;
1182 }
1183
1184 inline struct btree_path * __must_check
1185 bch2_btree_path_make_mut(struct btree_trans *trans,
1186                          struct btree_path *path, bool intent,
1187                          unsigned long ip)
1188 {
1189         if (path->ref > 1 || path->preserve) {
1190                 __btree_path_put(path, intent);
1191                 path = btree_path_clone(trans, path, intent);
1192                 path->preserve = false;
1193 #ifdef CONFIG_BCACHEFS_DEBUG
1194                 path->ip_allocated = ip;
1195 #endif
1196                 btree_trans_verify_sorted(trans);
1197         }
1198
1199         path->should_be_locked = false;
1200         return path;
1201 }
1202
1203 struct btree_path * __must_check
1204 bch2_btree_path_set_pos(struct btree_trans *trans,
1205                    struct btree_path *path, struct bpos new_pos,
1206                    bool intent, unsigned long ip)
1207 {
1208         int cmp = bpos_cmp(new_pos, path->pos);
1209         unsigned l = path->level;
1210
1211         EBUG_ON(trans->restarted);
1212         EBUG_ON(!path->ref);
1213
1214         if (!cmp)
1215                 return path;
1216
1217         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1218
1219         path->pos = new_pos;
1220
1221         bch2_btree_path_check_sort(trans, path, cmp);
1222
1223         if (unlikely(path->cached)) {
1224                 btree_node_unlock(trans, path, 0);
1225                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1226                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1227                 goto out;
1228         }
1229
1230         l = btree_path_up_until_good_node(trans, path, cmp);
1231
1232         if (btree_path_node(path, l)) {
1233                 BUG_ON(!btree_node_locked(path, l));
1234                 /*
1235                  * We might have to skip over many keys, or just a few: try
1236                  * advancing the node iterator, and if we have to skip over too
1237                  * many keys just reinit it (or if we're rewinding, since that
1238                  * is expensive).
1239                  */
1240                 if (cmp < 0 ||
1241                     !btree_path_advance_to_pos(path, &path->l[l], 8))
1242                         __btree_path_level_init(path, l);
1243         }
1244
1245         if (l != path->level) {
1246                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1247                 __bch2_btree_path_unlock(trans, path);
1248         }
1249 out:
1250         bch2_btree_path_verify(trans, path);
1251         return path;
1252 }
1253
1254 /* Btree path: main interface: */
1255
1256 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1257 {
1258         struct btree_path *sib;
1259
1260         sib = prev_btree_path(trans, path);
1261         if (sib && !btree_path_cmp(sib, path))
1262                 return sib;
1263
1264         sib = next_btree_path(trans, path);
1265         if (sib && !btree_path_cmp(sib, path))
1266                 return sib;
1267
1268         return NULL;
1269 }
1270
1271 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1272 {
1273         struct btree_path *sib;
1274
1275         sib = prev_btree_path(trans, path);
1276         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1277                 return sib;
1278
1279         sib = next_btree_path(trans, path);
1280         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1281                 return sib;
1282
1283         return NULL;
1284 }
1285
1286 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1287 {
1288         __bch2_btree_path_unlock(trans, path);
1289         btree_path_list_remove(trans, path);
1290         trans->paths_allocated &= ~(1ULL << path->idx);
1291 }
1292
1293 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1294 {
1295         struct btree_path *dup;
1296
1297         EBUG_ON(trans->paths + path->idx != path);
1298         EBUG_ON(!path->ref);
1299
1300         if (!__btree_path_put(path, intent))
1301                 return;
1302
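        /*
         * Before freeing this path, look for another path at the same
         * position (or pointing at the same node) that can inherit its
         * preserve and should_be_locked flags:
         */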
1303         dup = path->preserve
1304                 ? have_path_at_pos(trans, path)
1305                 : have_node_at_pos(trans, path);
1306
1307         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1308                 return;
1309
1310         if (path->should_be_locked &&
1311             !trans->restarted &&
1312             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1313                 return;
1314
1315         if (dup) {
1316                 dup->preserve           |= path->preserve;
1317                 dup->should_be_locked   |= path->should_be_locked;
1318         }
1319
1320         __bch2_path_free(trans, path);
1321 }
1322
1323 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1324                                  bool intent)
1325 {
1326         EBUG_ON(trans->paths + path->idx != path);
1327         EBUG_ON(!path->ref);
1328
1329         if (!__btree_path_put(path, intent))
1330                 return;
1331
1332         __bch2_path_free(trans, path);
1333 }
1334
1335 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1336 {
1337         struct btree_insert_entry *i;
1338
1339         prt_printf(buf, "transaction updates for %s journal seq %llu",
1340                trans->fn, trans->journal_res.seq);
1341         prt_newline(buf);
1342         printbuf_indent_add(buf, 2);
1343
1344         trans_for_each_update(trans, i) {
1345                 struct bkey_s_c old = { &i->old_k, i->old_v };
1346
1347                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1348                        bch2_btree_ids[i->btree_id],
1349                        i->cached,
1350                        (void *) i->ip_allocated);
1351                 prt_newline(buf);
1352
1353                 prt_printf(buf, "  old ");
1354                 bch2_bkey_val_to_text(buf, trans->c, old);
1355                 prt_newline(buf);
1356
1357                 prt_printf(buf, "  new ");
1358                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1359                 prt_newline(buf);
1360         }
1361
1362         printbuf_indent_sub(buf, 2);
1363 }
1364
1365 noinline __cold
1366 void bch2_dump_trans_updates(struct btree_trans *trans)
1367 {
1368         struct printbuf buf = PRINTBUF;
1369
1370         bch2_trans_updates_to_text(&buf, trans);
1371         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1372         printbuf_exit(&buf);
1373 }
1374
1375 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1376 {
1377         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1378                    path->idx, path->ref, path->intent_ref,
1379                    path->preserve ? 'P' : ' ',
1380                    path->should_be_locked ? 'S' : ' ',
1381                    bch2_btree_ids[path->btree_id],
1382                    path->level);
1383         bch2_bpos_to_text(out, path->pos);
1384
1385         prt_printf(out, " locks %u", path->nodes_locked);
1386 #ifdef CONFIG_BCACHEFS_DEBUG
1387         prt_printf(out, " %pS", (void *) path->ip_allocated);
1388 #endif
1389         prt_newline(out);
1390 }
1391
1392 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1393 {
1394         struct btree_path *path;
1395         unsigned idx;
1396
1397         trans_for_each_path_inorder(trans, path, idx)
1398                 bch2_btree_path_to_text(out, path);
1399 }
1400
1401 noinline __cold
1402 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1403 {
1404         struct printbuf buf = PRINTBUF;
1405
1406         bch2_trans_paths_to_text(&buf, trans);
1407         bch2_trans_updates_to_text(&buf, trans);
1408
1409         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1410         printbuf_exit(&buf);
1411 }
1412
1413 noinline
1414 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1415 {
1416         struct btree_transaction_stats *s = btree_trans_stats(trans);
1417         struct printbuf buf = PRINTBUF;
1418
1419         bch2_trans_paths_to_text(&buf, trans);
1420
1421         if (!buf.allocation_failure) {
1422                 mutex_lock(&s->lock);
1423                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1424                         s->nr_max_paths = trans->nr_max_paths =
1425                                 hweight64(trans->paths_allocated);
1426                         swap(s->max_paths_text, buf.buf);
1427                 }
1428                 mutex_unlock(&s->lock);
1429         }
1430
1431         printbuf_exit(&buf);
1432 }
1433
1434 static noinline void btree_path_overflow(struct btree_trans *trans)
1435 {
1436         bch2_dump_trans_paths_updates(trans);
1437                 panic("trans path overflow\n");
1438 }
1439
1440 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1441                                                   struct btree_path *pos)
1442 {
1443         struct btree_path *path;
1444         unsigned idx;
1445
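        /* check if all BTREE_ITER_MAX path slots are already allocated: */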
1446         if (unlikely(trans->paths_allocated ==
1447                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1448                 btree_path_overflow(trans);
1449
1450         idx = __ffs64(~trans->paths_allocated);
1451         trans->paths_allocated |= 1ULL << idx;
1452
1453         if (unlikely(idx > trans->nr_max_paths))
1454                 bch2_trans_update_max_paths(trans);
1455
1456         path = &trans->paths[idx];
1457
1458         path->idx               = idx;
1459         path->ref               = 0;
1460         path->intent_ref        = 0;
1461         path->nodes_locked      = 0;
1462
1463         btree_path_list_add(trans, pos, path);
1464         return path;
1465 }
1466
1467 struct btree_path *bch2_path_get(struct btree_trans *trans,
1468                                  enum btree_id btree_id, struct bpos pos,
1469                                  unsigned locks_want, unsigned level,
1470                                  unsigned flags, unsigned long ip)
1471 {
1472         struct btree_path *path, *path_pos = NULL;
1473         bool cached = flags & BTREE_ITER_CACHED;
1474         bool intent = flags & BTREE_ITER_INTENT;
1475         int i;
1476
1477         BUG_ON(trans->restarted);
1478         btree_trans_verify_sorted(trans);
1479         bch2_trans_verify_locks(trans);
1480
1481         trans_for_each_path_inorder(trans, path, i) {
1482                 if (__btree_path_cmp(path,
1483                                      btree_id,
1484                                      cached,
1485                                      pos,
1486                                      level) > 0)
1487                         break;
1488
1489                 path_pos = path;
1490         }
1491
1492         if (path_pos &&
1493             path_pos->cached    == cached &&
1494             path_pos->btree_id  == btree_id &&
1495             path_pos->level     == level) {
1496                 __btree_path_get(path_pos, intent);
1497                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1498         } else {
1499                 path = btree_path_alloc(trans, path_pos);
1500                 path_pos = NULL;
1501
1502                 __btree_path_get(path, intent);
1503                 path->pos                       = pos;
1504                 path->btree_id                  = btree_id;
1505                 path->cached                    = cached;
1506                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1507                 path->should_be_locked          = false;
1508                 path->level                     = level;
1509                 path->locks_want                = locks_want;
1510                 path->nodes_locked              = 0;
1511                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1512                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1513 #ifdef CONFIG_BCACHEFS_DEBUG
1514                 path->ip_allocated              = ip;
1515 #endif
1516                 btree_trans_verify_sorted(trans);
1517         }
1518
1519         if (!(flags & BTREE_ITER_NOPRESERVE))
1520                 path->preserve = true;
1521
1522         if (path->intent_ref)
1523                 locks_want = max(locks_want, level + 1);
1524
1525         /*
1526          * If the path has locks_want greater than requested, we don't downgrade
1527          * it here - on transaction restart because btree node split needs to
1528          * upgrade locks, we might be putting/getting the iterator again.
1529          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1530          * a successful transaction commit.
1531          */
1532
1533         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1534         if (locks_want > path->locks_want)
1535                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1536
1537         return path;
1538 }
1539
1540 inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1541 {
1542
1543         struct btree_path_level *l = path_l(path);
1544         struct bkey_packed *_k;
1545         struct bkey_s_c k;
1546
1547         if (unlikely(!l->b))
1548                 return bkey_s_c_null;
1549
1550         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1551         EBUG_ON(!btree_node_locked(path, path->level));
1552
1553         if (!path->cached) {
1554                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1555                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1556
1557                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
1558
1559                 if (!k.k || bpos_cmp(path->pos, k.k->p))
1560                         goto hole;
1561         } else {
1562                 struct bkey_cached *ck = (void *) path->l[0].b;
1563
1564                 EBUG_ON(ck &&
1565                         (path->btree_id != ck->key.btree_id ||
1566                          bkey_cmp(path->pos, ck->key.pos)));
1567                 EBUG_ON(!ck || !ck->valid);
1568
1569                 *u = ck->k->k;
1570                 k = bkey_i_to_s_c(ck->k);
1571         }
1572
1573         return k;
1574 hole:
1575         bkey_init(u);
1576         u->p = path->pos;
1577         return (struct bkey_s_c) { u, NULL };
1578 }
1579
1580 /* Btree iterators: */
1581
1582 int __must_check
1583 __bch2_btree_iter_traverse(struct btree_iter *iter)
1584 {
1585         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1586 }
1587
1588 int __must_check
1589 bch2_btree_iter_traverse(struct btree_iter *iter)
1590 {
1591         int ret;
1592
1593         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1594                                         btree_iter_search_key(iter),
1595                                         iter->flags & BTREE_ITER_INTENT,
1596                                         btree_iter_ip_allocated(iter));
1597
1598         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1599         if (ret)
1600                 return ret;
1601
1602         btree_path_set_should_be_locked(iter->path);
1603         return 0;
1604 }
1605
1606 /* Iterate across nodes (leaf and interior nodes) */
1607
1608 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1609 {
1610         struct btree_trans *trans = iter->trans;
1611         struct btree *b = NULL;
1612         int ret;
1613
1614         EBUG_ON(iter->path->cached);
1615         bch2_btree_iter_verify(iter);
1616
1617         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1618         if (ret)
1619                 goto err;
1620
1621         b = btree_path_node(iter->path, iter->path->level);
1622         if (!b)
1623                 goto out;
1624
1625         BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
1626
1627         bkey_init(&iter->k);
1628         iter->k.p = iter->pos = b->key.k.p;
1629
1630         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1631                                         iter->flags & BTREE_ITER_INTENT,
1632                                         btree_iter_ip_allocated(iter));
1633         btree_path_set_should_be_locked(iter->path);
1634 out:
1635         bch2_btree_iter_verify_entry_exit(iter);
1636         bch2_btree_iter_verify(iter);
1637
1638         return b;
1639 err:
1640         b = ERR_PTR(ret);
1641         goto out;
1642 }
1643
1644 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1645 {
1646         struct btree_trans *trans = iter->trans;
1647         struct btree_path *path = iter->path;
1648         struct btree *b = NULL;
1649         int ret;
1650
1651         BUG_ON(trans->restarted);
1652         EBUG_ON(iter->path->cached);
1653         bch2_btree_iter_verify(iter);
1654
1655         /* already at end? */
1656         if (!btree_path_node(path, path->level))
1657                 return NULL;
1658
1659         /* got to end? */
1660         if (!btree_path_node(path, path->level + 1)) {
1661                 btree_path_set_level_up(trans, path);
1662                 return NULL;
1663         }
1664
1665         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1666                 __bch2_btree_path_unlock(trans, path);
1667                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1668                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1669                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1670                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1671                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1672                 goto err;
1673         }
1674
1675         b = btree_path_node(path, path->level + 1);
1676
1677         if (!bpos_cmp(iter->pos, b->key.k.p)) {
1678                 __btree_path_set_level_up(trans, path, path->level++);
1679         } else {
1680                 /*
1681                  * Haven't gotten to the end of the parent node: go back down to
1682                  * the next child node
1683                  */
1684                 path = iter->path =
1685                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1686                                            iter->flags & BTREE_ITER_INTENT,
1687                                            btree_iter_ip_allocated(iter));
1688
1689                 btree_path_set_level_down(trans, path, iter->min_depth);
1690
1691                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1692                 if (ret)
1693                         goto err;
1694
1695                 b = path->l[path->level].b;
1696         }
1697
1698         bkey_init(&iter->k);
1699         iter->k.p = iter->pos = b->key.k.p;
1700
1701         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1702                                         iter->flags & BTREE_ITER_INTENT,
1703                                         btree_iter_ip_allocated(iter));
1704         btree_path_set_should_be_locked(iter->path);
1705         BUG_ON(iter->path->uptodate);
1706 out:
1707         bch2_btree_iter_verify_entry_exit(iter);
1708         bch2_btree_iter_verify(iter);
1709
1710         return b;
1711 err:
1712         b = ERR_PTR(ret);
1713         goto out;
1714 }
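
/*
 * Example: walking every node of a btree with the two node iterators above.
 * This is only a sketch of the usual pattern (the for_each_btree_node() macro
 * in btree_iter.h wraps something similar); transaction restart handling is
 * elided for brevity:
 *
 *	struct btree_iter iter;
 *	struct btree *b;
 *	int ret = 0;
 *
 *	bch2_trans_node_iter_init(trans, &iter, BTREE_ID_extents, POS_MIN,
 *				  0, 0, BTREE_ITER_PREFETCH);
 *
 *	for (b = bch2_btree_iter_peek_node(&iter);
 *	     b && !(ret = PTR_ERR_OR_ZERO(b));
 *	     b = bch2_btree_iter_next_node(&iter)) {
 *		... use b ...
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */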
1715
1716 /* Iterate across keys (in leaf nodes only) */
1717
1718 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1719 {
1720         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1721                 struct bpos pos = iter->k.p;
1722                 bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1723                             ? bpos_cmp(pos, SPOS_MAX)
1724                             : bkey_cmp(pos, SPOS_MAX)) != 0;
1725
1726                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1727                         pos = bkey_successor(iter, pos);
1728                 bch2_btree_iter_set_pos(iter, pos);
1729                 return ret;
1730         } else {
1731                 if (!btree_path_node(iter->path, iter->path->level))
1732                         return true;
1733
1734                 iter->advanced = true;
1735                 return false;
1736         }
1737 }
1738
1739 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1740 {
1741         struct bpos pos = bkey_start_pos(&iter->k);
1742         bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1743                     ? bpos_cmp(pos, POS_MIN)
1744                     : bkey_cmp(pos, POS_MIN)) != 0;
1745
1746         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1747                 pos = bkey_predecessor(iter, pos);
1748         bch2_btree_iter_set_pos(iter, pos);
1749         return ret;
1750 }
1751
1752 static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1753                                                       enum btree_id btree_id,
1754                                                       struct bpos pos)
1755 {
1756         struct btree_insert_entry *i;
1757         struct bkey_i *ret = NULL;
1758
1759         trans_for_each_update(trans, i) {
1760                 if (i->btree_id < btree_id)
1761                         continue;
1762                 if (i->btree_id > btree_id)
1763                         break;
1764                 if (bpos_cmp(i->k->k.p, pos) < 0)
1765                         continue;
1766                 if (i->key_cache_already_flushed)
1767                         continue;
1768                 if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
1769                         ret = i->k;
1770         }
1771
1772         return ret;
1773 }
1774
1775 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1776                                        struct btree_iter *iter,
1777                                        struct bpos start_pos,
1778                                        struct bpos end_pos)
1779 {
1780         struct bkey_i *k;
1781
1782         if (bpos_cmp(start_pos, iter->journal_pos) < 0)
1783                 iter->journal_idx = 0;
1784
1785         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0,
1786                                         start_pos, end_pos,
1787                                         &iter->journal_idx);
1788
1789         iter->journal_pos = k ? k->k.p : end_pos;
1790         return k;
1791 }
1792
1793 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
1794                                             struct btree_iter *iter,
1795                                             struct bpos pos)
1796 {
1797         return bch2_btree_journal_peek(trans, iter, pos, pos);
1798 }
1799
1800 static noinline
1801 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1802                                          struct btree_iter *iter,
1803                                          struct bkey_s_c k)
1804 {
1805         struct bkey_i *next_journal =
1806                 bch2_btree_journal_peek(trans, iter, iter->path->pos,
1807                                 k.k ? k.k->p : iter->path->l[0].b->key.k.p);
1808
1809         if (next_journal) {
1810                 iter->k = next_journal->k;
1811                 k = bkey_i_to_s_c(next_journal);
1812         }
1813
1814         return k;
1815 }
1816
1817 /*
1818  * Checks btree key cache for key at iter->pos and returns it if present, or
1819  * bkey_s_c_null:
1820  */
1821 static noinline
1822 struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1823 {
1824         struct btree_trans *trans = iter->trans;
1825         struct bch_fs *c = trans->c;
1826         struct bkey u;
1827         int ret;
1828
1829         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1830                 return bkey_s_c_null;
1831
1832         if (!iter->key_cache_path)
1833                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1834                                                      iter->flags & BTREE_ITER_INTENT, 0,
1835                                                      iter->flags|BTREE_ITER_CACHED,
1836                                                      _THIS_IP_);
1837
1838         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1839                                         iter->flags & BTREE_ITER_INTENT,
1840                                         btree_iter_ip_allocated(iter));
1841
1842         ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
1843         if (unlikely(ret))
1844                 return bkey_s_c_err(ret);
1845
1846         btree_path_set_should_be_locked(iter->key_cache_path);
1847
1848         return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1849 }
1850
1851 static noinline
1852 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1853 {
1854         struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
1855         int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
1856
1857         return err ? bkey_s_c_err(err) : ret;
1858 }
1859
1860 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1861 {
1862         struct btree_trans *trans = iter->trans;
1863         struct bkey_i *next_update;
1864         struct bkey_s_c k, k2;
1865         int ret;
1866
1867         EBUG_ON(iter->path->cached);
1868         bch2_btree_iter_verify(iter);
1869
1870         while (1) {
1871                 struct btree_path_level *l;
1872
1873                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1874                                         iter->flags & BTREE_ITER_INTENT,
1875                                         btree_iter_ip_allocated(iter));
1876
1877                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1878                 if (unlikely(ret)) {
1879                         /* ensure that iter->k is consistent with iter->pos: */
1880                         bch2_btree_iter_set_pos(iter, iter->pos);
1881                         k = bkey_s_c_err(ret);
1882                         goto out;
1883                 }
1884
1885                 l = path_l(iter->path);
1886
1887                 if (unlikely(!l->b)) {
1888                         /* No btree nodes at requested level: */
1889                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1890                         k = bkey_s_c_null;
1891                         goto out;
1892                 }
1893
1894                 btree_path_set_should_be_locked(iter->path);
1895
1896                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
1897
1898                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1899                     k.k &&
1900                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1901                         k = k2;
1902                         ret = bkey_err(k);
1903                         if (ret) {
1904                                 bch2_btree_iter_set_pos(iter, iter->pos);
1905                                 goto out;
1906                         }
1907                 }
1908
1909                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
1910                         k = btree_trans_peek_journal(trans, iter, k);
1911
1912                 next_update = iter->flags & BTREE_ITER_WITH_UPDATES
1913                         ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
1914                         : NULL;
1915                 if (next_update &&
1916                     bpos_cmp(next_update->k.p,
1917                              k.k ? k.k->p : l->b->key.k.p) <= 0) {
1918                         iter->k = next_update->k;
1919                         k = bkey_i_to_s_c(next_update);
1920                 }
1921
1922                 if (k.k && bkey_deleted(k.k)) {
1923                         /*
1924                          * If we've got a whiteout, and it's after the search
1925                          * key, advance the search key to the whiteout instead
1926                          * of just after the whiteout - it might be a btree
1927                          * whiteout, with a real key at the same position, since
1928                          * in the btree, deleted keys sort before non-deleted keys.
1929                          */
1930                         search_key = bpos_cmp(search_key, k.k->p)
1931                                 ? k.k->p
1932                                 : bpos_successor(k.k->p);
1933                         continue;
1934                 }
1935
1936                 if (likely(k.k)) {
1937                         break;
1938                 } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
1939                         /* Advance to next leaf node: */
1940                         search_key = bpos_successor(l->b->key.k.p);
1941                 } else {
1942                         /* End of btree: */
1943                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1944                         k = bkey_s_c_null;
1945                         goto out;
1946                 }
1947         }
1948 out:
1949         bch2_btree_iter_verify(iter);
1950
1951         return k;
1952 }
1953
1954 /**
1955  * bch2_btree_iter_peek_upto: returns first key greater than or equal to
1956  * iterator's current position, up to and including @end
1957  */
1958 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
1959 {
1960         struct btree_trans *trans = iter->trans;
1961         struct bpos search_key = btree_iter_search_key(iter);
1962         struct bkey_s_c k;
1963         struct bpos iter_pos;
1964         int ret;
1965
1966         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
1967
1968         if (iter->update_path) {
1969                 bch2_path_put_nokeep(trans, iter->update_path,
1970                                      iter->flags & BTREE_ITER_INTENT);
1971                 iter->update_path = NULL;
1972         }
1973
1974         bch2_btree_iter_verify_entry_exit(iter);
1975
1976         while (1) {
1977                 k = __bch2_btree_iter_peek(iter, search_key);
1978                 if (!k.k || bkey_err(k))
1979                         goto out_no_locked;
1980
1981                 /*
1982                  * iter->pos should be monotonically increasing, and always be
1983                  * equal to the key we just returned - except extents can
1984                  * straddle iter->pos:
1985                  */
1986                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1987                         iter_pos = k.k->p;
1988                 else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1989                         iter_pos = bkey_start_pos(k.k);
1990                 else
1991                         iter_pos = iter->pos;
1992
1993                 if (bkey_cmp(iter_pos, end) > 0) {
1994                         bch2_btree_iter_set_pos(iter, end);
1995                         k = bkey_s_c_null;
1996                         goto out_no_locked;
1997                 }
1998
1999                 if (iter->update_path &&
2000                     bkey_cmp(iter->update_path->pos, k.k->p)) {
2001                         bch2_path_put_nokeep(trans, iter->update_path,
2002                                              iter->flags & BTREE_ITER_INTENT);
2003                         iter->update_path = NULL;
2004                 }
2005
2006                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2007                     (iter->flags & BTREE_ITER_INTENT) &&
2008                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2009                     !iter->update_path) {
2010                         struct bpos pos = k.k->p;
2011
2012                         if (pos.snapshot < iter->snapshot) {
2013                                 search_key = bpos_successor(k.k->p);
2014                                 continue;
2015                         }
2016
2017                         pos.snapshot = iter->snapshot;
2018
2019                         /*
2020                          * advance, same as on exit for iter->path, but only up
2021                          * to snapshot
2022                          */
2023                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2024                         iter->update_path = iter->path;
2025
2026                         iter->update_path = bch2_btree_path_set_pos(trans,
2027                                                 iter->update_path, pos,
2028                                                 iter->flags & BTREE_ITER_INTENT,
2029                                                 _THIS_IP_);
2030                 }
2031
2032                 /*
2033                  * We can never have a key in a leaf node at POS_MAX, so
2034                  * we don't have to check these successor() calls:
2035                  */
2036                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2037                     !bch2_snapshot_is_ancestor(trans->c,
2038                                                iter->snapshot,
2039                                                k.k->p.snapshot)) {
2040                         search_key = bpos_successor(k.k->p);
2041                         continue;
2042                 }
2043
2044                 if (bkey_whiteout(k.k) &&
2045                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2046                         search_key = bkey_successor(iter, k.k->p);
2047                         continue;
2048                 }
2049
2050                 break;
2051         }
2052
2053         iter->pos = iter_pos;
2054
2055         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2056                                 iter->flags & BTREE_ITER_INTENT,
2057                                 btree_iter_ip_allocated(iter));
2058
2059         btree_path_set_should_be_locked(iter->path);
2060 out_no_locked:
2061         if (iter->update_path) {
2062                 if (iter->update_path->uptodate &&
2063                     (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
2064                         k = bkey_s_c_err(ret);
2065                 else
2066                         btree_path_set_should_be_locked(iter->update_path);
2067         }
2068
2069         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2070                 iter->pos.snapshot = iter->snapshot;
2071
2072         ret = bch2_btree_iter_verify_ret(iter, k);
2073         if (unlikely(ret)) {
2074                 bch2_btree_iter_set_pos(iter, iter->pos);
2075                 k = bkey_s_c_err(ret);
2076         }
2077
2078         bch2_btree_iter_verify_entry_exit(iter);
2079
2080         return k;
2081 }
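
/*
 * Example: iterating keys in a range with bch2_btree_iter_peek_upto(). A
 * minimal sketch of the usual loop, assuming an already initialized
 * btree_trans; "inum" is just a placeholder inode number, and transaction
 * restart handling is omitted (see the bch2_trans_begin() example further
 * down):
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum, 0), 0);
 *
 *	while ((k = bch2_btree_iter_peek_upto(&iter, POS(inum, U64_MAX))).k &&
 *	       !(ret = bkey_err(k))) {
 *		... use k ...
 *		bch2_btree_iter_advance(&iter);
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */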
2082
2083 /**
2084  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2085  * to iterator's current position, returning keys from every level of the btree.
2086  * For keys at different levels of the btree that compare equal, the key from
2087  * the lower level (leaf) is returned first.
2088  */
2089 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2090 {
2091         struct btree_trans *trans = iter->trans;
2092         struct bkey_s_c k;
2093         int ret;
2094
2095         EBUG_ON(iter->path->cached);
2096         bch2_btree_iter_verify(iter);
2097         BUG_ON(iter->path->level < iter->min_depth);
2098         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2099         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2100
2101         while (1) {
2102                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2103                                         iter->flags & BTREE_ITER_INTENT,
2104                                         btree_iter_ip_allocated(iter));
2105
2106                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2107                 if (unlikely(ret)) {
2108                         /* ensure that iter->k is consistent with iter->pos: */
2109                         bch2_btree_iter_set_pos(iter, iter->pos);
2110                         k = bkey_s_c_err(ret);
2111                         goto out_no_locked;
2112                 }
2113
2114                 /* Already at end? */
2115                 if (!btree_path_node(iter->path, iter->path->level)) {
2116                         k = bkey_s_c_null;
2117                         goto out_no_locked;
2118                 }
2119
2120                 k = btree_path_level_peek_all(trans->c,
2121                                 &iter->path->l[iter->path->level], &iter->k);
2122
2123                 /* Check if we should go up to the parent node: */
2124                 if (!k.k ||
2125                     (iter->advanced &&
2126                      !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) {
2127                         iter->pos = path_l(iter->path)->b->key.k.p;
2128                         btree_path_set_level_up(trans, iter->path);
2129                         iter->advanced = false;
2130                         continue;
2131                 }
2132
2133                 /*
2134                  * Check if we should go back down to a leaf:
2135                  * If we're not in a leaf node, we only return the current key
2136                  * if it exactly matches iter->pos - otherwise we first have to
2137                  * go back to the leaf:
2138                  */
2139                 if (iter->path->level != iter->min_depth &&
2140                     (iter->advanced ||
2141                      !k.k ||
2142                      bpos_cmp(iter->pos, k.k->p))) {
2143                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2144                         iter->pos = bpos_successor(iter->pos);
2145                         iter->advanced = false;
2146                         continue;
2147                 }
2148
2149                 /* Check if we should go to the next key: */
2150                 if (iter->path->level == iter->min_depth &&
2151                     iter->advanced &&
2152                     k.k &&
2153                     !bpos_cmp(iter->pos, k.k->p)) {
2154                         iter->pos = bpos_successor(iter->pos);
2155                         iter->advanced = false;
2156                         continue;
2157                 }
2158
2159                 if (iter->advanced &&
2160                     iter->path->level == iter->min_depth &&
2161                     bpos_cmp(k.k->p, iter->pos))
2162                         iter->advanced = false;
2163
2164                 BUG_ON(iter->advanced);
2165                 BUG_ON(!k.k);
2166                 break;
2167         }
2168
2169         iter->pos = k.k->p;
2170         btree_path_set_should_be_locked(iter->path);
2171 out_no_locked:
2172         bch2_btree_iter_verify(iter);
2173
2174         return k;
2175 }
2176
2177 /**
2178  * bch2_btree_iter_next: returns first key greater than iterator's current
2179  * position
2180  */
2181 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2182 {
2183         if (!bch2_btree_iter_advance(iter))
2184                 return bkey_s_c_null;
2185
2186         return bch2_btree_iter_peek(iter);
2187 }
2188
2189 /**
2190  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2191  * iterator's current position
2192  */
2193 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2194 {
2195         struct btree_trans *trans = iter->trans;
2196         struct bpos search_key = iter->pos;
2197         struct btree_path *saved_path = NULL;
2198         struct bkey_s_c k;
2199         struct bkey saved_k;
2200         const struct bch_val *saved_v;
2201         int ret;
2202
2203         EBUG_ON(iter->path->cached || iter->path->level);
2204         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2205
2206         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2207                 return bkey_s_c_err(-EIO);
2208
2209         bch2_btree_iter_verify(iter);
2210         bch2_btree_iter_verify_entry_exit(iter);
2211
2212         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2213                 search_key.snapshot = U32_MAX;
2214
2215         while (1) {
2216                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2217                                                 iter->flags & BTREE_ITER_INTENT,
2218                                                 btree_iter_ip_allocated(iter));
2219
2220                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2221                 if (unlikely(ret)) {
2222                         /* ensure that iter->k is consistent with iter->pos: */
2223                         bch2_btree_iter_set_pos(iter, iter->pos);
2224                         k = bkey_s_c_err(ret);
2225                         goto out_no_locked;
2226                 }
2227
2228                 k = btree_path_level_peek(trans, iter->path,
2229                                           &iter->path->l[0], &iter->k);
2230                 if (!k.k ||
2231                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2232                      ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
2233                      : bpos_cmp(k.k->p, search_key) > 0))
2234                         k = btree_path_level_prev(trans, iter->path,
2235                                                   &iter->path->l[0], &iter->k);
2236
2237                 bch2_btree_path_check_sort(trans, iter->path, 0);
2238
2239                 if (likely(k.k)) {
2240                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2241                                 if (k.k->p.snapshot == iter->snapshot)
2242                                         goto got_key;
2243
2244                                 /*
2245                                  * If we have a saved candidate, and we're no
2246                                  * longer at the same _key_ (not pos), return
2247                                  * that candidate
2248                                  */
2249                                 if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
2250                                         bch2_path_put_nokeep(trans, iter->path,
2251                                                       iter->flags & BTREE_ITER_INTENT);
2252                                         iter->path = saved_path;
2253                                         saved_path = NULL;
2254                                         iter->k = saved_k;
2255                                         k.v     = saved_v;
2256                                         goto got_key;
2257                                 }
2258
2259                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2260                                                               iter->snapshot,
2261                                                               k.k->p.snapshot)) {
2262                                         if (saved_path)
2263                                                 bch2_path_put_nokeep(trans, saved_path,
2264                                                       iter->flags & BTREE_ITER_INTENT);
2265                                         saved_path = btree_path_clone(trans, iter->path,
2266                                                                 iter->flags & BTREE_ITER_INTENT);
2267                                         saved_k = *k.k;
2268                                         saved_v = k.v;
2269                                 }
2270
2271                                 search_key = bpos_predecessor(k.k->p);
2272                                 continue;
2273                         }
2274 got_key:
2275                         if (bkey_whiteout(k.k) &&
2276                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2277                                 search_key = bkey_predecessor(iter, k.k->p);
2278                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2279                                         search_key.snapshot = U32_MAX;
2280                                 continue;
2281                         }
2282
2283                         break;
2284                 } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
2285                         /* Advance to previous leaf node: */
2286                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2287                 } else {
2288                         /* Start of btree: */
2289                         bch2_btree_iter_set_pos(iter, POS_MIN);
2290                         k = bkey_s_c_null;
2291                         goto out_no_locked;
2292                 }
2293         }
2294
2295         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
2296
2297         /* Extents can straddle iter->pos: */
2298         if (bkey_cmp(k.k->p, iter->pos) < 0)
2299                 iter->pos = k.k->p;
2300
2301         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2302                 iter->pos.snapshot = iter->snapshot;
2303
2304         btree_path_set_should_be_locked(iter->path);
2305 out_no_locked:
2306         if (saved_path)
2307                 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2308
2309         bch2_btree_iter_verify_entry_exit(iter);
2310         bch2_btree_iter_verify(iter);
2311
2312         return k;
2313 }
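
/*
 * Example: iterating backwards with bch2_btree_iter_peek_prev() and
 * bch2_btree_iter_prev(). Sketch only: start at the end of the range and walk
 * towards POS_MIN ("inum" is a placeholder inode number):
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 *			     POS(inum, U64_MAX), 0);
 *
 *	for (k = bch2_btree_iter_peek_prev(&iter);
 *	     k.k && !(ret = bkey_err(k));
 *	     k = bch2_btree_iter_prev(&iter)) {
 *		... use k ...
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */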
2314
2315 /**
2316  * bch2_btree_iter_prev: returns first key less than iterator's current
2317  * position
2318  */
2319 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2320 {
2321         if (!bch2_btree_iter_rewind(iter))
2322                 return bkey_s_c_null;
2323
2324         return bch2_btree_iter_peek_prev(iter);
2325 }
2326
2327 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2328 {
2329         struct btree_trans *trans = iter->trans;
2330         struct bpos search_key;
2331         struct bkey_s_c k;
2332         int ret;
2333
2334         bch2_btree_iter_verify(iter);
2335         bch2_btree_iter_verify_entry_exit(iter);
2336         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2337         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2338
2339         /* extents can't span inode numbers: */
2340         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2341             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2342                 if (iter->pos.inode == KEY_INODE_MAX)
2343                         return bkey_s_c_null;
2344
2345                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2346         }
2347
2348         search_key = btree_iter_search_key(iter);
2349         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2350                                         iter->flags & BTREE_ITER_INTENT,
2351                                         btree_iter_ip_allocated(iter));
2352
2353         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2354         if (unlikely(ret)) {
2355                 k = bkey_s_c_err(ret);
2356                 goto out_no_locked;
2357         }
2358
2359         if ((iter->flags & BTREE_ITER_CACHED) ||
2360             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2361                 struct bkey_i *next_update;
2362
2363                 if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
2364                     (next_update = btree_trans_peek_updates(trans,
2365                                                 iter->btree_id, search_key)) &&
2366                     !bpos_cmp(next_update->k.p, iter->pos)) {
2367                         iter->k = next_update->k;
2368                         k = bkey_i_to_s_c(next_update);
2369                         goto out;
2370                 }
2371
2372                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2373                     (next_update = bch2_btree_journal_peek_slot(trans,
2374                                         iter, iter->pos))) {
2375                         iter->k = next_update->k;
2376                         k = bkey_i_to_s_c(next_update);
2377                         goto out;
2378                 }
2379
2380                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2381                     (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
2382                         if (!bkey_err(k))
2383                                 iter->k = *k.k;
2384                         /* We're not returning a key from iter->path: */
2385                         goto out_no_locked;
2386                 }
2387
2388                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2389         } else {
2390                 struct bpos next;
2391
2392                 EBUG_ON(iter->path->level);
2393
2394                 if (iter->flags & BTREE_ITER_INTENT) {
2395                         struct btree_iter iter2;
2396                         struct bpos end = iter->pos;
2397
2398                         if (iter->flags & BTREE_ITER_IS_EXTENTS)
2399                                 end.offset = U64_MAX;
2400
2401                         bch2_trans_copy_iter(&iter2, iter);
2402                         k = bch2_btree_iter_peek_upto(&iter2, end);
2403
2404                         if (k.k && !bkey_err(k)) {
2405                                 iter->k = iter2.k;
2406                                 k.k = &iter->k;
2407                         }
2408                         bch2_trans_iter_exit(trans, &iter2);
2409                 } else {
2410                         struct bpos pos = iter->pos;
2411
2412                         k = bch2_btree_iter_peek(iter);
2413                         if (unlikely(bkey_err(k)))
2414                                 bch2_btree_iter_set_pos(iter, pos);
2415                         else
2416                                 iter->pos = pos;
2417                 }
2418
2419                 if (unlikely(bkey_err(k)))
2420                         goto out_no_locked;
2421
2422                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2423
2424                 if (bkey_cmp(iter->pos, next) < 0) {
2425                         bkey_init(&iter->k);
2426                         iter->k.p = iter->pos;
2427
2428                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2429                                 bch2_key_resize(&iter->k,
2430                                                 min_t(u64, KEY_SIZE_MAX,
2431                                                       (next.inode == iter->pos.inode
2432                                                        ? next.offset
2433                                                        : KEY_OFFSET_MAX) -
2434                                                       iter->pos.offset));
2435                                 EBUG_ON(!iter->k.size);
2436                         }
2437
2438                         k = (struct bkey_s_c) { &iter->k, NULL };
2439                 }
2440         }
2441 out:
2442         btree_path_set_should_be_locked(iter->path);
2443 out_no_locked:
2444         bch2_btree_iter_verify_entry_exit(iter);
2445         bch2_btree_iter_verify(iter);
2446         ret = bch2_btree_iter_verify_ret(iter, k);
2447         if (unlikely(ret))
2448                 return bkey_s_c_err(ret);
2449
2450         return k;
2451 }
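
/*
 * Example: a single point lookup with bch2_btree_iter_peek_slot(). Unlike
 * peek(), this returns a key for the iterator position even when nothing is
 * stored there: a zero-size KEY_TYPE_deleted key for non-extent btrees, or a
 * hole key sized up to the next key for extents. Sketch, assuming an existing
 * btree_trans and a placeholder position "pos":
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
 *
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	if (!ret) {
 *		... inspect k.k->type / k.v ...
 *	}
 *
 *	bch2_trans_iter_exit(trans, &iter);
 */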
2452
2453 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2454 {
2455         if (!bch2_btree_iter_advance(iter))
2456                 return bkey_s_c_null;
2457
2458         return bch2_btree_iter_peek_slot(iter);
2459 }
2460
2461 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2462 {
2463         if (!bch2_btree_iter_rewind(iter))
2464                 return bkey_s_c_null;
2465
2466         return bch2_btree_iter_peek_slot(iter);
2467 }
2468
2469 /* new transactional stuff: */
2470
2471 static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
2472                                                 struct btree_path *path)
2473 {
2474         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2475         EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2476         EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
2477 }
2478
2479 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2480 {
2481 #ifdef CONFIG_BCACHEFS_DEBUG
2482         unsigned i;
2483
2484         for (i = 0; i < trans->nr_sorted; i++)
2485                 btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
2486 #endif
2487 }
2488
2489 static void btree_trans_verify_sorted(struct btree_trans *trans)
2490 {
2491 #ifdef CONFIG_BCACHEFS_DEBUG
2492         struct btree_path *path, *prev = NULL;
2493         unsigned i;
2494
2495         if (!bch2_debug_check_iterators)
2496                 return;
2497
2498         trans_for_each_path_inorder(trans, path, i) {
2499                 if (prev && btree_path_cmp(prev, path) > 0) {
2500                         bch2_dump_trans_paths_updates(trans);
2501                         panic("trans paths out of order!\n");
2502                 }
2503                 prev = path;
2504         }
2505 #endif
2506 }
2507
2508 static inline void btree_path_swap(struct btree_trans *trans,
2509                                    struct btree_path *l, struct btree_path *r)
2510 {
2511         swap(l->sorted_idx, r->sorted_idx);
2512         swap(trans->sorted[l->sorted_idx],
2513              trans->sorted[r->sorted_idx]);
2514
2515         btree_path_verify_sorted_ref(trans, l);
2516         btree_path_verify_sorted_ref(trans, r);
2517 }
2518
2519 inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
2520                                        int cmp)
2521 {
2522         struct btree_path *n;
2523
2524         if (cmp <= 0) {
2525                 n = prev_btree_path(trans, path);
2526                 if (n && btree_path_cmp(n, path) > 0) {
2527                         do {
2528                                 btree_path_swap(trans, n, path);
2529                                 n = prev_btree_path(trans, path);
2530                         } while (n && btree_path_cmp(n, path) > 0);
2531
2532                         goto out;
2533                 }
2534         }
2535
2536         if (cmp >= 0) {
2537                 n = next_btree_path(trans, path);
2538                 if (n && btree_path_cmp(path, n) > 0) {
2539                         do {
2540                                 btree_path_swap(trans, path, n);
2541                                 n = next_btree_path(trans, path);
2542                         } while (n && btree_path_cmp(path, n) > 0);
2543                 }
2544         }
2545 out:
2546         btree_trans_verify_sorted(trans);
2547 }
2548
2549 static inline void btree_path_list_remove(struct btree_trans *trans,
2550                                           struct btree_path *path)
2551 {
2552         unsigned i;
2553
2554         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2555
2556         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2557
2558         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2559                 trans->paths[trans->sorted[i]].sorted_idx = i;
2560
2561         path->sorted_idx = U8_MAX;
2562
2563         btree_trans_verify_sorted_refs(trans);
2564 }
2565
2566 static inline void btree_path_list_add(struct btree_trans *trans,
2567                                        struct btree_path *pos,
2568                                        struct btree_path *path)
2569 {
2570         unsigned i;
2571
2572         btree_trans_verify_sorted_refs(trans);
2573
2574         path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2575
2576         if (trans->in_traverse_all &&
2577             trans->traverse_all_idx != U8_MAX &&
2578             trans->traverse_all_idx >= path->sorted_idx)
2579                 trans->traverse_all_idx++;
2580
2581         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2582
2583         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2584                 trans->paths[trans->sorted[i]].sorted_idx = i;
2585
2586         btree_trans_verify_sorted_refs(trans);
2587 }
2588
2589 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2590 {
2591         if (iter->path)
2592                 bch2_path_put(trans, iter->path,
2593                               iter->flags & BTREE_ITER_INTENT);
2594         if (iter->update_path)
2595                 bch2_path_put_nokeep(trans, iter->update_path,
2596                               iter->flags & BTREE_ITER_INTENT);
2597         if (iter->key_cache_path)
2598                 bch2_path_put(trans, iter->key_cache_path,
2599                               iter->flags & BTREE_ITER_INTENT);
2600         iter->path = NULL;
2601         iter->update_path = NULL;
2602         iter->key_cache_path = NULL;
2603 }
2604
2605 static inline void __bch2_trans_iter_init(struct btree_trans *trans,
2606                                           struct btree_iter *iter,
2607                                           unsigned btree_id, struct bpos pos,
2608                                           unsigned locks_want,
2609                                           unsigned depth,
2610                                           unsigned flags,
2611                                           unsigned long ip)
2612 {
2613         if (trans->restarted)
2614                 panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n",
2615                       bch2_err_str(trans->restarted),
2616                       (void *) trans->last_restarted_ip);
2617
2618         if (flags & BTREE_ITER_ALL_LEVELS)
2619                 flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
2620
2621         if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
2622             btree_node_type_is_extents(btree_id))
2623                 flags |= BTREE_ITER_IS_EXTENTS;
2624
2625         if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
2626             !btree_type_has_snapshots(btree_id))
2627                 flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
2628
2629         if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
2630             btree_type_has_snapshots(btree_id))
2631                 flags |= BTREE_ITER_FILTER_SNAPSHOTS;
2632
2633         if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
2634                 flags |= BTREE_ITER_WITH_JOURNAL;
2635
2636         iter->trans     = trans;
2637         iter->path      = NULL;
2638         iter->update_path = NULL;
2639         iter->key_cache_path = NULL;
2640         iter->btree_id  = btree_id;
2641         iter->min_depth = depth;
2642         iter->flags     = flags;
2643         iter->snapshot  = pos.snapshot;
2644         iter->pos       = pos;
2645         iter->k.type    = KEY_TYPE_deleted;
2646         iter->k.p       = pos;
2647         iter->k.size    = 0;
2648         iter->journal_idx = 0;
2649         iter->journal_pos = POS_MIN;
2650 #ifdef CONFIG_BCACHEFS_DEBUG
2651         iter->ip_allocated = ip;
2652 #endif
2653
2654         iter->path = bch2_path_get(trans, btree_id, iter->pos,
2655                                    locks_want, depth, flags, ip);
2656 }
2657
2658 void bch2_trans_iter_init(struct btree_trans *trans,
2659                           struct btree_iter *iter,
2660                           unsigned btree_id, struct bpos pos,
2661                           unsigned flags)
2662 {
2663         if (!btree_id_cached(trans->c, btree_id)) {
2664                 flags &= ~BTREE_ITER_CACHED;
2665                 flags &= ~BTREE_ITER_WITH_KEY_CACHE;
2666         } else if (!(flags & BTREE_ITER_CACHED))
2667                 flags |= BTREE_ITER_WITH_KEY_CACHE;
2668
2669         __bch2_trans_iter_init(trans, iter, btree_id, pos,
2670                                0, 0, flags, _RET_IP_);
2671 }
2672
2673 void bch2_trans_node_iter_init(struct btree_trans *trans,
2674                                struct btree_iter *iter,
2675                                enum btree_id btree_id,
2676                                struct bpos pos,
2677                                unsigned locks_want,
2678                                unsigned depth,
2679                                unsigned flags)
2680 {
2681         __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
2682                                BTREE_ITER_NOT_EXTENTS|
2683                                __BTREE_ITER_ALL_SNAPSHOTS|
2684                                BTREE_ITER_ALL_SNAPSHOTS|
2685                                flags, _RET_IP_);
2686         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2687         BUG_ON(iter->path->level        != depth);
2688         BUG_ON(iter->min_depth          != depth);
2689 }
2690
2691 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2692 {
2693         *dst = *src;
2694         if (src->path)
2695                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2696         if (src->update_path)
2697                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2698         dst->key_cache_path = NULL;
2699 }
2700
2701 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2702 {
2703         unsigned new_top = trans->mem_top + size;
2704         size_t old_bytes = trans->mem_bytes;
2705         size_t new_bytes = roundup_pow_of_two(new_top);
2706         void *new_mem;
2707         void *p;
2708
2709         trans->mem_max = max(trans->mem_max, new_top);
2710
2711         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2712
2713         new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2714         if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2715                 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2716                 new_bytes = BTREE_TRANS_MEM_MAX;
2717                 kfree(trans->mem);
2718         }
2719
2720         if (!new_mem)
2721                 return ERR_PTR(-ENOMEM);
2722
2723         trans->mem = new_mem;
2724         trans->mem_bytes = new_bytes;
2725
2726         if (old_bytes) {
2727                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2728                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2729         }
2730
2731         p = trans->mem + trans->mem_top;
2732         trans->mem_top += size;
2733         memset(p, 0, size);
2734         return p;
2735 }
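
/*
 * Example: allocating memory that lives until the next bch2_trans_begin().
 * The bch2_trans_kmalloc() wrapper (declared alongside this in btree_iter.h)
 * fast-paths allocations that fit in the existing buffer and falls back to
 * __bch2_trans_kmalloc(); a reallocation returns a transaction restart error
 * that must be handled like any other restart. Sketch:
 *
 *	struct bkey_i *new_key = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
 *	int ret = PTR_ERR_OR_ZERO(new_key);
 *
 *	if (ret)
 *		return ret;
 *	bkey_reassemble(new_key, k);
 */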
2736
2737 /**
2738  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2739  * @trans: transaction to reset
2740  *
2741  * While iterating over nodes or updating nodes an attempt to lock a btree node
2742  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2743  * occurs bch2_trans_begin() should be called and the transaction retried.
2744  */
2745 u32 bch2_trans_begin(struct btree_trans *trans)
2746 {
2747         struct btree_path *path;
2748
2749         bch2_trans_reset_updates(trans);
2750
2751         trans->restart_count++;
2752         trans->mem_top                  = 0;
2753
2754         if (trans->fs_usage_deltas) {
2755                 trans->fs_usage_deltas->used = 0;
2756                 memset((void *) trans->fs_usage_deltas +
2757                        offsetof(struct replicas_delta_list, memset_start), 0,
2758                        (void *) &trans->fs_usage_deltas->memset_end -
2759                        (void *) &trans->fs_usage_deltas->memset_start);
2760         }
2761
2762         trans_for_each_path(trans, path) {
2763                 path->should_be_locked = false;
2764
2765                 /*
2766                  * If the transaction wasn't restarted, we're presuming to be
2767                  * doing something new: don't keep iterators except the ones that
2768                  * are in use - except for the subvolumes btree:
2769                  */
2770                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2771                         path->preserve = false;
2772
2773                 /*
2774                  * XXX: we probably shouldn't be doing this if the transaction
2775                  * was restarted, but currently we still overflow transaction
2776                  * iterators if we do that
2777                  */
2778                 if (!path->ref && !path->preserve)
2779                         __bch2_path_free(trans, path);
2780                 else
2781                         path->preserve = false;
2782         }
2783
2784         if (!trans->restarted &&
2785             (need_resched() ||
2786              ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2787                 bch2_trans_unlock(trans);
2788                 cond_resched();
2789                 bch2_trans_relock(trans);
2790         }
2791
2792         trans->last_restarted_ip = _RET_IP_;
2793         if (trans->restarted)
2794                 bch2_btree_path_traverse_all(trans);
2795
2796         trans->last_begin_time = ktime_get_ns();
2797         return trans->restart_count;
2798 }
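
/*
 * Example: the canonical retry loop around bch2_trans_begin(). Any helper
 * doing btree work inside the transaction may return a
 * BCH_ERR_transaction_restart class error; when it does, call
 * bch2_trans_begin() and rerun the whole sequence. Sketch only, assuming
 * bch2_err_matches() from errcode.h; "do_btree_work" is a placeholder, and
 * helpers such as lockrestart_do() in btree_update.h wrap this same pattern:
 *
 *	int ret;
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_btree_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *
 *	return ret;
 */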
2799
2800 void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
2801 {
2802         if (trans_was_restarted(trans, restart_count))
2803                 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
2804                       trans->restart_count, restart_count,
2805                       (void *) trans->last_restarted_ip);
2806 }
2807
2808 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2809 {
2810         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2811         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2812         void *p = NULL;
2813
2814         BUG_ON(trans->used_mempool);
2815
2816 #ifdef __KERNEL__
2817         p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
2818 #endif
2819         if (!p)
2820                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2821
2822         trans->paths            = p; p += paths_bytes;
2823         trans->updates          = p; p += updates_bytes;
2824 }
2825
2826 static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c,
2827                                         const char *fn)
2828 {
2829         unsigned i;
2830
2831         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++)
2832                 if (!c->btree_transaction_fns[i] ||
2833                     c->btree_transaction_fns[i] == fn) {
2834                         c->btree_transaction_fns[i] = fn;
2835                         return i;
2836                 }
2837
2838         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2839         return i;
2840 }
2841
2842 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn)
2843         __acquires(&c->btree_trans_barrier)
2844 {
2845         struct btree_transaction_stats *s;
2846         struct btree_trans *pos;
2847
2848         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
2849
2850         memset(trans, 0, sizeof(*trans));
2851         trans->c                = c;
2852         trans->fn               = fn;
2853         trans->last_begin_time  = ktime_get_ns();
2854         trans->fn_idx           = bch2_trans_get_fn_idx(trans, c, fn);
2855         trans->locking_wait.task = current;
2856         closure_init_stack(&trans->ref);
2857
2858         bch2_trans_alloc_paths(trans, c);
2859
2860         s = btree_trans_stats(trans);
2861         if (s) {
2862                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2863
2864                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2865
2866                 if (unlikely(!trans->mem)) {
2867                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2868                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2869                 } else {
2870                         trans->mem_bytes = expected_mem_bytes;
2871                 }
2872
2873                 trans->nr_max_paths = s->nr_max_paths;
2874         }
2875
2876         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2877
2878         mutex_lock(&c->btree_trans_lock);
2879         list_for_each_entry(pos, &c->btree_trans_list, list) {
2880                 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
2881                         list_add_tail(&trans->list, &pos->list);
2882                         goto list_add_done;
2883                 }
2884         }
2885         list_add_tail(&trans->list, &c->btree_trans_list);
2886 list_add_done:
2887         mutex_unlock(&c->btree_trans_lock);
2888 }
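
/*
 * Minimal usage sketch, assuming a struct bch_fs *c in scope; callers
 * normally go through the bch2_trans_init() wrapper rather than calling
 * __bch2_trans_init() directly, so that @fn is filled in with __func__:
 *
 *      struct btree_trans trans;
 *
 *      __bch2_trans_init(&trans, c, __func__);
 *      ... btree operations, restarting with bch2_trans_begin() as needed ...
 *      bch2_trans_exit(&trans);
 */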
2889
2890 static void check_btree_paths_leaked(struct btree_trans *trans)
2891 {
2892 #ifdef CONFIG_BCACHEFS_DEBUG
2893         struct bch_fs *c = trans->c;
2894         struct btree_path *path;
2895
2896         trans_for_each_path(trans, path)
2897                 if (path->ref)
2898                         goto leaked;
2899         return;
2900 leaked:
2901         bch_err(c, "btree paths leaked from %s!", trans->fn);
2902         trans_for_each_path(trans, path)
2903                 if (path->ref)
2904                         printk(KERN_ERR "  btree %s %pS\n",
2905                                bch2_btree_ids[path->btree_id],
2906                                (void *) path->ip_allocated);
2907         /* Be noisy about this: */
2908         bch2_fatal_error(c);
2909 #endif
2910 }
2911
2912 void bch2_trans_exit(struct btree_trans *trans)
2913         __releases(&c->btree_trans_barrier)
2914 {
2915         struct btree_insert_entry *i;
2916         struct bch_fs *c = trans->c;
2917         struct btree_transaction_stats *s = btree_trans_stats(trans);
2918
2919         bch2_trans_unlock(trans);
2920
2921         closure_sync(&trans->ref);
2922
2923         if (s)
2924                 s->max_mem = max(s->max_mem, trans->mem_max);
2925
2926         trans_for_each_update(trans, i)
2927                 __btree_path_put(i->path, true);
2928         trans->nr_updates               = 0;
2929
2930         check_btree_paths_leaked(trans);
2931
2932         mutex_lock(&c->btree_trans_lock);
2933         list_del(&trans->list);
2934         mutex_unlock(&c->btree_trans_lock);
2935
2936         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2937
2938         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
2939
2940         kfree(trans->extra_journal_entries.data);
2941
2942         if (trans->fs_usage_deltas) {
2943                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2944                     REPLICAS_DELTA_LIST_MAX)
2945                         mempool_free(trans->fs_usage_deltas,
2946                                      &c->replicas_delta_pool);
2947                 else
2948                         kfree(trans->fs_usage_deltas);
2949         }
2950
2951         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2952                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
2953         else
2954                 kfree(trans->mem);
2955
2956 #ifdef __KERNEL__
2957         /*
2958          * Userspace doesn't have a real percpu implementation:
2959          */
2960         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
2961 #endif
2962
2963         if (trans->paths)
2964                 mempool_free(trans->paths, &c->btree_paths_pool);
2965
2966         trans->mem      = (void *) 0x1;
2967         trans->paths    = (void *) 0x1;
2968 }
2969
2970 static void __maybe_unused
2971 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
2972                                       struct btree_bkey_cached_common *b)
2973 {
2974         struct six_lock_count c = six_lock_counts(&b->lock);
2975         struct task_struct *owner;
2976         pid_t pid;
2977
2978         rcu_read_lock();
2979         owner = READ_ONCE(b->lock.owner);
2980         pid = owner ? owner->pid : 0;
2981         rcu_read_unlock();
2982
2983         prt_tab(out);
2984         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
2985                    b->level, bch2_btree_ids[b->btree_id]);
2986         bch2_bpos_to_text(out, btree_node_pos(b));
2987
2988         prt_tab(out);
2989         prt_printf(out, " locks %u:%u:%u held by pid %u",
2990                    c.n[0], c.n[1], c.n[2], pid);
2991 }
2992
2993 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
2994 {
2995         struct btree_path *path;
2996         struct btree_bkey_cached_common *b;
2997         static char lock_types[] = { 'r', 'i', 'w' };
2998         unsigned l;
2999
3000         if (!out->nr_tabstops) {
3001                 printbuf_tabstop_push(out, 16);
3002                 printbuf_tabstop_push(out, 32);
3003         }
3004
3005         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3006
3007         trans_for_each_path(trans, path) {
3008                 if (!path->nodes_locked)
3009                         continue;
3010
3011                 prt_printf(out, "  path %u %c l=%u %s:",
3012                        path->idx,
3013                        path->cached ? 'c' : 'b',
3014                        path->level,
3015                        bch2_btree_ids[path->btree_id]);
3016                 bch2_bpos_to_text(out, path->pos);
3017                 prt_newline(out);
3018
3019                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3020                         if (btree_node_locked(path, l) &&
3021                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3022                                 prt_printf(out, "    %c l=%u ",
3023                                            lock_types[btree_node_locked_type(path, l)], l);
3024                                 bch2_btree_bkey_cached_common_to_text(out, b);
3025                                 prt_newline(out);
3026                         }
3027                 }
3028         }
3029
3030         b = READ_ONCE(trans->locking);
3031         if (b) {
3032                 prt_str(out, "  want");
3033                 prt_newline(out);
3034                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3035                 bch2_btree_bkey_cached_common_to_text(out, b);
3036                 prt_newline(out);
3037         }
3038 }
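
/*
 * Illustrative only: dumping a transaction's lock state to the console with
 * a heap-allocated printbuf (PRINTBUF/printbuf_exit() are the usual pairing):
 *
 *      struct printbuf buf = PRINTBUF;
 *
 *      bch2_btree_trans_to_text(&buf, trans);
 *      printk(KERN_ERR "%s", buf.buf);
 *      printbuf_exit(&buf);
 */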
3039
3040 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3041 {
3042         if (c->btree_trans_barrier_initialized)
3043                 cleanup_srcu_struct(&c->btree_trans_barrier);
3044         mempool_exit(&c->btree_trans_mem_pool);
3045         mempool_exit(&c->btree_paths_pool);
3046 }
3047
3048 int bch2_fs_btree_iter_init(struct bch_fs *c)
3049 {
3050         unsigned i, nr = BTREE_ITER_MAX;
3051         int ret;
3052
3053         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
3054                 mutex_init(&c->btree_transaction_stats[i].lock);
3055
3056         INIT_LIST_HEAD(&c->btree_trans_list);
3057         mutex_init(&c->btree_trans_lock);
3058
3059         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3060                         sizeof(struct btree_path) * nr +
3061                         sizeof(struct btree_insert_entry) * nr) ?:
3062                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3063                                           BTREE_TRANS_MEM_MAX) ?:
3064                 init_srcu_struct(&c->btree_trans_barrier);
3065         if (!ret)
3066                 c->btree_trans_barrier_initialized = true;
3067         return ret;
3068 }
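
/*
 * Sketch of the expected pairing at filesystem bringup (error handling shown
 * is illustrative): bch2_fs_btree_iter_exit() is safe to call even after a
 * partial init, since mempool_exit() tolerates never-initialized pools and
 * the SRCU cleanup above is guarded by btree_trans_barrier_initialized.
 *
 *      int ret = bch2_fs_btree_iter_init(c);
 *
 *      if (ret) {
 *              bch2_fs_btree_iter_exit(c);
 *              return ret;
 *      }
 */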