[bcachefs-tools-debian] / libbcachefs / btree_iter.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18
19 #include <linux/prandom.h>
20 #include <linux/prefetch.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_trans_verify_sorted(struct btree_trans *);
24 inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
25 static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *,
26                                                    struct btree_path *, int);
27
28 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
29 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
30                                        struct btree_path *);
31
32 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
33 {
34 #ifdef CONFIG_BCACHEFS_DEBUG
35         return iter->ip_allocated;
36 #else
37         return 0;
38 #endif
39 }
40
41 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
42
43 /*
44  * Unlocks before scheduling
45  * Note: does not revalidate iterator
46  */
47 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
48 {
49         if (need_resched() || race_fault()) {
50                 bch2_trans_unlock(trans);
51                 schedule();
52                 return bch2_trans_relock(trans);
53         } else {
54                 return 0;
55         }
56 }
57
58 static inline int __btree_path_cmp(const struct btree_path *l,
59                                    enum btree_id        r_btree_id,
60                                    bool                 r_cached,
61                                    struct bpos          r_pos,
62                                    unsigned             r_level)
63 {
64         /*
65          * Must match lock ordering as defined by __bch2_btree_node_lock:
66          */
67         return   cmp_int(l->btree_id,   r_btree_id) ?:
68                  cmp_int((int) l->cached,       (int) r_cached) ?:
69                  bpos_cmp(l->pos,       r_pos) ?:
70                 -cmp_int(l->level,      r_level);
71 }
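
/*
 * Illustration: the comparison above sorts paths by (btree_id, cached,
 * pos) ascending and then by level descending, so a path at level 1
 * sorts before a path at level 0 at the same position - parent nodes
 * come first in the in-order path list, matching the lock ordering.
 */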
72
73 static inline int btree_path_cmp(const struct btree_path *l,
74                                  const struct btree_path *r)
75 {
76         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
77 }
78
79 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
80 {
81         /* Are we iterating over keys in all snapshots? */
82         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
83                 p = bpos_successor(p);
84         } else {
85                 p = bpos_nosnap_successor(p);
86                 p.snapshot = iter->snapshot;
87         }
88
89         return p;
90 }
91
92 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
93 {
94         /* Are we iterating over keys in all snapshots? */
95         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
96                 p = bpos_predecessor(p);
97         } else {
98                 p = bpos_nosnap_predecessor(p);
99                 p.snapshot = iter->snapshot;
100         }
101
102         return p;
103 }
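
/*
 * Note on the two helpers above: when the iterator is not in
 * BTREE_ITER_ALL_SNAPSHOTS mode, the successor/predecessor is computed
 * over (inode, offset) only and the snapshot field is then pinned back
 * to iter->snapshot, so iteration never wanders into other snapshots.
 */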
104
105 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
106 {
107         struct bpos pos = iter->pos;
108
109         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
110             !bkey_eq(pos, POS_MAX))
111                 pos = bkey_successor(iter, pos);
112         return pos;
113 }
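
/*
 * For extents the search key is the successor of iter->pos: extent
 * keys are (roughly speaking) indexed by their end position, so the
 * extent covering iter->pos is the first key whose position compares
 * strictly greater than iter->pos.
 */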
114
115 static inline bool btree_path_pos_before_node(struct btree_path *path,
116                                               struct btree *b)
117 {
118         return bpos_lt(path->pos, b->data->min_key);
119 }
120
121 static inline bool btree_path_pos_after_node(struct btree_path *path,
122                                              struct btree *b)
123 {
124         return bpos_gt(path->pos, b->key.k.p);
125 }
126
127 static inline bool btree_path_pos_in_node(struct btree_path *path,
128                                           struct btree *b)
129 {
130         return path->btree_id == b->c.btree_id &&
131                 !btree_path_pos_before_node(path, b) &&
132                 !btree_path_pos_after_node(path, b);
133 }
134
135 /* Btree iterator: */
136
137 #ifdef CONFIG_BCACHEFS_DEBUG
138
139 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
140                                           struct btree_path *path)
141 {
142         struct bkey_cached *ck;
143         bool locked = btree_node_locked(path, 0);
144
145         if (!bch2_btree_node_relock(trans, path, 0))
146                 return;
147
148         ck = (void *) path->l[0].b;
149         BUG_ON(ck->key.btree_id != path->btree_id ||
150                !bkey_eq(ck->key.pos, path->pos));
151
152         if (!locked)
153                 btree_node_unlock(trans, path, 0);
154 }
155
156 static void bch2_btree_path_verify_level(struct btree_trans *trans,
157                                 struct btree_path *path, unsigned level)
158 {
159         struct btree_path_level *l;
160         struct btree_node_iter tmp;
161         bool locked;
162         struct bkey_packed *p, *k;
163         struct printbuf buf1 = PRINTBUF;
164         struct printbuf buf2 = PRINTBUF;
165         struct printbuf buf3 = PRINTBUF;
166         const char *msg;
167
168         if (!bch2_debug_check_iterators)
169                 return;
170
171         l       = &path->l[level];
172         tmp     = l->iter;
173         locked  = btree_node_locked(path, level);
174
175         if (path->cached) {
176                 if (!level)
177                         bch2_btree_path_verify_cached(trans, path);
178                 return;
179         }
180
181         if (!btree_path_node(path, level))
182                 return;
183
184         if (!bch2_btree_node_relock_notrace(trans, path, level))
185                 return;
186
187         BUG_ON(!btree_path_pos_in_node(path, l->b));
188
189         bch2_btree_node_iter_verify(&l->iter, l->b);
190
191         /*
192          * For interior nodes, the iterator will have skipped past deleted keys:
193          */
194         p = level
195                 ? bch2_btree_node_iter_prev(&tmp, l->b)
196                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
197         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
198
199         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
200                 msg = "before";
201                 goto err;
202         }
203
204         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
205                 msg = "after";
206                 goto err;
207         }
208
209         if (!locked)
210                 btree_node_unlock(trans, path, level);
211         return;
212 err:
213         bch2_bpos_to_text(&buf1, path->pos);
214
215         if (p) {
216                 struct bkey uk = bkey_unpack_key(l->b, p);
217
218                 bch2_bkey_to_text(&buf2, &uk);
219         } else {
220                 prt_printf(&buf2, "(none)");
221         }
222
223         if (k) {
224                 struct bkey uk = bkey_unpack_key(l->b, k);
225
226                 bch2_bkey_to_text(&buf3, &uk);
227         } else {
228                 prt_printf(&buf3, "(none)");
229         }
230
231         panic("path should be %s key at level %u:\n"
232               "path pos %s\n"
233               "prev key %s\n"
234               "cur  key %s\n",
235               msg, level, buf1.buf, buf2.buf, buf3.buf);
236 }
237
238 static void bch2_btree_path_verify(struct btree_trans *trans,
239                                    struct btree_path *path)
240 {
241         struct bch_fs *c = trans->c;
242         unsigned i;
243
244         EBUG_ON(path->btree_id >= BTREE_ID_NR);
245
246         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
247                 if (!path->l[i].b) {
248                         BUG_ON(!path->cached &&
249                                c->btree_roots[path->btree_id].b->c.level > i);
250                         break;
251                 }
252
253                 bch2_btree_path_verify_level(trans, path, i);
254         }
255
256         bch2_btree_path_verify_locks(path);
257 }
258
259 void bch2_trans_verify_paths(struct btree_trans *trans)
260 {
261         struct btree_path *path;
262
263         trans_for_each_path(trans, path)
264                 bch2_btree_path_verify(trans, path);
265 }
266
267 static void bch2_btree_iter_verify(struct btree_iter *iter)
268 {
269         struct btree_trans *trans = iter->trans;
270
271         BUG_ON(iter->btree_id >= BTREE_ID_NR);
272
273         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
274
275         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
276                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
277
278         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
279                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
280                !btree_type_has_snapshots(iter->btree_id));
281
282         if (iter->update_path)
283                 bch2_btree_path_verify(trans, iter->update_path);
284         bch2_btree_path_verify(trans, iter->path);
285 }
286
287 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
288 {
289         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
290                !iter->pos.snapshot);
291
292         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
293                iter->pos.snapshot != iter->snapshot);
294
295         BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
296                bkey_gt(iter->pos, iter->k.p));
297 }
298
299 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
300 {
301         struct btree_trans *trans = iter->trans;
302         struct btree_iter copy;
303         struct bkey_s_c prev;
304         int ret = 0;
305
306         if (!bch2_debug_check_iterators)
307                 return 0;
308
309         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
310                 return 0;
311
312         if (bkey_err(k) || !k.k)
313                 return 0;
314
315         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
316                                           iter->snapshot,
317                                           k.k->p.snapshot));
318
319         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
320                              BTREE_ITER_NOPRESERVE|
321                              BTREE_ITER_ALL_SNAPSHOTS);
322         prev = bch2_btree_iter_prev(&copy);
323         if (!prev.k)
324                 goto out;
325
326         ret = bkey_err(prev);
327         if (ret)
328                 goto out;
329
330         if (bkey_eq(prev.k->p, k.k->p) &&
331             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
332                                       prev.k->p.snapshot) > 0) {
333                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
334
335                 bch2_bkey_to_text(&buf1, k.k);
336                 bch2_bkey_to_text(&buf2, prev.k);
337
338                 panic("iter snap %u\n"
339                       "k    %s\n"
340                       "prev %s\n",
341                       iter->snapshot,
342                       buf1.buf, buf2.buf);
343         }
344 out:
345         bch2_trans_iter_exit(trans, &copy);
346         return ret;
347 }
348
349 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
350                             struct bpos pos, bool key_cache)
351 {
352         struct btree_path *path;
353         unsigned idx;
354         struct printbuf buf = PRINTBUF;
355
356         trans_for_each_path_inorder(trans, path, idx) {
357                 int cmp = cmp_int(path->btree_id, id) ?:
358                         cmp_int(path->cached, key_cache);
359
360                 if (cmp > 0)
361                         break;
362                 if (cmp < 0)
363                         continue;
364
365                 if (!btree_node_locked(path, 0) ||
366                     !path->should_be_locked)
367                         continue;
368
369                 if (!key_cache) {
370                         if (bkey_ge(pos, path->l[0].b->data->min_key) &&
371                             bkey_le(pos, path->l[0].b->key.k.p))
372                                 return;
373                 } else {
374                         if (bkey_eq(pos, path->pos))
375                                 return;
376                 }
377         }
378
379         bch2_dump_trans_paths_updates(trans);
380         bch2_bpos_to_text(&buf, pos);
381
382         panic("not locked: %s %s%s\n",
383               bch2_btree_ids[id], buf.buf,
384               key_cache ? " cached" : "");
385 }
386
387 #else
388
389 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
390                                                 struct btree_path *path, unsigned l) {}
391 static inline void bch2_btree_path_verify(struct btree_trans *trans,
392                                           struct btree_path *path) {}
393 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
394 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
395 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
396
397 #endif
398
399 /* Btree path: fixups after btree updates */
400
401 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
402                                         struct btree *b,
403                                         struct bset_tree *t,
404                                         struct bkey_packed *k)
405 {
406         struct btree_node_iter_set *set;
407
408         btree_node_iter_for_each(iter, set)
409                 if (set->end == t->end_offset) {
410                         set->k = __btree_node_key_to_offset(b, k);
411                         bch2_btree_node_iter_sort(iter, b);
412                         return;
413                 }
414
415         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
416 }
417
418 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
419                                                struct btree *b,
420                                                struct bkey_packed *where)
421 {
422         struct btree_path_level *l = &path->l[b->c.level];
423
424         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
425                 return;
426
427         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
428                 bch2_btree_node_iter_advance(&l->iter, l->b);
429 }
430
431 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
432                                       struct btree *b,
433                                       struct bkey_packed *where)
434 {
435         struct btree_path *path;
436
437         trans_for_each_path_with_node(trans, b, path) {
438                 __bch2_btree_path_fix_key_modified(path, b, where);
439                 bch2_btree_path_verify_level(trans, path, b->c.level);
440         }
441 }
442
443 static void __bch2_btree_node_iter_fix(struct btree_path *path,
444                                        struct btree *b,
445                                        struct btree_node_iter *node_iter,
446                                        struct bset_tree *t,
447                                        struct bkey_packed *where,
448                                        unsigned clobber_u64s,
449                                        unsigned new_u64s)
450 {
451         const struct bkey_packed *end = btree_bkey_last(b, t);
452         struct btree_node_iter_set *set;
453         unsigned offset = __btree_node_key_to_offset(b, where);
454         int shift = new_u64s - clobber_u64s;
455         unsigned old_end = t->end_offset - shift;
456         unsigned orig_iter_pos = node_iter->data[0].k;
457         bool iter_current_key_modified =
458                 orig_iter_pos >= offset &&
459                 orig_iter_pos <= offset + clobber_u64s;
460
461         btree_node_iter_for_each(node_iter, set)
462                 if (set->end == old_end)
463                         goto found;
464
465         /* didn't find the bset in the iterator - might have to re-add it: */
466         if (new_u64s &&
467             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
468                 bch2_btree_node_iter_push(node_iter, b, where, end);
469                 goto fixup_done;
470         } else {
471                 /* Iterator is after key that changed */
472                 return;
473         }
474 found:
475         set->end = t->end_offset;
476
477         /* Iterator hasn't gotten to the key that changed yet: */
478         if (set->k < offset)
479                 return;
480
481         if (new_u64s &&
482             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
483                 set->k = offset;
484         } else if (set->k < offset + clobber_u64s) {
485                 set->k = offset + new_u64s;
486                 if (set->k == set->end)
487                         bch2_btree_node_iter_set_drop(node_iter, set);
488         } else {
489                 /* Iterator is after key that changed */
490                 set->k = (int) set->k + shift;
491                 return;
492         }
493
494         bch2_btree_node_iter_sort(node_iter, b);
495 fixup_done:
496         if (node_iter->data[0].k != orig_iter_pos)
497                 iter_current_key_modified = true;
498
499         /*
500          * When a new key is added, and the node iterator now points to that
501          * key, the iterator might have skipped past deleted keys that should
502          * come after the key the iterator now points to. We have to rewind to
503          * before those deleted keys - otherwise
504          * bch2_btree_node_iter_prev_all() breaks:
505          */
506         if (!bch2_btree_node_iter_end(node_iter) &&
507             iter_current_key_modified &&
508             b->c.level) {
509                 struct bset_tree *t;
510                 struct bkey_packed *k, *k2, *p;
511
512                 k = bch2_btree_node_iter_peek_all(node_iter, b);
513
514                 for_each_bset(b, t) {
515                         bool set_pos = false;
516
517                         if (node_iter->data[0].end == t->end_offset)
518                                 continue;
519
520                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
521
522                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
523                                bkey_iter_cmp(b, k, p) < 0) {
524                                 k2 = p;
525                                 set_pos = true;
526                         }
527
528                         if (set_pos)
529                                 btree_node_iter_set_set_pos(node_iter,
530                                                             b, t, k2);
531                 }
532         }
533 }
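
/*
 * Rough summary of the fixup above: a key at 'offset' occupying
 * clobber_u64s u64s was replaced by one occupying new_u64s, so every
 * node iterator position in that bset at or past the old key has to be
 * shifted by (new_u64s - clobber_u64s); positions that pointed into
 * the clobbered range are moved onto the new key, or just past it if
 * the new key compares before the search position.
 */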
534
535 void bch2_btree_node_iter_fix(struct btree_trans *trans,
536                               struct btree_path *path,
537                               struct btree *b,
538                               struct btree_node_iter *node_iter,
539                               struct bkey_packed *where,
540                               unsigned clobber_u64s,
541                               unsigned new_u64s)
542 {
543         struct bset_tree *t = bch2_bkey_to_bset(b, where);
544         struct btree_path *linked;
545
546         if (node_iter != &path->l[b->c.level].iter) {
547                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
548                                            where, clobber_u64s, new_u64s);
549
550                 if (bch2_debug_check_iterators)
551                         bch2_btree_node_iter_verify(node_iter, b);
552         }
553
554         trans_for_each_path_with_node(trans, b, linked) {
555                 __bch2_btree_node_iter_fix(linked, b,
556                                            &linked->l[b->c.level].iter, t,
557                                            where, clobber_u64s, new_u64s);
558                 bch2_btree_path_verify_level(trans, linked, b->c.level);
559         }
560 }
561
562 /* Btree path level: pointer to a particular btree node and node iter */
563
564 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
565                                                   struct btree_path_level *l,
566                                                   struct bkey *u,
567                                                   struct bkey_packed *k)
568 {
569         if (unlikely(!k)) {
570                 /*
571                  * signal to bch2_btree_iter_peek_slot() that we're currently at
572                  * a hole
573                  */
574                 u->type = KEY_TYPE_deleted;
575                 return bkey_s_c_null;
576         }
577
578         return bkey_disassemble(l->b, k, u);
579 }
580
581 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
582                                                         struct btree_path_level *l,
583                                                         struct bkey *u)
584 {
585         return __btree_iter_unpack(c, l, u,
586                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
587 }
588
589 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
590                                                     struct btree_path *path,
591                                                     struct btree_path_level *l,
592                                                     struct bkey *u)
593 {
594         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
595                         bch2_btree_node_iter_peek(&l->iter, l->b));
596
597         path->pos = k.k ? k.k->p : l->b->key.k.p;
598         bch2_btree_path_verify_level(trans, path, l - path->l);
599         return k;
600 }
601
602 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
603                                                     struct btree_path *path,
604                                                     struct btree_path_level *l,
605                                                     struct bkey *u)
606 {
607         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
608                         bch2_btree_node_iter_prev(&l->iter, l->b));
609
610         path->pos = k.k ? k.k->p : l->b->data->min_key;
611         bch2_btree_path_verify_level(trans, path, l - path->l);
612         return k;
613 }
614
615 static inline bool btree_path_advance_to_pos(struct btree_path *path,
616                                              struct btree_path_level *l,
617                                              int max_advance)
618 {
619         struct bkey_packed *k;
620         int nr_advanced = 0;
621
622         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
623                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
624                 if (max_advance > 0 && nr_advanced >= max_advance)
625                         return false;
626
627                 bch2_btree_node_iter_advance(&l->iter, l->b);
628                 nr_advanced++;
629         }
630
631         return true;
632 }
633
634 static inline void __btree_path_level_init(struct btree_path *path,
635                                            unsigned level)
636 {
637         struct btree_path_level *l = &path->l[level];
638
639         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
640
641         /*
642          * Iterators to interior nodes should always be pointed at the first
643          * non-whiteout:
644          */
645         if (level)
646                 bch2_btree_node_iter_peek(&l->iter, l->b);
647 }
648
649 void bch2_btree_path_level_init(struct btree_trans *trans,
650                                 struct btree_path *path,
651                                 struct btree *b)
652 {
653         BUG_ON(path->cached);
654
655         EBUG_ON(!btree_path_pos_in_node(path, b));
656         EBUG_ON(b->c.lock.state.seq & 1);
657
658         path->l[b->c.level].lock_seq = b->c.lock.state.seq;
659         path->l[b->c.level].b = b;
660         __btree_path_level_init(path, b->c.level);
661 }
662
663 /* Btree path: fixups after btree node updates: */
664
665 /*
666  * A btree node is being replaced - update the iterator to point to the new
667  * node:
668  */
669 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
670 {
671         struct btree_path *path;
672
673         trans_for_each_path(trans, path)
674                 if (path->uptodate == BTREE_ITER_UPTODATE &&
675                     !path->cached &&
676                     btree_path_pos_in_node(path, b)) {
677                         enum btree_node_locked_type t =
678                                 btree_lock_want(path, b->c.level);
679
680                         if (t != BTREE_NODE_UNLOCKED) {
681                                 btree_node_unlock(trans, path, b->c.level);
682                                 six_lock_increment(&b->c.lock, t);
683                                 mark_btree_node_locked(trans, path, b->c.level, t);
684                         }
685
686                         bch2_btree_path_level_init(trans, path, b);
687                 }
688 }
689
690 /*
691  * A btree node has been modified in such a way as to invalidate iterators - fix
692  * them:
693  */
694 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
695 {
696         struct btree_path *path;
697
698         trans_for_each_path_with_node(trans, b, path)
699                 __btree_path_level_init(path, b->c.level);
700 }
701
702 /* Btree path: traverse, set_pos: */
703
704 static inline int btree_path_lock_root(struct btree_trans *trans,
705                                        struct btree_path *path,
706                                        unsigned depth_want,
707                                        unsigned long trace_ip)
708 {
709         struct bch_fs *c = trans->c;
710         struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
711         enum six_lock_type lock_type;
712         unsigned i;
713         int ret;
714
715         EBUG_ON(path->nodes_locked);
716
717         while (1) {
718                 b = READ_ONCE(*rootp);
719                 path->level = READ_ONCE(b->c.level);
720
721                 if (unlikely(path->level < depth_want)) {
722                         /*
723                          * the root is at a lower depth than the depth we want:
724                          * we've reached the end of the btree, or we're walking nodes
725                          * greater than some depth and there are no nodes >=
726                          * that depth
727                          */
728                         path->level = depth_want;
729                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
730                                 path->l[i].b = NULL;
731                         return 1;
732                 }
733
734                 lock_type = __btree_lock_want(path, path->level);
735                 ret = btree_node_lock(trans, path, &b->c,
736                                       path->level, lock_type, trace_ip);
737                 if (unlikely(ret)) {
738                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
739                                 continue;
740                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
741                                 return ret;
742                         BUG();
743                 }
744
745                 if (likely(b == READ_ONCE(*rootp) &&
746                            b->c.level == path->level &&
747                            !race_fault())) {
748                         for (i = 0; i < path->level; i++)
749                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
750                         path->l[path->level].b = b;
751                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
752                                 path->l[i].b = NULL;
753
754                         mark_btree_node_locked(trans, path, path->level, lock_type);
755                         bch2_btree_path_level_init(trans, path, b);
756                         return 0;
757                 }
758
759                 six_unlock_type(&b->c.lock, lock_type);
760         }
761 }
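
/*
 * The loop above is the usual optimistic pattern for locking the root:
 * read the root pointer, lock that node, then verify it is still the
 * root (same pointer, same level); if not, drop the lock and retry
 * rather than walking a stale tree.
 */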
762
763 noinline
764 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
765 {
766         struct bch_fs *c = trans->c;
767         struct btree_path_level *l = path_l(path);
768         struct btree_node_iter node_iter = l->iter;
769         struct bkey_packed *k;
770         struct bkey_buf tmp;
771         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
772                 ? (path->level > 1 ? 0 :  2)
773                 : (path->level > 1 ? 1 : 16);
774         bool was_locked = btree_node_locked(path, path->level);
775         int ret = 0;
776
777         bch2_bkey_buf_init(&tmp);
778
779         while (nr-- && !ret) {
780                 if (!bch2_btree_node_relock(trans, path, path->level))
781                         break;
782
783                 bch2_btree_node_iter_advance(&node_iter, l->b);
784                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
785                 if (!k)
786                         break;
787
788                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
789                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
790                                                path->level - 1);
791         }
792
793         if (!was_locked)
794                 btree_node_unlock(trans, path, path->level);
795
796         bch2_bkey_buf_exit(&tmp, c);
797         return ret;
798 }
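
/*
 * Prefetch heuristics: once BCH_FS_STARTED is set we only prefetch a
 * couple of sibling leaf pointers and no interior nodes; before that
 * (presumably during recovery/journal replay) prefetching is much more
 * aggressive, up to 16 leaf nodes ahead.
 */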
799
800 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
801                                  struct btree_and_journal_iter *jiter)
802 {
803         struct bch_fs *c = trans->c;
804         struct bkey_s_c k;
805         struct bkey_buf tmp;
806         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
807                 ? (path->level > 1 ? 0 :  2)
808                 : (path->level > 1 ? 1 : 16);
809         bool was_locked = btree_node_locked(path, path->level);
810         int ret = 0;
811
812         bch2_bkey_buf_init(&tmp);
813
814         while (nr-- && !ret) {
815                 if (!bch2_btree_node_relock(trans, path, path->level))
816                         break;
817
818                 bch2_btree_and_journal_iter_advance(jiter);
819                 k = bch2_btree_and_journal_iter_peek(jiter);
820                 if (!k.k)
821                         break;
822
823                 bch2_bkey_buf_reassemble(&tmp, c, k);
824                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
825                                                path->level - 1);
826         }
827
828         if (!was_locked)
829                 btree_node_unlock(trans, path, path->level);
830
831         bch2_bkey_buf_exit(&tmp, c);
832         return ret;
833 }
834
835 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
836                                             struct btree_path *path,
837                                             unsigned plevel, struct btree *b)
838 {
839         struct btree_path_level *l = &path->l[plevel];
840         bool locked = btree_node_locked(path, plevel);
841         struct bkey_packed *k;
842         struct bch_btree_ptr_v2 *bp;
843
844         if (!bch2_btree_node_relock(trans, path, plevel))
845                 return;
846
847         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
848         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
849
850         bp = (void *) bkeyp_val(&l->b->format, k);
851         bp->mem_ptr = (unsigned long)b;
852
853         if (!locked)
854                 btree_node_unlock(trans, path, plevel);
855 }
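
/*
 * bp->mem_ptr is an in-memory hint cached in the parent's
 * btree_ptr_v2 key: storing the struct btree pointer there lets later
 * lookups of this child skip the node cache hash table. It is only a
 * hint; the lookup path is expected to re-check it against the key
 * before trusting it.
 */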
856
857 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
858                                                      struct btree_path *path,
859                                                      unsigned flags,
860                                                      struct bkey_buf *out)
861 {
862         struct bch_fs *c = trans->c;
863         struct btree_path_level *l = path_l(path);
864         struct btree_and_journal_iter jiter;
865         struct bkey_s_c k;
866         int ret = 0;
867
868         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
869
870         k = bch2_btree_and_journal_iter_peek(&jiter);
871
872         bch2_bkey_buf_reassemble(out, c, k);
873
874         if (flags & BTREE_ITER_PREFETCH)
875                 ret = btree_path_prefetch_j(trans, path, &jiter);
876
877         bch2_btree_and_journal_iter_exit(&jiter);
878         return ret;
879 }
880
881 static __always_inline int btree_path_down(struct btree_trans *trans,
882                                            struct btree_path *path,
883                                            unsigned flags,
884                                            unsigned long trace_ip)
885 {
886         struct bch_fs *c = trans->c;
887         struct btree_path_level *l = path_l(path);
888         struct btree *b;
889         unsigned level = path->level - 1;
890         enum six_lock_type lock_type = __btree_lock_want(path, level);
891         struct bkey_buf tmp;
892         int ret;
893
894         EBUG_ON(!btree_node_locked(path, path->level));
895
896         bch2_bkey_buf_init(&tmp);
897
898         if (unlikely(trans->journal_replay_not_finished)) {
899                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
900                 if (ret)
901                         goto err;
902         } else {
903                 bch2_bkey_buf_unpack(&tmp, c, l->b,
904                                  bch2_btree_node_iter_peek(&l->iter, l->b));
905
906                 if (flags & BTREE_ITER_PREFETCH) {
907                         ret = btree_path_prefetch(trans, path);
908                         if (ret)
909                                 goto err;
910                 }
911         }
912
913         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
914         ret = PTR_ERR_OR_ZERO(b);
915         if (unlikely(ret))
916                 goto err;
917
918         if (likely(!trans->journal_replay_not_finished &&
919                    tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
920             unlikely(b != btree_node_mem_ptr(tmp.k)))
921                 btree_node_mem_ptr_set(trans, path, level + 1, b);
922
923         if (btree_node_read_locked(path, level + 1))
924                 btree_node_unlock(trans, path, level + 1);
925
926         mark_btree_node_locked(trans, path, level, lock_type);
927         path->level = level;
928         bch2_btree_path_level_init(trans, path, b);
929
930         bch2_btree_path_verify_locks(path);
931 err:
932         bch2_bkey_buf_exit(&tmp, c);
933         return ret;
934 }
935
936 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
937                                    unsigned, unsigned long);
938
939 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
940 {
941         struct bch_fs *c = trans->c;
942         struct btree_path *path;
943         unsigned long trace_ip = _RET_IP_;
944         int ret = 0;
945
946         if (trans->in_traverse_all)
947                 return -BCH_ERR_transaction_restart_in_traverse_all;
948
949         trans->in_traverse_all = true;
950 retry_all:
951         trans->restarted = 0;
952         trans->traverse_all_idx = U8_MAX;
953
954         trans_for_each_path(trans, path)
955                 path->should_be_locked = false;
956
957         btree_trans_verify_sorted(trans);
958
959         bch2_trans_unlock(trans);
960         cond_resched();
961
962         if (unlikely(trans->memory_allocation_failure)) {
963                 struct closure cl;
964
965                 closure_init_stack(&cl);
966
967                 do {
968                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
969                         closure_sync(&cl);
970                 } while (ret);
971         }
972
973         /* Now, redo traversals in correct order: */
974         trans->traverse_all_idx = 0;
975         while (trans->traverse_all_idx < trans->nr_sorted) {
976                 path = trans->paths + trans->sorted[trans->traverse_all_idx];
977
978                 /*
979                  * Traversing a path can cause another path to be added at about
980                  * the same position:
981                  */
982                 if (path->uptodate) {
983                         ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
984                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
985                             ret == -ENOMEM)
986                                 goto retry_all;
987                         if (ret)
988                                 goto err;
989                         BUG_ON(path->uptodate);
990                 } else {
991                         trans->traverse_all_idx++;
992                 }
993         }
994
995         /*
996          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
997          * and relock(), relock() won't relock since path->should_be_locked
998          * isn't set yet, which is all fine
999          */
1000         trans_for_each_path(trans, path)
1001                 BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
1002 err:
1003         bch2_btree_cache_cannibalize_unlock(c);
1004
1005         trans->in_traverse_all = false;
1006
1007         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1008         return ret;
1009 }
1010
1011 static inline bool btree_path_check_pos_in_node(struct btree_path *path,
1012                                                 unsigned l, int check_pos)
1013 {
1014         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1015                 return false;
1016         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1017                 return false;
1018         return true;
1019 }
1020
1021 static inline bool btree_path_good_node(struct btree_trans *trans,
1022                                         struct btree_path *path,
1023                                         unsigned l, int check_pos)
1024 {
1025         return is_btree_node(path, l) &&
1026                 bch2_btree_node_relock(trans, path, l) &&
1027                 btree_path_check_pos_in_node(path, l, check_pos);
1028 }
1029
1030 static void btree_path_set_level_down(struct btree_trans *trans,
1031                                       struct btree_path *path,
1032                                       unsigned new_level)
1033 {
1034         unsigned l;
1035
1036         path->level = new_level;
1037
1038         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1039                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1040                         btree_node_unlock(trans, path, l);
1041
1042         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1043         bch2_btree_path_verify(trans, path);
1044 }
1045
1046 static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
1047                                                          struct btree_path *path,
1048                                                          int check_pos)
1049 {
1050         unsigned i, l = path->level;
1051 again:
1052         while (btree_path_node(path, l) &&
1053                !btree_path_good_node(trans, path, l, check_pos))
1054                 __btree_path_set_level_up(trans, path, l++);
1055
1056         /* If we need intent locks, take them too: */
1057         for (i = l + 1;
1058              i < path->locks_want && btree_path_node(path, i);
1059              i++)
1060                 if (!bch2_btree_node_relock(trans, path, i)) {
1061                         while (l <= i)
1062                                 __btree_path_set_level_up(trans, path, l++);
1063                         goto again;
1064                 }
1065
1066         return l;
1067 }
1068
1069 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1070                                                      struct btree_path *path,
1071                                                      int check_pos)
1072 {
1073         return likely(btree_node_locked(path, path->level) &&
1074                       btree_path_check_pos_in_node(path, path->level, check_pos))
1075                 ? path->level
1076                 : __btree_path_up_until_good_node(trans, path, check_pos);
1077 }
1078
1079 /*
1080  * This is the main state machine for walking down the btree - walks down to a
1081  * specified depth
1082  *
1083  * Returns 0 on success, -EIO on error (error reading in a btree node).
1084  *
1085  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1086  * stashed in the iterator and returned from bch2_trans_exit().
1087  */
1088 static int btree_path_traverse_one(struct btree_trans *trans,
1089                                    struct btree_path *path,
1090                                    unsigned flags,
1091                                    unsigned long trace_ip)
1092 {
1093         unsigned depth_want = path->level;
1094         int ret = -((int) trans->restarted);
1095
1096         if (unlikely(ret))
1097                 goto out;
1098
1099         /*
1100          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1101          * and re-traverse the path without a transaction restart:
1102          */
1103         if (path->should_be_locked) {
1104                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1105                 goto out;
1106         }
1107
1108         if (path->cached) {
1109                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1110                 goto out;
1111         }
1112
1113         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1114                 goto out;
1115
1116         path->level = btree_path_up_until_good_node(trans, path, 0);
1117
1118         EBUG_ON(btree_path_node(path, path->level) &&
1119                 !btree_node_locked(path, path->level));
1120
1121         /*
1122          * Note: path->nodes[path->level] may be temporarily NULL here - that
1123          * would indicate to other code that we got to the end of the btree,
1124          * here it indicates that relocking the root failed - it's critical that
1125          * btree_path_lock_root() comes next and that it can't fail
1126          */
1127         while (path->level > depth_want) {
1128                 ret = btree_path_node(path, path->level)
1129                         ? btree_path_down(trans, path, flags, trace_ip)
1130                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1131                 if (unlikely(ret)) {
1132                         if (ret == 1) {
1133                                 /*
1134                                  * No nodes at this level - got to the end of
1135                                  * the btree:
1136                                  */
1137                                 ret = 0;
1138                                 goto out;
1139                         }
1140
1141                         __bch2_btree_path_unlock(trans, path);
1142                         path->level = depth_want;
1143                         path->l[path->level].b = ERR_PTR(ret);
1144                         goto out;
1145                 }
1146         }
1147
1148         path->uptodate = BTREE_ITER_UPTODATE;
1149 out:
1150         BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
1151         bch2_btree_path_verify(trans, path);
1152         return ret;
1153 }
1154
1155 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
1156                                           struct btree_path *path, unsigned flags)
1157 {
1158         if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1159                 unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
1160                 u64 mask = ~(~0ULL << restart_probability_bits);
1161
1162                 if ((prandom_u32() & mask) == mask) {
1163                         trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
1164                         return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
1165                 }
1166         }
1167
1168         if (path->uptodate < BTREE_ITER_NEED_RELOCK)
1169                 return 0;
1170
1171         return  bch2_trans_cond_resched(trans) ?:
1172                 btree_path_traverse_one(trans, path, flags, _RET_IP_);
1173 }
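
/*
 * Typical caller pattern (sketch, as used by the btree iterator code
 * further down in this file):
 *
 *	int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 *	if (ret)
 *		return bkey_s_c_err(ret);
 */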
1174
1175 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1176                             struct btree_path *src)
1177 {
1178         unsigned i, offset = offsetof(struct btree_path, pos);
1179
1180         memcpy((void *) dst + offset,
1181                (void *) src + offset,
1182                sizeof(struct btree_path) - offset);
1183
1184         for (i = 0; i < BTREE_MAX_DEPTH; i++) {
1185                 unsigned t = btree_node_locked_type(dst, i);
1186
1187                 if (t != BTREE_NODE_UNLOCKED)
1188                         six_lock_increment(&dst->l[i].b->c.lock, t);
1189         }
1190 }
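
/*
 * Note on the copy above: everything from 'pos' onwards is copied
 * wholesale, while the fields before it (the path's own identity and
 * refcount bookkeeping) are left untouched; six_lock_increment() then
 * accounts for the node locks now being referenced by both paths.
 */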
1191
1192 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1193                                            bool intent)
1194 {
1195         struct btree_path *new = btree_path_alloc(trans, src);
1196
1197         btree_path_copy(trans, new, src);
1198         __btree_path_get(new, intent);
1199         return new;
1200 }
1201
1202 __flatten
1203 struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
1204                          struct btree_path *path, bool intent,
1205                          unsigned long ip)
1206 {
1207         __btree_path_put(path, intent);
1208         path = btree_path_clone(trans, path, intent);
1209         path->preserve = false;
1210 #ifdef CONFIG_BCACHEFS_DEBUG
1211         path->ip_allocated = ip;
1212 #endif
1213         btree_trans_verify_sorted(trans);
1214         return path;
1215 }
1216
1217 struct btree_path * __must_check
1218 __bch2_btree_path_set_pos(struct btree_trans *trans,
1219                    struct btree_path *path, struct bpos new_pos,
1220                    bool intent, unsigned long ip, int cmp)
1221 {
1222         unsigned l = path->level;
1223
1224         EBUG_ON(trans->restarted);
1225         EBUG_ON(!path->ref);
1226
1227         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1228
1229         path->pos = new_pos;
1230
1231         bch2_btree_path_check_sort_fast(trans, path, cmp);
1232
1233         if (unlikely(path->cached)) {
1234                 btree_node_unlock(trans, path, 0);
1235                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1236                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1237                 goto out;
1238         }
1239
1240         l = btree_path_up_until_good_node(trans, path, cmp);
1241
1242         if (btree_path_node(path, l)) {
1243                 BUG_ON(!btree_node_locked(path, l));
1244                 /*
1245                  * We might have to skip over many keys, or just a few: try
1246                  * advancing the node iterator, and if we have to skip over too
1247                  * many keys just reinit it (or if we're rewinding, since that
1248                  * is expensive).
1249                  */
1250                 if (cmp < 0 ||
1251                     !btree_path_advance_to_pos(path, &path->l[l], 8))
1252                         __btree_path_level_init(path, l);
1253         }
1254
1255         if (unlikely(l != path->level)) {
1256                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1257                 __bch2_btree_path_unlock(trans, path);
1258         }
1259 out:
1260         bch2_btree_path_verify(trans, path);
1261         return path;
1262 }
1263
1264 /* Btree path: main interface: */
1265
1266 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1267 {
1268         struct btree_path *sib;
1269
1270         sib = prev_btree_path(trans, path);
1271         if (sib && !btree_path_cmp(sib, path))
1272                 return sib;
1273
1274         sib = next_btree_path(trans, path);
1275         if (sib && !btree_path_cmp(sib, path))
1276                 return sib;
1277
1278         return NULL;
1279 }
1280
1281 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1282 {
1283         struct btree_path *sib;
1284
1285         sib = prev_btree_path(trans, path);
1286         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1287                 return sib;
1288
1289         sib = next_btree_path(trans, path);
1290         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1291                 return sib;
1292
1293         return NULL;
1294 }
1295
1296 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1297 {
1298         __bch2_btree_path_unlock(trans, path);
1299         btree_path_list_remove(trans, path);
1300         trans->paths_allocated &= ~(1ULL << path->idx);
1301 }
1302
1303 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1304 {
1305         struct btree_path *dup;
1306
1307         EBUG_ON(trans->paths + path->idx != path);
1308         EBUG_ON(!path->ref);
1309
1310         if (!__btree_path_put(path, intent))
1311                 return;
1312
1313         dup = path->preserve
1314                 ? have_path_at_pos(trans, path)
1315                 : have_node_at_pos(trans, path);
1316
1317         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1318                 return;
1319
1320         if (path->should_be_locked &&
1321             !trans->restarted &&
1322             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1323                 return;
1324
1325         if (dup) {
1326                 dup->preserve           |= path->preserve;
1327                 dup->should_be_locked   |= path->should_be_locked;
1328         }
1329
1330         __bch2_path_free(trans, path);
1331 }
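
/*
 * Summarizing the logic above, roughly: on dropping the last reference
 * the path is freed if a duplicate exists to inherit its
 * preserve/should_be_locked flags, or if it is neither preserved nor
 * pointing at a valid btree node - except that a should_be_locked path
 * is only freed when the transaction has restarted or the duplicate
 * could be relocked in its place.
 */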
1332
1333 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1334                                  bool intent)
1335 {
1336         EBUG_ON(trans->paths + path->idx != path);
1337         EBUG_ON(!path->ref);
1338
1339         if (!__btree_path_put(path, intent))
1340                 return;
1341
1342         __bch2_path_free(trans, path);
1343 }
1344
1345 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1346 {
1347         struct btree_insert_entry *i;
1348
1349         prt_printf(buf, "transaction updates for %s journal seq %llu",
1350                trans->fn, trans->journal_res.seq);
1351         prt_newline(buf);
1352         printbuf_indent_add(buf, 2);
1353
1354         trans_for_each_update(trans, i) {
1355                 struct bkey_s_c old = { &i->old_k, i->old_v };
1356
1357                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1358                        bch2_btree_ids[i->btree_id],
1359                        i->cached,
1360                        (void *) i->ip_allocated);
1361                 prt_newline(buf);
1362
1363                 prt_printf(buf, "  old ");
1364                 bch2_bkey_val_to_text(buf, trans->c, old);
1365                 prt_newline(buf);
1366
1367                 prt_printf(buf, "  new ");
1368                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1369                 prt_newline(buf);
1370         }
1371
1372         printbuf_indent_sub(buf, 2);
1373 }
1374
1375 noinline __cold
1376 void bch2_dump_trans_updates(struct btree_trans *trans)
1377 {
1378         struct printbuf buf = PRINTBUF;
1379
1380         bch2_trans_updates_to_text(&buf, trans);
1381         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1382         printbuf_exit(&buf);
1383 }
1384
1385 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1386 {
1387         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1388                    path->idx, path->ref, path->intent_ref,
1389                    path->preserve ? 'P' : ' ',
1390                    path->should_be_locked ? 'S' : ' ',
1391                    bch2_btree_ids[path->btree_id],
1392                    path->level);
1393         bch2_bpos_to_text(out, path->pos);
1394
1395         prt_printf(out, " locks %u", path->nodes_locked);
1396 #ifdef CONFIG_BCACHEFS_DEBUG
1397         prt_printf(out, " %pS", (void *) path->ip_allocated);
1398 #endif
1399         prt_newline(out);
1400 }
1401
1402 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1403 {
1404         struct btree_path *path;
1405         unsigned idx;
1406
1407         trans_for_each_path_inorder(trans, path, idx)
1408                 bch2_btree_path_to_text(out, path);
1409 }
1410
1411 noinline __cold
1412 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1413 {
1414         struct printbuf buf = PRINTBUF;
1415
1416         bch2_trans_paths_to_text(&buf, trans);
1417         bch2_trans_updates_to_text(&buf, trans);
1418
1419         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1420         printbuf_exit(&buf);
1421 }
1422
1423 noinline
1424 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1425 {
1426         struct btree_transaction_stats *s = btree_trans_stats(trans);
1427         struct printbuf buf = PRINTBUF;
1428
1429         bch2_trans_paths_to_text(&buf, trans);
1430
1431         if (!buf.allocation_failure) {
1432                 mutex_lock(&s->lock);
1433                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1434                         s->nr_max_paths = trans->nr_max_paths =
1435                                 hweight64(trans->paths_allocated);
1436                         swap(s->max_paths_text, buf.buf);
1437                 }
1438                 mutex_unlock(&s->lock);
1439         }
1440
1441         printbuf_exit(&buf);
1442 }
1443
1444 static noinline void btree_path_overflow(struct btree_trans *trans)
1445 {
1446         bch2_dump_trans_paths_updates(trans);
1447         panic("trans path overflow\n");
1448 }
1449
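/*
 * Path allocation works off a single u64 bitmap (trans->paths_allocated):
 * __ffs64() of its complement picks the first free slot. The
 * full-bitmap check below is written as
 * ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)) so that a mask of
 * BTREE_ITER_MAX low bits can be formed even when BTREE_ITER_MAX is 64,
 * where a single shift by 64 would be undefined behaviour.
 */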
1450 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1451                                                   struct btree_path *pos)
1452 {
1453         struct btree_path *path;
1454         unsigned idx;
1455
1456         if (unlikely(trans->paths_allocated ==
1457                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1458                 btree_path_overflow(trans);
1459
1460         idx = __ffs64(~trans->paths_allocated);
1461         trans->paths_allocated |= 1ULL << idx;
1462
1463         if (unlikely(idx > trans->nr_max_paths))
1464                 bch2_trans_update_max_paths(trans);
1465
1466         path = &trans->paths[idx];
1467
1468         path->idx               = idx;
1469         path->ref               = 0;
1470         path->intent_ref        = 0;
1471         path->nodes_locked      = 0;
1472
1473         btree_path_list_add(trans, pos, path);
1474         return path;
1475 }
1476
1477 struct btree_path *bch2_path_get(struct btree_trans *trans,
1478                                  enum btree_id btree_id, struct bpos pos,
1479                                  unsigned locks_want, unsigned level,
1480                                  unsigned flags, unsigned long ip)
1481 {
1482         struct btree_path *path, *path_pos = NULL;
1483         bool cached = flags & BTREE_ITER_CACHED;
1484         bool intent = flags & BTREE_ITER_INTENT;
1485         int i;
1486
1487         EBUG_ON(trans->restarted);
1488         btree_trans_verify_sorted(trans);
1489         bch2_trans_verify_locks(trans);
1490
1491         trans_for_each_path_inorder(trans, path, i) {
1492                 if (__btree_path_cmp(path,
1493                                      btree_id,
1494                                      cached,
1495                                      pos,
1496                                      level) > 0)
1497                         break;
1498
1499                 path_pos = path;
1500         }
1501
1502         if (path_pos &&
1503             path_pos->cached    == cached &&
1504             path_pos->btree_id  == btree_id &&
1505             path_pos->level     == level) {
1506                 __btree_path_get(path_pos, intent);
1507                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1508         } else {
1509                 path = btree_path_alloc(trans, path_pos);
1510                 path_pos = NULL;
1511
1512                 __btree_path_get(path, intent);
1513                 path->pos                       = pos;
1514                 path->btree_id                  = btree_id;
1515                 path->cached                    = cached;
1516                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1517                 path->should_be_locked          = false;
1518                 path->level                     = level;
1519                 path->locks_want                = locks_want;
1520                 path->nodes_locked              = 0;
1521                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1522                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1523 #ifdef CONFIG_BCACHEFS_DEBUG
1524                 path->ip_allocated              = ip;
1525 #endif
1526                 btree_trans_verify_sorted(trans);
1527         }
1528
1529         if (!(flags & BTREE_ITER_NOPRESERVE))
1530                 path->preserve = true;
1531
1532         if (path->intent_ref)
1533                 locks_want = max(locks_want, level + 1);
1534
1535         /*
1536          * If the path has locks_want greater than requested, we don't downgrade
1537          * it here - on transaction restart because btree node split needs to
1538          * upgrade locks, we might be putting/getting the iterator again.
1539          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1540          * a successful transaction commit.
1541          */
1542
1543         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1544         if (locks_want > path->locks_want)
1545                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1546
1547         return path;
1548 }
1549
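/*
 * bch2_btree_path_peek_slot - peek the key the path currently points at
 *
 * For uncached paths this disassembles the packed key at path->pos from the
 * node iterator; for cached paths it returns the bkey_cached entry. If there
 * is no key at exactly path->pos, a zero size deleted key at path->pos is
 * synthesized in @u and returned.
 */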
1550 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1551 {
1552
1553         struct btree_path_level *l = path_l(path);
1554         struct bkey_packed *_k;
1555         struct bkey_s_c k;
1556
1557         if (unlikely(!l->b))
1558                 return bkey_s_c_null;
1559
1560         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1561         EBUG_ON(!btree_node_locked(path, path->level));
1562
1563         if (!path->cached) {
1564                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1565                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1566
1567                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1568
1569                 if (!k.k || !bpos_eq(path->pos, k.k->p))
1570                         goto hole;
1571         } else {
1572                 struct bkey_cached *ck = (void *) path->l[0].b;
1573
1574                 EBUG_ON(ck &&
1575                         (path->btree_id != ck->key.btree_id ||
1576                          !bkey_eq(path->pos, ck->key.pos)));
1577                 EBUG_ON(!ck || !ck->valid);
1578
1579                 *u = ck->k->k;
1580                 k = bkey_i_to_s_c(ck->k);
1581         }
1582
1583         return k;
1584 hole:
1585         bkey_init(u);
1586         u->p = path->pos;
1587         return (struct bkey_s_c) { u, NULL };
1588 }
1589
1590 /* Btree iterators: */
1591
1592 int __must_check
1593 __bch2_btree_iter_traverse(struct btree_iter *iter)
1594 {
1595         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1596 }
1597
1598 int __must_check
1599 bch2_btree_iter_traverse(struct btree_iter *iter)
1600 {
1601         int ret;
1602
1603         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1604                                         btree_iter_search_key(iter),
1605                                         iter->flags & BTREE_ITER_INTENT,
1606                                         btree_iter_ip_allocated(iter));
1607
1608         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1609         if (ret)
1610                 return ret;
1611
1612         btree_path_set_should_be_locked(iter->path);
1613         return 0;
1614 }
1615
1616 /* Iterate across nodes (leaf and interior nodes) */
1617
1618 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1619 {
1620         struct btree_trans *trans = iter->trans;
1621         struct btree *b = NULL;
1622         int ret;
1623
1624         EBUG_ON(iter->path->cached);
1625         bch2_btree_iter_verify(iter);
1626
1627         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1628         if (ret)
1629                 goto err;
1630
1631         b = btree_path_node(iter->path, iter->path->level);
1632         if (!b)
1633                 goto out;
1634
1635         BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1636
1637         bkey_init(&iter->k);
1638         iter->k.p = iter->pos = b->key.k.p;
1639
1640         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1641                                         iter->flags & BTREE_ITER_INTENT,
1642                                         btree_iter_ip_allocated(iter));
1643         btree_path_set_should_be_locked(iter->path);
1644 out:
1645         bch2_btree_iter_verify_entry_exit(iter);
1646         bch2_btree_iter_verify(iter);
1647
1648         return b;
1649 err:
1650         b = ERR_PTR(ret);
1651         goto out;
1652 }
1653
1654 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1655 {
1656         struct btree_trans *trans = iter->trans;
1657         struct btree_path *path = iter->path;
1658         struct btree *b = NULL;
1659         int ret;
1660
1661         BUG_ON(trans->restarted);
1662         EBUG_ON(iter->path->cached);
1663         bch2_btree_iter_verify(iter);
1664
1665         /* already at end? */
1666         if (!btree_path_node(path, path->level))
1667                 return NULL;
1668
1669         /* at the root? then there's no next node: */
1670         if (!btree_path_node(path, path->level + 1)) {
1671                 btree_path_set_level_up(trans, path);
1672                 return NULL;
1673         }
1674
1675         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1676                 __bch2_btree_path_unlock(trans, path);
1677                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1678                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1679                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1680                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1681                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1682                 goto err;
1683         }
1684
1685         b = btree_path_node(path, path->level + 1);
1686
1687         if (bpos_eq(iter->pos, b->key.k.p)) {
1688                 __btree_path_set_level_up(trans, path, path->level++);
1689         } else {
1690                 /*
1691                  * Haven't gotten to the end of the parent node: go back down to
1692                  * the next child node
1693                  */
1694                 path = iter->path =
1695                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1696                                            iter->flags & BTREE_ITER_INTENT,
1697                                            btree_iter_ip_allocated(iter));
1698
1699                 btree_path_set_level_down(trans, path, iter->min_depth);
1700
1701                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1702                 if (ret)
1703                         goto err;
1704
1705                 b = path->l[path->level].b;
1706         }
1707
1708         bkey_init(&iter->k);
1709         iter->k.p = iter->pos = b->key.k.p;
1710
1711         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1712                                         iter->flags & BTREE_ITER_INTENT,
1713                                         btree_iter_ip_allocated(iter));
1714         btree_path_set_should_be_locked(iter->path);
1715         BUG_ON(iter->path->uptodate);
1716 out:
1717         bch2_btree_iter_verify_entry_exit(iter);
1718         bch2_btree_iter_verify(iter);
1719
1720         return b;
1721 err:
1722         b = ERR_PTR(ret);
1723         goto out;
1724 }
1725
1726 /* Iterate across keys (in leaf nodes only) */
1727
1728 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1729 {
1730         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1731                 struct bpos pos = iter->k.p;
1732                 bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1733                              ? bpos_eq(pos, SPOS_MAX)
1734                              : bkey_eq(pos, SPOS_MAX));
1735
1736                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1737                         pos = bkey_successor(iter, pos);
1738                 bch2_btree_iter_set_pos(iter, pos);
1739                 return ret;
1740         } else {
1741                 if (!btree_path_node(iter->path, iter->path->level))
1742                         return true;
1743
1744                 iter->advanced = true;
1745                 return false;
1746         }
1747 }
1748
1749 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1750 {
1751         struct bpos pos = bkey_start_pos(&iter->k);
1752         bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1753                      ? bpos_eq(pos, POS_MIN)
1754                      : bkey_eq(pos, POS_MIN));
1755
1756         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1757                 pos = bkey_predecessor(iter, pos);
1758         bch2_btree_iter_set_pos(iter, pos);
1759         return ret;
1760 }
1761
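/*
 * Scan the transaction's pending updates for the first update to @btree_id at
 * or after @pos, skipping updates that only exist to flush the key cache.
 */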
1762 static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1763                                                       enum btree_id btree_id,
1764                                                       struct bpos pos)
1765 {
1766         struct btree_insert_entry *i;
1767         struct bkey_i *ret = NULL;
1768
1769         trans_for_each_update(trans, i) {
1770                 if (i->btree_id < btree_id)
1771                         continue;
1772                 if (i->btree_id > btree_id)
1773                         break;
1774                 if (bpos_lt(i->k->k.p, pos))
1775                         continue;
1776                 if (i->key_cache_already_flushed)
1777                         continue;
1778                 if (!ret || bpos_lt(i->k->k.p, ret->k.p))
1779                         ret = i->k;
1780         }
1781
1782         return ret;
1783 }
1784
1785 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1786                                        struct btree_iter *iter,
1787                                        struct bpos start_pos,
1788                                        struct bpos end_pos)
1789 {
1790         struct bkey_i *k;
1791
1792         if (bpos_lt(start_pos, iter->journal_pos))
1793                 iter->journal_idx = 0;
1794
1795         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
1796                                         iter->path->level,
1797                                         start_pos, end_pos,
1798                                         &iter->journal_idx);
1799
1800         iter->journal_pos = k ? k->k.p : end_pos;
1801         return k;
1802 }
1803
1804 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
1805                                             struct btree_iter *iter,
1806                                             struct bpos pos)
1807 {
1808         return bch2_btree_journal_peek(trans, iter, pos, pos);
1809 }
1810
1811 static noinline
1812 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1813                                          struct btree_iter *iter,
1814                                          struct bkey_s_c k)
1815 {
1816         struct bkey_i *next_journal =
1817                 bch2_btree_journal_peek(trans, iter, iter->path->pos,
1818                                 k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
1819
1820         if (next_journal) {
1821                 iter->k = next_journal->k;
1822                 k = bkey_i_to_s_c(next_journal);
1823         }
1824
1825         return k;
1826 }
1827
1828 /*
1829  * Checks the btree key cache for a key at @pos and returns it if present, or
1830  * bkey_s_c_null:
1831  */
1832 static noinline
1833 struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1834 {
1835         struct btree_trans *trans = iter->trans;
1836         struct bch_fs *c = trans->c;
1837         struct bkey u;
1838         int ret;
1839
1840         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1841                 return bkey_s_c_null;
1842
1843         if (!iter->key_cache_path)
1844                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1845                                                      iter->flags & BTREE_ITER_INTENT, 0,
1846                                                      iter->flags|BTREE_ITER_CACHED,
1847                                                      _THIS_IP_);
1848
1849         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1850                                         iter->flags & BTREE_ITER_INTENT,
1851                                         btree_iter_ip_allocated(iter));
1852
1853         ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
1854         if (unlikely(ret))
1855                 return bkey_s_c_err(ret);
1856
1857         btree_path_set_should_be_locked(iter->key_cache_path);
1858
1859         return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1860 }
1861
1862 static noinline
1863 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1864 {
1865         struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
1866         int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
1867
1868         return err ? bkey_s_c_err(err) : ret;
1869 }
1870
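/*
 * __bch2_btree_iter_peek - main lookup loop for bch2_btree_iter_peek_upto()
 *
 * Repositions the path at @search_key and traverses it, then merges the key
 * from the btree with the key cache, the journal and pending transaction
 * updates as the iterator flags dictate. Whiteouts advance the search key and
 * loop; running off the end of a leaf advances to the next leaf, and hitting
 * the end of the btree returns bkey_s_c_null.
 */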
1871 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1872 {
1873         struct btree_trans *trans = iter->trans;
1874         struct bkey_i *next_update;
1875         struct bkey_s_c k, k2;
1876         int ret;
1877
1878         EBUG_ON(iter->path->cached);
1879         bch2_btree_iter_verify(iter);
1880
1881         while (1) {
1882                 struct btree_path_level *l;
1883
1884                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1885                                         iter->flags & BTREE_ITER_INTENT,
1886                                         btree_iter_ip_allocated(iter));
1887
1888                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1889                 if (unlikely(ret)) {
1890                         /* ensure that iter->k is consistent with iter->pos: */
1891                         bch2_btree_iter_set_pos(iter, iter->pos);
1892                         k = bkey_s_c_err(ret);
1893                         goto out;
1894                 }
1895
1896                 l = path_l(iter->path);
1897
1898                 if (unlikely(!l->b)) {
1899                         /* No btree nodes at requested level: */
1900                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1901                         k = bkey_s_c_null;
1902                         goto out;
1903                 }
1904
1905                 btree_path_set_should_be_locked(iter->path);
1906
1907                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
1908
1909                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1910                     k.k &&
1911                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1912                         k = k2;
1913                         ret = bkey_err(k);
1914                         if (ret) {
1915                                 bch2_btree_iter_set_pos(iter, iter->pos);
1916                                 goto out;
1917                         }
1918                 }
1919
1920                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
1921                         k = btree_trans_peek_journal(trans, iter, k);
1922
1923                 next_update = iter->flags & BTREE_ITER_WITH_UPDATES
1924                         ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
1925                         : NULL;
1926                 if (next_update &&
1927                     bpos_le(next_update->k.p,
1928                             k.k ? k.k->p : l->b->key.k.p)) {
1929                         iter->k = next_update->k;
1930                         k = bkey_i_to_s_c(next_update);
1931                 }
1932
1933                 if (k.k && bkey_deleted(k.k)) {
1934                         /*
1935                          * If we've got a whiteout, and it's after the search
1936                          * key, advance the search key to the whiteout instead
1937                          * of just after the whiteout - it might be a btree
1938                          * whiteout, with a real key at the same position, since
1939                          * in the btree, deleted keys sort before non-deleted keys.
1940                          */
1941                         search_key = !bpos_eq(search_key, k.k->p)
1942                                 ? k.k->p
1943                                 : bpos_successor(k.k->p);
1944                         continue;
1945                 }
1946
1947                 if (likely(k.k)) {
1948                         break;
1949                 } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
1950                         /* Advance to next leaf node: */
1951                         search_key = bpos_successor(l->b->key.k.p);
1952                 } else {
1953                         /* End of btree: */
1954                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1955                         k = bkey_s_c_null;
1956                         goto out;
1957                 }
1958         }
1959 out:
1960         bch2_btree_iter_verify(iter);
1961
1962         return k;
1963 }
1964
1965 /**
1966  * bch2_btree_iter_peek_upto: returns first key greater than or equal to iterator's
1967  * current position, but not past @end
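 *
 * A rough usage sketch (illustrative only; do_work() is a placeholder, and
 * error handling/transaction restarts are elided):
 *
 *	bch2_trans_iter_init(trans, &iter, btree_id, start, flags);
 *	while ((k = bch2_btree_iter_peek_upto(&iter, end)).k &&
 *	       !(ret = bkey_err(k))) {
 *		do_work(k);
 *		bch2_btree_iter_advance(&iter);
 *	}
 *	bch2_trans_iter_exit(trans, &iter);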
1968  */
1969 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
1970 {
1971         struct btree_trans *trans = iter->trans;
1972         struct bpos search_key = btree_iter_search_key(iter);
1973         struct bkey_s_c k;
1974         struct bpos iter_pos;
1975         int ret;
1976
1977         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
1978         EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
1979
1980         if (iter->update_path) {
1981                 bch2_path_put_nokeep(trans, iter->update_path,
1982                                      iter->flags & BTREE_ITER_INTENT);
1983                 iter->update_path = NULL;
1984         }
1985
1986         bch2_btree_iter_verify_entry_exit(iter);
1987
1988         while (1) {
1989                 k = __bch2_btree_iter_peek(iter, search_key);
1990                 if (unlikely(!k.k))
1991                         goto end;
1992                 if (unlikely(bkey_err(k)))
1993                         goto out_no_locked;
1994
1995                 /*
1996                  * iter->pos should be monotonically increasing, and always be
1997                  * equal to the key we just returned - except extents can
1998                  * straddle iter->pos:
1999                  */
2000                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
2001                         iter_pos = k.k->p;
2002                 else
2003                         iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2004
2005                 if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
2006                              ? bkey_gt(iter_pos, end)
2007                              : bkey_ge(iter_pos, end)))
2008                         goto end;
2009
2010                 if (iter->update_path &&
2011                     !bkey_eq(iter->update_path->pos, k.k->p)) {
2012                         bch2_path_put_nokeep(trans, iter->update_path,
2013                                              iter->flags & BTREE_ITER_INTENT);
2014                         iter->update_path = NULL;
2015                 }
2016
2017                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2018                     (iter->flags & BTREE_ITER_INTENT) &&
2019                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2020                     !iter->update_path) {
2021                         struct bpos pos = k.k->p;
2022
2023                         if (pos.snapshot < iter->snapshot) {
2024                                 search_key = bpos_successor(k.k->p);
2025                                 continue;
2026                         }
2027
2028                         pos.snapshot = iter->snapshot;
2029
2030                         /*
2031                          * advance, same as on exit for iter->path, but only up
2032                          * to snapshot
2033                          */
2034                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2035                         iter->update_path = iter->path;
2036
2037                         iter->update_path = bch2_btree_path_set_pos(trans,
2038                                                 iter->update_path, pos,
2039                                                 iter->flags & BTREE_ITER_INTENT,
2040                                                 _THIS_IP_);
2041                 }
2042
2043                 /*
2044                  * We can never have a key in a leaf node at POS_MAX, so
2045                  * we don't have to check these successor() calls:
2046                  */
2047                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2048                     !bch2_snapshot_is_ancestor(trans->c,
2049                                                iter->snapshot,
2050                                                k.k->p.snapshot)) {
2051                         search_key = bpos_successor(k.k->p);
2052                         continue;
2053                 }
2054
2055                 if (bkey_whiteout(k.k) &&
2056                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2057                         search_key = bkey_successor(iter, k.k->p);
2058                         continue;
2059                 }
2060
2061                 break;
2062         }
2063
2064         iter->pos = iter_pos;
2065
2066         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2067                                 iter->flags & BTREE_ITER_INTENT,
2068                                 btree_iter_ip_allocated(iter));
2069
2070         btree_path_set_should_be_locked(iter->path);
2071 out_no_locked:
2072         if (iter->update_path) {
2073                 if (iter->update_path->uptodate &&
2074                     (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
2075                         k = bkey_s_c_err(ret);
2076                 else
2077                         btree_path_set_should_be_locked(iter->update_path);
2078         }
2079
2080         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2081                 iter->pos.snapshot = iter->snapshot;
2082
2083         ret = bch2_btree_iter_verify_ret(iter, k);
2084         if (unlikely(ret)) {
2085                 bch2_btree_iter_set_pos(iter, iter->pos);
2086                 k = bkey_s_c_err(ret);
2087         }
2088
2089         bch2_btree_iter_verify_entry_exit(iter);
2090
2091         return k;
2092 end:
2093         bch2_btree_iter_set_pos(iter, end);
2094         k = bkey_s_c_null;
2095         goto out_no_locked;
2096 }
2097
2098 /**
2099  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2100  * to iterator's current position, returning keys from every level of the btree.
2101  * For keys at different levels of the btree that compare equal, the key from
2102  * the lower level (leaf) is returned first.
2103  */
2104 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2105 {
2106         struct btree_trans *trans = iter->trans;
2107         struct bkey_s_c k;
2108         int ret;
2109
2110         EBUG_ON(iter->path->cached);
2111         bch2_btree_iter_verify(iter);
2112         BUG_ON(iter->path->level < iter->min_depth);
2113         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2114         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2115
2116         while (1) {
2117                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2118                                         iter->flags & BTREE_ITER_INTENT,
2119                                         btree_iter_ip_allocated(iter));
2120
2121                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2122                 if (unlikely(ret)) {
2123                         /* ensure that iter->k is consistent with iter->pos: */
2124                         bch2_btree_iter_set_pos(iter, iter->pos);
2125                         k = bkey_s_c_err(ret);
2126                         goto out_no_locked;
2127                 }
2128
2129                 /* Already at end? */
2130                 if (!btree_path_node(iter->path, iter->path->level)) {
2131                         k = bkey_s_c_null;
2132                         goto out_no_locked;
2133                 }
2134
2135                 k = btree_path_level_peek_all(trans->c,
2136                                 &iter->path->l[iter->path->level], &iter->k);
2137
2138                 /* Check if we should go up to the parent node: */
2139                 if (!k.k ||
2140                     (iter->advanced &&
2141                      bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
2142                         iter->pos = path_l(iter->path)->b->key.k.p;
2143                         btree_path_set_level_up(trans, iter->path);
2144                         iter->advanced = false;
2145                         continue;
2146                 }
2147
2148                 /*
2149                  * Check if we should go back down to a leaf:
2150                  * If we're not in a leaf node, we only return the current key
2151                  * if it exactly matches iter->pos - otherwise we first have to
2152                  * go back to the leaf:
2153                  */
2154                 if (iter->path->level != iter->min_depth &&
2155                     (iter->advanced ||
2156                      !k.k ||
2157                      !bpos_eq(iter->pos, k.k->p))) {
2158                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2159                         iter->pos = bpos_successor(iter->pos);
2160                         iter->advanced = false;
2161                         continue;
2162                 }
2163
2164                 /* Check if we should go to the next key: */
2165                 if (iter->path->level == iter->min_depth &&
2166                     iter->advanced &&
2167                     k.k &&
2168                     bpos_eq(iter->pos, k.k->p)) {
2169                         iter->pos = bpos_successor(iter->pos);
2170                         iter->advanced = false;
2171                         continue;
2172                 }
2173
2174                 if (iter->advanced &&
2175                     iter->path->level == iter->min_depth &&
2176                     !bpos_eq(k.k->p, iter->pos))
2177                         iter->advanced = false;
2178
2179                 BUG_ON(iter->advanced);
2180                 BUG_ON(!k.k);
2181                 break;
2182         }
2183
2184         iter->pos = k.k->p;
2185         btree_path_set_should_be_locked(iter->path);
2186 out_no_locked:
2187         bch2_btree_iter_verify(iter);
2188
2189         return k;
2190 }
2191
2192 /**
2193  * bch2_btree_iter_next: returns first key greater than iterator's current
2194  * position
2195  */
2196 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2197 {
2198         if (!bch2_btree_iter_advance(iter))
2199                 return bkey_s_c_null;
2200
2201         return bch2_btree_iter_peek(iter);
2202 }
2203
2204 /**
2205  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2206  * iterator's current position
2207  */
2208 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2209 {
2210         struct btree_trans *trans = iter->trans;
2211         struct bpos search_key = iter->pos;
2212         struct btree_path *saved_path = NULL;
2213         struct bkey_s_c k;
2214         struct bkey saved_k;
2215         const struct bch_val *saved_v;
2216         int ret;
2217
2218         EBUG_ON(iter->path->cached || iter->path->level);
2219         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2220
2221         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2222                 return bkey_s_c_err(-EIO);
2223
2224         bch2_btree_iter_verify(iter);
2225         bch2_btree_iter_verify_entry_exit(iter);
2226
2227         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2228                 search_key.snapshot = U32_MAX;
2229
2230         while (1) {
2231                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2232                                                 iter->flags & BTREE_ITER_INTENT,
2233                                                 btree_iter_ip_allocated(iter));
2234
2235                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2236                 if (unlikely(ret)) {
2237                         /* ensure that iter->k is consistent with iter->pos: */
2238                         bch2_btree_iter_set_pos(iter, iter->pos);
2239                         k = bkey_s_c_err(ret);
2240                         goto out_no_locked;
2241                 }
2242
2243                 k = btree_path_level_peek(trans, iter->path,
2244                                           &iter->path->l[0], &iter->k);
2245                 if (!k.k ||
2246                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2247                      ? bpos_ge(bkey_start_pos(k.k), search_key)
2248                      : bpos_gt(k.k->p, search_key)))
2249                         k = btree_path_level_prev(trans, iter->path,
2250                                                   &iter->path->l[0], &iter->k);
2251
2252                 bch2_btree_path_check_sort(trans, iter->path, 0);
2253
2254                 if (likely(k.k)) {
2255                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2256                                 if (k.k->p.snapshot == iter->snapshot)
2257                                         goto got_key;
2258
2259                                 /*
2260                                  * If we have a saved candidate, and we're no
2261                                  * longer at the same _key_ (not pos), return
2262                                  * that candidate
2263                                  */
2264                                 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
2265                                         bch2_path_put_nokeep(trans, iter->path,
2266                                                       iter->flags & BTREE_ITER_INTENT);
2267                                         iter->path = saved_path;
2268                                         saved_path = NULL;
2269                                         iter->k = saved_k;
2270                                         k.v     = saved_v;
2271                                         goto got_key;
2272                                 }
2273
2274                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2275                                                               iter->snapshot,
2276                                                               k.k->p.snapshot)) {
2277                                         if (saved_path)
2278                                                 bch2_path_put_nokeep(trans, saved_path,
2279                                                       iter->flags & BTREE_ITER_INTENT);
2280                                         saved_path = btree_path_clone(trans, iter->path,
2281                                                                 iter->flags & BTREE_ITER_INTENT);
2282                                         saved_k = *k.k;
2283                                         saved_v = k.v;
2284                                 }
2285
2286                                 search_key = bpos_predecessor(k.k->p);
2287                                 continue;
2288                         }
2289 got_key:
2290                         if (bkey_whiteout(k.k) &&
2291                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2292                                 search_key = bkey_predecessor(iter, k.k->p);
2293                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2294                                         search_key.snapshot = U32_MAX;
2295                                 continue;
2296                         }
2297
2298                         break;
2299                 } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
2300                         /* Advance to previous leaf node: */
2301                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2302                 } else {
2303                         /* Start of btree: */
2304                         bch2_btree_iter_set_pos(iter, POS_MIN);
2305                         k = bkey_s_c_null;
2306                         goto out_no_locked;
2307                 }
2308         }
2309
2310         EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
2311
2312         /* Extents can straddle iter->pos: */
2313         if (bkey_lt(k.k->p, iter->pos))
2314                 iter->pos = k.k->p;
2315
2316         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2317                 iter->pos.snapshot = iter->snapshot;
2318
2319         btree_path_set_should_be_locked(iter->path);
2320 out_no_locked:
2321         if (saved_path)
2322                 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2323
2324         bch2_btree_iter_verify_entry_exit(iter);
2325         bch2_btree_iter_verify(iter);
2326
2327         return k;
2328 }
2329
2330 /**
2331  * bch2_btree_iter_prev: returns first key less than iterator's current
2332  * position
2333  */
2334 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2335 {
2336         if (!bch2_btree_iter_rewind(iter))
2337                 return bkey_s_c_null;
2338
2339         return bch2_btree_iter_peek_prev(iter);
2340 }
2341
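/*
 * bch2_btree_iter_peek_slot - peek the key at the iterator's current position,
 * synthesizing a hole if nothing is there
 *
 * For plain (non-extent, non-snapshot-filtered) iterators this is a direct
 * lookup at iter->pos, also consulting pending updates, the journal and the
 * key cache when the corresponding flags are set. For extent and snapshot
 * filtered iterators it peeks forward and, if the next key doesn't cover
 * iter->pos, returns a deleted key (sized to fill the gap for extents).
 */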
2342 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2343 {
2344         struct btree_trans *trans = iter->trans;
2345         struct bpos search_key;
2346         struct bkey_s_c k;
2347         int ret;
2348
2349         bch2_btree_iter_verify(iter);
2350         bch2_btree_iter_verify_entry_exit(iter);
2351         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2352         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2353
2354         /* extents can't span inode numbers: */
2355         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2356             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2357                 if (iter->pos.inode == KEY_INODE_MAX)
2358                         return bkey_s_c_null;
2359
2360                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2361         }
2362
2363         search_key = btree_iter_search_key(iter);
2364         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2365                                         iter->flags & BTREE_ITER_INTENT,
2366                                         btree_iter_ip_allocated(iter));
2367
2368         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2369         if (unlikely(ret)) {
2370                 k = bkey_s_c_err(ret);
2371                 goto out_no_locked;
2372         }
2373
2374         if ((iter->flags & BTREE_ITER_CACHED) ||
2375             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2376                 struct bkey_i *next_update;
2377
2378                 if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
2379                     (next_update = btree_trans_peek_updates(trans,
2380                                                 iter->btree_id, search_key)) &&
2381                     bpos_eq(next_update->k.p, iter->pos)) {
2382                         iter->k = next_update->k;
2383                         k = bkey_i_to_s_c(next_update);
2384                         goto out;
2385                 }
2386
2387                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2388                     (next_update = bch2_btree_journal_peek_slot(trans,
2389                                         iter, iter->pos))) {
2390                         iter->k = next_update->k;
2391                         k = bkey_i_to_s_c(next_update);
2392                         goto out;
2393                 }
2394
2395                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2396                     (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
2397                         if (!bkey_err(k))
2398                                 iter->k = *k.k;
2399                         /* We're not returning a key from iter->path: */
2400                         goto out_no_locked;
2401                 }
2402
2403                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2404                 if (unlikely(!k.k))
2405                         goto out_no_locked;
2406         } else {
2407                 struct bpos next;
2408                 struct bpos end = iter->pos;
2409
2410                 if (iter->flags & BTREE_ITER_IS_EXTENTS)
2411                         end.offset = U64_MAX;
2412
2413                 EBUG_ON(iter->path->level);
2414
2415                 if (iter->flags & BTREE_ITER_INTENT) {
2416                         struct btree_iter iter2;
2417
2418                         bch2_trans_copy_iter(&iter2, iter);
2419                         k = bch2_btree_iter_peek_upto(&iter2, end);
2420
2421                         if (k.k && !bkey_err(k)) {
2422                                 iter->k = iter2.k;
2423                                 k.k = &iter->k;
2424                         }
2425                         bch2_trans_iter_exit(trans, &iter2);
2426                 } else {
2427                         struct bpos pos = iter->pos;
2428
2429                         k = bch2_btree_iter_peek_upto(iter, end);
2430                         if (unlikely(bkey_err(k)))
2431                                 bch2_btree_iter_set_pos(iter, pos);
2432                         else
2433                                 iter->pos = pos;
2434                 }
2435
2436                 if (unlikely(bkey_err(k)))
2437                         goto out_no_locked;
2438
2439                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2440
2441                 if (bkey_lt(iter->pos, next)) {
2442                         bkey_init(&iter->k);
2443                         iter->k.p = iter->pos;
2444
2445                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2446                                 bch2_key_resize(&iter->k,
2447                                                 min_t(u64, KEY_SIZE_MAX,
2448                                                       (next.inode == iter->pos.inode
2449                                                        ? next.offset
2450                                                        : KEY_OFFSET_MAX) -
2451                                                       iter->pos.offset));
2452                                 EBUG_ON(!iter->k.size);
2453                         }
2454
2455                         k = (struct bkey_s_c) { &iter->k, NULL };
2456                 }
2457         }
2458 out:
2459         btree_path_set_should_be_locked(iter->path);
2460 out_no_locked:
2461         bch2_btree_iter_verify_entry_exit(iter);
2462         bch2_btree_iter_verify(iter);
2463         ret = bch2_btree_iter_verify_ret(iter, k);
2464         if (unlikely(ret))
2465                 return bkey_s_c_err(ret);
2466
2467         return k;
2468 }
2469
2470 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2471 {
2472         if (!bch2_btree_iter_advance(iter))
2473                 return bkey_s_c_null;
2474
2475         return bch2_btree_iter_peek_slot(iter);
2476 }
2477
2478 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2479 {
2480         if (!bch2_btree_iter_rewind(iter))
2481                 return bkey_s_c_null;
2482
2483         return bch2_btree_iter_peek_slot(iter);
2484 }
2485
2486 /* new transactional stuff: */
2487
2488 static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
2489                                                 struct btree_path *path)
2490 {
2491         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2492         EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2493         EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
2494 }
2495
2496 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2497 {
2498 #ifdef CONFIG_BCACHEFS_DEBUG
2499         unsigned i;
2500
2501         for (i = 0; i < trans->nr_sorted; i++)
2502                 btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
2503 #endif
2504 }
2505
2506 static void btree_trans_verify_sorted(struct btree_trans *trans)
2507 {
2508 #ifdef CONFIG_BCACHEFS_DEBUG
2509         struct btree_path *path, *prev = NULL;
2510         unsigned i;
2511
2512         if (!bch2_debug_check_iterators)
2513                 return;
2514
2515         trans_for_each_path_inorder(trans, path, i) {
2516                 if (prev && btree_path_cmp(prev, path) > 0) {
2517                         bch2_dump_trans_paths_updates(trans);
2518                         panic("trans paths out of order!\n");
2519                 }
2520                 prev = path;
2521         }
2522 #endif
2523 }
2524
2525 static inline void btree_path_swap(struct btree_trans *trans,
2526                                    struct btree_path *l, struct btree_path *r)
2527 {
2528         swap(l->sorted_idx, r->sorted_idx);
2529         swap(trans->sorted[l->sorted_idx],
2530              trans->sorted[r->sorted_idx]);
2531
2532         btree_path_verify_sorted_ref(trans, l);
2533         btree_path_verify_sorted_ref(trans, r);
2534 }
2535
2536 static inline struct btree_path *sib_btree_path(struct btree_trans *trans,
2537                                                 struct btree_path *path, int sib)
2538 {
2539         unsigned idx = (unsigned) path->sorted_idx + sib;
2540
2541         EBUG_ON(sib != -1 && sib != 1);
2542
2543         return idx < trans->nr_sorted
2544                 ? trans->paths + trans->sorted[idx]
2545                 : NULL;
2546 }
2547
2548 static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
2549                                                    struct btree_path *path,
2550                                                    int cmp)
2551 {
2552         struct btree_path *n;
2553         int cmp2;
2554
2555         EBUG_ON(!cmp);
2556
2557         while ((n = sib_btree_path(trans, path, cmp)) &&
2558                (cmp2 = btree_path_cmp(n, path)) &&
2559                cmp2 != cmp)
2560                 btree_path_swap(trans, n, path);
2561
2562         btree_trans_verify_sorted(trans);
2563 }
2564
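/*
 * Restore trans->sorted ordering after @path's position changed: @cmp gives
 * the direction the path moved (negative - it may only need to move earlier,
 * positive - later, zero - check both), and the path is bubbled past its
 * neighbours until btree_path_cmp() is satisfied again.
 */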
2565 inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
2566                                        int cmp)
2567 {
2568         struct btree_path *n;
2569
2570         if (cmp <= 0) {
2571                 n = prev_btree_path(trans, path);
2572                 if (n && btree_path_cmp(n, path) > 0) {
2573                         do {
2574                                 btree_path_swap(trans, n, path);
2575                                 n = prev_btree_path(trans, path);
2576                         } while (n && btree_path_cmp(n, path) > 0);
2577
2578                         goto out;
2579                 }
2580         }
2581
2582         if (cmp >= 0) {
2583                 n = next_btree_path(trans, path);
2584                 if (n && btree_path_cmp(path, n) > 0) {
2585                         do {
2586                                 btree_path_swap(trans, path, n);
2587                                 n = next_btree_path(trans, path);
2588                         } while (n && btree_path_cmp(path, n) > 0);
2589                 }
2590         }
2591 out:
2592         btree_trans_verify_sorted(trans);
2593 }
2594
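/*
 * trans->sorted[] is a dense array of path indices kept in btree_path_cmp()
 * order; list add/remove shift the u8 indices (using the u64-granularity
 * memmove helpers where unaligned access is cheap) and then refresh each
 * affected path's sorted_idx back-pointer.
 */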
2595 static inline void btree_path_list_remove(struct btree_trans *trans,
2596                                           struct btree_path *path)
2597 {
2598         unsigned i;
2599
2600         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2601 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2602         trans->nr_sorted--;
2603         memmove_u64s_down_small(trans->sorted + path->sorted_idx,
2604                                 trans->sorted + path->sorted_idx + 1,
2605                                 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
2606 #else
2607         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2608 #endif
2609         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2610                 trans->paths[trans->sorted[i]].sorted_idx = i;
2611
2612         path->sorted_idx = U8_MAX;
2613
2614         btree_trans_verify_sorted_refs(trans);
2615 }
2616
2617 static inline void btree_path_list_add(struct btree_trans *trans,
2618                                        struct btree_path *pos,
2619                                        struct btree_path *path)
2620 {
2621         unsigned i;
2622
2623         btree_trans_verify_sorted_refs(trans);
2624
2625         path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2626
2627         if (unlikely(trans->in_traverse_all) &&
2628             trans->traverse_all_idx != U8_MAX &&
2629             trans->traverse_all_idx >= path->sorted_idx)
2630                 trans->traverse_all_idx++;
2631
2632 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2633         memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
2634                               trans->sorted + path->sorted_idx,
2635                               DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
2636         trans->nr_sorted++;
2637         trans->sorted[path->sorted_idx] = path->idx;
2638 #else
2639         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2640 #endif
2641
2642         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2643                 trans->paths[trans->sorted[i]].sorted_idx = i;
2644
2645         btree_trans_verify_sorted_refs(trans);
2646 }
2647
2648 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2649 {
2650         if (iter->path)
2651                 bch2_path_put(trans, iter->path,
2652                               iter->flags & BTREE_ITER_INTENT);
2653         if (iter->update_path)
2654                 bch2_path_put_nokeep(trans, iter->update_path,
2655                               iter->flags & BTREE_ITER_INTENT);
2656         if (iter->key_cache_path)
2657                 bch2_path_put(trans, iter->key_cache_path,
2658                               iter->flags & BTREE_ITER_INTENT);
2659         iter->path = NULL;
2660         iter->update_path = NULL;
2661         iter->key_cache_path = NULL;
2662 }
2663
2664 static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans,
2665                           struct btree_iter *iter,
2666                           unsigned btree_id, struct bpos pos,
2667                           unsigned flags)
2668 {
2669         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2670                                bch2_btree_iter_flags(trans, btree_id, flags),
2671                                _RET_IP_);
2672 }
2673
2674 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
2675                           struct btree_iter *iter,
2676                           unsigned btree_id, struct bpos pos,
2677                           unsigned flags)
2678 {
2679         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2680                                bch2_btree_iter_flags(trans, btree_id, flags),
2681                                _RET_IP_);
2682 }
2683
2684 void bch2_trans_node_iter_init(struct btree_trans *trans,
2685                                struct btree_iter *iter,
2686                                enum btree_id btree_id,
2687                                struct bpos pos,
2688                                unsigned locks_want,
2689                                unsigned depth,
2690                                unsigned flags)
2691 {
2692         flags |= BTREE_ITER_NOT_EXTENTS;
2693         flags |= __BTREE_ITER_ALL_SNAPSHOTS;
2694         flags |= BTREE_ITER_ALL_SNAPSHOTS;
2695
2696         bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
2697                                __bch2_btree_iter_flags(trans, btree_id, flags),
2698                                _RET_IP_);
2699
2700         iter->min_depth = depth;
2701
2702         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2703         BUG_ON(iter->path->level        != depth);
2704         BUG_ON(iter->min_depth          != depth);
2705 }
2706
2707 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2708 {
2709         *dst = *src;
2710         if (src->path)
2711                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2712         if (src->update_path)
2713                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2714         dst->key_cache_path = NULL;
2715 }
2716
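/*
 * __bch2_trans_kmalloc - bump allocator for memory that lives as long as the
 * current transaction
 *
 * Grows trans->mem with krealloc() (falling back to the filesystem's
 * btree_trans_mem_pool), then zeroes and returns the new allocation. If an
 * existing buffer had to be reallocated - so pointers into it may now be
 * stale - a transaction_restart_mem_realloced error is returned instead and
 * the caller restarts with the larger buffer already in place.
 */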
2717 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2718 {
2719         unsigned new_top = trans->mem_top + size;
2720         size_t old_bytes = trans->mem_bytes;
2721         size_t new_bytes = roundup_pow_of_two(new_top);
2722         void *new_mem;
2723         void *p;
2724
2725         trans->mem_max = max(trans->mem_max, new_top);
2726
2727         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2728
2729         new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2730         if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2731                 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2732                 new_bytes = BTREE_TRANS_MEM_MAX;
2733                 kfree(trans->mem);
2734         }
2735
2736         if (!new_mem)
2737                 return ERR_PTR(-ENOMEM);
2738
2739         trans->mem = new_mem;
2740         trans->mem_bytes = new_bytes;
2741
2742         if (old_bytes) {
2743                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2744                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2745         }
2746
2747         p = trans->mem + trans->mem_top;
2748         trans->mem_top += size;
2749         memset(p, 0, size);
2750         return p;
2751 }
2752
2753 /**
2754  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2755  * @trans: transaction to reset
2756  *
2757  * While iterating over nodes or updating nodes, an attempt to lock a btree node
2758  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2759  * occurs, bch2_trans_begin() should be called and the transaction retried.
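 *
 * A typical retry loop looks roughly like this (illustrative sketch;
 * do_work() stands in for the transaction body):
 *
 *	do {
 *		restart_count = bch2_trans_begin(trans);
 *		ret = do_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));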
2760  */
2761 u32 bch2_trans_begin(struct btree_trans *trans)
2762 {
2763         struct btree_path *path;
2764
2765         bch2_trans_reset_updates(trans);
2766
2767         trans->restart_count++;
2768         trans->mem_top                  = 0;
2769
2770         if (trans->fs_usage_deltas) {
2771                 trans->fs_usage_deltas->used = 0;
2772                 memset((void *) trans->fs_usage_deltas +
2773                        offsetof(struct replicas_delta_list, memset_start), 0,
2774                        (void *) &trans->fs_usage_deltas->memset_end -
2775                        (void *) &trans->fs_usage_deltas->memset_start);
2776         }
2777
2778         trans_for_each_path(trans, path) {
2779                 path->should_be_locked = false;
2780
2781                 /*
2782                  * If the transaction wasn't restarted, we're presuming to be
2783                  * doing something new: don't keep iterators except the ones that
2784                  * are in use - except for the subvolumes btree:
2785                  */
2786                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2787                         path->preserve = false;
2788
2789                 /*
2790                  * XXX: we probably shouldn't be doing this if the transaction
2791                  * was restarted, but currently we still overflow transaction
2792                  * iterators if we do that
2793                  */
2794                 if (!path->ref && !path->preserve)
2795                         __bch2_path_free(trans, path);
2796                 else
2797                         path->preserve = false;
2798         }
2799
2800         if (!trans->restarted &&
2801             (need_resched() ||
2802              local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2803                 bch2_trans_unlock(trans);
2804                 cond_resched();
2805                 bch2_trans_relock(trans);
2806         }
2807
2808         trans->last_restarted_ip = _RET_IP_;
2809         if (trans->restarted)
2810                 bch2_btree_path_traverse_all(trans);
2811
2812         trans->last_begin_time = local_clock();
2813         return trans->restart_count;
2814 }
2815
2816 void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
2817 {
2818         if (trans_was_restarted(trans, restart_count))
2819                 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
2820                       trans->restart_count, restart_count,
2821                       (void *) trans->last_restarted_ip);
2822 }
2823
2824 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2825 {
2826         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2827         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2828         void *p = NULL;
2829
2830         BUG_ON(trans->used_mempool);
2831
2832 #ifdef __KERNEL__
2833         p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
2834 #endif
2835         if (!p)
2836                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2837
2838         trans->paths            = p; p += paths_bytes;
2839         trans->updates          = p; p += updates_bytes;
2840 }
2841
2842 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
2843
2844 unsigned bch2_trans_get_fn_idx(const char *fn)
2845 {
2846         unsigned i;
2847
2848         for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
2849                 if (!bch2_btree_transaction_fns[i] ||
2850                     bch2_btree_transaction_fns[i] == fn) {
2851                         bch2_btree_transaction_fns[i] = fn;
2852                         return i;
2853                 }
2854
2855         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2856         return i;
2857 }
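
/*
 * Sketch (illustrative): repeated calls with the same fn return the same slot,
 * so a call site can cache the index in a static local to avoid rescanning the
 * table.  example_cached_fn_idx() is hypothetical; in-tree callers normally go
 * through the bch2_trans_init() macro in btree_iter.h.  (An index of 0 just
 * re-runs the lookup, which is harmless.)
 */
static unsigned __maybe_unused example_cached_fn_idx(void)
{
	static unsigned idx;

	if (unlikely(!idx))
		idx = bch2_trans_get_fn_idx(__func__);
	return idx;
}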
2858
2859 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
2860         __acquires(&c->btree_trans_barrier)
2861 {
2862         struct btree_transaction_stats *s;
2863         struct btree_trans *pos;
2864
2865         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
2866
2867         memset(trans, 0, sizeof(*trans));
2868         trans->c                = c;
2869         trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
2870                 ? bch2_btree_transaction_fns[fn_idx] : NULL;
2871         trans->last_begin_time  = local_clock();
2872         trans->fn_idx           = fn_idx;
2873         trans->locking_wait.task = current;
2874         trans->journal_replay_not_finished =
2875                 !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
2876         closure_init_stack(&trans->ref);
2877
2878         bch2_trans_alloc_paths(trans, c);
2879
2880         s = btree_trans_stats(trans);
2881         if (s && s->max_mem) {
2882                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2883
2884                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2885
2886                 if (unlikely(!trans->mem)) {
2887                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2888                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2889                 } else {
2890                         trans->mem_bytes = expected_mem_bytes;
2891                 }
2892         }
2893         if (s)
2894                 trans->nr_max_paths = s->nr_max_paths;
2895
2896         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2897
2898         mutex_lock(&c->btree_trans_lock);
2899         list_for_each_entry(pos, &c->btree_trans_list, list) {
2900                 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
2901                         list_add_tail(&trans->list, &pos->list);
2902                         goto list_add_done;
2903                 }
2904         }
2905         list_add_tail(&trans->list, &c->btree_trans_list);
2906 list_add_done:
2907         mutex_unlock(&c->btree_trans_lock);
2908 }
2909
2910 static void check_btree_paths_leaked(struct btree_trans *trans)
2911 {
2912 #ifdef CONFIG_BCACHEFS_DEBUG
2913         struct bch_fs *c = trans->c;
2914         struct btree_path *path;
2915
2916         trans_for_each_path(trans, path)
2917                 if (path->ref)
2918                         goto leaked;
2919         return;
2920 leaked:
2921         bch_err(c, "btree paths leaked from %s!", trans->fn);
2922         trans_for_each_path(trans, path)
2923                 if (path->ref)
2924                         printk(KERN_ERR "  btree %s %pS\n",
2925                                bch2_btree_ids[path->btree_id],
2926                                (void *) path->ip_allocated);
2927         /* Be noisy about this: */
2928         bch2_fatal_error(c);
2929 #endif
2930 }
2931
2932 void bch2_trans_exit(struct btree_trans *trans)
2933         __releases(&c->btree_trans_barrier)
2934 {
2935         struct btree_insert_entry *i;
2936         struct bch_fs *c = trans->c;
2937         struct btree_transaction_stats *s = btree_trans_stats(trans);
2938
2939         bch2_trans_unlock(trans);
2940
2941         closure_sync(&trans->ref);
2942
2943         if (s)
2944                 s->max_mem = max(s->max_mem, trans->mem_max);
2945
2946         trans_for_each_update(trans, i)
2947                 __btree_path_put(i->path, true);
2948         trans->nr_updates               = 0;
2949
2950         check_btree_paths_leaked(trans);
2951
2952         mutex_lock(&c->btree_trans_lock);
2953         list_del(&trans->list);
2954         mutex_unlock(&c->btree_trans_lock);
2955
2956         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2957
2958         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
2959
2960         kfree(trans->extra_journal_entries.data);
2961
2962         if (trans->fs_usage_deltas) {
2963                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2964                     REPLICAS_DELTA_LIST_MAX)
2965                         mempool_free(trans->fs_usage_deltas,
2966                                      &c->replicas_delta_pool);
2967                 else
2968                         kfree(trans->fs_usage_deltas);
2969         }
2970
2971         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2972                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
2973         else
2974                 kfree(trans->mem);
2975
2976 #ifdef __KERNEL__
2977         /*
2978          * Userspace doesn't have a real percpu implementation:
2979          */
2980         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
2981 #endif
2982
2983         if (trans->paths)
2984                 mempool_free(trans->paths, &c->btree_paths_pool);
2985
2986         trans->mem      = (void *) 0x1;
2987         trans->paths    = (void *) 0x1;
2988 }
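
/*
 * Lifecycle sketch (illustrative, not part of this file): a transaction is
 * stack-allocated, initialized once, used for any number of begin/commit
 * cycles, then torn down with bch2_trans_exit().  example_trans_lifecycle()
 * and the trailing 0, 0 arguments to the bch2_trans_init() convenience macro
 * are assumptions based on the common in-tree pattern.
 */
static void __maybe_unused example_trans_lifecycle(struct bch_fs *c)
{
	struct btree_trans trans;

	bch2_trans_init(&trans, c, 0, 0);

	/* ... bch2_trans_begin() / iterator / commit work ... */

	bch2_trans_exit(&trans);
}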
2989
2990 static void __maybe_unused
2991 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
2992                                       struct btree_bkey_cached_common *b)
2993 {
2994         struct six_lock_count c = six_lock_counts(&b->lock);
2995         struct task_struct *owner;
2996         pid_t pid;
2997
2998         rcu_read_lock();
2999         owner = READ_ONCE(b->lock.owner);
3000         pid = owner ? owner->pid : 0;
3001         rcu_read_unlock();
3002
3003         prt_tab(out);
3004         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
3005                    b->level, bch2_btree_ids[b->btree_id]);
3006         bch2_bpos_to_text(out, btree_node_pos(b));
3007
3008         prt_tab(out);
3009         prt_printf(out, " locks %u:%u:%u held by pid %u",
3010                    c.n[0], c.n[1], c.n[2], pid);
3011 }
3012
3013 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3014 {
3015         struct btree_path *path;
3016         struct btree_bkey_cached_common *b;
3017         static char lock_types[] = { 'r', 'i', 'w' };
3018         unsigned l;
3019
3020         if (!out->nr_tabstops) {
3021                 printbuf_tabstop_push(out, 16);
3022                 printbuf_tabstop_push(out, 32);
3023         }
3024
3025         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3026
3027         trans_for_each_path(trans, path) {
3028                 if (!path->nodes_locked)
3029                         continue;
3030
3031                 prt_printf(out, "  path %u %c l=%u %s:",
3032                        path->idx,
3033                        path->cached ? 'c' : 'b',
3034                        path->level,
3035                        bch2_btree_ids[path->btree_id]);
3036                 bch2_bpos_to_text(out, path->pos);
3037                 prt_newline(out);
3038
3039                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3040                         if (btree_node_locked(path, l) &&
3041                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3042                                 prt_printf(out, "    %c l=%u ",
3043                                            lock_types[btree_node_locked_type(path, l)], l);
3044                                 bch2_btree_bkey_cached_common_to_text(out, b);
3045                                 prt_newline(out);
3046                         }
3047                 }
3048         }
3049
3050         b = READ_ONCE(trans->locking);
3051         if (b) {
3052                 prt_str(out, "  want");
3053                 prt_newline(out);
3054                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3055                 bch2_btree_bkey_cached_common_to_text(out, b);
3056                 prt_newline(out);
3057         }
3058 }
3059
3060 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3061 {
3062         struct btree_transaction_stats *s;
3063
3064         for (s = c->btree_transaction_stats;
3065              s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3066              s++)
3067                 kfree(s->max_paths_text);
3068
3069         if (c->btree_trans_barrier_initialized)
3070                 cleanup_srcu_struct(&c->btree_trans_barrier);
3071         mempool_exit(&c->btree_trans_mem_pool);
3072         mempool_exit(&c->btree_paths_pool);
3073 }
3074
3075 int bch2_fs_btree_iter_init(struct bch_fs *c)
3076 {
3077         unsigned i, nr = BTREE_ITER_MAX;
3078         int ret;
3079
3080         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
3081                 mutex_init(&c->btree_transaction_stats[i].lock);
3082
3083         INIT_LIST_HEAD(&c->btree_trans_list);
3084         mutex_init(&c->btree_trans_lock);
3085
3086         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3087                         sizeof(struct btree_path) * nr +
3088                         sizeof(struct btree_insert_entry) * nr) ?:
3089                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3090                                           BTREE_TRANS_MEM_MAX) ?:
3091                 init_srcu_struct(&c->btree_trans_barrier);
3092         if (!ret)
3093                 c->btree_trans_barrier_initialized = true;
3094         return ret;
3095 }