]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/btree_iter.c
fd4358eb013006dc142dbac9ee296d4482ebf1b2
[bcachefs-tools-debian] / libbcachefs / btree_iter.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18
19 #include <linux/prandom.h>
20 #include <linux/prefetch.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_trans_verify_sorted(struct btree_trans *);
24 inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
25 static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *,
26                                                    struct btree_path *, int);
27
28 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
29 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
30                                        struct btree_path *);
31
32 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
33 {
34 #ifdef CONFIG_BCACHEFS_DEBUG
35         return iter->ip_allocated;
36 #else
37         return 0;
38 #endif
39 }
40
41 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
42
43 /*
44  * Unlocks before scheduling
45  * Note: does not revalidate iterator
46  */
47 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
48 {
49         if (need_resched() || race_fault()) {
50                 bch2_trans_unlock(trans);
51                 schedule();
52                 return bch2_trans_relock(trans);
53         } else {
54                 return 0;
55         }
56 }
57
58 static inline int __btree_path_cmp(const struct btree_path *l,
59                                    enum btree_id        r_btree_id,
60                                    bool                 r_cached,
61                                    struct bpos          r_pos,
62                                    unsigned             r_level)
63 {
64         /*
65          * Must match lock ordering as defined by __bch2_btree_node_lock:
66          */
67         return   cmp_int(l->btree_id,   r_btree_id) ?:
68                  cmp_int((int) l->cached,       (int) r_cached) ?:
69                  bpos_cmp(l->pos,       r_pos) ?:
70                 -cmp_int(l->level,      r_level);
71 }
72
73 static inline int btree_path_cmp(const struct btree_path *l,
74                                  const struct btree_path *r)
75 {
76         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
77 }
78
79 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
80 {
81         /* Are we iterating over keys in all snapshots? */
82         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
83                 p = bpos_successor(p);
84         } else {
85                 p = bpos_nosnap_successor(p);
86                 p.snapshot = iter->snapshot;
87         }
88
89         return p;
90 }
91
92 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
93 {
94         /* Are we iterating over keys in all snapshots? */
95         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
96                 p = bpos_predecessor(p);
97         } else {
98                 p = bpos_nosnap_predecessor(p);
99                 p.snapshot = iter->snapshot;
100         }
101
102         return p;
103 }
104
105 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
106 {
107         struct bpos pos = iter->pos;
108
109         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
110             !bkey_eq(pos, POS_MAX))
111                 pos = bkey_successor(iter, pos);
112         return pos;
113 }
114
115 static inline bool btree_path_pos_before_node(struct btree_path *path,
116                                               struct btree *b)
117 {
118         return bpos_lt(path->pos, b->data->min_key);
119 }
120
121 static inline bool btree_path_pos_after_node(struct btree_path *path,
122                                              struct btree *b)
123 {
124         return bpos_gt(path->pos, b->key.k.p);
125 }
126
127 static inline bool btree_path_pos_in_node(struct btree_path *path,
128                                           struct btree *b)
129 {
130         return path->btree_id == b->c.btree_id &&
131                 !btree_path_pos_before_node(path, b) &&
132                 !btree_path_pos_after_node(path, b);
133 }
134
135 /* Btree iterator: */
136
137 #ifdef CONFIG_BCACHEFS_DEBUG
138
139 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
140                                           struct btree_path *path)
141 {
142         struct bkey_cached *ck;
143         bool locked = btree_node_locked(path, 0);
144
145         if (!bch2_btree_node_relock(trans, path, 0))
146                 return;
147
148         ck = (void *) path->l[0].b;
149         BUG_ON(ck->key.btree_id != path->btree_id ||
150                !bkey_eq(ck->key.pos, path->pos));
151
152         if (!locked)
153                 btree_node_unlock(trans, path, 0);
154 }
155
156 static void bch2_btree_path_verify_level(struct btree_trans *trans,
157                                 struct btree_path *path, unsigned level)
158 {
159         struct btree_path_level *l;
160         struct btree_node_iter tmp;
161         bool locked;
162         struct bkey_packed *p, *k;
163         struct printbuf buf1 = PRINTBUF;
164         struct printbuf buf2 = PRINTBUF;
165         struct printbuf buf3 = PRINTBUF;
166         const char *msg;
167
168         if (!bch2_debug_check_iterators)
169                 return;
170
171         l       = &path->l[level];
172         tmp     = l->iter;
173         locked  = btree_node_locked(path, level);
174
175         if (path->cached) {
176                 if (!level)
177                         bch2_btree_path_verify_cached(trans, path);
178                 return;
179         }
180
181         if (!btree_path_node(path, level))
182                 return;
183
184         if (!bch2_btree_node_relock_notrace(trans, path, level))
185                 return;
186
187         BUG_ON(!btree_path_pos_in_node(path, l->b));
188
189         bch2_btree_node_iter_verify(&l->iter, l->b);
190
191         /*
192          * For interior nodes, the iterator will have skipped past deleted keys:
193          */
194         p = level
195                 ? bch2_btree_node_iter_prev(&tmp, l->b)
196                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
197         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
198
199         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
200                 msg = "before";
201                 goto err;
202         }
203
204         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
205                 msg = "after";
206                 goto err;
207         }
208
209         if (!locked)
210                 btree_node_unlock(trans, path, level);
211         return;
212 err:
213         bch2_bpos_to_text(&buf1, path->pos);
214
215         if (p) {
216                 struct bkey uk = bkey_unpack_key(l->b, p);
217
218                 bch2_bkey_to_text(&buf2, &uk);
219         } else {
220                 prt_printf(&buf2, "(none)");
221         }
222
223         if (k) {
224                 struct bkey uk = bkey_unpack_key(l->b, k);
225
226                 bch2_bkey_to_text(&buf3, &uk);
227         } else {
228                 prt_printf(&buf3, "(none)");
229         }
230
231         panic("path should be %s key at level %u:\n"
232               "path pos %s\n"
233               "prev key %s\n"
234               "cur  key %s\n",
235               msg, level, buf1.buf, buf2.buf, buf3.buf);
236 }
237
238 static void bch2_btree_path_verify(struct btree_trans *trans,
239                                    struct btree_path *path)
240 {
241         struct bch_fs *c = trans->c;
242         unsigned i;
243
244         EBUG_ON(path->btree_id >= BTREE_ID_NR);
245
246         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
247                 if (!path->l[i].b) {
248                         BUG_ON(!path->cached &&
249                                c->btree_roots[path->btree_id].b->c.level > i);
250                         break;
251                 }
252
253                 bch2_btree_path_verify_level(trans, path, i);
254         }
255
256         bch2_btree_path_verify_locks(path);
257 }
258
259 void bch2_trans_verify_paths(struct btree_trans *trans)
260 {
261         struct btree_path *path;
262
263         trans_for_each_path(trans, path)
264                 bch2_btree_path_verify(trans, path);
265 }
266
267 static void bch2_btree_iter_verify(struct btree_iter *iter)
268 {
269         struct btree_trans *trans = iter->trans;
270
271         BUG_ON(iter->btree_id >= BTREE_ID_NR);
272
273         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
274
275         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
276                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
277
278         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
279                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
280                !btree_type_has_snapshots(iter->btree_id));
281
282         if (iter->update_path)
283                 bch2_btree_path_verify(trans, iter->update_path);
284         bch2_btree_path_verify(trans, iter->path);
285 }
286
287 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
288 {
289         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
290                !iter->pos.snapshot);
291
292         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
293                iter->pos.snapshot != iter->snapshot);
294
295         BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
296                bkey_gt(iter->pos, iter->k.p));
297 }
298
299 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
300 {
301         struct btree_trans *trans = iter->trans;
302         struct btree_iter copy;
303         struct bkey_s_c prev;
304         int ret = 0;
305
306         if (!bch2_debug_check_iterators)
307                 return 0;
308
309         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
310                 return 0;
311
312         if (bkey_err(k) || !k.k)
313                 return 0;
314
315         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
316                                           iter->snapshot,
317                                           k.k->p.snapshot));
318
319         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
320                              BTREE_ITER_NOPRESERVE|
321                              BTREE_ITER_ALL_SNAPSHOTS);
322         prev = bch2_btree_iter_prev(&copy);
323         if (!prev.k)
324                 goto out;
325
326         ret = bkey_err(prev);
327         if (ret)
328                 goto out;
329
330         if (bkey_eq(prev.k->p, k.k->p) &&
331             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
332                                       prev.k->p.snapshot) > 0) {
333                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
334
335                 bch2_bkey_to_text(&buf1, k.k);
336                 bch2_bkey_to_text(&buf2, prev.k);
337
338                 panic("iter snap %u\n"
339                       "k    %s\n"
340                       "prev %s\n",
341                       iter->snapshot,
342                       buf1.buf, buf2.buf);
343         }
344 out:
345         bch2_trans_iter_exit(trans, &copy);
346         return ret;
347 }
348
349 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
350                             struct bpos pos, bool key_cache)
351 {
352         struct btree_path *path;
353         unsigned idx;
354         struct printbuf buf = PRINTBUF;
355
356         trans_for_each_path_inorder(trans, path, idx) {
357                 int cmp = cmp_int(path->btree_id, id) ?:
358                         cmp_int(path->cached, key_cache);
359
360                 if (cmp > 0)
361                         break;
362                 if (cmp < 0)
363                         continue;
364
365                 if (!btree_node_locked(path, 0) ||
366                     !path->should_be_locked)
367                         continue;
368
369                 if (!key_cache) {
370                         if (bkey_ge(pos, path->l[0].b->data->min_key) &&
371                             bkey_le(pos, path->l[0].b->key.k.p))
372                                 return;
373                 } else {
374                         if (bkey_eq(pos, path->pos))
375                                 return;
376                 }
377         }
378
379         bch2_dump_trans_paths_updates(trans);
380         bch2_bpos_to_text(&buf, pos);
381
382         panic("not locked: %s %s%s\n",
383               bch2_btree_ids[id], buf.buf,
384               key_cache ? " cached" : "");
385 }
386
387 #else
388
389 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
390                                                 struct btree_path *path, unsigned l) {}
391 static inline void bch2_btree_path_verify(struct btree_trans *trans,
392                                           struct btree_path *path) {}
393 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
394 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
395 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
396
397 #endif
398
399 /* Btree path: fixups after btree updates */
400
401 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
402                                         struct btree *b,
403                                         struct bset_tree *t,
404                                         struct bkey_packed *k)
405 {
406         struct btree_node_iter_set *set;
407
408         btree_node_iter_for_each(iter, set)
409                 if (set->end == t->end_offset) {
410                         set->k = __btree_node_key_to_offset(b, k);
411                         bch2_btree_node_iter_sort(iter, b);
412                         return;
413                 }
414
415         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
416 }
417
418 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
419                                                struct btree *b,
420                                                struct bkey_packed *where)
421 {
422         struct btree_path_level *l = &path->l[b->c.level];
423
424         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
425                 return;
426
427         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
428                 bch2_btree_node_iter_advance(&l->iter, l->b);
429 }
430
431 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
432                                       struct btree *b,
433                                       struct bkey_packed *where)
434 {
435         struct btree_path *path;
436
437         trans_for_each_path_with_node(trans, b, path) {
438                 __bch2_btree_path_fix_key_modified(path, b, where);
439                 bch2_btree_path_verify_level(trans, path, b->c.level);
440         }
441 }
442
443 static void __bch2_btree_node_iter_fix(struct btree_path *path,
444                                        struct btree *b,
445                                        struct btree_node_iter *node_iter,
446                                        struct bset_tree *t,
447                                        struct bkey_packed *where,
448                                        unsigned clobber_u64s,
449                                        unsigned new_u64s)
450 {
451         const struct bkey_packed *end = btree_bkey_last(b, t);
452         struct btree_node_iter_set *set;
453         unsigned offset = __btree_node_key_to_offset(b, where);
454         int shift = new_u64s - clobber_u64s;
455         unsigned old_end = t->end_offset - shift;
456         unsigned orig_iter_pos = node_iter->data[0].k;
457         bool iter_current_key_modified =
458                 orig_iter_pos >= offset &&
459                 orig_iter_pos <= offset + clobber_u64s;
460
461         btree_node_iter_for_each(node_iter, set)
462                 if (set->end == old_end)
463                         goto found;
464
465         /* didn't find the bset in the iterator - might have to readd it: */
466         if (new_u64s &&
467             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
468                 bch2_btree_node_iter_push(node_iter, b, where, end);
469                 goto fixup_done;
470         } else {
471                 /* Iterator is after key that changed */
472                 return;
473         }
474 found:
475         set->end = t->end_offset;
476
477         /* Iterator hasn't gotten to the key that changed yet: */
478         if (set->k < offset)
479                 return;
480
481         if (new_u64s &&
482             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
483                 set->k = offset;
484         } else if (set->k < offset + clobber_u64s) {
485                 set->k = offset + new_u64s;
486                 if (set->k == set->end)
487                         bch2_btree_node_iter_set_drop(node_iter, set);
488         } else {
489                 /* Iterator is after key that changed */
490                 set->k = (int) set->k + shift;
491                 return;
492         }
493
494         bch2_btree_node_iter_sort(node_iter, b);
495 fixup_done:
496         if (node_iter->data[0].k != orig_iter_pos)
497                 iter_current_key_modified = true;
498
499         /*
500          * When a new key is added, and the node iterator now points to that
501          * key, the iterator might have skipped past deleted keys that should
502          * come after the key the iterator now points to. We have to rewind to
503          * before those deleted keys - otherwise
504          * bch2_btree_node_iter_prev_all() breaks:
505          */
506         if (!bch2_btree_node_iter_end(node_iter) &&
507             iter_current_key_modified &&
508             b->c.level) {
509                 struct bset_tree *t;
510                 struct bkey_packed *k, *k2, *p;
511
512                 k = bch2_btree_node_iter_peek_all(node_iter, b);
513
514                 for_each_bset(b, t) {
515                         bool set_pos = false;
516
517                         if (node_iter->data[0].end == t->end_offset)
518                                 continue;
519
520                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
521
522                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
523                                bkey_iter_cmp(b, k, p) < 0) {
524                                 k2 = p;
525                                 set_pos = true;
526                         }
527
528                         if (set_pos)
529                                 btree_node_iter_set_set_pos(node_iter,
530                                                             b, t, k2);
531                 }
532         }
533 }
534
535 void bch2_btree_node_iter_fix(struct btree_trans *trans,
536                               struct btree_path *path,
537                               struct btree *b,
538                               struct btree_node_iter *node_iter,
539                               struct bkey_packed *where,
540                               unsigned clobber_u64s,
541                               unsigned new_u64s)
542 {
543         struct bset_tree *t = bch2_bkey_to_bset(b, where);
544         struct btree_path *linked;
545
546         if (node_iter != &path->l[b->c.level].iter) {
547                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
548                                            where, clobber_u64s, new_u64s);
549
550                 if (bch2_debug_check_iterators)
551                         bch2_btree_node_iter_verify(node_iter, b);
552         }
553
554         trans_for_each_path_with_node(trans, b, linked) {
555                 __bch2_btree_node_iter_fix(linked, b,
556                                            &linked->l[b->c.level].iter, t,
557                                            where, clobber_u64s, new_u64s);
558                 bch2_btree_path_verify_level(trans, linked, b->c.level);
559         }
560 }
561
562 /* Btree path level: pointer to a particular btree node and node iter */
563
564 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
565                                                   struct btree_path_level *l,
566                                                   struct bkey *u,
567                                                   struct bkey_packed *k)
568 {
569         if (unlikely(!k)) {
570                 /*
571                  * signal to bch2_btree_iter_peek_slot() that we're currently at
572                  * a hole
573                  */
574                 u->type = KEY_TYPE_deleted;
575                 return bkey_s_c_null;
576         }
577
578         return bkey_disassemble(l->b, k, u);
579 }
580
581 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
582                                                         struct btree_path_level *l,
583                                                         struct bkey *u)
584 {
585         return __btree_iter_unpack(c, l, u,
586                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
587 }
588
589 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
590                                                     struct btree_path *path,
591                                                     struct btree_path_level *l,
592                                                     struct bkey *u)
593 {
594         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
595                         bch2_btree_node_iter_peek(&l->iter, l->b));
596
597         path->pos = k.k ? k.k->p : l->b->key.k.p;
598         bch2_btree_path_verify_level(trans, path, l - path->l);
599         return k;
600 }
601
602 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
603                                                     struct btree_path *path,
604                                                     struct btree_path_level *l,
605                                                     struct bkey *u)
606 {
607         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
608                         bch2_btree_node_iter_prev(&l->iter, l->b));
609
610         path->pos = k.k ? k.k->p : l->b->data->min_key;
611         bch2_btree_path_verify_level(trans, path, l - path->l);
612         return k;
613 }
614
615 static inline bool btree_path_advance_to_pos(struct btree_path *path,
616                                              struct btree_path_level *l,
617                                              int max_advance)
618 {
619         struct bkey_packed *k;
620         int nr_advanced = 0;
621
622         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
623                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
624                 if (max_advance > 0 && nr_advanced >= max_advance)
625                         return false;
626
627                 bch2_btree_node_iter_advance(&l->iter, l->b);
628                 nr_advanced++;
629         }
630
631         return true;
632 }
633
634 static inline void __btree_path_level_init(struct btree_path *path,
635                                            unsigned level)
636 {
637         struct btree_path_level *l = &path->l[level];
638
639         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
640
641         /*
642          * Iterators to interior nodes should always be pointed at the first non
643          * whiteout:
644          */
645         if (level)
646                 bch2_btree_node_iter_peek(&l->iter, l->b);
647 }
648
649 void bch2_btree_path_level_init(struct btree_trans *trans,
650                                 struct btree_path *path,
651                                 struct btree *b)
652 {
653         BUG_ON(path->cached);
654
655         EBUG_ON(!btree_path_pos_in_node(path, b));
656         EBUG_ON(b->c.lock.state.seq & 1);
657
658         path->l[b->c.level].lock_seq = b->c.lock.state.seq;
659         path->l[b->c.level].b = b;
660         __btree_path_level_init(path, b->c.level);
661 }
662
663 /* Btree path: fixups after btree node updates: */
664
665 static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
666 {
667         struct bch_fs *c = trans->c;
668         struct btree_insert_entry *i;
669
670         trans_for_each_update(trans, i)
671                 if (!i->cached &&
672                     i->level    == b->c.level &&
673                     i->btree_id == b->c.btree_id &&
674                     bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
675                     bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
676                         i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
677
678                         if (unlikely(trans->journal_replay_not_finished)) {
679                                 struct bkey_i *j_k =
680                                         bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
681                                                                     i->k->k.p);
682
683                                 if (j_k) {
684                                         i->old_k = j_k->k;
685                                         i->old_v = &j_k->v;
686                                 }
687                         }
688                 }
689 }
690
691 /*
692  * A btree node is being replaced - update the iterator to point to the new
693  * node:
694  */
695 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
696 {
697         struct btree_path *path;
698
699         trans_for_each_path(trans, path)
700                 if (path->uptodate == BTREE_ITER_UPTODATE &&
701                     !path->cached &&
702                     btree_path_pos_in_node(path, b)) {
703                         enum btree_node_locked_type t =
704                                 btree_lock_want(path, b->c.level);
705
706                         if (t != BTREE_NODE_UNLOCKED) {
707                                 btree_node_unlock(trans, path, b->c.level);
708                                 six_lock_increment(&b->c.lock, t);
709                                 mark_btree_node_locked(trans, path, b->c.level, t);
710                         }
711
712                         bch2_btree_path_level_init(trans, path, b);
713                 }
714
715         bch2_trans_revalidate_updates_in_node(trans, b);
716 }
717
718 /*
719  * A btree node has been modified in such a way as to invalidate iterators - fix
720  * them:
721  */
722 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
723 {
724         struct btree_path *path;
725
726         trans_for_each_path_with_node(trans, b, path)
727                 __btree_path_level_init(path, b->c.level);
728
729         bch2_trans_revalidate_updates_in_node(trans, b);
730 }
731
732 /* Btree path: traverse, set_pos: */
733
734 static inline int btree_path_lock_root(struct btree_trans *trans,
735                                        struct btree_path *path,
736                                        unsigned depth_want,
737                                        unsigned long trace_ip)
738 {
739         struct bch_fs *c = trans->c;
740         struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
741         enum six_lock_type lock_type;
742         unsigned i;
743         int ret;
744
745         EBUG_ON(path->nodes_locked);
746
747         while (1) {
748                 b = READ_ONCE(*rootp);
749                 path->level = READ_ONCE(b->c.level);
750
751                 if (unlikely(path->level < depth_want)) {
752                         /*
753                          * the root is at a lower depth than the depth we want:
754                          * got to the end of the btree, or we're walking nodes
755                          * greater than some depth and there are no nodes >=
756                          * that depth
757                          */
758                         path->level = depth_want;
759                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
760                                 path->l[i].b = NULL;
761                         return 1;
762                 }
763
764                 lock_type = __btree_lock_want(path, path->level);
765                 ret = btree_node_lock(trans, path, &b->c,
766                                       path->level, lock_type, trace_ip);
767                 if (unlikely(ret)) {
768                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
769                                 continue;
770                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
771                                 return ret;
772                         BUG();
773                 }
774
775                 if (likely(b == READ_ONCE(*rootp) &&
776                            b->c.level == path->level &&
777                            !race_fault())) {
778                         for (i = 0; i < path->level; i++)
779                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
780                         path->l[path->level].b = b;
781                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
782                                 path->l[i].b = NULL;
783
784                         mark_btree_node_locked(trans, path, path->level, lock_type);
785                         bch2_btree_path_level_init(trans, path, b);
786                         return 0;
787                 }
788
789                 six_unlock_type(&b->c.lock, lock_type);
790         }
791 }
792
793 noinline
794 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
795 {
796         struct bch_fs *c = trans->c;
797         struct btree_path_level *l = path_l(path);
798         struct btree_node_iter node_iter = l->iter;
799         struct bkey_packed *k;
800         struct bkey_buf tmp;
801         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
802                 ? (path->level > 1 ? 0 :  2)
803                 : (path->level > 1 ? 1 : 16);
804         bool was_locked = btree_node_locked(path, path->level);
805         int ret = 0;
806
807         bch2_bkey_buf_init(&tmp);
808
809         while (nr-- && !ret) {
810                 if (!bch2_btree_node_relock(trans, path, path->level))
811                         break;
812
813                 bch2_btree_node_iter_advance(&node_iter, l->b);
814                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
815                 if (!k)
816                         break;
817
818                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
819                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
820                                                path->level - 1);
821         }
822
823         if (!was_locked)
824                 btree_node_unlock(trans, path, path->level);
825
826         bch2_bkey_buf_exit(&tmp, c);
827         return ret;
828 }
829
830 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
831                                  struct btree_and_journal_iter *jiter)
832 {
833         struct bch_fs *c = trans->c;
834         struct bkey_s_c k;
835         struct bkey_buf tmp;
836         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
837                 ? (path->level > 1 ? 0 :  2)
838                 : (path->level > 1 ? 1 : 16);
839         bool was_locked = btree_node_locked(path, path->level);
840         int ret = 0;
841
842         bch2_bkey_buf_init(&tmp);
843
844         while (nr-- && !ret) {
845                 if (!bch2_btree_node_relock(trans, path, path->level))
846                         break;
847
848                 bch2_btree_and_journal_iter_advance(jiter);
849                 k = bch2_btree_and_journal_iter_peek(jiter);
850                 if (!k.k)
851                         break;
852
853                 bch2_bkey_buf_reassemble(&tmp, c, k);
854                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
855                                                path->level - 1);
856         }
857
858         if (!was_locked)
859                 btree_node_unlock(trans, path, path->level);
860
861         bch2_bkey_buf_exit(&tmp, c);
862         return ret;
863 }
864
865 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
866                                             struct btree_path *path,
867                                             unsigned plevel, struct btree *b)
868 {
869         struct btree_path_level *l = &path->l[plevel];
870         bool locked = btree_node_locked(path, plevel);
871         struct bkey_packed *k;
872         struct bch_btree_ptr_v2 *bp;
873
874         if (!bch2_btree_node_relock(trans, path, plevel))
875                 return;
876
877         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
878         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
879
880         bp = (void *) bkeyp_val(&l->b->format, k);
881         bp->mem_ptr = (unsigned long)b;
882
883         if (!locked)
884                 btree_node_unlock(trans, path, plevel);
885 }
886
887 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
888                                                      struct btree_path *path,
889                                                      unsigned flags,
890                                                      struct bkey_buf *out)
891 {
892         struct bch_fs *c = trans->c;
893         struct btree_path_level *l = path_l(path);
894         struct btree_and_journal_iter jiter;
895         struct bkey_s_c k;
896         int ret = 0;
897
898         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
899
900         k = bch2_btree_and_journal_iter_peek(&jiter);
901
902         bch2_bkey_buf_reassemble(out, c, k);
903
904         if (flags & BTREE_ITER_PREFETCH)
905                 ret = btree_path_prefetch_j(trans, path, &jiter);
906
907         bch2_btree_and_journal_iter_exit(&jiter);
908         return ret;
909 }
910
911 static __always_inline int btree_path_down(struct btree_trans *trans,
912                                            struct btree_path *path,
913                                            unsigned flags,
914                                            unsigned long trace_ip)
915 {
916         struct bch_fs *c = trans->c;
917         struct btree_path_level *l = path_l(path);
918         struct btree *b;
919         unsigned level = path->level - 1;
920         enum six_lock_type lock_type = __btree_lock_want(path, level);
921         struct bkey_buf tmp;
922         int ret;
923
924         EBUG_ON(!btree_node_locked(path, path->level));
925
926         bch2_bkey_buf_init(&tmp);
927
928         if (unlikely(trans->journal_replay_not_finished)) {
929                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
930                 if (ret)
931                         goto err;
932         } else {
933                 bch2_bkey_buf_unpack(&tmp, c, l->b,
934                                  bch2_btree_node_iter_peek(&l->iter, l->b));
935
936                 if (flags & BTREE_ITER_PREFETCH) {
937                         ret = btree_path_prefetch(trans, path);
938                         if (ret)
939                                 goto err;
940                 }
941         }
942
943         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
944         ret = PTR_ERR_OR_ZERO(b);
945         if (unlikely(ret))
946                 goto err;
947
948         if (likely(!trans->journal_replay_not_finished &&
949                    tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
950             unlikely(b != btree_node_mem_ptr(tmp.k)))
951                 btree_node_mem_ptr_set(trans, path, level + 1, b);
952
953         if (btree_node_read_locked(path, level + 1))
954                 btree_node_unlock(trans, path, level + 1);
955
956         mark_btree_node_locked(trans, path, level, lock_type);
957         path->level = level;
958         bch2_btree_path_level_init(trans, path, b);
959
960         bch2_btree_path_verify_locks(path);
961 err:
962         bch2_bkey_buf_exit(&tmp, c);
963         return ret;
964 }
965
966 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
967                                    unsigned, unsigned long);
968
969 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
970 {
971         struct bch_fs *c = trans->c;
972         struct btree_path *path;
973         unsigned long trace_ip = _RET_IP_;
974         int ret = 0;
975
976         if (trans->in_traverse_all)
977                 return -BCH_ERR_transaction_restart_in_traverse_all;
978
979         trans->in_traverse_all = true;
980 retry_all:
981         trans->restarted = 0;
982         trans->traverse_all_idx = U8_MAX;
983
984         trans_for_each_path(trans, path)
985                 path->should_be_locked = false;
986
987         btree_trans_verify_sorted(trans);
988
989         bch2_trans_unlock(trans);
990         cond_resched();
991
992         if (unlikely(trans->memory_allocation_failure)) {
993                 struct closure cl;
994
995                 closure_init_stack(&cl);
996
997                 do {
998                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
999                         closure_sync(&cl);
1000                 } while (ret);
1001         }
1002
1003         /* Now, redo traversals in correct order: */
1004         trans->traverse_all_idx = 0;
1005         while (trans->traverse_all_idx < trans->nr_sorted) {
1006                 path = trans->paths + trans->sorted[trans->traverse_all_idx];
1007
1008                 /*
1009                  * Traversing a path can cause another path to be added at about
1010                  * the same position:
1011                  */
1012                 if (path->uptodate) {
1013                         ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
1014                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1015                             ret == -ENOMEM)
1016                                 goto retry_all;
1017                         if (ret)
1018                                 goto err;
1019                         BUG_ON(path->uptodate);
1020                 } else {
1021                         trans->traverse_all_idx++;
1022                 }
1023         }
1024
1025         /*
1026          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
1027          * and relock(), relock() won't relock since path->should_be_locked
1028          * isn't set yet, which is all fine
1029          */
1030         trans_for_each_path(trans, path)
1031                 BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
1032 err:
1033         bch2_btree_cache_cannibalize_unlock(c);
1034
1035         trans->in_traverse_all = false;
1036
1037         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1038         return ret;
1039 }
1040
1041 static inline bool btree_path_check_pos_in_node(struct btree_path *path,
1042                                                 unsigned l, int check_pos)
1043 {
1044         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1045                 return false;
1046         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1047                 return false;
1048         return true;
1049 }
1050
1051 static inline bool btree_path_good_node(struct btree_trans *trans,
1052                                         struct btree_path *path,
1053                                         unsigned l, int check_pos)
1054 {
1055         return is_btree_node(path, l) &&
1056                 bch2_btree_node_relock(trans, path, l) &&
1057                 btree_path_check_pos_in_node(path, l, check_pos);
1058 }
1059
1060 static void btree_path_set_level_down(struct btree_trans *trans,
1061                                       struct btree_path *path,
1062                                       unsigned new_level)
1063 {
1064         unsigned l;
1065
1066         path->level = new_level;
1067
1068         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1069                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1070                         btree_node_unlock(trans, path, l);
1071
1072         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1073         bch2_btree_path_verify(trans, path);
1074 }
1075
1076 static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
1077                                                          struct btree_path *path,
1078                                                          int check_pos)
1079 {
1080         unsigned i, l = path->level;
1081 again:
1082         while (btree_path_node(path, l) &&
1083                !btree_path_good_node(trans, path, l, check_pos))
1084                 __btree_path_set_level_up(trans, path, l++);
1085
1086         /* If we need intent locks, take them too: */
1087         for (i = l + 1;
1088              i < path->locks_want && btree_path_node(path, i);
1089              i++)
1090                 if (!bch2_btree_node_relock(trans, path, i)) {
1091                         while (l <= i)
1092                                 __btree_path_set_level_up(trans, path, l++);
1093                         goto again;
1094                 }
1095
1096         return l;
1097 }
1098
1099 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1100                                                      struct btree_path *path,
1101                                                      int check_pos)
1102 {
1103         return likely(btree_node_locked(path, path->level) &&
1104                       btree_path_check_pos_in_node(path, path->level, check_pos))
1105                 ? path->level
1106                 : __btree_path_up_until_good_node(trans, path, check_pos);
1107 }
1108
1109 /*
1110  * This is the main state machine for walking down the btree - walks down to a
1111  * specified depth
1112  *
1113  * Returns 0 on success, -EIO on error (error reading in a btree node).
1114  *
1115  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1116  * stashed in the iterator and returned from bch2_trans_exit().
1117  */
1118 static int btree_path_traverse_one(struct btree_trans *trans,
1119                                    struct btree_path *path,
1120                                    unsigned flags,
1121                                    unsigned long trace_ip)
1122 {
1123         unsigned depth_want = path->level;
1124         int ret = -((int) trans->restarted);
1125
1126         if (unlikely(ret))
1127                 goto out;
1128
1129         /*
1130          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1131          * and re-traverse the path without a transaction restart:
1132          */
1133         if (path->should_be_locked) {
1134                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1135                 goto out;
1136         }
1137
1138         if (path->cached) {
1139                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1140                 goto out;
1141         }
1142
1143         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1144                 goto out;
1145
1146         path->level = btree_path_up_until_good_node(trans, path, 0);
1147
1148         EBUG_ON(btree_path_node(path, path->level) &&
1149                 !btree_node_locked(path, path->level));
1150
1151         /*
1152          * Note: path->nodes[path->level] may be temporarily NULL here - that
1153          * would indicate to other code that we got to the end of the btree,
1154          * here it indicates that relocking the root failed - it's critical that
1155          * btree_path_lock_root() comes next and that it can't fail
1156          */
1157         while (path->level > depth_want) {
1158                 ret = btree_path_node(path, path->level)
1159                         ? btree_path_down(trans, path, flags, trace_ip)
1160                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1161                 if (unlikely(ret)) {
1162                         if (ret == 1) {
1163                                 /*
1164                                  * No nodes at this level - got to the end of
1165                                  * the btree:
1166                                  */
1167                                 ret = 0;
1168                                 goto out;
1169                         }
1170
1171                         __bch2_btree_path_unlock(trans, path);
1172                         path->level = depth_want;
1173                         path->l[path->level].b = ERR_PTR(ret);
1174                         goto out;
1175                 }
1176         }
1177
1178         path->uptodate = BTREE_ITER_UPTODATE;
1179 out:
1180         BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
1181         bch2_btree_path_verify(trans, path);
1182         return ret;
1183 }
1184
1185 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
1186                                           struct btree_path *path, unsigned flags)
1187 {
1188         if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1189                 unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
1190                 u64 mask = ~(~0ULL << restart_probability_bits);
1191
1192                 if ((prandom_u32() & mask) == mask) {
1193                         trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
1194                         return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
1195                 }
1196         }
1197
1198         if (path->uptodate < BTREE_ITER_NEED_RELOCK)
1199                 return 0;
1200
1201         return  bch2_trans_cond_resched(trans) ?:
1202                 btree_path_traverse_one(trans, path, flags, _RET_IP_);
1203 }
1204
1205 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1206                             struct btree_path *src)
1207 {
1208         unsigned i, offset = offsetof(struct btree_path, pos);
1209
1210         memcpy((void *) dst + offset,
1211                (void *) src + offset,
1212                sizeof(struct btree_path) - offset);
1213
1214         for (i = 0; i < BTREE_MAX_DEPTH; i++) {
1215                 unsigned t = btree_node_locked_type(dst, i);
1216
1217                 if (t != BTREE_NODE_UNLOCKED)
1218                         six_lock_increment(&dst->l[i].b->c.lock, t);
1219         }
1220 }
1221
1222 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1223                                            bool intent)
1224 {
1225         struct btree_path *new = btree_path_alloc(trans, src);
1226
1227         btree_path_copy(trans, new, src);
1228         __btree_path_get(new, intent);
1229         return new;
1230 }
1231
1232 __flatten
1233 struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
1234                          struct btree_path *path, bool intent,
1235                          unsigned long ip)
1236 {
1237         __btree_path_put(path, intent);
1238         path = btree_path_clone(trans, path, intent);
1239         path->preserve = false;
1240 #ifdef CONFIG_BCACHEFS_DEBUG
1241         path->ip_allocated = ip;
1242 #endif
1243         btree_trans_verify_sorted(trans);
1244         return path;
1245 }
1246
1247 struct btree_path * __must_check
1248 __bch2_btree_path_set_pos(struct btree_trans *trans,
1249                    struct btree_path *path, struct bpos new_pos,
1250                    bool intent, unsigned long ip, int cmp)
1251 {
1252         unsigned level = path->level;
1253
1254         EBUG_ON(trans->restarted);
1255         EBUG_ON(!path->ref);
1256
1257         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1258
1259         path->pos = new_pos;
1260
1261         bch2_btree_path_check_sort_fast(trans, path, cmp);
1262
1263         if (unlikely(path->cached)) {
1264                 btree_node_unlock(trans, path, 0);
1265                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1266                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1267                 goto out;
1268         }
1269
1270         level = btree_path_up_until_good_node(trans, path, cmp);
1271
1272         if (btree_path_node(path, level)) {
1273                 struct btree_path_level *l = &path->l[level];
1274
1275                 BUG_ON(!btree_node_locked(path, level));
1276                 /*
1277                  * We might have to skip over many keys, or just a few: try
1278                  * advancing the node iterator, and if we have to skip over too
1279                  * many keys just reinit it (or if we're rewinding, since that
1280                  * is expensive).
1281                  */
1282                 if (cmp < 0 ||
1283                     !btree_path_advance_to_pos(path, l, 8))
1284                         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
1285
1286                 /*
1287                  * Iterators to interior nodes should always be pointed at the first non
1288                  * whiteout:
1289                  */
1290                 if (unlikely(level))
1291                         bch2_btree_node_iter_peek(&l->iter, l->b);
1292         }
1293
1294         if (unlikely(level != path->level)) {
1295                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1296                 __bch2_btree_path_unlock(trans, path);
1297         }
1298 out:
1299         bch2_btree_path_verify(trans, path);
1300         return path;
1301 }
1302
1303 /* Btree path: main interface: */
1304
1305 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1306 {
1307         struct btree_path *sib;
1308
1309         sib = prev_btree_path(trans, path);
1310         if (sib && !btree_path_cmp(sib, path))
1311                 return sib;
1312
1313         sib = next_btree_path(trans, path);
1314         if (sib && !btree_path_cmp(sib, path))
1315                 return sib;
1316
1317         return NULL;
1318 }
1319
1320 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1321 {
1322         struct btree_path *sib;
1323
1324         sib = prev_btree_path(trans, path);
1325         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1326                 return sib;
1327
1328         sib = next_btree_path(trans, path);
1329         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1330                 return sib;
1331
1332         return NULL;
1333 }
1334
1335 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1336 {
1337         __bch2_btree_path_unlock(trans, path);
1338         btree_path_list_remove(trans, path);
1339         trans->paths_allocated &= ~(1ULL << path->idx);
1340 }
1341
1342 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1343 {
1344         struct btree_path *dup;
1345
1346         EBUG_ON(trans->paths + path->idx != path);
1347         EBUG_ON(!path->ref);
1348
1349         if (!__btree_path_put(path, intent))
1350                 return;
1351
1352         dup = path->preserve
1353                 ? have_path_at_pos(trans, path)
1354                 : have_node_at_pos(trans, path);
1355
1356         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1357                 return;
1358
1359         if (path->should_be_locked &&
1360             !trans->restarted &&
1361             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1362                 return;
1363
1364         if (dup) {
1365                 dup->preserve           |= path->preserve;
1366                 dup->should_be_locked   |= path->should_be_locked;
1367         }
1368
1369         __bch2_path_free(trans, path);
1370 }
1371
1372 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1373                                  bool intent)
1374 {
1375         EBUG_ON(trans->paths + path->idx != path);
1376         EBUG_ON(!path->ref);
1377
1378         if (!__btree_path_put(path, intent))
1379                 return;
1380
1381         __bch2_path_free(trans, path);
1382 }
1383
1384 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1385 {
1386         struct btree_insert_entry *i;
1387
1388         prt_printf(buf, "transaction updates for %s journal seq %llu",
1389                trans->fn, trans->journal_res.seq);
1390         prt_newline(buf);
1391         printbuf_indent_add(buf, 2);
1392
1393         trans_for_each_update(trans, i) {
1394                 struct bkey_s_c old = { &i->old_k, i->old_v };
1395
1396                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1397                        bch2_btree_ids[i->btree_id],
1398                        i->cached,
1399                        (void *) i->ip_allocated);
1400                 prt_newline(buf);
1401
1402                 prt_printf(buf, "  old ");
1403                 bch2_bkey_val_to_text(buf, trans->c, old);
1404                 prt_newline(buf);
1405
1406                 prt_printf(buf, "  new ");
1407                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1408                 prt_newline(buf);
1409         }
1410
1411         printbuf_indent_sub(buf, 2);
1412 }
1413
1414 noinline __cold
1415 void bch2_dump_trans_updates(struct btree_trans *trans)
1416 {
1417         struct printbuf buf = PRINTBUF;
1418
1419         bch2_trans_updates_to_text(&buf, trans);
1420         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1421         printbuf_exit(&buf);
1422 }
1423
1424 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1425 {
1426         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1427                    path->idx, path->ref, path->intent_ref,
1428                    path->preserve ? 'P' : ' ',
1429                    path->should_be_locked ? 'S' : ' ',
1430                    bch2_btree_ids[path->btree_id],
1431                    path->level);
1432         bch2_bpos_to_text(out, path->pos);
1433
1434         prt_printf(out, " locks %u", path->nodes_locked);
1435 #ifdef CONFIG_BCACHEFS_DEBUG
1436         prt_printf(out, " %pS", (void *) path->ip_allocated);
1437 #endif
1438         prt_newline(out);
1439 }
1440
1441 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1442 {
1443         struct btree_path *path;
1444         unsigned idx;
1445
1446         trans_for_each_path_inorder(trans, path, idx)
1447                 bch2_btree_path_to_text(out, path);
1448 }
1449
1450 noinline __cold
1451 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1452 {
1453         struct printbuf buf = PRINTBUF;
1454
1455         bch2_trans_paths_to_text(&buf, trans);
1456         bch2_trans_updates_to_text(&buf, trans);
1457
1458         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1459         printbuf_exit(&buf);
1460 }
1461
1462 noinline
1463 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1464 {
1465         struct btree_transaction_stats *s = btree_trans_stats(trans);
1466         struct printbuf buf = PRINTBUF;
1467
1468         bch2_trans_paths_to_text(&buf, trans);
1469
1470         if (!buf.allocation_failure) {
1471                 mutex_lock(&s->lock);
1472                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1473                         s->nr_max_paths = trans->nr_max_paths =
1474                                 hweight64(trans->paths_allocated);
1475                         swap(s->max_paths_text, buf.buf);
1476                 }
1477                 mutex_unlock(&s->lock);
1478         }
1479
1480         printbuf_exit(&buf);
1481 }
1482
1483 static noinline void btree_path_overflow(struct btree_trans *trans)
1484 {
1485         bch2_dump_trans_paths_updates(trans);
1486         panic("trans path oveflow\n");
1487 }
1488
1489 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1490                                                   struct btree_path *pos)
1491 {
1492         struct btree_path *path;
1493         unsigned idx;
1494
1495         if (unlikely(trans->paths_allocated ==
1496                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1497                 btree_path_overflow(trans);
1498
1499         idx = __ffs64(~trans->paths_allocated);
1500         trans->paths_allocated |= 1ULL << idx;
1501
1502         if (unlikely(idx > trans->nr_max_paths))
1503                 bch2_trans_update_max_paths(trans);
1504
1505         path = &trans->paths[idx];
1506
1507         path->idx               = idx;
1508         path->ref               = 0;
1509         path->intent_ref        = 0;
1510         path->nodes_locked      = 0;
1511
1512         btree_path_list_add(trans, pos, path);
1513         return path;
1514 }
1515
1516 struct btree_path *bch2_path_get(struct btree_trans *trans,
1517                                  enum btree_id btree_id, struct bpos pos,
1518                                  unsigned locks_want, unsigned level,
1519                                  unsigned flags, unsigned long ip)
1520 {
1521         struct btree_path *path, *path_pos = NULL;
1522         bool cached = flags & BTREE_ITER_CACHED;
1523         bool intent = flags & BTREE_ITER_INTENT;
1524         int i;
1525
1526         EBUG_ON(trans->restarted);
1527         btree_trans_verify_sorted(trans);
1528         bch2_trans_verify_locks(trans);
1529
1530         trans_for_each_path_inorder(trans, path, i) {
1531                 if (__btree_path_cmp(path,
1532                                      btree_id,
1533                                      cached,
1534                                      pos,
1535                                      level) > 0)
1536                         break;
1537
1538                 path_pos = path;
1539         }
1540
1541         if (path_pos &&
1542             path_pos->cached    == cached &&
1543             path_pos->btree_id  == btree_id &&
1544             path_pos->level     == level) {
1545                 __btree_path_get(path_pos, intent);
1546                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1547         } else {
1548                 path = btree_path_alloc(trans, path_pos);
1549                 path_pos = NULL;
1550
1551                 __btree_path_get(path, intent);
1552                 path->pos                       = pos;
1553                 path->btree_id                  = btree_id;
1554                 path->cached                    = cached;
1555                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1556                 path->should_be_locked          = false;
1557                 path->level                     = level;
1558                 path->locks_want                = locks_want;
1559                 path->nodes_locked              = 0;
1560                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1561                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1562 #ifdef CONFIG_BCACHEFS_DEBUG
1563                 path->ip_allocated              = ip;
1564 #endif
1565                 btree_trans_verify_sorted(trans);
1566         }
1567
1568         if (!(flags & BTREE_ITER_NOPRESERVE))
1569                 path->preserve = true;
1570
1571         if (path->intent_ref)
1572                 locks_want = max(locks_want, level + 1);
1573
1574         /*
1575          * If the path has locks_want greater than requested, we don't downgrade
1576          * it here - on transaction restart because btree node split needs to
1577          * upgrade locks, we might be putting/getting the iterator again.
1578          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1579          * a successful transaction commit.
1580          */
1581
1582         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1583         if (locks_want > path->locks_want)
1584                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1585
1586         return path;
1587 }
1588
1589 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1590 {
1591
1592         struct btree_path_level *l = path_l(path);
1593         struct bkey_packed *_k;
1594         struct bkey_s_c k;
1595
1596         if (unlikely(!l->b))
1597                 return bkey_s_c_null;
1598
1599         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1600         EBUG_ON(!btree_node_locked(path, path->level));
1601
1602         if (!path->cached) {
1603                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1604                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1605
1606                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1607
1608                 if (!k.k || !bpos_eq(path->pos, k.k->p))
1609                         goto hole;
1610         } else {
1611                 struct bkey_cached *ck = (void *) path->l[0].b;
1612
1613                 EBUG_ON(ck &&
1614                         (path->btree_id != ck->key.btree_id ||
1615                          !bkey_eq(path->pos, ck->key.pos)));
1616                 EBUG_ON(!ck || !ck->valid);
1617
1618                 *u = ck->k->k;
1619                 k = bkey_i_to_s_c(ck->k);
1620         }
1621
1622         return k;
1623 hole:
1624         bkey_init(u);
1625         u->p = path->pos;
1626         return (struct bkey_s_c) { u, NULL };
1627 }
1628
1629 /* Btree iterators: */
1630
1631 int __must_check
1632 __bch2_btree_iter_traverse(struct btree_iter *iter)
1633 {
1634         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1635 }
1636
1637 int __must_check
1638 bch2_btree_iter_traverse(struct btree_iter *iter)
1639 {
1640         int ret;
1641
1642         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1643                                         btree_iter_search_key(iter),
1644                                         iter->flags & BTREE_ITER_INTENT,
1645                                         btree_iter_ip_allocated(iter));
1646
1647         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1648         if (ret)
1649                 return ret;
1650
1651         btree_path_set_should_be_locked(iter->path);
1652         return 0;
1653 }
1654
1655 /* Iterate across nodes (leaf and interior nodes) */
1656
1657 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1658 {
1659         struct btree_trans *trans = iter->trans;
1660         struct btree *b = NULL;
1661         int ret;
1662
1663         EBUG_ON(iter->path->cached);
1664         bch2_btree_iter_verify(iter);
1665
1666         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1667         if (ret)
1668                 goto err;
1669
1670         b = btree_path_node(iter->path, iter->path->level);
1671         if (!b)
1672                 goto out;
1673
1674         BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1675
1676         bkey_init(&iter->k);
1677         iter->k.p = iter->pos = b->key.k.p;
1678
1679         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1680                                         iter->flags & BTREE_ITER_INTENT,
1681                                         btree_iter_ip_allocated(iter));
1682         btree_path_set_should_be_locked(iter->path);
1683 out:
1684         bch2_btree_iter_verify_entry_exit(iter);
1685         bch2_btree_iter_verify(iter);
1686
1687         return b;
1688 err:
1689         b = ERR_PTR(ret);
1690         goto out;
1691 }
1692
1693 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1694 {
1695         struct btree_trans *trans = iter->trans;
1696         struct btree_path *path = iter->path;
1697         struct btree *b = NULL;
1698         int ret;
1699
1700         BUG_ON(trans->restarted);
1701         EBUG_ON(iter->path->cached);
1702         bch2_btree_iter_verify(iter);
1703
1704         /* already at end? */
1705         if (!btree_path_node(path, path->level))
1706                 return NULL;
1707
1708         /* got to end? */
1709         if (!btree_path_node(path, path->level + 1)) {
1710                 btree_path_set_level_up(trans, path);
1711                 return NULL;
1712         }
1713
1714         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1715                 __bch2_btree_path_unlock(trans, path);
1716                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1717                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1718                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1719                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1720                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1721                 goto err;
1722         }
1723
1724         b = btree_path_node(path, path->level + 1);
1725
1726         if (bpos_eq(iter->pos, b->key.k.p)) {
1727                 __btree_path_set_level_up(trans, path, path->level++);
1728         } else {
1729                 /*
1730                  * Haven't gotten to the end of the parent node: go back down to
1731                  * the next child node
1732                  */
1733                 path = iter->path =
1734                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1735                                            iter->flags & BTREE_ITER_INTENT,
1736                                            btree_iter_ip_allocated(iter));
1737
1738                 btree_path_set_level_down(trans, path, iter->min_depth);
1739
1740                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1741                 if (ret)
1742                         goto err;
1743
1744                 b = path->l[path->level].b;
1745         }
1746
1747         bkey_init(&iter->k);
1748         iter->k.p = iter->pos = b->key.k.p;
1749
1750         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1751                                         iter->flags & BTREE_ITER_INTENT,
1752                                         btree_iter_ip_allocated(iter));
1753         btree_path_set_should_be_locked(iter->path);
1754         BUG_ON(iter->path->uptodate);
1755 out:
1756         bch2_btree_iter_verify_entry_exit(iter);
1757         bch2_btree_iter_verify(iter);
1758
1759         return b;
1760 err:
1761         b = ERR_PTR(ret);
1762         goto out;
1763 }
1764
1765 /* Iterate across keys (in leaf nodes only) */
1766
1767 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1768 {
1769         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1770                 struct bpos pos = iter->k.p;
1771                 bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1772                              ? bpos_eq(pos, SPOS_MAX)
1773                              : bkey_eq(pos, SPOS_MAX));
1774
1775                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1776                         pos = bkey_successor(iter, pos);
1777                 bch2_btree_iter_set_pos(iter, pos);
1778                 return ret;
1779         } else {
1780                 if (!btree_path_node(iter->path, iter->path->level))
1781                         return true;
1782
1783                 iter->advanced = true;
1784                 return false;
1785         }
1786 }
1787
1788 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1789 {
1790         struct bpos pos = bkey_start_pos(&iter->k);
1791         bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1792                      ? bpos_eq(pos, POS_MIN)
1793                      : bkey_eq(pos, POS_MIN));
1794
1795         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1796                 pos = bkey_predecessor(iter, pos);
1797         bch2_btree_iter_set_pos(iter, pos);
1798         return ret;
1799 }
1800
1801 static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1802                                                       enum btree_id btree_id,
1803                                                       struct bpos pos)
1804 {
1805         struct btree_insert_entry *i;
1806         struct bkey_i *ret = NULL;
1807
1808         trans_for_each_update(trans, i) {
1809                 if (i->btree_id < btree_id)
1810                         continue;
1811                 if (i->btree_id > btree_id)
1812                         break;
1813                 if (bpos_lt(i->k->k.p, pos))
1814                         continue;
1815                 if (i->key_cache_already_flushed)
1816                         continue;
1817                 if (!ret || bpos_lt(i->k->k.p, ret->k.p))
1818                         ret = i->k;
1819         }
1820
1821         return ret;
1822 }
1823
1824 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1825                                        struct btree_iter *iter,
1826                                        struct bpos start_pos,
1827                                        struct bpos end_pos)
1828 {
1829         struct bkey_i *k;
1830
1831         if (bpos_lt(start_pos, iter->journal_pos))
1832                 iter->journal_idx = 0;
1833
1834         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
1835                                         iter->path->level,
1836                                         start_pos, end_pos,
1837                                         &iter->journal_idx);
1838
1839         iter->journal_pos = k ? k->k.p : end_pos;
1840         return k;
1841 }
1842
1843 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
1844                                             struct btree_iter *iter,
1845                                             struct bpos pos)
1846 {
1847         return bch2_btree_journal_peek(trans, iter, pos, pos);
1848 }
1849
1850 static noinline
1851 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1852                                          struct btree_iter *iter,
1853                                          struct bkey_s_c k)
1854 {
1855         struct bkey_i *next_journal =
1856                 bch2_btree_journal_peek(trans, iter, iter->path->pos,
1857                                 k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
1858
1859         if (next_journal) {
1860                 iter->k = next_journal->k;
1861                 k = bkey_i_to_s_c(next_journal);
1862         }
1863
1864         return k;
1865 }
1866
1867 /*
1868  * Checks btree key cache for key at iter->pos and returns it if present, or
1869  * bkey_s_c_null:
1870  */
1871 static noinline
1872 struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1873 {
1874         struct btree_trans *trans = iter->trans;
1875         struct bch_fs *c = trans->c;
1876         struct bkey u;
1877         int ret;
1878
1879         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1880                 return bkey_s_c_null;
1881
1882         if (!iter->key_cache_path)
1883                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1884                                                      iter->flags & BTREE_ITER_INTENT, 0,
1885                                                      iter->flags|BTREE_ITER_CACHED,
1886                                                      _THIS_IP_);
1887
1888         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1889                                         iter->flags & BTREE_ITER_INTENT,
1890                                         btree_iter_ip_allocated(iter));
1891
1892         ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
1893         if (unlikely(ret))
1894                 return bkey_s_c_err(ret);
1895
1896         btree_path_set_should_be_locked(iter->key_cache_path);
1897
1898         return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1899 }
1900
1901 static noinline
1902 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1903 {
1904         struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
1905         int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
1906
1907         return err ? bkey_s_c_err(err) : ret;
1908 }
1909
1910 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1911 {
1912         struct btree_trans *trans = iter->trans;
1913         struct bkey_i *next_update;
1914         struct bkey_s_c k, k2;
1915         int ret;
1916
1917         EBUG_ON(iter->path->cached);
1918         bch2_btree_iter_verify(iter);
1919
1920         while (1) {
1921                 struct btree_path_level *l;
1922
1923                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1924                                         iter->flags & BTREE_ITER_INTENT,
1925                                         btree_iter_ip_allocated(iter));
1926
1927                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1928                 if (unlikely(ret)) {
1929                         /* ensure that iter->k is consistent with iter->pos: */
1930                         bch2_btree_iter_set_pos(iter, iter->pos);
1931                         k = bkey_s_c_err(ret);
1932                         goto out;
1933                 }
1934
1935                 l = path_l(iter->path);
1936
1937                 if (unlikely(!l->b)) {
1938                         /* No btree nodes at requested level: */
1939                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1940                         k = bkey_s_c_null;
1941                         goto out;
1942                 }
1943
1944                 btree_path_set_should_be_locked(iter->path);
1945
1946                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
1947
1948                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1949                     k.k &&
1950                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1951                         k = k2;
1952                         ret = bkey_err(k);
1953                         if (ret) {
1954                                 bch2_btree_iter_set_pos(iter, iter->pos);
1955                                 goto out;
1956                         }
1957                 }
1958
1959                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
1960                         k = btree_trans_peek_journal(trans, iter, k);
1961
1962                 next_update = iter->flags & BTREE_ITER_WITH_UPDATES
1963                         ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
1964                         : NULL;
1965                 if (next_update &&
1966                     bpos_le(next_update->k.p,
1967                             k.k ? k.k->p : l->b->key.k.p)) {
1968                         iter->k = next_update->k;
1969                         k = bkey_i_to_s_c(next_update);
1970                 }
1971
1972                 if (k.k && bkey_deleted(k.k)) {
1973                         /*
1974                          * If we've got a whiteout, and it's after the search
1975                          * key, advance the search key to the whiteout instead
1976                          * of just after the whiteout - it might be a btree
1977                          * whiteout, with a real key at the same position, since
1978                          * in the btree deleted keys sort before non deleted.
1979                          */
1980                         search_key = !bpos_eq(search_key, k.k->p)
1981                                 ? k.k->p
1982                                 : bpos_successor(k.k->p);
1983                         continue;
1984                 }
1985
1986                 if (likely(k.k)) {
1987                         break;
1988                 } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
1989                         /* Advance to next leaf node: */
1990                         search_key = bpos_successor(l->b->key.k.p);
1991                 } else {
1992                         /* End of btree: */
1993                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1994                         k = bkey_s_c_null;
1995                         goto out;
1996                 }
1997         }
1998 out:
1999         bch2_btree_iter_verify(iter);
2000
2001         return k;
2002 }
2003
2004 /**
2005  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
2006  * current position
2007  */
2008 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
2009 {
2010         struct btree_trans *trans = iter->trans;
2011         struct bpos search_key = btree_iter_search_key(iter);
2012         struct bkey_s_c k;
2013         struct bpos iter_pos;
2014         int ret;
2015
2016         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2017         EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
2018
2019         if (iter->update_path) {
2020                 bch2_path_put_nokeep(trans, iter->update_path,
2021                                      iter->flags & BTREE_ITER_INTENT);
2022                 iter->update_path = NULL;
2023         }
2024
2025         bch2_btree_iter_verify_entry_exit(iter);
2026
2027         while (1) {
2028                 k = __bch2_btree_iter_peek(iter, search_key);
2029                 if (unlikely(!k.k))
2030                         goto end;
2031                 if (unlikely(bkey_err(k)))
2032                         goto out_no_locked;
2033
2034                 /*
2035                  * iter->pos should be mononotically increasing, and always be
2036                  * equal to the key we just returned - except extents can
2037                  * straddle iter->pos:
2038                  */
2039                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
2040                         iter_pos = k.k->p;
2041                 else
2042                         iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2043
2044                 if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
2045                              ? bkey_gt(iter_pos, end)
2046                              : bkey_ge(iter_pos, end)))
2047                         goto end;
2048
2049                 if (iter->update_path &&
2050                     !bkey_eq(iter->update_path->pos, k.k->p)) {
2051                         bch2_path_put_nokeep(trans, iter->update_path,
2052                                              iter->flags & BTREE_ITER_INTENT);
2053                         iter->update_path = NULL;
2054                 }
2055
2056                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2057                     (iter->flags & BTREE_ITER_INTENT) &&
2058                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2059                     !iter->update_path) {
2060                         struct bpos pos = k.k->p;
2061
2062                         if (pos.snapshot < iter->snapshot) {
2063                                 search_key = bpos_successor(k.k->p);
2064                                 continue;
2065                         }
2066
2067                         pos.snapshot = iter->snapshot;
2068
2069                         /*
2070                          * advance, same as on exit for iter->path, but only up
2071                          * to snapshot
2072                          */
2073                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2074                         iter->update_path = iter->path;
2075
2076                         iter->update_path = bch2_btree_path_set_pos(trans,
2077                                                 iter->update_path, pos,
2078                                                 iter->flags & BTREE_ITER_INTENT,
2079                                                 _THIS_IP_);
2080                         ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
2081                         if (unlikely(ret)) {
2082                                 k = bkey_s_c_err(ret);
2083                                 goto out_no_locked;
2084                         }
2085                 }
2086
2087                 /*
2088                  * We can never have a key in a leaf node at POS_MAX, so
2089                  * we don't have to check these successor() calls:
2090                  */
2091                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2092                     !bch2_snapshot_is_ancestor(trans->c,
2093                                                iter->snapshot,
2094                                                k.k->p.snapshot)) {
2095                         search_key = bpos_successor(k.k->p);
2096                         continue;
2097                 }
2098
2099                 if (bkey_whiteout(k.k) &&
2100                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2101                         search_key = bkey_successor(iter, k.k->p);
2102                         continue;
2103                 }
2104
2105                 break;
2106         }
2107
2108         iter->pos = iter_pos;
2109
2110         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2111                                 iter->flags & BTREE_ITER_INTENT,
2112                                 btree_iter_ip_allocated(iter));
2113
2114         btree_path_set_should_be_locked(iter->path);
2115 out_no_locked:
2116         if (iter->update_path) {
2117                 if (iter->update_path->uptodate &&
2118                     (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
2119                         k = bkey_s_c_err(ret);
2120                 else
2121                         btree_path_set_should_be_locked(iter->update_path);
2122         }
2123
2124         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2125                 iter->pos.snapshot = iter->snapshot;
2126
2127         ret = bch2_btree_iter_verify_ret(iter, k);
2128         if (unlikely(ret)) {
2129                 bch2_btree_iter_set_pos(iter, iter->pos);
2130                 k = bkey_s_c_err(ret);
2131         }
2132
2133         bch2_btree_iter_verify_entry_exit(iter);
2134
2135         return k;
2136 end:
2137         bch2_btree_iter_set_pos(iter, end);
2138         k = bkey_s_c_null;
2139         goto out_no_locked;
2140 }
2141
2142 /**
2143  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2144  * to iterator's current position, returning keys from every level of the btree.
2145  * For keys at different levels of the btree that compare equal, the key from
2146  * the lower level (leaf) is returned first.
2147  */
2148 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2149 {
2150         struct btree_trans *trans = iter->trans;
2151         struct bkey_s_c k;
2152         int ret;
2153
2154         EBUG_ON(iter->path->cached);
2155         bch2_btree_iter_verify(iter);
2156         BUG_ON(iter->path->level < iter->min_depth);
2157         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2158         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2159
2160         while (1) {
2161                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2162                                         iter->flags & BTREE_ITER_INTENT,
2163                                         btree_iter_ip_allocated(iter));
2164
2165                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2166                 if (unlikely(ret)) {
2167                         /* ensure that iter->k is consistent with iter->pos: */
2168                         bch2_btree_iter_set_pos(iter, iter->pos);
2169                         k = bkey_s_c_err(ret);
2170                         goto out_no_locked;
2171                 }
2172
2173                 /* Already at end? */
2174                 if (!btree_path_node(iter->path, iter->path->level)) {
2175                         k = bkey_s_c_null;
2176                         goto out_no_locked;
2177                 }
2178
2179                 k = btree_path_level_peek_all(trans->c,
2180                                 &iter->path->l[iter->path->level], &iter->k);
2181
2182                 /* Check if we should go up to the parent node: */
2183                 if (!k.k ||
2184                     (iter->advanced &&
2185                      bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
2186                         iter->pos = path_l(iter->path)->b->key.k.p;
2187                         btree_path_set_level_up(trans, iter->path);
2188                         iter->advanced = false;
2189                         continue;
2190                 }
2191
2192                 /*
2193                  * Check if we should go back down to a leaf:
2194                  * If we're not in a leaf node, we only return the current key
2195                  * if it exactly matches iter->pos - otherwise we first have to
2196                  * go back to the leaf:
2197                  */
2198                 if (iter->path->level != iter->min_depth &&
2199                     (iter->advanced ||
2200                      !k.k ||
2201                      !bpos_eq(iter->pos, k.k->p))) {
2202                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2203                         iter->pos = bpos_successor(iter->pos);
2204                         iter->advanced = false;
2205                         continue;
2206                 }
2207
2208                 /* Check if we should go to the next key: */
2209                 if (iter->path->level == iter->min_depth &&
2210                     iter->advanced &&
2211                     k.k &&
2212                     bpos_eq(iter->pos, k.k->p)) {
2213                         iter->pos = bpos_successor(iter->pos);
2214                         iter->advanced = false;
2215                         continue;
2216                 }
2217
2218                 if (iter->advanced &&
2219                     iter->path->level == iter->min_depth &&
2220                     !bpos_eq(k.k->p, iter->pos))
2221                         iter->advanced = false;
2222
2223                 BUG_ON(iter->advanced);
2224                 BUG_ON(!k.k);
2225                 break;
2226         }
2227
2228         iter->pos = k.k->p;
2229         btree_path_set_should_be_locked(iter->path);
2230 out_no_locked:
2231         bch2_btree_iter_verify(iter);
2232
2233         return k;
2234 }
2235
2236 /**
2237  * bch2_btree_iter_next: returns first key greater than iterator's current
2238  * position
2239  */
2240 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2241 {
2242         if (!bch2_btree_iter_advance(iter))
2243                 return bkey_s_c_null;
2244
2245         return bch2_btree_iter_peek(iter);
2246 }
2247
2248 /**
2249  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2250  * iterator's current position
2251  */
2252 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2253 {
2254         struct btree_trans *trans = iter->trans;
2255         struct bpos search_key = iter->pos;
2256         struct btree_path *saved_path = NULL;
2257         struct bkey_s_c k;
2258         struct bkey saved_k;
2259         const struct bch_val *saved_v;
2260         int ret;
2261
2262         EBUG_ON(iter->path->cached || iter->path->level);
2263         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2264
2265         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2266                 return bkey_s_c_err(-EIO);
2267
2268         bch2_btree_iter_verify(iter);
2269         bch2_btree_iter_verify_entry_exit(iter);
2270
2271         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2272                 search_key.snapshot = U32_MAX;
2273
2274         while (1) {
2275                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2276                                                 iter->flags & BTREE_ITER_INTENT,
2277                                                 btree_iter_ip_allocated(iter));
2278
2279                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2280                 if (unlikely(ret)) {
2281                         /* ensure that iter->k is consistent with iter->pos: */
2282                         bch2_btree_iter_set_pos(iter, iter->pos);
2283                         k = bkey_s_c_err(ret);
2284                         goto out_no_locked;
2285                 }
2286
2287                 k = btree_path_level_peek(trans, iter->path,
2288                                           &iter->path->l[0], &iter->k);
2289                 if (!k.k ||
2290                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2291                      ? bpos_ge(bkey_start_pos(k.k), search_key)
2292                      : bpos_gt(k.k->p, search_key)))
2293                         k = btree_path_level_prev(trans, iter->path,
2294                                                   &iter->path->l[0], &iter->k);
2295
2296                 bch2_btree_path_check_sort(trans, iter->path, 0);
2297
2298                 if (likely(k.k)) {
2299                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2300                                 if (k.k->p.snapshot == iter->snapshot)
2301                                         goto got_key;
2302
2303                                 /*
2304                                  * If we have a saved candidate, and we're no
2305                                  * longer at the same _key_ (not pos), return
2306                                  * that candidate
2307                                  */
2308                                 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
2309                                         bch2_path_put_nokeep(trans, iter->path,
2310                                                       iter->flags & BTREE_ITER_INTENT);
2311                                         iter->path = saved_path;
2312                                         saved_path = NULL;
2313                                         iter->k = saved_k;
2314                                         k.v     = saved_v;
2315                                         goto got_key;
2316                                 }
2317
2318                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2319                                                               iter->snapshot,
2320                                                               k.k->p.snapshot)) {
2321                                         if (saved_path)
2322                                                 bch2_path_put_nokeep(trans, saved_path,
2323                                                       iter->flags & BTREE_ITER_INTENT);
2324                                         saved_path = btree_path_clone(trans, iter->path,
2325                                                                 iter->flags & BTREE_ITER_INTENT);
2326                                         saved_k = *k.k;
2327                                         saved_v = k.v;
2328                                 }
2329
2330                                 search_key = bpos_predecessor(k.k->p);
2331                                 continue;
2332                         }
2333 got_key:
2334                         if (bkey_whiteout(k.k) &&
2335                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2336                                 search_key = bkey_predecessor(iter, k.k->p);
2337                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2338                                         search_key.snapshot = U32_MAX;
2339                                 continue;
2340                         }
2341
2342                         break;
2343                 } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
2344                         /* Advance to previous leaf node: */
2345                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2346                 } else {
2347                         /* Start of btree: */
2348                         bch2_btree_iter_set_pos(iter, POS_MIN);
2349                         k = bkey_s_c_null;
2350                         goto out_no_locked;
2351                 }
2352         }
2353
2354         EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
2355
2356         /* Extents can straddle iter->pos: */
2357         if (bkey_lt(k.k->p, iter->pos))
2358                 iter->pos = k.k->p;
2359
2360         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2361                 iter->pos.snapshot = iter->snapshot;
2362
2363         btree_path_set_should_be_locked(iter->path);
2364 out_no_locked:
2365         if (saved_path)
2366                 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2367
2368         bch2_btree_iter_verify_entry_exit(iter);
2369         bch2_btree_iter_verify(iter);
2370
2371         return k;
2372 }
2373
2374 /**
2375  * bch2_btree_iter_prev: returns first key less than iterator's current
2376  * position
2377  */
2378 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2379 {
2380         if (!bch2_btree_iter_rewind(iter))
2381                 return bkey_s_c_null;
2382
2383         return bch2_btree_iter_peek_prev(iter);
2384 }
2385
2386 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2387 {
2388         struct btree_trans *trans = iter->trans;
2389         struct bpos search_key;
2390         struct bkey_s_c k;
2391         int ret;
2392
2393         bch2_btree_iter_verify(iter);
2394         bch2_btree_iter_verify_entry_exit(iter);
2395         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2396         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2397
2398         /* extents can't span inode numbers: */
2399         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2400             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2401                 if (iter->pos.inode == KEY_INODE_MAX)
2402                         return bkey_s_c_null;
2403
2404                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2405         }
2406
2407         search_key = btree_iter_search_key(iter);
2408         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2409                                         iter->flags & BTREE_ITER_INTENT,
2410                                         btree_iter_ip_allocated(iter));
2411
2412         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2413         if (unlikely(ret)) {
2414                 k = bkey_s_c_err(ret);
2415                 goto out_no_locked;
2416         }
2417
2418         if ((iter->flags & BTREE_ITER_CACHED) ||
2419             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2420                 struct bkey_i *next_update;
2421
2422                 if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
2423                     (next_update = btree_trans_peek_updates(trans,
2424                                                 iter->btree_id, search_key)) &&
2425                     bpos_eq(next_update->k.p, iter->pos)) {
2426                         iter->k = next_update->k;
2427                         k = bkey_i_to_s_c(next_update);
2428                         goto out;
2429                 }
2430
2431                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2432                     (next_update = bch2_btree_journal_peek_slot(trans,
2433                                         iter, iter->pos))) {
2434                         iter->k = next_update->k;
2435                         k = bkey_i_to_s_c(next_update);
2436                         goto out;
2437                 }
2438
2439                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2440                     (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
2441                         if (!bkey_err(k))
2442                                 iter->k = *k.k;
2443                         /* We're not returning a key from iter->path: */
2444                         goto out_no_locked;
2445                 }
2446
2447                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2448                 if (unlikely(!k.k))
2449                         goto out_no_locked;
2450         } else {
2451                 struct bpos next;
2452                 struct bpos end = iter->pos;
2453
2454                 if (iter->flags & BTREE_ITER_IS_EXTENTS)
2455                         end.offset = U64_MAX;
2456
2457                 EBUG_ON(iter->path->level);
2458
2459                 if (iter->flags & BTREE_ITER_INTENT) {
2460                         struct btree_iter iter2;
2461
2462                         bch2_trans_copy_iter(&iter2, iter);
2463                         k = bch2_btree_iter_peek_upto(&iter2, end);
2464
2465                         if (k.k && !bkey_err(k)) {
2466                                 iter->k = iter2.k;
2467                                 k.k = &iter->k;
2468                         }
2469                         bch2_trans_iter_exit(trans, &iter2);
2470                 } else {
2471                         struct bpos pos = iter->pos;
2472
2473                         k = bch2_btree_iter_peek_upto(iter, end);
2474                         if (unlikely(bkey_err(k)))
2475                                 bch2_btree_iter_set_pos(iter, pos);
2476                         else
2477                                 iter->pos = pos;
2478                 }
2479
2480                 if (unlikely(bkey_err(k)))
2481                         goto out_no_locked;
2482
2483                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2484
2485                 if (bkey_lt(iter->pos, next)) {
2486                         bkey_init(&iter->k);
2487                         iter->k.p = iter->pos;
2488
2489                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2490                                 bch2_key_resize(&iter->k,
2491                                                 min_t(u64, KEY_SIZE_MAX,
2492                                                       (next.inode == iter->pos.inode
2493                                                        ? next.offset
2494                                                        : KEY_OFFSET_MAX) -
2495                                                       iter->pos.offset));
2496                                 EBUG_ON(!iter->k.size);
2497                         }
2498
2499                         k = (struct bkey_s_c) { &iter->k, NULL };
2500                 }
2501         }
2502 out:
2503         btree_path_set_should_be_locked(iter->path);
2504 out_no_locked:
2505         bch2_btree_iter_verify_entry_exit(iter);
2506         bch2_btree_iter_verify(iter);
2507         ret = bch2_btree_iter_verify_ret(iter, k);
2508         if (unlikely(ret))
2509                 return bkey_s_c_err(ret);
2510
2511         return k;
2512 }
2513
2514 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2515 {
2516         if (!bch2_btree_iter_advance(iter))
2517                 return bkey_s_c_null;
2518
2519         return bch2_btree_iter_peek_slot(iter);
2520 }
2521
2522 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2523 {
2524         if (!bch2_btree_iter_rewind(iter))
2525                 return bkey_s_c_null;
2526
2527         return bch2_btree_iter_peek_slot(iter);
2528 }
2529
2530 /* new transactional stuff: */
2531
2532 static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
2533                                                 struct btree_path *path)
2534 {
2535         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2536         EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2537         EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
2538 }
2539
2540 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2541 {
2542 #ifdef CONFIG_BCACHEFS_DEBUG
2543         unsigned i;
2544
2545         for (i = 0; i < trans->nr_sorted; i++)
2546                 btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
2547 #endif
2548 }
2549
2550 static void btree_trans_verify_sorted(struct btree_trans *trans)
2551 {
2552 #ifdef CONFIG_BCACHEFS_DEBUG
2553         struct btree_path *path, *prev = NULL;
2554         unsigned i;
2555
2556         if (!bch2_debug_check_iterators)
2557                 return;
2558
2559         trans_for_each_path_inorder(trans, path, i) {
2560                 if (prev && btree_path_cmp(prev, path) > 0) {
2561                         bch2_dump_trans_paths_updates(trans);
2562                         panic("trans paths out of order!\n");
2563                 }
2564                 prev = path;
2565         }
2566 #endif
2567 }
2568
2569 static inline void btree_path_swap(struct btree_trans *trans,
2570                                    struct btree_path *l, struct btree_path *r)
2571 {
2572         swap(l->sorted_idx, r->sorted_idx);
2573         swap(trans->sorted[l->sorted_idx],
2574              trans->sorted[r->sorted_idx]);
2575
2576         btree_path_verify_sorted_ref(trans, l);
2577         btree_path_verify_sorted_ref(trans, r);
2578 }
2579
2580 static inline struct btree_path *sib_btree_path(struct btree_trans *trans,
2581                                                 struct btree_path *path, int sib)
2582 {
2583         unsigned idx = (unsigned) path->sorted_idx + sib;
2584
2585         EBUG_ON(sib != -1 && sib != 1);
2586
2587         return idx < trans->nr_sorted
2588                 ? trans->paths + trans->sorted[idx]
2589                 : NULL;
2590 }
2591
2592 static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
2593                                                    struct btree_path *path,
2594                                                    int cmp)
2595 {
2596         struct btree_path *n;
2597         int cmp2;
2598
2599         EBUG_ON(!cmp);
2600
2601         while ((n = sib_btree_path(trans, path, cmp)) &&
2602                (cmp2 = btree_path_cmp(n, path)) &&
2603                cmp2 != cmp)
2604                 btree_path_swap(trans, n, path);
2605
2606         btree_trans_verify_sorted(trans);
2607 }
2608
2609 inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
2610                                        int cmp)
2611 {
2612         struct btree_path *n;
2613
2614         if (cmp <= 0) {
2615                 n = prev_btree_path(trans, path);
2616                 if (n && btree_path_cmp(n, path) > 0) {
2617                         do {
2618                                 btree_path_swap(trans, n, path);
2619                                 n = prev_btree_path(trans, path);
2620                         } while (n && btree_path_cmp(n, path) > 0);
2621
2622                         goto out;
2623                 }
2624         }
2625
2626         if (cmp >= 0) {
2627                 n = next_btree_path(trans, path);
2628                 if (n && btree_path_cmp(path, n) > 0) {
2629                         do {
2630                                 btree_path_swap(trans, path, n);
2631                                 n = next_btree_path(trans, path);
2632                         } while (n && btree_path_cmp(path, n) > 0);
2633                 }
2634         }
2635 out:
2636         btree_trans_verify_sorted(trans);
2637 }
2638
2639 static inline void btree_path_list_remove(struct btree_trans *trans,
2640                                           struct btree_path *path)
2641 {
2642         unsigned i;
2643
2644         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2645
2646         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2647
2648         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2649                 trans->paths[trans->sorted[i]].sorted_idx = i;
2650
2651         path->sorted_idx = U8_MAX;
2652
2653         btree_trans_verify_sorted_refs(trans);
2654 }
2655
2656 static inline void btree_path_list_add(struct btree_trans *trans,
2657                                        struct btree_path *pos,
2658                                        struct btree_path *path)
2659 {
2660         unsigned i;
2661
2662         btree_trans_verify_sorted_refs(trans);
2663
2664         path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2665
2666         if (unlikely(trans->in_traverse_all) &&
2667             trans->traverse_all_idx != U8_MAX &&
2668             trans->traverse_all_idx >= path->sorted_idx)
2669                 trans->traverse_all_idx++;
2670
2671         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2672
2673         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2674                 trans->paths[trans->sorted[i]].sorted_idx = i;
2675
2676         btree_trans_verify_sorted_refs(trans);
2677 }
2678
2679 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2680 {
2681         if (iter->path)
2682                 bch2_path_put(trans, iter->path,
2683                               iter->flags & BTREE_ITER_INTENT);
2684         if (iter->update_path)
2685                 bch2_path_put_nokeep(trans, iter->update_path,
2686                               iter->flags & BTREE_ITER_INTENT);
2687         if (iter->key_cache_path)
2688                 bch2_path_put(trans, iter->key_cache_path,
2689                               iter->flags & BTREE_ITER_INTENT);
2690         iter->path = NULL;
2691         iter->update_path = NULL;
2692         iter->key_cache_path = NULL;
2693 }
2694
2695 static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans,
2696                           struct btree_iter *iter,
2697                           unsigned btree_id, struct bpos pos,
2698                           unsigned flags)
2699 {
2700         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2701                                bch2_btree_iter_flags(trans, btree_id, flags),
2702                                _RET_IP_);
2703 }
2704
2705 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
2706                           struct btree_iter *iter,
2707                           unsigned btree_id, struct bpos pos,
2708                           unsigned flags)
2709 {
2710         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2711                                bch2_btree_iter_flags(trans, btree_id, flags),
2712                                _RET_IP_);
2713 }
2714
2715 void bch2_trans_node_iter_init(struct btree_trans *trans,
2716                                struct btree_iter *iter,
2717                                enum btree_id btree_id,
2718                                struct bpos pos,
2719                                unsigned locks_want,
2720                                unsigned depth,
2721                                unsigned flags)
2722 {
2723        flags |= BTREE_ITER_NOT_EXTENTS;
2724        flags |= __BTREE_ITER_ALL_SNAPSHOTS;
2725        flags |= BTREE_ITER_ALL_SNAPSHOTS;
2726
2727         bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
2728                                __bch2_btree_iter_flags(trans, btree_id, flags),
2729                                _RET_IP_);
2730
2731         iter->min_depth = depth;
2732
2733         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2734         BUG_ON(iter->path->level        != depth);
2735         BUG_ON(iter->min_depth          != depth);
2736 }
2737
2738 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2739 {
2740         *dst = *src;
2741         if (src->path)
2742                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2743         if (src->update_path)
2744                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2745         dst->key_cache_path = NULL;
2746 }
2747
2748 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2749 {
2750         unsigned new_top = trans->mem_top + size;
2751         size_t old_bytes = trans->mem_bytes;
2752         size_t new_bytes = roundup_pow_of_two(new_top);
2753         void *new_mem;
2754         void *p;
2755
2756         trans->mem_max = max(trans->mem_max, new_top);
2757
2758         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2759
2760         new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2761         if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2762                 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2763                 new_bytes = BTREE_TRANS_MEM_MAX;
2764                 kfree(trans->mem);
2765         }
2766
2767         if (!new_mem)
2768                 return ERR_PTR(-ENOMEM);
2769
2770         trans->mem = new_mem;
2771         trans->mem_bytes = new_bytes;
2772
2773         if (old_bytes) {
2774                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2775                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2776         }
2777
2778         p = trans->mem + trans->mem_top;
2779         trans->mem_top += size;
2780         memset(p, 0, size);
2781         return p;
2782 }
2783
2784 /**
2785  * bch2_trans_begin() - reset a transaction after a interrupted attempt
2786  * @trans: transaction to reset
2787  *
2788  * While iterating over nodes or updating nodes a attempt to lock a btree node
2789  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2790  * occurs bch2_trans_begin() should be called and the transaction retried.
2791  */
2792 u32 bch2_trans_begin(struct btree_trans *trans)
2793 {
2794         struct btree_path *path;
2795
2796         bch2_trans_reset_updates(trans);
2797
2798         trans->restart_count++;
2799         trans->mem_top                  = 0;
2800
2801         if (trans->fs_usage_deltas) {
2802                 trans->fs_usage_deltas->used = 0;
2803                 memset((void *) trans->fs_usage_deltas +
2804                        offsetof(struct replicas_delta_list, memset_start), 0,
2805                        (void *) &trans->fs_usage_deltas->memset_end -
2806                        (void *) &trans->fs_usage_deltas->memset_start);
2807         }
2808
2809         trans_for_each_path(trans, path) {
2810                 path->should_be_locked = false;
2811
2812                 /*
2813                  * If the transaction wasn't restarted, we're presuming to be
2814                  * doing something new: dont keep iterators excpt the ones that
2815                  * are in use - except for the subvolumes btree:
2816                  */
2817                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2818                         path->preserve = false;
2819
2820                 /*
2821                  * XXX: we probably shouldn't be doing this if the transaction
2822                  * was restarted, but currently we still overflow transaction
2823                  * iterators if we do that
2824                  */
2825                 if (!path->ref && !path->preserve)
2826                         __bch2_path_free(trans, path);
2827                 else
2828                         path->preserve = false;
2829         }
2830
2831         if (!trans->restarted &&
2832             (need_resched() ||
2833              local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2834                 bch2_trans_unlock(trans);
2835                 cond_resched();
2836                 bch2_trans_relock(trans);
2837         }
2838
2839         trans->last_restarted_ip = _RET_IP_;
2840         if (trans->restarted)
2841                 bch2_btree_path_traverse_all(trans);
2842
2843         trans->last_begin_time = local_clock();
2844         return trans->restart_count;
2845 }
2846
2847 void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
2848 {
2849         if (trans_was_restarted(trans, restart_count))
2850                 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
2851                       trans->restart_count, restart_count,
2852                       (void *) trans->last_restarted_ip);
2853 }
2854
2855 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2856 {
2857         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2858         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2859         void *p = NULL;
2860
2861         BUG_ON(trans->used_mempool);
2862
2863 #ifdef __KERNEL__
2864         p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
2865 #endif
2866         if (!p)
2867                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2868
2869         trans->paths            = p; p += paths_bytes;
2870         trans->updates          = p; p += updates_bytes;
2871 }
2872
2873 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
2874
2875 unsigned bch2_trans_get_fn_idx(const char *fn)
2876 {
2877         unsigned i;
2878
2879         for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
2880                 if (!bch2_btree_transaction_fns[i] ||
2881                     bch2_btree_transaction_fns[i] == fn) {
2882                         bch2_btree_transaction_fns[i] = fn;
2883                         return i;
2884                 }
2885
2886         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2887         return i;
2888 }
2889
2890 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
2891         __acquires(&c->btree_trans_barrier)
2892 {
2893         struct btree_transaction_stats *s;
2894         struct btree_trans *pos;
2895
2896         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
2897
2898         memset(trans, 0, sizeof(*trans));
2899         trans->c                = c;
2900         trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
2901                 ? bch2_btree_transaction_fns[fn_idx] : NULL;
2902         trans->last_begin_time  = local_clock();
2903         trans->fn_idx           = fn_idx;
2904         trans->locking_wait.task = current;
2905         trans->journal_replay_not_finished =
2906                 !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
2907         closure_init_stack(&trans->ref);
2908
2909         bch2_trans_alloc_paths(trans, c);
2910
2911         s = btree_trans_stats(trans);
2912         if (s && s->max_mem) {
2913                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2914
2915                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2916
2917                 if (!unlikely(trans->mem)) {
2918                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2919                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2920                 } else {
2921                         trans->mem_bytes = expected_mem_bytes;
2922                 }
2923         }
2924         if (s)
2925                 trans->nr_max_paths = s->nr_max_paths;
2926
2927         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2928
2929         mutex_lock(&c->btree_trans_lock);
2930         list_for_each_entry(pos, &c->btree_trans_list, list) {
2931                 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
2932                         list_add_tail(&trans->list, &pos->list);
2933                         goto list_add_done;
2934                 }
2935         }
2936         list_add_tail(&trans->list, &c->btree_trans_list);
2937 list_add_done:
2938         mutex_unlock(&c->btree_trans_lock);
2939 }
2940
2941 static void check_btree_paths_leaked(struct btree_trans *trans)
2942 {
2943 #ifdef CONFIG_BCACHEFS_DEBUG
2944         struct bch_fs *c = trans->c;
2945         struct btree_path *path;
2946
2947         trans_for_each_path(trans, path)
2948                 if (path->ref)
2949                         goto leaked;
2950         return;
2951 leaked:
2952         bch_err(c, "btree paths leaked from %s!", trans->fn);
2953         trans_for_each_path(trans, path)
2954                 if (path->ref)
2955                         printk(KERN_ERR "  btree %s %pS\n",
2956                                bch2_btree_ids[path->btree_id],
2957                                (void *) path->ip_allocated);
2958         /* Be noisy about this: */
2959         bch2_fatal_error(c);
2960 #endif
2961 }
2962
2963 void bch2_trans_exit(struct btree_trans *trans)
2964         __releases(&c->btree_trans_barrier)
2965 {
2966         struct btree_insert_entry *i;
2967         struct bch_fs *c = trans->c;
2968         struct btree_transaction_stats *s = btree_trans_stats(trans);
2969
2970         bch2_trans_unlock(trans);
2971
2972         closure_sync(&trans->ref);
2973
2974         if (s)
2975                 s->max_mem = max(s->max_mem, trans->mem_max);
2976
2977         trans_for_each_update(trans, i)
2978                 __btree_path_put(i->path, true);
2979         trans->nr_updates               = 0;
2980
2981         check_btree_paths_leaked(trans);
2982
2983         mutex_lock(&c->btree_trans_lock);
2984         list_del(&trans->list);
2985         mutex_unlock(&c->btree_trans_lock);
2986
2987         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2988
2989         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
2990
2991         kfree(trans->extra_journal_entries.data);
2992
2993         if (trans->fs_usage_deltas) {
2994                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2995                     REPLICAS_DELTA_LIST_MAX)
2996                         mempool_free(trans->fs_usage_deltas,
2997                                      &c->replicas_delta_pool);
2998                 else
2999                         kfree(trans->fs_usage_deltas);
3000         }
3001
3002         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
3003                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
3004         else
3005                 kfree(trans->mem);
3006
3007 #ifdef __KERNEL__
3008         /*
3009          * Userspace doesn't have a real percpu implementation:
3010          */
3011         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
3012 #endif
3013
3014         if (trans->paths)
3015                 mempool_free(trans->paths, &c->btree_paths_pool);
3016
3017         trans->mem      = (void *) 0x1;
3018         trans->paths    = (void *) 0x1;
3019 }
3020
3021 static void __maybe_unused
3022 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
3023                                       struct btree_bkey_cached_common *b)
3024 {
3025         struct six_lock_count c = six_lock_counts(&b->lock);
3026         struct task_struct *owner;
3027         pid_t pid;
3028
3029         rcu_read_lock();
3030         owner = READ_ONCE(b->lock.owner);
3031         pid = owner ? owner->pid : 0;
3032         rcu_read_unlock();
3033
3034         prt_tab(out);
3035         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
3036                    b->level, bch2_btree_ids[b->btree_id]);
3037         bch2_bpos_to_text(out, btree_node_pos(b));
3038
3039         prt_tab(out);
3040         prt_printf(out, " locks %u:%u:%u held by pid %u",
3041                    c.n[0], c.n[1], c.n[2], pid);
3042 }
3043
3044 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3045 {
3046         struct btree_path *path;
3047         struct btree_bkey_cached_common *b;
3048         static char lock_types[] = { 'r', 'i', 'w' };
3049         unsigned l;
3050
3051         if (!out->nr_tabstops) {
3052                 printbuf_tabstop_push(out, 16);
3053                 printbuf_tabstop_push(out, 32);
3054         }
3055
3056         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3057
3058         trans_for_each_path(trans, path) {
3059                 if (!path->nodes_locked)
3060                         continue;
3061
3062                 prt_printf(out, "  path %u %c l=%u %s:",
3063                        path->idx,
3064                        path->cached ? 'c' : 'b',
3065                        path->level,
3066                        bch2_btree_ids[path->btree_id]);
3067                 bch2_bpos_to_text(out, path->pos);
3068                 prt_newline(out);
3069
3070                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3071                         if (btree_node_locked(path, l) &&
3072                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3073                                 prt_printf(out, "    %c l=%u ",
3074                                            lock_types[btree_node_locked_type(path, l)], l);
3075                                 bch2_btree_bkey_cached_common_to_text(out, b);
3076                                 prt_newline(out);
3077                         }
3078                 }
3079         }
3080
3081         b = READ_ONCE(trans->locking);
3082         if (b) {
3083                 prt_str(out, "  want");
3084                 prt_newline(out);
3085                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3086                 bch2_btree_bkey_cached_common_to_text(out, b);
3087                 prt_newline(out);
3088         }
3089 }
3090
3091 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3092 {
3093         struct btree_transaction_stats *s;
3094
3095         for (s = c->btree_transaction_stats;
3096              s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3097              s++)
3098                 kfree(s->max_paths_text);
3099
3100         if (c->btree_trans_barrier_initialized)
3101                 cleanup_srcu_struct(&c->btree_trans_barrier);
3102         mempool_exit(&c->btree_trans_mem_pool);
3103         mempool_exit(&c->btree_paths_pool);
3104 }
3105
3106 int bch2_fs_btree_iter_init(struct bch_fs *c)
3107 {
3108         unsigned i, nr = BTREE_ITER_MAX;
3109         int ret;
3110
3111         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
3112                 mutex_init(&c->btree_transaction_stats[i].lock);
3113
3114         INIT_LIST_HEAD(&c->btree_trans_list);
3115         mutex_init(&c->btree_trans_lock);
3116
3117         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3118                         sizeof(struct btree_path) * nr +
3119                         sizeof(struct btree_insert_entry) * nr) ?:
3120                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3121                                           BTREE_TRANS_MEM_MAX) ?:
3122                 init_srcu_struct(&c->btree_trans_barrier);
3123         if (!ret)
3124                 c->btree_trans_barrier_initialized = true;
3125         return ret;
3126 }