1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18
19 #include <linux/prandom.h>
20 #include <linux/prefetch.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_trans_verify_sorted(struct btree_trans *);
24 inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
25 static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *,
26                                                    struct btree_path *, int);
27
28 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
29 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
30                                        struct btree_path *);
31
32 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
33 {
34 #ifdef CONFIG_BCACHEFS_DEBUG
35         return iter->ip_allocated;
36 #else
37         return 0;
38 #endif
39 }
40
41 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
42
43 /*
44  * Unlocks before scheduling
45  * Note: does not revalidate iterator
46  */
47 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
48 {
49         if (need_resched() || race_fault()) {
50                 bch2_trans_unlock(trans);
51                 schedule();
52                 return bch2_trans_relock(trans);
53         } else {
54                 return 0;
55         }
56 }
57
58 static inline int __btree_path_cmp(const struct btree_path *l,
59                                    enum btree_id        r_btree_id,
60                                    bool                 r_cached,
61                                    struct bpos          r_pos,
62                                    unsigned             r_level)
63 {
64         /*
65          * Must match lock ordering as defined by __bch2_btree_node_lock:
66          */
67         return   cmp_int(l->btree_id,   r_btree_id) ?:
68                  cmp_int((int) l->cached,       (int) r_cached) ?:
69                  bpos_cmp(l->pos,       r_pos) ?:
70                 -cmp_int(l->level,      r_level);
71 }
72
73 static inline int btree_path_cmp(const struct btree_path *l,
74                                  const struct btree_path *r)
75 {
76         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
77 }
78
79 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
80 {
81         /* Are we iterating over keys in all snapshots? */
82         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
83                 p = bpos_successor(p);
84         } else {
85                 p = bpos_nosnap_successor(p);
86                 p.snapshot = iter->snapshot;
87         }
88
89         return p;
90 }
91
92 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
93 {
94         /* Are we iterating over keys in all snapshots? */
95         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
96                 p = bpos_predecessor(p);
97         } else {
98                 p = bpos_nosnap_predecessor(p);
99                 p.snapshot = iter->snapshot;
100         }
101
102         return p;
103 }
104
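/*
 * Extents iterators index keys by the end of the extent, so to find the extent
 * overlapping iter->pos we search from the successor of pos (unless pos is
 * already POS_MAX):
 */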
105 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
106 {
107         struct bpos pos = iter->pos;
108
109         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
110             !bkey_eq(pos, POS_MAX))
111                 pos = bkey_successor(iter, pos);
112         return pos;
113 }
114
115 static inline bool btree_path_pos_before_node(struct btree_path *path,
116                                               struct btree *b)
117 {
118         return bpos_lt(path->pos, b->data->min_key);
119 }
120
121 static inline bool btree_path_pos_after_node(struct btree_path *path,
122                                              struct btree *b)
123 {
124         return bpos_gt(path->pos, b->key.k.p);
125 }
126
127 static inline bool btree_path_pos_in_node(struct btree_path *path,
128                                           struct btree *b)
129 {
130         return path->btree_id == b->c.btree_id &&
131                 !btree_path_pos_before_node(path, b) &&
132                 !btree_path_pos_after_node(path, b);
133 }
134
135 /* Btree iterator: */
136
137 #ifdef CONFIG_BCACHEFS_DEBUG
138
139 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
140                                           struct btree_path *path)
141 {
142         struct bkey_cached *ck;
143         bool locked = btree_node_locked(path, 0);
144
145         if (!bch2_btree_node_relock(trans, path, 0))
146                 return;
147
148         ck = (void *) path->l[0].b;
149         BUG_ON(ck->key.btree_id != path->btree_id ||
150                !bkey_eq(ck->key.pos, path->pos));
151
152         if (!locked)
153                 btree_node_unlock(trans, path, 0);
154 }
155
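/*
 * Debug check: the node iterator at @level must be positioned correctly
 * relative to path->pos - the previous key compares strictly less than
 * path->pos, and the key the iterator currently points at compares >= it:
 */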
156 static void bch2_btree_path_verify_level(struct btree_trans *trans,
157                                 struct btree_path *path, unsigned level)
158 {
159         struct btree_path_level *l;
160         struct btree_node_iter tmp;
161         bool locked;
162         struct bkey_packed *p, *k;
163         struct printbuf buf1 = PRINTBUF;
164         struct printbuf buf2 = PRINTBUF;
165         struct printbuf buf3 = PRINTBUF;
166         const char *msg;
167
168         if (!bch2_debug_check_iterators)
169                 return;
170
171         l       = &path->l[level];
172         tmp     = l->iter;
173         locked  = btree_node_locked(path, level);
174
175         if (path->cached) {
176                 if (!level)
177                         bch2_btree_path_verify_cached(trans, path);
178                 return;
179         }
180
181         if (!btree_path_node(path, level))
182                 return;
183
184         if (!bch2_btree_node_relock_notrace(trans, path, level))
185                 return;
186
187         BUG_ON(!btree_path_pos_in_node(path, l->b));
188
189         bch2_btree_node_iter_verify(&l->iter, l->b);
190
191         /*
192          * For interior nodes, the iterator will have skipped past deleted keys:
193          */
194         p = level
195                 ? bch2_btree_node_iter_prev(&tmp, l->b)
196                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
197         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
198
199         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
200                 msg = "before";
201                 goto err;
202         }
203
204         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
205                 msg = "after";
206                 goto err;
207         }
208
209         if (!locked)
210                 btree_node_unlock(trans, path, level);
211         return;
212 err:
213         bch2_bpos_to_text(&buf1, path->pos);
214
215         if (p) {
216                 struct bkey uk = bkey_unpack_key(l->b, p);
217
218                 bch2_bkey_to_text(&buf2, &uk);
219         } else {
220                 prt_printf(&buf2, "(none)");
221         }
222
223         if (k) {
224                 struct bkey uk = bkey_unpack_key(l->b, k);
225
226                 bch2_bkey_to_text(&buf3, &uk);
227         } else {
228                 prt_printf(&buf3, "(none)");
229         }
230
231         panic("path should be %s key at level %u:\n"
232               "path pos %s\n"
233               "prev key %s\n"
234               "cur  key %s\n",
235               msg, level, buf1.buf, buf2.buf, buf3.buf);
236 }
237
238 static void bch2_btree_path_verify(struct btree_trans *trans,
239                                    struct btree_path *path)
240 {
241         struct bch_fs *c = trans->c;
242         unsigned i;
243
244         EBUG_ON(path->btree_id >= BTREE_ID_NR);
245
246         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
247                 if (!path->l[i].b) {
248                         BUG_ON(!path->cached &&
249                                c->btree_roots[path->btree_id].b->c.level > i);
250                         break;
251                 }
252
253                 bch2_btree_path_verify_level(trans, path, i);
254         }
255
256         bch2_btree_path_verify_locks(path);
257 }
258
259 void bch2_trans_verify_paths(struct btree_trans *trans)
260 {
261         struct btree_path *path;
262
263         trans_for_each_path(trans, path)
264                 bch2_btree_path_verify(trans, path);
265 }
266
267 static void bch2_btree_iter_verify(struct btree_iter *iter)
268 {
269         struct btree_trans *trans = iter->trans;
270
271         BUG_ON(iter->btree_id >= BTREE_ID_NR);
272
273         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
274
275         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
276                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
277
278         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
279                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
280                !btree_type_has_snapshots(iter->btree_id));
281
282         if (iter->update_path)
283                 bch2_btree_path_verify(trans, iter->update_path);
284         bch2_btree_path_verify(trans, iter->path);
285 }
286
287 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
288 {
289         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
290                !iter->pos.snapshot);
291
292         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
293                iter->pos.snapshot != iter->snapshot);
294
295         BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
296                bkey_gt(iter->pos, iter->k.p));
297 }
298
299 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
300 {
301         struct btree_trans *trans = iter->trans;
302         struct btree_iter copy;
303         struct bkey_s_c prev;
304         int ret = 0;
305
306         if (!bch2_debug_check_iterators)
307                 return 0;
308
309         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
310                 return 0;
311
312         if (bkey_err(k) || !k.k)
313                 return 0;
314
315         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
316                                           iter->snapshot,
317                                           k.k->p.snapshot));
318
319         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
320                              BTREE_ITER_NOPRESERVE|
321                              BTREE_ITER_ALL_SNAPSHOTS);
322         prev = bch2_btree_iter_prev(&copy);
323         if (!prev.k)
324                 goto out;
325
326         ret = bkey_err(prev);
327         if (ret)
328                 goto out;
329
330         if (bkey_eq(prev.k->p, k.k->p) &&
331             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
332                                       prev.k->p.snapshot) > 0) {
333                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
334
335                 bch2_bkey_to_text(&buf1, k.k);
336                 bch2_bkey_to_text(&buf2, prev.k);
337
338                 panic("iter snap %u\n"
339                       "k    %s\n"
340                       "prev %s\n",
341                       iter->snapshot,
342                       buf1.buf, buf2.buf);
343         }
344 out:
345         bch2_trans_iter_exit(trans, &copy);
346         return ret;
347 }
348
349 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
350                             struct bpos pos, bool key_cache)
351 {
352         struct btree_path *path;
353         unsigned idx;
354         struct printbuf buf = PRINTBUF;
355
356         trans_for_each_path_inorder(trans, path, idx) {
357                 int cmp = cmp_int(path->btree_id, id) ?:
358                         cmp_int(path->cached, key_cache);
359
360                 if (cmp > 0)
361                         break;
362                 if (cmp < 0)
363                         continue;
364
365                 if (!btree_node_locked(path, 0) ||
366                     !path->should_be_locked)
367                         continue;
368
369                 if (!key_cache) {
370                         if (bkey_ge(pos, path->l[0].b->data->min_key) &&
371                             bkey_le(pos, path->l[0].b->key.k.p))
372                                 return;
373                 } else {
374                         if (bkey_eq(pos, path->pos))
375                                 return;
376                 }
377         }
378
379         bch2_dump_trans_paths_updates(trans);
380         bch2_bpos_to_text(&buf, pos);
381
382         panic("not locked: %s %s%s\n",
383               bch2_btree_ids[id], buf.buf,
384               key_cache ? " cached" : "");
385 }
386
387 #else
388
389 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
390                                                 struct btree_path *path, unsigned l) {}
391 static inline void bch2_btree_path_verify(struct btree_trans *trans,
392                                           struct btree_path *path) {}
393 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
394 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
395 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
396
397 #endif
398
399 /* Btree path: fixups after btree updates */
400
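/*
 * Repoint the node iterator's set for bset @t at @k and re-sort the iterator;
 * if the iterator doesn't currently include that bset, push it:
 */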
401 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
402                                         struct btree *b,
403                                         struct bset_tree *t,
404                                         struct bkey_packed *k)
405 {
406         struct btree_node_iter_set *set;
407
408         btree_node_iter_for_each(iter, set)
409                 if (set->end == t->end_offset) {
410                         set->k = __btree_node_key_to_offset(b, k);
411                         bch2_btree_node_iter_sort(iter, b);
412                         return;
413                 }
414
415         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
416 }
417
418 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
419                                                struct btree *b,
420                                                struct bkey_packed *where)
421 {
422         struct btree_path_level *l = &path->l[b->c.level];
423
424         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
425                 return;
426
427         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
428                 bch2_btree_node_iter_advance(&l->iter, l->b);
429 }
430
431 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
432                                       struct btree *b,
433                                       struct bkey_packed *where)
434 {
435         struct btree_path *path;
436
437         trans_for_each_path_with_node(trans, b, path) {
438                 __bch2_btree_path_fix_key_modified(path, b, where);
439                 bch2_btree_path_verify_level(trans, path, b->c.level);
440         }
441 }
442
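/*
 * Fix up a node iterator after an update to bset @t: @clobber_u64s u64s at
 * @where were replaced by @new_u64s u64s, so offsets within the bset may have
 * shifted and the iterator may need repositioning:
 */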
443 static void __bch2_btree_node_iter_fix(struct btree_path *path,
444                                        struct btree *b,
445                                        struct btree_node_iter *node_iter,
446                                        struct bset_tree *t,
447                                        struct bkey_packed *where,
448                                        unsigned clobber_u64s,
449                                        unsigned new_u64s)
450 {
451         const struct bkey_packed *end = btree_bkey_last(b, t);
452         struct btree_node_iter_set *set;
453         unsigned offset = __btree_node_key_to_offset(b, where);
454         int shift = new_u64s - clobber_u64s;
455         unsigned old_end = t->end_offset - shift;
456         unsigned orig_iter_pos = node_iter->data[0].k;
457         bool iter_current_key_modified =
458                 orig_iter_pos >= offset &&
459                 orig_iter_pos <= offset + clobber_u64s;
460
461         btree_node_iter_for_each(node_iter, set)
462                 if (set->end == old_end)
463                         goto found;
464
465         /* didn't find the bset in the iterator - might have to re-add it: */
466         if (new_u64s &&
467             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
468                 bch2_btree_node_iter_push(node_iter, b, where, end);
469                 goto fixup_done;
470         } else {
471                 /* Iterator is after key that changed */
472                 return;
473         }
474 found:
475         set->end = t->end_offset;
476
477         /* Iterator hasn't gotten to the key that changed yet: */
478         if (set->k < offset)
479                 return;
480
481         if (new_u64s &&
482             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
483                 set->k = offset;
484         } else if (set->k < offset + clobber_u64s) {
485                 set->k = offset + new_u64s;
486                 if (set->k == set->end)
487                         bch2_btree_node_iter_set_drop(node_iter, set);
488         } else {
489                 /* Iterator is after key that changed */
490                 set->k = (int) set->k + shift;
491                 return;
492         }
493
494         bch2_btree_node_iter_sort(node_iter, b);
495 fixup_done:
496         if (node_iter->data[0].k != orig_iter_pos)
497                 iter_current_key_modified = true;
498
499         /*
500          * When a new key is added, and the node iterator now points to that
501          * key, the iterator might have skipped past deleted keys that should
502          * come after the key the iterator now points to. We have to rewind to
503          * before those deleted keys - otherwise
504          * bch2_btree_node_iter_prev_all() breaks:
505          */
506         if (!bch2_btree_node_iter_end(node_iter) &&
507             iter_current_key_modified &&
508             b->c.level) {
509                 struct bset_tree *t;
510                 struct bkey_packed *k, *k2, *p;
511
512                 k = bch2_btree_node_iter_peek_all(node_iter, b);
513
514                 for_each_bset(b, t) {
515                         bool set_pos = false;
516
517                         if (node_iter->data[0].end == t->end_offset)
518                                 continue;
519
520                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
521
522                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
523                                bkey_iter_cmp(b, k, p) < 0) {
524                                 k2 = p;
525                                 set_pos = true;
526                         }
527
528                         if (set_pos)
529                                 btree_node_iter_set_set_pos(node_iter,
530                                                             b, t, k2);
531                 }
532         }
533 }
534
535 void bch2_btree_node_iter_fix(struct btree_trans *trans,
536                               struct btree_path *path,
537                               struct btree *b,
538                               struct btree_node_iter *node_iter,
539                               struct bkey_packed *where,
540                               unsigned clobber_u64s,
541                               unsigned new_u64s)
542 {
543         struct bset_tree *t = bch2_bkey_to_bset(b, where);
544         struct btree_path *linked;
545
546         if (node_iter != &path->l[b->c.level].iter) {
547                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
548                                            where, clobber_u64s, new_u64s);
549
550                 if (bch2_debug_check_iterators)
551                         bch2_btree_node_iter_verify(node_iter, b);
552         }
553
554         trans_for_each_path_with_node(trans, b, linked) {
555                 __bch2_btree_node_iter_fix(linked, b,
556                                            &linked->l[b->c.level].iter, t,
557                                            where, clobber_u64s, new_u64s);
558                 bch2_btree_path_verify_level(trans, linked, b->c.level);
559         }
560 }
561
562 /* Btree path level: pointer to a particular btree node and node iter */
563
564 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
565                                                   struct btree_path_level *l,
566                                                   struct bkey *u,
567                                                   struct bkey_packed *k)
568 {
569         if (unlikely(!k)) {
570                 /*
571                  * signal to bch2_btree_iter_peek_slot() that we're currently at
572                  * a hole
573                  */
574                 u->type = KEY_TYPE_deleted;
575                 return bkey_s_c_null;
576         }
577
578         return bkey_disassemble(l->b, k, u);
579 }
580
581 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
582                                                         struct btree_path_level *l,
583                                                         struct bkey *u)
584 {
585         return __btree_iter_unpack(c, l, u,
586                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
587 }
588
589 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
590                                                     struct btree_path *path,
591                                                     struct btree_path_level *l,
592                                                     struct bkey *u)
593 {
594         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
595                         bch2_btree_node_iter_peek(&l->iter, l->b));
596
597         path->pos = k.k ? k.k->p : l->b->key.k.p;
598         bch2_btree_path_verify_level(trans, path, l - path->l);
599         return k;
600 }
601
602 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
603                                                     struct btree_path *path,
604                                                     struct btree_path_level *l,
605                                                     struct bkey *u)
606 {
607         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
608                         bch2_btree_node_iter_prev(&l->iter, l->b));
609
610         path->pos = k.k ? k.k->p : l->b->data->min_key;
611         bch2_btree_path_verify_level(trans, path, l - path->l);
612         return k;
613 }
614
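/*
 * Advance the node iterator until it points at a key >= path->pos; gives up
 * and returns false after @max_advance steps (when max_advance is positive):
 */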
615 static inline bool btree_path_advance_to_pos(struct btree_path *path,
616                                              struct btree_path_level *l,
617                                              int max_advance)
618 {
619         struct bkey_packed *k;
620         int nr_advanced = 0;
621
622         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
623                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
624                 if (max_advance > 0 && nr_advanced >= max_advance)
625                         return false;
626
627                 bch2_btree_node_iter_advance(&l->iter, l->b);
628                 nr_advanced++;
629         }
630
631         return true;
632 }
633
634 static inline void __btree_path_level_init(struct btree_path *path,
635                                            unsigned level)
636 {
637         struct btree_path_level *l = &path->l[level];
638
639         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
640
641         /*
642          * Iterators to interior nodes should always be pointed at the first
643          * non-whiteout:
644          */
645         if (level)
646                 bch2_btree_node_iter_peek(&l->iter, l->b);
647 }
648
649 void bch2_btree_path_level_init(struct btree_trans *trans,
650                                 struct btree_path *path,
651                                 struct btree *b)
652 {
653         BUG_ON(path->cached);
654
655         EBUG_ON(!btree_path_pos_in_node(path, b));
656         EBUG_ON(b->c.lock.state.seq & 1);
657
658         path->l[b->c.level].lock_seq = b->c.lock.state.seq;
659         path->l[b->c.level].b = b;
660         __btree_path_level_init(path, b->c.level);
661 }
662
663 /* Btree path: fixups after btree node updates: */
664
665 static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
666 {
667         struct bch_fs *c = trans->c;
668         struct btree_insert_entry *i;
669
670         trans_for_each_update(trans, i)
671                 if (!i->cached &&
672                     i->level    == b->c.level &&
673                     i->btree_id == b->c.btree_id &&
674                     bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
675                     bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
676                         i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
677
678                         if (unlikely(trans->journal_replay_not_finished)) {
679                                 struct bkey_i *j_k =
680                                         bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
681                                                                     i->k->k.p);
682
683                                 if (j_k) {
684                                         i->old_k = j_k->k;
685                                         i->old_v = &j_k->v;
686                                 }
687                         }
688                 }
689 }
690
691 /*
692  * A btree node is being replaced - update the iterator to point to the new
693  * node:
694  */
695 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
696 {
697         struct btree_path *path;
698
699         trans_for_each_path(trans, path)
700                 if (path->uptodate == BTREE_ITER_UPTODATE &&
701                     !path->cached &&
702                     btree_path_pos_in_node(path, b)) {
703                         enum btree_node_locked_type t =
704                                 btree_lock_want(path, b->c.level);
705
706                         if (t != BTREE_NODE_UNLOCKED) {
707                                 btree_node_unlock(trans, path, b->c.level);
708                                 six_lock_increment(&b->c.lock, t);
709                                 mark_btree_node_locked(trans, path, b->c.level, t);
710                         }
711
712                         bch2_btree_path_level_init(trans, path, b);
713                 }
714
715         bch2_trans_revalidate_updates_in_node(trans, b);
716 }
717
718 /*
719  * A btree node has been modified in such a way as to invalidate iterators - fix
720  * them:
721  */
722 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
723 {
724         struct btree_path *path;
725
726         trans_for_each_path_with_node(trans, b, path)
727                 __btree_path_level_init(path, b->c.level);
728
729         bch2_trans_revalidate_updates_in_node(trans, b);
730 }
731
732 /* Btree path: traverse, set_pos: */
733
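/*
 * Lock the root of path's btree, retrying if the root changed while we waited
 * for the lock. Returns 1 if the root is at a lower level than the depth we
 * want (the btree is too shallow), 0 on success, or a transaction restart
 * error:
 */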
734 static inline int btree_path_lock_root(struct btree_trans *trans,
735                                        struct btree_path *path,
736                                        unsigned depth_want,
737                                        unsigned long trace_ip)
738 {
739         struct bch_fs *c = trans->c;
740         struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
741         enum six_lock_type lock_type;
742         unsigned i;
743         int ret;
744
745         EBUG_ON(path->nodes_locked);
746
747         while (1) {
748                 b = READ_ONCE(*rootp);
749                 path->level = READ_ONCE(b->c.level);
750
751                 if (unlikely(path->level < depth_want)) {
752                         /*
753                          * the root is at a lower depth than the depth we want:
754                          * we got to the end of the btree, or we're walking nodes
755                          * at a level greater than some depth and there are no
756                          * nodes >= that depth
757                          */
758                         path->level = depth_want;
759                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
760                                 path->l[i].b = NULL;
761                         return 1;
762                 }
763
764                 lock_type = __btree_lock_want(path, path->level);
765                 ret = btree_node_lock(trans, path, &b->c,
766                                       path->level, lock_type, trace_ip);
767                 if (unlikely(ret)) {
768                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
769                                 continue;
770                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
771                                 return ret;
772                         BUG();
773                 }
774
775                 if (likely(b == READ_ONCE(*rootp) &&
776                            b->c.level == path->level &&
777                            !race_fault())) {
778                         for (i = 0; i < path->level; i++)
779                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
780                         path->l[path->level].b = b;
781                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
782                                 path->l[i].b = NULL;
783
784                         mark_btree_node_locked(trans, path, path->level, lock_type);
785                         bch2_btree_path_level_init(trans, path, b);
786                         return 0;
787                 }
788
789                 six_unlock_type(&b->c.lock, lock_type);
790         }
791 }
792
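/*
 * Prefetch the next few child nodes at this level to warm the btree node
 * cache before descending; how many depends on whether the filesystem has
 * finished starting up and on path->level:
 */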
793 noinline
794 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
795 {
796         struct bch_fs *c = trans->c;
797         struct btree_path_level *l = path_l(path);
798         struct btree_node_iter node_iter = l->iter;
799         struct bkey_packed *k;
800         struct bkey_buf tmp;
801         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
802                 ? (path->level > 1 ? 0 :  2)
803                 : (path->level > 1 ? 1 : 16);
804         bool was_locked = btree_node_locked(path, path->level);
805         int ret = 0;
806
807         bch2_bkey_buf_init(&tmp);
808
809         while (nr-- && !ret) {
810                 if (!bch2_btree_node_relock(trans, path, path->level))
811                         break;
812
813                 bch2_btree_node_iter_advance(&node_iter, l->b);
814                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
815                 if (!k)
816                         break;
817
818                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
819                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
820                                                path->level - 1);
821         }
822
823         if (!was_locked)
824                 btree_node_unlock(trans, path, path->level);
825
826         bch2_bkey_buf_exit(&tmp, c);
827         return ret;
828 }
829
830 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
831                                  struct btree_and_journal_iter *jiter)
832 {
833         struct bch_fs *c = trans->c;
834         struct bkey_s_c k;
835         struct bkey_buf tmp;
836         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
837                 ? (path->level > 1 ? 0 :  2)
838                 : (path->level > 1 ? 1 : 16);
839         bool was_locked = btree_node_locked(path, path->level);
840         int ret = 0;
841
842         bch2_bkey_buf_init(&tmp);
843
844         while (nr-- && !ret) {
845                 if (!bch2_btree_node_relock(trans, path, path->level))
846                         break;
847
848                 bch2_btree_and_journal_iter_advance(jiter);
849                 k = bch2_btree_and_journal_iter_peek(jiter);
850                 if (!k.k)
851                         break;
852
853                 bch2_bkey_buf_reassemble(&tmp, c, k);
854                 ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
855                                                path->level - 1);
856         }
857
858         if (!was_locked)
859                 btree_node_unlock(trans, path, path->level);
860
861         bch2_bkey_buf_exit(&tmp, c);
862         return ret;
863 }
864
865 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
866                                             struct btree_path *path,
867                                             unsigned plevel, struct btree *b)
868 {
869         struct btree_path_level *l = &path->l[plevel];
870         bool locked = btree_node_locked(path, plevel);
871         struct bkey_packed *k;
872         struct bch_btree_ptr_v2 *bp;
873
874         if (!bch2_btree_node_relock(trans, path, plevel))
875                 return;
876
877         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
878         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
879
880         bp = (void *) bkeyp_val(&l->b->format, k);
881         bp->mem_ptr = (unsigned long)b;
882
883         if (!locked)
884                 btree_node_unlock(trans, path, plevel);
885 }
886
887 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
888                                                      struct btree_path *path,
889                                                      unsigned flags,
890                                                      struct bkey_buf *out)
891 {
892         struct bch_fs *c = trans->c;
893         struct btree_path_level *l = path_l(path);
894         struct btree_and_journal_iter jiter;
895         struct bkey_s_c k;
896         int ret = 0;
897
898         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
899
900         k = bch2_btree_and_journal_iter_peek(&jiter);
901
902         bch2_bkey_buf_reassemble(out, c, k);
903
904         if (flags & BTREE_ITER_PREFETCH)
905                 ret = btree_path_prefetch_j(trans, path, &jiter);
906
907         bch2_btree_and_journal_iter_exit(&jiter);
908         return ret;
909 }
910
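/*
 * Descend one level: look up the child node the node iterator points at
 * (consulting the journal if replay hasn't finished), lock it, and initialize
 * the path level for it:
 */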
911 static __always_inline int btree_path_down(struct btree_trans *trans,
912                                            struct btree_path *path,
913                                            unsigned flags,
914                                            unsigned long trace_ip)
915 {
916         struct bch_fs *c = trans->c;
917         struct btree_path_level *l = path_l(path);
918         struct btree *b;
919         unsigned level = path->level - 1;
920         enum six_lock_type lock_type = __btree_lock_want(path, level);
921         struct bkey_buf tmp;
922         int ret;
923
924         EBUG_ON(!btree_node_locked(path, path->level));
925
926         bch2_bkey_buf_init(&tmp);
927
928         if (unlikely(trans->journal_replay_not_finished)) {
929                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
930                 if (ret)
931                         goto err;
932         } else {
933                 bch2_bkey_buf_unpack(&tmp, c, l->b,
934                                  bch2_btree_node_iter_peek(&l->iter, l->b));
935
936                 if (flags & BTREE_ITER_PREFETCH) {
937                         ret = btree_path_prefetch(trans, path);
938                         if (ret)
939                                 goto err;
940                 }
941         }
942
943         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
944         ret = PTR_ERR_OR_ZERO(b);
945         if (unlikely(ret))
946                 goto err;
947
948         if (likely(!trans->journal_replay_not_finished &&
949                    tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
950             unlikely(b != btree_node_mem_ptr(tmp.k)))
951                 btree_node_mem_ptr_set(trans, path, level + 1, b);
952
953         if (btree_node_read_locked(path, level + 1))
954                 btree_node_unlock(trans, path, level + 1);
955
956         mark_btree_node_locked(trans, path, level, lock_type);
957         path->level = level;
958         bch2_btree_path_level_init(trans, path, b);
959
960         bch2_btree_path_verify_locks(path);
961 err:
962         bch2_bkey_buf_exit(&tmp, c);
963         return ret;
964 }
965
966 static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
967                                    unsigned, unsigned long);
968
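/*
 * Re-traverse every path in the transaction: unlock everything, then redo the
 * traversals in sorted order so that locks are acquired in the correct order:
 */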
969 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
970 {
971         struct bch_fs *c = trans->c;
972         struct btree_path *path;
973         unsigned long trace_ip = _RET_IP_;
974         int ret = 0;
975
976         if (trans->in_traverse_all)
977                 return -BCH_ERR_transaction_restart_in_traverse_all;
978
979         trans->in_traverse_all = true;
980 retry_all:
981         trans->restarted = 0;
982         trans->traverse_all_idx = U8_MAX;
983
984         trans_for_each_path(trans, path)
985                 path->should_be_locked = false;
986
987         btree_trans_verify_sorted(trans);
988
989         bch2_trans_unlock(trans);
990         cond_resched();
991
992         if (unlikely(trans->memory_allocation_failure)) {
993                 struct closure cl;
994
995                 closure_init_stack(&cl);
996
997                 do {
998                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
999                         closure_sync(&cl);
1000                 } while (ret);
1001         }
1002
1003         /* Now, redo traversals in correct order: */
1004         trans->traverse_all_idx = 0;
1005         while (trans->traverse_all_idx < trans->nr_sorted) {
1006                 path = trans->paths + trans->sorted[trans->traverse_all_idx];
1007
1008                 /*
1009                  * Traversing a path can cause another path to be added at about
1010                  * the same position:
1011                  */
1012                 if (path->uptodate) {
1013                         ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
1014                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1015                             ret == -ENOMEM)
1016                                 goto retry_all;
1017                         if (ret)
1018                                 goto err;
1019                         BUG_ON(path->uptodate);
1020                 } else {
1021                         trans->traverse_all_idx++;
1022                 }
1023         }
1024
1025         /*
1026          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
1027          * and relock(), relock() won't relock since path->should_be_locked
1028          * isn't set yet, which is all fine
1029          */
1030         trans_for_each_path(trans, path)
1031                 BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
1032 err:
1033         bch2_btree_cache_cannibalize_unlock(c);
1034
1035         trans->in_traverse_all = false;
1036
1037         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1038         return ret;
1039 }
1040
1041 static inline bool btree_path_check_pos_in_node(struct btree_path *path,
1042                                                 unsigned l, int check_pos)
1043 {
1044         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1045                 return false;
1046         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1047                 return false;
1048         return true;
1049 }
1050
1051 static inline bool btree_path_good_node(struct btree_trans *trans,
1052                                         struct btree_path *path,
1053                                         unsigned l, int check_pos)
1054 {
1055         return is_btree_node(path, l) &&
1056                 bch2_btree_node_relock(trans, path, l) &&
1057                 btree_path_check_pos_in_node(path, l, check_pos);
1058 }
1059
1060 static void btree_path_set_level_down(struct btree_trans *trans,
1061                                       struct btree_path *path,
1062                                       unsigned new_level)
1063 {
1064         unsigned l;
1065
1066         path->level = new_level;
1067
1068         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1069                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1070                         btree_node_unlock(trans, path, l);
1071
1072         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1073         bch2_btree_path_verify(trans, path);
1074 }
1075
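/*
 * Walk up the path, dropping levels, until we reach a node that is still
 * valid, relockable and contains path->pos; also take intent locks on the
 * levels above that want them, walking further up if that fails:
 */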
1076 static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
1077                                                          struct btree_path *path,
1078                                                          int check_pos)
1079 {
1080         unsigned i, l = path->level;
1081 again:
1082         while (btree_path_node(path, l) &&
1083                !btree_path_good_node(trans, path, l, check_pos))
1084                 __btree_path_set_level_up(trans, path, l++);
1085
1086         /* If we need intent locks, take them too: */
1087         for (i = l + 1;
1088              i < path->locks_want && btree_path_node(path, i);
1089              i++)
1090                 if (!bch2_btree_node_relock(trans, path, i)) {
1091                         while (l <= i)
1092                                 __btree_path_set_level_up(trans, path, l++);
1093                         goto again;
1094                 }
1095
1096         return l;
1097 }
1098
1099 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1100                                                      struct btree_path *path,
1101                                                      int check_pos)
1102 {
1103         return likely(btree_node_locked(path, path->level) &&
1104                       btree_path_check_pos_in_node(path, path->level, check_pos))
1105                 ? path->level
1106                 : __btree_path_up_until_good_node(trans, path, check_pos);
1107 }
1108
1109 /*
1110  * This is the main state machine for walking down the btree - walks down to a
1111  * specified depth
1112  *
1113  * Returns 0 on success, -EIO on error (error reading in a btree node).
1114  *
1115  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1116  * stashed in the iterator and returned from bch2_trans_exit().
1117  */
1118 static int btree_path_traverse_one(struct btree_trans *trans,
1119                                    struct btree_path *path,
1120                                    unsigned flags,
1121                                    unsigned long trace_ip)
1122 {
1123         unsigned depth_want = path->level;
1124         int ret = -((int) trans->restarted);
1125
1126         if (unlikely(ret))
1127                 goto out;
1128
1129         /*
1130          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1131          * and re-traverse the path without a transaction restart:
1132          */
1133         if (path->should_be_locked) {
1134                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1135                 goto out;
1136         }
1137
1138         if (path->cached) {
1139                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1140                 goto out;
1141         }
1142
1143         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1144                 goto out;
1145
1146         path->level = btree_path_up_until_good_node(trans, path, 0);
1147
1148         EBUG_ON(btree_path_node(path, path->level) &&
1149                 !btree_node_locked(path, path->level));
1150
1151         /*
1152          * Note: path->l[path->level].b may be temporarily NULL here - that
1153          * would indicate to other code that we got to the end of the btree;
1154          * here it indicates that relocking the root failed - it's critical that
1155          * btree_path_lock_root() comes next and that it can't fail
1156          */
1157         while (path->level > depth_want) {
1158                 ret = btree_path_node(path, path->level)
1159                         ? btree_path_down(trans, path, flags, trace_ip)
1160                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1161                 if (unlikely(ret)) {
1162                         if (ret == 1) {
1163                                 /*
1164                                  * No nodes at this level - got to the end of
1165                                  * the btree:
1166                                  */
1167                                 ret = 0;
1168                                 goto out;
1169                         }
1170
1171                         __bch2_btree_path_unlock(trans, path);
1172                         path->level = depth_want;
1173                         path->l[path->level].b = ERR_PTR(ret);
1174                         goto out;
1175                 }
1176         }
1177
1178         path->uptodate = BTREE_ITER_UPTODATE;
1179 out:
1180         BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
1181         bch2_btree_path_verify(trans, path);
1182         return ret;
1183 }
1184
1185 int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
1186                                           struct btree_path *path, unsigned flags)
1187 {
1188         if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
1189                 unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U);
1190                 u64 mask = ~(~0ULL << restart_probability_bits);
1191
1192                 if ((prandom_u32() & mask) == mask) {
1193                         trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
1194                         return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
1195                 }
1196         }
1197
1198         if (path->uptodate < BTREE_ITER_NEED_RELOCK)
1199                 return 0;
1200
1201         return  bch2_trans_cond_resched(trans) ?:
1202                 btree_path_traverse_one(trans, path, flags, _RET_IP_);
1203 }
1204
1205 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1206                             struct btree_path *src)
1207 {
1208         unsigned i, offset = offsetof(struct btree_path, pos);
1209
1210         memcpy((void *) dst + offset,
1211                (void *) src + offset,
1212                sizeof(struct btree_path) - offset);
1213
1214         for (i = 0; i < BTREE_MAX_DEPTH; i++) {
1215                 unsigned t = btree_node_locked_type(dst, i);
1216
1217                 if (t != BTREE_NODE_UNLOCKED)
1218                         six_lock_increment(&dst->l[i].b->c.lock, t);
1219         }
1220 }
1221
1222 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1223                                            bool intent)
1224 {
1225         struct btree_path *new = btree_path_alloc(trans, src);
1226
1227         btree_path_copy(trans, new, src);
1228         __btree_path_get(new, intent);
1229         return new;
1230 }
1231
1232 __flatten
1233 struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
1234                          struct btree_path *path, bool intent,
1235                          unsigned long ip)
1236 {
1237         __btree_path_put(path, intent);
1238         path = btree_path_clone(trans, path, intent);
1239         path->preserve = false;
1240 #ifdef CONFIG_BCACHEFS_DEBUG
1241         path->ip_allocated = ip;
1242 #endif
1243         btree_trans_verify_sorted(trans);
1244         return path;
1245 }
1246
1247 struct btree_path * __must_check
1248 __bch2_btree_path_set_pos(struct btree_trans *trans,
1249                    struct btree_path *path, struct bpos new_pos,
1250                    bool intent, unsigned long ip, int cmp)
1251 {
1252         unsigned level = path->level;
1253
1254         EBUG_ON(trans->restarted);
1255         EBUG_ON(!path->ref);
1256
1257         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1258
1259         path->pos = new_pos;
1260
1261         bch2_btree_path_check_sort_fast(trans, path, cmp);
1262
1263         if (unlikely(path->cached)) {
1264                 btree_node_unlock(trans, path, 0);
1265                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1266                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1267                 goto out;
1268         }
1269
1270         level = btree_path_up_until_good_node(trans, path, cmp);
1271
1272         if (btree_path_node(path, level)) {
1273                 struct btree_path_level *l = &path->l[level];
1274
1275                 BUG_ON(!btree_node_locked(path, level));
1276                 /*
1277                  * We might have to skip over many keys, or just a few: try
1278                  * advancing the node iterator, and reinit it if we have to skip
1279                  * over too many keys or if we're rewinding (since rewinding the
1280                  * node iterator is expensive).
1281                  */
1282                 if (cmp < 0 ||
1283                     !btree_path_advance_to_pos(path, l, 8))
1284                         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
1285
1286                 /*
1287                  * Iterators to interior nodes should always be pointed at the first
1288                  * non-whiteout:
1289                  */
1290                 if (unlikely(level))
1291                         bch2_btree_node_iter_peek(&l->iter, l->b);
1292         }
1293
1294         if (unlikely(level != path->level)) {
1295                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1296                 __bch2_btree_path_unlock(trans, path);
1297         }
1298 out:
1299         bch2_btree_path_verify(trans, path);
1300         return path;
1301 }
1302
1303 /* Btree path: main interface: */
1304
1305 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1306 {
1307         struct btree_path *sib;
1308
1309         sib = prev_btree_path(trans, path);
1310         if (sib && !btree_path_cmp(sib, path))
1311                 return sib;
1312
1313         sib = next_btree_path(trans, path);
1314         if (sib && !btree_path_cmp(sib, path))
1315                 return sib;
1316
1317         return NULL;
1318 }
1319
1320 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1321 {
1322         struct btree_path *sib;
1323
1324         sib = prev_btree_path(trans, path);
1325         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1326                 return sib;
1327
1328         sib = next_btree_path(trans, path);
1329         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1330                 return sib;
1331
1332         return NULL;
1333 }
1334
1335 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1336 {
1337         __bch2_btree_path_unlock(trans, path);
1338         btree_path_list_remove(trans, path);
1339         trans->paths_allocated &= ~(1ULL << path->idx);
1340 }
1341
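/*
 * Drop a reference on @path: when the last reference goes away the path is
 * freed, but only if a duplicate path at the same position (or on the same
 * node) can take over its preserve/should_be_locked flags, or if the path
 * isn't worth keeping around:
 */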
1342 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1343 {
1344         struct btree_path *dup;
1345
1346         EBUG_ON(trans->paths + path->idx != path);
1347         EBUG_ON(!path->ref);
1348
1349         if (!__btree_path_put(path, intent))
1350                 return;
1351
1352         dup = path->preserve
1353                 ? have_path_at_pos(trans, path)
1354                 : have_node_at_pos(trans, path);
1355
1356         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1357                 return;
1358
1359         if (path->should_be_locked &&
1360             !trans->restarted &&
1361             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1362                 return;
1363
1364         if (dup) {
1365                 dup->preserve           |= path->preserve;
1366                 dup->should_be_locked   |= path->should_be_locked;
1367         }
1368
1369         __bch2_path_free(trans, path);
1370 }
1371
1372 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1373                                  bool intent)
1374 {
1375         EBUG_ON(trans->paths + path->idx != path);
1376         EBUG_ON(!path->ref);
1377
1378         if (!__btree_path_put(path, intent))
1379                 return;
1380
1381         __bch2_path_free(trans, path);
1382 }
1383
1384 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1385 {
1386         struct btree_insert_entry *i;
1387
1388         prt_printf(buf, "transaction updates for %s journal seq %llu",
1389                trans->fn, trans->journal_res.seq);
1390         prt_newline(buf);
1391         printbuf_indent_add(buf, 2);
1392
1393         trans_for_each_update(trans, i) {
1394                 struct bkey_s_c old = { &i->old_k, i->old_v };
1395
1396                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1397                        bch2_btree_ids[i->btree_id],
1398                        i->cached,
1399                        (void *) i->ip_allocated);
1400                 prt_newline(buf);
1401
1402                 prt_printf(buf, "  old ");
1403                 bch2_bkey_val_to_text(buf, trans->c, old);
1404                 prt_newline(buf);
1405
1406                 prt_printf(buf, "  new ");
1407                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1408                 prt_newline(buf);
1409         }
1410
1411         printbuf_indent_sub(buf, 2);
1412 }
1413
1414 noinline __cold
1415 void bch2_dump_trans_updates(struct btree_trans *trans)
1416 {
1417         struct printbuf buf = PRINTBUF;
1418
1419         bch2_trans_updates_to_text(&buf, trans);
1420         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1421         printbuf_exit(&buf);
1422 }
1423
1424 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1425 {
1426         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1427                    path->idx, path->ref, path->intent_ref,
1428                    path->preserve ? 'P' : ' ',
1429                    path->should_be_locked ? 'S' : ' ',
1430                    bch2_btree_ids[path->btree_id],
1431                    path->level);
1432         bch2_bpos_to_text(out, path->pos);
1433
1434         prt_printf(out, " locks %u", path->nodes_locked);
1435 #ifdef CONFIG_BCACHEFS_DEBUG
1436         prt_printf(out, " %pS", (void *) path->ip_allocated);
1437 #endif
1438         prt_newline(out);
1439 }
1440
1441 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1442 {
1443         struct btree_path *path;
1444         unsigned idx;
1445
1446         trans_for_each_path_inorder(trans, path, idx)
1447                 bch2_btree_path_to_text(out, path);
1448 }
1449
1450 noinline __cold
1451 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1452 {
1453         struct printbuf buf = PRINTBUF;
1454
1455         bch2_trans_paths_to_text(&buf, trans);
1456         bch2_trans_updates_to_text(&buf, trans);
1457
1458         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1459         printbuf_exit(&buf);
1460 }
1461
1462 noinline
1463 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1464 {
1465         struct btree_transaction_stats *s = btree_trans_stats(trans);
1466         struct printbuf buf = PRINTBUF;
1467
1468         bch2_trans_paths_to_text(&buf, trans);
1469
1470         if (!buf.allocation_failure) {
1471                 mutex_lock(&s->lock);
1472                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1473                         s->nr_max_paths = trans->nr_max_paths =
1474                                 hweight64(trans->paths_allocated);
1475                         swap(s->max_paths_text, buf.buf);
1476                 }
1477                 mutex_unlock(&s->lock);
1478         }
1479
1480         printbuf_exit(&buf);
1481 }
1482
1483 static noinline void btree_path_overflow(struct btree_trans *trans)
1484 {
1485         bch2_dump_trans_paths_updates(trans);
1486         panic("trans path overflow\n");
1487 }
1488
1489 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1490                                                   struct btree_path *pos)
1491 {
1492         struct btree_path *path;
1493         unsigned idx;
1494
1495         if (unlikely(trans->paths_allocated ==
1496                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1497                 btree_path_overflow(trans);
1498
1499         idx = __ffs64(~trans->paths_allocated);
1500         trans->paths_allocated |= 1ULL << idx;
1501
1502         if (unlikely(idx > trans->nr_max_paths))
1503                 bch2_trans_update_max_paths(trans);
1504
1505         path = &trans->paths[idx];
1506
1507         path->idx               = idx;
1508         path->ref               = 0;
1509         path->intent_ref        = 0;
1510         path->nodes_locked      = 0;
1511
1512         btree_path_list_add(trans, pos, path);
1513         return path;
1514 }
1515
1516 struct btree_path *bch2_path_get(struct btree_trans *trans,
1517                                  enum btree_id btree_id, struct bpos pos,
1518                                  unsigned locks_want, unsigned level,
1519                                  unsigned flags, unsigned long ip)
1520 {
1521         struct btree_path *path, *path_pos = NULL;
1522         bool cached = flags & BTREE_ITER_CACHED;
1523         bool intent = flags & BTREE_ITER_INTENT;
1524         int i;
1525
1526         EBUG_ON(trans->restarted);
1527         btree_trans_verify_sorted(trans);
1528         bch2_trans_verify_locks(trans);
1529
1530         trans_for_each_path_inorder(trans, path, i) {
1531                 if (__btree_path_cmp(path,
1532                                      btree_id,
1533                                      cached,
1534                                      pos,
1535                                      level) > 0)
1536                         break;
1537
1538                 path_pos = path;
1539         }
1540
1541         if (path_pos &&
1542             path_pos->cached    == cached &&
1543             path_pos->btree_id  == btree_id &&
1544             path_pos->level     == level) {
1545                 __btree_path_get(path_pos, intent);
1546                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1547         } else {
1548                 path = btree_path_alloc(trans, path_pos);
1549                 path_pos = NULL;
1550
1551                 __btree_path_get(path, intent);
1552                 path->pos                       = pos;
1553                 path->btree_id                  = btree_id;
1554                 path->cached                    = cached;
1555                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1556                 path->should_be_locked          = false;
1557                 path->level                     = level;
1558                 path->locks_want                = locks_want;
1559                 path->nodes_locked              = 0;
1560                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1561                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1562 #ifdef CONFIG_BCACHEFS_DEBUG
1563                 path->ip_allocated              = ip;
1564 #endif
1565                 btree_trans_verify_sorted(trans);
1566         }
1567
1568         if (!(flags & BTREE_ITER_NOPRESERVE))
1569                 path->preserve = true;
1570
1571         if (path->intent_ref)
1572                 locks_want = max(locks_want, level + 1);
1573
1574         /*
1575          * If the path already has locks_want greater than requested, don't
1576          * downgrade it here: after a transaction restart (e.g. because a btree
1577          * node split needed to upgrade locks) we may put/get this iterator again.
1578          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1579          * a successful transaction commit.
1580          */
1581
1582         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1583         if (locks_want > path->locks_want)
1584                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1585
1586         return path;
1587 }
1588
1589 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1590 {
1591
1592         struct btree_path_level *l = path_l(path);
1593         struct bkey_packed *_k;
1594         struct bkey_s_c k;
1595
1596         if (unlikely(!l->b))
1597                 return bkey_s_c_null;
1598
1599         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1600         EBUG_ON(!btree_node_locked(path, path->level));
1601
1602         if (!path->cached) {
1603                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1604                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1605
1606                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1607
1608                 if (!k.k || !bpos_eq(path->pos, k.k->p))
1609                         goto hole;
1610         } else {
1611                 struct bkey_cached *ck = (void *) path->l[0].b;
1612
1613                 EBUG_ON(ck &&
1614                         (path->btree_id != ck->key.btree_id ||
1615                          !bkey_eq(path->pos, ck->key.pos)));
1616                 EBUG_ON(!ck || !ck->valid);
1617
1618                 *u = ck->k->k;
1619                 k = bkey_i_to_s_c(ck->k);
1620         }
1621
1622         return k;
1623 hole:
1624         bkey_init(u);
1625         u->p = path->pos;
1626         return (struct bkey_s_c) { u, NULL };
1627 }
1628
1629 /* Btree iterators: */
1630
1631 int __must_check
1632 __bch2_btree_iter_traverse(struct btree_iter *iter)
1633 {
1634         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1635 }
1636
1637 int __must_check
1638 bch2_btree_iter_traverse(struct btree_iter *iter)
1639 {
1640         int ret;
1641
1642         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1643                                         btree_iter_search_key(iter),
1644                                         iter->flags & BTREE_ITER_INTENT,
1645                                         btree_iter_ip_allocated(iter));
1646
1647         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1648         if (ret)
1649                 return ret;
1650
1651         btree_path_set_should_be_locked(iter->path);
1652         return 0;
1653 }
1654
1655 /* Iterate across nodes (leaf and interior nodes) */
1656
1657 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1658 {
1659         struct btree_trans *trans = iter->trans;
1660         struct btree *b = NULL;
1661         int ret;
1662
1663         EBUG_ON(iter->path->cached);
1664         bch2_btree_iter_verify(iter);
1665
1666         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1667         if (ret)
1668                 goto err;
1669
1670         b = btree_path_node(iter->path, iter->path->level);
1671         if (!b)
1672                 goto out;
1673
1674         BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1675
1676         bkey_init(&iter->k);
1677         iter->k.p = iter->pos = b->key.k.p;
1678
1679         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1680                                         iter->flags & BTREE_ITER_INTENT,
1681                                         btree_iter_ip_allocated(iter));
1682         btree_path_set_should_be_locked(iter->path);
1683 out:
1684         bch2_btree_iter_verify_entry_exit(iter);
1685         bch2_btree_iter_verify(iter);
1686
1687         return b;
1688 err:
1689         b = ERR_PTR(ret);
1690         goto out;
1691 }
1692
1693 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1694 {
1695         struct btree_trans *trans = iter->trans;
1696         struct btree_path *path = iter->path;
1697         struct btree *b = NULL;
1698         int ret;
1699
1700         BUG_ON(trans->restarted);
1701         EBUG_ON(iter->path->cached);
1702         bch2_btree_iter_verify(iter);
1703
1704         /* already at end? */
1705         if (!btree_path_node(path, path->level))
1706                 return NULL;
1707
1708         /* got to end? */
1709         if (!btree_path_node(path, path->level + 1)) {
1710                 btree_path_set_level_up(trans, path);
1711                 return NULL;
1712         }
1713
1714         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1715                 __bch2_btree_path_unlock(trans, path);
1716                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1717                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1718                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1719                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1720                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1721                 goto err;
1722         }
1723
1724         b = btree_path_node(path, path->level + 1);
1725
1726         if (bpos_eq(iter->pos, b->key.k.p)) {
1727                 __btree_path_set_level_up(trans, path, path->level++);
1728         } else {
1729                 /*
1730                  * Haven't gotten to the end of the parent node: go back down to
1731                  * the next child node
1732                  */
1733                 path = iter->path =
1734                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1735                                            iter->flags & BTREE_ITER_INTENT,
1736                                            btree_iter_ip_allocated(iter));
1737
1738                 btree_path_set_level_down(trans, path, iter->min_depth);
1739
1740                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1741                 if (ret)
1742                         goto err;
1743
1744                 b = path->l[path->level].b;
1745         }
1746
1747         bkey_init(&iter->k);
1748         iter->k.p = iter->pos = b->key.k.p;
1749
1750         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1751                                         iter->flags & BTREE_ITER_INTENT,
1752                                         btree_iter_ip_allocated(iter));
1753         btree_path_set_should_be_locked(iter->path);
1754         BUG_ON(iter->path->uptodate);
1755 out:
1756         bch2_btree_iter_verify_entry_exit(iter);
1757         bch2_btree_iter_verify(iter);
1758
1759         return b;
1760 err:
1761         b = ERR_PTR(ret);
1762         goto out;
1763 }
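
/*
 * Illustrative sketch (not taken from this file): the node iterators above
 * are typically driven in a loop like the one below; "btree_id" and "depth"
 * are placeholders, and transaction restart handling is elided:
 *
 *	struct btree_iter iter;
 *	struct btree *b;
 *	int ret = 0;
 *
 *	bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, depth, 0);
 *	for (b = bch2_btree_iter_peek_node(&iter);
 *	     !(ret = PTR_ERR_OR_ZERO(b)) && b;
 *	     b = bch2_btree_iter_next_node(&iter)) {
 *		// operate on btree node b
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */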
1764
1765 /* Iterate across keys (in leaf nodes only) */
1766
1767 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1768 {
1769         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1770                 struct bpos pos = iter->k.p;
1771                 bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1772                              ? bpos_eq(pos, SPOS_MAX)
1773                              : bkey_eq(pos, SPOS_MAX));
1774
1775                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1776                         pos = bkey_successor(iter, pos);
1777                 bch2_btree_iter_set_pos(iter, pos);
1778                 return ret;
1779         } else {
1780                 if (!btree_path_node(iter->path, iter->path->level))
1781                         return true;
1782
1783                 iter->advanced = true;
1784                 return false;
1785         }
1786 }
1787
1788 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1789 {
1790         struct bpos pos = bkey_start_pos(&iter->k);
1791         bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1792                      ? bpos_eq(pos, POS_MIN)
1793                      : bkey_eq(pos, POS_MIN));
1794
1795         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1796                 pos = bkey_predecessor(iter, pos);
1797         bch2_btree_iter_set_pos(iter, pos);
1798         return ret;
1799 }
1800
1801 static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1802                                                       enum btree_id btree_id,
1803                                                       struct bpos pos)
1804 {
1805         struct btree_insert_entry *i;
1806         struct bkey_i *ret = NULL;
1807
1808         trans_for_each_update(trans, i) {
1809                 if (i->btree_id < btree_id)
1810                         continue;
1811                 if (i->btree_id > btree_id)
1812                         break;
1813                 if (bpos_lt(i->k->k.p, pos))
1814                         continue;
1815                 if (i->key_cache_already_flushed)
1816                         continue;
1817                 if (!ret || bpos_lt(i->k->k.p, ret->k.p))
1818                         ret = i->k;
1819         }
1820
1821         return ret;
1822 }
1823
1824 struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1825                                        struct btree_iter *iter,
1826                                        struct bpos start_pos,
1827                                        struct bpos end_pos)
1828 {
1829         struct bkey_i *k;
1830
1831         if (bpos_lt(start_pos, iter->journal_pos))
1832                 iter->journal_idx = 0;
1833
1834         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
1835                                         iter->path->level,
1836                                         start_pos, end_pos,
1837                                         &iter->journal_idx);
1838
1839         iter->journal_pos = k ? k->k.p : end_pos;
1840         return k;
1841 }
1842
1843 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans,
1844                                             struct btree_iter *iter,
1845                                             struct bpos pos)
1846 {
1847         return bch2_btree_journal_peek(trans, iter, pos, pos);
1848 }
1849
1850 static noinline
1851 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1852                                          struct btree_iter *iter,
1853                                          struct bkey_s_c k)
1854 {
1855         struct bkey_i *next_journal =
1856                 bch2_btree_journal_peek(trans, iter, iter->path->pos,
1857                                 k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
1858
1859         if (next_journal) {
1860                 iter->k = next_journal->k;
1861                 k = bkey_i_to_s_c(next_journal);
1862         }
1863
1864         return k;
1865 }
1866
1867 /*
1868  * Checks btree key cache for key at iter->pos and returns it if present, or
1869  * bkey_s_c_null:
1870  */
1871 static noinline
1872 struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1873 {
1874         struct btree_trans *trans = iter->trans;
1875         struct bch_fs *c = trans->c;
1876         struct bkey u;
1877         int ret;
1878
1879         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1880                 return bkey_s_c_null;
1881
1882         if (!iter->key_cache_path)
1883                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1884                                                      iter->flags & BTREE_ITER_INTENT, 0,
1885                                                      iter->flags|BTREE_ITER_CACHED,
1886                                                      _THIS_IP_);
1887
1888         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1889                                         iter->flags & BTREE_ITER_INTENT,
1890                                         btree_iter_ip_allocated(iter));
1891
1892         ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
1893         if (unlikely(ret))
1894                 return bkey_s_c_err(ret);
1895
1896         btree_path_set_should_be_locked(iter->key_cache_path);
1897
1898         return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1899 }
1900
1901 static noinline
1902 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1903 {
1904         struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos);
1905         int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_);
1906
1907         return err ? bkey_s_c_err(err) : ret;
1908 }
1909
1910 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1911 {
1912         struct btree_trans *trans = iter->trans;
1913         struct bkey_i *next_update;
1914         struct bkey_s_c k, k2;
1915         int ret;
1916
1917         EBUG_ON(iter->path->cached);
1918         bch2_btree_iter_verify(iter);
1919
1920         while (1) {
1921                 struct btree_path_level *l;
1922
1923                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1924                                         iter->flags & BTREE_ITER_INTENT,
1925                                         btree_iter_ip_allocated(iter));
1926
1927                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1928                 if (unlikely(ret)) {
1929                         /* ensure that iter->k is consistent with iter->pos: */
1930                         bch2_btree_iter_set_pos(iter, iter->pos);
1931                         k = bkey_s_c_err(ret);
1932                         goto out;
1933                 }
1934
1935                 l = path_l(iter->path);
1936
1937                 if (unlikely(!l->b)) {
1938                         /* No btree nodes at requested level: */
1939                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1940                         k = bkey_s_c_null;
1941                         goto out;
1942                 }
1943
1944                 btree_path_set_should_be_locked(iter->path);
1945
1946                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
1947
1948                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
1949                     k.k &&
1950                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
1951                         k = k2;
1952                         ret = bkey_err(k);
1953                         if (ret) {
1954                                 bch2_btree_iter_set_pos(iter, iter->pos);
1955                                 goto out;
1956                         }
1957                 }
1958
1959                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
1960                         k = btree_trans_peek_journal(trans, iter, k);
1961
1962                 next_update = iter->flags & BTREE_ITER_WITH_UPDATES
1963                         ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
1964                         : NULL;
1965                 if (next_update &&
1966                     bpos_le(next_update->k.p,
1967                             k.k ? k.k->p : l->b->key.k.p)) {
1968                         iter->k = next_update->k;
1969                         k = bkey_i_to_s_c(next_update);
1970                 }
1971
1972                 if (k.k && bkey_deleted(k.k)) {
1973                         /*
1974                          * If we've got a whiteout, and it's after the search
1975                          * key, advance the search key to the whiteout instead
1976                          * of just after the whiteout - it might be a btree
1977                          * whiteout, with a real key at the same position, since
1978                          * in the btree, deleted keys sort before non-deleted keys.
1979                          */
1980                         search_key = !bpos_eq(search_key, k.k->p)
1981                                 ? k.k->p
1982                                 : bpos_successor(k.k->p);
1983                         continue;
1984                 }
1985
1986                 if (likely(k.k)) {
1987                         break;
1988                 } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
1989                         /* Advance to next leaf node: */
1990                         search_key = bpos_successor(l->b->key.k.p);
1991                 } else {
1992                         /* End of btree: */
1993                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1994                         k = bkey_s_c_null;
1995                         goto out;
1996                 }
1997         }
1998 out:
1999         bch2_btree_iter_verify(iter);
2000
2001         return k;
2002 }
2003
2004 /**
2005  * bch2_btree_iter_peek_upto: returns first key greater than or equal to
2006  * iterator's current position, but not exceeding @end
2007  */
2008 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
2009 {
2010         struct btree_trans *trans = iter->trans;
2011         struct bpos search_key = btree_iter_search_key(iter);
2012         struct bkey_s_c k;
2013         struct bpos iter_pos;
2014         int ret;
2015
2016         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2017         EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
2018
2019         if (iter->update_path) {
2020                 bch2_path_put_nokeep(trans, iter->update_path,
2021                                      iter->flags & BTREE_ITER_INTENT);
2022                 iter->update_path = NULL;
2023         }
2024
2025         bch2_btree_iter_verify_entry_exit(iter);
2026
2027         while (1) {
2028                 k = __bch2_btree_iter_peek(iter, search_key);
2029                 if (unlikely(!k.k))
2030                         goto end;
2031                 if (unlikely(bkey_err(k)))
2032                         goto out_no_locked;
2033
2034                 /*
2035                  * iter->pos should be monotonically increasing, and always be
2036                  * equal to the key we just returned - except extents can
2037                  * straddle iter->pos:
2038                  */
2039                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
2040                         iter_pos = k.k->p;
2041                 else
2042                         iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2043
2044                 if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
2045                              ? bkey_gt(iter_pos, end)
2046                              : bkey_ge(iter_pos, end)))
2047                         goto end;
2048
2049                 if (iter->update_path &&
2050                     !bkey_eq(iter->update_path->pos, k.k->p)) {
2051                         bch2_path_put_nokeep(trans, iter->update_path,
2052                                              iter->flags & BTREE_ITER_INTENT);
2053                         iter->update_path = NULL;
2054                 }
2055
2056                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2057                     (iter->flags & BTREE_ITER_INTENT) &&
2058                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2059                     !iter->update_path) {
2060                         struct bpos pos = k.k->p;
2061
2062                         if (pos.snapshot < iter->snapshot) {
2063                                 search_key = bpos_successor(k.k->p);
2064                                 continue;
2065                         }
2066
2067                         pos.snapshot = iter->snapshot;
2068
2069                         /*
2070                          * advance, same as on exit for iter->path, but only up
2071                          * to snapshot
2072                          */
2073                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2074                         iter->update_path = iter->path;
2075
2076                         iter->update_path = bch2_btree_path_set_pos(trans,
2077                                                 iter->update_path, pos,
2078                                                 iter->flags & BTREE_ITER_INTENT,
2079                                                 _THIS_IP_);
2080                 }
2081
2082                 /*
2083                  * We can never have a key in a leaf node at POS_MAX, so
2084                  * we don't have to check these successor() calls:
2085                  */
2086                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2087                     !bch2_snapshot_is_ancestor(trans->c,
2088                                                iter->snapshot,
2089                                                k.k->p.snapshot)) {
2090                         search_key = bpos_successor(k.k->p);
2091                         continue;
2092                 }
2093
2094                 if (bkey_whiteout(k.k) &&
2095                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2096                         search_key = bkey_successor(iter, k.k->p);
2097                         continue;
2098                 }
2099
2100                 break;
2101         }
2102
2103         iter->pos = iter_pos;
2104
2105         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2106                                 iter->flags & BTREE_ITER_INTENT,
2107                                 btree_iter_ip_allocated(iter));
2108
2109         btree_path_set_should_be_locked(iter->path);
2110 out_no_locked:
2111         if (iter->update_path) {
2112                 if (iter->update_path->uptodate &&
2113                     (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)))
2114                         k = bkey_s_c_err(ret);
2115                 else
2116                         btree_path_set_should_be_locked(iter->update_path);
2117         }
2118
2119         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2120                 iter->pos.snapshot = iter->snapshot;
2121
2122         ret = bch2_btree_iter_verify_ret(iter, k);
2123         if (unlikely(ret)) {
2124                 bch2_btree_iter_set_pos(iter, iter->pos);
2125                 k = bkey_s_c_err(ret);
2126         }
2127
2128         bch2_btree_iter_verify_entry_exit(iter);
2129
2130         return k;
2131 end:
2132         bch2_btree_iter_set_pos(iter, end);
2133         k = bkey_s_c_null;
2134         goto out_no_locked;
2135 }
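
/*
 * Illustrative sketch (not taken from this file): iterating keys in a range
 * with bch2_btree_iter_peek_upto(); "start" and "end" are assumed bpos values
 * and transaction restart handling is elided:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret = 0;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, start, 0);
 *	while ((k = bch2_btree_iter_peek_upto(&iter, end)).k &&
 *	       !(ret = bkey_err(k))) {
 *		// use k
 *		bch2_btree_iter_advance(&iter);
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */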
2136
2137 /**
2138  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2139  * to iterator's current position, returning keys from every level of the btree.
2140  * For keys at different levels of the btree that compare equal, the key from
2141  * the lower level (leaf) is returned first.
2142  */
2143 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2144 {
2145         struct btree_trans *trans = iter->trans;
2146         struct bkey_s_c k;
2147         int ret;
2148
2149         EBUG_ON(iter->path->cached);
2150         bch2_btree_iter_verify(iter);
2151         BUG_ON(iter->path->level < iter->min_depth);
2152         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2153         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2154
2155         while (1) {
2156                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2157                                         iter->flags & BTREE_ITER_INTENT,
2158                                         btree_iter_ip_allocated(iter));
2159
2160                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2161                 if (unlikely(ret)) {
2162                         /* ensure that iter->k is consistent with iter->pos: */
2163                         bch2_btree_iter_set_pos(iter, iter->pos);
2164                         k = bkey_s_c_err(ret);
2165                         goto out_no_locked;
2166                 }
2167
2168                 /* Already at end? */
2169                 if (!btree_path_node(iter->path, iter->path->level)) {
2170                         k = bkey_s_c_null;
2171                         goto out_no_locked;
2172                 }
2173
2174                 k = btree_path_level_peek_all(trans->c,
2175                                 &iter->path->l[iter->path->level], &iter->k);
2176
2177                 /* Check if we should go up to the parent node: */
2178                 if (!k.k ||
2179                     (iter->advanced &&
2180                      bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
2181                         iter->pos = path_l(iter->path)->b->key.k.p;
2182                         btree_path_set_level_up(trans, iter->path);
2183                         iter->advanced = false;
2184                         continue;
2185                 }
2186
2187                 /*
2188                  * Check if we should go back down to a leaf:
2189                  * If we're not in a leaf node, we only return the current key
2190                  * if it exactly matches iter->pos - otherwise we first have to
2191                  * go back to the leaf:
2192                  */
2193                 if (iter->path->level != iter->min_depth &&
2194                     (iter->advanced ||
2195                      !k.k ||
2196                      !bpos_eq(iter->pos, k.k->p))) {
2197                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2198                         iter->pos = bpos_successor(iter->pos);
2199                         iter->advanced = false;
2200                         continue;
2201                 }
2202
2203                 /* Check if we should go to the next key: */
2204                 if (iter->path->level == iter->min_depth &&
2205                     iter->advanced &&
2206                     k.k &&
2207                     bpos_eq(iter->pos, k.k->p)) {
2208                         iter->pos = bpos_successor(iter->pos);
2209                         iter->advanced = false;
2210                         continue;
2211                 }
2212
2213                 if (iter->advanced &&
2214                     iter->path->level == iter->min_depth &&
2215                     !bpos_eq(k.k->p, iter->pos))
2216                         iter->advanced = false;
2217
2218                 BUG_ON(iter->advanced);
2219                 BUG_ON(!k.k);
2220                 break;
2221         }
2222
2223         iter->pos = k.k->p;
2224         btree_path_set_should_be_locked(iter->path);
2225 out_no_locked:
2226         bch2_btree_iter_verify(iter);
2227
2228         return k;
2229 }
2230
2231 /**
2232  * bch2_btree_iter_next: returns first key greater than iterator's current
2233  * position
2234  */
2235 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2236 {
2237         if (!bch2_btree_iter_advance(iter))
2238                 return bkey_s_c_null;
2239
2240         return bch2_btree_iter_peek(iter);
2241 }
2242
2243 /**
2244  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2245  * iterator's current position
2246  */
2247 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2248 {
2249         struct btree_trans *trans = iter->trans;
2250         struct bpos search_key = iter->pos;
2251         struct btree_path *saved_path = NULL;
2252         struct bkey_s_c k;
2253         struct bkey saved_k;
2254         const struct bch_val *saved_v;
2255         int ret;
2256
2257         EBUG_ON(iter->path->cached || iter->path->level);
2258         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2259
2260         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2261                 return bkey_s_c_err(-EIO);
2262
2263         bch2_btree_iter_verify(iter);
2264         bch2_btree_iter_verify_entry_exit(iter);
2265
2266         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2267                 search_key.snapshot = U32_MAX;
2268
2269         while (1) {
2270                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2271                                                 iter->flags & BTREE_ITER_INTENT,
2272                                                 btree_iter_ip_allocated(iter));
2273
2274                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2275                 if (unlikely(ret)) {
2276                         /* ensure that iter->k is consistent with iter->pos: */
2277                         bch2_btree_iter_set_pos(iter, iter->pos);
2278                         k = bkey_s_c_err(ret);
2279                         goto out_no_locked;
2280                 }
2281
2282                 k = btree_path_level_peek(trans, iter->path,
2283                                           &iter->path->l[0], &iter->k);
2284                 if (!k.k ||
2285                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2286                      ? bpos_ge(bkey_start_pos(k.k), search_key)
2287                      : bpos_gt(k.k->p, search_key)))
2288                         k = btree_path_level_prev(trans, iter->path,
2289                                                   &iter->path->l[0], &iter->k);
2290
2291                 bch2_btree_path_check_sort(trans, iter->path, 0);
2292
2293                 if (likely(k.k)) {
2294                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2295                                 if (k.k->p.snapshot == iter->snapshot)
2296                                         goto got_key;
2297
2298                                 /*
2299                                  * If we have a saved candidate, and we're no
2300                                  * longer at the same _key_ (not pos), return
2301                                  * that candidate
2302                                  */
2303                                 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
2304                                         bch2_path_put_nokeep(trans, iter->path,
2305                                                       iter->flags & BTREE_ITER_INTENT);
2306                                         iter->path = saved_path;
2307                                         saved_path = NULL;
2308                                         iter->k = saved_k;
2309                                         k.v     = saved_v;
2310                                         goto got_key;
2311                                 }
2312
2313                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2314                                                               iter->snapshot,
2315                                                               k.k->p.snapshot)) {
2316                                         if (saved_path)
2317                                                 bch2_path_put_nokeep(trans, saved_path,
2318                                                       iter->flags & BTREE_ITER_INTENT);
2319                                         saved_path = btree_path_clone(trans, iter->path,
2320                                                                 iter->flags & BTREE_ITER_INTENT);
2321                                         saved_k = *k.k;
2322                                         saved_v = k.v;
2323                                 }
2324
2325                                 search_key = bpos_predecessor(k.k->p);
2326                                 continue;
2327                         }
2328 got_key:
2329                         if (bkey_whiteout(k.k) &&
2330                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2331                                 search_key = bkey_predecessor(iter, k.k->p);
2332                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2333                                         search_key.snapshot = U32_MAX;
2334                                 continue;
2335                         }
2336
2337                         break;
2338                 } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
2339                         /* Advance to previous leaf node: */
2340                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2341                 } else {
2342                         /* Start of btree: */
2343                         bch2_btree_iter_set_pos(iter, POS_MIN);
2344                         k = bkey_s_c_null;
2345                         goto out_no_locked;
2346                 }
2347         }
2348
2349         EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
2350
2351         /* Extents can straddle iter->pos: */
2352         if (bkey_lt(k.k->p, iter->pos))
2353                 iter->pos = k.k->p;
2354
2355         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2356                 iter->pos.snapshot = iter->snapshot;
2357
2358         btree_path_set_should_be_locked(iter->path);
2359 out_no_locked:
2360         if (saved_path)
2361                 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2362
2363         bch2_btree_iter_verify_entry_exit(iter);
2364         bch2_btree_iter_verify(iter);
2365
2366         return k;
2367 }
2368
2369 /**
2370  * bch2_btree_iter_prev: returns first key less than iterator's current
2371  * position
2372  */
2373 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2374 {
2375         if (!bch2_btree_iter_rewind(iter))
2376                 return bkey_s_c_null;
2377
2378         return bch2_btree_iter_peek_prev(iter);
2379 }
2380
2381 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2382 {
2383         struct btree_trans *trans = iter->trans;
2384         struct bpos search_key;
2385         struct bkey_s_c k;
2386         int ret;
2387
2388         bch2_btree_iter_verify(iter);
2389         bch2_btree_iter_verify_entry_exit(iter);
2390         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2391         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2392
2393         /* extents can't span inode numbers: */
2394         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2395             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2396                 if (iter->pos.inode == KEY_INODE_MAX)
2397                         return bkey_s_c_null;
2398
2399                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2400         }
2401
2402         search_key = btree_iter_search_key(iter);
2403         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2404                                         iter->flags & BTREE_ITER_INTENT,
2405                                         btree_iter_ip_allocated(iter));
2406
2407         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2408         if (unlikely(ret)) {
2409                 k = bkey_s_c_err(ret);
2410                 goto out_no_locked;
2411         }
2412
2413         if ((iter->flags & BTREE_ITER_CACHED) ||
2414             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2415                 struct bkey_i *next_update;
2416
2417                 if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
2418                     (next_update = btree_trans_peek_updates(trans,
2419                                                 iter->btree_id, search_key)) &&
2420                     bpos_eq(next_update->k.p, iter->pos)) {
2421                         iter->k = next_update->k;
2422                         k = bkey_i_to_s_c(next_update);
2423                         goto out;
2424                 }
2425
2426                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2427                     (next_update = bch2_btree_journal_peek_slot(trans,
2428                                         iter, iter->pos))) {
2429                         iter->k = next_update->k;
2430                         k = bkey_i_to_s_c(next_update);
2431                         goto out;
2432                 }
2433
2434                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2435                     (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) {
2436                         if (!bkey_err(k))
2437                                 iter->k = *k.k;
2438                         /* We're not returning a key from iter->path: */
2439                         goto out_no_locked;
2440                 }
2441
2442                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2443                 if (unlikely(!k.k))
2444                         goto out_no_locked;
2445         } else {
2446                 struct bpos next;
2447                 struct bpos end = iter->pos;
2448
2449                 if (iter->flags & BTREE_ITER_IS_EXTENTS)
2450                         end.offset = U64_MAX;
2451
2452                 EBUG_ON(iter->path->level);
2453
2454                 if (iter->flags & BTREE_ITER_INTENT) {
2455                         struct btree_iter iter2;
2456
2457                         bch2_trans_copy_iter(&iter2, iter);
2458                         k = bch2_btree_iter_peek_upto(&iter2, end);
2459
2460                         if (k.k && !bkey_err(k)) {
2461                                 iter->k = iter2.k;
2462                                 k.k = &iter->k;
2463                         }
2464                         bch2_trans_iter_exit(trans, &iter2);
2465                 } else {
2466                         struct bpos pos = iter->pos;
2467
2468                         k = bch2_btree_iter_peek_upto(iter, end);
2469                         if (unlikely(bkey_err(k)))
2470                                 bch2_btree_iter_set_pos(iter, pos);
2471                         else
2472                                 iter->pos = pos;
2473                 }
2474
2475                 if (unlikely(bkey_err(k)))
2476                         goto out_no_locked;
2477
2478                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2479
2480                 if (bkey_lt(iter->pos, next)) {
2481                         bkey_init(&iter->k);
2482                         iter->k.p = iter->pos;
2483
2484                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2485                                 bch2_key_resize(&iter->k,
2486                                                 min_t(u64, KEY_SIZE_MAX,
2487                                                       (next.inode == iter->pos.inode
2488                                                        ? next.offset
2489                                                        : KEY_OFFSET_MAX) -
2490                                                       iter->pos.offset));
2491                                 EBUG_ON(!iter->k.size);
2492                         }
2493
2494                         k = (struct bkey_s_c) { &iter->k, NULL };
2495                 }
2496         }
2497 out:
2498         btree_path_set_should_be_locked(iter->path);
2499 out_no_locked:
2500         bch2_btree_iter_verify_entry_exit(iter);
2501         bch2_btree_iter_verify(iter);
2502         ret = bch2_btree_iter_verify_ret(iter, k);
2503         if (unlikely(ret))
2504                 return bkey_s_c_err(ret);
2505
2506         return k;
2507 }
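
/*
 * Illustrative sketch (not taken from this file): a point lookup with
 * bch2_btree_iter_peek_slot(); "pos" is an assumed bpos and the btree id is
 * only an example:
 *
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	if (!ret) {
 *		// k is either the key at pos or a synthesized hole/deleted key
 *	}
 *	bch2_trans_iter_exit(trans, &iter);
 */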
2508
2509 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2510 {
2511         if (!bch2_btree_iter_advance(iter))
2512                 return bkey_s_c_null;
2513
2514         return bch2_btree_iter_peek_slot(iter);
2515 }
2516
2517 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2518 {
2519         if (!bch2_btree_iter_rewind(iter))
2520                 return bkey_s_c_null;
2521
2522         return bch2_btree_iter_peek_slot(iter);
2523 }
2524
2525 /* new transactional stuff: */
2526
2527 static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
2528                                                 struct btree_path *path)
2529 {
2530         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2531         EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2532         EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
2533 }
2534
2535 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2536 {
2537 #ifdef CONFIG_BCACHEFS_DEBUG
2538         unsigned i;
2539
2540         for (i = 0; i < trans->nr_sorted; i++)
2541                 btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
2542 #endif
2543 }
2544
2545 static void btree_trans_verify_sorted(struct btree_trans *trans)
2546 {
2547 #ifdef CONFIG_BCACHEFS_DEBUG
2548         struct btree_path *path, *prev = NULL;
2549         unsigned i;
2550
2551         if (!bch2_debug_check_iterators)
2552                 return;
2553
2554         trans_for_each_path_inorder(trans, path, i) {
2555                 if (prev && btree_path_cmp(prev, path) > 0) {
2556                         bch2_dump_trans_paths_updates(trans);
2557                         panic("trans paths out of order!\n");
2558                 }
2559                 prev = path;
2560         }
2561 #endif
2562 }
2563
2564 static inline void btree_path_swap(struct btree_trans *trans,
2565                                    struct btree_path *l, struct btree_path *r)
2566 {
2567         swap(l->sorted_idx, r->sorted_idx);
2568         swap(trans->sorted[l->sorted_idx],
2569              trans->sorted[r->sorted_idx]);
2570
2571         btree_path_verify_sorted_ref(trans, l);
2572         btree_path_verify_sorted_ref(trans, r);
2573 }
2574
2575 static inline struct btree_path *sib_btree_path(struct btree_trans *trans,
2576                                                 struct btree_path *path, int sib)
2577 {
2578         unsigned idx = (unsigned) path->sorted_idx + sib;
2579
2580         EBUG_ON(sib != -1 && sib != 1);
2581
2582         return idx < trans->nr_sorted
2583                 ? trans->paths + trans->sorted[idx]
2584                 : NULL;
2585 }
2586
2587 static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
2588                                                    struct btree_path *path,
2589                                                    int cmp)
2590 {
2591         struct btree_path *n;
2592         int cmp2;
2593
2594         EBUG_ON(!cmp);
2595
2596         while ((n = sib_btree_path(trans, path, cmp)) &&
2597                (cmp2 = btree_path_cmp(n, path)) &&
2598                cmp2 != cmp)
2599                 btree_path_swap(trans, n, path);
2600
2601         btree_trans_verify_sorted(trans);
2602 }
2603
2604 inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
2605                                        int cmp)
2606 {
2607         struct btree_path *n;
2608
2609         if (cmp <= 0) {
2610                 n = prev_btree_path(trans, path);
2611                 if (n && btree_path_cmp(n, path) > 0) {
2612                         do {
2613                                 btree_path_swap(trans, n, path);
2614                                 n = prev_btree_path(trans, path);
2615                         } while (n && btree_path_cmp(n, path) > 0);
2616
2617                         goto out;
2618                 }
2619         }
2620
2621         if (cmp >= 0) {
2622                 n = next_btree_path(trans, path);
2623                 if (n && btree_path_cmp(path, n) > 0) {
2624                         do {
2625                                 btree_path_swap(trans, path, n);
2626                                 n = next_btree_path(trans, path);
2627                         } while (n && btree_path_cmp(path, n) > 0);
2628                 }
2629         }
2630 out:
2631         btree_trans_verify_sorted(trans);
2632 }
2633
2634 static inline void btree_path_list_remove(struct btree_trans *trans,
2635                                           struct btree_path *path)
2636 {
2637         unsigned i;
2638
2639         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2640
2641         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2642
2643         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2644                 trans->paths[trans->sorted[i]].sorted_idx = i;
2645
2646         path->sorted_idx = U8_MAX;
2647
2648         btree_trans_verify_sorted_refs(trans);
2649 }
2650
2651 static inline void btree_path_list_add(struct btree_trans *trans,
2652                                        struct btree_path *pos,
2653                                        struct btree_path *path)
2654 {
2655         unsigned i;
2656
2657         btree_trans_verify_sorted_refs(trans);
2658
2659         path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2660
2661         if (unlikely(trans->in_traverse_all) &&
2662             trans->traverse_all_idx != U8_MAX &&
2663             trans->traverse_all_idx >= path->sorted_idx)
2664                 trans->traverse_all_idx++;
2665
2666         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2667
2668         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2669                 trans->paths[trans->sorted[i]].sorted_idx = i;
2670
2671         btree_trans_verify_sorted_refs(trans);
2672 }
2673
2674 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2675 {
2676         if (iter->path)
2677                 bch2_path_put(trans, iter->path,
2678                               iter->flags & BTREE_ITER_INTENT);
2679         if (iter->update_path)
2680                 bch2_path_put_nokeep(trans, iter->update_path,
2681                               iter->flags & BTREE_ITER_INTENT);
2682         if (iter->key_cache_path)
2683                 bch2_path_put(trans, iter->key_cache_path,
2684                               iter->flags & BTREE_ITER_INTENT);
2685         iter->path = NULL;
2686         iter->update_path = NULL;
2687         iter->key_cache_path = NULL;
2688 }
2689
2690 static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans,
2691                           struct btree_iter *iter,
2692                           unsigned btree_id, struct bpos pos,
2693                           unsigned flags)
2694 {
2695         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2696                                bch2_btree_iter_flags(trans, btree_id, flags),
2697                                _RET_IP_);
2698 }
2699
2700 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
2701                           struct btree_iter *iter,
2702                           unsigned btree_id, struct bpos pos,
2703                           unsigned flags)
2704 {
2705         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2706                                bch2_btree_iter_flags(trans, btree_id, flags),
2707                                _RET_IP_);
2708 }
2709
2710 void bch2_trans_node_iter_init(struct btree_trans *trans,
2711                                struct btree_iter *iter,
2712                                enum btree_id btree_id,
2713                                struct bpos pos,
2714                                unsigned locks_want,
2715                                unsigned depth,
2716                                unsigned flags)
2717 {
2718         flags |= BTREE_ITER_NOT_EXTENTS;
2719         flags |= __BTREE_ITER_ALL_SNAPSHOTS;
2720         flags |= BTREE_ITER_ALL_SNAPSHOTS;
2721
2722         bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
2723                                __bch2_btree_iter_flags(trans, btree_id, flags),
2724                                _RET_IP_);
2725
2726         iter->min_depth = depth;
2727
2728         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2729         BUG_ON(iter->path->level        != depth);
2730         BUG_ON(iter->min_depth          != depth);
2731 }
2732
2733 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2734 {
2735         *dst = *src;
2736         if (src->path)
2737                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2738         if (src->update_path)
2739                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2740         dst->key_cache_path = NULL;
2741 }
2742
2743 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2744 {
2745         unsigned new_top = trans->mem_top + size;
2746         size_t old_bytes = trans->mem_bytes;
2747         size_t new_bytes = roundup_pow_of_two(new_top);
2748         void *new_mem;
2749         void *p;
2750
2751         trans->mem_max = max(trans->mem_max, new_top);
2752
2753         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2754
2755         new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2756         if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2757                 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2758                 new_bytes = BTREE_TRANS_MEM_MAX;
2759                 kfree(trans->mem);
2760         }
2761
2762         if (!new_mem)
2763                 return ERR_PTR(-ENOMEM);
2764
2765         trans->mem = new_mem;
2766         trans->mem_bytes = new_bytes;
2767
2768         if (old_bytes) {
2769                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2770                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2771         }
2772
2773         p = trans->mem + trans->mem_top;
2774         trans->mem_top += size;
2775         memset(p, 0, size);
2776         return p;
2777 }
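
/*
 * Illustrative sketch (not taken from this file): callers normally go through
 * the bch2_trans_kmalloc() wrapper and must check for errors, including the
 * transaction_restart_mem_realloced restart returned above:
 *
 *	struct bkey_i *new_key = bch2_trans_kmalloc(trans, sizeof(*new_key));
 *	int ret = PTR_ERR_OR_ZERO(new_key);
 *	if (ret)
 *		return ret;
 */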
2778
2779 /**
2780  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2781  * @trans: transaction to reset
2782  *
2783  * While iterating over or updating nodes, an attempt to lock a btree node
2784  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2785  * occurs, bch2_trans_begin() should be called and the transaction retried.
2786  */
2787 u32 bch2_trans_begin(struct btree_trans *trans)
2788 {
2789         struct btree_path *path;
2790
2791         bch2_trans_reset_updates(trans);
2792
2793         trans->restart_count++;
2794         trans->mem_top                  = 0;
2795
2796         if (trans->fs_usage_deltas) {
2797                 trans->fs_usage_deltas->used = 0;
2798                 memset((void *) trans->fs_usage_deltas +
2799                        offsetof(struct replicas_delta_list, memset_start), 0,
2800                        (void *) &trans->fs_usage_deltas->memset_end -
2801                        (void *) &trans->fs_usage_deltas->memset_start);
2802         }
2803
2804         trans_for_each_path(trans, path) {
2805                 path->should_be_locked = false;
2806
2807                 /*
2808                  * If the transaction wasn't restarted, we're presumably doing
2809                  * something new: don't keep iterators except the ones that are
2810                  * in use - except for the subvolumes btree:
2811                  */
2812                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2813                         path->preserve = false;
2814
2815                 /*
2816                  * XXX: we probably shouldn't be doing this if the transaction
2817                  * was restarted, but currently we still overflow transaction
2818                  * iterators if we do that
2819                  */
2820                 if (!path->ref && !path->preserve)
2821                         __bch2_path_free(trans, path);
2822                 else
2823                         path->preserve = false;
2824         }
2825
2826         if (!trans->restarted &&
2827             (need_resched() ||
2828              local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2829                 bch2_trans_unlock(trans);
2830                 cond_resched();
2831                 bch2_trans_relock(trans);
2832         }
2833
2834         trans->last_restarted_ip = _RET_IP_;
2835         if (trans->restarted)
2836                 bch2_btree_path_traverse_all(trans);
2837
2838         trans->last_begin_time = local_clock();
2839         return trans->restart_count;
2840 }
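
/*
 * Illustrative sketch (not taken from this file): the typical retry loop
 * around bch2_trans_begin(), where do_work() stands in for the caller's
 * transactional updates:
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_work(trans) ?:
 *		      bch2_trans_commit(trans, NULL, NULL, 0);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */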
2841
2842 void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
2843 {
2844         if (trans_was_restarted(trans, restart_count))
2845                 panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
2846                       trans->restart_count, restart_count,
2847                       (void *) trans->last_restarted_ip);
2848 }
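
/*
 * Illustrative sketch of the retry loop these two helpers are built for (not
 * part of the original file) - the whole sequence is re-run whenever any step
 * returns a transaction_restart error; do_the_work() is a hypothetical
 * stand-in for the caller's btree operations:
 *
 *	u32 restart_count;
 *	int ret;
 *
 *	do {
 *		restart_count = bch2_trans_begin(trans);
 *		ret = do_the_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *
 *	if (!ret)
 *		bch2_trans_verify_not_restarted(trans, restart_count);
 *
 * Helpers such as lockrestart_do() in btree_update.h wrap this pattern for
 * most callers.
 */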
2849
2850 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2851 {
2852         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2853         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2854         void *p = NULL;
2855
2856         BUG_ON(trans->used_mempool);
2857
2858 #ifdef __KERNEL__
2859         p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
2860 #endif
2861         if (!p)
2862                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2863
2864         trans->paths            = p; p += paths_bytes;
2865         trans->updates          = p; p += updates_bytes;
2866 }
2867
2868 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
2869
2870 unsigned bch2_trans_get_fn_idx(const char *fn)
2871 {
2872         unsigned i;
2873
2874         for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
2875                 if (!bch2_btree_transaction_fns[i] ||
2876                     bch2_btree_transaction_fns[i] == fn) {
2877                         bch2_btree_transaction_fns[i] = fn;
2878                         return i;
2879                 }
2880
2881         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2882         return i;
2883 }
2884
2885 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
2886         __acquires(&c->btree_trans_barrier)
2887 {
2888         struct btree_transaction_stats *s;
2889         struct btree_trans *pos;
2890
2891         BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
2892
2893         memset(trans, 0, sizeof(*trans));
2894         trans->c                = c;
2895         trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
2896                 ? bch2_btree_transaction_fns[fn_idx] : NULL;
2897         trans->last_begin_time  = local_clock();
2898         trans->fn_idx           = fn_idx;
2899         trans->locking_wait.task = current;
2900         trans->journal_replay_not_finished =
2901                 !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
2902         closure_init_stack(&trans->ref);
2903
2904         bch2_trans_alloc_paths(trans, c);
2905
2906         s = btree_trans_stats(trans);
2907         if (s && s->max_mem) {
2908                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2909
2910                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2911
2912                 if (unlikely(!trans->mem)) {
2913                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2914                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2915                 } else {
2916                         trans->mem_bytes = expected_mem_bytes;
2917                 }
2918         }
2919         if (s)
2920                 trans->nr_max_paths = s->nr_max_paths;
2921
2922         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2923
2924         mutex_lock(&c->btree_trans_lock);
2925         list_for_each_entry(pos, &c->btree_trans_list, list) {
2926                 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
2927                         list_add_tail(&trans->list, &pos->list);
2928                         goto list_add_done;
2929                 }
2930         }
2931         list_add_tail(&trans->list, &c->btree_trans_list);
2932 list_add_done:
2933         mutex_unlock(&c->btree_trans_lock);
2934 }
2935
2936 static void check_btree_paths_leaked(struct btree_trans *trans)
2937 {
2938 #ifdef CONFIG_BCACHEFS_DEBUG
2939         struct bch_fs *c = trans->c;
2940         struct btree_path *path;
2941
2942         trans_for_each_path(trans, path)
2943                 if (path->ref)
2944                         goto leaked;
2945         return;
2946 leaked:
2947         bch_err(c, "btree paths leaked from %s!", trans->fn);
2948         trans_for_each_path(trans, path)
2949                 if (path->ref)
2950                         printk(KERN_ERR "  btree %s %pS\n",
2951                                bch2_btree_ids[path->btree_id],
2952                                (void *) path->ip_allocated);
2953         /* Be noisy about this: */
2954         bch2_fatal_error(c);
2955 #endif
2956 }
2957
2958 void bch2_trans_exit(struct btree_trans *trans)
2959         __releases(&c->btree_trans_barrier)
2960 {
2961         struct btree_insert_entry *i;
2962         struct bch_fs *c = trans->c;
2963         struct btree_transaction_stats *s = btree_trans_stats(trans);
2964
2965         bch2_trans_unlock(trans);
2966
2967         closure_sync(&trans->ref);
2968
2969         if (s)
2970                 s->max_mem = max(s->max_mem, trans->mem_max);
2971
2972         trans_for_each_update(trans, i)
2973                 __btree_path_put(i->path, true);
2974         trans->nr_updates               = 0;
2975
2976         check_btree_paths_leaked(trans);
2977
2978         mutex_lock(&c->btree_trans_lock);
2979         list_del(&trans->list);
2980         mutex_unlock(&c->btree_trans_lock);
2981
2982         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2983
2984         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
2985
2986         kfree(trans->extra_journal_entries.data);
2987
2988         if (trans->fs_usage_deltas) {
2989                 if (trans->fs_usage_deltas->size + sizeof(*trans->fs_usage_deltas) ==
2990                     REPLICAS_DELTA_LIST_MAX)
2991                         mempool_free(trans->fs_usage_deltas,
2992                                      &c->replicas_delta_pool);
2993                 else
2994                         kfree(trans->fs_usage_deltas);
2995         }
2996
2997         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2998                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
2999         else
3000                 kfree(trans->mem);
3001
3002 #ifdef __KERNEL__
3003         /*
3004          * Userspace doesn't have a real percpu implementation:
3005          */
3006         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
3007 #endif
3008
3009         if (trans->paths)
3010                 mempool_free(trans->paths, &c->btree_paths_pool);
3011
3012         trans->mem      = (void *) 0x1;
3013         trans->paths    = (void *) 0x1;
3014 }
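
/*
 * Illustrative lifecycle sketch (not part of the original file): transactions
 * are stack allocated by callers, set up through the bch2_trans_init()
 * wrapper around __bch2_trans_init() (arguments as typically passed in this
 * tree), used for one or more retryable attempts, then torn down here;
 * do_the_work() is a hypothetical stand-in for the caller's btree operations:
 *
 *	struct btree_trans trans;
 *	int ret;
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 *	ret = lockrestart_do(&trans, do_the_work(&trans));
 *	bch2_trans_exit(&trans);
 *
 * By the time bch2_trans_exit() runs, updates must have been committed or
 * reset and no btree_path may still hold a reference - otherwise
 * check_btree_paths_leaked() above complains.
 */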
3015
3016 static void __maybe_unused
3017 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
3018                                       struct btree_bkey_cached_common *b)
3019 {
3020         struct six_lock_count c = six_lock_counts(&b->lock);
3021         struct task_struct *owner;
3022         pid_t pid;
3023
3024         rcu_read_lock();
3025         owner = READ_ONCE(b->lock.owner);
3026         pid = owner ? owner->pid : 0;
3027         rcu_read_unlock();
3028
3029         prt_tab(out);
3030         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
3031                    b->level, bch2_btree_ids[b->btree_id]);
3032         bch2_bpos_to_text(out, btree_node_pos(b));
3033
3034         prt_tab(out);
3035         prt_printf(out, " locks %u:%u:%u held by pid %u",
3036                    c.n[0], c.n[1], c.n[2], pid);
3037 }
3038
3039 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3040 {
3041         struct btree_path *path;
3042         struct btree_bkey_cached_common *b;
3043         static char lock_types[] = { 'r', 'i', 'w' };
3044         unsigned l;
3045
3046         if (!out->nr_tabstops) {
3047                 printbuf_tabstop_push(out, 16);
3048                 printbuf_tabstop_push(out, 32);
3049         }
3050
3051         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3052
3053         trans_for_each_path(trans, path) {
3054                 if (!path->nodes_locked)
3055                         continue;
3056
3057                 prt_printf(out, "  path %u %c l=%u %s:",
3058                        path->idx,
3059                        path->cached ? 'c' : 'b',
3060                        path->level,
3061                        bch2_btree_ids[path->btree_id]);
3062                 bch2_bpos_to_text(out, path->pos);
3063                 prt_newline(out);
3064
3065                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3066                         if (btree_node_locked(path, l) &&
3067                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3068                                 prt_printf(out, "    %c l=%u ",
3069                                            lock_types[btree_node_locked_type(path, l)], l);
3070                                 bch2_btree_bkey_cached_common_to_text(out, b);
3071                                 prt_newline(out);
3072                         }
3073                 }
3074         }
3075
3076         b = READ_ONCE(trans->locking);
3077         if (b) {
3078                 prt_str(out, "  want");
3079                 prt_newline(out);
3080                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3081                 bch2_btree_bkey_cached_common_to_text(out, b);
3082                 prt_newline(out);
3083         }
3084 }
3085
3086 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3087 {
3088         struct btree_transaction_stats *s;
3089
3090         for (s = c->btree_transaction_stats;
3091              s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3092              s++)
3093                 kfree(s->max_paths_text);
3094
3095         if (c->btree_trans_barrier_initialized)
3096                 cleanup_srcu_struct(&c->btree_trans_barrier);
3097         mempool_exit(&c->btree_trans_mem_pool);
3098         mempool_exit(&c->btree_paths_pool);
3099 }
3100
3101 int bch2_fs_btree_iter_init(struct bch_fs *c)
3102 {
3103         unsigned i, nr = BTREE_ITER_MAX;
3104         int ret;
3105
3106         for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
3107                 mutex_init(&c->btree_transaction_stats[i].lock);
3108
3109         INIT_LIST_HEAD(&c->btree_trans_list);
3110         mutex_init(&c->btree_trans_lock);
3111
3112         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3113                         sizeof(struct btree_path) * nr +
3114                         sizeof(struct btree_insert_entry) * nr) ?:
3115                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3116                                           BTREE_TRANS_MEM_MAX) ?:
3117                 init_srcu_struct(&c->btree_trans_barrier);
3118         if (!ret)
3119                 c->btree_trans_barrier_initialized = true;
3120         return ret;
3121 }