1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "recovery.h"
16 #include "replicas.h"
17 #include "subvolume.h"
18 #include "trace.h"
19
20 #include <linux/random.h>
21 #include <linux/prefetch.h>
22
23 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
24 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
25                                        struct btree_path *);
26
27 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
28 {
29 #ifdef TRACK_PATH_ALLOCATED
30         return iter->ip_allocated;
31 #else
32         return 0;
33 #endif
34 }
35
36 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
37
38 /*
39  * Unlocks before scheduling
40  * Note: does not revalidate iterator
41  */
42 static inline int bch2_trans_cond_resched(struct btree_trans *trans)
43 {
44         if (need_resched() || race_fault())
45                 return drop_locks_do(trans, (schedule(), 0));
46         else
47                 return 0;
48 }
49
50 static inline int __btree_path_cmp(const struct btree_path *l,
51                                    enum btree_id        r_btree_id,
52                                    bool                 r_cached,
53                                    struct bpos          r_pos,
54                                    unsigned             r_level)
55 {
56         /*
57          * Must match lock ordering as defined by __bch2_btree_node_lock:
58          */
59         return   cmp_int(l->btree_id,   r_btree_id) ?:
60                  cmp_int((int) l->cached,       (int) r_cached) ?:
61                  bpos_cmp(l->pos,       r_pos) ?:
62                 -cmp_int(l->level,      r_level);
63 }
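
/*
 * Example of the resulting path ordering for __btree_path_cmp() (note that
 * levels sort in descending order, so for a given position a parent node's
 * path sorts before its child's):
 *
 *   (btree_id=extents, cached=0, pos=POS(1, 0), level=1)
 *   (btree_id=extents, cached=0, pos=POS(1, 0), level=0)
 *   (btree_id=extents, cached=1, pos=POS(1, 0), level=0)
 *   (btree_id=xattrs,  cached=0, pos=POS(1, 0), level=0)
 */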
64
65 static inline int btree_path_cmp(const struct btree_path *l,
66                                  const struct btree_path *r)
67 {
68         return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
69 }
70
71 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
72 {
73         /* Are we iterating over keys in all snapshots? */
74         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
75                 p = bpos_successor(p);
76         } else {
77                 p = bpos_nosnap_successor(p);
78                 p.snapshot = iter->snapshot;
79         }
80
81         return p;
82 }
83
84 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
85 {
86         /* Are we iterating over keys in all snapshots? */
87         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
88                 p = bpos_predecessor(p);
89         } else {
90                 p = bpos_nosnap_predecessor(p);
91                 p.snapshot = iter->snapshot;
92         }
93
94         return p;
95 }
96
97 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
98 {
99         struct bpos pos = iter->pos;
100
101         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
102             !bkey_eq(pos, POS_MAX))
103                 pos = bkey_successor(iter, pos);
104         return pos;
105 }
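
/*
 * btree_iter_search_key(): extent keys are indexed by their end position, so
 * for extent iterators we search at the successor of iter->pos - i.e. for the
 * first key that ends strictly after the iterator position, which is the
 * extent covering pos if one exists.
 */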
106
107 static inline bool btree_path_pos_before_node(struct btree_path *path,
108                                               struct btree *b)
109 {
110         return bpos_lt(path->pos, b->data->min_key);
111 }
112
113 static inline bool btree_path_pos_after_node(struct btree_path *path,
114                                              struct btree *b)
115 {
116         return bpos_gt(path->pos, b->key.k.p);
117 }
118
119 static inline bool btree_path_pos_in_node(struct btree_path *path,
120                                           struct btree *b)
121 {
122         return path->btree_id == b->c.btree_id &&
123                 !btree_path_pos_before_node(path, b) &&
124                 !btree_path_pos_after_node(path, b);
125 }
126
127 /* Btree iterator: */
128
129 #ifdef CONFIG_BCACHEFS_DEBUG
130
131 static void bch2_btree_path_verify_cached(struct btree_trans *trans,
132                                           struct btree_path *path)
133 {
134         struct bkey_cached *ck;
135         bool locked = btree_node_locked(path, 0);
136
137         if (!bch2_btree_node_relock(trans, path, 0))
138                 return;
139
140         ck = (void *) path->l[0].b;
141         BUG_ON(ck->key.btree_id != path->btree_id ||
142                !bkey_eq(ck->key.pos, path->pos));
143
144         if (!locked)
145                 btree_node_unlock(trans, path, 0);
146 }
147
148 static void bch2_btree_path_verify_level(struct btree_trans *trans,
149                                 struct btree_path *path, unsigned level)
150 {
151         struct btree_path_level *l;
152         struct btree_node_iter tmp;
153         bool locked;
154         struct bkey_packed *p, *k;
155         struct printbuf buf1 = PRINTBUF;
156         struct printbuf buf2 = PRINTBUF;
157         struct printbuf buf3 = PRINTBUF;
158         const char *msg;
159
160         if (!bch2_debug_check_iterators)
161                 return;
162
163         l       = &path->l[level];
164         tmp     = l->iter;
165         locked  = btree_node_locked(path, level);
166
167         if (path->cached) {
168                 if (!level)
169                         bch2_btree_path_verify_cached(trans, path);
170                 return;
171         }
172
173         if (!btree_path_node(path, level))
174                 return;
175
176         if (!bch2_btree_node_relock_notrace(trans, path, level))
177                 return;
178
179         BUG_ON(!btree_path_pos_in_node(path, l->b));
180
181         bch2_btree_node_iter_verify(&l->iter, l->b);
182
183         /*
184          * For interior nodes, the iterator will have skipped past deleted keys:
185          */
186         p = level
187                 ? bch2_btree_node_iter_prev(&tmp, l->b)
188                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
189         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
190
191         if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
192                 msg = "before";
193                 goto err;
194         }
195
196         if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
197                 msg = "after";
198                 goto err;
199         }
200
201         if (!locked)
202                 btree_node_unlock(trans, path, level);
203         return;
204 err:
205         bch2_bpos_to_text(&buf1, path->pos);
206
207         if (p) {
208                 struct bkey uk = bkey_unpack_key(l->b, p);
209
210                 bch2_bkey_to_text(&buf2, &uk);
211         } else {
212                 prt_printf(&buf2, "(none)");
213         }
214
215         if (k) {
216                 struct bkey uk = bkey_unpack_key(l->b, k);
217
218                 bch2_bkey_to_text(&buf3, &uk);
219         } else {
220                 prt_printf(&buf3, "(none)");
221         }
222
223         panic("path should be %s key at level %u:\n"
224               "path pos %s\n"
225               "prev key %s\n"
226               "cur  key %s\n",
227               msg, level, buf1.buf, buf2.buf, buf3.buf);
228 }
229
230 static void bch2_btree_path_verify(struct btree_trans *trans,
231                                    struct btree_path *path)
232 {
233         struct bch_fs *c = trans->c;
234         unsigned i;
235
236         EBUG_ON(path->btree_id >= BTREE_ID_NR);
237
238         for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
239                 if (!path->l[i].b) {
240                         BUG_ON(!path->cached &&
241                                bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
242                         break;
243                 }
244
245                 bch2_btree_path_verify_level(trans, path, i);
246         }
247
248         bch2_btree_path_verify_locks(path);
249 }
250
251 void bch2_trans_verify_paths(struct btree_trans *trans)
252 {
253         struct btree_path *path;
254
255         trans_for_each_path(trans, path)
256                 bch2_btree_path_verify(trans, path);
257 }
258
259 static void bch2_btree_iter_verify(struct btree_iter *iter)
260 {
261         struct btree_trans *trans = iter->trans;
262
263         BUG_ON(iter->btree_id >= BTREE_ID_NR);
264
265         BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
266
267         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
268                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
269
270         BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
271                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
272                !btree_type_has_snapshots(iter->btree_id));
273
274         if (iter->update_path)
275                 bch2_btree_path_verify(trans, iter->update_path);
276         bch2_btree_path_verify(trans, iter->path);
277 }
278
279 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
280 {
281         BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
282                !iter->pos.snapshot);
283
284         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
285                iter->pos.snapshot != iter->snapshot);
286
287         BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
288                bkey_gt(iter->pos, iter->k.p));
289 }
290
291 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
292 {
293         struct btree_trans *trans = iter->trans;
294         struct btree_iter copy;
295         struct bkey_s_c prev;
296         int ret = 0;
297
298         if (!bch2_debug_check_iterators)
299                 return 0;
300
301         if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
302                 return 0;
303
304         if (bkey_err(k) || !k.k)
305                 return 0;
306
307         BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
308                                           iter->snapshot,
309                                           k.k->p.snapshot));
310
311         bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
312                              BTREE_ITER_NOPRESERVE|
313                              BTREE_ITER_ALL_SNAPSHOTS);
314         prev = bch2_btree_iter_prev(&copy);
315         if (!prev.k)
316                 goto out;
317
318         ret = bkey_err(prev);
319         if (ret)
320                 goto out;
321
322         if (bkey_eq(prev.k->p, k.k->p) &&
323             bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
324                                       prev.k->p.snapshot) > 0) {
325                 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
326
327                 bch2_bkey_to_text(&buf1, k.k);
328                 bch2_bkey_to_text(&buf2, prev.k);
329
330                 panic("iter snap %u\n"
331                       "k    %s\n"
332                       "prev %s\n",
333                       iter->snapshot,
334                       buf1.buf, buf2.buf);
335         }
336 out:
337         bch2_trans_iter_exit(trans, &copy);
338         return ret;
339 }
340
341 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
342                             struct bpos pos, bool key_cache)
343 {
344         struct btree_path *path;
345         unsigned idx;
346         struct printbuf buf = PRINTBUF;
347
348         btree_trans_sort_paths(trans);
349
350         trans_for_each_path_inorder(trans, path, idx) {
351                 int cmp = cmp_int(path->btree_id, id) ?:
352                         cmp_int(path->cached, key_cache);
353
354                 if (cmp > 0)
355                         break;
356                 if (cmp < 0)
357                         continue;
358
359                 if (!btree_node_locked(path, 0) ||
360                     !path->should_be_locked)
361                         continue;
362
363                 if (!key_cache) {
364                         if (bkey_ge(pos, path->l[0].b->data->min_key) &&
365                             bkey_le(pos, path->l[0].b->key.k.p))
366                                 return;
367                 } else {
368                         if (bkey_eq(pos, path->pos))
369                                 return;
370                 }
371         }
372
373         bch2_dump_trans_paths_updates(trans);
374         bch2_bpos_to_text(&buf, pos);
375
376         panic("not locked: %s %s%s\n",
377               bch2_btree_ids[id], buf.buf,
378               key_cache ? " cached" : "");
379 }
380
381 #else
382
383 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
384                                                 struct btree_path *path, unsigned l) {}
385 static inline void bch2_btree_path_verify(struct btree_trans *trans,
386                                           struct btree_path *path) {}
387 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
388 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
389 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
390
391 #endif
392
393 /* Btree path: fixups after btree updates */
394
395 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
396                                         struct btree *b,
397                                         struct bset_tree *t,
398                                         struct bkey_packed *k)
399 {
400         struct btree_node_iter_set *set;
401
402         btree_node_iter_for_each(iter, set)
403                 if (set->end == t->end_offset) {
404                         set->k = __btree_node_key_to_offset(b, k);
405                         bch2_btree_node_iter_sort(iter, b);
406                         return;
407                 }
408
409         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
410 }
411
412 static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
413                                                struct btree *b,
414                                                struct bkey_packed *where)
415 {
416         struct btree_path_level *l = &path->l[b->c.level];
417
418         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
419                 return;
420
421         if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
422                 bch2_btree_node_iter_advance(&l->iter, l->b);
423 }
424
425 void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
426                                       struct btree *b,
427                                       struct bkey_packed *where)
428 {
429         struct btree_path *path;
430
431         trans_for_each_path_with_node(trans, b, path) {
432                 __bch2_btree_path_fix_key_modified(path, b, where);
433                 bch2_btree_path_verify_level(trans, path, b->c.level);
434         }
435 }
436
437 static void __bch2_btree_node_iter_fix(struct btree_path *path,
438                                        struct btree *b,
439                                        struct btree_node_iter *node_iter,
440                                        struct bset_tree *t,
441                                        struct bkey_packed *where,
442                                        unsigned clobber_u64s,
443                                        unsigned new_u64s)
444 {
445         const struct bkey_packed *end = btree_bkey_last(b, t);
446         struct btree_node_iter_set *set;
447         unsigned offset = __btree_node_key_to_offset(b, where);
448         int shift = new_u64s - clobber_u64s;
449         unsigned old_end = t->end_offset - shift;
450         unsigned orig_iter_pos = node_iter->data[0].k;
451         bool iter_current_key_modified =
452                 orig_iter_pos >= offset &&
453                 orig_iter_pos <= offset + clobber_u64s;
454
455         btree_node_iter_for_each(node_iter, set)
456                 if (set->end == old_end)
457                         goto found;
458
459         /* didn't find the bset in the iterator - might have to re-add it: */
460         if (new_u64s &&
461             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
462                 bch2_btree_node_iter_push(node_iter, b, where, end);
463                 goto fixup_done;
464         } else {
465                 /* Iterator is after key that changed */
466                 return;
467         }
468 found:
469         set->end = t->end_offset;
470
471         /* Iterator hasn't gotten to the key that changed yet: */
472         if (set->k < offset)
473                 return;
474
475         if (new_u64s &&
476             bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
477                 set->k = offset;
478         } else if (set->k < offset + clobber_u64s) {
479                 set->k = offset + new_u64s;
480                 if (set->k == set->end)
481                         bch2_btree_node_iter_set_drop(node_iter, set);
482         } else {
483                 /* Iterator is after key that changed */
484                 set->k = (int) set->k + shift;
485                 return;
486         }
487
488         bch2_btree_node_iter_sort(node_iter, b);
489 fixup_done:
490         if (node_iter->data[0].k != orig_iter_pos)
491                 iter_current_key_modified = true;
492
493         /*
494          * When a new key is added, and the node iterator now points to that
495          * key, the iterator might have skipped past deleted keys that should
496          * come after the key the iterator now points to. We have to rewind to
497          * before those deleted keys - otherwise
498          * bch2_btree_node_iter_prev_all() breaks:
499          */
500         if (!bch2_btree_node_iter_end(node_iter) &&
501             iter_current_key_modified &&
502             b->c.level) {
503                 struct bset_tree *t;
504                 struct bkey_packed *k, *k2, *p;
505
506                 k = bch2_btree_node_iter_peek_all(node_iter, b);
507
508                 for_each_bset(b, t) {
509                         bool set_pos = false;
510
511                         if (node_iter->data[0].end == t->end_offset)
512                                 continue;
513
514                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
515
516                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
517                                bkey_iter_cmp(b, k, p) < 0) {
518                                 k2 = p;
519                                 set_pos = true;
520                         }
521
522                         if (set_pos)
523                                 btree_node_iter_set_set_pos(node_iter,
524                                                             b, t, k2);
525                 }
526         }
527 }
528
529 void bch2_btree_node_iter_fix(struct btree_trans *trans,
530                               struct btree_path *path,
531                               struct btree *b,
532                               struct btree_node_iter *node_iter,
533                               struct bkey_packed *where,
534                               unsigned clobber_u64s,
535                               unsigned new_u64s)
536 {
537         struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
538         struct btree_path *linked;
539
540         if (node_iter != &path->l[b->c.level].iter) {
541                 __bch2_btree_node_iter_fix(path, b, node_iter, t,
542                                            where, clobber_u64s, new_u64s);
543
544                 if (bch2_debug_check_iterators)
545                         bch2_btree_node_iter_verify(node_iter, b);
546         }
547
548         trans_for_each_path_with_node(trans, b, linked) {
549                 __bch2_btree_node_iter_fix(linked, b,
550                                            &linked->l[b->c.level].iter, t,
551                                            where, clobber_u64s, new_u64s);
552                 bch2_btree_path_verify_level(trans, linked, b->c.level);
553         }
554 }
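
/*
 * bch2_btree_node_iter_fix() is called after @clobber_u64s worth of key space
 * at @where in @b has been replaced with @new_u64s: every btree_path with a
 * node iterator pointing into @b has its iterator offsets fixed up so they
 * still reference the same keys.
 */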
555
556 /* Btree path level: pointer to a particular btree node and node iter */
557
558 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
559                                                   struct btree_path_level *l,
560                                                   struct bkey *u,
561                                                   struct bkey_packed *k)
562 {
563         if (unlikely(!k)) {
564                 /*
565                  * signal to bch2_btree_iter_peek_slot() that we're currently at
566                  * a hole
567                  */
568                 u->type = KEY_TYPE_deleted;
569                 return bkey_s_c_null;
570         }
571
572         return bkey_disassemble(l->b, k, u);
573 }
574
575 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
576                                                         struct btree_path_level *l,
577                                                         struct bkey *u)
578 {
579         return __btree_iter_unpack(c, l, u,
580                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
581 }
582
583 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
584                                                     struct btree_path *path,
585                                                     struct btree_path_level *l,
586                                                     struct bkey *u)
587 {
588         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
589                         bch2_btree_node_iter_peek(&l->iter, l->b));
590
591         path->pos = k.k ? k.k->p : l->b->key.k.p;
592         trans->paths_sorted = false;
593         bch2_btree_path_verify_level(trans, path, l - path->l);
594         return k;
595 }
596
597 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
598                                                     struct btree_path *path,
599                                                     struct btree_path_level *l,
600                                                     struct bkey *u)
601 {
602         struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
603                         bch2_btree_node_iter_prev(&l->iter, l->b));
604
605         path->pos = k.k ? k.k->p : l->b->data->min_key;
606         trans->paths_sorted = false;
607         bch2_btree_path_verify_level(trans, path, l - path->l);
608         return k;
609 }
610
611 static inline bool btree_path_advance_to_pos(struct btree_path *path,
612                                              struct btree_path_level *l,
613                                              int max_advance)
614 {
615         struct bkey_packed *k;
616         int nr_advanced = 0;
617
618         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
619                bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
620                 if (max_advance > 0 && nr_advanced >= max_advance)
621                         return false;
622
623                 bch2_btree_node_iter_advance(&l->iter, l->b);
624                 nr_advanced++;
625         }
626
627         return true;
628 }
629
630 static inline void __btree_path_level_init(struct btree_path *path,
631                                            unsigned level)
632 {
633         struct btree_path_level *l = &path->l[level];
634
635         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
636
637         /*
638          * Iterators to interior nodes should always be pointed at the first non
639          * whiteout:
640          */
641         if (level)
642                 bch2_btree_node_iter_peek(&l->iter, l->b);
643 }
644
645 void bch2_btree_path_level_init(struct btree_trans *trans,
646                                 struct btree_path *path,
647                                 struct btree *b)
648 {
649         BUG_ON(path->cached);
650
651         EBUG_ON(!btree_path_pos_in_node(path, b));
652
653         path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
654         path->l[b->c.level].b = b;
655         __btree_path_level_init(path, b->c.level);
656 }
657
658 /* Btree path: fixups after btree node updates: */
659
660 static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
661 {
662         struct bch_fs *c = trans->c;
663         struct btree_insert_entry *i;
664
665         trans_for_each_update(trans, i)
666                 if (!i->cached &&
667                     i->level    == b->c.level &&
668                     i->btree_id == b->c.btree_id &&
669                     bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
670                     bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
671                         i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
672
673                         if (unlikely(trans->journal_replay_not_finished)) {
674                                 struct bkey_i *j_k =
675                                         bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
676                                                                     i->k->k.p);
677
678                                 if (j_k) {
679                                         i->old_k = j_k->k;
680                                         i->old_v = &j_k->v;
681                                 }
682                         }
683                 }
684 }
685
686 /*
687  * A btree node is being replaced - update the iterator to point to the new
688  * node:
689  */
690 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
691 {
692         struct btree_path *path;
693
694         trans_for_each_path(trans, path)
695                 if (path->uptodate == BTREE_ITER_UPTODATE &&
696                     !path->cached &&
697                     btree_path_pos_in_node(path, b)) {
698                         enum btree_node_locked_type t =
699                                 btree_lock_want(path, b->c.level);
700
701                         if (t != BTREE_NODE_UNLOCKED) {
702                                 btree_node_unlock(trans, path, b->c.level);
703                                 six_lock_increment(&b->c.lock, (enum six_lock_type) t);
704                                 mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
705                         }
706
707                         bch2_btree_path_level_init(trans, path, b);
708                 }
709
710         bch2_trans_revalidate_updates_in_node(trans, b);
711 }
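
/*
 * Note the lock handoff in bch2_trans_node_add(): if a path held the old node
 * at this level, its lock is dropped and six_lock_increment() takes an
 * additional reference on the new node's lock (which the caller is expected
 * to already hold), so the path ends up locked on the new node without
 * blocking.
 */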
712
713 /*
714  * A btree node has been modified in such a way as to invalidate iterators - fix
715  * them:
716  */
717 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
718 {
719         struct btree_path *path;
720
721         trans_for_each_path_with_node(trans, b, path)
722                 __btree_path_level_init(path, b->c.level);
723
724         bch2_trans_revalidate_updates_in_node(trans, b);
725 }
726
727 /* Btree path: traverse, set_pos: */
728
729 static inline int btree_path_lock_root(struct btree_trans *trans,
730                                        struct btree_path *path,
731                                        unsigned depth_want,
732                                        unsigned long trace_ip)
733 {
734         struct bch_fs *c = trans->c;
735         struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
736         enum six_lock_type lock_type;
737         unsigned i;
738         int ret;
739
740         EBUG_ON(path->nodes_locked);
741
742         while (1) {
743                 b = READ_ONCE(*rootp);
744                 path->level = READ_ONCE(b->c.level);
745
746                 if (unlikely(path->level < depth_want)) {
747                         /*
748                          * the root is at a lower depth than the depth we want:
749                          * got to the end of the btree, or we're walking nodes
750                          * greater than some depth and there are no nodes >=
751                          * that depth
752                          */
753                         path->level = depth_want;
754                         for (i = path->level; i < BTREE_MAX_DEPTH; i++)
755                                 path->l[i].b = NULL;
756                         return 1;
757                 }
758
759                 lock_type = __btree_lock_want(path, path->level);
760                 ret = btree_node_lock(trans, path, &b->c,
761                                       path->level, lock_type, trace_ip);
762                 if (unlikely(ret)) {
763                         if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
764                                 continue;
765                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
766                                 return ret;
767                         BUG();
768                 }
769
770                 if (likely(b == READ_ONCE(*rootp) &&
771                            b->c.level == path->level &&
772                            !race_fault())) {
773                         for (i = 0; i < path->level; i++)
774                                 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
775                         path->l[path->level].b = b;
776                         for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
777                                 path->l[i].b = NULL;
778
779                         mark_btree_node_locked(trans, path, path->level, lock_type);
780                         bch2_btree_path_level_init(trans, path, b);
781                         return 0;
782                 }
783
784                 six_unlock_type(&b->c.lock, lock_type);
785         }
786 }
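
/*
 * btree_path_lock_root() has to loop: the root pointer is read without any
 * locks held, so by the time we've taken the lock the root may have been
 * replaced (e.g. by a split adding a new root). The b == READ_ONCE(*rootp)
 * and level checks catch that, in which case we drop the lock and retry.
 */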
787
788 noinline
789 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
790 {
791         struct bch_fs *c = trans->c;
792         struct btree_path_level *l = path_l(path);
793         struct btree_node_iter node_iter = l->iter;
794         struct bkey_packed *k;
795         struct bkey_buf tmp;
796         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
797                 ? (path->level > 1 ? 0 :  2)
798                 : (path->level > 1 ? 1 : 16);
799         bool was_locked = btree_node_locked(path, path->level);
800         int ret = 0;
801
802         bch2_bkey_buf_init(&tmp);
803
804         while (nr-- && !ret) {
805                 if (!bch2_btree_node_relock(trans, path, path->level))
806                         break;
807
808                 bch2_btree_node_iter_advance(&node_iter, l->b);
809                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
810                 if (!k)
811                         break;
812
813                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
814                 ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
815                                                path->level - 1);
816         }
817
818         if (!was_locked)
819                 btree_node_unlock(trans, path, path->level);
820
821         bch2_bkey_buf_exit(&tmp, c);
822         return ret;
823 }
824
825 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
826                                  struct btree_and_journal_iter *jiter)
827 {
828         struct bch_fs *c = trans->c;
829         struct bkey_s_c k;
830         struct bkey_buf tmp;
831         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
832                 ? (path->level > 1 ? 0 :  2)
833                 : (path->level > 1 ? 1 : 16);
834         bool was_locked = btree_node_locked(path, path->level);
835         int ret = 0;
836
837         bch2_bkey_buf_init(&tmp);
838
839         while (nr-- && !ret) {
840                 if (!bch2_btree_node_relock(trans, path, path->level))
841                         break;
842
843                 bch2_btree_and_journal_iter_advance(jiter);
844                 k = bch2_btree_and_journal_iter_peek(jiter);
845                 if (!k.k)
846                         break;
847
848                 bch2_bkey_buf_reassemble(&tmp, c, k);
849                 ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
850                                                path->level - 1);
851         }
852
853         if (!was_locked)
854                 btree_node_unlock(trans, path, path->level);
855
856         bch2_bkey_buf_exit(&tmp, c);
857         return ret;
858 }
859
860 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
861                                             struct btree_path *path,
862                                             unsigned plevel, struct btree *b)
863 {
864         struct btree_path_level *l = &path->l[plevel];
865         bool locked = btree_node_locked(path, plevel);
866         struct bkey_packed *k;
867         struct bch_btree_ptr_v2 *bp;
868
869         if (!bch2_btree_node_relock(trans, path, plevel))
870                 return;
871
872         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
873         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
874
875         bp = (void *) bkeyp_val(&l->b->format, k);
876         bp->mem_ptr = (unsigned long)b;
877
878         if (!locked)
879                 btree_node_unlock(trans, path, plevel);
880 }
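
/*
 * btree_ptr_v2 keys cache a pointer to the in-memory node in their mem_ptr
 * field; btree_node_mem_ptr_set() refreshes the parent's cached pointer when
 * btree_path_down() finds it stale, so later lookups of this node can use it
 * as a fast path.
 */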
881
882 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
883                                                      struct btree_path *path,
884                                                      unsigned flags,
885                                                      struct bkey_buf *out)
886 {
887         struct bch_fs *c = trans->c;
888         struct btree_path_level *l = path_l(path);
889         struct btree_and_journal_iter jiter;
890         struct bkey_s_c k;
891         int ret = 0;
892
893         __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
894
895         k = bch2_btree_and_journal_iter_peek(&jiter);
896
897         bch2_bkey_buf_reassemble(out, c, k);
898
899         if (flags & BTREE_ITER_PREFETCH)
900                 ret = btree_path_prefetch_j(trans, path, &jiter);
901
902         bch2_btree_and_journal_iter_exit(&jiter);
903         return ret;
904 }
905
906 static __always_inline int btree_path_down(struct btree_trans *trans,
907                                            struct btree_path *path,
908                                            unsigned flags,
909                                            unsigned long trace_ip)
910 {
911         struct bch_fs *c = trans->c;
912         struct btree_path_level *l = path_l(path);
913         struct btree *b;
914         unsigned level = path->level - 1;
915         enum six_lock_type lock_type = __btree_lock_want(path, level);
916         struct bkey_buf tmp;
917         int ret;
918
919         EBUG_ON(!btree_node_locked(path, path->level));
920
921         bch2_bkey_buf_init(&tmp);
922
923         if (unlikely(trans->journal_replay_not_finished)) {
924                 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
925                 if (ret)
926                         goto err;
927         } else {
928                 bch2_bkey_buf_unpack(&tmp, c, l->b,
929                                  bch2_btree_node_iter_peek(&l->iter, l->b));
930
931                 if (flags & BTREE_ITER_PREFETCH) {
932                         ret = btree_path_prefetch(trans, path);
933                         if (ret)
934                                 goto err;
935                 }
936         }
937
938         b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
939         ret = PTR_ERR_OR_ZERO(b);
940         if (unlikely(ret))
941                 goto err;
942
943         if (likely(!trans->journal_replay_not_finished &&
944                    tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
945             unlikely(b != btree_node_mem_ptr(tmp.k)))
946                 btree_node_mem_ptr_set(trans, path, level + 1, b);
947
948         if (btree_node_read_locked(path, level + 1))
949                 btree_node_unlock(trans, path, level + 1);
950
951         mark_btree_node_locked(trans, path, level, lock_type);
952         path->level = level;
953         bch2_btree_path_level_init(trans, path, b);
954
955         bch2_btree_path_verify_locks(path);
956 err:
957         bch2_bkey_buf_exit(&tmp, c);
958         return ret;
959 }
960
961
962 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
963 {
964         struct bch_fs *c = trans->c;
965         struct btree_path *path;
966         unsigned long trace_ip = _RET_IP_;
967         int i, ret = 0;
968
969         if (trans->in_traverse_all)
970                 return -BCH_ERR_transaction_restart_in_traverse_all;
971
972         trans->in_traverse_all = true;
973 retry_all:
974         trans->restarted = 0;
975         trans->last_restarted_ip = 0;
976
977         trans_for_each_path(trans, path)
978                 path->should_be_locked = false;
979
980         btree_trans_sort_paths(trans);
981
982         bch2_trans_unlock(trans);
983         cond_resched();
984
985         if (unlikely(trans->memory_allocation_failure)) {
986                 struct closure cl;
987
988                 closure_init_stack(&cl);
989
990                 do {
991                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
992                         closure_sync(&cl);
993                 } while (ret);
994         }
995
996         /* Now, redo traversals in correct order: */
997         i = 0;
998         while (i < trans->nr_sorted) {
999                 path = trans->paths + trans->sorted[i];
1000
1001                 /*
1002                  * Traversing a path can cause another path to be added at about
1003                  * the same position:
1004                  */
1005                 if (path->uptodate) {
1006                         __btree_path_get(path, false);
1007                         ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
1008                         __btree_path_put(path, false);
1009
1010                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1011                             bch2_err_matches(ret, ENOMEM))
1012                                 goto retry_all;
1013                         if (ret)
1014                                 goto err;
1015                 } else {
1016                         i++;
1017                 }
1018         }
1019
1020         /*
1021          * We used to assert that all paths had been traversed here
1022          * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
1023          * path->should_be_locked is not set yet, we might have unlocked and
1024          * then failed to relock a path - that's fine.
1025          */
1026 err:
1027         bch2_btree_cache_cannibalize_unlock(c);
1028
1029         trans->in_traverse_all = false;
1030
1031         trace_and_count(c, trans_traverse_all, trans, trace_ip);
1032         return ret;
1033 }
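
/*
 * bch2_btree_path_traverse_all() is the slow path: it drops all locks, sorts
 * the paths into lock order, and then retraverses each one in that order -
 * taking locks in a consistent global order is what avoids deadlocks when
 * multiple paths have to be relocked at once.
 */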
1034
1035 static inline bool btree_path_check_pos_in_node(struct btree_path *path,
1036                                                 unsigned l, int check_pos)
1037 {
1038         if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
1039                 return false;
1040         if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
1041                 return false;
1042         return true;
1043 }
1044
1045 static inline bool btree_path_good_node(struct btree_trans *trans,
1046                                         struct btree_path *path,
1047                                         unsigned l, int check_pos)
1048 {
1049         return is_btree_node(path, l) &&
1050                 bch2_btree_node_relock(trans, path, l) &&
1051                 btree_path_check_pos_in_node(path, l, check_pos);
1052 }
1053
1054 static void btree_path_set_level_down(struct btree_trans *trans,
1055                                       struct btree_path *path,
1056                                       unsigned new_level)
1057 {
1058         unsigned l;
1059
1060         path->level = new_level;
1061
1062         for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
1063                 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
1064                         btree_node_unlock(trans, path, l);
1065
1066         btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1067         bch2_btree_path_verify(trans, path);
1068 }
1069
1070 static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
1071                                                          struct btree_path *path,
1072                                                          int check_pos)
1073 {
1074         unsigned i, l = path->level;
1075 again:
1076         while (btree_path_node(path, l) &&
1077                !btree_path_good_node(trans, path, l, check_pos))
1078                 __btree_path_set_level_up(trans, path, l++);
1079
1080         /* If we need intent locks, take them too: */
1081         for (i = l + 1;
1082              i < path->locks_want && btree_path_node(path, i);
1083              i++)
1084                 if (!bch2_btree_node_relock(trans, path, i)) {
1085                         while (l <= i)
1086                                 __btree_path_set_level_up(trans, path, l++);
1087                         goto again;
1088                 }
1089
1090         return l;
1091 }
1092
1093 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
1094                                                      struct btree_path *path,
1095                                                      int check_pos)
1096 {
1097         return likely(btree_node_locked(path, path->level) &&
1098                       btree_path_check_pos_in_node(path, path->level, check_pos))
1099                 ? path->level
1100                 : __btree_path_up_until_good_node(trans, path, check_pos);
1101 }
1102
1103 /*
1104  * This is the main state machine for walking down the btree - walks down to a
1105  * specified depth
1106  *
1107  * Returns 0 on success, -EIO on error (error reading in a btree node).
1108  *
1109  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1110  * stashed in the iterator and returned from bch2_trans_exit().
1111  */
1112 int bch2_btree_path_traverse_one(struct btree_trans *trans,
1113                                  struct btree_path *path,
1114                                  unsigned flags,
1115                                  unsigned long trace_ip)
1116 {
1117         unsigned depth_want = path->level;
1118         int ret = -((int) trans->restarted);
1119
1120         if (unlikely(ret))
1121                 goto out;
1122
1123         /*
1124          * Ensure we obey path->should_be_locked: if it's set, we can't unlock
1125          * and re-traverse the path without a transaction restart:
1126          */
1127         if (path->should_be_locked) {
1128                 ret = bch2_btree_path_relock(trans, path, trace_ip);
1129                 goto out;
1130         }
1131
1132         if (path->cached) {
1133                 ret = bch2_btree_path_traverse_cached(trans, path, flags);
1134                 goto out;
1135         }
1136
1137         if (unlikely(path->level >= BTREE_MAX_DEPTH))
1138                 goto out;
1139
1140         path->level = btree_path_up_until_good_node(trans, path, 0);
1141
1142         EBUG_ON(btree_path_node(path, path->level) &&
1143                 !btree_node_locked(path, path->level));
1144
1145         /*
1146          * Note: path->nodes[path->level] may be temporarily NULL here - that
1147          * would indicate to other code that we got to the end of the btree,
1148          * here it indicates that relocking the root failed - it's critical that
1149          * btree_path_lock_root() comes next and that it can't fail
1150          */
1151         while (path->level > depth_want) {
1152                 ret = btree_path_node(path, path->level)
1153                         ? btree_path_down(trans, path, flags, trace_ip)
1154                         : btree_path_lock_root(trans, path, depth_want, trace_ip);
1155                 if (unlikely(ret)) {
1156                         if (ret == 1) {
1157                                 /*
1158                                  * No nodes at this level - got to the end of
1159                                  * the btree:
1160                                  */
1161                                 ret = 0;
1162                                 goto out;
1163                         }
1164
1165                         __bch2_btree_path_unlock(trans, path);
1166                         path->level = depth_want;
1167                         path->l[path->level].b = ERR_PTR(ret);
1168                         goto out;
1169                 }
1170         }
1171
1172         path->uptodate = BTREE_ITER_UPTODATE;
1173 out:
1174         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
1175                 panic("ret %s (%i) trans->restarted %s (%i)\n",
1176                       bch2_err_str(ret), ret,
1177                       bch2_err_str(trans->restarted), trans->restarted);
1178         bch2_btree_path_verify(trans, path);
1179         return ret;
1180 }
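
/*
 * The descent in bch2_btree_path_traverse_one() has two phases:
 * btree_path_up_until_good_node() first walks up past any levels whose nodes
 * can't be relocked or no longer cover path->pos, then the loop above walks
 * back down - locking the root if we went all the way up, otherwise locking
 * child nodes via btree_path_down() - until path->level == depth_want.
 */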
1181
1182 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
1183                             struct btree_path *src)
1184 {
1185         unsigned i, offset = offsetof(struct btree_path, pos);
1186
1187         memcpy((void *) dst + offset,
1188                (void *) src + offset,
1189                sizeof(struct btree_path) - offset);
1190
1191         for (i = 0; i < BTREE_MAX_DEPTH; i++) {
1192                 unsigned t = btree_node_locked_type(dst, i);
1193
1194                 if (t != BTREE_NODE_UNLOCKED)
1195                         six_lock_increment(&dst->l[i].b->c.lock, t);
1196         }
1197 }
1198
1199 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
1200                                            bool intent)
1201 {
1202         struct btree_path *new = btree_path_alloc(trans, src);
1203
1204         btree_path_copy(trans, new, src);
1205         __btree_path_get(new, intent);
1206         return new;
1207 }
1208
1209 __flatten
1210 struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
1211                          struct btree_path *path, bool intent,
1212                          unsigned long ip)
1213 {
1214         __btree_path_put(path, intent);
1215         path = btree_path_clone(trans, path, intent);
1216         path->preserve = false;
1217         return path;
1218 }
1219
1220 struct btree_path * __must_check
1221 __bch2_btree_path_set_pos(struct btree_trans *trans,
1222                    struct btree_path *path, struct bpos new_pos,
1223                    bool intent, unsigned long ip, int cmp)
1224 {
1225         unsigned level = path->level;
1226
1227         bch2_trans_verify_not_in_restart(trans);
1228         EBUG_ON(!path->ref);
1229
1230         path = bch2_btree_path_make_mut(trans, path, intent, ip);
1231
1232         path->pos               = new_pos;
1233         trans->paths_sorted     = false;
1234
1235         if (unlikely(path->cached)) {
1236                 btree_node_unlock(trans, path, 0);
1237                 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
1238                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1239                 goto out;
1240         }
1241
1242         level = btree_path_up_until_good_node(trans, path, cmp);
1243
1244         if (btree_path_node(path, level)) {
1245                 struct btree_path_level *l = &path->l[level];
1246
1247                 BUG_ON(!btree_node_locked(path, level));
1248                 /*
1249                  * We might have to skip over many keys, or just a few: try
1250                  * advancing the node iterator, and if we have to skip over too
1251                  * many keys just reinit it (or if we're rewinding, since that
1252                  * is expensive).
1253                  */
1254                 if (cmp < 0 ||
1255                     !btree_path_advance_to_pos(path, l, 8))
1256                         bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
1257
1258                 /*
1259                  * Iterators to interior nodes should always be pointed at the first non
1260                  * whiteout:
1261                  */
1262                 if (unlikely(level))
1263                         bch2_btree_node_iter_peek(&l->iter, l->b);
1264         }
1265
1266         if (unlikely(level != path->level)) {
1267                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1268                 __bch2_btree_path_unlock(trans, path);
1269         }
1270 out:
1271         bch2_btree_path_verify(trans, path);
1272         return path;
1273 }
1274
1275 /* Btree path: main interface: */
1276
1277 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
1278 {
1279         struct btree_path *sib;
1280
1281         sib = prev_btree_path(trans, path);
1282         if (sib && !btree_path_cmp(sib, path))
1283                 return sib;
1284
1285         sib = next_btree_path(trans, path);
1286         if (sib && !btree_path_cmp(sib, path))
1287                 return sib;
1288
1289         return NULL;
1290 }
1291
1292 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
1293 {
1294         struct btree_path *sib;
1295
1296         sib = prev_btree_path(trans, path);
1297         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1298                 return sib;
1299
1300         sib = next_btree_path(trans, path);
1301         if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
1302                 return sib;
1303
1304         return NULL;
1305 }
1306
1307 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
1308 {
1309         __bch2_btree_path_unlock(trans, path);
1310         btree_path_list_remove(trans, path);
1311         trans->paths_allocated &= ~(1ULL << path->idx);
1312 }
1313
1314 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
1315 {
1316         struct btree_path *dup;
1317
1318         EBUG_ON(trans->paths + path->idx != path);
1319         EBUG_ON(!path->ref);
1320
1321         if (!__btree_path_put(path, intent))
1322                 return;
1323
1324         dup = path->preserve
1325                 ? have_path_at_pos(trans, path)
1326                 : have_node_at_pos(trans, path);
1327
1328         if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
1329                 return;
1330
1331         if (path->should_be_locked &&
1332             !trans->restarted &&
1333             (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
1334                 return;
1335
1336         if (dup) {
1337                 dup->preserve           |= path->preserve;
1338                 dup->should_be_locked   |= path->should_be_locked;
1339         }
1340
1341         __bch2_path_free(trans, path);
1342 }
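
/*
 * bch2_path_put() tries not to lose position: when the last reference is
 * dropped, a path marked preserve (or still pointing at a valid btree node)
 * is only freed if a duplicate path exists to take its place - otherwise it
 * is kept so that a later lookup at the same position doesn't have to
 * retraverse from the root. When it is freed, its preserve and
 * should_be_locked flags are handed off to the duplicate.
 */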
1343
1344 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
1345                                  bool intent)
1346 {
1347         EBUG_ON(trans->paths + path->idx != path);
1348         EBUG_ON(!path->ref);
1349
1350         if (!__btree_path_put(path, intent))
1351                 return;
1352
1353         __bch2_path_free(trans, path);
1354 }
1355
1356 void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
1357 {
1358         panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
1359               trans->restart_count, restart_count,
1360               (void *) trans->last_begin_ip);
1361 }
1362
1363 void bch2_trans_in_restart_error(struct btree_trans *trans)
1364 {
1365         panic("in transaction restart: %s, last restarted by %pS\n",
1366               bch2_err_str(trans->restarted),
1367               (void *) trans->last_restarted_ip);
1368 }
1369
1370 noinline __cold
1371 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
1372 {
1373         struct btree_insert_entry *i;
1374         struct btree_write_buffered_key *wb;
1375
1376         prt_printf(buf, "transaction updates for %s journal seq %llu",
1377                trans->fn, trans->journal_res.seq);
1378         prt_newline(buf);
1379         printbuf_indent_add(buf, 2);
1380
1381         trans_for_each_update(trans, i) {
1382                 struct bkey_s_c old = { &i->old_k, i->old_v };
1383
1384                 prt_printf(buf, "update: btree=%s cached=%u %pS",
1385                        bch2_btree_ids[i->btree_id],
1386                        i->cached,
1387                        (void *) i->ip_allocated);
1388                 prt_newline(buf);
1389
1390                 prt_printf(buf, "  old ");
1391                 bch2_bkey_val_to_text(buf, trans->c, old);
1392                 prt_newline(buf);
1393
1394                 prt_printf(buf, "  new ");
1395                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
1396                 prt_newline(buf);
1397         }
1398
1399         trans_for_each_wb_update(trans, wb) {
1400                 prt_printf(buf, "update: btree=%s wb=1",
1401                        bch2_btree_ids[wb->btree]);
1403                 prt_newline(buf);
1404
1405                 prt_printf(buf, "  new ");
1406                 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
1407                 prt_newline(buf);
1408         }
1409
1410         printbuf_indent_sub(buf, 2);
1411 }
1412
1413 noinline __cold
1414 void bch2_dump_trans_updates(struct btree_trans *trans)
1415 {
1416         struct printbuf buf = PRINTBUF;
1417
1418         bch2_trans_updates_to_text(&buf, trans);
1419         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1420         printbuf_exit(&buf);
1421 }
1422
1423 noinline __cold
1424 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
1425 {
1426         prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
1427                    path->idx, path->ref, path->intent_ref,
1428                    path->preserve ? 'P' : ' ',
1429                    path->should_be_locked ? 'S' : ' ',
1430                    bch2_btree_ids[path->btree_id],
1431                    path->level);
1432         bch2_bpos_to_text(out, path->pos);
1433
1434         prt_printf(out, " locks %u", path->nodes_locked);
1435 #ifdef TRACK_PATH_ALLOCATED
1436         prt_printf(out, " %pS", (void *) path->ip_allocated);
1437 #endif
1438         prt_newline(out);
1439 }
1440
1441 static noinline __cold
1442 void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
1443                                 bool nosort)
1444 {
1445         struct btree_path *path;
1446         unsigned idx;
1447
1448         if (!nosort)
1449                 btree_trans_sort_paths(trans);
1450
1451         trans_for_each_path_inorder(trans, path, idx)
1452                 bch2_btree_path_to_text(out, path);
1453 }
1454
1455 noinline __cold
1456 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
1457 {
1458         __bch2_trans_paths_to_text(out, trans, false);
1459 }
1460
1461 static noinline __cold
1462 void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
1463 {
1464         struct printbuf buf = PRINTBUF;
1465
1466         __bch2_trans_paths_to_text(&buf, trans, nosort);
1467         bch2_trans_updates_to_text(&buf, trans);
1468
1469         bch2_print_string_as_lines(KERN_ERR, buf.buf);
1470         printbuf_exit(&buf);
1471 }
1472
1473 noinline __cold
1474 void bch2_dump_trans_paths_updates(struct btree_trans *trans)
1475 {
1476         __bch2_dump_trans_paths_updates(trans, false);
1477 }
1478
1479 noinline __cold
1480 static void bch2_trans_update_max_paths(struct btree_trans *trans)
1481 {
1482         struct btree_transaction_stats *s = btree_trans_stats(trans);
1483         struct printbuf buf = PRINTBUF;
1484
1485         if (!s)
1486                 return;
1487
1488         bch2_trans_paths_to_text(&buf, trans);
1489
1490         if (!buf.allocation_failure) {
1491                 mutex_lock(&s->lock);
1492                 if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
1493                         s->nr_max_paths = trans->nr_max_paths =
1494                                 hweight64(trans->paths_allocated);
1495                         swap(s->max_paths_text, buf.buf);
1496                 }
1497                 mutex_unlock(&s->lock);
1498         }
1499
1500         printbuf_exit(&buf);
1501
1502         trans->nr_max_paths = hweight64(trans->paths_allocated);
1503 }
1504
1505 static noinline void btree_path_overflow(struct btree_trans *trans)
1506 {
1507         bch2_dump_trans_paths_updates(trans);
1508         panic("trans path overflow\n");
1509 }
1510
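/*
 * Allocate a new btree_path: find a free slot in the transaction's
 * paths_allocated bitmap, initialize the refcounts and lock state, and link
 * the new path into the sorted path list adjacent to @pos:
 */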
1511 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
1512                                                   struct btree_path *pos)
1513 {
1514         struct btree_path *path;
1515         unsigned idx;
1516
1517         if (unlikely(trans->paths_allocated ==
1518                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1519                 btree_path_overflow(trans);
1520
1521         idx = __ffs64(~trans->paths_allocated);
1522
1523         /*
1524          * Do this before marking the new path as allocated, since it won't be
1525          * initialized yet:
1526          */
1527         if (unlikely(idx > trans->nr_max_paths))
1528                 bch2_trans_update_max_paths(trans);
1529
1530         trans->paths_allocated |= 1ULL << idx;
1531
1532         path = &trans->paths[idx];
1533         path->idx               = idx;
1534         path->ref               = 0;
1535         path->intent_ref        = 0;
1536         path->nodes_locked      = 0;
1537
1538         btree_path_list_add(trans, pos, path);
1539         trans->paths_sorted = false;
1540         return path;
1541 }
1542
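/*
 * Get a path pointing to @pos in @btree_id at @level: reuse an existing path
 * with the same btree/level/cached-ness if one exists (taking a new
 * reference), otherwise allocate a fresh path that will be traversed on first
 * use.
 */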
1543 struct btree_path *bch2_path_get(struct btree_trans *trans,
1544                                  enum btree_id btree_id, struct bpos pos,
1545                                  unsigned locks_want, unsigned level,
1546                                  unsigned flags, unsigned long ip)
1547 {
1548         struct btree_path *path, *path_pos = NULL;
1549         bool cached = flags & BTREE_ITER_CACHED;
1550         bool intent = flags & BTREE_ITER_INTENT;
1551         int i;
1552
1553         bch2_trans_verify_not_in_restart(trans);
1554         bch2_trans_verify_locks(trans);
1555
1556         btree_trans_sort_paths(trans);
1557
1558         trans_for_each_path_inorder(trans, path, i) {
1559                 if (__btree_path_cmp(path,
1560                                      btree_id,
1561                                      cached,
1562                                      pos,
1563                                      level) > 0)
1564                         break;
1565
1566                 path_pos = path;
1567         }
1568
1569         if (path_pos &&
1570             path_pos->cached    == cached &&
1571             path_pos->btree_id  == btree_id &&
1572             path_pos->level     == level) {
1573                 __btree_path_get(path_pos, intent);
1574                 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
1575         } else {
1576                 path = btree_path_alloc(trans, path_pos);
1577                 path_pos = NULL;
1578
1579                 __btree_path_get(path, intent);
1580                 path->pos                       = pos;
1581                 path->btree_id                  = btree_id;
1582                 path->cached                    = cached;
1583                 path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1584                 path->should_be_locked          = false;
1585                 path->level                     = level;
1586                 path->locks_want                = locks_want;
1587                 path->nodes_locked              = 0;
1588                 for (i = 0; i < ARRAY_SIZE(path->l); i++)
1589                         path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
1590 #ifdef TRACK_PATH_ALLOCATED
1591                 path->ip_allocated              = ip;
1592 #endif
1593                 trans->paths_sorted             = false;
1594         }
1595
1596         if (!(flags & BTREE_ITER_NOPRESERVE))
1597                 path->preserve = true;
1598
1599         if (path->intent_ref)
1600                 locks_want = max(locks_want, level + 1);
1601
1602         /*
1603          * If the path has locks_want greater than requested, we don't downgrade
1604          * it here - on transaction restart because btree node split needs to
1605          * upgrade locks, we might be putting/getting the iterator again.
1606          * Downgrading iterators only happens via bch2_trans_downgrade(), after
1607          * a successful transaction commit.
1608          */
1609
1610         locks_want = min(locks_want, BTREE_MAX_DEPTH);
1611         if (locks_want > path->locks_want)
1612                 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
1613
1614         return path;
1615 }
1616
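/*
 * Return the key at @path's current position, for both cached (key cache) and
 * uncached paths; if there is no key at that position, a zero-size deleted key
 * at path->pos is returned via @u.
 */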
1617 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
1618 {
1620         struct btree_path_level *l = path_l(path);
1621         struct bkey_packed *_k;
1622         struct bkey_s_c k;
1623
1624         if (unlikely(!l->b))
1625                 return bkey_s_c_null;
1626
1627         EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
1628         EBUG_ON(!btree_node_locked(path, path->level));
1629
1630         if (!path->cached) {
1631                 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1632                 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
1633
1634                 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
1635
1636                 if (!k.k || !bpos_eq(path->pos, k.k->p))
1637                         goto hole;
1638         } else {
1639                 struct bkey_cached *ck = (void *) path->l[0].b;
1640
1641                 EBUG_ON(ck &&
1642                         (path->btree_id != ck->key.btree_id ||
1643                          !bkey_eq(path->pos, ck->key.pos)));
1644                 if (!ck || !ck->valid)
1645                         return bkey_s_c_null;
1646
1647                 *u = ck->k->k;
1648                 k = bkey_i_to_s_c(ck->k);
1649         }
1650
1651         return k;
1652 hole:
1653         bkey_init(u);
1654         u->p = path->pos;
1655         return (struct bkey_s_c) { u, NULL };
1656 }
1657
1658 /* Btree iterators: */
1659
1660 int __must_check
1661 __bch2_btree_iter_traverse(struct btree_iter *iter)
1662 {
1663         return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1664 }
1665
1666 int __must_check
1667 bch2_btree_iter_traverse(struct btree_iter *iter)
1668 {
1669         int ret;
1670
1671         iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
1672                                         btree_iter_search_key(iter),
1673                                         iter->flags & BTREE_ITER_INTENT,
1674                                         btree_iter_ip_allocated(iter));
1675
1676         ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1677         if (ret)
1678                 return ret;
1679
1680         btree_path_set_should_be_locked(iter->path);
1681         return 0;
1682 }
1683
1684 /* Iterate across nodes (leaf and interior nodes) */
1685
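/*
 * Return the btree node at the iterator's current position and level, or NULL
 * if the iterator is past the end of the btree; advances iter->pos to the end
 * of the node returned.
 */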
1686 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1687 {
1688         struct btree_trans *trans = iter->trans;
1689         struct btree *b = NULL;
1690         int ret;
1691
1692         EBUG_ON(iter->path->cached);
1693         bch2_btree_iter_verify(iter);
1694
1695         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1696         if (ret)
1697                 goto err;
1698
1699         b = btree_path_node(iter->path, iter->path->level);
1700         if (!b)
1701                 goto out;
1702
1703         BUG_ON(bpos_lt(b->key.k.p, iter->pos));
1704
1705         bkey_init(&iter->k);
1706         iter->k.p = iter->pos = b->key.k.p;
1707
1708         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1709                                         iter->flags & BTREE_ITER_INTENT,
1710                                         btree_iter_ip_allocated(iter));
1711         btree_path_set_should_be_locked(iter->path);
1712 out:
1713         bch2_btree_iter_verify_entry_exit(iter);
1714         bch2_btree_iter_verify(iter);
1715
1716         return b;
1717 err:
1718         b = ERR_PTR(ret);
1719         goto out;
1720 }
1721
1722 struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
1723 {
1724         struct btree *b;
1725
1726         while (b = bch2_btree_iter_peek_node(iter),
1727                bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
1728                 bch2_trans_begin(iter->trans);
1729
1730         return b;
1731 }
1732
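/*
 * Advance to the next btree node at the iterator's current level: if we've hit
 * the end of the parent node, walk up a level; otherwise descend back down to
 * the next child node.
 */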
1733 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1734 {
1735         struct btree_trans *trans = iter->trans;
1736         struct btree_path *path = iter->path;
1737         struct btree *b = NULL;
1738         int ret;
1739
1740         bch2_trans_verify_not_in_restart(trans);
1741         EBUG_ON(iter->path->cached);
1742         bch2_btree_iter_verify(iter);
1743
1744         /* already at end? */
1745         if (!btree_path_node(path, path->level))
1746                 return NULL;
1747
1748         /* got to end? */
1749         if (!btree_path_node(path, path->level + 1)) {
1750                 btree_path_set_level_up(trans, path);
1751                 return NULL;
1752         }
1753
1754         if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
1755                 __bch2_btree_path_unlock(trans, path);
1756                 path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1757                 path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
1758                 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
1759                 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
1760                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1761                 goto err;
1762         }
1763
1764         b = btree_path_node(path, path->level + 1);
1765
1766         if (bpos_eq(iter->pos, b->key.k.p)) {
1767                 __btree_path_set_level_up(trans, path, path->level++);
1768         } else {
1769                 /*
1770                  * Haven't gotten to the end of the parent node: go back down to
1771                  * the next child node
1772                  */
1773                 path = iter->path =
1774                         bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
1775                                            iter->flags & BTREE_ITER_INTENT,
1776                                            btree_iter_ip_allocated(iter));
1777
1778                 btree_path_set_level_down(trans, path, iter->min_depth);
1779
1780                 ret = bch2_btree_path_traverse(trans, path, iter->flags);
1781                 if (ret)
1782                         goto err;
1783
1784                 b = path->l[path->level].b;
1785         }
1786
1787         bkey_init(&iter->k);
1788         iter->k.p = iter->pos = b->key.k.p;
1789
1790         iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
1791                                         iter->flags & BTREE_ITER_INTENT,
1792                                         btree_iter_ip_allocated(iter));
1793         btree_path_set_should_be_locked(iter->path);
1794         BUG_ON(iter->path->uptodate);
1795 out:
1796         bch2_btree_iter_verify_entry_exit(iter);
1797         bch2_btree_iter_verify(iter);
1798
1799         return b;
1800 err:
1801         b = ERR_PTR(ret);
1802         goto out;
1803 }
1804
1805 /* Iterate across keys (in leaf nodes only) */
1806
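/*
 * Advance the iterator's position to just after the key it last returned. In
 * BTREE_ITER_ALL_LEVELS mode the iterator is instead flagged as advanced, and
 * the actual repositioning happens in bch2_btree_iter_peek_all_levels().
 */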
1807 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1808 {
1809         if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
1810                 struct bpos pos = iter->k.p;
1811                 bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1812                              ? bpos_eq(pos, SPOS_MAX)
1813                              : bkey_eq(pos, SPOS_MAX));
1814
1815                 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1816                         pos = bkey_successor(iter, pos);
1817                 bch2_btree_iter_set_pos(iter, pos);
1818                 return ret;
1819         } else {
1820                 if (!btree_path_node(iter->path, iter->path->level))
1821                         return true;
1822
1823                 iter->advanced = true;
1824                 return false;
1825         }
1826 }
1827
1828 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1829 {
1830         struct bpos pos = bkey_start_pos(&iter->k);
1831         bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1832                      ? bpos_eq(pos, POS_MIN)
1833                      : bkey_eq(pos, POS_MIN));
1834
1835         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1836                 pos = bkey_predecessor(iter, pos);
1837         bch2_btree_iter_set_pos(iter, pos);
1838         return ret;
1839 }
1840
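/*
 * Return the first key from the transaction's pending updates that is at or
 * after the iterator's current position, in the same btree:
 */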
1841 static noinline
1842 struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
1843 {
1844         struct btree_insert_entry *i;
1845         struct bkey_i *ret = NULL;
1846
1847         trans_for_each_update(iter->trans, i) {
1848                 if (i->btree_id < iter->btree_id)
1849                         continue;
1850                 if (i->btree_id > iter->btree_id)
1851                         break;
1852                 if (bpos_lt(i->k->k.p, iter->path->pos))
1853                         continue;
1854                 if (i->key_cache_already_flushed)
1855                         continue;
1856                 if (!ret || bpos_lt(i->k->k.p, ret->k.p))
1857                         ret = i->k;
1858         }
1859
1860         return ret;
1861 }
1862
1863 static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
1864 {
1865         return iter->flags & BTREE_ITER_WITH_UPDATES
1866                 ? __bch2_btree_trans_peek_updates(iter)
1867                 : NULL;
1868 }
1869
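/*
 * Keys from the journal that haven't yet been replayed into the btree are kept
 * in an in-memory table; return the first such key for this btree and level
 * between the path's current position and @end_pos.
 */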
1870 static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
1871                                               struct btree_iter *iter,
1872                                               struct bpos end_pos)
1873 {
1874         struct bkey_i *k;
1875
1876         if (bpos_lt(iter->path->pos, iter->journal_pos))
1877                 iter->journal_idx = 0;
1878
1879         k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
1880                                         iter->path->level,
1881                                         iter->path->pos,
1882                                         end_pos,
1883                                         &iter->journal_idx);
1884
1885         iter->journal_pos = k ? k->k.p : end_pos;
1886         return k;
1887 }
1888
1889 static noinline
1890 struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
1891                                               struct btree_iter *iter)
1892 {
1893         struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
1894
1895         if (k) {
1896                 iter->k = k->k;
1897                 return bkey_i_to_s_c(k);
1898         } else {
1899                 return bkey_s_c_null;
1900         }
1901 }
1902
1903 static noinline
1904 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
1905                                          struct btree_iter *iter,
1906                                          struct bkey_s_c k)
1907 {
1908         struct bkey_i *next_journal =
1909                 bch2_btree_journal_peek(trans, iter,
1910                                 k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
1911
1912         if (next_journal) {
1913                 iter->k = next_journal->k;
1914                 k = bkey_i_to_s_c(next_journal);
1915         }
1916
1917         return k;
1918 }
1919
1920 /*
1921  * Checks the btree key cache for a key at @pos and returns it if present, or
1922  * bkey_s_c_null:
1923  */
1924 static noinline
1925 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
1926 {
1927         struct btree_trans *trans = iter->trans;
1928         struct bch_fs *c = trans->c;
1929         struct bkey u;
1930         struct bkey_s_c k;
1931         int ret;
1932
1933         if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
1934             bpos_eq(iter->pos, pos))
1935                 return bkey_s_c_null;
1936
1937         if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
1938                 return bkey_s_c_null;
1939
1940         if (!iter->key_cache_path)
1941                 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
1942                                                      iter->flags & BTREE_ITER_INTENT, 0,
1943                                                      iter->flags|BTREE_ITER_CACHED|
1944                                                      BTREE_ITER_CACHED_NOFILL,
1945                                                      _THIS_IP_);
1946
1947         iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
1948                                         iter->flags & BTREE_ITER_INTENT,
1949                                         btree_iter_ip_allocated(iter));
1950
1951         ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
1952                                          iter->flags|BTREE_ITER_CACHED) ?:
1953                 bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
1954         if (unlikely(ret))
1955                 return bkey_s_c_err(ret);
1956
1957         btree_path_set_should_be_locked(iter->key_cache_path);
1958
1959         k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
1960         if (k.k && !bkey_err(k)) {
1961                 iter->k = u;
1962                 k.k = &iter->k;
1963         }
1964         return k;
1965 }
1966
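/*
 * Core peek loop: return the first key at or after @search_key, merging in
 * keys from the key cache, the journal and the transaction's pending updates
 * as required by the iterator flags, and skipping over whiteouts.
 */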
1967 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
1968 {
1969         struct btree_trans *trans = iter->trans;
1970         struct bkey_i *next_update;
1971         struct bkey_s_c k, k2;
1972         int ret;
1973
1974         EBUG_ON(iter->path->cached);
1975         bch2_btree_iter_verify(iter);
1976
1977         while (1) {
1978                 struct btree_path_level *l;
1979
1980                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
1981                                         iter->flags & BTREE_ITER_INTENT,
1982                                         btree_iter_ip_allocated(iter));
1983
1984                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1985                 if (unlikely(ret)) {
1986                         /* ensure that iter->k is consistent with iter->pos: */
1987                         bch2_btree_iter_set_pos(iter, iter->pos);
1988                         k = bkey_s_c_err(ret);
1989                         goto out;
1990                 }
1991
1992                 l = path_l(iter->path);
1993
1994                 if (unlikely(!l->b)) {
1995                         /* No btree nodes at requested level: */
1996                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1997                         k = bkey_s_c_null;
1998                         goto out;
1999                 }
2000
2001                 btree_path_set_should_be_locked(iter->path);
2002
2003                 k = btree_path_level_peek_all(trans->c, l, &iter->k);
2004
2005                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2006                     k.k &&
2007                     (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
2008                         k = k2;
2009                         ret = bkey_err(k);
2010                         if (ret) {
2011                                 bch2_btree_iter_set_pos(iter, iter->pos);
2012                                 goto out;
2013                         }
2014                 }
2015
2016                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
2017                         k = btree_trans_peek_journal(trans, iter, k);
2018
2019                 next_update = btree_trans_peek_updates(iter);
2020
2021                 if (next_update &&
2022                     bpos_le(next_update->k.p,
2023                             k.k ? k.k->p : l->b->key.k.p)) {
2024                         iter->k = next_update->k;
2025                         k = bkey_i_to_s_c(next_update);
2026                 }
2027
2028                 if (k.k && bkey_deleted(k.k)) {
2029                         /*
2030                          * If we've got a whiteout, and it's after the search
2031                          * key, advance the search key to the whiteout instead
2032                          * of just after the whiteout - it might be a btree
2033                          * whiteout, with a real key at the same position, since
2034                          * in the btree, deleted keys sort before non-deleted keys.
2035                          */
2036                         search_key = !bpos_eq(search_key, k.k->p)
2037                                 ? k.k->p
2038                                 : bpos_successor(k.k->p);
2039                         continue;
2040                 }
2041
2042                 if (likely(k.k)) {
2043                         break;
2044                 } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
2045                         /* Advance to next leaf node: */
2046                         search_key = bpos_successor(l->b->key.k.p);
2047                 } else {
2048                         /* End of btree: */
2049                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
2050                         k = bkey_s_c_null;
2051                         goto out;
2052                 }
2053         }
2054 out:
2055         bch2_btree_iter_verify(iter);
2056
2057         return k;
2058 }
2059
2060 /**
2061  * bch2_btree_iter_peek_upto: returns first key greater than or equal to
2062  * iterator's current position, up to @end
2063  */
2064 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
2065 {
2066         struct btree_trans *trans = iter->trans;
2067         struct bpos search_key = btree_iter_search_key(iter);
2068         struct bkey_s_c k;
2069         struct bpos iter_pos;
2070         int ret;
2071
2072         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2073         EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
2074
2075         if (iter->update_path) {
2076                 bch2_path_put_nokeep(trans, iter->update_path,
2077                                      iter->flags & BTREE_ITER_INTENT);
2078                 iter->update_path = NULL;
2079         }
2080
2081         bch2_btree_iter_verify_entry_exit(iter);
2082
2083         while (1) {
2084                 k = __bch2_btree_iter_peek(iter, search_key);
2085                 if (unlikely(!k.k))
2086                         goto end;
2087                 if (unlikely(bkey_err(k)))
2088                         goto out_no_locked;
2089
2090                 /*
2091                  * iter->pos should be monotonically increasing, and always be
2092                  * equal to the key we just returned - except extents can
2093                  * straddle iter->pos:
2094                  */
2095                 if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
2096                         iter_pos = k.k->p;
2097                 else
2098                         iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2099
2100                 if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
2101                              ? bkey_gt(iter_pos, end)
2102                              : bkey_ge(iter_pos, end)))
2103                         goto end;
2104
2105                 if (iter->update_path &&
2106                     !bkey_eq(iter->update_path->pos, k.k->p)) {
2107                         bch2_path_put_nokeep(trans, iter->update_path,
2108                                              iter->flags & BTREE_ITER_INTENT);
2109                         iter->update_path = NULL;
2110                 }
2111
2112                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2113                     (iter->flags & BTREE_ITER_INTENT) &&
2114                     !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
2115                     !iter->update_path) {
2116                         struct bpos pos = k.k->p;
2117
2118                         if (pos.snapshot < iter->snapshot) {
2119                                 search_key = bpos_successor(k.k->p);
2120                                 continue;
2121                         }
2122
2123                         pos.snapshot = iter->snapshot;
2124
2125                         /*
2126                          * advance, same as on exit for iter->path, but only up
2127                          * to snapshot
2128                          */
2129                         __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
2130                         iter->update_path = iter->path;
2131
2132                         iter->update_path = bch2_btree_path_set_pos(trans,
2133                                                 iter->update_path, pos,
2134                                                 iter->flags & BTREE_ITER_INTENT,
2135                                                 _THIS_IP_);
2136                         ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
2137                         if (unlikely(ret)) {
2138                                 k = bkey_s_c_err(ret);
2139                                 goto out_no_locked;
2140                         }
2141                 }
2142
2143                 /*
2144                  * We can never have a key in a leaf node at POS_MAX, so
2145                  * we don't have to check these successor() calls:
2146                  */
2147                 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
2148                     !bch2_snapshot_is_ancestor(trans->c,
2149                                                iter->snapshot,
2150                                                k.k->p.snapshot)) {
2151                         search_key = bpos_successor(k.k->p);
2152                         continue;
2153                 }
2154
2155                 if (bkey_whiteout(k.k) &&
2156                     !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2157                         search_key = bkey_successor(iter, k.k->p);
2158                         continue;
2159                 }
2160
2161                 break;
2162         }
2163
2164         iter->pos = iter_pos;
2165
2166         iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
2167                                 iter->flags & BTREE_ITER_INTENT,
2168                                 btree_iter_ip_allocated(iter));
2169
2170         btree_path_set_should_be_locked(iter->path);
2171 out_no_locked:
2172         if (iter->update_path) {
2173                 ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
2174                 if (unlikely(ret))
2175                         k = bkey_s_c_err(ret);
2176                 else
2177                         btree_path_set_should_be_locked(iter->update_path);
2178         }
2179
2180         if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
2181                 iter->pos.snapshot = iter->snapshot;
2182
2183         ret = bch2_btree_iter_verify_ret(iter, k);
2184         if (unlikely(ret)) {
2185                 bch2_btree_iter_set_pos(iter, iter->pos);
2186                 k = bkey_s_c_err(ret);
2187         }
2188
2189         bch2_btree_iter_verify_entry_exit(iter);
2190
2191         return k;
2192 end:
2193         bch2_btree_iter_set_pos(iter, end);
2194         k = bkey_s_c_null;
2195         goto out_no_locked;
2196 }
2197
2198 /**
2199  * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
2200  * to iterator's current position, returning keys from every level of the btree.
2201  * For keys at different levels of the btree that compare equal, the key from
2202  * the lower level (leaf) is returned first.
2203  */
2204 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
2205 {
2206         struct btree_trans *trans = iter->trans;
2207         struct bkey_s_c k;
2208         int ret;
2209
2210         EBUG_ON(iter->path->cached);
2211         bch2_btree_iter_verify(iter);
2212         BUG_ON(iter->path->level < iter->min_depth);
2213         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
2214         EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
2215
2216         while (1) {
2217                 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
2218                                         iter->flags & BTREE_ITER_INTENT,
2219                                         btree_iter_ip_allocated(iter));
2220
2221                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2222                 if (unlikely(ret)) {
2223                         /* ensure that iter->k is consistent with iter->pos: */
2224                         bch2_btree_iter_set_pos(iter, iter->pos);
2225                         k = bkey_s_c_err(ret);
2226                         goto out_no_locked;
2227                 }
2228
2229                 /* Already at end? */
2230                 if (!btree_path_node(iter->path, iter->path->level)) {
2231                         k = bkey_s_c_null;
2232                         goto out_no_locked;
2233                 }
2234
2235                 k = btree_path_level_peek_all(trans->c,
2236                                 &iter->path->l[iter->path->level], &iter->k);
2237
2238                 /* Check if we should go up to the parent node: */
2239                 if (!k.k ||
2240                     (iter->advanced &&
2241                      bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
2242                         iter->pos = path_l(iter->path)->b->key.k.p;
2243                         btree_path_set_level_up(trans, iter->path);
2244                         iter->advanced = false;
2245                         continue;
2246                 }
2247
2248                 /*
2249                  * Check if we should go back down to a leaf:
2250                  * If we're not in a leaf node, we only return the current key
2251                  * if it exactly matches iter->pos - otherwise we first have to
2252                  * go back to the leaf:
2253                  */
2254                 if (iter->path->level != iter->min_depth &&
2255                     (iter->advanced ||
2256                      !k.k ||
2257                      !bpos_eq(iter->pos, k.k->p))) {
2258                         btree_path_set_level_down(trans, iter->path, iter->min_depth);
2259                         iter->pos = bpos_successor(iter->pos);
2260                         iter->advanced = false;
2261                         continue;
2262                 }
2263
2264                 /* Check if we should go to the next key: */
2265                 if (iter->path->level == iter->min_depth &&
2266                     iter->advanced &&
2267                     k.k &&
2268                     bpos_eq(iter->pos, k.k->p)) {
2269                         iter->pos = bpos_successor(iter->pos);
2270                         iter->advanced = false;
2271                         continue;
2272                 }
2273
2274                 if (iter->advanced &&
2275                     iter->path->level == iter->min_depth &&
2276                     !bpos_eq(k.k->p, iter->pos))
2277                         iter->advanced = false;
2278
2279                 BUG_ON(iter->advanced);
2280                 BUG_ON(!k.k);
2281                 break;
2282         }
2283
2284         iter->pos = k.k->p;
2285         btree_path_set_should_be_locked(iter->path);
2286 out_no_locked:
2287         bch2_btree_iter_verify(iter);
2288
2289         return k;
2290 }
2291
2292 /**
2293  * bch2_btree_iter_next: returns first key greater than iterator's current
2294  * position
2295  */
2296 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
2297 {
2298         if (!bch2_btree_iter_advance(iter))
2299                 return bkey_s_c_null;
2300
2301         return bch2_btree_iter_peek(iter);
2302 }
2303
2304 /**
2305  * bch2_btree_iter_peek_prev: returns first key less than or equal to
2306  * iterator's current position
2307  */
2308 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
2309 {
2310         struct btree_trans *trans = iter->trans;
2311         struct bpos search_key = iter->pos;
2312         struct btree_path *saved_path = NULL;
2313         struct bkey_s_c k;
2314         struct bkey saved_k;
2315         const struct bch_val *saved_v;
2316         int ret;
2317
2318         EBUG_ON(iter->path->cached || iter->path->level);
2319         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
2320
2321         if (iter->flags & BTREE_ITER_WITH_JOURNAL)
2322                 return bkey_s_c_err(-EIO);
2323
2324         bch2_btree_iter_verify(iter);
2325         bch2_btree_iter_verify_entry_exit(iter);
2326
2327         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2328                 search_key.snapshot = U32_MAX;
2329
2330         while (1) {
2331                 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2332                                                 iter->flags & BTREE_ITER_INTENT,
2333                                                 btree_iter_ip_allocated(iter));
2334
2335                 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2336                 if (unlikely(ret)) {
2337                         /* ensure that iter->k is consistent with iter->pos: */
2338                         bch2_btree_iter_set_pos(iter, iter->pos);
2339                         k = bkey_s_c_err(ret);
2340                         goto out_no_locked;
2341                 }
2342
2343                 k = btree_path_level_peek(trans, iter->path,
2344                                           &iter->path->l[0], &iter->k);
2345                 if (!k.k ||
2346                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
2347                      ? bpos_ge(bkey_start_pos(k.k), search_key)
2348                      : bpos_gt(k.k->p, search_key)))
2349                         k = btree_path_level_prev(trans, iter->path,
2350                                                   &iter->path->l[0], &iter->k);
2351
2352                 if (likely(k.k)) {
2353                         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
2354                                 if (k.k->p.snapshot == iter->snapshot)
2355                                         goto got_key;
2356
2357                                 /*
2358                                  * If we have a saved candidate, and we're no
2359                                  * longer at the same _key_ (not pos), return
2360                                  * that candidate
2361                                  */
2362                                 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
2363                                         bch2_path_put_nokeep(trans, iter->path,
2364                                                       iter->flags & BTREE_ITER_INTENT);
2365                                         iter->path = saved_path;
2366                                         saved_path = NULL;
2367                                         iter->k = saved_k;
2368                                         k.v     = saved_v;
2369                                         goto got_key;
2370                                 }
2371
2372                                 if (bch2_snapshot_is_ancestor(iter->trans->c,
2373                                                               iter->snapshot,
2374                                                               k.k->p.snapshot)) {
2375                                         if (saved_path)
2376                                                 bch2_path_put_nokeep(trans, saved_path,
2377                                                       iter->flags & BTREE_ITER_INTENT);
2378                                         saved_path = btree_path_clone(trans, iter->path,
2379                                                                 iter->flags & BTREE_ITER_INTENT);
2380                                         saved_k = *k.k;
2381                                         saved_v = k.v;
2382                                 }
2383
2384                                 search_key = bpos_predecessor(k.k->p);
2385                                 continue;
2386                         }
2387 got_key:
2388                         if (bkey_whiteout(k.k) &&
2389                             !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
2390                                 search_key = bkey_predecessor(iter, k.k->p);
2391                                 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2392                                         search_key.snapshot = U32_MAX;
2393                                 continue;
2394                         }
2395
2396                         break;
2397                 } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
2398                         /* Advance to previous leaf node: */
2399                         search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
2400                 } else {
2401                         /* Start of btree: */
2402                         bch2_btree_iter_set_pos(iter, POS_MIN);
2403                         k = bkey_s_c_null;
2404                         goto out_no_locked;
2405                 }
2406         }
2407
2408         EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
2409
2410         /* Extents can straddle iter->pos: */
2411         if (bkey_lt(k.k->p, iter->pos))
2412                 iter->pos = k.k->p;
2413
2414         if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
2415                 iter->pos.snapshot = iter->snapshot;
2416
2417         btree_path_set_should_be_locked(iter->path);
2418 out_no_locked:
2419         if (saved_path)
2420                 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2421
2422         bch2_btree_iter_verify_entry_exit(iter);
2423         bch2_btree_iter_verify(iter);
2424
2425         return k;
2426 }
2427
2428 /**
2429  * bch2_btree_iter_prev: returns first key less than iterator's current
2430  * position
2431  */
2432 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
2433 {
2434         if (!bch2_btree_iter_rewind(iter))
2435                 return bkey_s_c_null;
2436
2437         return bch2_btree_iter_peek_prev(iter);
2438 }
2439
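/*
 * Return the key at exactly the iterator's current position; if there is no
 * key there, a synthesized deleted key is returned (for extents, a hole
 * extending up to the next key).
 */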
2440 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
2441 {
2442         struct btree_trans *trans = iter->trans;
2443         struct bpos search_key;
2444         struct bkey_s_c k;
2445         int ret;
2446
2447         bch2_btree_iter_verify(iter);
2448         bch2_btree_iter_verify_entry_exit(iter);
2449         EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
2450         EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
2451
2452         /* extents can't span inode numbers: */
2453         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
2454             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
2455                 if (iter->pos.inode == KEY_INODE_MAX)
2456                         return bkey_s_c_null;
2457
2458                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2459         }
2460
2461         search_key = btree_iter_search_key(iter);
2462         iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
2463                                         iter->flags & BTREE_ITER_INTENT,
2464                                         btree_iter_ip_allocated(iter));
2465
2466         ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2467         if (unlikely(ret)) {
2468                 k = bkey_s_c_err(ret);
2469                 goto out_no_locked;
2470         }
2471
2472         if ((iter->flags & BTREE_ITER_CACHED) ||
2473             !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
2474                 struct bkey_i *next_update;
2475
2476                 if ((next_update = btree_trans_peek_updates(iter)) &&
2477                     bpos_eq(next_update->k.p, iter->pos)) {
2478                         iter->k = next_update->k;
2479                         k = bkey_i_to_s_c(next_update);
2480                         goto out;
2481                 }
2482
2483                 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
2484                     (k = btree_trans_peek_slot_journal(trans, iter)).k)
2485                         goto out;
2486
2487                 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
2488                     (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
2489                         if (!bkey_err(k))
2490                                 iter->k = *k.k;
2491                         /* We're not returning a key from iter->path: */
2492                         goto out_no_locked;
2493                 }
2494
2495                 k = bch2_btree_path_peek_slot(iter->path, &iter->k);
2496                 if (unlikely(!k.k))
2497                         goto out_no_locked;
2498         } else {
2499                 struct bpos next;
2500                 struct bpos end = iter->pos;
2501
2502                 if (iter->flags & BTREE_ITER_IS_EXTENTS)
2503                         end.offset = U64_MAX;
2504
2505                 EBUG_ON(iter->path->level);
2506
2507                 if (iter->flags & BTREE_ITER_INTENT) {
2508                         struct btree_iter iter2;
2509
2510                         bch2_trans_copy_iter(&iter2, iter);
2511                         k = bch2_btree_iter_peek_upto(&iter2, end);
2512
2513                         if (k.k && !bkey_err(k)) {
2514                                 iter->k = iter2.k;
2515                                 k.k = &iter->k;
2516                         }
2517                         bch2_trans_iter_exit(trans, &iter2);
2518                 } else {
2519                         struct bpos pos = iter->pos;
2520
2521                         k = bch2_btree_iter_peek_upto(iter, end);
2522                         if (unlikely(bkey_err(k)))
2523                                 bch2_btree_iter_set_pos(iter, pos);
2524                         else
2525                                 iter->pos = pos;
2526                 }
2527
2528                 if (unlikely(bkey_err(k)))
2529                         goto out_no_locked;
2530
2531                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2532
2533                 if (bkey_lt(iter->pos, next)) {
2534                         bkey_init(&iter->k);
2535                         iter->k.p = iter->pos;
2536
2537                         if (iter->flags & BTREE_ITER_IS_EXTENTS) {
2538                                 bch2_key_resize(&iter->k,
2539                                                 min_t(u64, KEY_SIZE_MAX,
2540                                                       (next.inode == iter->pos.inode
2541                                                        ? next.offset
2542                                                        : KEY_OFFSET_MAX) -
2543                                                       iter->pos.offset));
2544                                 EBUG_ON(!iter->k.size);
2545                         }
2546
2547                         k = (struct bkey_s_c) { &iter->k, NULL };
2548                 }
2549         }
2550 out:
2551         btree_path_set_should_be_locked(iter->path);
2552 out_no_locked:
2553         bch2_btree_iter_verify_entry_exit(iter);
2554         bch2_btree_iter_verify(iter);
2555         ret = bch2_btree_iter_verify_ret(iter, k);
2556         if (unlikely(ret))
2557                 return bkey_s_c_err(ret);
2558
2559         return k;
2560 }
2561
2562 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2563 {
2564         if (!bch2_btree_iter_advance(iter))
2565                 return bkey_s_c_null;
2566
2567         return bch2_btree_iter_peek_slot(iter);
2568 }
2569
2570 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2571 {
2572         if (!bch2_btree_iter_rewind(iter))
2573                 return bkey_s_c_null;
2574
2575         return bch2_btree_iter_peek_slot(iter);
2576 }
2577
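/*
 * Like bch2_btree_iter_peek_type(), but transparently restarts the transaction
 * on transaction restart errors and when the transaction has too many paths:
 */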
2578 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
2579 {
2580         struct bkey_s_c k;
2581
2582         while (btree_trans_too_many_iters(iter->trans) ||
2583                (k = bch2_btree_iter_peek_type(iter, iter->flags),
2584                 bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
2585                 bch2_trans_begin(iter->trans);
2586
2587         return k;
2588 }
2589
2590 /* new transactional stuff: */
2591
2592 #ifdef CONFIG_BCACHEFS_DEBUG
2593 static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2594 {
2595         struct btree_path *path;
2596         unsigned i;
2597
2598         BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
2599
2600         trans_for_each_path(trans, path) {
2601                 BUG_ON(path->sorted_idx >= trans->nr_sorted);
2602                 BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
2603         }
2604
2605         for (i = 0; i < trans->nr_sorted; i++) {
2606                 unsigned idx = trans->sorted[i];
2607
2608                 EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
2609                 BUG_ON(trans->paths[idx].sorted_idx != i);
2610         }
2611 }
2612
2613 static void btree_trans_verify_sorted(struct btree_trans *trans)
2614 {
2615         struct btree_path *path, *prev = NULL;
2616         unsigned i;
2617
2618         if (!bch2_debug_check_iterators)
2619                 return;
2620
2621         trans_for_each_path_inorder(trans, path, i) {
2622                 if (prev && btree_path_cmp(prev, path) > 0) {
2623                         __bch2_dump_trans_paths_updates(trans, true);
2624                         panic("trans paths out of order!\n");
2625                 }
2626                 prev = path;
2627         }
2628 }
2629 #else
2630 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
2631 static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
2632 #endif
2633
2634 void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
2635 {
2636         int i, l = 0, r = trans->nr_sorted, inc = 1;
2637         bool swapped;
2638
2639         btree_trans_verify_sorted_refs(trans);
2640
2641         if (trans->paths_sorted)
2642                 goto out;
2643
2644         /*
2645          * Cocktail shaker sort: this is efficient because iterators will be
2646          * mostly sorted.
2647          */
2648         do {
2649                 swapped = false;
2650
2651                 for (i = inc > 0 ? l : r - 2;
2652                      i + 1 < r && i >= l;
2653                      i += inc) {
2654                         if (btree_path_cmp(trans->paths + trans->sorted[i],
2655                                            trans->paths + trans->sorted[i + 1]) > 0) {
2656                                 swap(trans->sorted[i], trans->sorted[i + 1]);
2657                                 trans->paths[trans->sorted[i]].sorted_idx = i;
2658                                 trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
2659                                 swapped = true;
2660                         }
2661                 }
2662
2663                 if (inc > 0)
2664                         --r;
2665                 else
2666                         l++;
2667                 inc = -inc;
2668         } while (swapped);
2669
2670         trans->paths_sorted = true;
2671 out:
2672         btree_trans_verify_sorted(trans);
2673 }
2674
2675 static inline void btree_path_list_remove(struct btree_trans *trans,
2676                                           struct btree_path *path)
2677 {
2678         unsigned i;
2679
2680         EBUG_ON(path->sorted_idx >= trans->nr_sorted);
2681 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2682         trans->nr_sorted--;
2683         memmove_u64s_down_small(trans->sorted + path->sorted_idx,
2684                                 trans->sorted + path->sorted_idx + 1,
2685                                 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
2686 #else
2687         array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
2688 #endif
2689         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2690                 trans->paths[trans->sorted[i]].sorted_idx = i;
2691
2692         path->sorted_idx = U8_MAX;
2693 }
2694
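/*
 * Insert @path into the sorted path list immediately after @pos (or at the end
 * of the list if @pos is NULL), then fix up the sorted_idx back pointers:
 */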
2695 static inline void btree_path_list_add(struct btree_trans *trans,
2696                                        struct btree_path *pos,
2697                                        struct btree_path *path)
2698 {
2699         unsigned i;
2700
2701         path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
2702
2703 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2704         memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
2705                               trans->sorted + path->sorted_idx,
2706                               DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
2707         trans->nr_sorted++;
2708         trans->sorted[path->sorted_idx] = path->idx;
2709 #else
2710         array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
2711 #endif
2712
2713         for (i = path->sorted_idx; i < trans->nr_sorted; i++)
2714                 trans->paths[trans->sorted[i]].sorted_idx = i;
2715
2716         btree_trans_verify_sorted_refs(trans);
2717 }
2718
2719 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
2720 {
2721         if (iter->update_path)
2722                 bch2_path_put_nokeep(trans, iter->update_path,
2723                               iter->flags & BTREE_ITER_INTENT);
2724         if (iter->path)
2725                 bch2_path_put(trans, iter->path,
2726                               iter->flags & BTREE_ITER_INTENT);
2727         if (iter->key_cache_path)
2728                 bch2_path_put(trans, iter->key_cache_path,
2729                               iter->flags & BTREE_ITER_INTENT);
2730         iter->path = NULL;
2731         iter->update_path = NULL;
2732         iter->key_cache_path = NULL;
2733 }
2734
2735 static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans,
2736                           struct btree_iter *iter,
2737                           unsigned btree_id, struct bpos pos,
2738                           unsigned flags)
2739 {
2740         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2741                                bch2_btree_iter_flags(trans, btree_id, flags),
2742                                _RET_IP_);
2743 }
2744
2745 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
2746                           struct btree_iter *iter,
2747                           unsigned btree_id, struct bpos pos,
2748                           unsigned flags)
2749 {
2750         bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
2751                                bch2_btree_iter_flags(trans, btree_id, flags),
2752                                _RET_IP_);
2753 }
2754
2755 void bch2_trans_node_iter_init(struct btree_trans *trans,
2756                                struct btree_iter *iter,
2757                                enum btree_id btree_id,
2758                                struct bpos pos,
2759                                unsigned locks_want,
2760                                unsigned depth,
2761                                unsigned flags)
2762 {
2763         flags |= BTREE_ITER_NOT_EXTENTS;
2764         flags |= __BTREE_ITER_ALL_SNAPSHOTS;
2765         flags |= BTREE_ITER_ALL_SNAPSHOTS;
2766
2767         bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
2768                                __bch2_btree_iter_flags(trans, btree_id, flags),
2769                                _RET_IP_);
2770
2771         iter->min_depth = depth;
2772
2773         BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
2774         BUG_ON(iter->path->level        != depth);
2775         BUG_ON(iter->min_depth          != depth);
2776 }
2777
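/*
 * Copy an iterator, taking new references on its btree paths; the key cache
 * path is not copied and will be re-created on demand.
 */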
2778 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
2779 {
2780         *dst = *src;
2781         if (src->path)
2782                 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
2783         if (src->update_path)
2784                 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
2785         dst->key_cache_path = NULL;
2786 }
2787
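/*
 * Bump allocator for memory that lives until the transaction is reset. Growing
 * the buffer may move it (and may require dropping locks), which would
 * invalidate pointers to existing allocations - so if the buffer was already
 * in use, a transaction restart error is returned.
 */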
2788 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2789 {
2790         unsigned new_top = trans->mem_top + size;
2791         size_t old_bytes = trans->mem_bytes;
2792         size_t new_bytes = roundup_pow_of_two(new_top);
2793         int ret;
2794         void *new_mem;
2795         void *p;
2796
2797         trans->mem_max = max(trans->mem_max, new_top);
2798
2799         WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2800
2801         new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
2802         if (unlikely(!new_mem)) {
2803                 bch2_trans_unlock(trans);
2804
2805                 new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
2806                 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2807                         new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2808                         new_bytes = BTREE_TRANS_MEM_MAX;
2809                         kfree(trans->mem);
2810                 }
2811
2812                 if (!new_mem)
2813                         return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
2814
2815                 trans->mem = new_mem;
2816                 trans->mem_bytes = new_bytes;
2817
2818                 ret = bch2_trans_relock(trans);
2819                 if (ret)
2820                         return ERR_PTR(ret);
2821         }
2822
2823         trans->mem = new_mem;
2824         trans->mem_bytes = new_bytes;
2825
2826         if (old_bytes) {
2827                 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
2828                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2829         }
2830
2831         p = trans->mem + trans->mem_top;
2832         trans->mem_top += size;
2833         memset(p, 0, size);
2834         return p;
2835 }
2836
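/*
 * The SRCU read lock is what allows paths to hold pointers to btree nodes
 * without holding node locks; periodically drop and retake it so that a
 * long-running transaction doesn't block freeing of old nodes for too long.
 * Unlocked cached paths are invalidated first, as what they point to may be
 * freed once the read lock is released.
 */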
2837 static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
2838 {
2839         struct bch_fs *c = trans->c;
2840         struct btree_path *path;
2841
2842         trans_for_each_path(trans, path)
2843                 if (path->cached && !btree_node_locked(path, 0))
2844                         path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
2845
2846         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2847         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2848         trans->srcu_lock_time   = jiffies;
2849 }
2850
2851 /**
2852  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2853  * @trans: transaction to reset
2854  *
2855  * While iterating over nodes or updating nodes, an attempt to lock a btree node
2856  * may return BCH_ERR_transaction_restart when the trylock fails. When this
2857  * occurs, bch2_trans_begin() should be called and the transaction retried.
2858  */
2859 u32 bch2_trans_begin(struct btree_trans *trans)
2860 {
2861         struct btree_path *path;
2862         u64 now;
2863
2864         bch2_trans_reset_updates(trans);
2865
2866         trans->restart_count++;
2867         trans->mem_top                  = 0;
2868
2869         trans_for_each_path(trans, path) {
2870                 path->should_be_locked = false;
2871
2872                 /*
2873                  * If the transaction wasn't restarted, we're presuming to be
2874                  * doing something new: don't keep iterators except the ones that
2875                  * are in use - except for the subvolumes btree:
2876                  */
2877                 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
2878                         path->preserve = false;
2879
2880                 /*
2881                  * XXX: we probably shouldn't be doing this if the transaction
2882                  * was restarted, but currently we still overflow transaction
2883                  * iterators if we do that
2884                  */
2885                 if (!path->ref && !path->preserve)
2886                         __bch2_path_free(trans, path);
2887                 else
2888                         path->preserve = false;
2889         }
2890
2891         now = local_clock();
2892         if (!trans->restarted &&
2893             (need_resched() ||
2894              now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
2895                 drop_locks_do(trans, (cond_resched(), 0));
2896                 now = local_clock();
2897         }
2898         trans->last_begin_time = now;
2899
2900         if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
2901                 bch2_trans_reset_srcu_lock(trans);
2902
2903         trans->last_begin_ip = _RET_IP_;
2904         if (trans->restarted) {
2905                 bch2_btree_path_traverse_all(trans);
2906                 trans->notrace_relock_fail = false;
2907         }
2908
2909         return trans->restart_count;
2910 }
2911
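     /*
      * Allocate the transaction's btree_path and btree_insert_entry arrays,
      * preferring the per-cpu buffer when one is available, otherwise taking
      * them from the mempool.
      */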
2912 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
2913 {
2914         size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
2915         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2916         void *p = NULL;
2917
2918         BUG_ON(trans->used_mempool);
2919
2920 #ifdef __KERNEL__
2921         p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
2922 #endif
2923         if (!p)
2924                 p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
2925         /*
2926          * paths need to be zeroed, bch2_check_for_deadlock looks at paths in
2927          * other threads
2928          */
2929
2930         trans->paths            = p; p += paths_bytes;
2931         trans->updates          = p; p += updates_bytes;
2932 }
2933
2934 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
2935
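     /*
      * Map a transaction function name to a stable index into
      * bch2_btree_transaction_fns[], claiming the first free slot on first
      * use; the index is used to look up per-transaction-type statistics.
      */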
2936 unsigned bch2_trans_get_fn_idx(const char *fn)
2937 {
2938         unsigned i;
2939
2940         for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
2941                 if (!bch2_btree_transaction_fns[i] ||
2942                     bch2_btree_transaction_fns[i] == fn) {
2943                         bch2_btree_transaction_fns[i] = fn;
2944                         return i;
2945                 }
2946
2947         pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
2948         return i;
2949 }
2950
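     /*
      * Initialize a btree_trans: allocate the path and update arrays,
      * preallocate the memory buffer sized from past usage of this
      * transaction type, take the SRCU read lock, and - when
      * CONFIG_BCACHEFS_DEBUG_TRANSACTIONS is enabled - insert the
      * transaction into the global list, sorted by pid.
      */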
2951 void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
2952         __acquires(&c->btree_trans_barrier)
2953 {
2954         struct btree_transaction_stats *s;
2955
2956         bch2_assert_btree_nodes_not_locked();
2957
2958         memset(trans, 0, sizeof(*trans));
2959         trans->c                = c;
2960         trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
2961                 ? bch2_btree_transaction_fns[fn_idx] : NULL;
2962         trans->last_begin_time  = local_clock();
2963         trans->fn_idx           = fn_idx;
2964         trans->locking_wait.task = current;
2965         trans->journal_replay_not_finished =
2966                 !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
2967         closure_init_stack(&trans->ref);
2968
2969         bch2_trans_alloc_paths(trans, c);
2970
2971         s = btree_trans_stats(trans);
2972         if (s && s->max_mem) {
2973                 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
2974
2975                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2976
2977                 if (unlikely(!trans->mem)) {
2978                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2979                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2980                 } else {
2981                         trans->mem_bytes = expected_mem_bytes;
2982                 }
2983         }
2984
2985         if (s) {
2986                 trans->nr_max_paths = s->nr_max_paths;
2987                 trans->wb_updates_size = s->wb_updates_size;
2988         }
2989
2990         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2991         trans->srcu_lock_time   = jiffies;
2992
2993         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
2994                 struct btree_trans *pos;
2995
2996                 seqmutex_lock(&c->btree_trans_lock);
2997                 list_for_each_entry(pos, &c->btree_trans_list, list) {
2998                         /*
2999                          * We'd much prefer to be stricter here and completely
3000                          * disallow multiple btree_trans in the same thread -
3001                          * but the data move path calls bch2_write when we
3002                          * already have a btree_trans initialized.
3003                          */
3004                         BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
3005                                bch2_trans_locked(pos));
3006
3007                         if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
3008                                 list_add_tail(&trans->list, &pos->list);
3009                                 goto list_add_done;
3010                         }
3011                 }
3012                 list_add_tail(&trans->list, &c->btree_trans_list);
3013 list_add_done:
3014                 seqmutex_unlock(&c->btree_trans_lock);
3015         }
3016 }
3017
3018 static void check_btree_paths_leaked(struct btree_trans *trans)
3019 {
3020 #ifdef CONFIG_BCACHEFS_DEBUG
3021         struct bch_fs *c = trans->c;
3022         struct btree_path *path;
3023
3024         trans_for_each_path(trans, path)
3025                 if (path->ref)
3026                         goto leaked;
3027         return;
3028 leaked:
3029         bch_err(c, "btree paths leaked from %s!", trans->fn);
3030         trans_for_each_path(trans, path)
3031                 if (path->ref)
3032                         printk(KERN_ERR "  btree %s %pS\n",
3033                                bch2_btree_ids[path->btree_id],
3034                                (void *) path->ip_allocated);
3035         /* Be noisy about this: */
3036         bch2_fatal_error(c);
3037 #endif
3038 }
3039
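     /*
      * Tear down a transaction: drop path references held by pending updates,
      * check for leaked paths, drop the SRCU read lock, and return the memory
      * buffer and path arrays to their pools (or the per-cpu buffer).
      */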
3040 void bch2_trans_exit(struct btree_trans *trans)
3041         __releases(&c->btree_trans_barrier)
3042 {
3043         struct btree_insert_entry *i;
3044         struct bch_fs *c = trans->c;
3045         struct btree_transaction_stats *s = btree_trans_stats(trans);
3046
3047         bch2_trans_unlock(trans);
3048
3049         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
3050                 seqmutex_lock(&c->btree_trans_lock);
3051                 list_del(&trans->list);
3052                 seqmutex_unlock(&c->btree_trans_lock);
3053         }
3054
3055         closure_sync(&trans->ref);
3056
3057         if (s)
3058                 s->max_mem = max(s->max_mem, trans->mem_max);
3059
3060         trans_for_each_update(trans, i)
3061                 __btree_path_put(i->path, true);
3062         trans->nr_updates               = 0;
3063
3064         check_btree_paths_leaked(trans);
3065
3066         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3067
3068         bch2_journal_preres_put(&c->journal, &trans->journal_preres);
3069
3070         kfree(trans->extra_journal_entries.data);
3071
3072         if (trans->fs_usage_deltas) {
3073                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
3074                     REPLICAS_DELTA_LIST_MAX)
3075                         mempool_free(trans->fs_usage_deltas,
3076                                      &c->replicas_delta_pool);
3077                 else
3078                         kfree(trans->fs_usage_deltas);
3079         }
3080
3081         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
3082                 mempool_free(trans->mem, &c->btree_trans_mem_pool);
3083         else
3084                 kfree(trans->mem);
3085
3086 #ifdef __KERNEL__
3087         /*
3088          * Userspace doesn't have a real percpu implementation:
3089          */
3090         trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
3091 #endif
3092
3093         if (trans->paths)
3094                 mempool_free(trans->paths, &c->btree_paths_pool);
3095
3096         trans->mem      = (void *) 0x1;
3097         trans->paths    = (void *) 0x1;
3098 }
3099
3100 static void __maybe_unused
3101 bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
3102                                       struct btree_bkey_cached_common *b)
3103 {
3104         struct six_lock_count c = six_lock_counts(&b->lock);
3105         struct task_struct *owner;
3106         pid_t pid;
3107
3108         rcu_read_lock();
3109         owner = READ_ONCE(b->lock.owner);
3110         pid = owner ? owner->pid : 0;
3111         rcu_read_unlock();
3112
3113         prt_tab(out);
3114         prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
3115                    b->level, bch2_btree_ids[b->btree_id]);
3116         bch2_bpos_to_text(out, btree_node_pos(b));
3117
3118         prt_tab(out);
3119         prt_printf(out, " locks %u:%u:%u held by pid %u",
3120                    c.n[0], c.n[1], c.n[2], pid);
3121 }
3122
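     /*
      * Print a transaction's held node locks and, if it is blocked, what it is
      * waiting on and for how long - used for debug output and deadlock
      * reports.
      */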
3123 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
3124 {
3125         struct btree_path *path;
3126         struct btree_bkey_cached_common *b;
3127         static char lock_types[] = { 'r', 'i', 'w' };
3128         unsigned l, idx;
3129
3130         if (!out->nr_tabstops) {
3131                 printbuf_tabstop_push(out, 16);
3132                 printbuf_tabstop_push(out, 32);
3133         }
3134
3135         prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
3136
3137         trans_for_each_path_safe(trans, path, idx) {
3138                 if (!path->nodes_locked)
3139                         continue;
3140
3141                 prt_printf(out, "  path %u %c l=%u %s:",
3142                        path->idx,
3143                        path->cached ? 'c' : 'b',
3144                        path->level,
3145                        bch2_btree_ids[path->btree_id]);
3146                 bch2_bpos_to_text(out, path->pos);
3147                 prt_newline(out);
3148
3149                 for (l = 0; l < BTREE_MAX_DEPTH; l++) {
3150                         if (btree_node_locked(path, l) &&
3151                             !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
3152                                 prt_printf(out, "    %c l=%u ",
3153                                            lock_types[btree_node_locked_type(path, l)], l);
3154                                 bch2_btree_bkey_cached_common_to_text(out, b);
3155                                 prt_newline(out);
3156                         }
3157                 }
3158         }
3159
3160         b = READ_ONCE(trans->locking);
3161         if (b) {
3162                 prt_printf(out, "  blocked for %lluus on",
3163                            div_u64(local_clock() - trans->locking_wait.start_time,
3164                                    1000));
3165                 prt_newline(out);
3166                 prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
3167                 bch2_btree_bkey_cached_common_to_text(out, b);
3168                 prt_newline(out);
3169         }
3170 }
3171
3172 void bch2_fs_btree_iter_exit(struct bch_fs *c)
3173 {
3174         struct btree_transaction_stats *s;
3175
3176         for (s = c->btree_transaction_stats;
3177              s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3178              s++) {
3179                 kfree(s->max_paths_text);
3180                 bch2_time_stats_exit(&s->lock_hold_times);
3181         }
3182
3183         if (c->btree_trans_barrier_initialized)
3184                 cleanup_srcu_struct(&c->btree_trans_barrier);
3185         mempool_exit(&c->btree_trans_mem_pool);
3186         mempool_exit(&c->btree_paths_pool);
3187 }
3188
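     /*
      * Set up filesystem-wide btree iterator state: per-transaction-type
      * stats, the global transaction list, the path/memory mempools, and the
      * btree_trans_barrier SRCU struct.
      */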
3189 int bch2_fs_btree_iter_init(struct bch_fs *c)
3190 {
3191         struct btree_transaction_stats *s;
3192         unsigned nr = BTREE_ITER_MAX;
3193         int ret;
3194
3195         for (s = c->btree_transaction_stats;
3196              s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
3197              s++) {
3198                 bch2_time_stats_init(&s->lock_hold_times);
3199                 mutex_init(&s->lock);
3200         }
3201
3202         INIT_LIST_HEAD(&c->btree_trans_list);
3203         seqmutex_init(&c->btree_trans_lock);
3204
3205         ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
3206                         sizeof(struct btree_path) * nr +
3207                         sizeof(struct btree_insert_entry) * nr) ?:
3208                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
3209                                           BTREE_TRANS_MEM_MAX) ?:
3210                 init_srcu_struct(&c->btree_trans_barrier);
3211         if (!ret)
3212                 c->btree_trans_barrier_initialized = true;
3213         return ret;
3214 }