libbcachefs/btree_iter.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "replicas.h"
16
17 #include <linux/prefetch.h>
18 #include <trace/events/bcachefs.h>
19
20 static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
21
22 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
23 {
24         EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
25
26         /* Are we iterating over keys in all snapshots? */
27         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
28                 p = bpos_successor(p);
29         } else {
30                 p = bpos_nosnap_successor(p);
31                 p.snapshot = iter->snapshot;
32         }
33
34         return p;
35 }
36
37 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
38 {
39         EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
40
41         /* Are we iterating over keys in all snapshots? */
42         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
43                 p = bpos_predecessor(p);
44         } else {
45                 p = bpos_nosnap_predecessor(p);
46                 p.snapshot = iter->snapshot;
47         }
48
49         return p;
50 }
51
52 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
53 {
54         return l < BTREE_MAX_DEPTH &&
55                 (unsigned long) iter->l[l].b >= 128;
56 }
57
58 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
59 {
60         struct bpos pos = iter->pos;
61
62         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
63             bkey_cmp(pos, POS_MAX))
64                 pos = bkey_successor(iter, pos);
65         return pos;
66 }
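/*
 * Note on the search key adjustment above (added, not upstream text): extent
 * keys are indexed by their end position, so the extent covering iter->pos
 * has a key position strictly greater than iter->pos - hence extents
 * iterators search from the successor of iter->pos.
 */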
67
68 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
69                                               struct btree *b)
70 {
71         return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
72 }
73
74 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
75                                              struct btree *b)
76 {
77         return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
78 }
79
80 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
81                                           struct btree *b)
82 {
83         return iter->btree_id == b->c.btree_id &&
84                 !btree_iter_pos_before_node(iter, b) &&
85                 !btree_iter_pos_after_node(iter, b);
86 }
87
88 /* Btree node locking: */
89
90 void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
91 {
92         bch2_btree_node_unlock_write_inlined(b, iter);
93 }
94
95 void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
96 {
97         struct btree_iter *linked;
98         unsigned readers = 0;
99
100         EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
101
102         trans_for_each_iter(iter->trans, linked)
103                 if (linked->l[b->c.level].b == b &&
104                     btree_node_read_locked(linked, b->c.level))
105                         readers++;
106
107         /*
108          * Must drop our read locks before calling six_lock_write() -
109          * six_unlock() won't do wakeups until the reader count
110          * goes to 0, and it's safe because we have the node intent
111          * locked:
112          */
113         atomic64_sub(__SIX_VAL(read_lock, readers),
114                      &b->c.lock.state.counter);
115         btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
116         atomic64_add(__SIX_VAL(read_lock, readers),
117                      &b->c.lock.state.counter);
118 }
119
120 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
121 {
122         struct btree *b = btree_iter_node(iter, level);
123         int want = __btree_lock_want(iter, level);
124
125         if (!is_btree_node(iter, level))
126                 return false;
127
128         if (race_fault())
129                 return false;
130
131         if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
132             (btree_node_lock_seq_matches(iter, b, level) &&
133              btree_node_lock_increment(iter->trans, b, level, want))) {
134                 mark_btree_node_locked(iter, level, want);
135                 return true;
136         } else {
137                 return false;
138         }
139 }
140
141 static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
142 {
143         struct btree *b = iter->l[level].b;
144
145         EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
146
147         if (!is_btree_node(iter, level))
148                 return false;
149
150         if (btree_node_intent_locked(iter, level))
151                 return true;
152
153         if (race_fault())
154                 return false;
155
156         if (btree_node_locked(iter, level)
157             ? six_lock_tryupgrade(&b->c.lock)
158             : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
159                 goto success;
160
161         if (btree_node_lock_seq_matches(iter, b, level) &&
162             btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
163                 btree_node_unlock(iter, level);
164                 goto success;
165         }
166
167         return false;
168 success:
169         mark_btree_node_intent_locked(iter, level);
170         return true;
171 }
172
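/*
 * Relock (or, if @upgrade, upgrade to intent locks) every level from
 * iter->level up to iter->locks_want. If a level fails, that level and
 * everything below it are unlocked, so that traversal has to walk back up
 * past the node we couldn't relock.
 */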
173 static inline bool btree_iter_get_locks(struct btree_iter *iter,
174                                         bool upgrade, bool trace)
175 {
176         unsigned l = iter->level;
177         int fail_idx = -1;
178
179         do {
180                 if (!btree_iter_node(iter, l))
181                         break;
182
183                 if (!(upgrade
184                       ? bch2_btree_node_upgrade(iter, l)
185                       : bch2_btree_node_relock(iter, l))) {
186                         if (trace)
187                                 (upgrade
188                                  ? trace_node_upgrade_fail
189                                  : trace_node_relock_fail)(l, iter->l[l].lock_seq,
190                                                 is_btree_node(iter, l)
191                                                 ? 0
192                                                 : (unsigned long) iter->l[l].b,
193                                                 is_btree_node(iter, l)
194                                                 ? iter->l[l].b->c.lock.state.seq
195                                                 : 0);
196
197                         fail_idx = l;
198                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
199                 }
200
201                 l++;
202         } while (l < iter->locks_want);
203
204         /*
205          * When we fail to get a lock, we have to ensure that any child nodes
206          * can't be relocked so bch2_btree_iter_traverse has to walk back up to
207          * the node that we failed to relock:
208          */
209         while (fail_idx >= 0) {
210                 btree_node_unlock(iter, fail_idx);
211                 iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
212                 --fail_idx;
213         }
214
215         if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
216                 iter->uptodate = BTREE_ITER_NEED_PEEK;
217
218         bch2_btree_trans_verify_locks(iter->trans);
219
220         return iter->uptodate < BTREE_ITER_NEED_RELOCK;
221 }
222
223 static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
224                                   enum btree_iter_type type)
225 {
226         return  type != BTREE_ITER_CACHED
227                 ? container_of(_b, struct btree, c)->key.k.p
228                 : container_of(_b, struct bkey_cached, c)->key.pos;
229 }
230
231 /* Slowpath: */
232 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
233                             unsigned level, struct btree_iter *iter,
234                             enum six_lock_type type,
235                             six_lock_should_sleep_fn should_sleep_fn, void *p,
236                             unsigned long ip)
237 {
238         struct btree_trans *trans = iter->trans;
239         struct btree_iter *linked, *deadlock_iter = NULL;
240         u64 start_time = local_clock();
241         unsigned reason = 9;
242         bool ret;
243
244         /* Check if it's safe to block: */
245         trans_for_each_iter(trans, linked) {
246                 if (!linked->nodes_locked)
247                         continue;
248
249                 /*
250                  * Can't block taking an intent lock if we have _any_ nodes read
251                  * locked:
252                  *
253                  * - Our read lock blocks another thread with an intent lock on
254                  *   the same node from getting a write lock, and thus from
255                  *   dropping its intent lock
256                  *
257                  * - And the other thread may have multiple nodes intent locked:
258                  *   both the node we want to intent lock, and the node we
259                  *   already have read locked - deadlock:
260                  */
261                 if (type == SIX_LOCK_intent &&
262                     linked->nodes_locked != linked->nodes_intent_locked) {
263                         linked->locks_want = max_t(unsigned,
264                                         linked->locks_want,
265                                         __fls(linked->nodes_locked) + 1);
266                         if (!btree_iter_get_locks(linked, true, false)) {
267                                 deadlock_iter = linked;
268                                 reason = 1;
269                         }
270                 }
271
272                 if (linked->btree_id != iter->btree_id) {
273                         if (linked->btree_id > iter->btree_id) {
274                                 deadlock_iter = linked;
275                                 reason = 3;
276                         }
277                         continue;
278                 }
279
280                 /*
281          * Within the same btree, cached iterators come before
282          * non-cached iterators:
283                  */
284                 if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
285                         if (btree_iter_is_cached(iter)) {
286                                 deadlock_iter = linked;
287                                 reason = 4;
288                         }
289                         continue;
290                 }
291
292                 /*
293                  * Interior nodes must be locked before their descendants: if
294          * another iterator has nodes locked that are possible descendants
295          * of the node we're about to lock, it must have the ancestors locked too:
296                  */
297                 if (level > __fls(linked->nodes_locked)) {
298                         linked->locks_want =
299                                 max(level + 1, max_t(unsigned,
300                                     linked->locks_want,
301                                     iter->locks_want));
302                         if (!btree_iter_get_locks(linked, true, false)) {
303                                 deadlock_iter = linked;
304                                 reason = 5;
305                         }
306                 }
307
308                 /* Must lock btree nodes in key order: */
309                 if (btree_node_locked(linked, level) &&
310                     bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
311                                                  btree_iter_type(linked))) <= 0) {
312                         deadlock_iter = linked;
313                         reason = 7;
314                 }
315
316                 /*
317                  * Recheck if this is a node we already have locked - since one
318                  * of the get_locks() calls might've successfully
319                  * upgraded/relocked it:
320                  */
321                 if (linked->l[level].b == b &&
322                     btree_node_locked_type(linked, level) >= type) {
323                         six_lock_increment(&b->c.lock, type);
324                         return true;
325                 }
326         }
327
328         if (unlikely(deadlock_iter)) {
329                 trace_trans_restart_would_deadlock(iter->trans->ip, ip,
330                                 reason,
331                                 deadlock_iter->btree_id,
332                                 btree_iter_type(deadlock_iter),
333                                 iter->btree_id,
334                                 btree_iter_type(iter));
335                 return false;
336         }
337
338         if (six_trylock_type(&b->c.lock, type))
339                 return true;
340
341 #ifdef CONFIG_BCACHEFS_DEBUG
342         trans->locking_iter_idx = iter->idx;
343         trans->locking_pos      = pos;
344         trans->locking_btree_id = iter->btree_id;
345         trans->locking_level    = level;
346         trans->locking          = b;
347 #endif
348
349         ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
350
351 #ifdef CONFIG_BCACHEFS_DEBUG
352         trans->locking = NULL;
353 #endif
354         if (ret)
355                 bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
356                                        start_time);
357         return ret;
358 }
359
360 /* Btree iterator locking: */
361
362 #ifdef CONFIG_BCACHEFS_DEBUG
363 static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
364 {
365         unsigned l;
366
367         if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
368                 BUG_ON(iter->nodes_locked);
369                 return;
370         }
371
372         for (l = 0; is_btree_node(iter, l); l++) {
373                 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
374                     !btree_node_locked(iter, l))
375                         continue;
376
377                 BUG_ON(btree_lock_want(iter, l) !=
378                        btree_node_locked_type(iter, l));
379         }
380 }
381
382 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
383 {
384         struct btree_iter *iter;
385
386         trans_for_each_iter(trans, iter)
387                 bch2_btree_iter_verify_locks(iter);
388 }
389 #else
390 static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
391 #endif
392
393 __flatten
394 bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
395 {
396         return btree_iter_get_locks(iter, false, trace);
397 }
398
399 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
400                                unsigned new_locks_want)
401 {
402         struct btree_iter *linked;
403
404         EBUG_ON(iter->locks_want >= new_locks_want);
405
406         iter->locks_want = new_locks_want;
407
408         if (btree_iter_get_locks(iter, true, true))
409                 return true;
410
411         /*
412          * Ancestor nodes must be locked before child nodes, so set locks_want
413          * on iterators that might lock ancestors before us to avoid getting
414          * -EINTR later:
415          */
416         trans_for_each_iter(iter->trans, linked)
417                 if (linked != iter &&
418                     linked->btree_id == iter->btree_id &&
419                     linked->locks_want < new_locks_want) {
420                         linked->locks_want = new_locks_want;
421                         btree_iter_get_locks(linked, true, false);
422                 }
423
424         return false;
425 }
426
427 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
428                                  unsigned new_locks_want)
429 {
430         unsigned l;
431
432         EBUG_ON(iter->locks_want < new_locks_want);
433
434         iter->locks_want = new_locks_want;
435
436         while (iter->nodes_locked &&
437                (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
438                 if (l > iter->level) {
439                         btree_node_unlock(iter, l);
440                 } else {
441                         if (btree_node_intent_locked(iter, l)) {
442                                 six_lock_downgrade(&iter->l[l].b->c.lock);
443                                 iter->nodes_intent_locked ^= 1 << l;
444                         }
445                         break;
446                 }
447         }
448
449         bch2_btree_trans_verify_locks(iter->trans);
450 }
451
452 void bch2_trans_downgrade(struct btree_trans *trans)
453 {
454         struct btree_iter *iter;
455
456         trans_for_each_iter(trans, iter)
457                 bch2_btree_iter_downgrade(iter);
458 }
459
460 /* Btree transaction locking: */
461
462 bool bch2_trans_relock(struct btree_trans *trans)
463 {
464         struct btree_iter *iter;
465
466         trans_for_each_iter(trans, iter)
467                 if (btree_iter_keep(trans, iter) &&
468                     !bch2_btree_iter_relock(iter, true))
469                         return false;
470         return true;
471 }
472
473 void bch2_trans_unlock(struct btree_trans *trans)
474 {
475         struct btree_iter *iter;
476
477         trans_for_each_iter(trans, iter)
478                 __bch2_btree_iter_unlock(iter);
479 }
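/*
 * Illustrative usage sketch (added; not upstream code): callers that need to
 * block drop their btree locks first, then revalidate them afterwards and
 * restart the transaction if that fails:
 *
 *	bch2_trans_unlock(trans);
 *	wait_for_something_slow();	/* hypothetical blocking call */
 *	if (!bch2_trans_relock(trans))
 *		return -EINTR;		/* transaction restart */
 */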
480
481 /* Btree iterator: */
482
483 #ifdef CONFIG_BCACHEFS_DEBUG
484
485 static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
486 {
487         struct bkey_cached *ck;
488         bool locked = btree_node_locked(iter, 0);
489
490         if (!bch2_btree_node_relock(iter, 0))
491                 return;
492
493         ck = (void *) iter->l[0].b;
494         BUG_ON(ck->key.btree_id != iter->btree_id ||
495                bkey_cmp(ck->key.pos, iter->pos));
496
497         if (!locked)
498                 btree_node_unlock(iter, 0);
499 }
500
501 static void bch2_btree_iter_verify_level(struct btree_iter *iter,
502                                          unsigned level)
503 {
504         struct btree_iter_level *l;
505         struct btree_node_iter tmp;
506         bool locked;
507         struct bkey_packed *p, *k;
508         char buf1[100], buf2[100], buf3[100];
509         const char *msg;
510
511         if (!bch2_debug_check_iterators)
512                 return;
513
514         l       = &iter->l[level];
515         tmp     = l->iter;
516         locked  = btree_node_locked(iter, level);
517
518         if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
519                 if (!level)
520                         bch2_btree_iter_verify_cached(iter);
521                 return;
522         }
523
524         BUG_ON(iter->level < iter->min_depth);
525
526         if (!btree_iter_node(iter, level))
527                 return;
528
529         if (!bch2_btree_node_relock(iter, level))
530                 return;
531
532         BUG_ON(!btree_iter_pos_in_node(iter, l->b));
533
534         /*
535          * node iterators don't use leaf node iterator:
536          */
537         if (btree_iter_type(iter) == BTREE_ITER_NODES &&
538             level <= iter->min_depth)
539                 goto unlock;
540
541         bch2_btree_node_iter_verify(&l->iter, l->b);
542
543         /*
544          * For interior nodes, the iterator will have skipped past
545          * deleted keys:
546          *
547          * For extents, the iterator may have skipped past deleted keys (but not
548          * whiteouts)
549          */
550         p = level || btree_node_type_is_extents(iter->btree_id)
551                 ? bch2_btree_node_iter_prev(&tmp, l->b)
552                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
553         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
554
555         if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
556                 msg = "before";
557                 goto err;
558         }
559
560         if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
561                 msg = "after";
562                 goto err;
563         }
564 unlock:
565         if (!locked)
566                 btree_node_unlock(iter, level);
567         return;
568 err:
569         strcpy(buf2, "(none)");
570         strcpy(buf3, "(none)");
571
572         bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
573
574         if (p) {
575                 struct bkey uk = bkey_unpack_key(l->b, p);
576                 bch2_bkey_to_text(&PBUF(buf2), &uk);
577         }
578
579         if (k) {
580                 struct bkey uk = bkey_unpack_key(l->b, k);
581                 bch2_bkey_to_text(&PBUF(buf3), &uk);
582         }
583
584         panic("iterator should be %s key at level %u:\n"
585               "iter pos %s\n"
586               "prev key %s\n"
587               "cur  key %s\n",
588               msg, level, buf1, buf2, buf3);
589 }
590
591 static void bch2_btree_iter_verify(struct btree_iter *iter)
592 {
593         enum btree_iter_type type = btree_iter_type(iter);
594         unsigned i;
595
596         EBUG_ON(iter->btree_id >= BTREE_ID_NR);
597
598         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
599                iter->pos.snapshot != iter->snapshot);
600
601         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
602                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
603
604         BUG_ON(type == BTREE_ITER_NODES &&
605                !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
606
607         BUG_ON(type != BTREE_ITER_NODES &&
608                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
609                !btree_type_has_snapshots(iter->btree_id));
610
611         bch2_btree_iter_verify_locks(iter);
612
613         for (i = 0; i < BTREE_MAX_DEPTH; i++)
614                 bch2_btree_iter_verify_level(iter, i);
615 }
616
617 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
618 {
619         enum btree_iter_type type = btree_iter_type(iter);
620
621         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
622                iter->pos.snapshot != iter->snapshot);
623
624         BUG_ON((type == BTREE_ITER_KEYS ||
625                 type == BTREE_ITER_CACHED) &&
626                (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
627                 bkey_cmp(iter->pos, iter->k.p) > 0));
628 }
629
630 void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
631 {
632         struct btree_iter *iter;
633
634         if (!bch2_debug_check_iterators)
635                 return;
636
637         trans_for_each_iter_with_node(trans, b, iter)
638                 bch2_btree_iter_verify_level(iter, b->c.level);
639 }
640
641 #else
642
643 static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
644 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
645 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
646
647 #endif
648
649 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
650                                         struct btree *b,
651                                         struct bset_tree *t,
652                                         struct bkey_packed *k)
653 {
654         struct btree_node_iter_set *set;
655
656         btree_node_iter_for_each(iter, set)
657                 if (set->end == t->end_offset) {
658                         set->k = __btree_node_key_to_offset(b, k);
659                         bch2_btree_node_iter_sort(iter, b);
660                         return;
661                 }
662
663         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
664 }
665
666 static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
667                                                struct btree *b,
668                                                struct bkey_packed *where)
669 {
670         struct btree_iter_level *l = &iter->l[b->c.level];
671
672         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
673                 return;
674
675         if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
676                 bch2_btree_node_iter_advance(&l->iter, l->b);
677
678         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
679 }
680
681 void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
682                                       struct btree *b,
683                                       struct bkey_packed *where)
684 {
685         struct btree_iter *linked;
686
687         trans_for_each_iter_with_node(iter->trans, b, linked) {
688                 __bch2_btree_iter_fix_key_modified(linked, b, where);
689                 bch2_btree_iter_verify_level(linked, b->c.level);
690         }
691 }
692
693 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
694                                       struct btree *b,
695                                       struct btree_node_iter *node_iter,
696                                       struct bset_tree *t,
697                                       struct bkey_packed *where,
698                                       unsigned clobber_u64s,
699                                       unsigned new_u64s)
700 {
701         const struct bkey_packed *end = btree_bkey_last(b, t);
702         struct btree_node_iter_set *set;
703         unsigned offset = __btree_node_key_to_offset(b, where);
704         int shift = new_u64s - clobber_u64s;
705         unsigned old_end = t->end_offset - shift;
706         unsigned orig_iter_pos = node_iter->data[0].k;
707         bool iter_current_key_modified =
708                 orig_iter_pos >= offset &&
709                 orig_iter_pos <= offset + clobber_u64s;
710
711         btree_node_iter_for_each(node_iter, set)
712                 if (set->end == old_end)
713                         goto found;
714
715         /* didn't find the bset in the iterator - might have to re-add it: */
716         if (new_u64s &&
717             bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
718                 bch2_btree_node_iter_push(node_iter, b, where, end);
719                 goto fixup_done;
720         } else {
721                 /* Iterator is after key that changed */
722                 return;
723         }
724 found:
725         set->end = t->end_offset;
726
727         /* Iterator hasn't gotten to the key that changed yet: */
728         if (set->k < offset)
729                 return;
730
731         if (new_u64s &&
732             bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
733                 set->k = offset;
734         } else if (set->k < offset + clobber_u64s) {
735                 set->k = offset + new_u64s;
736                 if (set->k == set->end)
737                         bch2_btree_node_iter_set_drop(node_iter, set);
738         } else {
739                 /* Iterator is after key that changed */
740                 set->k = (int) set->k + shift;
741                 return;
742         }
743
744         bch2_btree_node_iter_sort(node_iter, b);
745 fixup_done:
746         if (node_iter->data[0].k != orig_iter_pos)
747                 iter_current_key_modified = true;
748
749         /*
750          * When a new key is added, and the node iterator now points to that
751          * key, the iterator might have skipped past deleted keys that should
752          * come after the key the iterator now points to. We have to rewind to
753          * before those deleted keys - otherwise
754          * bch2_btree_node_iter_prev_all() breaks:
755          */
756         if (!bch2_btree_node_iter_end(node_iter) &&
757             iter_current_key_modified &&
758             (b->c.level ||
759              btree_node_type_is_extents(iter->btree_id))) {
760                 struct bset_tree *t;
761                 struct bkey_packed *k, *k2, *p;
762
763                 k = bch2_btree_node_iter_peek_all(node_iter, b);
764
765                 for_each_bset(b, t) {
766                         bool set_pos = false;
767
768                         if (node_iter->data[0].end == t->end_offset)
769                                 continue;
770
771                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
772
773                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
774                                bkey_iter_cmp(b, k, p) < 0) {
775                                 k2 = p;
776                                 set_pos = true;
777                         }
778
779                         if (set_pos)
780                                 btree_node_iter_set_set_pos(node_iter,
781                                                             b, t, k2);
782                 }
783         }
784
785         if (!b->c.level &&
786             node_iter == &iter->l[0].iter &&
787             iter_current_key_modified)
788                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
789 }
790
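/*
 * A key in @b was changed at @where (@clobber_u64s old u64s replaced by
 * @new_u64s new ones): fix up @node_iter, plus the node iterators of every
 * other iterator in the transaction that points to @b.
 */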
791 void bch2_btree_node_iter_fix(struct btree_iter *iter,
792                               struct btree *b,
793                               struct btree_node_iter *node_iter,
794                               struct bkey_packed *where,
795                               unsigned clobber_u64s,
796                               unsigned new_u64s)
797 {
798         struct bset_tree *t = bch2_bkey_to_bset(b, where);
799         struct btree_iter *linked;
800
801         if (node_iter != &iter->l[b->c.level].iter) {
802                 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
803                                            where, clobber_u64s, new_u64s);
804
805                 if (bch2_debug_check_iterators)
806                         bch2_btree_node_iter_verify(node_iter, b);
807         }
808
809         trans_for_each_iter_with_node(iter->trans, b, linked) {
810                 __bch2_btree_node_iter_fix(linked, b,
811                                            &linked->l[b->c.level].iter, t,
812                                            where, clobber_u64s, new_u64s);
813                 bch2_btree_iter_verify_level(linked, b->c.level);
814         }
815 }
816
817 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
818                                                   struct btree_iter_level *l,
819                                                   struct bkey *u,
820                                                   struct bkey_packed *k)
821 {
822         struct bkey_s_c ret;
823
824         if (unlikely(!k)) {
825                 /*
826                  * signal to bch2_btree_iter_peek_slot() that we're currently at
827                  * a hole
828                  */
829                 u->type = KEY_TYPE_deleted;
830                 return bkey_s_c_null;
831         }
832
833         ret = bkey_disassemble(l->b, k, u);
834
835         if (bch2_debug_check_bkeys)
836                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
837
838         return ret;
839 }
840
841 /* peek_all() doesn't skip deleted keys */
842 static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
843                                                         struct btree_iter_level *l,
844                                                         struct bkey *u)
845 {
846         return __btree_iter_unpack(iter, l, u,
847                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
848 }
849
850 static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
851                                                     struct btree_iter_level *l)
852 {
853         struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
854                         bch2_btree_node_iter_peek(&l->iter, l->b));
855
856         iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
857         return k;
858 }
859
860 static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
861                                                     struct btree_iter_level *l)
862 {
863         struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
864                         bch2_btree_node_iter_prev(&l->iter, l->b));
865
866         iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
867         return k;
868 }
869
870 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
871                                              struct btree_iter_level *l,
872                                              int max_advance)
873 {
874         struct bkey_packed *k;
875         int nr_advanced = 0;
876
877         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
878                bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
879                 if (max_advance > 0 && nr_advanced >= max_advance)
880                         return false;
881
882                 bch2_btree_node_iter_advance(&l->iter, l->b);
883                 nr_advanced++;
884         }
885
886         return true;
887 }
888
889 /*
890  * Verify that iterator for parent node points to child node:
891  */
892 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
893 {
894         struct btree_iter_level *l;
895         unsigned plevel;
896         bool parent_locked;
897         struct bkey_packed *k;
898
899         if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
900                 return;
901
902         plevel = b->c.level + 1;
903         if (!btree_iter_node(iter, plevel))
904                 return;
905
906         parent_locked = btree_node_locked(iter, plevel);
907
908         if (!bch2_btree_node_relock(iter, plevel))
909                 return;
910
911         l = &iter->l[plevel];
912         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
913         if (!k ||
914             bkey_deleted(k) ||
915             bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
916                 char buf1[100];
917                 char buf2[100];
918                 char buf3[100];
919                 char buf4[100];
920                 struct bkey uk = bkey_unpack_key(b, k);
921
922                 bch2_dump_btree_node(iter->trans->c, l->b);
923                 bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
924                 bch2_bkey_to_text(&PBUF(buf2), &uk);
925                 bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
926                 bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
927                 panic("parent iter doesn't point to new node:\n"
928                       "iter pos %s %s\n"
929                       "iter key %s\n"
930                       "new node %s-%s\n",
931                       bch2_btree_ids[iter->btree_id], buf1,
932                       buf2, buf3, buf4);
933         }
934
935         if (!parent_locked)
936                 btree_node_unlock(iter, b->c.level + 1);
937 }
938
939 static inline void __btree_iter_init(struct btree_iter *iter,
940                                      unsigned level)
941 {
942         struct btree_iter_level *l = &iter->l[level];
943
944         bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
945
946         /*
947          * Iterators to interior nodes should always be pointed at the first
948          * non-whiteout:
949          */
950         if (level)
951                 bch2_btree_node_iter_peek(&l->iter, l->b);
952
953         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
954 }
955
956 static inline void btree_iter_node_set(struct btree_iter *iter,
957                                        struct btree *b)
958 {
959         BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
960
961         btree_iter_verify_new_node(iter, b);
962
963         EBUG_ON(!btree_iter_pos_in_node(iter, b));
964         EBUG_ON(b->c.lock.state.seq & 1);
965
966         iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
967         iter->l[b->c.level].b = b;
968         __btree_iter_init(iter, b->c.level);
969 }
970
971 /*
972  * A btree node is being replaced - update the iterator to point to the new
973  * node:
974  */
975 void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
976 {
977         enum btree_node_locked_type t;
978         struct btree_iter *linked;
979
980         trans_for_each_iter(iter->trans, linked)
981                 if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
982                     btree_iter_pos_in_node(linked, b)) {
983                         /*
984                          * bch2_btree_iter_node_drop() has already been called -
985                          * the old node we're replacing has already been
986                          * unlocked and the pointer invalidated
987                          */
988                         BUG_ON(btree_node_locked(linked, b->c.level));
989
990                         t = btree_lock_want(linked, b->c.level);
991                         if (t != BTREE_NODE_UNLOCKED) {
992                                 six_lock_increment(&b->c.lock, t);
993                                 mark_btree_node_locked(linked, b->c.level, t);
994                         }
995
996                         btree_iter_node_set(linked, b);
997                 }
998 }
999
1000 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
1001 {
1002         struct btree_iter *linked;
1003         unsigned level = b->c.level;
1004
1005         trans_for_each_iter(iter->trans, linked)
1006                 if (linked->l[level].b == b) {
1007                         btree_node_unlock(linked, level);
1008                         linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
1009                 }
1010 }
1011
1012 /*
1013  * A btree node has been modified in such a way as to invalidate iterators - fix
1014  * them:
1015  */
1016 void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
1017 {
1018         struct btree_iter *linked;
1019
1020         trans_for_each_iter_with_node(iter->trans, b, linked)
1021                 __btree_iter_init(linked, b->c.level);
1022 }
1023
1024 static int lock_root_check_fn(struct six_lock *lock, void *p)
1025 {
1026         struct btree *b = container_of(lock, struct btree, c.lock);
1027         struct btree **rootp = p;
1028
1029         return b == *rootp ? 0 : -1;
1030 }
1031
1032 static inline int btree_iter_lock_root(struct btree_iter *iter,
1033                                        unsigned depth_want,
1034                                        unsigned long trace_ip)
1035 {
1036         struct bch_fs *c = iter->trans->c;
1037         struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
1038         enum six_lock_type lock_type;
1039         unsigned i;
1040
1041         EBUG_ON(iter->nodes_locked);
1042
1043         while (1) {
1044                 b = READ_ONCE(*rootp);
1045                 iter->level = READ_ONCE(b->c.level);
1046
1047                 if (unlikely(iter->level < depth_want)) {
1048                         /*
1049                          * the root is at a lower depth than the depth we want:
1050                          * got to the end of the btree, or we're walking nodes
1051                          * greater than some depth and there are no nodes >=
1052                          * that depth
1053                          */
1054                         iter->level = depth_want;
1055                         for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
1056                                 iter->l[i].b = NULL;
1057                         return 1;
1058                 }
1059
1060                 lock_type = __btree_lock_want(iter, iter->level);
1061                 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
1062                                               iter, lock_type,
1063                                               lock_root_check_fn, rootp,
1064                                               trace_ip)))
1065                         return -EINTR;
1066
1067                 if (likely(b == READ_ONCE(*rootp) &&
1068                            b->c.level == iter->level &&
1069                            !race_fault())) {
1070                         for (i = 0; i < iter->level; i++)
1071                                 iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
1072                         iter->l[iter->level].b = b;
1073                         for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
1074                                 iter->l[i].b = NULL;
1075
1076                         mark_btree_node_locked(iter, iter->level, lock_type);
1077                         btree_iter_node_set(iter, b);
1078                         return 0;
1079                 }
1080
1081                 six_unlock_type(&b->c.lock, lock_type);
1082         }
1083 }
1084
1085 noinline
1086 static void btree_iter_prefetch(struct btree_iter *iter)
1087 {
1088         struct bch_fs *c = iter->trans->c;
1089         struct btree_iter_level *l = &iter->l[iter->level];
1090         struct btree_node_iter node_iter = l->iter;
1091         struct bkey_packed *k;
1092         struct bkey_buf tmp;
1093         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
1094                 ? (iter->level > 1 ? 0 :  2)
1095                 : (iter->level > 1 ? 1 : 16);
1096         bool was_locked = btree_node_locked(iter, iter->level);
1097
1098         bch2_bkey_buf_init(&tmp);
1099
1100         while (nr) {
1101                 if (!bch2_btree_node_relock(iter, iter->level))
1102                         break;
1103
1104                 bch2_btree_node_iter_advance(&node_iter, l->b);
1105                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
1106                 if (!k)
1107                         break;
1108
1109                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
1110                 bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
1111                                          iter->level - 1);
1112         }
1113
1114         if (!was_locked)
1115                 btree_node_unlock(iter, iter->level);
1116
1117         bch2_bkey_buf_exit(&tmp, c);
1118 }
1119
1120 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
1121                                             unsigned plevel, struct btree *b)
1122 {
1123         struct btree_iter_level *l = &iter->l[plevel];
1124         bool locked = btree_node_locked(iter, plevel);
1125         struct bkey_packed *k;
1126         struct bch_btree_ptr_v2 *bp;
1127
1128         if (!bch2_btree_node_relock(iter, plevel))
1129                 return;
1130
1131         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1132         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
1133
1134         bp = (void *) bkeyp_val(&l->b->format, k);
1135         bp->mem_ptr = (unsigned long)b;
1136
1137         if (!locked)
1138                 btree_node_unlock(iter, plevel);
1139 }
1140
1141 static __always_inline int btree_iter_down(struct btree_iter *iter,
1142                                            unsigned long trace_ip)
1143 {
1144         struct bch_fs *c = iter->trans->c;
1145         struct btree_iter_level *l = &iter->l[iter->level];
1146         struct btree *b;
1147         unsigned level = iter->level - 1;
1148         enum six_lock_type lock_type = __btree_lock_want(iter, level);
1149         struct bkey_buf tmp;
1150         int ret;
1151
1152         EBUG_ON(!btree_node_locked(iter, iter->level));
1153
1154         bch2_bkey_buf_init(&tmp);
1155         bch2_bkey_buf_unpack(&tmp, c, l->b,
1156                          bch2_btree_node_iter_peek(&l->iter, l->b));
1157
1158         b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
1159         ret = PTR_ERR_OR_ZERO(b);
1160         if (unlikely(ret))
1161                 goto err;
1162
1163         mark_btree_node_locked(iter, level, lock_type);
1164         btree_iter_node_set(iter, b);
1165
1166         if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
1167             unlikely(b != btree_node_mem_ptr(tmp.k)))
1168                 btree_node_mem_ptr_set(iter, level + 1, b);
1169
1170         if (iter->flags & BTREE_ITER_PREFETCH)
1171                 btree_iter_prefetch(iter);
1172
1173         iter->level = level;
1174 err:
1175         bch2_bkey_buf_exit(&tmp, c);
1176         return ret;
1177 }
1178
1179 static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
1180
1181 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
1182 {
1183         struct bch_fs *c = trans->c;
1184         struct btree_iter *iter;
1185         u8 sorted[BTREE_ITER_MAX];
1186         unsigned i, nr_sorted = 0;
1187
1188         if (trans->in_traverse_all)
1189                 return -EINTR;
1190
1191         trans->in_traverse_all = true;
1192 retry_all:
1193         nr_sorted = 0;
1194
1195         trans_for_each_iter(trans, iter)
1196                 sorted[nr_sorted++] = iter->idx;
1197
1198 #define btree_iter_cmp_by_idx(_l, _r)                           \
1199                 btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
1200
1201         bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
1202 #undef btree_iter_cmp_by_idx
1203         bch2_trans_unlock(trans);
1204         cond_resched();
1205
1206         if (unlikely(ret == -ENOMEM)) {
1207                 struct closure cl;
1208
1209                 closure_init_stack(&cl);
1210
1211                 do {
1212                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1213                         closure_sync(&cl);
1214                 } while (ret);
1215         }
1216
1217         if (unlikely(ret == -EIO)) {
1218                 trans->error = true;
1219                 goto out;
1220         }
1221
1222         BUG_ON(ret && ret != -EINTR);
1223
1224         /* Now, redo traversals in correct order: */
1225         for (i = 0; i < nr_sorted; i++) {
1226                 unsigned idx = sorted[i];
1227
1228                 /*
1229                  * successfully traversing one iterator can cause another to be
1230                  * unlinked, in btree_key_cache_fill()
1231                  */
1232                 if (!(trans->iters_linked & (1ULL << idx)))
1233                         continue;
1234
1235                 ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
1236                 if (ret)
1237                         goto retry_all;
1238         }
1239
1240         if (hweight64(trans->iters_live) > 1)
1241                 ret = -EINTR;
1242         else
1243                 trans_for_each_iter(trans, iter)
1244                         if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
1245                                 ret = -EINTR;
1246                                 break;
1247                         }
1248 out:
1249         bch2_btree_cache_cannibalize_unlock(c);
1250
1251         trans->in_traverse_all = false;
1252         return ret;
1253 }
1254
1255 int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1256 {
1257         return __btree_iter_traverse_all(trans, 0);
1258 }
1259
1260 static inline bool btree_iter_good_node(struct btree_iter *iter,
1261                                         unsigned l, int check_pos)
1262 {
1263         if (!is_btree_node(iter, l) ||
1264             !bch2_btree_node_relock(iter, l))
1265                 return false;
1266
1267         if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
1268                 return false;
1269         if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
1270                 return false;
1271         return true;
1272 }
1273
1274 static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
1275                                                      int check_pos)
1276 {
1277         unsigned l = iter->level;
1278
1279         while (btree_iter_node(iter, l) &&
1280                !btree_iter_good_node(iter, l, check_pos)) {
1281                 btree_node_unlock(iter, l);
1282                 iter->l[l].b = BTREE_ITER_NO_NODE_UP;
1283                 l++;
1284         }
1285
1286         return l;
1287 }
1288
1289 /*
1290  * This is the main state machine for walking down the btree - walks down to a
1291  * specified depth
1292  *
1293  * Returns 0 on success, -EIO on error (error reading in a btree node).
1294  *
1295  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1296  * stashed in the iterator and returned from bch2_trans_exit().
1297  */
1298 static int btree_iter_traverse_one(struct btree_iter *iter,
1299                                    unsigned long trace_ip)
1300 {
1301         unsigned depth_want = iter->level;
1302
1303         /*
1304          * if we need interior nodes locked, call btree_iter_relock() to make
1305          * sure we walk back up enough that we lock them:
1306          */
1307         if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
1308             iter->locks_want > 1)
1309                 bch2_btree_iter_relock(iter, false);
1310
1311         if (btree_iter_type(iter) == BTREE_ITER_CACHED)
1312                 return bch2_btree_iter_traverse_cached(iter);
1313
1314         if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
1315                 return 0;
1316
1317         if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1318                 return 0;
1319
1320         iter->level = btree_iter_up_until_good_node(iter, 0);
1321
1322         /*
1323          * Note: iter->l[iter->level].b may be temporarily NULL here - that
1324          * would indicate to other code that we got to the end of the btree,
1325          * here it indicates that relocking the root failed - it's critical that
1326          * btree_iter_lock_root() comes next and that it can't fail
1327          */
1328         while (iter->level > depth_want) {
1329                 int ret = btree_iter_node(iter, iter->level)
1330                         ? btree_iter_down(iter, trace_ip)
1331                         : btree_iter_lock_root(iter, depth_want, trace_ip);
1332                 if (unlikely(ret)) {
1333                         if (ret == 1)
1334                                 return 0;
1335
1336                         iter->level = depth_want;
1337
1338                         if (ret == -EIO) {
1339                                 iter->flags |= BTREE_ITER_ERROR;
1340                                 iter->l[iter->level].b =
1341                                         BTREE_ITER_NO_NODE_ERROR;
1342                         } else {
1343                                 iter->l[iter->level].b =
1344                                         BTREE_ITER_NO_NODE_DOWN;
1345                         }
1346                         return ret;
1347                 }
1348         }
1349
1350         iter->uptodate = BTREE_ITER_NEED_PEEK;
1351
1352         bch2_btree_iter_verify(iter);
1353         return 0;
1354 }
1355
1356 static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1357 {
1358         struct btree_trans *trans = iter->trans;
1359         int ret;
1360
1361         ret =   bch2_trans_cond_resched(trans) ?:
1362                 btree_iter_traverse_one(iter, _RET_IP_);
1363         if (unlikely(ret))
1364                 ret = __btree_iter_traverse_all(trans, ret);
1365
1366         return ret;
1367 }
1368
1369 /*
1370  * Note:
1371  * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
1372  * for internal btree iterator users
1373  *
1374  * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
1375  * btree_iter_traverse() does not:
1376  */
1377 static inline int __must_check
1378 btree_iter_traverse(struct btree_iter *iter)
1379 {
1380         return iter->uptodate >= BTREE_ITER_NEED_RELOCK
1381                 ? __bch2_btree_iter_traverse(iter)
1382                 : 0;
1383 }
1384
1385 int __must_check
1386 bch2_btree_iter_traverse(struct btree_iter *iter)
1387 {
1388         btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
1389
1390         return btree_iter_traverse(iter);
1391 }
1392
1393 /* Iterate across nodes (leaf and interior nodes) */
1394
1395 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1396 {
1397         struct btree *b;
1398         int ret;
1399
1400         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1401         bch2_btree_iter_verify(iter);
1402
1403         ret = btree_iter_traverse(iter);
1404         if (ret)
1405                 return NULL;
1406
1407         b = btree_iter_node(iter, iter->level);
1408         if (!b)
1409                 return NULL;
1410
1411         BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
1412
1413         iter->pos = iter->real_pos = b->key.k.p;
1414
1415         bch2_btree_iter_verify(iter);
1416
1417         return b;
1418 }
1419
1420 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1421 {
1422         struct btree *b;
1423         int ret;
1424
1425         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1426         bch2_btree_iter_verify(iter);
1427
1428         /* already got to end? */
1429         if (!btree_iter_node(iter, iter->level))
1430                 return NULL;
1431
1432         bch2_trans_cond_resched(iter->trans);
1433
1434         btree_node_unlock(iter, iter->level);
1435         iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
1436         iter->level++;
1437
1438         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1439         ret = btree_iter_traverse(iter);
1440         if (ret)
1441                 return NULL;
1442
1443         /* got to end? */
1444         b = btree_iter_node(iter, iter->level);
1445         if (!b)
1446                 return NULL;
1447
1448         if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
1449                 /*
1450                  * Haven't gotten to the end of the parent node: go back down to
1451                  * the next child node
1452                  */
1453                 btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
1454
1455                 /* Unlock to avoid screwing up our lock invariants: */
1456                 btree_node_unlock(iter, iter->level);
1457
1458                 iter->level = iter->min_depth;
1459                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1460                 bch2_btree_iter_verify(iter);
1461
1462                 ret = btree_iter_traverse(iter);
1463                 if (ret)
1464                         return NULL;
1465
1466                 b = iter->l[iter->level].b;
1467         }
1468
1469         iter->pos = iter->real_pos = b->key.k.p;
1470
1471         bch2_btree_iter_verify(iter);
1472
1473         return b;
1474 }
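/*
 * Illustrative sketch (added): walking every node of a btree with the two
 * functions above - roughly what the for_each_btree_node() macro in
 * btree_iter.h expands to:
 *
 *	for (b = bch2_btree_iter_peek_node(iter);
 *	     b;
 *	     b = bch2_btree_iter_next_node(iter))
 *		visit(b);	/* visit() is a placeholder */
 */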
1475
1476 /* Iterate across keys (in leaf nodes only) */
1477
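/*
 * Reposition the iterator at @new_pos: walk up until we're at a node that
 * covers the new position, then advance that level's node iterator - or
 * reinitialize it if we'd have to skip too far, or are moving backwards.
 */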
1478 static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
1479 {
1480         int cmp = bpos_cmp(new_pos, iter->real_pos);
1481         unsigned l = iter->level;
1482
1483         if (!cmp)
1484                 goto out;
1485
1486         iter->real_pos = new_pos;
1487
1488         if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
1489                 btree_node_unlock(iter, 0);
1490                 iter->l[0].b = BTREE_ITER_NO_NODE_UP;
1491                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1492                 return;
1493         }
1494
1495         l = btree_iter_up_until_good_node(iter, cmp);
1496
1497         if (btree_iter_node(iter, l)) {
1498                 /*
1499                  * We might have to skip over many keys, or just a few: try
1500                  * advancing the node iterator, and if we have to skip over too
1501                  * many keys just reinit it (or if we're rewinding, since that
1502                  * is expensive).
1503                  */
1504                 if (cmp < 0 ||
1505                     !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
1506                         __btree_iter_init(iter, l);
1507
1508                 /* Don't leave it locked if we're not supposed to: */
1509                 if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
1510                         btree_node_unlock(iter, l);
1511         }
1512 out:
1513         if (l != iter->level)
1514                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1515         else
1516                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1517
1518         bch2_btree_iter_verify(iter);
1519 }
1520
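/*
 * Advance/rewind the iterator's position past the key it most recently
 * returned (iter->k); returns false if we were already at the end (or start)
 * of the keyspace. For extents, iter->k.p already points at the start of the
 * next range, so the position isn't advanced further.
 */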
1521 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1522 {
1523         struct bpos pos = iter->k.p;
1524         bool ret = bpos_cmp(pos, POS_MAX) != 0;
1525
1526         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1527                 pos = bkey_successor(iter, pos);
1528         bch2_btree_iter_set_pos(iter, pos);
1529         return ret;
1530 }
1531
1532 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1533 {
1534         struct bpos pos = bkey_start_pos(&iter->k);
1535         bool ret = bpos_cmp(pos, POS_MIN) != 0;
1536
1537         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1538                 pos = bkey_predecessor(iter, pos);
1539         bch2_btree_iter_set_pos(iter, pos);
1540         return ret;
1541 }
1542
1543 static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
1544 {
1545         struct bpos next_pos = iter->l[0].b->key.k.p;
1546         bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
1547
1548         /*
1549          * Typically, we don't want to modify iter->pos here, since that
1550          * indicates where we searched from - unless we got to the end of the
1551          * btree, in which case we want iter->pos to reflect that:
1552          */
1553         if (ret)
1554                 btree_iter_set_search_pos(iter, bpos_successor(next_pos));
1555         else
1556                 bch2_btree_iter_set_pos(iter, POS_MAX);
1557
1558         return ret;
1559 }
1560
1561 static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
1562 {
1563         struct bpos next_pos = iter->l[0].b->data->min_key;
1564         bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
1565
1566         if (ret)
1567                 btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
1568         else
1569                 bch2_btree_iter_set_pos(iter, POS_MIN);
1570
1571         return ret;
1572 }
1573
1574 static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
1575                                                enum btree_id btree_id, struct bpos pos)
1576 {
1577         struct btree_insert_entry *i;
1578
1579         trans_for_each_update2(trans, i)
1580                 if ((cmp_int(btree_id,  i->iter->btree_id) ?:
1581                      bkey_cmp(pos,      i->k->k.p)) <= 0) {
1582                         if (btree_id == i->iter->btree_id)
1583                                 return i->k;
1584                         break;
1585                 }
1586
1587         return NULL;
1588 }
1589
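/*
 * Core lookup loop for bch2_btree_iter_peek() and
 * bch2_btree_iter_peek_with_updates(): reposition at the search key,
 * traverse, and peek at the leaf level - folding in pending updates from the
 * current transaction if @with_updates - skipping over deleted keys
 * (whiteouts) and stepping to the next leaf until we find a live key or run
 * out of btree.
 */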
1590 static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates)
1591 {
1592         struct bpos search_key = btree_iter_search_key(iter);
1593         struct bkey_i *next_update = with_updates
1594                 ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key)
1595                 : NULL;
1596         struct bkey_s_c k;
1597         int ret;
1598
1599         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1600         bch2_btree_iter_verify(iter);
1601         bch2_btree_iter_verify_entry_exit(iter);
1602
1603         btree_iter_set_search_pos(iter, search_key);
1604
1605         while (1) {
1606                 ret = btree_iter_traverse(iter);
1607                 if (unlikely(ret))
1608                         return bkey_s_c_err(ret);
1609
1610                 k = btree_iter_level_peek(iter, &iter->l[0]);
1611
1612                 if (next_update &&
1613                     bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
1614                         k = bkey_i_to_s_c(next_update);
1615
1616                 if (likely(k.k)) {
1617                         if (bkey_deleted(k.k)) {
1618                                 btree_iter_set_search_pos(iter,
1619                                                 bkey_successor(iter, k.k->p));
1620                                 continue;
1621                         }
1622
1623                         break;
1624                 }
1625
1626                 if (!btree_iter_set_pos_to_next_leaf(iter))
1627                         return bkey_s_c_null;
1628         }
1629
1630         /*
1631          * iter->pos should be monotonically increasing, and always be equal to
1632          * the key we just returned - except extents can straddle iter->pos:
1633          */
1634         if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1635                 iter->pos = bkey_start_pos(k.k);
1636
1637         bch2_btree_iter_verify_entry_exit(iter);
1638         bch2_btree_iter_verify(iter);
1639         return k;
1640 }
1641
1642 /**
1643  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
1644  * current position
1645  */
1646 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1647 {
1648         return __btree_iter_peek(iter, false);
1649 }
1650
1651 /**
1652  * bch2_btree_iter_next: returns first key greater than iterator's current
1653  * position
1654  */
1655 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1656 {
1657         if (!bch2_btree_iter_advance(iter))
1658                 return bkey_s_c_null;
1659
1660         return bch2_btree_iter_peek(iter);
1661 }
1662
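/**
 * bch2_btree_iter_peek_with_updates: like bch2_btree_iter_peek(), except keys
 * from pending (not yet committed) updates in the current transaction are
 * also visible.
 */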
1663 struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
1664 {
1665         return __btree_iter_peek(iter, true);
1666 }
1667
1668 struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
1669 {
1670         if (!bch2_btree_iter_advance(iter))
1671                 return bkey_s_c_null;
1672
1673         return bch2_btree_iter_peek_with_updates(iter);
1674 }
1675
1676 /**
1677  * bch2_btree_iter_peek_prev: returns first key less than or equal to
1678  * iterator's current position
1679  */
1680 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
1681 {
1682         struct btree_iter_level *l = &iter->l[0];
1683         struct bkey_s_c k;
1684         int ret;
1685
1686         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1687         bch2_btree_iter_verify(iter);
1688         bch2_btree_iter_verify_entry_exit(iter);
1689
1690         btree_iter_set_search_pos(iter, iter->pos);
1691
1692         while (1) {
1693                 ret = btree_iter_traverse(iter);
1694                 if (unlikely(ret)) {
1695                         k = bkey_s_c_err(ret);
1696                         goto no_key;
1697                 }
1698
1699                 k = btree_iter_level_peek(iter, l);
1700                 if (!k.k ||
1701                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
1702                      ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
1703                      : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0))
1704                         k = btree_iter_level_prev(iter, l);
1705
1706                 if (likely(k.k))
1707                         break;
1708
1709                 if (!btree_iter_set_pos_to_prev_leaf(iter)) {
1710                         k = bkey_s_c_null;
1711                         goto no_key;
1712                 }
1713         }
1714
1715         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1716
1717         /* Extents can straddle iter->pos: */
1718         if (bkey_cmp(k.k->p, iter->pos) < 0)
1719                 iter->pos = k.k->p;
1720 out:
1721         bch2_btree_iter_verify_entry_exit(iter);
1722         bch2_btree_iter_verify(iter);
1723         return k;
1724 no_key:
1725         /*
1726          * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
1727          * then we errored going to the previous leaf - make sure it's
1728          * consistent with iter->pos:
1729          */
1730         bkey_init(&iter->k);
1731         iter->k.p = iter->pos;
1732         goto out;
1733 }
1734
1735 /**
1736  * bch2_btree_iter_prev: returns first key less than iterator's current
1737  * position
1738  */
1739 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1740 {
1741         if (!bch2_btree_iter_rewind(iter))
1742                 return bkey_s_c_null;
1743
1744         return bch2_btree_iter_peek_prev(iter);
1745 }
1746
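/*
 * Slot iteration for extents: there is conceptually a key at every offset, so
 * if no extent covers iter->pos we synthesize a deleted "hole" key in
 * iter->k, sized to reach the next extent (or the end of the inode).
 */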
1747 static inline struct bkey_s_c
1748 __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1749 {
1750         struct bkey_s_c k;
1751         struct bpos pos, next_start;
1752
1753         /* keys & holes can't span inode numbers: */
1754         if (iter->pos.offset == KEY_OFFSET_MAX) {
1755                 if (iter->pos.inode == KEY_INODE_MAX)
1756                         return bkey_s_c_null;
1757
1758                 bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
1759         }
1760
1761         pos = iter->pos;
1762         k = bch2_btree_iter_peek(iter);
1763         iter->pos = pos;
1764
1765         if (bkey_err(k))
1766                 return k;
1767
1768         if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
1769                 return k;
1770
1771         next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
1772
1773         bkey_init(&iter->k);
1774         iter->k.p = iter->pos;
1775         bch2_key_resize(&iter->k,
1776                         min_t(u64, KEY_SIZE_MAX,
1777                               (next_start.inode == iter->pos.inode
1778                                ? next_start.offset
1779                                : KEY_OFFSET_MAX) -
1780                               iter->pos.offset));
1781
1782         EBUG_ON(!iter->k.size);
1783
1784         bch2_btree_iter_verify_entry_exit(iter);
1785         bch2_btree_iter_verify(iter);
1786
1787         return (struct bkey_s_c) { &iter->k, NULL };
1788 }
1789
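/**
 * bch2_btree_iter_peek_slot: returns the key at the iterator's current
 * position; if no key exists there, a synthetic deleted key for that slot is
 * returned instead of advancing to the next key.
 */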
1790 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1791 {
1792         struct btree_iter_level *l = &iter->l[0];
1793         struct bkey_s_c k;
1794         int ret;
1795
1796         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1797         bch2_btree_iter_verify(iter);
1798         bch2_btree_iter_verify_entry_exit(iter);
1799
1800         btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
1801
1802         if (iter->flags & BTREE_ITER_IS_EXTENTS)
1803                 return __bch2_btree_iter_peek_slot_extents(iter);
1804
1805         ret = btree_iter_traverse(iter);
1806         if (unlikely(ret))
1807                 return bkey_s_c_err(ret);
1808
1809         k = btree_iter_level_peek_all(iter, l, &iter->k);
1810
1811         EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
1812
1813         if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
1814                 /* hole */
1815                 bkey_init(&iter->k);
1816                 iter->k.p = iter->pos;
1817                 k = (struct bkey_s_c) { &iter->k, NULL };
1818         }
1819
1820         bch2_btree_iter_verify_entry_exit(iter);
1821         bch2_btree_iter_verify(iter);
1822         return k;
1823 }
1824
1825 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1826 {
1827         if (!bch2_btree_iter_advance(iter))
1828                 return bkey_s_c_null;
1829
1830         return bch2_btree_iter_peek_slot(iter);
1831 }
1832
1833 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
1834 {
1835         if (!bch2_btree_iter_rewind(iter))
1836                 return bkey_s_c_null;
1837
1838         return bch2_btree_iter_peek_slot(iter);
1839 }
1840
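/**
 * bch2_btree_iter_peek_cached: for BTREE_ITER_CACHED iterators - returns the
 * key from the btree key cache entry at the iterator's position.
 */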
1841 struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
1842 {
1843         struct bkey_cached *ck;
1844         int ret;
1845
1846         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
1847         bch2_btree_iter_verify(iter);
1848
1849         ret = btree_iter_traverse(iter);
1850         if (unlikely(ret))
1851                 return bkey_s_c_err(ret);
1852
1853         ck = (void *) iter->l[0].b;
1854
1855         EBUG_ON(iter->btree_id != ck->key.btree_id ||
1856                 bkey_cmp(iter->pos, ck->key.pos));
1857         BUG_ON(!ck->valid);
1858
1859         return bkey_i_to_s_c(ck->k);
1860 }
1861
1862 static inline void bch2_btree_iter_init(struct btree_trans *trans,
1863                         struct btree_iter *iter, enum btree_id btree_id)
1864 {
1865         struct bch_fs *c = trans->c;
1866         unsigned i;
1867
1868         iter->trans                     = trans;
1869         iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1870         iter->btree_id                  = btree_id;
1871         iter->level                     = 0;
1872         iter->min_depth                 = 0;
1873         iter->locks_want                = 0;
1874         iter->nodes_locked              = 0;
1875         iter->nodes_intent_locked       = 0;
1876         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1877                 iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
1878
1879         prefetch(c->btree_roots[btree_id].b);
1880 }
1881
1882 /* new transactional stuff: */
1883
1884 static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1885                                           unsigned idx)
1886 {
1887         __bch2_btree_iter_unlock(&trans->iters[idx]);
1888         trans->iters_linked             &= ~(1ULL << idx);
1889         trans->iters_live               &= ~(1ULL << idx);
1890         trans->iters_touched            &= ~(1ULL << idx);
1891 }
1892
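/*
 * bch2_trans_iter_put() drops the caller's reference: if the iterator has
 * been touched since the last transaction reset (or is flagged
 * KEEP_UNTIL_COMMIT) it stays linked so its locks can be reused by a later
 * get_iter(); otherwise it's freed right away. bch2_trans_iter_free() clears
 * the touched bit first, so the iterator is released immediately.
 */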
1893 int bch2_trans_iter_put(struct btree_trans *trans,
1894                         struct btree_iter *iter)
1895 {
1896         int ret;
1897
1898         if (IS_ERR_OR_NULL(iter))
1899                 return 0;
1900
1901         BUG_ON(trans->iters + iter->idx != iter);
1902         BUG_ON(!btree_iter_live(trans, iter));
1903
1904         ret = btree_iter_err(iter);
1905
1906         if (!(trans->iters_touched & (1ULL << iter->idx)) &&
1907             !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
1908                 __bch2_trans_iter_free(trans, iter->idx);
1909
1910         trans->iters_live       &= ~(1ULL << iter->idx);
1911         return ret;
1912 }
1913
1914 int bch2_trans_iter_free(struct btree_trans *trans,
1915                          struct btree_iter *iter)
1916 {
1917         if (IS_ERR_OR_NULL(iter))
1918                 return 0;
1919
1920         set_btree_iter_dontneed(trans, iter);
1921
1922         return bch2_trans_iter_put(trans, iter);
1923 }
1924
1925 noinline __cold
1926 static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
1927 {
1929         struct btree_iter *iter;
1930         struct btree_insert_entry *i;
1931         char buf[100];
1932
1933         trans_for_each_iter(trans, iter)
1934                 printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
1935                        bch2_btree_ids[iter->btree_id],
1936                        (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf),
1937                        btree_iter_live(trans, iter) ? " live" : "",
1938                        (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
1939                        iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
1940                        (void *) iter->ip_allocated);
1941
1942         trans_for_each_update(trans, i) {
1943                 char buf[300];
1944
1945                 bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
1946                 printk(KERN_ERR "update: btree %s %s\n",
1947                        bch2_btree_ids[i->iter->btree_id], buf);
1948         }
1949         panic("trans iter overflow\n");
1950 }
1951
1952 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
1953 {
1954         unsigned idx;
1955
1956         if (unlikely(trans->iters_linked ==
1957                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
1958                 btree_trans_iter_alloc_fail(trans);
1959
1960         idx = __ffs64(~trans->iters_linked);
1961
1962         trans->iters_linked     |= 1ULL << idx;
1963         trans->iters[idx].idx    = idx;
1964         trans->iters[idx].flags  = 0;
1965         return &trans->iters[idx];
1966 }
1967
1968 static inline void btree_iter_copy(struct btree_iter *dst,
1969                                    struct btree_iter *src)
1970 {
1971         unsigned i, idx = dst->idx;
1972
1973         *dst = *src;
1974         dst->idx = idx;
1975         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
1976
1977         for (i = 0; i < BTREE_MAX_DEPTH; i++)
1978                 if (btree_node_locked(dst, i))
1979                         six_lock_increment(&dst->l[i].b->c.lock,
1980                                            __btree_lock_want(dst, i));
1981
1982         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
1983         dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
1984 }
1985
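/*
 * Get an iterator for @btree_id at @pos within this transaction: we prefer to
 * reuse (or copy) an existing iterator on the same btree whose position is
 * closest to @pos, so that its node locks can be reused; otherwise a fresh
 * iterator is allocated.
 *
 * Minimal usage sketch - assumes the bch2_trans_get_iter() wrapper from
 * btree_iter.h, error handling omitted:
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 *
 *	iter = bch2_trans_get_iter(&trans, btree_id, pos, BTREE_ITER_INTENT);
 *	k = bch2_btree_iter_peek(iter);
 *	...
 *	bch2_trans_iter_put(&trans, iter);
 *
 *	bch2_trans_exit(&trans);
 */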
1986 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
1987                                          unsigned btree_id, struct bpos pos,
1988                                          unsigned locks_want,
1989                                          unsigned depth,
1990                                          unsigned flags)
1991 {
1992         struct btree_iter *iter, *best = NULL;
1993
1994         if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
1995             !btree_type_has_snapshots(btree_id))
1996                 flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
1997
1998         if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
1999                 pos.snapshot = btree_type_has_snapshots(btree_id)
2000                         ? U32_MAX : 0;
2001
2002         trans_for_each_iter(trans, iter) {
2003                 if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
2004                         continue;
2005
2006                 if (iter->btree_id != btree_id)
2007                         continue;
2008
2009                 if (best &&
2010                     bkey_cmp(bpos_diff(best->real_pos, pos),
2011                              bpos_diff(iter->real_pos, pos)) > 0)
2012                         continue;
2013
2014                 best = iter;
2015         }
2016
2017         if (!best) {
2018                 iter = btree_trans_iter_alloc(trans);
2019                 bch2_btree_iter_init(trans, iter, btree_id);
2020         } else if (btree_iter_keep(trans, best)) {
2021                 iter = btree_trans_iter_alloc(trans);
2022                 btree_iter_copy(iter, best);
2023         } else {
2024                 iter = best;
2025         }
2026
2027         trans->iters_live       |= 1ULL << iter->idx;
2028         trans->iters_touched    |= 1ULL << iter->idx;
2029
2030         if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
2031             btree_node_type_is_extents(btree_id) &&
2032             !(flags & BTREE_ITER_NOT_EXTENTS) &&
2033             !(flags & BTREE_ITER_ALL_SNAPSHOTS))
2034                 flags |= BTREE_ITER_IS_EXTENTS;
2035
2036         iter->flags = flags;
2037
2038         iter->snapshot = pos.snapshot;
2039
2040         locks_want = min(locks_want, BTREE_MAX_DEPTH);
2041
2042         if (locks_want > iter->locks_want) {
2043                 iter->locks_want = locks_want;
2044                 btree_iter_get_locks(iter, true, false);
2045         } else if (locks_want < iter->locks_want) {
2046                 __bch2_btree_iter_downgrade(iter, locks_want);
2047         }
2048
2049         while (iter->level < depth) {
2050                 btree_node_unlock(iter, iter->level);
2051                 iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
2052                 iter->level++;
2053         }
2054
2055         while (iter->level > depth)
2056                 iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT;
2057
2058         iter->min_depth = depth;
2059
2060         bch2_btree_iter_set_pos(iter, pos);
2061         btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
2062
2063         return iter;
2064 }
2065
2066 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
2067                                             enum btree_id btree_id,
2068                                             struct bpos pos,
2069                                             unsigned locks_want,
2070                                             unsigned depth,
2071                                             unsigned flags)
2072 {
2073         struct btree_iter *iter =
2074                 __bch2_trans_get_iter(trans, btree_id, pos,
2075                                       locks_want, depth,
2076                                       BTREE_ITER_NODES|
2077                                       BTREE_ITER_NOT_EXTENTS|
2078                                       BTREE_ITER_ALL_SNAPSHOTS|
2079                                       flags);
2080
2081         BUG_ON(bkey_cmp(iter->pos, pos));
2082         BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
2083         BUG_ON(iter->level      != depth);
2084         BUG_ON(iter->min_depth  != depth);
2085         iter->ip_allocated = _RET_IP_;
2086
2087         return iter;
2088 }
2089
2090 struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
2091                                         struct btree_iter *src)
2092 {
2093         struct btree_iter *iter;
2094
2095         iter = btree_trans_iter_alloc(trans);
2096         btree_iter_copy(iter, src);
2097
2098         trans->iters_live |= 1ULL << iter->idx;
2099         /*
2100          * We don't need to preserve this iter since it's cheap to copy it
2101          * again - this will cause trans_iter_put() to free it right away:
2102          */
2103         set_btree_iter_dontneed(trans, iter);
2104
2105         return iter;
2106 }
2107
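/*
 * Transaction-lifetime memory: bch2_trans_kmalloc() is a simple bump
 * allocator out of trans->mem, all of which is released at transaction
 * reset/exit. If the buffer has to be grown after allocations have already
 * been handed out, we return -EINTR to force a transaction restart, since
 * krealloc() may have moved the buffer and invalidated earlier pointers.
 */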
2108 static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
2109 {
2110         if (size > trans->mem_bytes) {
2111                 size_t old_bytes = trans->mem_bytes;
2112                 size_t new_bytes = roundup_pow_of_two(size);
2113                 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2114
2115                 if (!new_mem)
2116                         return -ENOMEM;
2117
2118                 trans->mem = new_mem;
2119                 trans->mem_bytes = new_bytes;
2120
2121                 if (old_bytes) {
2122                         trace_trans_restart_mem_realloced(trans->ip, new_bytes);
2123                         return -EINTR;
2124                 }
2125         }
2126
2127         return 0;
2128 }
2129
2130 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2131 {
2132         void *p;
2133         int ret;
2134
2135         ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
2136         if (ret)
2137                 return ERR_PTR(ret);
2138
2139         p = trans->mem + trans->mem_top;
2140         trans->mem_top += size;
2141         return p;
2142 }
2143
2144 inline void bch2_trans_unlink_iters(struct btree_trans *trans)
2145 {
2146         u64 iters = trans->iters_linked &
2147                 ~trans->iters_touched &
2148                 ~trans->iters_live;
2149
2150         while (iters) {
2151                 unsigned idx = __ffs64(iters);
2152
2153                 iters &= ~(1ULL << idx);
2154                 __bch2_trans_iter_free(trans, idx);
2155         }
2156 }
2157
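/*
 * Reset a transaction after a commit or restart: pending updates and memory
 * from bch2_trans_kmalloc() are dropped, and iterators that are neither live
 * nor touched are freed. TRANS_RESET_NOUNLOCK and TRANS_RESET_NOTRAVERSE skip
 * the cond_resched/unlock and the re-traverse of remaining iterators,
 * respectively.
 */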
2158 void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
2159 {
2160         struct btree_iter *iter;
2161
2162         trans_for_each_iter(trans, iter)
2163                 iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
2164                                  BTREE_ITER_SET_POS_AFTER_COMMIT);
2165
2166         bch2_trans_unlink_iters(trans);
2167
2168         trans->iters_touched &= trans->iters_live;
2169
2170         trans->nr_updates               = 0;
2171         trans->nr_updates2              = 0;
2172         trans->mem_top                  = 0;
2173
2174         trans->hooks                    = NULL;
2175         trans->extra_journal_entries    = NULL;
2176         trans->extra_journal_entry_u64s = 0;
2177
2178         if (trans->fs_usage_deltas) {
2179                 trans->fs_usage_deltas->used = 0;
2180                 memset(&trans->fs_usage_deltas->memset_start, 0,
2181                        (void *) &trans->fs_usage_deltas->memset_end -
2182                        (void *) &trans->fs_usage_deltas->memset_start);
2183         }
2184
2185         if (!(flags & TRANS_RESET_NOUNLOCK))
2186                 bch2_trans_cond_resched(trans);
2187
2188         if (!(flags & TRANS_RESET_NOTRAVERSE))
2189                 bch2_btree_iter_traverse_all(trans);
2190 }
2191
2192 static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
2193 {
2194         size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
2195         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2196         void *p = NULL;
2197
2198         BUG_ON(trans->used_mempool);
2199
2200 #ifdef __KERNEL__
2201         p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
2202 #endif
2203         if (!p)
2204                 p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
2205
2206         trans->iters            = p; p += iters_bytes;
2207         trans->updates          = p; p += updates_bytes;
2208         trans->updates2         = p; p += updates_bytes;
2209 }
2210
2211 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
2212                      unsigned expected_nr_iters,
2213                      size_t expected_mem_bytes)
2214 {
2215         memset(trans, 0, sizeof(*trans));
2216         trans->c                = c;
2217         trans->ip               = _RET_IP_;
2218
2219         /*
2220          * reallocating iterators currently completely breaks
2221          * bch2_trans_iter_put(), so we always allocate the max:
2222          */
2223         bch2_trans_alloc_iters(trans, c);
2224
2225         if (expected_mem_bytes) {
2226                 trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
2227                 trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
2228         }
2229
2230         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2231
2232 #ifdef CONFIG_BCACHEFS_DEBUG
2233         trans->pid = current->pid;
2234         mutex_lock(&c->btree_trans_lock);
2235         list_add(&trans->list, &c->btree_trans_list);
2236         mutex_unlock(&c->btree_trans_lock);
2237 #endif
2238 }
2239
2240 int bch2_trans_exit(struct btree_trans *trans)
2241 {
2242         struct bch_fs *c = trans->c;
2243
2244         bch2_trans_unlock(trans);
2245
2246 #ifdef CONFIG_BCACHEFS_DEBUG
2247         if (trans->iters_live) {
2248                 struct btree_iter *iter;
2249
2250                 bch_err(c, "btree iterators leaked!");
2251                 trans_for_each_iter(trans, iter)
2252                         if (btree_iter_live(trans, iter))
2253                                 printk(KERN_ERR "  btree %s allocated at %pS\n",
2254                                        bch2_btree_ids[iter->btree_id],
2255                                        (void *) iter->ip_allocated);
2256                 /* Be noisy about this: */
2257                 bch2_fatal_error(c);
2258         }
2259
2260         mutex_lock(&trans->c->btree_trans_lock);
2261         list_del(&trans->list);
2262         mutex_unlock(&trans->c->btree_trans_lock);
2263 #endif
2264
2265         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2266
2267         bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
2268
2269         kfree(trans->fs_usage_deltas);
2270         kfree(trans->mem);
2271
2272 #ifdef __KERNEL__
2273         /*
2274          * Userspace doesn't have a real percpu implementation:
2275          */
2276         trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
2277 #endif
2278         if (trans->iters)
2279                 mempool_free(trans->iters, &trans->c->btree_iters_pool);
2280
2281         trans->mem      = (void *) 0x1;
2282         trans->iters    = (void *) 0x1;
2283
2284         return trans->error ? -EIO : 0;
2285 }
2286
2287 static void __maybe_unused
2288 bch2_btree_iter_node_to_text(struct printbuf *out,
2289                              struct btree_bkey_cached_common *_b,
2290                              enum btree_iter_type type)
2291 {
2292         pr_buf(out, "    l=%u %s:",
2293                _b->level, bch2_btree_ids[_b->btree_id]);
2294         bch2_bpos_to_text(out, btree_node_pos(_b, type));
2295 }
2296
2297 #ifdef CONFIG_BCACHEFS_DEBUG
2298 static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
2299 {
2300         struct btree_iter *iter;
2301
2302         trans_for_each_iter(trans, iter)
2303                 if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
2304                     iter->nodes_locked)
2305                         return true;
2306         return false;
2307 }
2308 #endif
2309
2310 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
2311 {
2312 #ifdef CONFIG_BCACHEFS_DEBUG
2313         struct btree_trans *trans;
2314         struct btree_iter *iter;
2315         struct btree *b;
2316         unsigned l;
2317
2318         mutex_lock(&c->btree_trans_lock);
2319         list_for_each_entry(trans, &c->btree_trans_list, list) {
2320                 if (!trans_has_btree_nodes_locked(trans))
2321                         continue;
2322
2323                 pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
2324
2325                 trans_for_each_iter(trans, iter) {
2326                         if (!iter->nodes_locked)
2327                                 continue;
2328
2329                         pr_buf(out, "  iter %u %c %s:",
2330                                iter->idx,
2331                                btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
2332                                bch2_btree_ids[iter->btree_id]);
2333                         bch2_bpos_to_text(out, iter->pos);
2334                         pr_buf(out, "\n");
2335
2336                         for (l = 0; l < BTREE_MAX_DEPTH; l++) {
2337                                 if (btree_node_locked(iter, l)) {
2338                                         pr_buf(out, "    %s l=%u ",
2339                                                btree_node_intent_locked(iter, l) ? "i" : "r", l);
2340                                         bch2_btree_iter_node_to_text(out,
2341                                                         (void *) iter->l[l].b,
2342                                                         btree_iter_type(iter));
2343                                         pr_buf(out, "\n");
2344                                 }
2345                         }
2346                 }
2347
2348                 b = READ_ONCE(trans->locking);
2349                 if (b) {
2350                         iter = &trans->iters[trans->locking_iter_idx];
2351                         pr_buf(out, "  locking iter %u %c l=%u %s:",
2352                                trans->locking_iter_idx,
2353                                btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
2354                                trans->locking_level,
2355                                bch2_btree_ids[trans->locking_btree_id]);
2356                         bch2_bpos_to_text(out, trans->locking_pos);
2357
2358                         pr_buf(out, " node ");
2359                         bch2_btree_iter_node_to_text(out,
2360                                         (void *) b,
2361                                         btree_iter_type(iter));
2362                         pr_buf(out, "\n");
2363                 }
2364         }
2365         mutex_unlock(&c->btree_trans_lock);
2366 #endif
2367 }
2368
2369 void bch2_fs_btree_iter_exit(struct bch_fs *c)
2370 {
2371         mempool_exit(&c->btree_iters_pool);
2372         cleanup_srcu_struct(&c->btree_trans_barrier);
2373 }
2374
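/*
 * The btree_iters_pool allocation matches the layout carved up in
 * bch2_trans_alloc_iters(): one array of iterators plus two arrays of insert
 * entries (updates and updates2), each sized for BTREE_ITER_MAX.
 */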
2375 int bch2_fs_btree_iter_init(struct bch_fs *c)
2376 {
2377         unsigned nr = BTREE_ITER_MAX;
2378
2379         INIT_LIST_HEAD(&c->btree_trans_list);
2380         mutex_init(&c->btree_trans_lock);
2381
2382         return  init_srcu_struct(&c->btree_trans_barrier) ?:
2383                 mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
2384                         sizeof(struct btree_iter) * nr +
2385                         sizeof(struct btree_insert_entry) * nr +
2386                         sizeof(struct btree_insert_entry) * nr);
2387 }