1
2 #include "bcachefs.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_iter.h"
6 #include "btree_locking.h"
7 #include "debug.h"
8 #include "extents.h"
9
10 #include <linux/prefetch.h>
11 #include <trace/events/bcachefs.h>
12
13 static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
14                                                     struct btree_iter_level *,
15                                                     struct bkey *);
16
17 #define BTREE_ITER_NOT_END      ((struct btree *) 1)
18
19 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
20 {
21         return l < BTREE_MAX_DEPTH &&
22                 iter->l[l].b &&
23                 iter->l[l].b != BTREE_ITER_NOT_END;
24 }
25
26 /* Returns < 0 if @k is before iter pos, > 0 if @k is after */
27 static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
28                                        const struct btree *b,
29                                        const struct bkey_packed *k,
30                                        bool interior_node)
31 {
32         int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
33
34         if (cmp)
35                 return cmp;
36         if (bkey_deleted(k))
37                 return -1;
38
39         /*
40          * Normally, for extents we want the first key strictly greater than
41          * the iterator position - with the exception that for interior nodes,
42          * we don't want to advance past the last key if the iterator position
43          * is POS_MAX:
44          */
45         if (iter->flags & BTREE_ITER_IS_EXTENTS &&
46             (!interior_node ||
47              bkey_cmp_left_packed_byval(b, k, POS_MAX)))
48                 return -1;
49         return 1;
50 }
51
52 static inline int btree_iter_pos_cmp(struct btree_iter *iter,
53                                      const struct btree *b,
54                                      const struct bkey_packed *k)
55 {
56         return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
57 }
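
/*
 * Illustrative example (hypothetical offsets, leaf node, extents btree): an
 * extent's position is its end, so with iter->pos at offset 5 a key with
 * k->p.offset == 5 compares equal above and then falls through to the
 * extents case, returning -1 ("before"), while a key with k->p.offset == 6
 * returns 1 ("after").  Peeking therefore lands on the first extent ending
 * strictly after the search position.
 */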
58
59 /* Btree node locking: */
60
61 /*
62  * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
63  * succeed:
64  */
65 void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
66 {
67         struct btree_iter *linked;
68
69         EBUG_ON(iter->l[b->level].b != b);
70         EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
71
72         trans_for_each_iter_with_node(iter->trans, b, linked)
73                 linked->l[b->level].lock_seq += 2;
74
75         six_unlock_write(&b->lock);
76 }
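
/*
 * A rough sketch of the sequence number scheme implied by the assertions
 * above (a reading of the code, not a statement from the original comments):
 * taking the write lock bumps b->lock.state.seq to an odd value and dropping
 * it bumps it again, so one write lock/unlock cycle advances the seq by two:
 *
 *	seq when the iterators last locked the node:	N (even)
 *	seq while write locked:				N + 1 (the EBUG_ON above)
 *	seq after six_unlock_write():			N + 2
 *
 * bch2_btree_node_relock() only succeeds when the saved lock_seq matches the
 * node's current seq, hence the += 2 on every linked iterator.
 */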
77
78 void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
79 {
80         struct btree_iter *linked;
81         unsigned readers = 0;
82
83         EBUG_ON(btree_node_read_locked(iter, b->level));
84
85         trans_for_each_iter(iter->trans, linked)
86                 if (linked->l[b->level].b == b &&
87                     btree_node_read_locked(linked, b->level))
88                         readers++;
89
90         /*
91          * Must drop our read locks before calling six_lock_write() -
92          * six_unlock() won't do wakeups until the reader count
93          * goes to 0, and it's safe because we have the node intent
94          * locked:
95          */
96         atomic64_sub(__SIX_VAL(read_lock, readers),
97                      &b->lock.state.counter);
98         btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
99         atomic64_add(__SIX_VAL(read_lock, readers),
100                      &b->lock.state.counter);
101 }
102
103 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
104 {
105         struct btree *b = btree_iter_node(iter, level);
106         int want = __btree_lock_want(iter, level);
107
108         if (!b || b == BTREE_ITER_NOT_END)
109                 return false;
110
111         if (race_fault())
112                 return false;
113
114         if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
115             !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
116               btree_node_lock_increment(iter, b, level, want)))
117                 return false;
118
119         mark_btree_node_locked(iter, level, want);
120         return true;
121 }
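
/*
 * A note on the fallback above (interpretation, not from the original
 * comments): lock_seq >> 1 == state.seq >> 1 compares the sequence numbers
 * with the low bit - which tracks whether the node is currently write
 * locked - masked off, i.e. "is this still the same generation of the
 * node?".  If so, and some other iterator in this transaction still holds
 * the node locked, btree_node_lock_increment() just bumps the existing six
 * lock count instead of taking the lock from scratch.
 */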
122
123 static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
124 {
125         struct btree *b = iter->l[level].b;
126
127         EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
128
129         if (!is_btree_node(iter, level))
130                 return false;
131
132         if (btree_node_intent_locked(iter, level))
133                 return true;
134
135         if (race_fault())
136                 return false;
137
138         if (btree_node_locked(iter, level)
139             ? six_lock_tryupgrade(&b->lock)
140             : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
141                 goto success;
142
143         if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
144             btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
145                 btree_node_unlock(iter, level);
146                 goto success;
147         }
148
149         return false;
150 success:
151         mark_btree_node_intent_locked(iter, level);
152         return true;
153 }
154
155 static inline bool btree_iter_get_locks(struct btree_iter *iter,
156                                         bool upgrade)
157 {
158         unsigned l = iter->level;
159         int fail_idx = -1;
160
161         do {
162                 if (!btree_iter_node(iter, l))
163                         break;
164
165                 if (!(upgrade
166                       ? bch2_btree_node_upgrade(iter, l)
167                       : bch2_btree_node_relock(iter, l))) {
168                         fail_idx = l;
169                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
170                 }
171
172                 l++;
173         } while (l < iter->locks_want);
174
175         /*
176          * When we fail to get a lock, we have to ensure that any child nodes
177          * can't be relocked so bch2_btree_iter_traverse has to walk back up to
178          * the node that we failed to relock:
179          */
180         while (fail_idx >= 0) {
181                 btree_node_unlock(iter, fail_idx);
182                 iter->l[fail_idx].b = BTREE_ITER_NOT_END;
183                 --fail_idx;
184         }
185
186         if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
187                 iter->uptodate = BTREE_ITER_NEED_PEEK;
188
189         bch2_btree_trans_verify_locks(iter->trans);
190
191         return iter->uptodate < BTREE_ITER_NEED_RELOCK;
192 }
193
194 /* Slowpath: */
195 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
196                            unsigned level,
197                            struct btree_iter *iter,
198                            enum six_lock_type type,
199                            bool may_drop_locks)
200 {
201         struct btree_iter *linked;
202         bool ret = true;
203
204         /* Check if it's safe to block: */
205         trans_for_each_iter(iter->trans, linked) {
206                 if (!linked->nodes_locked)
207                         continue;
208
209                 /* Must lock btree nodes in key order: */
210                 if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
211                         ret = false;
212
213                 /*
214                  * Can't block taking an intent lock if we have _any_ nodes read
215                  * locked:
216                  *
217                  * - Our read lock blocks another thread with an intent lock on
218                  *   the same node from getting a write lock, and thus from
219                  *   dropping its intent lock
220                  *
221                  * - And the other thread may have multiple nodes intent locked:
222                  *   both the node we want to intent lock, and the node we
223                  *   already have read locked - deadlock:
224                  */
225                 if (type == SIX_LOCK_intent &&
226                     linked->nodes_locked != linked->nodes_intent_locked) {
227                         if (may_drop_locks) {
228                                 linked->locks_want = max_t(unsigned,
229                                                 linked->locks_want,
230                                                 __fls(linked->nodes_locked) + 1);
231                                 btree_iter_get_locks(linked, true);
232                         }
233                         ret = false;
234                 }
235
236                 /*
237                  * Interior nodes must be locked before their descendants: if
238                  * another iterator has locked possible descendants of the node
239                  * we're about to lock, it must have the ancestors locked too:
240                  */
241                 if (linked->btree_id == iter->btree_id &&
242                     level > __fls(linked->nodes_locked)) {
243                         if (may_drop_locks) {
244                                 linked->locks_want =
245                                         max(level + 1, max_t(unsigned,
246                                             linked->locks_want,
247                                             iter->locks_want));
248                                 btree_iter_get_locks(linked, true);
249                         }
250                         ret = false;
251                 }
252         }
253
254         if (unlikely(!ret)) {
255                 trans_restart();
256                 trace_trans_restart_would_deadlock(iter->trans->c,
257                                                    iter->trans->ip);
258                 return false;
259         }
260
261         __btree_node_lock_type(iter->trans->c, b, type);
262         return true;
263 }
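
/*
 * A sketch of the ordering rule enforced above, with made-up positions: locks
 * are taken in (btree_id, pos) order, so if one iterator in the transaction
 * already holds a node for pos 100 and we now want to block on a node for
 * pos 50 in the same btree, we refuse, call trans_restart() and return false
 * (the caller then typically returns -EINTR) instead of blocking - blocking
 * could deadlock against another thread taking the same two locks in the
 * opposite order.
 */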
264
265 /* Btree iterator locking: */
266
267 #ifdef CONFIG_BCACHEFS_DEBUG
268 void bch2_btree_iter_verify_locks(struct btree_iter *iter)
269 {
270         unsigned l;
271
272         BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
273                !btree_node_locked(iter, 0));
274
275         for (l = 0; btree_iter_node(iter, l); l++) {
276                 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
277                     !btree_node_locked(iter, l))
278                         continue;
279
280                 BUG_ON(btree_lock_want(iter, l) !=
281                        btree_node_locked_type(iter, l));
282         }
283 }
284
285 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
286 {
287         struct btree_iter *iter;
288
289         trans_for_each_iter(trans, iter)
290                 bch2_btree_iter_verify_locks(iter);
291 }
292 #endif
293
294 __flatten
295 static bool bch2_btree_iter_relock(struct btree_iter *iter)
296 {
297         return iter->uptodate >= BTREE_ITER_NEED_RELOCK
298                 ? btree_iter_get_locks(iter, false)
299                 : true;
300 }
301
302 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
303                                unsigned new_locks_want)
304 {
305         struct btree_iter *linked;
306
307         EBUG_ON(iter->locks_want >= new_locks_want);
308
309         iter->locks_want = new_locks_want;
310
311         if (btree_iter_get_locks(iter, true))
312                 return true;
313
314         /*
315          * Ancestor nodes must be locked before child nodes, so set locks_want
316          * on iterators that might lock ancestors before us to avoid getting
317          * -EINTR later:
318          */
319         trans_for_each_iter(iter->trans, linked)
320                 if (linked != iter &&
321                     linked->btree_id == iter->btree_id &&
322                     btree_iter_cmp(linked, iter) <= 0 &&
323                     linked->locks_want < new_locks_want) {
324                         linked->locks_want = new_locks_want;
325                         btree_iter_get_locks(linked, true);
326                 }
327
328         return false;
329 }
330
331 bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
332                                         unsigned new_locks_want)
333 {
334         unsigned l = iter->level;
335
336         EBUG_ON(iter->locks_want >= new_locks_want);
337
338         iter->locks_want = new_locks_want;
339
340         do {
341                 if (!btree_iter_node(iter, l))
342                         break;
343
344                 if (!bch2_btree_node_upgrade(iter, l)) {
345                         iter->locks_want = l;
346                         return false;
347                 }
348
349                 l++;
350         } while (l < iter->locks_want);
351
352         return true;
353 }
354
355 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
356                                  unsigned downgrade_to)
357 {
358         struct btree_iter *linked;
359         unsigned l;
360
361         /*
362          * We downgrade linked iterators as well because btree_iter_upgrade
363          * might have had to modify locks_want on linked iterators due to lock
364          * ordering:
365          */
366         trans_for_each_iter(iter->trans, linked) {
367                 unsigned new_locks_want = downgrade_to ?:
368                         (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
369
370                 if (linked->locks_want <= new_locks_want)
371                         continue;
372
373                 linked->locks_want = new_locks_want;
374
375                 while (linked->nodes_locked &&
376                        (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
377                         if (l > linked->level) {
378                                 btree_node_unlock(linked, l);
379                         } else {
380                                 if (btree_node_intent_locked(linked, l)) {
381                                         six_lock_downgrade(&linked->l[l].b->lock);
382                                         linked->nodes_intent_locked ^= 1 << l;
383                                 }
384                                 break;
385                         }
386                 }
387         }
388
389         bch2_btree_trans_verify_locks(iter->trans);
390 }
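
/*
 * Restating the loop above: downgrade_to == 0 means "as few intent locks as
 * this iterator needs", i.e. one for BTREE_ITER_INTENT iterators and none
 * otherwise.  Nodes locked at or above the new locks_want are unlocked,
 * except the node at the iterator's own level, whose intent lock is
 * downgraded to a read lock.
 */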
391
392 int bch2_btree_iter_unlock(struct btree_iter *iter)
393 {
394         struct btree_iter *linked;
395
396         trans_for_each_iter(iter->trans, linked)
397                 __bch2_btree_iter_unlock(linked);
398
399         return btree_iter_err(iter);
400 }
401
402 bool bch2_btree_trans_relock(struct btree_trans *trans)
403 {
404         struct btree_iter *iter;
405         bool ret = true;
406
407         trans_for_each_iter(trans, iter)
408                 ret &= bch2_btree_iter_relock(iter);
409
410         return ret;
411 }
412
413 void bch2_btree_trans_unlock(struct btree_trans *trans)
414 {
415         struct btree_iter *iter;
416
417         trans_for_each_iter(trans, iter)
418                 __bch2_btree_iter_unlock(iter);
419 }
420
421 /* Btree transaction locking: */
422
423 /* Btree iterator: */
424
425 #ifdef CONFIG_BCACHEFS_DEBUG
426
427 static void __bch2_btree_iter_verify(struct btree_iter *iter,
428                                      struct btree *b)
429 {
430         struct btree_iter_level *l = &iter->l[b->level];
431         struct btree_node_iter tmp = l->iter;
432         struct bkey_packed *k;
433
434         if (!debug_check_iterators(iter->trans->c))
435                 return;
436
437         if (iter->uptodate > BTREE_ITER_NEED_PEEK)
438                 return;
439
440         bch2_btree_node_iter_verify(&l->iter, b);
441
442         /*
443          * For interior nodes, the iterator will have skipped past
444          * deleted keys.
445          *
446          * For extents, the iterator may have skipped past deleted keys (but not
447          * whiteouts)
448          */
449         k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
450                 ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
451                 : bch2_btree_node_iter_prev_all(&tmp, b);
452         if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
453                 char buf[100];
454                 struct bkey uk = bkey_unpack_key(b, k);
455
456                 bch2_bkey_to_text(&PBUF(buf), &uk);
457                 panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
458                       buf, iter->pos.inode, iter->pos.offset);
459         }
460
461         k = bch2_btree_node_iter_peek_all(&l->iter, b);
462         if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
463                 char buf[100];
464                 struct bkey uk = bkey_unpack_key(b, k);
465
466                 bch2_bkey_to_text(&PBUF(buf), &uk);
467                 panic("iter should be after current key:\n"
468                       "iter pos %llu:%llu\n"
469                       "cur key  %s\n",
470                       iter->pos.inode, iter->pos.offset, buf);
471         }
472
473         BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
474                (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
475                !bkey_whiteout(&iter->k) &&
476                bch2_btree_node_iter_end(&l->iter));
477 }
478
479 void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
480 {
481         struct btree_iter *linked;
482
483         if (!debug_check_iterators(iter->trans->c))
484                 return;
485
486         trans_for_each_iter_with_node(iter->trans, b, linked)
487                 __bch2_btree_iter_verify(linked, b);
488 }
489
490 #else
491
492 static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
493                                             struct btree *b) {}
494
495 #endif
496
497 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
498                                       struct btree *b,
499                                       struct btree_node_iter *node_iter,
500                                       struct bset_tree *t,
501                                       struct bkey_packed *where,
502                                       unsigned clobber_u64s,
503                                       unsigned new_u64s)
504 {
505         const struct bkey_packed *end = btree_bkey_last(b, t);
506         struct btree_node_iter_set *set;
507         unsigned offset = __btree_node_key_to_offset(b, where);
508         int shift = new_u64s - clobber_u64s;
509         unsigned old_end = t->end_offset - shift;
510
511         btree_node_iter_for_each(node_iter, set)
512                 if (set->end == old_end)
513                         goto found;
514
515         /* didn't find the bset in the iterator - might have to re-add it: */
516         if (new_u64s &&
517             btree_iter_pos_cmp(iter, b, where) > 0) {
518                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
519
520                 bch2_btree_node_iter_push(node_iter, b, where, end);
521
522                 if (!b->level &&
523                     node_iter == &iter->l[0].iter)
524                         bkey_disassemble(b,
525                                 bch2_btree_node_iter_peek_all(node_iter, b),
526                                 &iter->k);
527         }
528         return;
529 found:
530         set->end = t->end_offset;
531
532         /* Iterator hasn't gotten to the key that changed yet: */
533         if (set->k < offset)
534                 return;
535
536         if (new_u64s &&
537             btree_iter_pos_cmp(iter, b, where) > 0) {
538                 set->k = offset;
539         } else if (set->k < offset + clobber_u64s) {
540                 set->k = offset + new_u64s;
541                 if (set->k == set->end)
542                         bch2_btree_node_iter_set_drop(node_iter, set);
543         } else {
544                 set->k = (int) set->k + shift;
545                 goto iter_current_key_not_modified;
546         }
547
548         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
549
550         bch2_btree_node_iter_sort(node_iter, b);
551         if (!b->level && node_iter == &iter->l[0].iter) {
552                 /*
553                  * not legal to call bkey_debugcheck() here, because we're
554                  * called midway through the update path after update has been
555                  * marked but before deletes have actually happened:
556                  */
557 #if 0
558                 __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
559 #endif
560                 struct btree_iter_level *l = &iter->l[0];
561                 struct bkey_packed *k =
562                         bch2_btree_node_iter_peek_all(&l->iter, l->b);
563
564                 if (unlikely(!k))
565                         iter->k.type = KEY_TYPE_deleted;
566                 else
567                         bkey_disassemble(l->b, k, &iter->k);
568         }
569 iter_current_key_not_modified:
570
571         /*
572          * Interior nodes are special because iterators for interior nodes don't
573          * obey the usual invariants regarding the iterator position:
574          *
575          * We may have whiteouts that compare greater than the iterator
576          * position, and logically should be in the iterator, but that we
577          * skipped past to find the first live key greater than the iterator
578          * position. This becomes an issue when we insert a new key that is
579          * greater than the current iterator position, but smaller than the
580          * whiteouts we've already skipped past - this happens in the course of
581          * a btree split.
582          *
583          * We have to rewind the iterator to before those whiteouts here,
584          * else bch2_btree_node_iter_prev() is not going to work and who knows what
585          * else would happen. And we have to do it manually, because here we've
586          * already done the insert and the iterator is currently inconsistent:
587          *
588          * We've got multiple competing invariants here - we have to be careful
589          * about rewinding iterators for interior nodes, because they should
590          * always point to the key for the child node the btree iterator points
591          * to.
592          */
593         if (b->level && new_u64s &&
594             btree_iter_pos_cmp(iter, b, where) > 0) {
595                 struct bset_tree *t;
596                 struct bkey_packed *k;
597
598                 for_each_bset(b, t) {
599                         if (bch2_bkey_to_bset(b, where) == t)
600                                 continue;
601
602                         k = bch2_bkey_prev_all(b, t,
603                                 bch2_btree_node_iter_bset_pos(node_iter, b, t));
604                         if (k &&
605                             bkey_iter_cmp(b, k, where) > 0) {
606                                 struct btree_node_iter_set *set;
607                                 unsigned offset =
608                                         __btree_node_key_to_offset(b, bkey_next(k));
609
610                                 btree_node_iter_for_each(node_iter, set)
611                                         if (set->k == offset) {
612                                                 set->k = __btree_node_key_to_offset(b, k);
613                                                 bch2_btree_node_iter_sort(node_iter, b);
614                                                 goto next_bset;
615                                         }
616
617                                 bch2_btree_node_iter_push(node_iter, b, k,
618                                                 btree_bkey_last(b, t));
619                         }
620 next_bset:
621                         t = t; /* labels must be followed by a statement */
622                 }
623         }
624 }
625
626 void bch2_btree_node_iter_fix(struct btree_iter *iter,
627                               struct btree *b,
628                               struct btree_node_iter *node_iter,
629                               struct bkey_packed *where,
630                               unsigned clobber_u64s,
631                               unsigned new_u64s)
632 {
633         struct bset_tree *t = bch2_bkey_to_bset(b, where);
634         struct btree_iter *linked;
635
636         if (node_iter != &iter->l[b->level].iter)
637                 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
638                                           where, clobber_u64s, new_u64s);
639
640         trans_for_each_iter_with_node(iter->trans, b, linked)
641                 __bch2_btree_node_iter_fix(linked, b,
642                                           &linked->l[b->level].iter, t,
643                                           where, clobber_u64s, new_u64s);
644 }
645
646 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
647                                                   struct btree_iter_level *l,
648                                                   struct bkey *u,
649                                                   struct bkey_packed *k)
650 {
651         struct bkey_s_c ret;
652
653         if (unlikely(!k)) {
654                 /*
655                  * signal to bch2_btree_iter_peek_slot() that we're currently at
656                  * a hole
657                  */
658                 u->type = KEY_TYPE_deleted;
659                 return bkey_s_c_null;
660         }
661
662         ret = bkey_disassemble(l->b, k, u);
663
664         if (debug_check_bkeys(iter->trans->c))
665                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
666
667         return ret;
668 }
669
670 /* peek_all() doesn't skip deleted keys */
671 static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
672                                                     struct btree_iter_level *l,
673                                                     struct bkey *u)
674 {
675         return __btree_iter_unpack(iter, l, u,
676                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
677 }
678
679 static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
680                                                 struct btree_iter_level *l)
681 {
682         return __btree_iter_unpack(iter, l, &iter->k,
683                         bch2_btree_node_iter_peek(&l->iter, l->b));
684 }
685
686 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
687                                              struct btree_iter_level *l,
688                                              int max_advance)
689 {
690         struct bkey_packed *k;
691         int nr_advanced = 0;
692
693         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
694                btree_iter_pos_cmp(iter, l->b, k) < 0) {
695                 if (max_advance > 0 && nr_advanced >= max_advance)
696                         return false;
697
698                 bch2_btree_node_iter_advance(&l->iter, l->b);
699                 nr_advanced++;
700         }
701
702         return true;
703 }
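
/*
 * Note: a negative max_advance (as passed by several callers) means no
 * limit - the node iterator is advanced all the way to the first key not
 * before the search position, and the function then always returns true.
 */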
704
705 /*
706  * Verify that iterator for parent node points to child node:
707  */
708 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
709 {
710         struct btree_iter_level *l;
711         unsigned plevel;
712         bool parent_locked;
713         struct bkey_packed *k;
714
715         if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
716                 return;
717
718         plevel = b->level + 1;
719         if (!btree_iter_node(iter, plevel))
720                 return;
721
722         parent_locked = btree_node_locked(iter, plevel);
723
724         if (!bch2_btree_node_relock(iter, plevel))
725                 return;
726
727         l = &iter->l[plevel];
728         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
729         if (!k ||
730             bkey_deleted(k) ||
731             bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
732                 char buf[100];
733                 struct bkey uk = bkey_unpack_key(b, k);
734
735                 bch2_bkey_to_text(&PBUF(buf), &uk);
736                 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
737                       buf, b->key.k.p.inode, b->key.k.p.offset);
738         }
739
740         if (!parent_locked)
741                 btree_node_unlock(iter, b->level + 1);
742 }
743
744 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
745                                              struct btree *b)
746 {
747         return __btree_iter_pos_cmp(iter, NULL,
748                         bkey_to_packed(&b->key), true) < 0;
749 }
750
751 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
752                                           struct btree *b)
753 {
754         return iter->btree_id == b->btree_id &&
755                 bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
756                 !btree_iter_pos_after_node(iter, b);
757 }
758
759 static inline void __btree_iter_init(struct btree_iter *iter,
760                                      unsigned level)
761 {
762         struct btree_iter_level *l = &iter->l[level];
763
764         bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
765
766         if (iter->flags & BTREE_ITER_IS_EXTENTS)
767                 btree_iter_advance_to_pos(iter, l, -1);
768
769         /* Skip to first non whiteout: */
770         if (level)
771                 bch2_btree_node_iter_peek(&l->iter, l->b);
772
773         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
774 }
775
776 static inline void btree_iter_node_set(struct btree_iter *iter,
777                                        struct btree *b)
778 {
779         btree_iter_verify_new_node(iter, b);
780
781         EBUG_ON(!btree_iter_pos_in_node(iter, b));
782         EBUG_ON(b->lock.state.seq & 1);
783
784         iter->l[b->level].lock_seq = b->lock.state.seq;
785         iter->l[b->level].b = b;
786         __btree_iter_init(iter, b->level);
787 }
788
789 /*
790  * A btree node is being replaced - update the iterator to point to the new
791  * node:
792  */
793 void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
794 {
795         enum btree_node_locked_type t;
796         struct btree_iter *linked;
797
798         trans_for_each_iter(iter->trans, linked)
799                 if (btree_iter_pos_in_node(linked, b)) {
800                         /*
801                          * bch2_btree_iter_node_drop() has already been called -
802                          * the old node we're replacing has already been
803                          * unlocked and the pointer invalidated
804                          */
805                         BUG_ON(btree_node_locked(linked, b->level));
806
807                         t = btree_lock_want(linked, b->level);
808                         if (t != BTREE_NODE_UNLOCKED) {
809                                 six_lock_increment(&b->lock, t);
810                                 mark_btree_node_locked(linked, b->level, t);
811                         }
812
813                         btree_iter_node_set(linked, b);
814                 }
815
816         six_unlock_intent(&b->lock);
817 }
818
819 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
820 {
821         struct btree_iter *linked;
822         unsigned level = b->level;
823
824         trans_for_each_iter(iter->trans, linked)
825                 if (linked->l[level].b == b) {
826                         __btree_node_unlock(linked, level);
827                         linked->l[level].b = BTREE_ITER_NOT_END;
828                 }
829 }
830
831 /*
832  * A btree node has been modified in such a way as to invalidate iterators - fix
833  * them:
834  */
835 void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
836 {
837         struct btree_iter *linked;
838
839         trans_for_each_iter_with_node(iter->trans, b, linked)
840                 __btree_iter_init(linked, b->level);
841 }
842
843 static inline int btree_iter_lock_root(struct btree_iter *iter,
844                                        unsigned depth_want)
845 {
846         struct bch_fs *c = iter->trans->c;
847         struct btree *b;
848         enum six_lock_type lock_type;
849         unsigned i;
850
851         EBUG_ON(iter->nodes_locked);
852
853         while (1) {
854                 b = READ_ONCE(c->btree_roots[iter->btree_id].b);
855                 iter->level = READ_ONCE(b->level);
856
857                 if (unlikely(iter->level < depth_want)) {
858                         /*
859                          * the root is at a lower level than the level we want:
860                          * we got to the end of the btree, or we're walking
861                          * nodes at levels >= depth_want and there are no
862                          * nodes at those levels
863                          */
864                         iter->level = depth_want;
865                         iter->l[iter->level].b = NULL;
866                         return 1;
867                 }
868
869                 lock_type = __btree_lock_want(iter, iter->level);
870                 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
871                                               iter, lock_type, true)))
872                         return -EINTR;
873
874                 if (likely(b == c->btree_roots[iter->btree_id].b &&
875                            b->level == iter->level &&
876                            !race_fault())) {
877                         for (i = 0; i < iter->level; i++)
878                                 iter->l[i].b = BTREE_ITER_NOT_END;
879                         iter->l[iter->level].b = b;
880
881                         mark_btree_node_locked(iter, iter->level, lock_type);
882                         btree_iter_node_set(iter, b);
883                         return 0;
884
885                 }
886
887                 six_unlock_type(&b->lock, lock_type);
888         }
889 }
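
/*
 * On the retry loop above (interpretation): the btree root can be replaced
 * (e.g. when a split adds a new root) between reading the root pointer and
 * getting the lock, so after locking we re-check that the node is still the
 * root at the level we recorded; if not, drop the lock and retry.
 */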
890
891 noinline
892 static void btree_iter_prefetch(struct btree_iter *iter)
893 {
894         struct bch_fs *c = iter->trans->c;
895         struct btree_iter_level *l = &iter->l[iter->level];
896         struct btree_node_iter node_iter = l->iter;
897         struct bkey_packed *k;
898         BKEY_PADDED(k) tmp;
899         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
900                 ? (iter->level > 1 ? 0 :  2)
901                 : (iter->level > 1 ? 1 : 16);
902         bool was_locked = btree_node_locked(iter, iter->level);
903
904         while (nr--) {
905                 if (!bch2_btree_node_relock(iter, iter->level))
906                         return;
907
908                 bch2_btree_node_iter_advance(&node_iter, l->b);
909                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
910                 if (!k)
911                         break;
912
913                 bch2_bkey_unpack(l->b, &tmp.k, k);
914                 bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
915         }
916
917         if (!was_locked)
918                 btree_node_unlock(iter, iter->level);
919 }
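
/*
 * Restating the heuristic above: while BCH_FS_STARTED is not yet set
 * (presumably because btrees tend to be walked sequentially at that stage),
 * up to 16 sibling child nodes are prefetched when the children are leaves
 * and one when they are interior nodes; once the filesystem is started this
 * drops to 2 and 0 respectively.
 */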
920
921 static inline int btree_iter_down(struct btree_iter *iter)
922 {
923         struct bch_fs *c = iter->trans->c;
924         struct btree_iter_level *l = &iter->l[iter->level];
925         struct btree *b;
926         unsigned level = iter->level - 1;
927         enum six_lock_type lock_type = __btree_lock_want(iter, level);
928         BKEY_PADDED(k) tmp;
929
930         BUG_ON(!btree_node_locked(iter, iter->level));
931
932         bch2_bkey_unpack(l->b, &tmp.k,
933                          bch2_btree_node_iter_peek(&l->iter, l->b));
934
935         b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true);
936         if (unlikely(IS_ERR(b)))
937                 return PTR_ERR(b);
938
939         mark_btree_node_locked(iter, level, lock_type);
940         btree_iter_node_set(iter, b);
941
942         if (iter->flags & BTREE_ITER_PREFETCH)
943                 btree_iter_prefetch(iter);
944
945         iter->level = level;
946
947         return 0;
948 }
949
950 static void btree_iter_up(struct btree_iter *iter)
951 {
952         btree_node_unlock(iter, iter->level++);
953 }
954
955 int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
956
957 static int __btree_iter_traverse_all(struct btree_trans *trans,
958                                      struct btree_iter *iter, int ret)
959 {
960         struct bch_fs *c = trans->c;
961         u8 sorted[BTREE_ITER_MAX];
962         unsigned i, nr_sorted = 0;
963
964         trans_for_each_iter(trans, iter)
965                 sorted[nr_sorted++] = iter - trans->iters;
966
967 #define btree_iter_cmp_by_idx(_l, _r)                           \
968                 btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
969
970         bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
971 #undef btree_iter_cmp_by_idx
972
973 retry_all:
974         bch2_btree_trans_unlock(trans);
975
976         if (unlikely(ret == -ENOMEM)) {
977                 struct closure cl;
978
979                 closure_init_stack(&cl);
980
981                 do {
982                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
983                         closure_sync(&cl);
984                 } while (ret);
985         }
986
987         if (unlikely(ret == -EIO)) {
988                 trans->error = true;
989                 iter->flags |= BTREE_ITER_ERROR;
990                 iter->l[iter->level].b = BTREE_ITER_NOT_END;
991                 goto out;
992         }
993
994         BUG_ON(ret && ret != -EINTR);
995
996         /* Now, redo traversals in correct order: */
997         for (i = 0; i < nr_sorted; i++) {
998                 iter = &trans->iters[sorted[i]];
999
1000                 do {
1001                         ret = __bch2_btree_iter_traverse(iter);
1002                 } while (ret == -EINTR);
1003
1004                 if (ret)
1005                         goto retry_all;
1006         }
1007
1008         ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
1009 out:
1010         bch2_btree_cache_cannibalize_unlock(c);
1011         return ret;
1012 }
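
/*
 * Interpretation of the function above: this is the "relock everything in
 * the right order" path.  Every iterator in the transaction is unlocked and
 * then re-traversed in sorted (btree_id, pos) order, so the lock ordering
 * rule in __bch2_btree_node_lock() can't trip again; -ENOMEM from the btree
 * node cache is handled by taking the cannibalize lock, which lets node
 * allocation steal from nodes already in the cache.
 */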
1013
1014 int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1015 {
1016         return __btree_iter_traverse_all(trans, NULL, 0);
1017 }
1018
1019 static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
1020                                            bool check_pos)
1021 {
1022         unsigned l = iter->level;
1023
1024         while (btree_iter_node(iter, l) &&
1025                !(is_btree_node(iter, l) &&
1026                  bch2_btree_node_relock(iter, l) &&
1027                  (!check_pos ||
1028                   btree_iter_pos_in_node(iter, iter->l[l].b)))) {
1029                 btree_node_unlock(iter, l);
1030                 iter->l[l].b = BTREE_ITER_NOT_END;
1031                 l++;
1032         }
1033
1034         return l;
1035 }
1036
1037 /*
1038  * This is the main state machine for walking down the btree - walks down to a
1039  * specified depth
1040  *
1041  * Returns 0 on success, -EIO on error (error reading in a btree node).
1042  *
1043  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1044  * stashed in the iterator and returned from bch2_btree_iter_unlock().
1045  */
1046 int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1047 {
1048         unsigned depth_want = iter->level;
1049
1050         if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1051                 return 0;
1052
1053         if (bch2_btree_iter_relock(iter))
1054                 return 0;
1055
1056         /*
1057          * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1058          * here unnecessary
1059          */
1060         iter->level = btree_iter_up_until_locked(iter, true);
1061
1062         /*
1063          * If we've got a btree node locked (i.e. we aren't about to relock the
1064          * root) - advance its node iterator if necessary:
1065          *
1066          * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1067          */
1068         if (btree_iter_node(iter, iter->level))
1069                 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1070
1071         /*
1072          * Note: iter->l[iter->level].b may be temporarily NULL here - that
1073          * would indicate to other code that we got to the end of the btree,
1074          * here it indicates that relocking the root failed - it's critical that
1075          * btree_iter_lock_root() comes next and that it can't fail
1076          */
1077         while (iter->level > depth_want) {
1078                 int ret = btree_iter_node(iter, iter->level)
1079                         ? btree_iter_down(iter)
1080                         : btree_iter_lock_root(iter, depth_want);
1081                 if (unlikely(ret)) {
1082                         if (ret == 1)
1083                                 return 0;
1084
1085                         iter->level = depth_want;
1086                         iter->l[iter->level].b = BTREE_ITER_NOT_END;
1087                         return ret;
1088                 }
1089         }
1090
1091         iter->uptodate = BTREE_ITER_NEED_PEEK;
1092
1093         bch2_btree_trans_verify_locks(iter->trans);
1094         __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
1095         return 0;
1096 }
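
/*
 * A summary of the state machine above (not from the original comments):
 * fast path - if nothing was dropped, bch2_btree_iter_relock() succeeds and
 * we're done.  Otherwise walk up, dropping levels that can't be relocked or
 * whose node no longer covers iter->pos, until we hold a locked node
 * containing the position (possibly none, in which case we fall back to the
 * root); then walk back down with btree_iter_down()/btree_iter_lock_root()
 * until we're at the level the iterator wants.
 */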
1097
1098 int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
1099 {
1100         int ret;
1101
1102         ret = __bch2_btree_iter_traverse(iter);
1103         if (unlikely(ret))
1104                 ret = __btree_iter_traverse_all(iter->trans, iter, ret);
1105
1106         return ret;
1107 }
1108
1109 static inline void bch2_btree_iter_checks(struct btree_iter *iter,
1110                                           enum btree_iter_type type)
1111 {
1112         EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1113         EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
1114                 (iter->btree_id == BTREE_ID_EXTENTS &&
1115                  type != BTREE_ITER_NODES));
1116
1117         bch2_btree_trans_verify_locks(iter->trans);
1118 }
1119
1120 /* Iterate across nodes (leaf and interior nodes) */
1121
1122 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1123 {
1124         struct btree *b;
1125         int ret;
1126
1127         bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1128
1129         if (iter->uptodate == BTREE_ITER_UPTODATE)
1130                 return iter->l[iter->level].b;
1131
1132         ret = bch2_btree_iter_traverse(iter);
1133         if (ret)
1134                 return NULL;
1135
1136         b = btree_iter_node(iter, iter->level);
1137         if (!b)
1138                 return NULL;
1139
1140         BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1141
1142         iter->pos = b->key.k.p;
1143         iter->uptodate = BTREE_ITER_UPTODATE;
1144
1145         return b;
1146 }
1147
1148 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
1149 {
1150         struct btree *b;
1151         int ret;
1152
1153         bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1154
1155         /* already got to end? */
1156         if (!btree_iter_node(iter, iter->level))
1157                 return NULL;
1158
1159         bch2_trans_cond_resched(iter->trans);
1160
1161         btree_iter_up(iter);
1162
1163         if (!bch2_btree_node_relock(iter, iter->level))
1164                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1165
1166         ret = bch2_btree_iter_traverse(iter);
1167         if (ret)
1168                 return NULL;
1169
1170         /* got to end? */
1171         b = btree_iter_node(iter, iter->level);
1172         if (!b)
1173                 return NULL;
1174
1175         if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1176                 /*
1177                  * Haven't gotten to the end of the parent node: go back down to
1178                  * the next child node
1179                  */
1180
1181                 /*
1182                  * We don't really want to be unlocking here, but we can't
1183                  * tell btree_iter_traverse() "traverse to this level" directly,
1184                  * except by setting iter->level, so we have to unlock so we
1185                  * don't screw up our lock invariants:
1186                  */
1187                 if (btree_node_read_locked(iter, iter->level))
1188                         btree_node_unlock(iter, iter->level);
1189
1190                 /* ick: */
1191                 iter->pos       = iter->btree_id == BTREE_ID_INODES
1192                         ? btree_type_successor(iter->btree_id, iter->pos)
1193                         : bkey_successor(iter->pos);
1194                 iter->level     = depth;
1195
1196                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1197                 ret = bch2_btree_iter_traverse(iter);
1198                 if (ret)
1199                         return NULL;
1200
1201                 b = iter->l[iter->level].b;
1202         }
1203
1204         iter->pos = b->key.k.p;
1205         iter->uptodate = BTREE_ITER_UPTODATE;
1206
1207         return b;
1208 }
1209
1210 /* Iterate across keys (in leaf nodes only) */
1211
1212 void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1213 {
1214         struct btree_iter_level *l = &iter->l[0];
1215
1216         EBUG_ON(iter->level != 0);
1217         EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1218         EBUG_ON(!btree_node_locked(iter, 0));
1219         EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1220
1221         iter->pos = new_pos;
1222         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1223
1224         btree_iter_advance_to_pos(iter, l, -1);
1225
1226         if (bch2_btree_node_iter_end(&l->iter) &&
1227             btree_iter_pos_after_node(iter, l->b))
1228                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1229 }
1230
1231 void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1232 {
1233         int cmp = bkey_cmp(new_pos, iter->pos);
1234         unsigned level;
1235
1236         if (!cmp)
1237                 return;
1238
1239         iter->pos = new_pos;
1240
1241         level = btree_iter_up_until_locked(iter, true);
1242
1243         if (btree_iter_node(iter, level)) {
1244                 /*
1245                  * We might have to skip over many keys, or just a few: try
1246                  * advancing the node iterator, and if we have to skip over too
1247                  * many keys just reinit it - and always reinit when rewinding,
1248                  * since btree_iter_advance_to_pos() only goes forwards.
1249                  */
1250                 if (cmp < 0 ||
1251                     !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
1252                         __btree_iter_init(iter, level);
1253
1254                 /* Don't leave it locked if we're not supposed to: */
1255                 if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
1256                         btree_node_unlock(iter, level);
1257         }
1258
1259         if (level != iter->level)
1260                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1261         else
1262                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1263 }
1264
1265 static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1266 {
1267         struct btree_iter_level *l = &iter->l[0];
1268         struct bkey_s_c ret = { .k = &iter->k };
1269
1270         if (!bkey_deleted(&iter->k)) {
1271                 EBUG_ON(bch2_btree_node_iter_end(&l->iter));
1272                 ret.v = bkeyp_val(&l->b->format,
1273                         __bch2_btree_node_iter_peek_all(&l->iter, l->b));
1274         }
1275
1276         if (debug_check_bkeys(iter->trans->c) &&
1277             !bkey_deleted(ret.k))
1278                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1279         return ret;
1280 }
1281
1282 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1283 {
1284         struct btree_iter_level *l = &iter->l[0];
1285         struct bkey_s_c k;
1286         int ret;
1287
1288         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1289
1290         if (iter->uptodate == BTREE_ITER_UPTODATE)
1291                 return btree_iter_peek_uptodate(iter);
1292
1293         while (1) {
1294                 ret = bch2_btree_iter_traverse(iter);
1295                 if (unlikely(ret))
1296                         return bkey_s_c_err(ret);
1297
1298                 k = __btree_iter_peek(iter, l);
1299                 if (likely(k.k))
1300                         break;
1301
1302                 /* got to the end of the leaf, iterator needs to be traversed: */
1303                 iter->pos       = l->b->key.k.p;
1304                 iter->uptodate  = BTREE_ITER_NEED_TRAVERSE;
1305
1306                 if (!bkey_cmp(iter->pos, POS_MAX))
1307                         return bkey_s_c_null;
1308
1309                 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1310         }
1311
1312         /*
1313          * iter->pos should always be equal to the key we just
1314          * returned - except extents can straddle iter->pos:
1315          */
1316         if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1317             bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1318                 iter->pos = bkey_start_pos(k.k);
1319
1320         iter->uptodate = BTREE_ITER_UPTODATE;
1321         return k;
1322 }
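
/*
 * Illustrative usage sketch (callers normally go through the
 * for_each_btree_key() style helpers in btree_iter.h rather than open coding
 * this):
 *
 *	struct bkey_s_c k;
 *
 *	for (k = bch2_btree_iter_peek(iter);
 *	     !IS_ERR_OR_NULL(k.k);
 *	     k = bch2_btree_iter_next(iter)) {
 *		// use k: k.k is the key, k.v the value
 *	}
 *
 * A NULL k.k means we iterated past the end of the btree; an IS_ERR() k.k
 * carries the traversal error (e.g. -EINTR if the transaction must be
 * restarted).
 */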
1323
1324 static noinline
1325 struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
1326 {
1327         struct btree_iter_level *l = &iter->l[0];
1328
1329         iter->pos       = l->b->key.k.p;
1330         iter->uptodate  = BTREE_ITER_NEED_TRAVERSE;
1331
1332         if (!bkey_cmp(iter->pos, POS_MAX))
1333                 return bkey_s_c_null;
1334
1335         iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1336
1337         return bch2_btree_iter_peek(iter);
1338 }
1339
1340 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1341 {
1342         struct btree_iter_level *l = &iter->l[0];
1343         struct bkey_packed *p;
1344         struct bkey_s_c k;
1345
1346         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1347
1348         if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1349                 k = bch2_btree_iter_peek(iter);
1350                 if (IS_ERR_OR_NULL(k.k))
1351                         return k;
1352         }
1353
1354         do {
1355                 bch2_btree_node_iter_advance(&l->iter, l->b);
1356                 p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1357                 if (unlikely(!p))
1358                         return bch2_btree_iter_peek_next_leaf(iter);
1359         } while (bkey_whiteout(p));
1360
1361         k = __btree_iter_unpack(iter, l, &iter->k, p);
1362
1363         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
1364         iter->pos = bkey_start_pos(k.k);
1365         return k;
1366 }
1367
1368 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1369 {
1370         struct btree_iter_level *l = &iter->l[0];
1371         struct bkey_packed *p;
1372         struct bkey_s_c k;
1373         int ret;
1374
1375         bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1376
1377         if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1378                 k = bch2_btree_iter_peek(iter);
1379                 if (IS_ERR(k.k))
1380                         return k;
1381         }
1382
1383         while (1) {
1384                 p = bch2_btree_node_iter_prev(&l->iter, l->b);
1385                 if (likely(p))
1386                         break;
1387
1388                 iter->pos = l->b->data->min_key;
1389                 if (!bkey_cmp(iter->pos, POS_MIN))
1390                         return bkey_s_c_null;
1391
1392                 bch2_btree_iter_set_pos(iter,
1393                         btree_type_predecessor(iter->btree_id, iter->pos));
1394
1395                 ret = bch2_btree_iter_traverse(iter);
1396                 if (unlikely(ret))
1397                         return bkey_s_c_err(ret);
1398
1399                 p = bch2_btree_node_iter_peek(&l->iter, l->b);
1400                 if (p)
1401                         break;
1402         }
1403
1404         k = __btree_iter_unpack(iter, l, &iter->k, p);
1405
1406         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1407
1408         iter->pos       = bkey_start_pos(k.k);
1409         iter->uptodate  = BTREE_ITER_UPTODATE;
1410         return k;
1411 }
1412
1413 static inline struct bkey_s_c
1414 __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1415 {
1416         struct btree_iter_level *l = &iter->l[0];
1417         struct btree_node_iter node_iter;
1418         struct bkey_s_c k;
1419         struct bkey n;
1420         int ret;
1421
1422 recheck:
1423         while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1424                bkey_deleted(k.k) &&
1425                bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
1426                 bch2_btree_node_iter_advance(&l->iter, l->b);
1427
1428         /*
1429          * iterator is now at the correct position for inserting at iter->pos,
1430          * but we need to keep iterating until we find the first non whiteout so
1431          * we know how big a hole we have, if any:
1432          */
1433
1434         node_iter = l->iter;
1435         if (k.k && bkey_whiteout(k.k))
1436                 k = __btree_iter_unpack(iter, l, &iter->k,
1437                         bch2_btree_node_iter_peek(&node_iter, l->b));
1438
1439         /*
1440          * If we got to the end of the node, check if we need to traverse to the
1441          * next node:
1442          */
1443         if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1444                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1445                 ret = bch2_btree_iter_traverse(iter);
1446                 if (unlikely(ret))
1447                         return bkey_s_c_err(ret);
1448
1449                 goto recheck;
1450         }
1451
1452         if (k.k &&
1453             !bkey_whiteout(k.k) &&
1454             bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
1455                 /*
1456                  * if we skipped forward to find the first non whiteout and
1457                  * there _wasn't_ actually a hole, we want the iterator to be
1458                  * pointed at the key we found:
1459                  */
1460                 l->iter = node_iter;
1461
1462                 EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
1463                 EBUG_ON(bkey_deleted(k.k));
1464                 iter->uptodate = BTREE_ITER_UPTODATE;
1465                 return k;
1466         }
1467
1468         /* hole */
1469
1470         /* holes can't span inode numbers: */
1471         if (iter->pos.offset == KEY_OFFSET_MAX) {
1472                 if (iter->pos.inode == KEY_INODE_MAX)
1473                         return bkey_s_c_null;
1474
1475                 iter->pos = bkey_successor(iter->pos);
1476                 goto recheck;
1477         }
1478
1479         if (!k.k)
1480                 k.k = &l->b->key.k;
1481
1482         bkey_init(&n);
1483         n.p = iter->pos;
1484         bch2_key_resize(&n,
1485                         min_t(u64, KEY_SIZE_MAX,
1486                               (k.k->p.inode == n.p.inode
1487                                ? bkey_start_offset(k.k)
1488                                : KEY_OFFSET_MAX) -
1489                               n.p.offset));
1490
1491         EBUG_ON(!n.size);
1492
1493         iter->k = n;
1494         iter->uptodate = BTREE_ITER_UPTODATE;
1495         return (struct bkey_s_c) { &iter->k, NULL };
1496 }
1497
1498 static inline struct bkey_s_c
1499 __bch2_btree_iter_peek_slot(struct btree_iter *iter)
1500 {
1501         struct btree_iter_level *l = &iter->l[0];
1502         struct bkey_s_c k;
1503         int ret;
1504
1505         if (iter->flags & BTREE_ITER_IS_EXTENTS)
1506                 return __bch2_btree_iter_peek_slot_extents(iter);
1507
1508 recheck:
1509         while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1510                bkey_deleted(k.k) &&
1511                bkey_cmp(k.k->p, iter->pos) == 0)
1512                 bch2_btree_node_iter_advance(&l->iter, l->b);
1513
1514         /*
1515          * If we got to the end of the node, check if we need to traverse to the
1516          * next node:
1517          */
1518         if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1519                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1520                 ret = bch2_btree_iter_traverse(iter);
1521                 if (unlikely(ret))
1522                         return bkey_s_c_err(ret);
1523
1524                 goto recheck;
1525         }
1526
1527         if (k.k &&
1528             !bkey_deleted(k.k) &&
1529             !bkey_cmp(iter->pos, k.k->p)) {
1530                 iter->uptodate = BTREE_ITER_UPTODATE;
1531                 return k;
1532         } else {
1533                 /* hole */
1534                 bkey_init(&iter->k);
1535                 iter->k.p = iter->pos;
1536
1537                 iter->uptodate = BTREE_ITER_UPTODATE;
1538                 return (struct bkey_s_c) { &iter->k, NULL };
1539         }
1540 }
1541
1542 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1543 {
1544         int ret;
1545
1546         bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1547
1548         if (iter->uptodate == BTREE_ITER_UPTODATE)
1549                 return btree_iter_peek_uptodate(iter);
1550
1551         ret = bch2_btree_iter_traverse(iter);
1552         if (unlikely(ret))
1553                 return bkey_s_c_err(ret);
1554
1555         return __bch2_btree_iter_peek_slot(iter);
1556 }
1557
1558 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1559 {
1560         bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1561
1562         iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1563
1564         if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1565                 /*
1566                  * XXX: when we just need to relock, we should be able to avoid
1567                  * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1568                  * for that to work.
1569                  */
1570                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1571
1572                 return bch2_btree_iter_peek_slot(iter);
1573         }
1574
1575         if (!bkey_deleted(&iter->k))
1576                 bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
1577
1578         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1579
1580         return __bch2_btree_iter_peek_slot(iter);
1581 }
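
/*
 * Usage sketch (illustrative only, not compiled here; declarations omitted,
 * and it assumes the bch2_trans_get_iter() wrapper from btree_iter.h plus a
 * caller-chosen inum and end pos): walking every slot of an inode, holes
 * included:
 *
 *	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
 *				   POS(inum, 0), BTREE_ITER_SLOTS);
 *	for (k = bch2_btree_iter_peek_slot(iter);
 *	     !IS_ERR_OR_NULL(k.k) && bkey_cmp(iter->pos, end) < 0;
 *	     k = bch2_btree_iter_next_slot(iter)) {
 *		... each iteration sees either a real key or a hole at iter->pos
 *	}
 */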
1582
1583 static inline void bch2_btree_iter_init(struct btree_trans *trans,
1584                         struct btree_iter *iter, enum btree_id btree_id,
1585                         struct bpos pos, unsigned flags)
1586 {
1587         struct bch_fs *c = trans->c;
1588         unsigned i;
1589
1590         if (btree_id == BTREE_ID_EXTENTS &&
1591             !(flags & BTREE_ITER_NODES))
1592                 flags |= BTREE_ITER_IS_EXTENTS;
1593
1594         iter->trans                     = trans;
1595         iter->pos                       = pos;
1596         bkey_init(&iter->k);
1597         iter->k.p                       = pos;
1598         iter->flags                     = flags;
1599         iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1600         iter->btree_id                  = btree_id;
1601         iter->level                     = 0;
1602         iter->locks_want                = flags & BTREE_ITER_INTENT ? 1 : 0;
1603         iter->nodes_locked              = 0;
1604         iter->nodes_intent_locked       = 0;
1605         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1606                 iter->l[i].b            = NULL;
1607         iter->l[iter->level].b          = BTREE_ITER_NOT_END;
1608
1609         prefetch(c->btree_roots[btree_id].b);
1610 }
1611
1612 /* new transactional stuff: */
1613
1614 int bch2_trans_iter_put(struct btree_trans *trans,
1615                         struct btree_iter *iter)
1616 {
1617         int ret = btree_iter_err(iter);
1618
1619         trans->iters_live       &= ~(1ULL << iter->idx);
1620         return ret;
1621 }
1622
1623 static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1624                                           unsigned idx)
1625 {
1626         __bch2_btree_iter_unlock(&trans->iters[idx]);
1627         trans->iters_linked             &= ~(1ULL << idx);
1628         trans->iters_live               &= ~(1ULL << idx);
1629         trans->iters_touched            &= ~(1ULL << idx);
1630         trans->iters_unlink_on_restart  &= ~(1ULL << idx);
1631         trans->iters_unlink_on_commit   &= ~(1ULL << idx);
1632 }
1633
1634 int bch2_trans_iter_free(struct btree_trans *trans,
1635                          struct btree_iter *iter)
1636 {
1637         int ret = btree_iter_err(iter);
1638
1639         __bch2_trans_iter_free(trans, iter->idx);
1640         return ret;
1641 }
1642
1643 int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
1644                                    struct btree_iter *iter)
1645 {
1646         int ret = btree_iter_err(iter);
1647
1648         trans->iters_unlink_on_commit |= 1ULL << iter->idx;
1649         return ret;
1650 }
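
/*
 * Sketch of the intended caller pattern (illustrative only, declarations
 * omitted): _put marks the iterator as no longer live so a later
 * bch2_trans_get_iter() at the same position can reuse the slot, _free
 * unlinks it immediately, and _free_on_commit defers the unlink until the
 * transaction commits. All three fold in any error the iterator has seen:
 *
 *	k = bch2_btree_iter_peek(iter);
 *	ret = IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
 *	ret = bch2_trans_iter_put(&trans, iter) ?: ret;
 */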
1651
1652 static int btree_trans_realloc_iters(struct btree_trans *trans,
1653                                      unsigned new_size)
1654 {
1655         void *new_iters, *new_updates;
1656
1657         BUG_ON(new_size > BTREE_ITER_MAX);
1658
1659         if (new_size <= trans->size)
1660                 return 0;
1661
1662         BUG_ON(trans->used_mempool);
1663
1664         bch2_trans_unlock(trans);
1665
1666         new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
1667                             sizeof(struct btree_insert_entry) * (new_size + 4),
1668                             GFP_NOFS);
1669         if (new_iters)
1670                 goto success;
1671
1672         new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
1673         new_size = BTREE_ITER_MAX;
1674
1675         trans->used_mempool = true;
1676 success:
1677         new_updates = new_iters + sizeof(struct btree_iter) * new_size;
1678
1679         memcpy(new_iters, trans->iters,
1680                sizeof(struct btree_iter) * trans->nr_iters);
1681         memcpy(new_updates, trans->updates,
1682                sizeof(struct btree_insert_entry) * trans->nr_updates);
1683
1684         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
1685                 memset(trans->iters, POISON_FREE,
1686                        sizeof(struct btree_iter) * trans->nr_iters +
1687                        sizeof(struct btree_insert_entry) * trans->nr_iters);
1688
1689         if (trans->iters != trans->iters_onstack)
1690                 kfree(trans->iters);
1691
1692         trans->iters    = new_iters;
1693         trans->updates  = new_updates;
1694         trans->size     = new_size;
1695
1696         if (trans->iters_live) {
1697                 trans_restart();
1698                 trace_trans_restart_iters_realloced(trans->c, trans->ip);
1699                 return -EINTR;
1700         }
1701
1702         return 0;
1703 }
1704
1705 void bch2_trans_preload_iters(struct btree_trans *trans)
1706 {
1707         btree_trans_realloc_iters(trans, BTREE_ITER_MAX);
1708 }
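
/*
 * Sketch (illustrative only): callers that know they will need many iterators
 * call this right after bch2_trans_init(), so that growing the iterator array
 * - and the -EINTR restart btree_trans_realloc_iters() forces once iterators
 * are already live - happens before any are in use:
 *
 *	bch2_trans_init(&trans, c);
 *	bch2_trans_preload_iters(&trans);
 */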
1709
1710 static int btree_trans_iter_alloc(struct btree_trans *trans)
1711 {
1712         unsigned idx = __ffs64(~trans->iters_linked);
1713
1714         if (idx < trans->nr_iters)
1715                 goto got_slot;
1716
1717         if (trans->nr_iters == trans->size) {
1718                 int ret = btree_trans_realloc_iters(trans, trans->size * 2);
1719                 if (ret)
1720                         return ret;
1721         }
1722
1723         idx = trans->nr_iters++;
1724         BUG_ON(trans->nr_iters > trans->size);
1725
1726         trans->iters[idx].idx = idx;
1727 got_slot:
1728         BUG_ON(trans->iters_linked & (1ULL << idx));
1729         trans->iters_linked |= 1ULL << idx;
1730         return idx;
1731 }
1732
1733 static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
1734                                                  unsigned btree_id, struct bpos pos,
1735                                                  unsigned flags, u64 iter_id)
1736 {
1737         struct btree_iter *iter;
1738         int idx;
1739
1740         BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
1741
1742         for (idx = 0; idx < trans->nr_iters; idx++) {
1743                 if (!(trans->iters_linked & (1ULL << idx)))
1744                         continue;
1745
1746                 iter = &trans->iters[idx];
1747                 if (iter_id
1748                     ? iter->id == iter_id
1749                     : (iter->btree_id == btree_id &&
1750                        !bkey_cmp(iter->pos, pos)))
1751                         goto found;
1752         }
1753         idx = -1;
1754 found:
1755         if (idx < 0) {
1756                 idx = btree_trans_iter_alloc(trans);
1757                 if (idx < 0)
1758                         return ERR_PTR(idx);
1759
1760                 iter = &trans->iters[idx];
1761                 iter->id = iter_id;
1762
1763                 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
1764         } else {
1765                 iter = &trans->iters[idx];
1766
1767                 iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1768                 iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1769         }
1770
1771         BUG_ON(iter->btree_id != btree_id);
1772         BUG_ON(trans->iters_live & (1ULL << idx));
1773         trans->iters_live       |= 1ULL << idx;
1774         trans->iters_touched    |= 1ULL << idx;
1775
1776         BUG_ON(iter->btree_id != btree_id);
1777         BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
1778
1779         return iter;
1780 }
1781
1782 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
1783                                          enum btree_id btree_id,
1784                                          struct bpos pos, unsigned flags,
1785                                          u64 iter_id)
1786 {
1787         struct btree_iter *iter =
1788                 __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
1789
1790         if (!IS_ERR(iter))
1791                 bch2_btree_iter_set_pos(iter, pos);
1792         return iter;
1793 }
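
/*
 * Sketch (illustrative only; assumes the bch2_trans_get_iter() wrapper in
 * btree_iter.h, which passes iter_id == 0): within one transaction, asking
 * again for a btree/pos that matches an already-linked iterator hands back
 * that same slot instead of allocating a new one - but only if the first
 * reference was put in between, since re-getting a still-live iterator trips
 * the BUG_ON above:
 *
 *	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, pos, 0);
 *	...
 *	bch2_trans_iter_put(&trans, iter);
 *	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, pos, 0);
 */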
1794
1795 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
1796                                             enum btree_id btree_id,
1797                                             struct bpos pos,
1798                                             unsigned locks_want,
1799                                             unsigned depth,
1800                                             unsigned flags)
1801 {
1802         struct btree_iter *iter =
1803                 __btree_trans_get_iter(trans, btree_id, pos,
1804                                        flags|BTREE_ITER_NODES, 0);
1805         unsigned i;
1806
1807         BUG_ON(IS_ERR(iter));
1808         BUG_ON(bkey_cmp(iter->pos, pos));
1809
1810         iter->locks_want = locks_want;
1811         iter->level     = depth;
1812
1813         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1814                 iter->l[i].b            = NULL;
1815         iter->l[iter->level].b          = BTREE_ITER_NOT_END;
1816
1817         return iter;
1818 }
1819
1820 struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
1821                                         struct btree_iter *src)
1822 {
1823         struct btree_iter *iter;
1824         int i, idx;
1825
1826         idx = btree_trans_iter_alloc(trans);
1827         if (idx < 0)
1828                 return ERR_PTR(idx);
1829
1830         trans->iters_live               |= 1ULL << idx;
1831         trans->iters_touched            |= 1ULL << idx;
1832         trans->iters_unlink_on_restart  |= 1ULL << idx;
1833
1834         iter = &trans->iters[idx];
1835
1836         memcpy(&iter->trans,
1837                &src->trans,
1838                (void *) &iter[1] - (void *) &iter->trans);
1839
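	/*
	 * The memcpy above duplicated src's lock state; take an extra
	 * reference on each node lock so the copy and the original can be
	 * unlocked independently:
	 */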
1840         for (i = 0; i < BTREE_MAX_DEPTH; i++)
1841                 if (btree_node_locked(iter, i))
1842                         six_lock_increment(&iter->l[i].b->lock,
1843                                            __btree_lock_want(iter, i));
1844
1845         return &trans->iters[idx];
1846 }
1847
1848 void *bch2_trans_kmalloc(struct btree_trans *trans,
1849                          size_t size)
1850 {
1851         void *ret;
1852
1853         if (trans->mem_top + size > trans->mem_bytes) {
1854                 size_t old_bytes = trans->mem_bytes;
1855                 size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
1856                 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
1857
1858                 if (!new_mem)
1859                         return ERR_PTR(-ENOMEM);
1860
1861                 trans->mem = new_mem;
1862                 trans->mem_bytes = new_bytes;
1863
1864                 if (old_bytes) {
1865                         trans_restart();
1866                         trace_trans_restart_mem_realloced(trans->c, trans->ip);
1867                         return ERR_PTR(-EINTR);
1868                 }
1869         }
1870
1871         ret = trans->mem + trans->mem_top;
1872         trans->mem_top += size;
1873         return ret;
1874 }
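
/*
 * Sketch (illustrative only): memory returned here lives until the
 * transaction's per-iteration state is reset. Reallocating an existing buffer
 * may move it and invalidate pointers handed out earlier, so in that case
 * this returns ERR_PTR(-EINTR) to restart the transaction, and callers must
 * handle it like any other restart:
 *
 *	p = bch2_trans_kmalloc(&trans, size);
 *	if (IS_ERR(p))
 *		return PTR_ERR(p);
 */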
1875
1876 int bch2_trans_unlock(struct btree_trans *trans)
1877 {
1878         u64 iters = trans->iters_linked;
1879         int ret = 0;
1880
1881         while (iters) {
1882                 unsigned idx = __ffs64(iters);
1883                 struct btree_iter *iter = &trans->iters[idx];
1884
1885                 ret = ret ?: btree_iter_err(iter);
1886
1887                 __bch2_btree_iter_unlock(iter);
1888                 iters ^= 1ULL << idx;
1889         }
1890
1891         return ret;
1892 }
1893
1894 inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
1895 {
1896         iters &= trans->iters_linked;
1897         iters &= ~trans->iters_live;
1898
1899         while (iters) {
1900                 unsigned idx = __ffs64(iters);
1901
1902                 iters &= ~(1ULL << idx);
1903                 __bch2_trans_iter_free(trans, idx);
1904         }
1905 }
1906
1907 void __bch2_trans_begin(struct btree_trans *trans)
1908 {
1909         u64 iters_to_unlink;
1910
1911         /*
1912          * On transaction restart, the transaction isn't required to allocate
1913          * all the same iterators as it did on the last iteration:
1914          *
1915          * Unlink any iterators it didn't use this iteration, assuming it got
1916          * further (allocated an iter with a higher idx) than where those
1917          * iters were originally allocated:
1918          */
1919         iters_to_unlink = ~trans->iters_live &
1920                 ((1ULL << fls64(trans->iters_live)) - 1);
1921
1922         iters_to_unlink |= trans->iters_unlink_on_restart;
1923         iters_to_unlink |= trans->iters_unlink_on_commit;
1924
1925         trans->iters_live               = 0;
1926
1927         bch2_trans_unlink_iters(trans, iters_to_unlink);
1928
1929         trans->iters_touched            = 0;
1930         trans->iters_unlink_on_restart  = 0;
1931         trans->iters_unlink_on_commit   = 0;
1932         trans->nr_updates               = 0;
1933         trans->mem_top                  = 0;
1934
1935         bch2_btree_iter_traverse_all(trans);
1936 }
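
/*
 * Sketch of the restart loop this supports (illustrative only; assumes a
 * bch2_trans_begin() wrapper in btree_iter.h and a caller-supplied
 * do_the_work() helper): on -EINTR from any transactional helper the caller
 * simply starts the iteration over, and the stale per-iteration state is
 * thrown away here:
 *
 *	bch2_trans_init(&trans, c);
 *	do {
 *		bch2_trans_begin(&trans);
 *		ret = do_the_work(&trans);
 *	} while (ret == -EINTR);
 *	ret = bch2_trans_exit(&trans) ?: ret;
 */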
1937
1938 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
1939 {
1940         memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
1941
1942         trans->c                = c;
1943         trans->ip               = _RET_IP_;
1944         trans->size             = ARRAY_SIZE(trans->iters_onstack);
1945         trans->iters            = trans->iters_onstack;
1946         trans->updates          = trans->updates_onstack;
1947 }
1948
1949 int bch2_trans_exit(struct btree_trans *trans)
1950 {
1951         bch2_trans_unlock(trans);
1952
1953         kfree(trans->mem);
1954         if (trans->used_mempool)
1955                 mempool_free(trans->iters, &trans->c->btree_iters_pool);
1956         else if (trans->iters != trans->iters_onstack)
1957                 kfree(trans->iters);
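	/* poison these so any use-after-exit faults immediately: */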
1958         trans->mem      = (void *) 0x1;
1959         trans->iters    = (void *) 0x1;
1960
1961         return trans->error ? -EIO : 0;
1962 }