libbcachefs/btree_iter.c (bcachefs-tools-debian, commit 146ad2f531abb356af80ba6e0f63127c281e047f)
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "extents.h"
13 #include "journal.h"
14
15 #include <linux/prefetch.h>
16 #include <trace/events/bcachefs.h>
17
18 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
19 {
20         return l < BTREE_MAX_DEPTH &&
21                 (unsigned long) iter->l[l].b >= 128;
22 }
23
24 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
25 {
26         struct bpos pos = iter->pos;
27
28         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
29             bkey_cmp(pos, POS_MAX))
30                 pos = bkey_successor(pos);
31         return pos;
32 }
33
34 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
35                                               struct btree *b)
36 {
37         return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
38 }
39
40 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
41                                              struct btree *b)
42 {
43         return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
44 }
45
46 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
47                                           struct btree *b)
48 {
49         return iter->btree_id == b->c.btree_id &&
50                 !btree_iter_pos_before_node(iter, b) &&
51                 !btree_iter_pos_after_node(iter, b);
52 }
53
54 /* Btree node locking: */
55
56 void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
57 {
58         bch2_btree_node_unlock_write_inlined(b, iter);
59 }
60
61 void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
62 {
63         struct btree_iter *linked;
64         unsigned readers = 0;
65
66         EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
67
68         trans_for_each_iter(iter->trans, linked)
69                 if (linked->l[b->c.level].b == b &&
70                     btree_node_read_locked(linked, b->c.level))
71                         readers++;
72
73         /*
74          * Must drop our read locks before calling six_lock_write() -
75          * six_unlock() won't do wakeups until the reader count
76          * goes to 0, and it's safe because we have the node intent
77          * locked:
78          */
79         atomic64_sub(__SIX_VAL(read_lock, readers),
80                      &b->c.lock.state.counter);
81         btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
82         atomic64_add(__SIX_VAL(read_lock, readers),
83                      &b->c.lock.state.counter);
84 }
85
86 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
87 {
88         struct btree *b = btree_iter_node(iter, level);
89         int want = __btree_lock_want(iter, level);
90
91         if (!is_btree_node(iter, level))
92                 return false;
93
94         if (race_fault())
95                 return false;
96
97         if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
98             (btree_node_lock_seq_matches(iter, b, level) &&
99              btree_node_lock_increment(iter->trans, b, level, want))) {
100                 mark_btree_node_locked(iter, level, want);
101                 return true;
102         } else {
103                 return false;
104         }
105 }
106
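/*
 * Upgrade the lock at @level to an intent lock - by upgrading a read lock we
 * already hold, by relocking with the saved lock sequence number, or by
 * taking a reference on another iterator's intent lock on the same node:
 */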
107 static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
108 {
109         struct btree *b = iter->l[level].b;
110
111         EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
112
113         if (!is_btree_node(iter, level))
114                 return false;
115
116         if (btree_node_intent_locked(iter, level))
117                 return true;
118
119         if (race_fault())
120                 return false;
121
122         if (btree_node_locked(iter, level)
123             ? six_lock_tryupgrade(&b->c.lock)
124             : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
125                 goto success;
126
127         if (btree_node_lock_seq_matches(iter, b, level) &&
128             btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
129                 btree_node_unlock(iter, level);
130                 goto success;
131         }
132
133         return false;
134 success:
135         mark_btree_node_intent_locked(iter, level);
136         return true;
137 }
138
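/*
 * Relock (or, if @upgrade, upgrade to intent) every node from iter->level up
 * to iter->locks_want; returns true if the iterator ends up fully locked:
 */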
139 static inline bool btree_iter_get_locks(struct btree_iter *iter,
140                                         bool upgrade, bool trace)
141 {
142         unsigned l = iter->level;
143         int fail_idx = -1;
144
145         do {
146                 if (!btree_iter_node(iter, l))
147                         break;
148
149                 if (!(upgrade
150                       ? bch2_btree_node_upgrade(iter, l)
151                       : bch2_btree_node_relock(iter, l))) {
152                         if (trace)
153                                 (upgrade
154                                  ? trace_node_upgrade_fail
155                                  : trace_node_relock_fail)(l, iter->l[l].lock_seq,
156                                                 is_btree_node(iter, l)
157                                                 ? 0
158                                                 : (unsigned long) iter->l[l].b,
159                                                 is_btree_node(iter, l)
160                                                 ? iter->l[l].b->c.lock.state.seq
161                                                 : 0);
162
163                         fail_idx = l;
164                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
165                 }
166
167                 l++;
168         } while (l < iter->locks_want);
169
170         /*
171          * When we fail to get a lock, we have to ensure that any child nodes
172          * can't be relocked so bch2_btree_iter_traverse has to walk back up to
173          * the node that we failed to relock:
174          */
175         while (fail_idx >= 0) {
176                 btree_node_unlock(iter, fail_idx);
177                 iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
178                 --fail_idx;
179         }
180
181         if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
182                 iter->uptodate = BTREE_ITER_NEED_PEEK;
183
184         bch2_btree_trans_verify_locks(iter->trans);
185
186         return iter->uptodate < BTREE_ITER_NEED_RELOCK;
187 }
188
189 static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
190                                   enum btree_iter_type type)
191 {
192         return  type != BTREE_ITER_CACHED
193                 ? container_of(_b, struct btree, c)->key.k.p
194                 : container_of(_b, struct bkey_cached, c)->key.pos;
195 }
196
197 /* Slowpath: */
198 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
199                             unsigned level, struct btree_iter *iter,
200                             enum six_lock_type type,
201                             six_lock_should_sleep_fn should_sleep_fn, void *p,
202                             unsigned long ip)
203 {
204         struct btree_trans *trans = iter->trans;
205         struct btree_iter *linked, *deadlock_iter = NULL;
206         u64 start_time = local_clock();
207         unsigned reason = 9;
208
209         /* Check if it's safe to block: */
210         trans_for_each_iter(trans, linked) {
211                 if (!linked->nodes_locked)
212                         continue;
213
214                 /*
215                  * Can't block taking an intent lock if we have _any_ nodes read
216                  * locked:
217                  *
218                  * - Our read lock blocks another thread with an intent lock on
219                  *   the same node from getting a write lock, and thus from
220                  *   dropping its intent lock
221                  *
222                  * - And the other thread may have multiple nodes intent locked:
223                  *   both the node we want to intent lock, and the node we
224                  *   already have read locked - deadlock:
225                  */
226                 if (type == SIX_LOCK_intent &&
227                     linked->nodes_locked != linked->nodes_intent_locked) {
228                         if (!(trans->nounlock)) {
229                                 linked->locks_want = max_t(unsigned,
230                                                 linked->locks_want,
231                                                 __fls(linked->nodes_locked) + 1);
232                                 if (!btree_iter_get_locks(linked, true, false)) {
233                                         deadlock_iter = linked;
234                                         reason = 1;
235                                 }
236                         } else {
237                                 deadlock_iter = linked;
238                                 reason = 2;
239                         }
240                 }
241
242                 if (linked->btree_id != iter->btree_id) {
243                         if (linked->btree_id > iter->btree_id) {
244                                 deadlock_iter = linked;
245                                 reason = 3;
246                         }
247                         continue;
248                 }
249
250                 /*
251                  * Within the same btree, cached iterators come before non
252                  * cached iterators:
253                  */
254                 if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
255                         if (btree_iter_is_cached(iter)) {
256                                 deadlock_iter = linked;
257                                 reason = 4;
258                         }
259                         continue;
260                 }
261
262                 /*
263                  * Interior nodes must be locked before their descendants: if
264                  * another iterator has locked what might be descendants of the
265                  * node we're about to lock, it must have the ancestors locked too:
266                  */
267                 if (level > __fls(linked->nodes_locked)) {
268                         if (!(trans->nounlock)) {
269                                 linked->locks_want =
270                                         max(level + 1, max_t(unsigned,
271                                             linked->locks_want,
272                                             iter->locks_want));
273                                 if (!btree_iter_get_locks(linked, true, false)) {
274                                         deadlock_iter = linked;
275                                         reason = 5;
276                                 }
277                         } else {
278                                 deadlock_iter = linked;
279                                 reason = 6;
280                         }
281                 }
282
283                 /* Must lock btree nodes in key order: */
284                 if (btree_node_locked(linked, level) &&
285                     bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
286                                                  btree_iter_type(linked))) <= 0) {
287                         deadlock_iter = linked;
288                         reason = 7;
289                 }
290
291                 /*
292                  * Recheck if this is a node we already have locked - since one
293                  * of the get_locks() calls might've successfully
294                  * upgraded/relocked it:
295                  */
296                 if (linked->l[level].b == b &&
297                     btree_node_locked_type(linked, level) >= type) {
298                         six_lock_increment(&b->c.lock, type);
299                         return true;
300                 }
301         }
302
303         if (unlikely(deadlock_iter)) {
304                 trace_trans_restart_would_deadlock(iter->trans->ip, ip,
305                                 reason,
306                                 deadlock_iter->btree_id,
307                                 btree_iter_type(deadlock_iter),
308                                 iter->btree_id,
309                                 btree_iter_type(iter));
310                 return false;
311         }
312
313         if (six_trylock_type(&b->c.lock, type))
314                 return true;
315
316         if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
317                 return false;
318
319         bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
320                                start_time);
321         return true;
322 }
323
324 /* Btree iterator locking: */
325
326 #ifdef CONFIG_BCACHEFS_DEBUG
327 static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
328 {
329         unsigned l;
330
331         if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
332                 BUG_ON(iter->nodes_locked);
333                 return;
334         }
335
336         for (l = 0; is_btree_node(iter, l); l++) {
337                 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
338                     !btree_node_locked(iter, l))
339                         continue;
340
341                 BUG_ON(btree_lock_want(iter, l) !=
342                        btree_node_locked_type(iter, l));
343         }
344 }
345
346 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
347 {
348         struct btree_iter *iter;
349
350         trans_for_each_iter(trans, iter)
351                 bch2_btree_iter_verify_locks(iter);
352 }
353 #else
354 static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
355 #endif
356
357 __flatten
358 bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
359 {
360         return btree_iter_get_locks(iter, false, trace);
361 }
362
363 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
364                                unsigned new_locks_want)
365 {
366         struct btree_iter *linked;
367
368         EBUG_ON(iter->locks_want >= new_locks_want);
369
370         iter->locks_want = new_locks_want;
371
372         if (btree_iter_get_locks(iter, true, true))
373                 return true;
374
375         /*
376          * Ancestor nodes must be locked before child nodes, so set locks_want
377          * on iterators that might lock ancestors before us to avoid getting
378          * -EINTR later:
379          */
380         trans_for_each_iter(iter->trans, linked)
381                 if (linked != iter &&
382                     linked->btree_id == iter->btree_id &&
383                     linked->locks_want < new_locks_want) {
384                         linked->locks_want = new_locks_want;
385                         btree_iter_get_locks(linked, true, false);
386                 }
387
388         return false;
389 }
390
391 bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
392                                         unsigned new_locks_want)
393 {
394         unsigned l = iter->level;
395
396         EBUG_ON(iter->locks_want >= new_locks_want);
397
398         iter->locks_want = new_locks_want;
399
400         do {
401                 if (!btree_iter_node(iter, l))
402                         break;
403
404                 if (!bch2_btree_node_upgrade(iter, l)) {
405                         iter->locks_want = l;
406                         return false;
407                 }
408
409                 l++;
410         } while (l < iter->locks_want);
411
412         return true;
413 }
414
415 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
416                                  unsigned downgrade_to)
417 {
418         unsigned l, new_locks_want = downgrade_to ?:
419                 (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
420
421         if (iter->locks_want < downgrade_to) {
422                 iter->locks_want = new_locks_want;
423
424                 while (iter->nodes_locked &&
425                        (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
426                         if (l > iter->level) {
427                                 btree_node_unlock(iter, l);
428                         } else {
429                                 if (btree_node_intent_locked(iter, l)) {
430                                         six_lock_downgrade(&iter->l[l].b->c.lock);
431                                         iter->nodes_intent_locked ^= 1 << l;
432                                 }
433                                 break;
434                         }
435                 }
436         }
437
438         bch2_btree_trans_verify_locks(iter->trans);
439 }
440
441 void bch2_trans_downgrade(struct btree_trans *trans)
442 {
443         struct btree_iter *iter;
444
445         trans_for_each_iter(trans, iter)
446                 bch2_btree_iter_downgrade(iter);
447 }
448
449 /* Btree transaction locking: */
450
451 bool bch2_trans_relock(struct btree_trans *trans)
452 {
453         struct btree_iter *iter;
454         bool ret = true;
455
456         trans_for_each_iter(trans, iter)
457                 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
458                         ret &= bch2_btree_iter_relock(iter, true);
459
460         return ret;
461 }
462
463 void bch2_trans_unlock(struct btree_trans *trans)
464 {
465         struct btree_iter *iter;
466
467         trans_for_each_iter(trans, iter)
468                 __bch2_btree_iter_unlock(iter);
469 }
470
471 /* Btree iterator: */
472
473 #ifdef CONFIG_BCACHEFS_DEBUG
474
475 static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
476 {
477         struct bkey_cached *ck;
478         bool locked = btree_node_locked(iter, 0);
479
480         if (!bch2_btree_node_relock(iter, 0))
481                 return;
482
483         ck = (void *) iter->l[0].b;
484         BUG_ON(ck->key.btree_id != iter->btree_id ||
485                bkey_cmp(ck->key.pos, iter->pos));
486
487         if (!locked)
488                 btree_node_unlock(iter, 0);
489 }
490
491 static void bch2_btree_iter_verify_level(struct btree_iter *iter,
492                                          unsigned level)
493 {
494         struct bpos pos = btree_iter_search_key(iter);
495         struct btree_iter_level *l = &iter->l[level];
496         struct btree_node_iter tmp = l->iter;
497         bool locked = btree_node_locked(iter, level);
498         struct bkey_packed *p, *k;
499         char buf1[100], buf2[100];
500         const char *msg;
501
502         if (!bch2_debug_check_iterators)
503                 return;
504
505         if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
506                 if (!level)
507                         bch2_btree_iter_verify_cached(iter);
508                 return;
509         }
510
511         BUG_ON(iter->level < iter->min_depth);
512
513         if (!btree_iter_node(iter, level))
514                 return;
515
516         if (!bch2_btree_node_relock(iter, level))
517                 return;
518
519         BUG_ON(!btree_iter_pos_in_node(iter, l->b));
520
521         /*
522          * node iterators don't use leaf node iterator:
523          */
524         if (btree_iter_type(iter) == BTREE_ITER_NODES &&
525             level <= iter->min_depth)
526                 goto unlock;
527
528         bch2_btree_node_iter_verify(&l->iter, l->b);
529
530         /*
531          * For interior nodes, the iterator will have skipped past
532          * deleted keys:
533          *
534          * For extents, the iterator may have skipped past deleted keys (but not
535          * whiteouts)
536          */
537         p = level || btree_node_type_is_extents(iter->btree_id)
538                 ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard)
539                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
540         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
541
542         if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
543                 msg = "before";
544                 goto err;
545         }
546
547         if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
548                 msg = "after";
549                 goto err;
550         }
551 unlock:
552         if (!locked)
553                 btree_node_unlock(iter, level);
554         return;
555 err:
556         strcpy(buf1, "(none)");
557         strcpy(buf2, "(none)");
558
559         if (p) {
560                 struct bkey uk = bkey_unpack_key(l->b, p);
561                 bch2_bkey_to_text(&PBUF(buf1), &uk);
562         }
563
564         if (k) {
565                 struct bkey uk = bkey_unpack_key(l->b, k);
566                 bch2_bkey_to_text(&PBUF(buf2), &uk);
567         }
568
569         panic("iterator should be %s key at level %u:\n"
570               "iter pos %s %llu:%llu\n"
571               "prev key %s\n"
572               "cur  key %s\n",
573               msg, level,
574               iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
575               iter->pos.inode, iter->pos.offset,
576               buf1, buf2);
577 }
578
579 static void bch2_btree_iter_verify(struct btree_iter *iter)
580 {
581         unsigned i;
582
583         bch2_btree_trans_verify_locks(iter->trans);
584
585         for (i = 0; i < BTREE_MAX_DEPTH; i++)
586                 bch2_btree_iter_verify_level(iter, i);
587 }
588
589 void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
590 {
591         struct btree_iter *iter;
592
593         if (!bch2_debug_check_iterators)
594                 return;
595
596         trans_for_each_iter_with_node(trans, b, iter)
597                 bch2_btree_iter_verify_level(iter, b->c.level);
598 }
599
600 #else
601
602 static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
603 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
604
605 #endif
606
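/*
 * Repoint the node iterator's entry for bset @t at key @k, adding a new entry
 * if the iterator doesn't currently have one for @t:
 */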
607 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
608                                         struct btree *b,
609                                         struct bset_tree *t,
610                                         struct bkey_packed *k)
611 {
612         struct btree_node_iter_set *set;
613
614         btree_node_iter_for_each(iter, set)
615                 if (set->end == t->end_offset) {
616                         set->k = __btree_node_key_to_offset(b, k);
617                         bch2_btree_node_iter_sort(iter, b);
618                         return;
619                 }
620
621         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
622 }
623
624 static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
625                                                struct btree *b,
626                                                struct bkey_packed *where)
627 {
628         struct btree_iter_level *l = &iter->l[b->c.level];
629         struct bpos pos = btree_iter_search_key(iter);
630
631         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
632                 return;
633
634         if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
635                 bch2_btree_node_iter_advance(&l->iter, l->b);
636
637         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
638 }
639
640 void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
641                                       struct btree *b,
642                                       struct bkey_packed *where)
643 {
644         struct btree_iter *linked;
645
646         trans_for_each_iter_with_node(iter->trans, b, linked) {
647                 __bch2_btree_iter_fix_key_modified(linked, b, where);
648                 bch2_btree_iter_verify_level(linked, b->c.level);
649         }
650 }
651
652 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
653                                       struct btree *b,
654                                       struct btree_node_iter *node_iter,
655                                       struct bset_tree *t,
656                                       struct bkey_packed *where,
657                                       unsigned clobber_u64s,
658                                       unsigned new_u64s)
659 {
660         const struct bkey_packed *end = btree_bkey_last(b, t);
661         struct btree_node_iter_set *set;
662         unsigned offset = __btree_node_key_to_offset(b, where);
663         int shift = new_u64s - clobber_u64s;
664         unsigned old_end = t->end_offset - shift;
665         unsigned orig_iter_pos = node_iter->data[0].k;
666         bool iter_current_key_modified =
667                 orig_iter_pos >= offset &&
668                 orig_iter_pos <= offset + clobber_u64s;
669         struct bpos iter_pos = btree_iter_search_key(iter);
670
671         btree_node_iter_for_each(node_iter, set)
672                 if (set->end == old_end)
673                         goto found;
674
675         /* didn't find the bset in the iterator - might have to re-add it: */
676         if (new_u64s &&
677             bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
678                 bch2_btree_node_iter_push(node_iter, b, where, end);
679                 goto fixup_done;
680         } else {
681                 /* Iterator is after key that changed */
682                 return;
683         }
684 found:
685         set->end = t->end_offset;
686
687         /* Iterator hasn't gotten to the key that changed yet: */
688         if (set->k < offset)
689                 return;
690
691         if (new_u64s &&
692             bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
693                 set->k = offset;
694         } else if (set->k < offset + clobber_u64s) {
695                 set->k = offset + new_u64s;
696                 if (set->k == set->end)
697                         bch2_btree_node_iter_set_drop(node_iter, set);
698         } else {
699                 /* Iterator is after key that changed */
700                 set->k = (int) set->k + shift;
701                 return;
702         }
703
704         bch2_btree_node_iter_sort(node_iter, b);
705 fixup_done:
706         if (node_iter->data[0].k != orig_iter_pos)
707                 iter_current_key_modified = true;
708
709         /*
710          * When a new key is added, and the node iterator now points to that
711          * key, the iterator might have skipped past deleted keys that should
712          * come after the key the iterator now points to. We have to rewind to
713          * before those deleted keys - otherwise
714          * bch2_btree_node_iter_prev_all() breaks:
715          */
716         if (!bch2_btree_node_iter_end(node_iter) &&
717             iter_current_key_modified &&
718             (b->c.level ||
719              btree_node_type_is_extents(iter->btree_id))) {
720                 struct bset_tree *t;
721                 struct bkey_packed *k, *k2, *p;
722
723                 k = bch2_btree_node_iter_peek_all(node_iter, b);
724
725                 for_each_bset(b, t) {
726                         bool set_pos = false;
727
728                         if (node_iter->data[0].end == t->end_offset)
729                                 continue;
730
731                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
732
733                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
734                                bkey_iter_cmp(b, k, p) < 0) {
735                                 k2 = p;
736                                 set_pos = true;
737                         }
738
739                         if (set_pos)
740                                 btree_node_iter_set_set_pos(node_iter,
741                                                             b, t, k2);
742                 }
743         }
744
745         if (!b->c.level &&
746             node_iter == &iter->l[0].iter &&
747             iter_current_key_modified)
748                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
749 }
750
751 void bch2_btree_node_iter_fix(struct btree_iter *iter,
752                               struct btree *b,
753                               struct btree_node_iter *node_iter,
754                               struct bkey_packed *where,
755                               unsigned clobber_u64s,
756                               unsigned new_u64s)
757 {
758         struct bset_tree *t = bch2_bkey_to_bset(b, where);
759         struct btree_iter *linked;
760
761         if (node_iter != &iter->l[b->c.level].iter) {
762                 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
763                                            where, clobber_u64s, new_u64s);
764
765                 if (bch2_debug_check_iterators)
766                         bch2_btree_node_iter_verify(node_iter, b);
767         }
768
769         trans_for_each_iter_with_node(iter->trans, b, linked) {
770                 __bch2_btree_node_iter_fix(linked, b,
771                                            &linked->l[b->c.level].iter, t,
772                                            where, clobber_u64s, new_u64s);
773                 bch2_btree_iter_verify_level(linked, b->c.level);
774         }
775 }
776
777 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
778                                                   struct btree_iter_level *l,
779                                                   struct bkey *u,
780                                                   struct bkey_packed *k)
781 {
782         struct bkey_s_c ret;
783
784         if (unlikely(!k)) {
785                 /*
786                  * signal to bch2_btree_iter_peek_slot() that we're currently at
787                  * a hole
788                  */
789                 u->type = KEY_TYPE_deleted;
790                 return bkey_s_c_null;
791         }
792
793         ret = bkey_disassemble(l->b, k, u);
794
795         if (bch2_debug_check_bkeys)
796                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
797
798         return ret;
799 }
800
801 /* peek_all() doesn't skip deleted keys */
802 static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
803                                                     struct btree_iter_level *l,
804                                                     struct bkey *u)
805 {
806         return __btree_iter_unpack(iter, l, u,
807                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
808 }
809
810 static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
811                                                 struct btree_iter_level *l)
812 {
813         return __btree_iter_unpack(iter, l, &iter->k,
814                         bch2_btree_node_iter_peek(&l->iter, l->b));
815 }
816
817 static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
818                                                 struct btree_iter_level *l)
819 {
820         return __btree_iter_unpack(iter, l, &iter->k,
821                         bch2_btree_node_iter_prev(&l->iter, l->b));
822 }
823
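/*
 * Advance the node iterator past keys that compare before the iterator's
 * search key; returns false (without finishing) if that would take more than
 * @max_advance steps, so the caller can reinitialize the node iterator
 * instead (@max_advance <= 0 means no limit):
 */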
824 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
825                                              struct btree_iter_level *l,
826                                              int max_advance)
827 {
828         struct bpos pos = btree_iter_search_key(iter);
829         struct bkey_packed *k;
830         int nr_advanced = 0;
831
832         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
833                bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
834                 if (max_advance > 0 && nr_advanced >= max_advance)
835                         return false;
836
837                 bch2_btree_node_iter_advance(&l->iter, l->b);
838                 nr_advanced++;
839         }
840
841         return true;
842 }
843
844 /*
845  * Verify that iterator for parent node points to child node:
846  */
847 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
848 {
849         struct btree_iter_level *l;
850         unsigned plevel;
851         bool parent_locked;
852         struct bkey_packed *k;
853
854         if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
855                 return;
856
857         plevel = b->c.level + 1;
858         if (!btree_iter_node(iter, plevel))
859                 return;
860
861         parent_locked = btree_node_locked(iter, plevel);
862
863         if (!bch2_btree_node_relock(iter, plevel))
864                 return;
865
866         l = &iter->l[plevel];
867         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
868         if (!k ||
869             bkey_deleted(k) ||
870             bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
871                 char buf[100];
872                 struct bkey uk = bkey_unpack_key(b, k);
873
874                 bch2_dump_btree_node(iter->trans->c, l->b);
875                 bch2_bkey_to_text(&PBUF(buf), &uk);
876                 panic("parent iter doesn't point to new node:\n"
877                       "iter pos %s %llu:%llu\n"
878                       "iter key %s\n"
879                       "new node %llu:%llu-%llu:%llu\n",
880                       bch2_btree_ids[iter->btree_id],
881                       iter->pos.inode,
882                       iter->pos.offset,
883                       buf,
884                       b->data->min_key.inode,
885                       b->data->min_key.offset,
886                       b->key.k.p.inode, b->key.k.p.offset);
887         }
888
889         if (!parent_locked)
890                 btree_node_unlock(iter, b->c.level + 1);
891 }
892
893 static inline void __btree_iter_init(struct btree_iter *iter,
894                                      unsigned level)
895 {
896         struct bpos pos = btree_iter_search_key(iter);
897         struct btree_iter_level *l = &iter->l[level];
898
899         bch2_btree_node_iter_init(&l->iter, l->b, &pos);
900
901         /*
902          * Iterators to interior nodes should always be pointed at the first non
903          * whiteout:
904          */
905         if (level)
906                 bch2_btree_node_iter_peek(&l->iter, l->b);
907
908         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
909 }
910
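/*
 * Point the iterator at node @b: save the lock sequence number and initialize
 * this level's node iterator:
 */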
911 static inline void btree_iter_node_set(struct btree_iter *iter,
912                                        struct btree *b)
913 {
914         BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
915
916         btree_iter_verify_new_node(iter, b);
917
918         EBUG_ON(!btree_iter_pos_in_node(iter, b));
919         EBUG_ON(b->c.lock.state.seq & 1);
920
921         iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
922         iter->l[b->c.level].b = b;
923         __btree_iter_init(iter, b->c.level);
924 }
925
926 /*
927  * A btree node is being replaced - update the iterator to point to the new
928  * node:
929  */
930 void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
931 {
932         enum btree_node_locked_type t;
933         struct btree_iter *linked;
934
935         trans_for_each_iter(iter->trans, linked)
936                 if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
937                     btree_iter_pos_in_node(linked, b)) {
938                         /*
939                          * bch2_btree_iter_node_drop() has already been called -
940                          * the old node we're replacing has already been
941                          * unlocked and the pointer invalidated
942                          */
943                         BUG_ON(btree_node_locked(linked, b->c.level));
944
945                         t = btree_lock_want(linked, b->c.level);
946                         if (t != BTREE_NODE_UNLOCKED) {
947                                 six_lock_increment(&b->c.lock, t);
948                                 mark_btree_node_locked(linked, b->c.level, t);
949                         }
950
951                         btree_iter_node_set(linked, b);
952                 }
953 }
954
955 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
956 {
957         struct btree_iter *linked;
958         unsigned level = b->c.level;
959
960         trans_for_each_iter(iter->trans, linked)
961                 if (linked->l[level].b == b) {
962                         __btree_node_unlock(linked, level);
963                         linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
964                 }
965 }
966
967 /*
968  * A btree node has been modified in such a way as to invalidate iterators - fix
969  * them:
970  */
971 void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
972 {
973         struct btree_iter *linked;
974
975         trans_for_each_iter_with_node(iter->trans, b, linked)
976                 __btree_iter_init(linked, b->c.level);
977 }
978
979 static int lock_root_check_fn(struct six_lock *lock, void *p)
980 {
981         struct btree *b = container_of(lock, struct btree, c.lock);
982         struct btree **rootp = p;
983
984         return b == *rootp ? 0 : -1;
985 }
986
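/*
 * Lock the btree root. The root may be freed or replaced while we wait for
 * the lock, so after taking it we check that @b is still the current root and
 * retry if not. Returns 1 if the root is shallower than @depth_want, i.e. the
 * iterator is at the end of the btree:
 */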
987 static inline int btree_iter_lock_root(struct btree_iter *iter,
988                                        unsigned depth_want,
989                                        unsigned long trace_ip)
990 {
991         struct bch_fs *c = iter->trans->c;
992         struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
993         enum six_lock_type lock_type;
994         unsigned i;
995
996         EBUG_ON(iter->nodes_locked);
997
998         while (1) {
999                 b = READ_ONCE(*rootp);
1000                 iter->level = READ_ONCE(b->c.level);
1001
1002                 if (unlikely(iter->level < depth_want)) {
1003                         /*
1004                          * the root is at a lower depth than the depth we want:
1005                          * got to the end of the btree, or we're walking nodes
1006                          * greater than some depth and there are no nodes >=
1007                          * that depth
1008                          */
1009                         iter->level = depth_want;
1010                         for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
1011                                 iter->l[i].b = NULL;
1012                         return 1;
1013                 }
1014
1015                 lock_type = __btree_lock_want(iter, iter->level);
1016                 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
1017                                               iter, lock_type,
1018                                               lock_root_check_fn, rootp,
1019                                               trace_ip)))
1020                         return -EINTR;
1021
1022                 if (likely(b == READ_ONCE(*rootp) &&
1023                            b->c.level == iter->level &&
1024                            !race_fault())) {
1025                         for (i = 0; i < iter->level; i++)
1026                                 iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
1027                         iter->l[iter->level].b = b;
1028                         for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
1029                                 iter->l[i].b = NULL;
1030
1031                         mark_btree_node_locked(iter, iter->level, lock_type);
1032                         btree_iter_node_set(iter, b);
1033                         return 0;
1034                 }
1035
1036                 six_unlock_type(&b->c.lock, lock_type);
1037         }
1038 }
1039
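/*
 * After descending into a child node, prefetch the next few children of the
 * parent (siblings of the node we just descended into), using a copy of the
 * parent's node iterator:
 */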
1040 noinline
1041 static void btree_iter_prefetch(struct btree_iter *iter)
1042 {
1043         struct bch_fs *c = iter->trans->c;
1044         struct btree_iter_level *l = &iter->l[iter->level];
1045         struct btree_node_iter node_iter = l->iter;
1046         struct bkey_packed *k;
1047         struct bkey_buf tmp;
1048         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
1049                 ? (iter->level > 1 ? 0 :  2)
1050                 : (iter->level > 1 ? 1 : 16);
1051         bool was_locked = btree_node_locked(iter, iter->level);
1052
1053         bch2_bkey_buf_init(&tmp);
1054
1055         while (nr) {
1056                 if (!bch2_btree_node_relock(iter, iter->level))
1057                         break;
1058
1059                 bch2_btree_node_iter_advance(&node_iter, l->b);
1060                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
1061                 if (!k)
1062                         break;
1063
1064                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
1065                 bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
1066                                          iter->level - 1);
1067         }
1068
1069         if (!was_locked)
1070                 btree_node_unlock(iter, iter->level);
1071
1072         bch2_bkey_buf_exit(&tmp, c);
1073 }
1074
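/*
 * Stash the in-memory address of @b in the parent's btree_ptr_v2 key
 * (->mem_ptr), so later traversals can find the node without a btree cache
 * lookup:
 */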
1075 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
1076                                             unsigned plevel, struct btree *b)
1077 {
1078         struct btree_iter_level *l = &iter->l[plevel];
1079         bool locked = btree_node_locked(iter, plevel);
1080         struct bkey_packed *k;
1081         struct bch_btree_ptr_v2 *bp;
1082
1083         if (!bch2_btree_node_relock(iter, plevel))
1084                 return;
1085
1086         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1087         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
1088
1089         bp = (void *) bkeyp_val(&l->b->format, k);
1090         bp->mem_ptr = (unsigned long)b;
1091
1092         if (!locked)
1093                 btree_node_unlock(iter, plevel);
1094 }
1095
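/*
 * Descend one level: unpack the child pointer the parent's node iterator
 * points at, get and lock that child node, and point the iterator at it:
 */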
1096 static __always_inline int btree_iter_down(struct btree_iter *iter,
1097                                            unsigned long trace_ip)
1098 {
1099         struct bch_fs *c = iter->trans->c;
1100         struct btree_iter_level *l = &iter->l[iter->level];
1101         struct btree *b;
1102         unsigned level = iter->level - 1;
1103         enum six_lock_type lock_type = __btree_lock_want(iter, level);
1104         struct bkey_buf tmp;
1105         int ret;
1106
1107         EBUG_ON(!btree_node_locked(iter, iter->level));
1108
1109         bch2_bkey_buf_init(&tmp);
1110         bch2_bkey_buf_unpack(&tmp, c, l->b,
1111                          bch2_btree_node_iter_peek(&l->iter, l->b));
1112
1113         b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
1114         ret = PTR_ERR_OR_ZERO(b);
1115         if (unlikely(ret))
1116                 goto err;
1117
1118         mark_btree_node_locked(iter, level, lock_type);
1119         btree_iter_node_set(iter, b);
1120
1121         if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
1122             unlikely(b != btree_node_mem_ptr(tmp.k)))
1123                 btree_node_mem_ptr_set(iter, level + 1, b);
1124
1125         if (iter->flags & BTREE_ITER_PREFETCH)
1126                 btree_iter_prefetch(iter);
1127
1128         iter->level = level;
1129 err:
1130         bch2_bkey_buf_exit(&tmp, c);
1131         return ret;
1132 }
1133
1134 static void btree_iter_up(struct btree_iter *iter)
1135 {
1136         btree_node_unlock(iter, iter->level++);
1137 }
1138
1139 static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
1140
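/*
 * Transaction restart path: unlock everything, then retraverse every iterator
 * in lock order (sorted with btree_iter_lock_cmp()) so that we can't
 * deadlock:
 */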
1141 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
1142 {
1143         struct bch_fs *c = trans->c;
1144         struct btree_iter *iter;
1145         u8 sorted[BTREE_ITER_MAX];
1146         unsigned i, nr_sorted = 0;
1147
1148         if (trans->in_traverse_all)
1149                 return -EINTR;
1150
1151         trans->in_traverse_all = true;
1152 retry_all:
1153         nr_sorted = 0;
1154
1155         trans_for_each_iter(trans, iter)
1156                 sorted[nr_sorted++] = iter->idx;
1157
1158 #define btree_iter_cmp_by_idx(_l, _r)                           \
1159                 btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
1160
1161         bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
1162 #undef btree_iter_cmp_by_idx
1163         bch2_trans_unlock(trans);
1164         cond_resched();
1165
1166         if (unlikely(ret == -ENOMEM)) {
1167                 struct closure cl;
1168
1169                 closure_init_stack(&cl);
1170
1171                 do {
1172                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1173                         closure_sync(&cl);
1174                 } while (ret);
1175         }
1176
1177         if (unlikely(ret == -EIO)) {
1178                 trans->error = true;
1179                 goto out;
1180         }
1181
1182         BUG_ON(ret && ret != -EINTR);
1183
1184         /* Now, redo traversals in correct order: */
1185         for (i = 0; i < nr_sorted; i++) {
1186                 unsigned idx = sorted[i];
1187
1188                 /*
1189                  * successfully traversing one iterator can cause another to be
1190                  * unlinked, in btree_key_cache_fill()
1191                  */
1192                 if (!(trans->iters_linked & (1ULL << idx)))
1193                         continue;
1194
1195                 ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
1196                 if (ret)
1197                         goto retry_all;
1198         }
1199
1200         if (hweight64(trans->iters_live) > 1)
1201                 ret = -EINTR;
1202         else
1203                 trans_for_each_iter(trans, iter)
1204                         if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
1205                                 ret = -EINTR;
1206                                 break;
1207                         }
1208 out:
1209         bch2_btree_cache_cannibalize_unlock(c);
1210
1211         trans->in_traverse_all = false;
1212         return ret;
1213 }
1214
1215 int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1216 {
1217         return __btree_iter_traverse_all(trans, 0);
1218 }
1219
1220 static inline bool btree_iter_good_node(struct btree_iter *iter,
1221                                         unsigned l, int check_pos)
1222 {
1223         if (!is_btree_node(iter, l) ||
1224             !bch2_btree_node_relock(iter, l))
1225                 return false;
1226
1227         if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
1228                 return false;
1229         if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
1230                 return false;
1231         return true;
1232 }
1233
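/*
 * Walk up, dropping locks, until we're at a node that can still be relocked
 * and that covers the iterator's search position (in the direction given by
 * @check_pos):
 */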
1234 static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
1235                                                      int check_pos)
1236 {
1237         unsigned l = iter->level;
1238
1239         while (btree_iter_node(iter, l) &&
1240                !btree_iter_good_node(iter, l, check_pos)) {
1241                 btree_node_unlock(iter, l);
1242                 iter->l[l].b = BTREE_ITER_NO_NODE_UP;
1243                 l++;
1244         }
1245
1246         return l;
1247 }
1248
1249 /*
1250  * This is the main state machine for walking down the btree - walks down to a
1251  * specified depth
1252  *
1253  * Returns 0 on success, -EIO on error (error reading in a btree node).
1254  *
1255  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1256  * stashed in the iterator and returned from bch2_trans_exit().
1257  */
1258 static int btree_iter_traverse_one(struct btree_iter *iter,
1259                                    unsigned long trace_ip)
1260 {
1261         unsigned depth_want = iter->level;
1262
1263         /*
1264          * if we need interior nodes locked, call btree_iter_relock() to make
1265          * sure we walk back up enough that we lock them:
1266          */
1267         if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
1268             iter->locks_want > 1)
1269                 bch2_btree_iter_relock(iter, false);
1270
1271         if (btree_iter_type(iter) == BTREE_ITER_CACHED)
1272                 return bch2_btree_iter_traverse_cached(iter);
1273
1274         if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
1275                 return 0;
1276
1277         if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1278                 return 0;
1279
1280         /*
1281          * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1282          * here unnecessary
1283          */
1284         iter->level = btree_iter_up_until_good_node(iter, 0);
1285
1286         /*
1287          * If we've got a btree node locked (i.e. we aren't about to relock the
1288          * root) - advance its node iterator if necessary:
1289          *
1290          * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1291          */
1292         if (is_btree_node(iter, iter->level)) {
1293                 BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));
1294
1295                 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1296         }
1297
1298         /*
1299          * Note: iter->nodes[iter->level] may be temporarily NULL here - that
1300          * would indicate to other code that we got to the end of the btree,
1301          * here it indicates that relocking the root failed - it's critical that
1302          * btree_iter_lock_root() comes next and that it can't fail
1303          */
1304         while (iter->level > depth_want) {
1305                 int ret = btree_iter_node(iter, iter->level)
1306                         ? btree_iter_down(iter, trace_ip)
1307                         : btree_iter_lock_root(iter, depth_want, trace_ip);
1308                 if (unlikely(ret)) {
1309                         if (ret == 1)
1310                                 return 0;
1311
1312                         iter->level = depth_want;
1313
1314                         if (ret == -EIO) {
1315                                 iter->flags |= BTREE_ITER_ERROR;
1316                                 iter->l[iter->level].b =
1317                                         BTREE_ITER_NO_NODE_ERROR;
1318                         } else {
1319                                 iter->l[iter->level].b =
1320                                         BTREE_ITER_NO_NODE_DOWN;
1321                         }
1322                         return ret;
1323                 }
1324         }
1325
1326         iter->uptodate = BTREE_ITER_NEED_PEEK;
1327
1328         bch2_btree_iter_verify(iter);
1329         return 0;
1330 }
1331
1332 int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1333 {
1334         struct btree_trans *trans = iter->trans;
1335         int ret;
1336
1337         ret =   bch2_trans_cond_resched(trans) ?:
1338                 btree_iter_traverse_one(iter, _RET_IP_);
1339         if (unlikely(ret))
1340                 ret = __btree_iter_traverse_all(trans, ret);
1341
1342         return ret;
1343 }
1344
1345 static inline void bch2_btree_iter_checks(struct btree_iter *iter)
1346 {
1347         enum btree_iter_type type = btree_iter_type(iter);
1348
1349         EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1350
1351         BUG_ON((type == BTREE_ITER_KEYS ||
1352                 type == BTREE_ITER_CACHED) &&
1353                (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
1354                 bkey_cmp(iter->pos, iter->k.p) > 0));
1355
1356         bch2_btree_iter_verify_locks(iter);
1357         bch2_btree_iter_verify_level(iter, iter->level);
1358 }
1359
1360 /* Iterate across nodes (leaf and interior nodes) */
1361
1362 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1363 {
1364         struct btree *b;
1365         int ret;
1366
1367         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1368         bch2_btree_iter_checks(iter);
1369
1370         if (iter->uptodate == BTREE_ITER_UPTODATE)
1371                 return iter->l[iter->level].b;
1372
1373         ret = bch2_btree_iter_traverse(iter);
1374         if (ret)
1375                 return NULL;
1376
1377         b = btree_iter_node(iter, iter->level);
1378         if (!b)
1379                 return NULL;
1380
1381         BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1382
1383         iter->pos = b->key.k.p;
1384         iter->uptodate = BTREE_ITER_UPTODATE;
1385
1386         bch2_btree_iter_verify(iter);
1387
1388         return b;
1389 }
1390
1391 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1392 {
1393         struct btree *b;
1394         int ret;
1395
1396         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1397         bch2_btree_iter_checks(iter);
1398
1399         /* already got to end? */
1400         if (!btree_iter_node(iter, iter->level))
1401                 return NULL;
1402
1403         bch2_trans_cond_resched(iter->trans);
1404
1405         btree_iter_up(iter);
1406
1407         if (!bch2_btree_node_relock(iter, iter->level))
1408                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1409
1410         ret = bch2_btree_iter_traverse(iter);
1411         if (ret)
1412                 return NULL;
1413
1414         /* got to end? */
1415         b = btree_iter_node(iter, iter->level);
1416         if (!b)
1417                 return NULL;
1418
1419         if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1420                 /*
1421                  * Haven't gotten to the end of the parent node: go back down to
1422                  * the next child node
1423                  */
1424
1425                 /*
1426                  * We don't really want to be unlocking here, but the only
1427                  * way to tell btree_iter_traverse() "traverse to this level"
1428                  * is by setting iter->level - so we have to unlock so we
1429                  * don't screw up our lock invariants:
1430                  */
1431                 if (btree_node_read_locked(iter, iter->level))
1432                         btree_node_unlock(iter, iter->level);
1433
1434                 iter->pos       = bkey_successor(iter->pos);
1435                 iter->level     = iter->min_depth;
1436
1437                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1438                 ret = bch2_btree_iter_traverse(iter);
1439                 if (ret)
1440                         return NULL;
1441
1442                 b = iter->l[iter->level].b;
1443         }
1444
1445         iter->pos = b->key.k.p;
1446         iter->uptodate = BTREE_ITER_UPTODATE;
1447
1448         bch2_btree_iter_verify(iter);
1449
1450         return b;
1451 }
1452
1453 /* Iterate across keys (in leaf nodes only) */
1454
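/*
 * The iterator's position just moved in direction @cmp: walk up to a node
 * that still covers the new position, advance or reinitialize the node
 * iterator, and mark the iterator as needing a peek or a full traverse:
 */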
1455 static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
1456 {
1457         unsigned l = iter->level;
1458
1459         if (!cmp)
1460                 goto out;
1461
1462         if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
1463                 btree_node_unlock(iter, 0);
1464                 iter->l[0].b = BTREE_ITER_NO_NODE_UP;
1465                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1466                 return;
1467         }
1468
1469         l = btree_iter_up_until_good_node(iter, cmp);
1470
1471         if (btree_iter_node(iter, l)) {
1472                 /*
1473                  * We might have to skip over many keys, or just a few: try
1474                  * advancing the node iterator, and if we have to skip over too
1475                  * many keys just reinit it (or if we're rewinding, since that
1476                  * is expensive).
1477                  */
1478                 if (cmp < 0 ||
1479                     !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
1480                         __btree_iter_init(iter, l);
1481
1482                 /* Don't leave it locked if we're not supposed to: */
1483                 if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
1484                         btree_node_unlock(iter, l);
1485         }
1486 out:
1487         if (l != iter->level)
1488                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1489         else
1490                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1491 }
1492
1493 void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
1494                                bool strictly_greater)
1495 {
1496         struct bpos old = btree_iter_search_key(iter);
1497         int cmp;
1498
1499         iter->flags &= ~BTREE_ITER_IS_EXTENTS;
1500         iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
1501
1502         bkey_init(&iter->k);
1503         iter->k.p = iter->pos = new_pos;
1504
1505         cmp = bkey_cmp(btree_iter_search_key(iter), old);
1506
1507         btree_iter_pos_changed(iter, cmp);
1508 }
1509
1510 void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1511 {
1512         int cmp = bkey_cmp(new_pos, iter->pos);
1513
1514         bkey_init(&iter->k);
1515         iter->k.p = iter->pos = new_pos;
1516
1517         btree_iter_pos_changed(iter, cmp);
1518 }
1519
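/*
 * Editor's note: an illustrative sketch, not part of the original file,
 * of how callers typically reposition an existing iterator and re-peek.
 * bch2_btree_iter_peek() is defined further down in this file and
 * declared in btree_iter.h.
 */
static struct bkey_s_c __maybe_unused
example_peek_at(struct btree_iter *iter, struct bpos pos)
{
        /*
         * Repositioning is cheap when the new position is close to the
         * old one: btree_iter_pos_changed() only walks back up the tree
         * as far as it has to, and tries to advance the node iterator in
         * place before reinitializing it.
         */
        bch2_btree_iter_set_pos(iter, pos);

        return bch2_btree_iter_peek(iter);
}
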
1520 static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter)
1521 {
1522         struct bpos pos = iter->k.p;
1523
1524         if (unlikely(!bkey_cmp(pos, POS_MAX)))
1525                 return false;
1526
1527         if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1528                 pos = bkey_successor(pos);
1529         bch2_btree_iter_set_pos(iter, pos);
1530         return true;
1531 }
1532
1533 static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter)
1534 {
1535         struct bpos pos = bkey_start_pos(&iter->k);
1536
1537         if (unlikely(!bkey_cmp(pos, POS_MIN)))
1538                 return false;
1539
1540         if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1541                 pos = bkey_predecessor(pos);
1542         bch2_btree_iter_set_pos(iter, pos);
1543         return true;
1544 }
1545
1546 static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
1547 {
1548         struct bpos next_pos = iter->l[0].b->key.k.p;
1549         bool ret = bkey_cmp(next_pos, POS_MAX) != 0;
1550
1551         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1552                 next_pos = bkey_successor(next_pos);
1553
1554         bch2_btree_iter_set_pos(iter, next_pos);
1555         return ret;
1556 }
1557
1558 static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
1559 {
1560         struct bpos next_pos = iter->l[0].b->data->min_key;
1561         bool ret = bkey_cmp(next_pos, POS_MIN) != 0;
1562
1563         if (ret) {
1564                 next_pos = bkey_predecessor(next_pos);
1565
1566                 if (iter->flags & BTREE_ITER_IS_EXTENTS)
1567                         next_pos = bkey_predecessor(next_pos);
1568         }
1569
1570         bch2_btree_iter_set_pos(iter, next_pos);
1571         return ret;
1572 }
1573
1574 /**
1575  * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key
1576  * it currently points to
1577  */
1578 static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1579 {
1580         struct btree_iter_level *l = &iter->l[0];
1581         struct bkey_s_c ret = { .k = &iter->k };
1582
1583         if (!bkey_deleted(&iter->k)) {
1584                 struct bkey_packed *_k =
1585                         __bch2_btree_node_iter_peek_all(&l->iter, l->b);
1586
1587                 ret.v = bkeyp_val(&l->b->format, _k);
1588
1589                 if (bch2_debug_check_iterators) {
1590                         struct bkey k = bkey_unpack_key(l->b, _k);
1591
1592                         BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
1593                 }
1594
1595                 if (bch2_debug_check_bkeys)
1596                         bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1597         }
1598
1599         return ret;
1600 }
1601
1602 /**
1603  * bch2_btree_iter_peek - returns first key greater than or equal to iterator's
1604  * current position
1605  */
1606 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1607 {
1608         struct btree_iter_level *l = &iter->l[0];
1609         struct bkey_s_c k;
1610         int ret;
1611
1612         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1613         bch2_btree_iter_checks(iter);
1614
1615         if (iter->uptodate == BTREE_ITER_UPTODATE &&
1616             !bkey_deleted(&iter->k))
1617                 return btree_iter_peek_uptodate(iter);
1618
1619         while (1) {
1620                 ret = bch2_btree_iter_traverse(iter);
1621                 if (unlikely(ret))
1622                         return bkey_s_c_err(ret);
1623
1624                 k = __btree_iter_peek(iter, l);
1625                 if (likely(k.k))
1626                         break;
1627
1628                 if (!btree_iter_set_pos_to_next_leaf(iter))
1629                         return bkey_s_c_null;
1630         }
1631
1632         /*
1633          * iter->pos should always be equal to the key we just
1634          * returned - except extents can straddle iter->pos:
1635          */
1636         if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1637                 iter->pos = bkey_start_pos(k.k);
1638
1639         iter->uptodate = BTREE_ITER_UPTODATE;
1640
1641         bch2_btree_iter_verify_level(iter, 0);
1642         return k;
1643 }
1644
1645 /**
1646  * bch2_btree_iter_next - returns first key greater than iterator's current
1647  * position
1648  */
1649 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1650 {
1651         if (!bch2_btree_iter_advance_pos(iter))
1652                 return bkey_s_c_null;
1653
1654         return bch2_btree_iter_peek(iter);
1655 }
1656
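/*
 * Editor's note: an illustrative sketch, not part of the original file,
 * showing the peek()/next() pattern above inside a full transaction.
 * It assumes the bch2_trans_get_iter() and bkey_err() helpers from the
 * headers, and uses BTREE_ID_EXTENTS purely as an example btree.
 */
static void __maybe_unused example_walk_btree(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0);

        /* peek() returns the first key >= pos, next() advances past it: */
        for (k = bch2_btree_iter_peek(iter);
             k.k && !bkey_err(k);
             k = bch2_btree_iter_next(iter)) {
                /* inspect k.k->p and k.v here */
        }

        bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
}
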
1657 static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
1658 {
1659         struct bpos pos = btree_iter_search_key(iter);
1660         struct btree_trans *trans = iter->trans;
1661         struct btree_insert_entry *i;
1662
1663         trans_for_each_update2(trans, i)
1664                 if ((cmp_int(iter->btree_id,    i->iter->btree_id) ?:
1665                      bkey_cmp(pos,              i->k->k.p)) <= 0)
1666                         break;
1667
1668         return i < trans->updates2 + trans->nr_updates2 &&
1669                 iter->btree_id == i->iter->btree_id
1670                 ? bkey_i_to_s_c(i->k)
1671                 : bkey_s_c_null;
1672 }
1673
1674 static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
1675 {
1676         struct btree_iter_level *l = &iter->l[0];
1677         struct bkey_s_c k = __btree_iter_peek(iter, l);
1678         struct bkey_s_c u = __btree_trans_updates_peek(iter);
1679
1680         if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0))
1681                 return k;
1682         if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) {
1683                 iter->k = *u.k;
1684                 return u;
1685         }
1686         return bkey_s_c_null;
1687 }
1688
1689 struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
1690 {
1691         struct bkey_s_c k;
1692         int ret;
1693
1694         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1695         bch2_btree_iter_checks(iter);
1696
1697         while (1) {
1698                 ret = bch2_btree_iter_traverse(iter);
1699                 if (unlikely(ret))
1700                         return bkey_s_c_err(ret);
1701
1702                 k = __bch2_btree_iter_peek_with_updates(iter);
1703
1704                 if (k.k && bkey_deleted(k.k)) {
1705                         bch2_btree_iter_advance_pos(iter);
1706                         continue;
1707                 }
1708
1709                 if (likely(k.k))
1710                         break;
1711
1712                 if (!btree_iter_set_pos_to_next_leaf(iter))
1713                         return bkey_s_c_null;
1714         }
1715
1716         /*
1717          * iter->pos should always be equal to the key we just
1718          * returned - except extents can straddle iter->pos:
1719          */
1720         if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1721                 iter->pos = bkey_start_pos(k.k);
1722
1723         iter->uptodate = BTREE_ITER_UPTODATE;
1724         return k;
1725 }
1726
1727 struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
1728 {
1729         if (!bch2_btree_iter_advance_pos(iter))
1730                 return bkey_s_c_null;
1731
1732         return bch2_btree_iter_peek_with_updates(iter);
1733 }
1734
1735 /**
1736  * bch2_btree_iter_peek_prev - returns first key less than or equal to
1737  * iterator's current position
1738  */
1739 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
1740 {
1741         struct bpos pos = iter->pos;
1742         struct btree_iter_level *l = &iter->l[0];
1743         struct bkey_s_c k;
1744         int ret;
1745
1746         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1747         bch2_btree_iter_checks(iter);
1748
1749         if (iter->uptodate == BTREE_ITER_UPTODATE &&
1750             !bkey_deleted(&iter->k))
1751                 return btree_iter_peek_uptodate(iter);
1752
1753         while (1) {
1754                 ret = bch2_btree_iter_traverse(iter);
1755                 if (unlikely(ret))
1756                         return bkey_s_c_err(ret);
1757
1758                 k = __btree_iter_peek(iter, l);
1759                 if (!k.k ||
1760                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
1761                      ? bkey_cmp(bkey_start_pos(k.k), pos) >= 0
1762                      : bkey_cmp(bkey_start_pos(k.k), pos) > 0))
1763                         k = __btree_iter_prev(iter, l);
1764
1765                 if (likely(k.k))
1766                         break;
1767
1768                 if (!btree_iter_set_pos_to_prev_leaf(iter))
1769                         return bkey_s_c_null;
1770         }
1771
1772         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
1773
1774         /* Extents can straddle iter->pos: */
1775         if (bkey_cmp(k.k->p, pos) < 0)
1776                 iter->pos = k.k->p;
1777         iter->uptodate  = BTREE_ITER_UPTODATE;
1778
1779         bch2_btree_iter_verify_level(iter, 0);
1780         return k;
1781 }
1782
1783 /**
1784  * bch2_btree_iter_prev - returns first key less than iterator's current
1785  * position
1786  */
1787 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1788 {
1789         if (!bch2_btree_iter_rewind_pos(iter))
1790                 return bkey_s_c_null;
1791
1792         return bch2_btree_iter_peek_prev(iter);
1793 }
1794
1795 static inline struct bkey_s_c
1796 __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1797 {
1798         struct btree_iter_level *l = &iter->l[0];
1799         struct btree_node_iter node_iter;
1800         struct bkey_s_c k;
1801         struct bkey n;
1802         int ret;
1803
1804         /* keys & holes can't span inode numbers: */
1805         if (iter->pos.offset == KEY_OFFSET_MAX) {
1806                 if (iter->pos.inode == KEY_INODE_MAX)
1807                         return bkey_s_c_null;
1808
1809                 bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
1810
1811                 ret = bch2_btree_iter_traverse(iter);
1812                 if (unlikely(ret))
1813                         return bkey_s_c_err(ret);
1814         }
1815
1816         /*
1817          * iterator is now at the correct position for inserting at iter->pos,
1818          * but we need to keep iterating until we find the first non-whiteout so
1819          * we know how big a hole we have, if any:
1820          */
1821
1822         node_iter = l->iter;
1823         k = __btree_iter_unpack(iter, l, &iter->k,
1824                 bch2_btree_node_iter_peek(&node_iter, l->b));
1825
1826         if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
1827                 /*
1828                  * We're not setting iter->uptodate because the node iterator
1829                  * doesn't necessarily point at the key we're returning:
1830                  */
1831
1832                 EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
1833                 bch2_btree_iter_verify_level(iter, 0);
1834                 return k;
1835         }
1836
1837         /* hole */
1838
1839         if (!k.k)
1840                 k.k = &l->b->key.k;
1841
1842         bkey_init(&n);
1843         n.p = iter->pos;
1844         bch2_key_resize(&n,
1845                         min_t(u64, KEY_SIZE_MAX,
1846                               (k.k->p.inode == n.p.inode
1847                                ? bkey_start_offset(k.k)
1848                                : KEY_OFFSET_MAX) -
1849                               n.p.offset));
1850
1851         EBUG_ON(!n.size);
1852
1853         iter->k = n;
1854         iter->uptodate = BTREE_ITER_UPTODATE;
1855
1856         bch2_btree_iter_verify_level(iter, 0);
1857         return (struct bkey_s_c) { &iter->k, NULL };
1858 }
1859
1860 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1861 {
1862         struct btree_iter_level *l = &iter->l[0];
1863         struct bkey_s_c k;
1864         int ret;
1865
1866         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1867         bch2_btree_iter_checks(iter);
1868
1869         if (iter->uptodate == BTREE_ITER_UPTODATE)
1870                 return btree_iter_peek_uptodate(iter);
1871
1872         ret = bch2_btree_iter_traverse(iter);
1873         if (unlikely(ret))
1874                 return bkey_s_c_err(ret);
1875
1876         if (iter->flags & BTREE_ITER_IS_EXTENTS)
1877                 return __bch2_btree_iter_peek_slot_extents(iter);
1878
1879         k = __btree_iter_peek_all(iter, l, &iter->k);
1880
1881         EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
1882
1883         if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
1884                 /* hole */
1885                 bkey_init(&iter->k);
1886                 iter->k.p = iter->pos;
1887                 k = (struct bkey_s_c) { &iter->k, NULL };
1888         }
1889
1890         iter->uptodate = BTREE_ITER_UPTODATE;
1891         bch2_btree_iter_verify_level(iter, 0);
1892         return k;
1893 }
1894
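/*
 * Editor's note: an illustrative sketch, not part of the original file.
 * Unlike peek(), peek_slot() always returns something at iter->pos -
 * either the key stored there or a synthesized hole (a deleted key,
 * sized for extents) - so callers can distinguish the two cases with
 * bkey_deleted(). bch2_trans_get_iter() is assumed from the header.
 */
static bool __maybe_unused example_pos_is_hole(struct btree_trans *trans,
                                               enum btree_id id,
                                               struct bpos pos)
{
        struct btree_iter *iter;
        struct bkey_s_c k;
        bool hole;

        iter = bch2_trans_get_iter(trans, id, pos, 0);
        k = bch2_btree_iter_peek_slot(iter);
        hole = !bkey_err(k) && bkey_deleted(k.k);
        bch2_trans_iter_put(trans, iter);

        return hole;
}
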
1895 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1896 {
1897         if (!bch2_btree_iter_advance_pos(iter))
1898                 return bkey_s_c_null;
1899
1900         return bch2_btree_iter_peek_slot(iter);
1901 }
1902
1903 struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
1904 {
1905         struct bkey_cached *ck;
1906         int ret;
1907
1908         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
1909         bch2_btree_iter_checks(iter);
1910
1911         ret = bch2_btree_iter_traverse(iter);
1912         if (unlikely(ret))
1913                 return bkey_s_c_err(ret);
1914
1915         ck = (void *) iter->l[0].b;
1916
1917         EBUG_ON(iter->btree_id != ck->key.btree_id ||
1918                 bkey_cmp(iter->pos, ck->key.pos));
1919         BUG_ON(!ck->valid);
1920
1921         return bkey_i_to_s_c(ck->k);
1922 }
1923
1924 static inline void bch2_btree_iter_init(struct btree_trans *trans,
1925                         struct btree_iter *iter, enum btree_id btree_id,
1926                         struct bpos pos, unsigned flags)
1927 {
1928         struct bch_fs *c = trans->c;
1929         unsigned i;
1930
1931         if (btree_node_type_is_extents(btree_id) &&
1932             !(flags & BTREE_ITER_NODES))
1933                 flags |= BTREE_ITER_IS_EXTENTS;
1934
1935         iter->trans                     = trans;
1936         iter->pos                       = pos;
1937         bkey_init(&iter->k);
1938         iter->k.p                       = pos;
1939         iter->flags                     = flags;
1940         iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
1941         iter->btree_id                  = btree_id;
1942         iter->level                     = 0;
1943         iter->min_depth                 = 0;
1944         iter->locks_want                = flags & BTREE_ITER_INTENT ? 1 : 0;
1945         iter->nodes_locked              = 0;
1946         iter->nodes_intent_locked       = 0;
1947         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1948                 iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
1949
1950         prefetch(c->btree_roots[btree_id].b);
1951 }
1952
1953 /* new transactional stuff: */
1954
1955 static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1956                                           unsigned idx)
1957 {
1958         __bch2_btree_iter_unlock(&trans->iters[idx]);
1959         trans->iters_linked             &= ~(1ULL << idx);
1960         trans->iters_live               &= ~(1ULL << idx);
1961         trans->iters_touched            &= ~(1ULL << idx);
1962 }
1963
1964 int bch2_trans_iter_put(struct btree_trans *trans,
1965                         struct btree_iter *iter)
1966 {
1967         int ret;
1968
1969         if (IS_ERR_OR_NULL(iter))
1970                 return 0;
1971
1972         BUG_ON(trans->iters + iter->idx != iter);
1973
1974         ret = btree_iter_err(iter);
1975
1976         if (!(trans->iters_touched & (1ULL << iter->idx)) &&
1977             !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
1978                 __bch2_trans_iter_free(trans, iter->idx);
1979
1980         trans->iters_live       &= ~(1ULL << iter->idx);
1981         return ret;
1982 }
1983
1984 int bch2_trans_iter_free(struct btree_trans *trans,
1985                          struct btree_iter *iter)
1986 {
1987         if (IS_ERR_OR_NULL(iter))
1988                 return 0;
1989
1990         trans->iters_touched &= ~(1ULL << iter->idx);
1991
1992         return bch2_trans_iter_put(trans, iter);
1993 }
1994
1995 noinline __cold
1996 static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
1997 {
1998
1999         struct btree_iter *iter;
2000         struct btree_insert_entry *i;
2001
2002         trans_for_each_iter(trans, iter)
2003                 printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n",
2004                        bch2_btree_ids[iter->btree_id],
2005                        iter->pos.inode,
2006                        iter->pos.offset,
2007                        (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
2008                        (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
2009                        iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
2010                        (void *) iter->ip_allocated);
2011
2012         trans_for_each_update(trans, i) {
2013                 char buf[300];
2014
2015                 bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
2016                 printk(KERN_ERR "update: btree %s %s\n",
2017                        bch2_btree_ids[i->iter->btree_id], buf);
2018         }
2019         panic("trans iter overflow\n");
2020 }
2021
2022 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
2023 {
2024         unsigned idx;
2025
2026         if (unlikely(trans->iters_linked ==
2027                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
2028                 btree_trans_iter_alloc_fail(trans);
2029
2030         idx = __ffs64(~trans->iters_linked);
2031
2032         trans->iters_linked     |= 1ULL << idx;
2033         trans->iters[idx].idx    = idx;
2034         trans->iters[idx].flags  = 0;
2035         return &trans->iters[idx];
2036 }
2037
2038 static inline void btree_iter_copy(struct btree_iter *dst,
2039                                    struct btree_iter *src)
2040 {
2041         unsigned i, idx = dst->idx;
2042
2043         *dst = *src;
2044         dst->idx = idx;
2045         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2046
2047         for (i = 0; i < BTREE_MAX_DEPTH; i++)
2048                 if (btree_node_locked(dst, i))
2049                         six_lock_increment(&dst->l[i].b->c.lock,
2050                                            __btree_lock_want(dst, i));
2051
2052         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2053         dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
2054 }
2055
2056 static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
2057 {
2058         if (bkey_cmp(l, r) > 0)
2059                 swap(l, r);
2060
2061         return POS(r.inode - l.inode, r.offset - l.offset);
2062 }
2063
2064 static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
2065                                                  unsigned btree_id, struct bpos pos,
2066                                                  unsigned flags)
2067 {
2068         struct btree_iter *iter, *best = NULL;
2069
2070         trans_for_each_iter(trans, iter) {
2071                 if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
2072                         continue;
2073
2074                 if (iter->btree_id != btree_id)
2075                         continue;
2076
2077                 if (best &&
2078                     bkey_cmp(bpos_diff(best->pos, pos),
2079                              bpos_diff(iter->pos, pos)) < 0)
2080                         continue;
2081
2082                 best = iter;
2083         }
2084
2085         if (!best) {
2086                 iter = btree_trans_iter_alloc(trans);
2087                 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
2088         } else if ((trans->iters_live & (1ULL << best->idx)) ||
2089                    (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
2090                 iter = btree_trans_iter_alloc(trans);
2091                 btree_iter_copy(iter, best);
2092         } else {
2093                 iter = best;
2094         }
2095
2096         iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2097         iter->flags &= ~BTREE_ITER_USER_FLAGS;
2098         iter->flags |= flags & BTREE_ITER_USER_FLAGS;
2099
2100         if (iter->flags & BTREE_ITER_INTENT) {
2101                 if (!iter->locks_want) {
2102                         __bch2_btree_iter_unlock(iter);
2103                         iter->locks_want = 1;
2104                 }
2105         } else
2106                 bch2_btree_iter_downgrade(iter);
2107
2108         BUG_ON(iter->btree_id != btree_id);
2109         BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
2110         BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
2111         BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT);
2112         BUG_ON(trans->iters_live & (1ULL << iter->idx));
2113
2114         trans->iters_live       |= 1ULL << iter->idx;
2115         trans->iters_touched    |= 1ULL << iter->idx;
2116
2117         return iter;
2118 }
2119
2120 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
2121                                          enum btree_id btree_id,
2122                                          struct bpos pos, unsigned flags)
2123 {
2124         struct btree_iter *iter =
2125                 __btree_trans_get_iter(trans, btree_id, pos, flags);
2126
2127         __bch2_btree_iter_set_pos(iter, pos,
2128                 btree_node_type_is_extents(btree_id));
2129         return iter;
2130 }
2131
2132 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
2133                                             enum btree_id btree_id,
2134                                             struct bpos pos,
2135                                             unsigned locks_want,
2136                                             unsigned depth,
2137                                             unsigned flags)
2138 {
2139         struct btree_iter *iter =
2140                 __btree_trans_get_iter(trans, btree_id, pos,
2141                                        flags|BTREE_ITER_NODES);
2142         unsigned i;
2143
2144         BUG_ON(bkey_cmp(iter->pos, pos));
2145
2146         iter->locks_want = locks_want;
2147         iter->level     = depth;
2148         iter->min_depth = depth;
2149
2150         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
2151                 iter->l[i].b            = NULL;
2152         iter->l[iter->level].b          = BTREE_ITER_NO_NODE_INIT;
2153
2154         return iter;
2155 }
2156
2157 struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
2158                                         struct btree_iter *src)
2159 {
2160         struct btree_iter *iter;
2161
2162         iter = btree_trans_iter_alloc(trans);
2163         btree_iter_copy(iter, src);
2164
2165         trans->iters_live |= 1ULL << iter->idx;
2166         /*
2167          * We don't need to preserve this iter since it's cheap to copy it
2168          * again - this will cause trans_iter_put() to free it right away:
2169          */
2170         trans->iters_touched &= ~(1ULL << iter->idx);
2171
2172         return iter;
2173 }
2174
2175 static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
2176 {
2177         if (size > trans->mem_bytes) {
2178                 size_t old_bytes = trans->mem_bytes;
2179                 size_t new_bytes = roundup_pow_of_two(size);
2180                 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2181
2182                 if (!new_mem)
2183                         return -ENOMEM;
2184
2185                 trans->mem = new_mem;
2186                 trans->mem_bytes = new_bytes;
2187
2188                 if (old_bytes) {
2189                         trace_trans_restart_mem_realloced(trans->ip, new_bytes);
2190                         return -EINTR;
2191                 }
2192         }
2193
2194         return 0;
2195 }
2196
2197 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2198 {
2199         void *p;
2200         int ret;
2201
2202         ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
2203         if (ret)
2204                 return ERR_PTR(ret);
2205
2206         p = trans->mem + trans->mem_top;
2207         trans->mem_top += size;
2208         return p;
2209 }
2210
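/*
 * Editor's note: an illustrative sketch, not part of the original file.
 * Memory from bch2_trans_kmalloc() lives until the transaction is reset;
 * if bch2_trans_preload_mem() above had to grow the buffer mid
 * transaction this returns ERR_PTR(-EINTR) and the caller is expected
 * to restart. Allocating a bkey_i is just an example use.
 */
static int __maybe_unused example_alloc_key(struct btree_trans *trans,
                                            struct bpos pos,
                                            struct bkey_i **k_out)
{
        struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));

        if (IS_ERR(k))
                return PTR_ERR(k);      /* -EINTR here means "restart me" */

        bkey_init(&k->k);
        k->k.p  = pos;
        *k_out  = k;
        return 0;
}
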
2211 inline void bch2_trans_unlink_iters(struct btree_trans *trans)
2212 {
2213         u64 iters = trans->iters_linked &
2214                 ~trans->iters_touched &
2215                 ~trans->iters_live;
2216
2217         while (iters) {
2218                 unsigned idx = __ffs64(iters);
2219
2220                 iters &= ~(1ULL << idx);
2221                 __bch2_trans_iter_free(trans, idx);
2222         }
2223 }
2224
2225 void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
2226 {
2227         struct btree_iter *iter;
2228
2229         trans_for_each_iter(trans, iter)
2230                 iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
2231                                  BTREE_ITER_SET_POS_AFTER_COMMIT);
2232
2233         bch2_trans_unlink_iters(trans);
2234
2235         trans->iters_touched &= trans->iters_live;
2236
2237         trans->nr_updates               = 0;
2238         trans->nr_updates2              = 0;
2239         trans->mem_top                  = 0;
2240
2241         trans->extra_journal_entries    = NULL;
2242         trans->extra_journal_entry_u64s = 0;
2243
2244         if (trans->fs_usage_deltas) {
2245                 trans->fs_usage_deltas->used = 0;
2246                 memset(&trans->fs_usage_deltas->memset_start, 0,
2247                        (void *) &trans->fs_usage_deltas->memset_end -
2248                        (void *) &trans->fs_usage_deltas->memset_start);
2249         }
2250
2251         if (!(flags & TRANS_RESET_NOTRAVERSE))
2252                 bch2_btree_iter_traverse_all(trans);
2253 }
2254
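/*
 * Editor's note: an illustrative sketch, not part of the original file,
 * of the restart pattern built on bch2_trans_reset(): operations that
 * had to drop locks or grow the transaction's memory buffer fail with
 * -EINTR, and the caller resets and retries. The @op callback is purely
 * hypothetical.
 */
static int __maybe_unused example_with_restarts(struct btree_trans *trans,
                                                int (*op)(struct btree_trans *))
{
        int ret;

        do {
                bch2_trans_reset(trans, 0);
                ret = op(trans);
        } while (ret == -EINTR);

        return ret;
}
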
2255 static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
2256 {
2257         size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
2258         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2259         void *p = NULL;
2260
2261         BUG_ON(trans->used_mempool);
2262
2263 #ifdef __KERNEL__
2264         p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
2265 #endif
2266         if (!p)
2267                 p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
2268
2269         trans->iters            = p; p += iters_bytes;
2270         trans->updates          = p; p += updates_bytes;
2271         trans->updates2         = p; p += updates_bytes;
2272 }
2273
2274 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
2275                      unsigned expected_nr_iters,
2276                      size_t expected_mem_bytes)
2277 {
2278         memset(trans, 0, sizeof(*trans));
2279         trans->c                = c;
2280         trans->ip               = _RET_IP_;
2281
2282         /*
2283          * reallocating iterators currently completely breaks
2284          * bch2_trans_iter_put(), so we always allocate the max:
2285          */
2286         bch2_trans_alloc_iters(trans, c);
2287
2288         if (expected_mem_bytes) {
2289                 trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
2290                 trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
2291         }
2292
2293         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2294
2295 #ifdef CONFIG_BCACHEFS_DEBUG
2296         trans->pid = current->pid;
2297         mutex_lock(&c->btree_trans_lock);
2298         list_add(&trans->list, &c->btree_trans_list);
2299         mutex_unlock(&c->btree_trans_lock);
2300 #endif
2301 }
2302
2303 int bch2_trans_exit(struct btree_trans *trans)
2304 {
2305         struct bch_fs *c = trans->c;
2306
2307         bch2_trans_unlock(trans);
2308
2309 #ifdef CONFIG_BCACHEFS_DEBUG
2310         mutex_lock(&trans->c->btree_trans_lock);
2311         list_del(&trans->list);
2312         mutex_unlock(&trans->c->btree_trans_lock);
2313 #endif
2314
2315         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2316
2317         bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
2318
2319         kfree(trans->fs_usage_deltas);
2320         kfree(trans->mem);
2321
2322 #ifdef __KERNEL__
2323         /*
2324          * Userspace doesn't have a real percpu implementation:
2325          */
2326         trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
2327 #endif
2328         if (trans->iters)
2329                 mempool_free(trans->iters, &trans->c->btree_iters_pool);
2330
2331         trans->mem      = (void *) 0x1;
2332         trans->iters    = (void *) 0x1;
2333
2334         return trans->error ? -EIO : 0;
2335 }
2336
2337 static void __maybe_unused
2338 bch2_btree_iter_node_to_text(struct printbuf *out,
2339                              struct btree_bkey_cached_common *_b,
2340                              enum btree_iter_type type)
2341 {
2342         pr_buf(out, "    %px l=%u %s:",
2343                _b, _b->level, bch2_btree_ids[_b->btree_id]);
2344         bch2_bpos_to_text(out, btree_node_pos(_b, type));
2345 }
2346
2347 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
2348 {
2349 #ifdef CONFIG_BCACHEFS_DEBUG
2350         struct btree_trans *trans;
2351         struct btree_iter *iter;
2352         struct btree *b;
2353         unsigned l;
2354
2355         mutex_lock(&c->btree_trans_lock);
2356         list_for_each_entry(trans, &c->btree_trans_list, list) {
2357                 pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
2358
2359                 trans_for_each_iter(trans, iter) {
2360                         if (!iter->nodes_locked)
2361                                 continue;
2362
2363                         pr_buf(out, "  iter %u %s:",
2364                                iter->idx,
2365                                bch2_btree_ids[iter->btree_id]);
2366                         bch2_bpos_to_text(out, iter->pos);
2367                         pr_buf(out, "\n");
2368
2369                         for (l = 0; l < BTREE_MAX_DEPTH; l++) {
2370                                 if (btree_node_locked(iter, l)) {
2371                                         pr_buf(out, "    %s l=%u ",
2372                                                btree_node_intent_locked(iter, l) ? "i" : "r", l);
2373                                         bch2_btree_iter_node_to_text(out,
2374                                                         (void *) iter->l[l].b,
2375                                                         btree_iter_type(iter));
2376                                         pr_buf(out, "\n");
2377                                 }
2378                         }
2379                 }
2380
2381                 b = READ_ONCE(trans->locking);
2382                 if (b) {
2383                         pr_buf(out, "  locking iter %u l=%u %s:",
2384                                trans->locking_iter_idx,
2385                                trans->locking_level,
2386                                bch2_btree_ids[trans->locking_btree_id]);
2387                         bch2_bpos_to_text(out, trans->locking_pos);
2388
2389
2390                         pr_buf(out, " node ");
2391                         bch2_btree_iter_node_to_text(out,
2392                                         (void *) b,
2393                                         btree_iter_type(&trans->iters[trans->locking_iter_idx]));
2394                         pr_buf(out, "\n");
2395                 }
2396         }
2397         mutex_unlock(&c->btree_trans_lock);
2398 #endif
2399 }
2400
2401 void bch2_fs_btree_iter_exit(struct bch_fs *c)
2402 {
2403         mempool_exit(&c->btree_iters_pool);
2404         cleanup_srcu_struct(&c->btree_trans_barrier);
2405 }
2406
2407 int bch2_fs_btree_iter_init(struct bch_fs *c)
2408 {
2409         unsigned nr = BTREE_ITER_MAX;
2410
2411         INIT_LIST_HEAD(&c->btree_trans_list);
2412         mutex_init(&c->btree_trans_lock);
2413
2414         return  init_srcu_struct(&c->btree_trans_barrier) ?:
2415                 mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
2416                         sizeof(struct btree_iter) * nr +
2417                         sizeof(struct btree_insert_entry) * nr +
2418                         sizeof(struct btree_insert_entry) * nr);
2419 }