// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_locking.h"
#include "btree_types.h"

struct lock_class_key bch2_btree_node_lock_key;

/* Btree node locking: */

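/*
 * Adjust the reader count on @lock: via the percpu reader counter if the lock
 * has one, otherwise by adding/subtracting read_lock units directly in the
 * lock's atomic state word.
 */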
static inline void six_lock_readers_add(struct six_lock *lock, int nr)
{
        if (lock->readers)
                this_cpu_add(*lock->readers, nr);
        else if (nr > 0)
                atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
        else
                atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
}

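/*
 * Count how many times @trans already has node @b locked at @level, per lock
 * type, across all of its paths except @skip.
 */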
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
                                                  struct btree_path *skip,
                                                  struct btree_bkey_cached_common *b,
                                                  unsigned level)
{
        struct btree_path *path;
        struct six_lock_count ret;

        memset(&ret, 0, sizeof(ret));

        if (IS_ERR_OR_NULL(b))
                return ret;

        trans_for_each_path(trans, path)
                if (path != skip && &path->l[level].b->c == b) {
                        int t = btree_node_locked_type(path, level);

                        if (t != BTREE_NODE_UNLOCKED)
                                ret.n[t]++;
                }

        return ret;
}

/* unlock */

void bch2_btree_node_unlock_write(struct btree_trans *trans,
                        struct btree_path *path, struct btree *b)
{
        bch2_btree_node_unlock_write_inlined(trans, path, b);
}

/* lock */

/*
 * @trans wants to lock @b with type @type
 */
struct trans_waiting_for_lock {
        struct btree_trans              *trans;
        struct btree_bkey_cached_common *node_want;
        enum six_lock_type              lock_want;

        /* for iterating over held locks: */
        u8                              path_idx;
        u8                              level;
        u64                             lock_start_time;
};

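/*
 * Deadlock cycle detection: when a transaction has to block on a node lock,
 * we walk the graph of "transaction X is waiting on a lock held by
 * transaction Y" edges. Each entry in the graph pins its transaction with a
 * closure ref; if the walk ever reaches a transaction already in the graph,
 * we've found a cycle and one of the transactions in it must be aborted.
 */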
struct lock_graph {
        struct trans_waiting_for_lock   g[8];
        unsigned                        nr;
};

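/* Drop the top entry from the graph, releasing its ref on the transaction: */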
static void lock_graph_pop(struct lock_graph *g)
{
        closure_put(&g->g[--g->nr].trans->ref);
}

static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
{
        struct trans_waiting_for_lock *i;

        prt_printf(out, "Found lock cycle (%u entries):", g->nr);
        prt_newline(out);

        for (i = g->g; i < g->g + g->nr; i++)
                bch2_btree_trans_to_text(out, i->trans);
}

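/*
 * Abort the transaction at @i in the cycle: if it's ourselves (the first
 * entry), restart our own transaction; otherwise flag it with
 * lock_must_abort so it aborts itself. Either way, wake the other waiting
 * transactions in the graph so they notice.
 */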
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
        int ret;

        if (i == g->g) {
                trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
                ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
        } else {
                i->trans->lock_must_abort = true;
                ret = 0;
        }

        for (i = g->g + 1; i < g->g + g->nr; i++)
                wake_up_process(i->trans->locking_wait.task);
        return ret;
}

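/*
 * Pick a victim to break the cycle. Prefer, in order: a transaction that
 * isn't waiting for a write lock, then one that's in traverse_all, then any
 * transaction whose lock attempt is allowed to fail. If every entry has
 * lock_may_not_fail set we have no way out, hence the BUG().
 */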
static noinline int break_cycle(struct lock_graph *g)
{
        struct trans_waiting_for_lock *i;

        for (i = g->g; i < g->g + g->nr; i++) {
                if (i->trans->lock_may_not_fail ||
                    i->trans->locking_wait.lock_want == SIX_LOCK_write)
                        continue;

                return abort_lock(g, i);
        }

        for (i = g->g; i < g->g + g->nr; i++) {
                if (i->trans->lock_may_not_fail ||
                    !i->trans->in_traverse_all)
                        continue;

                return abort_lock(g, i);
        }

        for (i = g->g; i < g->g + g->nr; i++) {
                if (i->trans->lock_may_not_fail)
                        continue;

                return abort_lock(g, i);
        }

        BUG();
}

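/*
 * Add @trans to the lock graph. If it's already in the graph we've found a
 * cycle: either print it (when called from debug code with a printbuf) or
 * break it by aborting one of the transactions involved. Entries that turn
 * out not to be blocked anymore are popped, and hitting the size limit of
 * the graph is treated like a deadlock.
 */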
static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
                              struct printbuf *cycle)
{
        struct btree_trans *orig_trans = g->g->trans;
        struct trans_waiting_for_lock *i;
        int ret = 0;

        for (i = g->g; i < g->g + g->nr; i++) {
                if (i->trans->locking != i->node_want)
                        while (g->g + g->nr >= i) {
                                lock_graph_pop(g);
                                return 0;
                        }

                if (i->trans == trans) {
                        if (cycle) {
                                /* Only checking: */
                                print_cycle(cycle, g);
                                ret = -1;
                        } else {
                                ret = break_cycle(g);
                        }

                        if (ret)
                                goto deadlock;
                        /*
                         * If we didn't abort (instead telling another
                         * transaction to abort), keep checking:
                         */
                }
        }

        if (g->nr == ARRAY_SIZE(g->g)) {
                if (orig_trans->lock_may_not_fail)
                        return 0;

                trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
                ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
                goto deadlock;
        }

        closure_get(&trans->ref);

        g->g[g->nr++] = (struct trans_waiting_for_lock) {
                .trans          = trans,
                .node_want      = trans->locking,
                .lock_want      = trans->locking_wait.lock_want,
        };

        return 0;
deadlock:
        while (g->nr)
                lock_graph_pop(g);
        return ret;
}

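/*
 * Called when we notice the wait graph has gone stale under us - i.e. a
 * transaction in it is no longer waiting on the lock we recorded it waiting
 * on: pop the stale tail of the graph and continue from what's left.
 */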
static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
{
        struct trans_waiting_for_lock *i;

        for (i = g->g + 1; i < g->g + g->nr; i++)
                if (i->trans->locking != i->node_want ||
                    i->trans->locking_wait.start_time != i[-1].lock_start_time) {
                        while (g->g + g->nr >= i)
                                lock_graph_pop(g);
                        return;
                }
        BUG();
}

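/*
 * read conflicts only with write, intent conflicts with intent and write,
 * write conflicts with everything: with SIX_LOCK_read == 0,
 * SIX_LOCK_intent == 1 and SIX_LOCK_write == 2, that's exactly t1 + t2 > 1.
 */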
static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
{
        return t1 + t2 > 1;
}

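/*
 * Check whether blocking @trans's current lock request would deadlock, by
 * doing a depth first search of the wait-for graph: for the transaction on
 * top of the graph, walk every node lock it holds; for every transaction
 * waiting on one of those locks with a conflicting lock type, descend. If we
 * reach a transaction that's already in the graph, we've found a cycle.
 *
 * With @cycle non-NULL (the debug path) we only report the cycle; otherwise
 * we break it by aborting one of the transactions involved.
 */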
int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
{
        struct lock_graph g;
        struct trans_waiting_for_lock *top;
        struct btree_bkey_cached_common *b;
        struct btree_path *path;
        int ret;

        if (trans->lock_must_abort) {
                trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
                return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
        }

        g.nr = 0;
        ret = lock_graph_descend(&g, trans, cycle);
        BUG_ON(ret);
next:
        if (!g.nr)
                return 0;

        top = &g.g[g.nr - 1];

        trans_for_each_path_from(top->trans, path, top->path_idx) {
                if (!path->nodes_locked)
                        continue;

                if (top->path_idx != path->idx) {
                        top->path_idx           = path->idx;
                        top->level              = 0;
                        top->lock_start_time    = 0;
                }

                for (;
                     top->level < BTREE_MAX_DEPTH;
                     top->level++, top->lock_start_time = 0) {
                        int lock_held = btree_node_locked_type(path, top->level);

                        if (lock_held == BTREE_NODE_UNLOCKED)
                                continue;

                        b = &READ_ONCE(path->l[top->level].b)->c;

                        if (unlikely(IS_ERR_OR_NULL(b))) {
                                lock_graph_remove_non_waiters(&g);
                                goto next;
                        }

                        if (list_empty_careful(&b->lock.wait_list))
                                continue;

                        raw_spin_lock(&b->lock.wait_lock);
                        list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
                                BUG_ON(b != trans->locking);

                                if (top->lock_start_time &&
                                    time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
                                        continue;

                                top->lock_start_time = trans->locking_wait.start_time;

                                /* Don't check for self deadlock: */
                                if (trans == top->trans ||
                                    !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
                                        continue;

                                ret = lock_graph_descend(&g, trans, cycle);
                                raw_spin_unlock(&b->lock.wait_lock);

                                if (ret)
                                        return ret < 0 ? ret : 0;
                                goto next;

                        }
                        raw_spin_unlock(&b->lock.wait_lock);
                }
        }

        lock_graph_pop(&g);
        goto next;
}

int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
{
        struct btree_trans *trans = p;

        return bch2_check_for_deadlock(trans, NULL);
}

int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
                                 struct btree_bkey_cached_common *b,
                                 bool lock_may_not_fail)
{
        int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
        int ret;

        /*
         * Must drop our read locks before calling six_lock_write() -
         * six_unlock() won't do wakeups until the reader count
         * goes to 0, and it's safe because we have the node intent
         * locked:
         */
        six_lock_readers_add(&b->lock, -readers);
        ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
        six_lock_readers_add(&b->lock, readers);

        if (ret)
                mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);

        return ret;
}

/* relock */

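/*
 * Try to take (or upgrade to) the locks this path wants, from path->level up
 * to path->locks_want. If any level fails, drop all of this path's locks,
 * mark it as needing traverse, and poison the failed level and everything
 * below it with an error pointer so that child nodes can't be relocked.
 */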
static inline bool btree_path_get_locks(struct btree_trans *trans,
                                        struct btree_path *path,
                                        bool upgrade)
{
        unsigned l = path->level;
        int fail_idx = -1;

        do {
                if (!btree_path_node(path, l))
                        break;

                if (!(upgrade
                      ? bch2_btree_node_upgrade(trans, path, l)
                      : bch2_btree_node_relock(trans, path, l)))
                        fail_idx = l;

                l++;
        } while (l < path->locks_want);

        /*
         * When we fail to get a lock, we have to ensure that any child nodes
         * can't be relocked so bch2_btree_path_traverse has to walk back up to
         * the node that we failed to relock:
         */
        if (fail_idx >= 0) {
                __bch2_btree_path_unlock(trans, path);
                btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);

                do {
                        path->l[fail_idx].b = upgrade
                                ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
                                : ERR_PTR(-BCH_ERR_no_btree_node_relock);
                        --fail_idx;
                } while (fail_idx >= 0);
        }

        if (path->uptodate == BTREE_ITER_NEED_RELOCK)
                path->uptodate = BTREE_ITER_UPTODATE;

        bch2_trans_verify_locks(trans);

        return path->uptodate < BTREE_ITER_NEED_RELOCK;
}

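/*
 * Relock a node we previously held locked, either by retaking the six lock
 * at the sequence number we remembered, or by bumping the count on a lock
 * this transaction already holds on the same node via another path.
 */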
bool __bch2_btree_node_relock(struct btree_trans *trans,
                              struct btree_path *path, unsigned level,
                              bool trace)
{
        struct btree *b = btree_path_node(path, level);
        int want = __btree_lock_want(path, level);

        if (race_fault())
                goto fail;

        if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
            (btree_node_lock_seq_matches(path, b, level) &&
             btree_node_lock_increment(trans, &b->c, level, want))) {
                mark_btree_node_locked(trans, path, level, want);
                return true;
        }
fail:
        if (trace)
                trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
        return false;
}

/* upgrade */

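/*
 * Upgrade the lock at @level to an intent lock: depending on what we
 * currently hold, this is done by a tryupgrade of the six lock (temporarily
 * dropping read counts held via this transaction's other paths so the
 * upgrade can't be blocked by our own readers), by relocking at the
 * remembered sequence number, or by taking another reference on an intent
 * lock we already hold via another path.
 */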
bool bch2_btree_node_upgrade(struct btree_trans *trans,
                             struct btree_path *path, unsigned level)
{
        struct btree *b = path->l[level].b;
        struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);

        if (!is_btree_node(path, level))
                return false;

        switch (btree_lock_want(path, level)) {
        case BTREE_NODE_UNLOCKED:
                BUG_ON(btree_node_locked(path, level));
                return true;
        case BTREE_NODE_READ_LOCKED:
                BUG_ON(btree_node_intent_locked(path, level));
                return bch2_btree_node_relock(trans, path, level);
        case BTREE_NODE_INTENT_LOCKED:
                break;
        case BTREE_NODE_WRITE_LOCKED:
                BUG();
        }

        if (btree_node_intent_locked(path, level))
                return true;

        if (race_fault())
                return false;

        if (btree_node_locked(path, level)) {
                bool ret;

                six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
                ret = six_lock_tryupgrade(&b->c.lock);
                six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);

                if (ret)
                        goto success;
        } else {
                if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
                        goto success;
        }

        /*
         * Do we already have an intent lock via another path? If so, just bump
         * lock count:
         */
        if (btree_node_lock_seq_matches(path, b, level) &&
            btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
                btree_node_unlock(trans, path, level);
                goto success;
        }

        trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
        return false;
success:
        mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
        return true;
}

/* Btree path locking: */

/*
 * Only for btree_cache.c - only relocks intent locks
 */
int bch2_btree_path_relock_intent(struct btree_trans *trans,
                                  struct btree_path *path)
{
        unsigned l;

        for (l = path->level;
             l < path->locks_want && btree_path_node(path, l);
             l++) {
                if (!bch2_btree_node_relock(trans, path, l)) {
                        __bch2_btree_path_unlock(trans, path);
                        btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
                        trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
                        return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
                }
        }

        return 0;
}

__flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
{
        return btree_path_get_locks(trans, path, false);
}

__flatten
bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
{
        return btree_path_get_locks(trans, path, true);
}

bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
                               struct btree_path *path,
                               unsigned new_locks_want)
{
        EBUG_ON(path->locks_want >= new_locks_want);

        path->locks_want = new_locks_want;

        return btree_path_get_locks(trans, path, true);
}

bool __bch2_btree_path_upgrade(struct btree_trans *trans,
                               struct btree_path *path,
                               unsigned new_locks_want)
{
        struct btree_path *linked;

        if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
                return true;

        /*
         * XXX: this is ugly - we'd prefer to not be mucking with other
         * iterators in the btree_trans here.
         *
         * On failure to upgrade the iterator, setting iter->locks_want and
         * calling get_locks() is sufficient to make bch2_btree_path_traverse()
         * get the locks we want on transaction restart.
         *
         * But if this iterator was a clone, on transaction restart what we did
         * to this iterator isn't going to be preserved.
         *
         * Possibly we could add an iterator field for the parent iterator when
         * an iterator is a copy - for now, we'll just upgrade any other
         * iterators with the same btree id.
         *
         * The code below used to be needed to ensure ancestor nodes get locked
         * before interior nodes - now that's handled by
         * bch2_btree_path_traverse_all().
         */
        if (!path->cached && !trans->in_traverse_all)
                trans_for_each_path(trans, linked)
                        if (linked != path &&
                            linked->cached == path->cached &&
                            linked->btree_id == path->btree_id &&
                            linked->locks_want < new_locks_want) {
                                linked->locks_want = new_locks_want;
                                btree_path_get_locks(trans, linked, true);
                        }

        return false;
}

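/*
 * Drop locks at or above @new_locks_want: levels above path->level are
 * unlocked entirely, and an intent lock at path->level itself is downgraded
 * to a read lock.
 */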
void __bch2_btree_path_downgrade(struct btree_trans *trans,
                                 struct btree_path *path,
                                 unsigned new_locks_want)
{
        unsigned l;

        EBUG_ON(path->locks_want < new_locks_want);

        path->locks_want = new_locks_want;

        while (path->nodes_locked &&
               (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
                if (l > path->level) {
                        btree_node_unlock(trans, path, l);
                } else {
                        if (btree_node_intent_locked(path, l)) {
                                six_lock_downgrade(&path->l[l].b->c.lock);
                                mark_btree_node_locked_noreset(path, l, SIX_LOCK_read);
                        }
                        break;
                }
        }

        bch2_btree_path_verify_locks(path);
}

/* Btree transaction locking: */

void bch2_trans_downgrade(struct btree_trans *trans)
{
        struct btree_path *path;

        trans_for_each_path(trans, path)
                bch2_btree_path_downgrade(trans, path);
}

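/*
 * Relock every path that's marked should_be_locked; if any of them can't be
 * relocked, the transaction has to be restarted.
 */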
int bch2_trans_relock(struct btree_trans *trans)
{
        struct btree_path *path;

        if (unlikely(trans->restarted))
                return -((int) trans->restarted);

        trans_for_each_path(trans, path)
                if (path->should_be_locked &&
                    !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
                        trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
                        return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
                }
        return 0;
}

void bch2_trans_unlock(struct btree_trans *trans)
{
        struct btree_path *path;

        trans_for_each_path(trans, path)
                __bch2_btree_path_unlock(trans, path);

        /*
         * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
         * btree nodes, it implements its own walking:
         */
        BUG_ON(!trans->is_initial_gc &&
               lock_class_is_held(&bch2_btree_node_lock_key));
}

/* Debug */

#ifdef CONFIG_BCACHEFS_DEBUG

void bch2_btree_path_verify_locks(struct btree_path *path)
{
        unsigned l;

        if (!path->nodes_locked) {
                BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
                       btree_path_node(path, path->level));
                return;
        }

        for (l = 0; l < BTREE_MAX_DEPTH; l++) {
                int want = btree_lock_want(path, l);
                int have = btree_node_locked_type(path, l);

                BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);

                BUG_ON(is_btree_node(path, l) &&
                       (want == BTREE_NODE_UNLOCKED ||
                        have != BTREE_NODE_WRITE_LOCKED) &&
                       want != have);
        }
}

void bch2_trans_verify_locks(struct btree_trans *trans)
{
        struct btree_path *path;

        trans_for_each_path(trans, path)
                bch2_btree_path_verify_locks(path);
}

#endif