]> git.sesse.net Git - bcachefs-tools-debian/blob - linux/six.c
532800443398ccef305ada1287bc781116b2aa47
[bcachefs-tools-debian] / linux / six.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/export.h>
4 #include <linux/log2.h>
5 #include <linux/percpu.h>
6 #include <linux/preempt.h>
7 #include <linux/rcupdate.h>
8 #include <linux/sched.h>
9 #include <linux/sched/rt.h>
10 #include <linux/six.h>
11
/*
 * EBUG_ON() - assertion that is only active in DEBUG builds.
 *
 * With DEBUG defined it forwards to BUG_ON(); otherwise it expands to an
 * empty statement and @cond is never evaluated.
 */
#ifndef DEBUG
#define EBUG_ON(cond)		do {} while (0)
#else
#define EBUG_ON(cond)		BUG_ON(cond)
#endif
17
/*
 * Lockdep annotations. Both forward to the lockdep entry points and pass the
 * caller's return address (_RET_IP_). The second argument of six_acquire()
 * becomes the third argument of lock_acquire(): callers in this file pass 1
 * from the trylock/relock paths and 0 from the blocking lock path.
 */
#define six_acquire(map, try)	lock_acquire(map, 0, try, 0, 0, NULL, _RET_IP_)
#define six_release(map)	lock_release(map, _RET_IP_)
20
21 struct six_lock_vals {
22         /* Value we add to the lock in order to take the lock: */
23         u64                     lock_val;
24
25         /* If the lock has this value (used as a mask), taking the lock fails: */
26         u64                     lock_fail;
27
28         /* Value we add to the lock in order to release the lock: */
29         u64                     unlock_val;
30
31         /* Mask that indicates lock is held for this type: */
32         u64                     held_mask;
33
34         /* Waitlist we wakeup when releasing the lock: */
35         enum six_lock_type      unlock_wakeup;
36 };
37
/* State bits that mean "held" for each lock type: */
#define __SIX_LOCK_HELD_read	__SIX_VAL(read_lock, ~0)
#define __SIX_LOCK_HELD_intent	__SIX_VAL(intent_lock, ~0)
#define __SIX_LOCK_HELD_write	__SIX_VAL(seq, 1)

/*
 * Initializer for an array of struct six_lock_vals indexed by lock type.
 *
 * - read:   blocked by a held write lock or by a writer that has announced
 *           itself via write_locking; releasing wakes the write waiter.
 * - intent: blocked by any held intent lock; releasing wakes intent waiters.
 * - write:  blocked by held read locks; both taking and releasing add one to
 *           seq, and the low bit of seq is the "write held" mask; releasing
 *           wakes read waiters.
 */
#define LOCK_VALS {							\
	[SIX_LOCK_read] = {						\
		.lock_val	= __SIX_VAL(read_lock, 1),		\
		.unlock_val	= -__SIX_VAL(read_lock, 1),		\
		.lock_fail	= __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
		.held_mask	= __SIX_LOCK_HELD_read,			\
		.unlock_wakeup	= SIX_LOCK_write,			\
	},								\
	[SIX_LOCK_intent] = {						\
		.lock_val	= __SIX_VAL(intent_lock, 1),		\
		.unlock_val	= -__SIX_VAL(intent_lock, 1),		\
		.lock_fail	= __SIX_LOCK_HELD_intent,		\
		.held_mask	= __SIX_LOCK_HELD_intent,		\
		.unlock_wakeup	= SIX_LOCK_intent,			\
	},								\
	[SIX_LOCK_write] = {						\
		.lock_val	= __SIX_VAL(seq, 1),			\
		.unlock_val	= __SIX_VAL(seq, 1),			\
		.lock_fail	= __SIX_LOCK_HELD_read,			\
		.held_mask	= __SIX_LOCK_HELD_write,		\
		.unlock_wakeup	= SIX_LOCK_read,			\
	},								\
}
65
66 static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
67                                  union six_lock_state old)
68 {
69         if (type != SIX_LOCK_intent)
70                 return;
71
72         if (!old.intent_lock) {
73                 EBUG_ON(lock->owner);
74                 lock->owner = current;
75         } else {
76                 EBUG_ON(lock->owner != current);
77         }
78 }
79
80 static inline unsigned pcpu_read_count(struct six_lock *lock)
81 {
82         unsigned read_count = 0;
83         int cpu;
84
85         for_each_possible_cpu(cpu)
86                 read_count += *per_cpu_ptr(lock->readers, cpu);
87         return read_count;
88 }
89
90 struct six_lock_waiter {
91         struct list_head        list;
92         struct task_struct      *task;
93 };
94
/*
 * Bit number, within the lock state word, of the waiters bit for waitlist
 * @id: build a lock state with only that waiters bit set, read it back
 * through the union's .l member and take ilog2() of the result.
 *
 * (The original author rates this among the more evil things they've done.)
 */
#define waitlist_bitnr(id)						\
	ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
97
98 static inline void six_lock_wakeup(struct six_lock *lock,
99                                    union six_lock_state state,
100                                    unsigned waitlist_id)
101 {
102         if (waitlist_id == SIX_LOCK_write) {
103                 if (state.write_locking && !state.read_lock) {
104                         struct task_struct *p = READ_ONCE(lock->owner);
105                         if (p)
106                                 wake_up_process(p);
107                 }
108         } else {
109                 struct list_head *wait_list = &lock->wait_list[waitlist_id];
110                 struct six_lock_waiter *w, *next;
111
112                 if (!(state.waiters & (1 << waitlist_id)))
113                         return;
114
115                 clear_bit(waitlist_bitnr(waitlist_id),
116                           (unsigned long *) &lock->state.v);
117
118                 raw_spin_lock(&lock->wait_lock);
119
120                 list_for_each_entry_safe(w, next, wait_list, list) {
121                         list_del_init(&w->list);
122
123                         if (wake_up_process(w->task) &&
124                             waitlist_id != SIX_LOCK_read) {
125                                 if (!list_empty(wait_list))
126                                         set_bit(waitlist_bitnr(waitlist_id),
127                                                 (unsigned long *) &lock->state.v);
128                                 break;
129                         }
130                 }
131
132                 raw_spin_unlock(&lock->wait_lock);
133         }
134 }
135
/*
 * do_six_trylock_type() - make one attempt at taking @lock for @type.
 *
 * @try is true when called on behalf of a trylock and false when called from
 * the blocking lock path. It changes what a failed attempt leaves behind:
 *  - read/intent: with @try false, the waiters bit for @type is set in the
 *    lock state before returning failure.
 *  - write with percpu readers: with @try true, write_locking is set here and
 *    cleared again before returning; with @try false the caller is expected
 *    to have set write_locking already, and it stays set on failure.
 *
 * Three implementations are selected by @type and by whether the lock has
 * percpu reader counters (lock->readers): percpu read, percpu write, and a
 * cmpxchg loop on the state word for everything else.
 *
 * Returns true if the lock was taken. Lockdep annotations are left to the
 * callers.
 */
136 static __always_inline bool do_six_trylock_type(struct six_lock *lock,
137                                                 enum six_lock_type type,
138                                                 bool try)
139 {
140         const struct six_lock_vals l[] = LOCK_VALS;
141         union six_lock_state old, new;
142         bool ret;
143         u64 v;
144
            /* DEBUG only: a write lock is taken by the owner, while not already write locked */
145         EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
146         EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
147
            /* DEBUG only: write_locking is already set exactly when coming from the lock path */
148         EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
149
150         /*
151          * Percpu reader mode:
152          *
153          * The basic idea behind this algorithm is that you can implement a lock
154          * between two threads without any atomics, just memory barriers:
155          *
156          * For two threads you'll need two variables, one variable for "thread a
157          * has the lock" and another for "thread b has the lock".
158          *
159          * To take the lock, a thread sets its variable indicating that it holds
160          * the lock, then issues a full memory barrier, then reads from the
161          * other thread's variable to check if the other thread thinks it has
162          * the lock. If we raced, we backoff and retry/sleep.
163          */
164
165         if (type == SIX_LOCK_read && lock->readers) {
166 retry:
167                 preempt_disable();
168                 this_cpu_inc(*lock->readers); /* signal that we own lock */
169
170                 smp_mb();
171
172                 old.v = READ_ONCE(lock->state.v);
173                 ret = !(old.v & l[type].lock_fail);
174
                    /* On failure take our count back out: subtracts 1 only when !ret */
175                 this_cpu_sub(*lock->readers, !ret);
176                 preempt_enable();
177
178                 /*
179                  * If we failed because a writer was trying to take the
180                  * lock, issue a wakeup because we might have caused a
181                  * spurious trylock failure:
182                  */
183                 if (old.write_locking) {
184                         struct task_struct *p = READ_ONCE(lock->owner);
185
186                         if (p)
187                                 wake_up_process(p);
188                 }
189
190                 /*
191                  * If we failed from the lock path and the waiting bit wasn't
192                  * set, set it:
193                  */
194                 if (!try && !ret) {
195                         v = old.v;
196
197                         do {
198                                 new.v = old.v = v;
199
                                    /* The blocker went away in the meantime: try again */
200                                 if (!(old.v & l[type].lock_fail))
201                                         goto retry;
202
203                                 if (new.waiters & (1 << type))
204                                         break;
205
206                                 new.waiters |= 1 << type;
207                         } while ((v = atomic64_cmpxchg(&lock->state.counter,
208                                                        old.v, new.v)) != old.v);
209                 }
210         } else if (type == SIX_LOCK_write && lock->readers) {
                    /*
                     * Trylock path: announce the pending write ourselves. On
                     * the lock path the caller has already set write_locking.
                     */
211                 if (try) {
212                         atomic64_add(__SIX_VAL(write_locking, 1),
213                                      &lock->state.counter);
214                         smp_mb__after_atomic();
215                 }
216
                    /* The write lock is only taken if no CPU counts a reader */
217                 ret = !pcpu_read_count(lock);
218
219                 /*
220                  * On success, we increment lock->seq; also we clear
221                  * write_locking unless we failed from the lock path:
222                  */
223                 v = 0;
224                 if (ret)
225                         v += __SIX_VAL(seq, 1);
226                 if (ret || try)
227                         v -= __SIX_VAL(write_locking, 1);
228
                    /*
                     * Failed trylock: write_locking has just been dropped
                     * again, so run the read waitlist wakeup against the
                     * resulting state.
                     */
229                 if (try && !ret) {
230                         old.v = atomic64_add_return(v, &lock->state.counter);
231                         six_lock_wakeup(lock, old, SIX_LOCK_read);
232                 } else {
233                         atomic64_add(v, &lock->state.counter);
234                 }
235         } else {
                    /* No percpu readers involved: cmpxchg loop on the state word */
236                 v = READ_ONCE(lock->state.v);
237                 do {
238                         new.v = old.v = v;
239
240                         if (!(old.v & l[type].lock_fail)) {
241                                 new.v += l[type].lock_val;
242
243                                 if (type == SIX_LOCK_write)
244                                         new.write_locking = 0;
245                         } else if (!try && type != SIX_LOCK_write &&
246                                    !(new.waiters & (1 << type)))
247                                 new.waiters |= 1 << type;
248                         else
249                                 break; /* waiting bit already set */
250                 } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
251                                         old.v, new.v)) != old.v);
252
                    /* old is the state that the last loop iteration acted on */
253                 ret = !(old.v & l[type].lock_fail);
254         }
255
            /*
             * NOTE(review): in the percpu write path `old` is only assigned
             * on a failed trylock; six_set_owner() returns before looking at
             * it for any type other than intent.
             */
256         if (ret)
257                 six_set_owner(lock, type, old);
258
259         EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
260         EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
261
262         return ret;
263 }
264
265 __always_inline __flatten
266 static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
267 {
268         if (!do_six_trylock_type(lock, type, true))
269                 return false;
270
271         if (type != SIX_LOCK_write)
272                 six_acquire(&lock->dep_map, 1);
273         return true;
274 }
275
276 __always_inline __flatten
277 static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
278                               unsigned seq)
279 {
280         const struct six_lock_vals l[] = LOCK_VALS;
281         union six_lock_state old;
282         u64 v;
283
284         EBUG_ON(type == SIX_LOCK_write);
285
286         if (type == SIX_LOCK_read &&
287             lock->readers) {
288                 bool ret;
289
290                 preempt_disable();
291                 this_cpu_inc(*lock->readers);
292
293                 smp_mb();
294
295                 old.v = READ_ONCE(lock->state.v);
296                 ret = !(old.v & l[type].lock_fail) && old.seq == seq;
297
298                 this_cpu_sub(*lock->readers, !ret);
299                 preempt_enable();
300
301                 /*
302                  * Similar to the lock path, we may have caused a spurious write
303                  * lock fail and need to issue a wakeup:
304                  */
305                 if (old.write_locking) {
306                         struct task_struct *p = READ_ONCE(lock->owner);
307
308                         if (p)
309                                 wake_up_process(p);
310                 }
311
312                 return ret;
313         }
314
315         v = READ_ONCE(lock->state.v);
316         do {
317                 old.v = v;
318
319                 if (old.seq != seq || old.v & l[type].lock_fail)
320                         return false;
321         } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
322                                 old.v,
323                                 old.v + l[type].lock_val)) != old.v);
324
325         six_set_owner(lock, type, old);
326         if (type != SIX_LOCK_write)
327                 six_acquire(&lock->dep_map, 1);
328         return true;
329 }
330
331 #ifdef CONFIG_LOCK_SPIN_ON_OWNER
332
333 static inline int six_can_spin_on_owner(struct six_lock *lock)
334 {
335         struct task_struct *owner;
336         int retval = 1;
337
338         if (need_resched())
339                 return 0;
340
341         rcu_read_lock();
342         owner = READ_ONCE(lock->owner);
343         if (owner)
344                 retval = owner->on_cpu;
345         rcu_read_unlock();
346         /*
347          * if lock->owner is not set, the mutex owner may have just acquired
348          * it and not set the owner yet or the mutex has been released.
349          */
350         return retval;
351 }
352
353 static inline bool six_spin_on_owner(struct six_lock *lock,
354                                      struct task_struct *owner)
355 {
356         bool ret = true;
357
358         rcu_read_lock();
359         while (lock->owner == owner) {
360                 /*
361                  * Ensure we emit the owner->on_cpu, dereference _after_
362                  * checking lock->owner still matches owner. If that fails,
363                  * owner might point to freed memory. If it still matches,
364                  * the rcu_read_lock() ensures the memory stays valid.
365                  */
366                 barrier();
367
368                 if (!owner->on_cpu || need_resched()) {
369                         ret = false;
370                         break;
371                 }
372
373                 cpu_relax();
374         }
375         rcu_read_unlock();
376
377         return ret;
378 }
379
380 static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
381 {
382         struct task_struct *task = current;
383
384         if (type == SIX_LOCK_write)
385                 return false;
386
387         preempt_disable();
388         if (!six_can_spin_on_owner(lock))
389                 goto fail;
390
391         if (!osq_lock(&lock->osq))
392                 goto fail;
393
394         while (1) {
395                 struct task_struct *owner;
396
397                 /*
398                  * If there's an owner, wait for it to either
399                  * release the lock or go to sleep.
400                  */
401                 owner = READ_ONCE(lock->owner);
402                 if (owner && !six_spin_on_owner(lock, owner))
403                         break;
404
405                 if (do_six_trylock_type(lock, type, false)) {
406                         osq_unlock(&lock->osq);
407                         preempt_enable();
408                         return true;
409                 }
410
411                 /*
412                  * When there's no owner, we might have preempted between the
413                  * owner acquiring the lock and setting the owner field. If
414                  * we're an RT task that will live-lock because we won't let
415                  * the owner complete.
416                  */
417                 if (!owner && (need_resched() || rt_task(task)))
418                         break;
419
420                 /*
421                  * The cpu_relax() call is a compiler barrier which forces
422                  * everything in this loop to be re-loaded. We don't need
423                  * memory barriers as we'll eventually observe the right
424                  * values at the cost of a few extra spins.
425                  */
426                 cpu_relax();
427         }
428
429         osq_unlock(&lock->osq);
430 fail:
431         preempt_enable();
432
433         /*
434          * If we fell out of the spin path because of need_resched(),
435          * reschedule now, before we try-lock again. This avoids getting
436          * scheduled out right after we obtained the lock.
437          */
438         if (need_resched())
439                 schedule();
440
441         return false;
442 }
443
444 #else /* CONFIG_LOCK_SPIN_ON_OWNER */
445
446 static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
447 {
448         return false;
449 }
450
451 #endif
452
453 noinline
454 static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
455                                     six_lock_should_sleep_fn should_sleep_fn, void *p)
456 {
457         union six_lock_state old;
458         struct six_lock_waiter wait;
459         int ret = 0;
460
461         if (type == SIX_LOCK_write) {
462                 EBUG_ON(lock->state.write_locking);
463                 atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
464                 smp_mb__after_atomic();
465         }
466
467         ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
468         if (ret)
469                 goto out_before_sleep;
470
471         if (six_optimistic_spin(lock, type))
472                 goto out_before_sleep;
473
474         lock_contended(&lock->dep_map, _RET_IP_);
475
476         INIT_LIST_HEAD(&wait.list);
477         wait.task = current;
478
479         while (1) {
480                 set_current_state(TASK_UNINTERRUPTIBLE);
481                 if (type == SIX_LOCK_write)
482                         EBUG_ON(lock->owner != current);
483                 else if (list_empty_careful(&wait.list)) {
484                         raw_spin_lock(&lock->wait_lock);
485                         list_add_tail(&wait.list, &lock->wait_list[type]);
486                         raw_spin_unlock(&lock->wait_lock);
487                 }
488
489                 if (do_six_trylock_type(lock, type, false))
490                         break;
491
492                 ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
493                 if (ret)
494                         break;
495
496                 schedule();
497         }
498
499         __set_current_state(TASK_RUNNING);
500
501         if (!list_empty_careful(&wait.list)) {
502                 raw_spin_lock(&lock->wait_lock);
503                 list_del_init(&wait.list);
504                 raw_spin_unlock(&lock->wait_lock);
505         }
506 out_before_sleep:
507         if (ret && type == SIX_LOCK_write) {
508                 old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
509                                             &lock->state.counter);
510                 six_lock_wakeup(lock, old, SIX_LOCK_read);
511         }
512
513         return ret;
514 }
515
516 __always_inline
517 static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
518                            six_lock_should_sleep_fn should_sleep_fn, void *p)
519 {
520         int ret;
521
522         if (type != SIX_LOCK_write)
523                 six_acquire(&lock->dep_map, 0);
524
525         ret = do_six_trylock_type(lock, type, true) ? 0
526                 : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
527
528         if (ret && type != SIX_LOCK_write)
529                 six_release(&lock->dep_map);
530         if (!ret)
531                 lock_acquired(&lock->dep_map, _RET_IP_);
532
533         return ret;
534 }
535
536 __always_inline __flatten
537 static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
538 {
539         const struct six_lock_vals l[] = LOCK_VALS;
540         union six_lock_state state;
541
542         EBUG_ON(type == SIX_LOCK_write &&
543                 !(lock->state.v & __SIX_LOCK_HELD_intent));
544
545         if (type != SIX_LOCK_write)
546                 six_release(&lock->dep_map);
547
548         if (type == SIX_LOCK_intent) {
549                 EBUG_ON(lock->owner != current);
550
551                 if (lock->intent_lock_recurse) {
552                         --lock->intent_lock_recurse;
553                         return;
554                 }
555
556                 lock->owner = NULL;
557         }
558
559         if (type == SIX_LOCK_read &&
560             lock->readers) {
561                 smp_mb(); /* unlock barrier */
562                 this_cpu_dec(*lock->readers);
563                 state.v = READ_ONCE(lock->state.v);
564         } else {
565                 EBUG_ON(!(lock->state.v & l[type].held_mask));
566                 state.v = atomic64_add_return_release(l[type].unlock_val,
567                                                       &lock->state.counter);
568         }
569
570         six_lock_wakeup(lock, state, l[type].unlock_wakeup);
571 }
572
573 #define __SIX_LOCK(type)                                                \
574 bool six_trylock_##type(struct six_lock *lock)                          \
575 {                                                                       \
576         return __six_trylock_type(lock, SIX_LOCK_##type);               \
577 }                                                                       \
578 EXPORT_SYMBOL_GPL(six_trylock_##type);                                  \
579                                                                         \
580 bool six_relock_##type(struct six_lock *lock, u32 seq)                  \
581 {                                                                       \
582         return __six_relock_type(lock, SIX_LOCK_##type, seq);           \
583 }                                                                       \
584 EXPORT_SYMBOL_GPL(six_relock_##type);                                   \
585                                                                         \
586 int six_lock_##type(struct six_lock *lock,                              \
587                     six_lock_should_sleep_fn should_sleep_fn, void *p)  \
588 {                                                                       \
589         return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
590 }                                                                       \
591 EXPORT_SYMBOL_GPL(six_lock_##type);                                     \
592                                                                         \
593 void six_unlock_##type(struct six_lock *lock)                           \
594 {                                                                       \
595         __six_unlock_type(lock, SIX_LOCK_##type);                       \
596 }                                                                       \
597 EXPORT_SYMBOL_GPL(six_unlock_##type);
598
599 __SIX_LOCK(read)
600 __SIX_LOCK(intent)
601 __SIX_LOCK(write)
602
603 #undef __SIX_LOCK
604
605 /* Convert from intent to read: */
606 void six_lock_downgrade(struct six_lock *lock)
607 {
608         six_lock_increment(lock, SIX_LOCK_read);
609         six_unlock_intent(lock);
610 }
611 EXPORT_SYMBOL_GPL(six_lock_downgrade);
612
613 bool six_lock_tryupgrade(struct six_lock *lock)
614 {
615         union six_lock_state old, new;
616         u64 v = READ_ONCE(lock->state.v);
617
618         do {
619                 new.v = old.v = v;
620
621                 if (new.intent_lock)
622                         return false;
623
624                 if (!lock->readers) {
625                         EBUG_ON(!new.read_lock);
626                         new.read_lock--;
627                 }
628
629                 new.intent_lock = 1;
630         } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
631                                 old.v, new.v)) != old.v);
632
633         if (lock->readers)
634                 this_cpu_dec(*lock->readers);
635
636         six_set_owner(lock, SIX_LOCK_intent, old);
637
638         return true;
639 }
640 EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
641
642 bool six_trylock_convert(struct six_lock *lock,
643                          enum six_lock_type from,
644                          enum six_lock_type to)
645 {
646         EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
647
648         if (to == from)
649                 return true;
650
651         if (to == SIX_LOCK_read) {
652                 six_lock_downgrade(lock);
653                 return true;
654         } else {
655                 return six_lock_tryupgrade(lock);
656         }
657 }
658 EXPORT_SYMBOL_GPL(six_trylock_convert);
659
660 /*
661  * Increment read/intent lock count, assuming we already have it read or intent
662  * locked:
663  */
664 void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
665 {
666         const struct six_lock_vals l[] = LOCK_VALS;
667
668         six_acquire(&lock->dep_map, 0);
669
670         /* XXX: assert already locked, and that we don't overflow: */
671
672         switch (type) {
673         case SIX_LOCK_read:
674                 if (lock->readers) {
675                         this_cpu_inc(*lock->readers);
676                 } else {
677                         EBUG_ON(!lock->state.read_lock &&
678                                 !lock->state.intent_lock);
679                         atomic64_add(l[type].lock_val, &lock->state.counter);
680                 }
681                 break;
682         case SIX_LOCK_intent:
683                 EBUG_ON(!lock->state.intent_lock);
684                 lock->intent_lock_recurse++;
685                 break;
686         case SIX_LOCK_write:
687                 BUG();
688                 break;
689         }
690 }
691 EXPORT_SYMBOL_GPL(six_lock_increment);
692
693 void six_lock_wakeup_all(struct six_lock *lock)
694 {
695         struct six_lock_waiter *w;
696
697         raw_spin_lock(&lock->wait_lock);
698
699         list_for_each_entry(w, &lock->wait_list[0], list)
700                 wake_up_process(w->task);
701         list_for_each_entry(w, &lock->wait_list[1], list)
702                 wake_up_process(w->task);
703
704         raw_spin_unlock(&lock->wait_lock);
705 }
706 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
707
708 void six_lock_pcpu_free(struct six_lock *lock)
709 {
710         BUG_ON(lock->readers && pcpu_read_count(lock));
711         BUG_ON(lock->state.read_lock);
712
713         free_percpu(lock->readers);
714         lock->readers = NULL;
715 }
716 EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
717
718 void six_lock_pcpu_alloc(struct six_lock *lock)
719 {
720         BUG_ON(lock->readers && pcpu_read_count(lock));
721         BUG_ON(lock->state.read_lock);
722 #ifdef __KERNEL__
723         if (!lock->readers)
724                 lock->readers = alloc_percpu(unsigned);
725 #endif
726 }
727 EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);