libbcache/super.c
1 /*
2  * bcache setup/teardown code, and some metadata io - read a superblock and
3  * figure out what to do with it.
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcache.h"
10 #include "blockdev.h"
11 #include "alloc.h"
12 #include "btree_cache.h"
13 #include "btree_gc.h"
14 #include "btree_update.h"
15 #include "btree_io.h"
16 #include "chardev.h"
17 #include "checksum.h"
18 #include "clock.h"
19 #include "compress.h"
20 #include "debug.h"
21 #include "error.h"
22 #include "fs.h"
23 #include "fs-gc.h"
24 #include "inode.h"
25 #include "io.h"
26 #include "journal.h"
27 #include "keylist.h"
28 #include "move.h"
29 #include "migrate.h"
30 #include "movinggc.h"
31 #include "notify.h"
32 #include "stats.h"
33 #include "super.h"
34 #include "super-io.h"
35 #include "tier.h"
36 #include "writeback.h"
37
38 #include <linux/backing-dev.h>
39 #include <linux/blkdev.h>
40 #include <linux/debugfs.h>
41 #include <linux/device.h>
42 #include <linux/genhd.h>
43 #include <linux/idr.h>
44 #include <linux/kthread.h>
45 #include <linux/module.h>
46 #include <linux/percpu.h>
47 #include <linux/random.h>
48 #include <linux/reboot.h>
49 #include <linux/sysfs.h>
50 #include <crypto/hash.h>
51
52 #include <trace/events/bcache.h>
53
54 MODULE_LICENSE("GPL");
55 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
56
57 static const uuid_le invalid_uuid = {
58         .b = {
59                 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
60                 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
61         }
62 };
63
64 static struct kset *bcache_kset;
65 struct mutex bch_register_lock;
66 LIST_HEAD(bch_fs_list);
67
68 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
69 struct workqueue_struct *bcache_io_wq;
70 struct crypto_shash *bch_sha256;
71
72 static void bch_dev_stop(struct cache *);
73 static int bch_dev_online(struct cache *);
74
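/*
 * Congestion callback for the cache set's backing_dev_info: reads can go to
 * any member device, so for sync/read congestion we check every device;
 * writes only go to tier 0, so for write congestion only tier 0 devices are
 * checked.
 */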
75 static int bch_congested_fn(void *data, int bdi_bits)
76 {
77         struct backing_dev_info *bdi;
78         struct cache_set *c = data;
79         struct cache *ca;
80         unsigned i;
81         int ret = 0;
82
83         rcu_read_lock();
84         if (bdi_bits & (1 << WB_sync_congested)) {
85                 /* Reads - check all devices: */
86                 for_each_cache_rcu(ca, c, i) {
87                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
88
89                         if (bdi_congested(bdi, bdi_bits)) {
90                                 ret = 1;
91                                 break;
92                         }
93                 }
94         } else {
95                 /* Writes only go to tier 0: */
96                 group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
97                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
98
99                         if (bdi_congested(bdi, bdi_bits)) {
100                                 ret = 1;
101                                 break;
102                         }
103                 }
104         }
105         rcu_read_unlock();
106
107         return ret;
108 }
109
110 /* Cache set RO/RW: */
111
112 /*
113  * For startup/shutdown of RW stuff, the dependencies are:
114  *
115  * - foreground writes depend on copygc and tiering (to free up space)
116  *
117  * - copygc and tiering depend on mark and sweep gc (they actually probably
118  *   don't because they either reserve ahead of time or don't block if
119  *   allocations fail, but allocations can require mark and sweep gc to run
120  *   because of generation number wraparound)
121  *
122  * - all of the above depends on the allocator threads
123  *
124  * - allocator depends on the journal (when it rewrites prios and gens)
125  */
126
127 static void __bch_fs_read_only(struct cache_set *c)
128 {
129         struct cache *ca;
130         unsigned i;
131
132         c->tiering_pd.rate.rate = UINT_MAX;
133         bch_ratelimit_reset(&c->tiering_pd.rate);
134         bch_tiering_read_stop(c);
135
136         for_each_cache(ca, c, i)
137                 bch_moving_gc_stop(ca);
138
139         bch_gc_thread_stop(c);
140
141         bch_btree_flush(c);
142
143         for_each_cache(ca, c, i)
144                 bch_dev_allocator_stop(ca);
145
146         /*
147          * Write a journal entry after flushing the btree, so we don't end up
148          * replaying everything we just flushed:
149          */
150         if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
151                 int ret;
152
153                 bch_journal_flush_async(&c->journal, NULL);
154                 ret = bch_journal_meta(&c->journal);
155                 BUG_ON(ret && !bch_journal_error(&c->journal));
156         }
157
158         cancel_delayed_work_sync(&c->journal.write_work);
159         cancel_delayed_work_sync(&c->journal.reclaim_work);
160 }
161
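/*
 * Release callback for &c->writes (set up via percpu_ref_init() in
 * bch_fs_alloc()): runs once percpu_ref_kill() has been issued and the last
 * write reference has been dropped.
 */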
162 static void bch_writes_disabled(struct percpu_ref *writes)
163 {
164         struct cache_set *c = container_of(writes, struct cache_set, writes);
165
166         set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
167         wake_up(&bch_read_only_wait);
168 }
169
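/*
 * Worker that actually takes the filesystem read-only: queued by
 * bch_fs_read_only(), it waits for outstanding writes (unless this is an
 * emergency shutdown), tears down the RW machinery via __bch_fs_read_only(),
 * and marks the superblock clean if no errors occurred.
 */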
170 static void bch_fs_read_only_work(struct work_struct *work)
171 {
172         struct cache_set *c =
173                 container_of(work, struct cache_set, read_only_work);
174
175         percpu_ref_put(&c->writes);
176
177         del_timer(&c->foreground_write_wakeup);
178         cancel_delayed_work(&c->pd_controllers_update);
179
180         c->foreground_write_pd.rate.rate = UINT_MAX;
181         bch_wake_delayed_writes((unsigned long) c);
182
183         if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
184                 /*
185                  * If we're not doing an emergency shutdown, we want to wait on
186                  * outstanding writes to complete so they don't see spurious
187                  * errors due to shutting down the allocator:
188                  */
189                 wait_event(bch_read_only_wait,
190                            test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
191
192                 __bch_fs_read_only(c);
193
194                 if (!bch_journal_error(&c->journal) &&
195                     !test_bit(BCH_FS_ERROR, &c->flags)) {
196                         mutex_lock(&c->sb_lock);
197                         SET_BCH_SB_CLEAN(c->disk_sb, true);
198                         bch_write_super(c);
199                         mutex_unlock(&c->sb_lock);
200                 }
201         } else {
202                 /*
203                  * If we are doing an emergency shutdown, outstanding writes may
204                  * hang until we shut down the allocator, so we don't want to wait
205                  * on outstanding writes before shutting everything down - but
206                  * we do need to wait on them before returning and signalling
207                  * that going RO is complete:
208                  */
209                 __bch_fs_read_only(c);
210
211                 wait_event(bch_read_only_wait,
212                            test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
213         }
214
215         bch_notify_fs_read_only(c);
216         trace_fs_read_only_done(c);
217
218         set_bit(BCH_FS_RO_COMPLETE, &c->flags);
219         wake_up(&bch_read_only_wait);
220 }
221
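/*
 * Begin the transition to read-only; returns false if a transition was
 * already in progress. The rest of the work happens asynchronously in
 * bch_fs_read_only_work().
 */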
222 bool bch_fs_read_only(struct cache_set *c)
223 {
224         if (test_and_set_bit(BCH_FS_RO, &c->flags))
225                 return false;
226
227         trace_fs_read_only(c);
228
229         percpu_ref_get(&c->writes);
230
231         /*
232          * Block new foreground-end write operations from starting - any new
233          * writes will return -EROFS:
234          *
235          * (This is really blocking new _allocations_ - writes to previously
236          * allocated space can still happen until the allocator is stopped in
237          * bch_dev_allocator_stop()).
238          */
239         percpu_ref_kill(&c->writes);
240
241         queue_work(system_freezable_wq, &c->read_only_work);
242         return true;
243 }
244
245 bool bch_fs_emergency_read_only(struct cache_set *c)
246 {
247         bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
248
249         bch_fs_read_only(c);
250         bch_journal_halt(&c->journal);
251
252         wake_up(&bch_read_only_wait);
253         return ret;
254 }
255
256 void bch_fs_read_only_sync(struct cache_set *c)
257 {
258         /* so we don't race with bch_fs_read_write() */
259         lockdep_assert_held(&bch_register_lock);
260
261         bch_fs_read_only(c);
262
263         wait_event(bch_read_only_wait,
264                    test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
265                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
266 }
267
268 static const char *__bch_fs_read_write(struct cache_set *c)
269 {
270         struct cache *ca;
271         const char *err;
272         unsigned i;
273
274         lockdep_assert_held(&bch_register_lock);
275
276         err = "error starting allocator thread";
277         for_each_cache(ca, c, i)
278                 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
279                     bch_dev_allocator_start(ca)) {
280                         percpu_ref_put(&ca->ref);
281                         goto err;
282                 }
283
284         err = "error starting btree GC thread";
285         if (bch_gc_thread_start(c))
286                 goto err;
287
288         for_each_cache(ca, c, i) {
289                 if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
290                         continue;
291
292                 err = "error starting moving GC thread";
293                 if (bch_moving_gc_thread_start(ca)) {
294                         percpu_ref_put(&ca->ref);
295                         goto err;
296                 }
297         }
298
299         err = "error starting tiering thread";
300         if (bch_tiering_read_start(c))
301                 goto err;
302
303         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
304
305         return NULL;
306 err:
307         __bch_fs_read_only(c);
308         return err;
309 }
310
311 const char *bch_fs_read_write(struct cache_set *c)
312 {
313         const char *err;
314
315         lockdep_assert_held(&bch_register_lock);
316
317         if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
318                 return NULL;
319
320         err = __bch_fs_read_write(c);
321         if (err)
322                 return err;
323
324         percpu_ref_reinit(&c->writes);
325
326         clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
327         clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
328         clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
329         clear_bit(BCH_FS_RO, &c->flags);
330         return NULL;
331 }
332
333 /* Cache set startup/shutdown: */
334
335 static void bch_fs_free(struct cache_set *c)
336 {
337         del_timer_sync(&c->foreground_write_wakeup);
338         cancel_delayed_work_sync(&c->pd_controllers_update);
339         cancel_work_sync(&c->read_only_work);
340         cancel_work_sync(&c->bio_submit_work);
341         cancel_work_sync(&c->read_retry_work);
342
343         bch_fs_encryption_free(c);
344         bch_btree_cache_free(c);
345         bch_journal_free(&c->journal);
346         bch_io_clock_exit(&c->io_clock[WRITE]);
347         bch_io_clock_exit(&c->io_clock[READ]);
348         bch_compress_free(c);
349         bch_fs_blockdev_exit(c);
350         bdi_destroy(&c->bdi);
351         lg_lock_free(&c->bucket_stats_lock);
352         free_percpu(c->bucket_stats_percpu);
353         mempool_exit(&c->btree_bounce_pool);
354         mempool_exit(&c->bio_bounce_pages);
355         bioset_exit(&c->bio_write);
356         bioset_exit(&c->bio_read_split);
357         bioset_exit(&c->bio_read);
358         bioset_exit(&c->btree_read_bio);
359         mempool_exit(&c->btree_interior_update_pool);
360         mempool_exit(&c->btree_reserve_pool);
361         mempool_exit(&c->fill_iter);
362         percpu_ref_exit(&c->writes);
363
364         if (c->copygc_wq)
365                 destroy_workqueue(c->copygc_wq);
366         if (c->wq)
367                 destroy_workqueue(c->wq);
368
369         kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
370         free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
371         kfree(c);
372         module_put(THIS_MODULE);
373 }
374
375 /*
376  * Final phase of shutdown (conceptually __bch_fs_stop4): block devices are
377  * closed, and now we can finally free the cache_set
378  */
379 void bch_fs_release(struct kobject *kobj)
380 {
381         struct cache_set *c = container_of(kobj, struct cache_set, kobj);
382         struct completion *stop_completion = c->stop_completion;
383
384         bch_notify_fs_stopped(c);
385         bch_info(c, "stopped");
386
387         bch_fs_free(c);
388
389         if (stop_completion)
390                 complete(stop_completion);
391 }
392
393 /*
394  * All activity on the cache_set should have stopped now - close devices:
395  */
396 static void __bch_fs_stop3(struct closure *cl)
397 {
398         struct cache_set *c = container_of(cl, struct cache_set, cl);
399         struct cache *ca;
400         unsigned i;
401
402         mutex_lock(&bch_register_lock);
403         for_each_cache(ca, c, i)
404                 bch_dev_stop(ca);
405
406         list_del(&c->list);
407         mutex_unlock(&bch_register_lock);
408
409         closure_debug_destroy(&c->cl);
410         kobject_put(&c->kobj);
411 }
412
413 /*
414  * Openers (i.e. block devices) should have exited; shut down all userspace
415  * interfaces and wait for &c->cl to hit 0
416  */
417 static void __bch_fs_stop2(struct closure *cl)
418 {
419         struct cache_set *c = container_of(cl, struct cache_set, caching);
420
421         bch_debug_exit_cache_set(c);
422         bch_fs_chardev_exit(c);
423
424         if (c->kobj.state_in_sysfs)
425                 kobject_del(&c->kobj);
426
427         bch_cache_accounting_destroy(&c->accounting);
428
429         kobject_put(&c->time_stats);
430         kobject_put(&c->opts_dir);
431         kobject_put(&c->internal);
432
433         mutex_lock(&bch_register_lock);
434         bch_fs_read_only_sync(c);
435         mutex_unlock(&bch_register_lock);
436
437         closure_return(cl);
438 }
439
440 /*
441  * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
442  * haven't waited for anything to stop yet, we're just punting to process
443  * context to shut down block devices:
444  */
445 static void __bch_fs_stop1(struct closure *cl)
446 {
447         struct cache_set *c = container_of(cl, struct cache_set, caching);
448
449         bch_blockdevs_stop(c);
450
451         continue_at(cl, __bch_fs_stop2, system_wq);
452 }
453
454 void bch_fs_stop(struct cache_set *c)
455 {
456         if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
457                 closure_queue(&c->caching);
458 }
459
460 void bch_fs_stop_sync(struct cache_set *c)
461 {
462         DECLARE_COMPLETION_ONSTACK(complete);
463
464         c->stop_completion = &complete;
465         bch_fs_stop(c);
466         closure_put(&c->cl);
467
468         /* Killable? */
469         wait_for_completion(&complete);
470 }
471
472 /* Stop, detaching from backing devices: */
473 void bch_fs_detach(struct cache_set *c)
474 {
475         if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
476                 bch_fs_stop(c);
477 }
478
479 static unsigned bch_fs_nr_devices(struct cache_set *c)
480 {
481         struct bch_sb_field_members *mi;
482         unsigned i, nr = 0;
483
484         mutex_lock(&c->sb_lock);
485         mi = bch_sb_get_members(c->disk_sb);
486
487         for (i = 0; i < c->disk_sb->nr_devices; i++)
488                 if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
489                         nr++;
490
491         mutex_unlock(&c->sb_lock);
492
493         return nr;
494 }
495
496 static unsigned bch_fs_nr_online_devices(struct cache_set *c)
497 {
498         unsigned i, nr = 0;
499
500         for (i = 0; i < c->sb.nr_devices; i++)
501                 if (c->cache[i])
502                         nr++;
503
504         return nr;
505 }
506
507 #define alloc_bucket_pages(gfp, ca)                     \
508         ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
509
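/*
 * Allocate a new cache_set and initialize it from @sb and @opts: set up
 * locks, lists and default tunables, copy in the superblock fields, and
 * allocate the workqueues, mempools and biosets needed at runtime.
 */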
510 static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
511 {
512         struct cache_set *c;
513         unsigned iter_size, journal_entry_bytes;
514
515         c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
516         if (!c)
517                 return NULL;
518
519         __module_get(THIS_MODULE);
520
521         c->minor                = -1;
522
523         mutex_init(&c->sb_lock);
524         INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
525         mutex_init(&c->btree_cache_lock);
526         mutex_init(&c->bucket_lock);
527         mutex_init(&c->btree_root_lock);
528         INIT_WORK(&c->read_only_work, bch_fs_read_only_work);
529
530         init_rwsem(&c->gc_lock);
531
532 #define BCH_TIME_STAT(name, frequency_units, duration_units)            \
533         spin_lock_init(&c->name##_time.lock);
534         BCH_TIME_STATS()
535 #undef BCH_TIME_STAT
536
537         bch_open_buckets_init(c);
538         bch_tiering_init_cache_set(c);
539
540         INIT_LIST_HEAD(&c->list);
541         INIT_LIST_HEAD(&c->cached_devs);
542         INIT_LIST_HEAD(&c->btree_cache);
543         INIT_LIST_HEAD(&c->btree_cache_freeable);
544         INIT_LIST_HEAD(&c->btree_cache_freed);
545
546         INIT_LIST_HEAD(&c->btree_interior_update_list);
547         mutex_init(&c->btree_reserve_cache_lock);
548         mutex_init(&c->btree_interior_update_lock);
549
550         mutex_init(&c->bio_bounce_pages_lock);
551         INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
552         spin_lock_init(&c->bio_submit_lock);
553         bio_list_init(&c->read_retry_list);
554         spin_lock_init(&c->read_retry_lock);
555         INIT_WORK(&c->read_retry_work, bch_read_retry_work);
556         mutex_init(&c->zlib_workspace_lock);
557
558         seqcount_init(&c->gc_pos_lock);
559
560         c->prio_clock[READ].hand = 1;
561         c->prio_clock[READ].min_prio = 0;
562         c->prio_clock[WRITE].hand = 1;
563         c->prio_clock[WRITE].min_prio = 0;
564
565         c->congested_read_threshold_us  = 2000;
566         c->congested_write_threshold_us = 20000;
567         c->error_limit  = 16 << IO_ERROR_SHIFT;
568         init_waitqueue_head(&c->writeback_wait);
569
570         c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
571
572         c->copy_gc_enabled = 1;
573         c->tiering_enabled = 1;
574         c->tiering_percent = 10;
575
576         c->foreground_target_percent = 20;
577
578         c->journal.write_time   = &c->journal_write_time;
579         c->journal.delay_time   = &c->journal_delay_time;
580         c->journal.blocked_time = &c->journal_blocked_time;
581         c->journal.flush_seq_time = &c->journal_flush_seq_time;
582
583         mutex_init(&c->uevent_lock);
584
585         mutex_lock(&c->sb_lock);
586
587         if (bch_sb_to_cache_set(c, sb)) {
588                 mutex_unlock(&c->sb_lock);
589                 goto err;
590         }
591
592         mutex_unlock(&c->sb_lock);
593
594         scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
595
596         bch_opts_apply(&c->opts, bch_sb_opts(sb));
597         bch_opts_apply(&c->opts, opts);
598
599         c->opts.nochanges       |= c->opts.noreplay;
600         c->opts.read_only       |= c->opts.nochanges;
601
602         c->block_bits           = ilog2(c->sb.block_size);
603
604         if (bch_fs_init_fault("fs_alloc"))
605                 goto err;
606
607         iter_size = (btree_blocks(c) + 1) * 2 *
608                 sizeof(struct btree_node_iter_set);
609
610         journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
611
612         if (!(c->wq = alloc_workqueue("bcache",
613                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
614             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
615                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
616             percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
617             mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
618                                       sizeof(struct btree_reserve)) ||
619             mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
620                                       sizeof(struct btree_interior_update)) ||
621             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
622             bioset_init(&c->btree_read_bio, 1, 0) ||
623             bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
624             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
625             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
626             mempool_init_page_pool(&c->bio_bounce_pages,
627                                    max_t(unsigned,
628                                          c->sb.btree_node_size,
629                                          BCH_ENCODED_EXTENT_MAX) /
630                                    PAGE_SECTORS, 0) ||
631             !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
632             lg_lock_init(&c->bucket_stats_lock) ||
633             mempool_init_page_pool(&c->btree_bounce_pool, 1,
634                                    ilog2(btree_pages(c))) ||
635             bdi_setup_and_register(&c->bdi, "bcache") ||
636             bch_fs_blockdev_init(c) ||
637             bch_io_clock_init(&c->io_clock[READ]) ||
638             bch_io_clock_init(&c->io_clock[WRITE]) ||
639             bch_journal_alloc(&c->journal, journal_entry_bytes) ||
640             bch_btree_cache_alloc(c) ||
641             bch_fs_encryption_init(c) ||
642             bch_compress_init(c) ||
643             bch_check_set_has_compressed_data(c, c->opts.compression))
644                 goto err;
645
646         c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
647         c->bdi.congested_fn     = bch_congested_fn;
648         c->bdi.congested_data   = c;
649
650         /*
651          * Now that all allocations have succeeded, init various refcounty
652          * things that let us shut down:
653          */
654         closure_init(&c->cl, NULL);
655
656         c->kobj.kset = bcache_kset;
657         kobject_init(&c->kobj, &bch_fs_ktype);
658         kobject_init(&c->internal, &bch_fs_internal_ktype);
659         kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype);
660         kobject_init(&c->time_stats, &bch_fs_time_stats_ktype);
661
662         bch_cache_accounting_init(&c->accounting, &c->cl);
663
664         closure_init(&c->caching, &c->cl);
665         set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
666
667         continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
668         return c;
669 err:
670         bch_fs_free(c);
671         return NULL;
672 }
673
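/*
 * Make the cache set visible to userspace: add it to bch_fs_list, register
 * the character device, create the sysfs kobjects and bring each member
 * device online in sysfs.
 */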
674 static int bch_fs_online(struct cache_set *c)
675 {
676         struct cache *ca;
677         unsigned i;
678         int ret;
679
680         lockdep_assert_held(&bch_register_lock);
681
682         if (!list_empty(&c->list))
683                 return 0;
684
685         list_add(&c->list, &bch_fs_list);
686
687         ret = bch_fs_chardev_init(c);
688         if (ret)
689                 return ret;
690
691         if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
692             kobject_add(&c->internal, &c->kobj, "internal") ||
693             kobject_add(&c->opts_dir, &c->kobj, "options") ||
694             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
695             bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
696                 return -1;
697
698         for_each_cache(ca, c, i)
699                 if (bch_dev_online(ca)) {
700                         percpu_ref_put(&ca->ref);
701                         return -1;
702                 }
703
704         return 0;
705 }
706
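/*
 * Bring the filesystem up: for an existing filesystem, read the journal and
 * priorities, read the btree roots, run initial mark and sweep, replay the
 * journal and run fsck; for a new filesystem, allocate journal buckets,
 * btree roots and the root directory inode. Then go read-write (unless
 * read_only) and write out the superblock.
 */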
707 static const char *bch_fs_start(struct cache_set *c)
708 {
709         const char *err = "cannot allocate memory";
710         struct bch_sb_field_members *mi;
711         struct cache *ca;
712         unsigned i, id;
713         time64_t now;
714         LIST_HEAD(journal);
715         struct jset *j;
716         int ret = -EINVAL;
717
718         lockdep_assert_held(&bch_register_lock);
719         BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
720
721         /* We don't want bch_fatal_error() to free underneath us */
722         closure_get(&c->caching);
723
724         /*
725          * Make sure that each cache object's mi is up to date before
726          * we start testing it.
727          */
728         for_each_cache(ca, c, i)
729                 bch_sb_from_cache_set(c, ca);
730
731         if (BCH_SB_INITIALIZED(c->disk_sb)) {
732                 ret = bch_journal_read(c, &journal);
733                 if (ret)
734                         goto err;
735
736                 pr_debug("btree_journal_read() done");
737
738                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
739
740                 err = "error reading priorities";
741                 for_each_cache(ca, c, i) {
742                         ret = bch_prio_read(ca);
743                         if (ret) {
744                                 percpu_ref_put(&ca->ref);
745                                 goto err;
746                         }
747                 }
748
749                 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
750                 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
751
752                 for_each_cache(ca, c, i) {
753                         bch_recalc_min_prio(ca, READ);
754                         bch_recalc_min_prio(ca, WRITE);
755                 }
756
757                 for (id = 0; id < BTREE_ID_NR; id++) {
758                         unsigned level;
759                         struct bkey_i *k;
760
761                         err = "bad btree root";
762                         k = bch_journal_find_btree_root(c, j, id, &level);
763                         if (!k && id == BTREE_ID_EXTENTS)
764                                 goto err;
765                         if (!k) {
766                                 pr_debug("missing btree root: %d", id);
767                                 continue;
768                         }
769
770                         err = "error reading btree root";
771                         if (bch_btree_root_read(c, id, k, level))
772                                 goto err;
773                 }
774
775                 bch_verbose(c, "starting mark and sweep:");
776
777                 err = "error in recovery";
778                 if (bch_initial_gc(c, &journal))
779                         goto err;
780
781                 if (c->opts.noreplay)
782                         goto recovery_done;
783
784                 bch_verbose(c, "mark and sweep done");
785
786                 /*
787                  * bch_journal_start() can't happen sooner, or btree_gc_finish()
788                  * will give spurious errors about oldest_gen > bucket_gen -
789                  * this is a hack but oh well.
790                  */
791                 bch_journal_start(c);
792
793                 err = "error starting allocator thread";
794                 for_each_cache(ca, c, i)
795                         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
796                             bch_dev_allocator_start(ca)) {
797                                 percpu_ref_put(&ca->ref);
798                                 goto err;
799                         }
800
801                 bch_verbose(c, "starting journal replay:");
802
803                 err = "journal replay failed";
804                 ret = bch_journal_replay(c, &journal);
805                 if (ret)
806                         goto err;
807
808                 bch_verbose(c, "journal replay done");
809
810                 if (c->opts.norecovery)
811                         goto recovery_done;
812
813                 bch_verbose(c, "starting fsck:");
814                 err = "error in fsck";
815                 ret = bch_fsck(c, !c->opts.nofsck);
816                 if (ret)
817                         goto err;
818
819                 bch_verbose(c, "fsck done");
820         } else {
821                 struct bch_inode_unpacked inode;
822                 struct bkey_inode_buf packed_inode;
823                 struct closure cl;
824
825                 closure_init_stack(&cl);
826
827                 bch_notice(c, "initializing new filesystem");
828
829                 err = "unable to allocate journal buckets";
830                 for_each_cache(ca, c, i)
831                         if (bch_dev_journal_alloc(ca)) {
832                                 percpu_ref_put(&ca->ref);
833                                 goto err;
834                         }
835
836                 bch_initial_gc(c, NULL);
837
838                 /*
839                  * journal_res_get() will crash if called before this has
840                  * set up the journal.pin FIFO and journal.cur pointer:
841                  */
842                 bch_journal_start(c);
843                 bch_journal_set_replay_done(&c->journal);
844
845                 err = "error starting allocator thread";
846                 for_each_cache(ca, c, i)
847                         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
848                             bch_dev_allocator_start(ca)) {
849                                 percpu_ref_put(&ca->ref);
850                                 goto err;
851                         }
852
853                 err = "cannot allocate new btree root";
854                 for (id = 0; id < BTREE_ID_NR; id++)
855                         if (bch_btree_root_alloc(c, id, &cl)) {
856                                 closure_sync(&cl);
857                                 goto err;
858                         }
859
860                 /* Wait for new btree roots to be written: */
861                 closure_sync(&cl);
862
863                 bch_inode_init(c, &inode, 0, 0,
864                                S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
865                 inode.inum = BCACHE_ROOT_INO;
866
867                 bch_inode_pack(&packed_inode, &inode);
868
869                 err = "error creating root directory";
870                 if (bch_btree_insert(c, BTREE_ID_INODES,
871                                      &packed_inode.inode.k_i,
872                                      NULL, NULL, NULL, 0))
873                         goto err;
874
875                 err = "error writing first journal entry";
876                 if (bch_journal_meta(&c->journal))
877                         goto err;
878         }
879 recovery_done:
880         if (c->opts.read_only) {
881                 bch_fs_read_only_sync(c);
882         } else {
883                 err = __bch_fs_read_write(c);
884                 if (err)
885                         goto err;
886         }
887
888         mutex_lock(&c->sb_lock);
889         mi = bch_sb_get_members(c->disk_sb);
890         now = ktime_get_seconds();
891
892         rcu_read_lock();
893         for_each_cache_rcu(ca, c, i)
894                 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
895         rcu_read_unlock();
896
897         SET_BCH_SB_INITIALIZED(c->disk_sb, true);
898         SET_BCH_SB_CLEAN(c->disk_sb, false);
899         c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
900
901         bch_write_super(c);
902         mutex_unlock(&c->sb_lock);
903
904         err = "dynamic fault";
905         if (bch_fs_init_fault("fs_start"))
906                 goto err;
907
908         err = "error creating kobject";
909         if (bch_fs_online(c))
910                 goto err;
911
912         err = "can't bring up blockdev volumes";
913         if (bch_blockdev_volumes_start(c))
914                 goto err;
915
916         bch_debug_init_cache_set(c);
917         set_bit(BCH_FS_RUNNING, &c->flags);
918         bch_attach_backing_devs(c);
919
920         bch_notify_fs_read_write(c);
921         err = NULL;
922 out:
923         bch_journal_entries_free(&journal);
924         closure_put(&c->caching);
925         return err;
926 err:
927         switch (ret) {
928         case BCH_FSCK_ERRORS_NOT_FIXED:
929                 bch_err(c, "filesystem contains errors: please report this to the developers");
930                 pr_cont("mount with -o fix_errors to repair");
931                 err = "fsck error";
932                 break;
933         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
934                 bch_err(c, "filesystem contains errors: please report this to the developers");
935                 pr_cont("repair unimplemented: inform the developers so that it can be added");
936                 err = "fsck error";
937                 break;
938         case BCH_FSCK_REPAIR_IMPOSSIBLE:
939                 bch_err(c, "filesystem contains errors, but repair impossible");
940                 err = "fsck error";
941                 break;
942         case BCH_FSCK_UNKNOWN_VERSION:
943                 err = "unknown metadata version";
944                 break;
945         case -ENOMEM:
946                 err = "cannot allocate memory";
947                 break;
948         case -EIO:
949                 err = "IO error";
950                 break;
951         }
952
953         BUG_ON(!err);
954         set_bit(BCH_FS_ERROR, &c->flags);
955         goto out;
956 }
957
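/* Check that a device's superblock is compatible with this cache set: */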
958 static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
959 {
960         struct bch_sb_field_members *sb_mi;
961
962         sb_mi = bch_sb_get_members(sb);
963         if (!sb_mi)
964                 return "Invalid superblock: member info area missing";
965
966         if (le16_to_cpu(sb->block_size) != c->sb.block_size)
967                 return "mismatched block size";
968
969         if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
970             BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
971                 return "new cache bucket_size is too small";
972
973         return NULL;
974 }
975
976 static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
977 {
978         struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
979         struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb);
980         uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid;
981         const char *err;
982
983         err = bch_dev_may_add(sb, c);
984         if (err)
985                 return err;
986
987         if (bch_is_zero(&dev_uuid, sizeof(dev_uuid)))
988                 return "device has been removed";
989
990         /*
991          * When attaching an existing device, the cache set superblock must
992          * already contain member_info with a matching UUID
993          */
994         if (sb->dev_idx >= c->disk_sb->nr_devices ||
995             memcmp(&mi->members[sb->dev_idx].uuid,
996                    &dev_uuid, sizeof(uuid_le)))
997                 return "cache sb does not match set";
998
999         return NULL;
1000 }
1001
1002 /* Cache device */
1003
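/*
 * Transition a member device to read-only: stop moving GC and the allocator,
 * drop the device from the journal device group, and persist the new member
 * state in the superblock. Forces the whole filesystem read-only first if
 * this device cannot be removed without doing so.
 */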
1004 bool bch_dev_read_only(struct cache *ca)
1005 {
1006         struct cache_set *c = ca->set;
1007         struct bch_sb_field_members *mi;
1008         char buf[BDEVNAME_SIZE];
1009
1010         bdevname(ca->disk_sb.bdev, buf);
1011
1012         lockdep_assert_held(&bch_register_lock);
1013
1014         if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
1015                 return false;
1016
1017         if (!bch_dev_may_remove(ca)) {
1018                 bch_err(c, "required member %s going RO, forcing fs RO", buf);
1019                 bch_fs_read_only_sync(c);
1020         }
1021
1022         trace_bcache_cache_read_only(ca);
1023
1024         bch_moving_gc_stop(ca);
1025
1026         /*
1027          * This stops new data writes (e.g. to existing open data
1028          * buckets) and then waits for all existing writes to
1029          * complete.
1030          */
1031         bch_dev_allocator_stop(ca);
1032
1033         bch_dev_group_remove(&c->journal.devs, ca);
1034
1035         /*
1036          * Device data write barrier -- no non-meta-data writes should
1037          * occur after this point.  However, writes to btree buckets,
1038          * journal buckets, and the superblock can still occur.
1039          */
1040         trace_bcache_cache_read_only_done(ca);
1041
1042         bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
1043         bch_notify_dev_read_only(ca);
1044
1045         mutex_lock(&c->sb_lock);
1046         mi = bch_sb_get_members(c->disk_sb);
1047         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
1048                              BCH_MEMBER_STATE_RO);
1049         bch_write_super(c);
1050         mutex_unlock(&c->sb_lock);
1051         return true;
1052 }
1053
1054 static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
1055 {
1056         lockdep_assert_held(&bch_register_lock);
1057
1058         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
1059                 return NULL;
1060
1061         if (test_bit(BCH_DEV_REMOVING, &ca->flags))
1062                 return "removing";
1063
1064         trace_bcache_cache_read_write(ca);
1065
1066         if (bch_dev_allocator_start(ca))
1067                 return "error starting allocator thread";
1068
1069         if (bch_moving_gc_thread_start(ca))
1070                 return "error starting moving GC thread";
1071
1072         bch_dev_group_add(&c->journal.devs, ca);
1073
1074         wake_up_process(c->tiering_read);
1075
1076         bch_notify_dev_read_write(ca);
1077         trace_bcache_cache_read_write_done(ca);
1078
1079         return NULL;
1080 }
1081
1082 const char *bch_dev_read_write(struct cache *ca)
1083 {
1084         struct cache_set *c = ca->set;
1085         struct bch_sb_field_members *mi;
1086         const char *err;
1087
1088         err = __bch_dev_read_write(c, ca);
1089         if (err)
1090                 return err;
1091
1092         mutex_lock(&c->sb_lock);
1093         mi = bch_sb_get_members(c->disk_sb);
1094         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
1095                              BCH_MEMBER_STATE_ACTIVE);
1096         bch_write_super(c);
1097         mutex_unlock(&c->sb_lock);
1098
1099         return NULL;
1100 }
1101
1102 /*
1103  * bch_dev_stop has already returned, so we no longer hold the register
1104  * lock at the point this is called.
1105  */
1106
1107 void bch_dev_release(struct kobject *kobj)
1108 {
1109         struct cache *ca = container_of(kobj, struct cache, kobj);
1110
1111         percpu_ref_exit(&ca->ref);
1112         kfree(ca);
1113 }
1114
1115 static void bch_dev_free_work(struct work_struct *work)
1116 {
1117         struct cache *ca = container_of(work, struct cache, free_work);
1118         struct cache_set *c = ca->set;
1119         unsigned i;
1120
1121         cancel_work_sync(&ca->io_error_work);
1122
1123         if (c && c->kobj.state_in_sysfs) {
1124                 char buf[12];
1125
1126                 sprintf(buf, "cache%u", ca->dev_idx);
1127                 sysfs_remove_link(&c->kobj, buf);
1128         }
1129
1130         if (ca->kobj.state_in_sysfs)
1131                 kobject_del(&ca->kobj);
1132
1133         bch_free_super(&ca->disk_sb);
1134
1135         /*
1136          * bch_dev_stop can be called in the middle of initialization
1137          * of the struct cache object.
1138          * As such, not all the sub-structures may be initialized.
1139          * However, they were zeroed when the object was allocated.
1140          */
1141
1142         bch_journal_free_cache(ca);
1143         free_percpu(ca->sectors_written);
1144         bioset_exit(&ca->replica_set);
1145         free_percpu(ca->bucket_stats_percpu);
1146         free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1147         kfree(ca->prio_buckets);
1148         kfree(ca->bio_prio);
1149         kfree(ca->journal.bio);
1150         vfree(ca->buckets);
1151         vfree(ca->oldest_gens);
1152         free_heap(&ca->heap);
1153         free_fifo(&ca->free_inc);
1154
1155         for (i = 0; i < RESERVE_NR; i++)
1156                 free_fifo(&ca->free[i]);
1157
1158         kobject_put(&ca->kobj);
1159
1160         if (c)
1161                 kobject_put(&c->kobj);
1162 }
1163
1164 static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
1165 {
1166         struct cache *ca = container_of(ref, struct cache, ref);
1167
1168         schedule_work(&ca->free_work);
1169 }
1170
1171 static void bch_dev_free_rcu(struct rcu_head *rcu)
1172 {
1173         struct cache *ca = container_of(rcu, struct cache, free_rcu);
1174
1175         /*
1176          * This decrements the ref count to ca, and once the ref count
1177          * is 0 (outstanding bios to the ca also incremented it and
1178          * decrement it on completion/error), bch_dev_percpu_ref_release
1179          * is called, and that eventually results in bch_dev_free_work
1180          * being called, which in turn results in bch_dev_release being
1181          * called.
1182          *
1183          * In particular, these functions won't be called until there are no
1184          * bios outstanding (the per-cpu ref counts are all 0), so it
1185          * is safe to remove the actual sysfs device at that point,
1186          * and that can indicate success to the user.
1187          */
1188
1189         percpu_ref_kill(&ca->ref);
1190 }
1191
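/*
 * Detach the device from the cache set's c->cache[] array and kick off
 * teardown: after an RCU grace period the device's percpu ref is killed,
 * which eventually ends in bch_dev_free_work()/bch_dev_release().
 */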
1192 static void bch_dev_stop(struct cache *ca)
1193 {
1194         struct cache_set *c = ca->set;
1195
1196         lockdep_assert_held(&bch_register_lock);
1197
1198         if (c) {
1199                 BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
1200                 rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
1201         }
1202
1203         call_rcu(&ca->free_rcu, bch_dev_free_rcu);
1204 }
1205
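/*
 * Worker that completes device removal: migrate data and metadata off the
 * device (or flag the data bad on a forced removal), then drop the device
 * from the journal, stop it, and clear its slot in the superblock's member
 * array.
 */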
1206 static void bch_dev_remove_work(struct work_struct *work)
1207 {
1208         struct cache *ca = container_of(work, struct cache, remove_work);
1209         struct bch_sb_field_members *mi;
1210         struct cache_set *c = ca->set;
1211         char name[BDEVNAME_SIZE];
1212         bool force = test_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
1213         unsigned dev_idx = ca->dev_idx;
1214
1215         bdevname(ca->disk_sb.bdev, name);
1216
1217         /*
1218          * Device should already be RO, now migrate data off:
1219          *
1220          * XXX: locking is sketchy, bch_dev_read_write() has to check
1221          * BCH_DEV_REMOVING bit
1222          */
1223         if (!ca->mi.has_data) {
1224                 /* Nothing to do: */
1225         } else if (!bch_move_data_off_device(ca)) {
1226                 mutex_lock(&c->sb_lock);
1227                 mi = bch_sb_get_members(c->disk_sb);
1228                 SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
1229
1230                 bch_write_super(c);
1231                 mutex_unlock(&c->sb_lock);
1232         } else if (force) {
1233                 bch_flag_data_bad(ca);
1234
1235                 mutex_lock(&c->sb_lock);
1236                 mi = bch_sb_get_members(c->disk_sb);
1237                 SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
1238
1239                 bch_write_super(c);
1240                 mutex_unlock(&c->sb_lock);
1241         } else {
1242                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1243                         name);
1244                 clear_bit(BCH_DEV_REMOVING, &ca->flags);
1245                 return;
1246         }
1247
1248         /* Now metadata: */
1249
1250         if (!ca->mi.has_metadata) {
1251                 /* Nothing to do: */
1252         } else if (!bch_move_meta_data_off_device(ca)) {
1253                 mutex_lock(&c->sb_lock);
1254                 mi = bch_sb_get_members(c->disk_sb);
1255                 SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
1256
1257                 bch_write_super(c);
1258                 mutex_unlock(&c->sb_lock);
1259         } else {
1260                 bch_err(c, "Remove of %s failed, unable to migrate metadata off",
1261                         name);
1262                 clear_bit(BCH_DEV_REMOVING, &ca->flags);
1263                 return;
1264         }
1265
1266         /*
1267          * Ok, really doing the remove:
1268          * Drop device's prio pointer before removing it from superblock:
1269          */
1270         bch_notify_dev_removed(ca);
1271
1272         spin_lock(&c->journal.lock);
1273         c->journal.prio_buckets[dev_idx] = 0;
1274         spin_unlock(&c->journal.lock);
1275
1276         bch_journal_meta(&c->journal);
1277
1278         /*
1279          * Stop device before removing it from the cache set's list of devices -
1280          * and get our own ref on cache set since ca is going away:
1281          */
1282         closure_get(&c->cl);
1283
1284         mutex_lock(&bch_register_lock);
1285         bch_dev_stop(ca);
1286
1287         /*
1288          * RCU barrier between dropping the device from c->cache and dropping it from
1289          * member info:
1290          */
1291         synchronize_rcu();
1292
1293         lockdep_assert_held(&bch_register_lock);
1294
1295         /*
1296          * Free this device's slot in the bch_member array - all pointers to
1297          * this device must be gone:
1298          */
1299         mutex_lock(&c->sb_lock);
1300         mi = bch_sb_get_members(c->disk_sb);
1301         memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1302
1303         bch_write_super(c);
1304         mutex_unlock(&c->sb_lock);
1305
1306         mutex_unlock(&bch_register_lock);
1307
1308         closure_put(&c->cl);
1309 }
1310
1311 bool bch_dev_remove(struct cache *ca, bool force)
1312 {
1313         mutex_lock(&bch_register_lock);
1314
1315         if (test_bit(BCH_DEV_REMOVING, &ca->flags)) {
                     mutex_unlock(&bch_register_lock);
1316                 return false;
             }
1317
1318         if (!bch_dev_may_remove(ca)) {
1319                 bch_err(ca->set, "Can't remove last device in tier %u",
1320                         ca->mi.tier);
1321                 bch_notify_dev_remove_failed(ca);
                     mutex_unlock(&bch_register_lock);
1322                 return false;
1323         }
1324
1325         /* First, go RO before we try to migrate data off: */
1326         bch_dev_read_only(ca);
1327
1328         if (force)
1329                 set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
1330         set_bit(BCH_DEV_REMOVING, &ca->flags);
1331         bch_notify_dev_removing(ca);
1332
1333         mutex_unlock(&bch_register_lock);
1334
1335         /* Migrate the data and finish removal asynchronously: */
1336
1337         queue_work(system_long_wq, &ca->remove_work);
1338         return true;
1339 }
1340
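/* Create the sysfs links between a member device and its cache set: */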
1341 static int bch_dev_online(struct cache *ca)
1342 {
1343         char buf[12];
1344
1345         lockdep_assert_held(&bch_register_lock);
1346
1347         sprintf(buf, "cache%u", ca->dev_idx);
1348
1349         if (kobject_add(&ca->kobj,
1350                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1351                         "bcache") ||
1352             sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
1353             sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
1354                 return -1;
1355
1356         return 0;
1357 }
1358
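/*
 * Allocate a struct cache for one member device: take over the superblock
 * handle, copy in the member info, size and allocate the freelists, bucket
 * arrays and per-cpu stats, and attach the device to the cache set.
 */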
1359 static const char *bch_dev_alloc(struct bcache_superblock *sb,
1360                                  struct cache_set *c,
1361                                  struct cache **ret)
1362 {
1363         struct bch_member *member;
1364         size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1365         size_t heap_size;
1366         unsigned i;
1367         const char *err = "cannot allocate memory";
1368         struct cache *ca;
1369
1370         if (c->sb.nr_devices == 1)
1371                 bdevname(sb->bdev, c->name);
1372
1373         if (bch_fs_init_fault("dev_alloc"))
1374                 return err;
1375
1376         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1377         if (!ca)
1378                 return err;
1379
1380         if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release,
1381                             0, GFP_KERNEL)) {
1382                 kfree(ca);
1383                 return err;
1384         }
1385
1386         kobject_init(&ca->kobj, &bch_dev_ktype);
1387
1388         spin_lock_init(&ca->self.lock);
1389         ca->self.nr_devices = 1;
1390         rcu_assign_pointer(ca->self.d[0].dev, ca);
1391         ca->dev_idx = sb->sb->dev_idx;
1392
1393         INIT_WORK(&ca->free_work, bch_dev_free_work);
1394         INIT_WORK(&ca->remove_work, bch_dev_remove_work);
1395         spin_lock_init(&ca->freelist_lock);
1396         spin_lock_init(&ca->prio_buckets_lock);
1397         mutex_init(&ca->heap_lock);
1398         bch_moving_init_cache(ca);
1399
1400         ca->disk_sb = *sb;
1401         ca->disk_sb.bdev->bd_holder = ca;
1402         memset(sb, 0, sizeof(*sb));
1403
1404         INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1405
1406         err = "dynamic fault";
1407         if (bch_fs_init_fault("dev_alloc"))
1408                 goto err;
1409
1410         member = bch_sb_get_members(ca->disk_sb.sb)->members +
1411                 ca->disk_sb.sb->dev_idx;
1412
1413         ca->mi = cache_mi_to_cpu_mi(member);
1414         ca->uuid = member->uuid;
1415         ca->bucket_bits = ilog2(ca->mi.bucket_size);
1416
1417         /* XXX: tune these */
1418         movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1419         reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1420         /*
1421          * free_inc must be smaller than the copygc reserve: if it was bigger,
1422          * one copygc iteration might not make enough buckets available to fill
1423          * up free_inc and allow the allocator to make forward progress
1424          */
1425         free_inc_reserve = movinggc_reserve / 2;
1426         heap_size = movinggc_reserve * 8;
1427
1428         if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1429             !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1430             !init_fifo(&ca->free[RESERVE_MOVINGGC],
1431                        movinggc_reserve, GFP_KERNEL) ||
1432             !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1433             !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
1434             !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
1435             !(ca->oldest_gens   = vzalloc(sizeof(u8) *
1436                                           ca->mi.nbuckets)) ||
1437             !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1438                                           ca->mi.nbuckets)) ||
1439             !(ca->prio_buckets  = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1440                                           2, GFP_KERNEL)) ||
1441             !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1442             !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
1443             !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
1444             bioset_init(&ca->replica_set, 4,
1445                         offsetof(struct bch_write_bio, bio)) ||
1446             !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
1447             bch_journal_init_cache(ca))
1448                 goto err;
1449
1450         ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1451
1452         total_reserve = ca->free_inc.size;
1453         for (i = 0; i < RESERVE_NR; i++)
1454                 total_reserve += ca->free[i].size;
1455         pr_debug("%zu buckets reserved", total_reserve);
1456
1457         ca->copygc_write_point.group = &ca->self;
1458         ca->tiering_write_point.group = &ca->self;
1459
1460         /*
1461          * Increase journal write timeout if flushes to this device are
1462          * expensive:
1463          */
1464         if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
1465             journal_flushes_device(ca))
1466                 c->journal.write_delay_ms =
1467                         max(c->journal.write_delay_ms, 1000U);
1468
1469         kobject_get(&c->kobj);
1470         ca->set = c;
1471
1472         kobject_get(&ca->kobj);
1473         rcu_assign_pointer(c->cache[ca->dev_idx], ca);
1474
1475         mutex_lock(&c->sb_lock);
1476
1477         if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq))
1478                 bch_sb_to_cache_set(c, ca->disk_sb.sb);
1479
1480         mutex_unlock(&c->sb_lock);
1481
1482         err = "error creating kobject";
1483         if (c->kobj.state_in_sysfs &&
1484             bch_dev_online(ca))
1485                 goto err;
1486
1487         if (ret)
1488                 *ret = ca;
1489         else
1490                 kobject_put(&ca->kobj);
1491         return NULL;
1492 err:
1493         bch_dev_stop(ca);
1494         return err;
1495 }
1496
1497 static struct cache_set *bch_fs_lookup(uuid_le uuid)
1498 {
1499         struct cache_set *c;
1500
1501         lockdep_assert_held(&bch_register_lock);
1502
1503         list_for_each_entry(c, &bch_fs_list, list)
1504                 if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
1505                         return c;
1506
1507         return NULL;
1508 }
1509
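/*
 * Hot-add a device to a running cache set: validate its superblock, find a
 * free slot in the member array (or grow it), allocate the struct cache,
 * write out the updated superblock, allocate journal buckets, and bring the
 * device read-write if its member state is active.
 */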
1510 int bch_dev_add(struct cache_set *c, const char *path)
1511 {
1512         struct bcache_superblock sb;
1513         const char *err;
1514         struct cache *ca;
1515         struct bch_sb_field *f;
1516         struct bch_sb_field_members *mi, *dev_mi;
1517         struct bch_member saved_mi;
1518         unsigned dev_idx, nr_devices, u64s;
1519         int ret = -EINVAL;
1520
1521         mutex_lock(&bch_register_lock);
1522
1523         err = bch_read_super(&sb, c->opts, path);
1524         if (err)
1525                 goto err_unlock_register;
1526
1527         err = bch_validate_cache_super(&sb);
1528         if (err)
1529                 goto err_unlock_register;
1530
1531         mutex_lock(&c->sb_lock);
1532
1533         err = bch_dev_may_add(sb.sb, c);
1534         if (err)
1535                 goto err_unlock;
1536
1537         /*
1538          * Preserve the old cache member information (esp. tier)
1539          * before we start bashing the disk stuff.
1540          */
1541         dev_mi = bch_sb_get_members(sb.sb);
1542         saved_mi = dev_mi->members[sb.sb->dev_idx];
1543         saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
1544
1545         down_read(&c->gc_lock);
1546
1547         if (dynamic_fault("bcache:add:no_slot"))
1548                 goto no_slot;
1549
1550         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
1551                 goto no_slot;
1552
1553         mi = bch_sb_get_members(c->disk_sb);
1554         for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1555                 if (dev_idx >= c->sb.nr_devices ||
1556                     bch_is_zero(mi->members[dev_idx].uuid.b,
1557                                  sizeof(uuid_le)))
1558                         goto have_slot;
1559 no_slot:
1560         up_read(&c->gc_lock);
1561
1562         err = "no slots available in superblock";
1563         ret = -ENOSPC;
1564         goto err_unlock;
1565
1566 have_slot:
1567         up_read(&c->gc_lock);
1568
1569         nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1570         u64s = (sizeof(struct bch_sb_field_members) +
1571                 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1572         err = "no space in superblock for member info";
1573
1574         f = bch_fs_sb_field_resize(c, &mi->field, u64s);
1575         if (!f)
1576                 goto err_unlock;
1577
1578         mi = container_of(f, struct bch_sb_field_members, field);
1579
1580         f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
1581         if (!f)
1582                 goto err_unlock;
1583
1584         dev_mi = container_of(f, struct bch_sb_field_members, field);
1585         memcpy(dev_mi, mi, u64s * sizeof(u64));
1586         dev_mi->members[dev_idx] = saved_mi;
1587
1588         sb.sb->dev_idx          = dev_idx;
1589         sb.sb->nr_devices       = nr_devices;
1590
1591         if (bch_fs_mi_update(c, dev_mi->members, nr_devices)) {
1592                 err = "cannot allocate memory";
1593                 ret = -ENOMEM;
1594                 goto err_unlock;
1595         }
1596
1597         /* commit new member info */
1598         memcpy(mi, dev_mi, u64s * sizeof(u64));
1599         c->disk_sb->nr_devices  = nr_devices;
1600         c->sb.nr_devices        = nr_devices;
1601
1602         err = bch_dev_alloc(&sb, c, &ca);
1603         if (err)
1604                 goto err_unlock;
1605
1606         bch_write_super(c);
1607
1608         err = "journal alloc failed";
1609         if (bch_dev_journal_alloc(ca))
1610                 goto err_put;
1611
1612         bch_notify_dev_added(ca);
1613
1614         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
1615                 err = __bch_dev_read_write(c, ca);
1616                 if (err)
1617                         goto err_put;
1618         }
1619
1620         kobject_put(&ca->kobj);
1621         mutex_unlock(&c->sb_lock);
1622         mutex_unlock(&bch_register_lock);
1623         return 0;
1624 err_put:
1625         bch_dev_stop(ca);
1626 err_unlock:
1627         mutex_unlock(&c->sb_lock);
1628 err_unlock_register:
1629         mutex_unlock(&bch_register_lock);
1630         bch_free_super(&sb);
1631
1632         bch_err(c, "Unable to add device: %s", err);
1633         return ret ?: -EINVAL;
1634 }
1635
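/*
 * bch_fs_open - open a cache set from an explicit list of member devices
 *
 * All superblocks are read and validated under bch_register_lock so that the
 * exclusive opens are atomic with the duplicate-UUID check. The cache set is
 * allocated from the first superblock, every device is attached, and the
 * filesystem is only started once all members listed in the superblock are
 * present. On success, if @ret is non-NULL the caller receives a reference
 * to the new cache_set.
 */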
1636 const char *bch_fs_open(char * const *devices, unsigned nr_devices,
1637                         struct bch_opts opts, struct cache_set **ret)
1638 {
1639         const char *err;
1640         struct cache_set *c = NULL;
1641         struct bcache_superblock *sb;
1643         unsigned i;
1644
1647         if (!nr_devices)
1648                 return "need at least one device";
1649
1650         if (!try_module_get(THIS_MODULE))
1651                 return "module unloading";
1652
1653         err = "cannot allocate memory";
1654         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1655         if (!sb)
1656                 goto err;
1657
1658         /*
1659          * bch_read_super() needs to happen under register_lock, so that the
1660          * exclusive open is atomic with adding the new cache set to the list of
1661          * cache sets:
1662          */
1663         mutex_lock(&bch_register_lock);
1664
1665         for (i = 0; i < nr_devices; i++) {
1666                 err = bch_read_super(&sb[i], opts, devices[i]);
1667                 if (err)
1668                         goto err_unlock;
1669
1670                 err = "attempting to register backing device";
1671                 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
1672                         goto err_unlock;
1673
1674                 err = bch_validate_cache_super(&sb[i]);
1675                 if (err)
1676                         goto err_unlock;
1677         }
1678
1679         err = "cache set already registered";
1680         if (bch_fs_lookup(sb[0].sb->uuid))
1681                 goto err_unlock;
1682
1683         err = "cannot allocate memory";
1684         c = bch_fs_alloc(sb[0].sb, opts);
1685         if (!c)
1686                 goto err_unlock;
1687
1688         for (i = 0; i < nr_devices; i++) {
1689                 err = bch_dev_alloc(&sb[i], c, NULL);
1690                 if (err)
1691                         goto err_unlock;
1692         }
1693
1694         err = "insufficient devices";
1695         if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
1696                 goto err_unlock;
1697
1698         err = bch_fs_start(c);
1699         if (err)
1700                 goto err_unlock;
1701
1702         err = "error creating kobject";
1703         if (bch_fs_online(c))
1704                 goto err_unlock;
1705
1706         if (ret) {
1707                 closure_get(&c->cl);
1708                 *ret = c;
1709         }
1710
1711         mutex_unlock(&bch_register_lock);
1712
1713         err = NULL;
1714 out:
1715         kfree(sb);
1716         module_put(THIS_MODULE);
1717         if (err)
1718                 c = NULL;
1719         return err;
1720 err_unlock:
1721         if (c)
1722                 bch_fs_stop(c);
1723         mutex_unlock(&bch_register_lock);
1724 err:
1725         for (i = 0; i < nr_devices; i++)
1726                 bch_free_super(&sb[i]);
1727         goto out;
1728 }
1729
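/*
 * Register a single cache device whose superblock has already been read:
 * attach it to the existing cache set with a matching UUID, or allocate a
 * new one. The set is only started once all of its member devices have
 * appeared; until then bch_fs_online() just creates its kobject. On error,
 * the cache set is torn down only if it was allocated here.
 */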
1730 static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
1731                                   struct bch_opts opts)
1732 {
1733         char name[BDEVNAME_SIZE];
1734         const char *err;
1735         struct cache_set *c;
1736         bool allocated_cache_set = false;
1737
1738         err = bch_validate_cache_super(sb);
1739         if (err)
1740                 return err;
1741
1742         bdevname(sb->bdev, name);
1743
1744         c = bch_fs_lookup(sb->sb->uuid);
1745         if (c) {
1746                 err = bch_dev_in_fs(sb->sb, c);
1747                 if (err)
1748                         return err;
1749         } else {
1750                 c = bch_fs_alloc(sb->sb, opts);
1751                 if (!c)
1752                         return "cannot allocate memory";
1753
1754                 allocated_cache_set = true;
1755         }
1756
1757         err = bch_dev_alloc(sb, c, NULL);
1758         if (err)
1759                 goto err;
1760
1761         if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) {
1762                 err = bch_fs_start(c);
1763                 if (err)
1764                         goto err;
1765         } else {
1766                 err = "error creating kobject";
1767                 if (bch_fs_online(c))
1768                         goto err;
1769         }
1770
1771         bch_info(c, "started");
1772         return NULL;
1773 err:
1774         if (allocated_cache_set)
1775                 bch_fs_stop(c);
1776         return err;
1777 }
1778
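/*
 * bch_fs_open_incremental - register one device by path
 *
 * Entry point for incremental (one-device-at-a-time) registration: reads the
 * superblock and dispatches to either the backing-device path or
 * __bch_fs_open_incremental() for cache devices.
 */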
1779 const char *bch_fs_open_incremental(const char *path)
1780 {
1781         struct bcache_superblock sb;
1782         struct bch_opts opts = bch_opts_empty();
1783         const char *err;
1784
1785         mutex_lock(&bch_register_lock);
1786
1787         err = bch_read_super(&sb, opts, path);
1788         if (err)
1789                 goto err;
1790
1791         if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
1792                 err = bch_backing_dev_register(&sb);
1793         else
1794                 err = __bch_fs_open_incremental(&sb, opts);
1795
1796         bch_free_super(&sb);
1797 err:
1798         mutex_unlock(&bch_register_lock);
1799         return err;
1800 }
1801
1802 /* Global interfaces/init */
1803
1804 #define kobj_attribute_write(n, fn)                                     \
1805         static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
1806
1807 #define kobj_attribute_rw(n, show, store)                               \
1808         static struct kobj_attribute ksysfs_##n =                       \
1809                 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1810
1811 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1812                                const char *, size_t);
1813
1814 kobj_attribute_write(register,          register_bcache);
1815 kobj_attribute_write(register_quiet,    register_bcache);
1816
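/*
 * Write handler for the "register" and "register_quiet" sysfs attributes
 * created in bcache_init(); both currently share this handler. Userspace
 * writes a device path to trigger registration, e.g. (illustrative path):
 *
 *   echo /dev/sdb > /sys/fs/bcache/register
 */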
1817 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1818                                const char *buffer, size_t size)
1819 {
1820         ssize_t ret = -EINVAL;
1821         const char *err = "cannot allocate memory";
1822         char *path = NULL;
1823
1824         if (!try_module_get(THIS_MODULE))
1825                 return -EBUSY;
1826
1827         if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
1828                 goto err;
1829
1830         err = bch_fs_open_incremental(strim(path));
1831         if (err)
1832                 goto err;
1833
1834         ret = size;
1835 out:
1836         kfree(path);
1837         module_put(THIS_MODULE);
1838         return ret;
1839 err:
1840         pr_err("error opening %s: %s\n", path, err);
1841         goto out;
1842 }
1843
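/*
 * Reboot notifier: on shutdown, halt or power-off, flip every registered
 * cache set read-only and wait for that to finish, so the on-disk state is
 * consistent before the machine goes down. Also exercised via the "reboot"
 * sysfs attribute below for testing.
 */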
1844 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1845 {
1846         if (code == SYS_DOWN ||
1847             code == SYS_HALT ||
1848             code == SYS_POWER_OFF) {
1849                 struct cache_set *c;
1850
1851                 mutex_lock(&bch_register_lock);
1852
1853                 if (!list_empty(&bch_fs_list))
1854                         pr_info("Setting all devices read only:\n");
1855
1856                 list_for_each_entry(c, &bch_fs_list, list)
1857                         bch_fs_read_only(c);
1858
1859                 list_for_each_entry(c, &bch_fs_list, list)
1860                         bch_fs_read_only_sync(c);
1861
1862                 mutex_unlock(&bch_register_lock);
1863         }
1864
1865         return NOTIFY_DONE;
1866 }
1867
1868 static struct notifier_block reboot = {
1869         .notifier_call  = bcache_reboot,
1870         .priority       = INT_MAX, /* before any real devices */
1871 };
1872
1873 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
1874                            const char *buffer, size_t size)
1875 {
1876         bcache_reboot(NULL, SYS_DOWN, NULL);
1877         return size;
1878 }
1879
1880 kobj_attribute_write(reboot,            reboot_test);
1881
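/*
 * Tear down global state in the reverse order it was set up; this is also
 * the error path for bcache_init(), so every step must tolerate its
 * counterpart not having run.
 */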
1882 static void bcache_exit(void)
1883 {
1884         bch_debug_exit();
1885         bch_fs_exit();
1886         bch_blockdev_exit();
1887         bch_chardev_exit();
1888         if (bcache_kset)
1889                 kset_unregister(bcache_kset);
1890         if (bcache_io_wq)
1891                 destroy_workqueue(bcache_io_wq);
1892         if (!IS_ERR_OR_NULL(bch_sha256))
1893                 crypto_free_shash(bch_sha256);
1894         unregister_reboot_notifier(&reboot);
1895 }
1896
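/*
 * Module init: set up global state (register lock, reboot notifier, sha256
 * shash, I/O workqueue), create the /sys/fs/bcache kset and its sysfs
 * attributes, then initialize the chardev, blockdev, fs and debug
 * subsystems. Any failure unwinds through bcache_exit().
 */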
1897 static int __init bcache_init(void)
1898 {
1899         static const struct attribute *files[] = {
1900                 &ksysfs_register.attr,
1901                 &ksysfs_register_quiet.attr,
1902                 &ksysfs_reboot.attr,
1903                 NULL
1904         };
1905
1906         mutex_init(&bch_register_lock);
1907         register_reboot_notifier(&reboot);
1908         closure_debug_init();
1909         bkey_pack_test();
1910
1911         bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
1912         if (IS_ERR(bch_sha256))
1913                 goto err;
1914
1915         if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
1916             !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
1917             sysfs_create_files(&bcache_kset->kobj, files) ||
1918             bch_chardev_init() ||
1919             bch_blockdev_init() ||
1920             bch_fs_init() ||
1921             bch_debug_init())
1922                 goto err;
1923
1924         return 0;
1925 err:
1926         bcache_exit();
1927         return -ENOMEM;
1928 }
1929
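/*
 * Expand each BCH_DEBUG_PARAMS() entry into a bool module parameter
 * (mode 0644), tunable at runtime under /sys/module/<module>/parameters/.
 */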
1930 #define BCH_DEBUG_PARAM(name, description)                      \
1931         bool bch_##name;                                        \
1932         module_param_named(name, bch_##name, bool, 0644);       \
1933         MODULE_PARM_DESC(name, description);
1934 BCH_DEBUG_PARAMS()
1935 #undef BCH_DEBUG_PARAM
1936
1937 module_exit(bcache_exit);
1938 module_init(bcache_init);