1 /*
2  * bcache setup/teardown code, and some metadata io - read a superblock and
3  * figure out what to do with it.
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcache.h"
10 #include "blockdev.h"
11 #include "alloc.h"
12 #include "btree_cache.h"
13 #include "btree_gc.h"
14 #include "btree_update.h"
15 #include "btree_io.h"
16 #include "chardev.h"
17 #include "checksum.h"
18 #include "clock.h"
19 #include "compress.h"
20 #include "debug.h"
21 #include "error.h"
22 #include "fs.h"
23 #include "fs-gc.h"
24 #include "inode.h"
25 #include "io.h"
26 #include "journal.h"
27 #include "keylist.h"
28 #include "move.h"
29 #include "migrate.h"
30 #include "movinggc.h"
31 #include "notify.h"
32 #include "stats.h"
33 #include "super.h"
34 #include "super-io.h"
35 #include "tier.h"
36 #include "writeback.h"
37
38 #include <linux/backing-dev.h>
39 #include <linux/blkdev.h>
40 #include <linux/debugfs.h>
41 #include <linux/device.h>
42 #include <linux/genhd.h>
43 #include <linux/idr.h>
44 #include <linux/kthread.h>
45 #include <linux/module.h>
46 #include <linux/percpu.h>
47 #include <linux/random.h>
48 #include <linux/reboot.h>
49 #include <linux/sysfs.h>
50 #include <crypto/hash.h>
51
52 #include <trace/events/bcache.h>
53
54 MODULE_LICENSE("GPL");
55 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
56
57 static const uuid_le invalid_uuid = {
58         .b = {
59                 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
60                 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
61         }
62 };
63
64 static struct kset *bcache_kset;
65 static LIST_HEAD(bch_fs_list);
66 static DEFINE_MUTEX(bch_fs_list_lock);
67
68 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
69 struct workqueue_struct *bcache_io_wq;
70 struct crypto_shash *bch_sha256;
71
72 static void bch_dev_free(struct bch_dev *);
73 static int bch_dev_alloc(struct bch_fs *, unsigned);
74 static int bch_dev_sysfs_online(struct bch_dev *);
75 static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *);
76
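/*
 * Map a block device to the filesystem that has it open as a member device;
 * takes a ref on the filesystem's closure, which the caller is expected to
 * drop:
 */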
77 struct bch_fs *bch_bdev_to_fs(struct block_device *bdev)
78 {
79         struct bch_fs *c;
80         struct bch_dev *ca;
81         unsigned i;
82
83         mutex_lock(&bch_fs_list_lock);
84         rcu_read_lock();
85
86         list_for_each_entry(c, &bch_fs_list, list)
87                 for_each_member_device_rcu(ca, c, i)
88                         if (ca->disk_sb.bdev == bdev) {
89                                 closure_get(&c->cl);
90                                 goto found;
91                         }
92         c = NULL;
93 found:
94         rcu_read_unlock();
95         mutex_unlock(&bch_fs_list_lock);
96
97         return c;
98 }
99
100 static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid)
101 {
102         struct bch_fs *c;
103
104         lockdep_assert_held(&bch_fs_list_lock);
105
106         list_for_each_entry(c, &bch_fs_list, list)
107                 if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
108                         return c;
109
110         return NULL;
111 }
112
113 struct bch_fs *bch_uuid_to_fs(uuid_le uuid)
114 {
115         struct bch_fs *c;
116
117         mutex_lock(&bch_fs_list_lock);
118         c = __bch_uuid_to_fs(uuid);
119         if (c)
120                 closure_get(&c->cl);
121         mutex_unlock(&bch_fs_list_lock);
122
123         return c;
124 }
125
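/*
 * Congestion check used by the BDI congested_fn callback below: reads check
 * every readable member device, writes only check the fastest tier:
 */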
126 int bch_congested(struct bch_fs *c, int bdi_bits)
127 {
128         struct backing_dev_info *bdi;
129         struct bch_dev *ca;
130         unsigned i;
131         int ret = 0;
132
133         if (bdi_bits & (1 << WB_sync_congested)) {
134                 /* Reads - check all devices: */
135                 for_each_readable_member(ca, c, i) {
136                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
137
138                         if (bdi_congested(bdi, bdi_bits)) {
139                                 ret = 1;
140                                 break;
141                         }
142                 }
143         } else {
144                 /* Writes prefer fastest tier: */
145                 struct bch_tier *tier = READ_ONCE(c->fastest_tier);
146                 struct dev_group *grp = tier ? &tier->devs : &c->all_devs;
147
148                 rcu_read_lock();
149                 group_for_each_dev(ca, grp, i) {
150                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
151
152                         if (bdi_congested(bdi, bdi_bits)) {
153                                 ret = 1;
154                                 break;
155                         }
156                 }
157                 rcu_read_unlock();
158         }
159
160         return ret;
161 }
162
163 static int bch_congested_fn(void *data, int bdi_bits)
164 {
165         struct bch_fs *c = data;
166
167         return bch_congested(c, bdi_bits);
168 }
169
170 /* Filesystem RO/RW: */
171
172 /*
173  * For startup/shutdown of RW stuff, the dependencies are:
174  *
175  * - foreground writes depend on copygc and tiering (to free up space)
176  *
177  * - copygc and tiering depend on mark and sweep gc (in practice they probably
178  *   don't, because they either reserve space ahead of time or don't block if
179  *   allocations fail; but allocations can require mark and sweep gc to run
180  *   because of generation number wraparound)
181  *
182  * - all of the above depends on the allocator threads
183  *
184  * - allocator depends on the journal (when it rewrites prios and gens)
185  */
186
187 static void __bch_fs_read_only(struct bch_fs *c)
188 {
189         struct bch_dev *ca;
190         unsigned i;
191
192         bch_tiering_stop(c);
193
194         for_each_member_device(ca, c, i)
195                 bch_moving_gc_stop(ca);
196
197         bch_gc_thread_stop(c);
198
199         bch_btree_flush(c);
200
201         for_each_member_device(ca, c, i)
202                 bch_dev_allocator_stop(ca);
203
204         bch_fs_journal_stop(&c->journal);
205 }
206
207 static void bch_writes_disabled(struct percpu_ref *writes)
208 {
209         struct bch_fs *c = container_of(writes, struct bch_fs, writes);
210
211         set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
212         wake_up(&bch_read_only_wait);
213 }
214
215 void bch_fs_read_only(struct bch_fs *c)
216 {
217         mutex_lock(&c->state_lock);
218         if (c->state != BCH_FS_STARTING &&
219             c->state != BCH_FS_RW)
220                 goto out;
221
222         if (test_bit(BCH_FS_ERROR, &c->flags))
223                 goto out;
224
225         trace_fs_read_only(c);
226
227         /*
228          * Block new foreground-end write operations from starting - any new
229          * writes will return -EROFS:
230          *
231          * (This is really blocking new _allocations_; writes to previously
232          * allocated space can still happen until we stop the allocator in
233          * bch_dev_allocator_stop()).
234          */
235         percpu_ref_kill(&c->writes);
236
237         del_timer(&c->foreground_write_wakeup);
238         cancel_delayed_work(&c->pd_controllers_update);
239
240         c->foreground_write_pd.rate.rate = UINT_MAX;
241         bch_wake_delayed_writes((unsigned long) c);
242
243         /*
244          * If we're not doing an emergency shutdown, we want to wait on
245          * outstanding writes to complete so they don't see spurious errors due
246          * to shutting down the allocator:
247          *
248          * If we are doing an emergency shutdown, outstanding writes may
249          * hang until we shut down the allocator, so we don't want to wait
250          * on outstanding writes before shutting everything down - but
251          * we do need to wait on them before returning and signalling
252          * that going RO is complete:
253          */
254         wait_event(bch_read_only_wait,
255                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
256                    test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
257
258         __bch_fs_read_only(c);
259
260         wait_event(bch_read_only_wait,
261                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
262
263         clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
264
265         if (!bch_journal_error(&c->journal) &&
266             !test_bit(BCH_FS_ERROR, &c->flags)) {
267                 mutex_lock(&c->sb_lock);
268                 SET_BCH_SB_CLEAN(c->disk_sb, true);
269                 bch_write_super(c);
270                 mutex_unlock(&c->sb_lock);
271         }
272
273         c->state = BCH_FS_RO;
274         bch_notify_fs_read_only(c);
275         trace_fs_read_only_done(c);
276 out:
277         mutex_unlock(&c->state_lock);
278 }
279
280 static void bch_fs_read_only_work(struct work_struct *work)
281 {
282         struct bch_fs *c =
283                 container_of(work, struct bch_fs, read_only_work);
284
285         bch_fs_read_only(c);
286 }
287
288 static void bch_fs_read_only_async(struct bch_fs *c)
289 {
290         queue_work(system_long_wq, &c->read_only_work);
291 }
292
293 bool bch_fs_emergency_read_only(struct bch_fs *c)
294 {
295         bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
296
297         bch_fs_read_only_async(c);
298         bch_journal_halt(&c->journal);
299
300         wake_up(&bch_read_only_wait);
301         return ret;
302 }
303
304 const char *bch_fs_read_write(struct bch_fs *c)
305 {
306         struct bch_dev *ca;
307         const char *err = NULL;
308         unsigned i;
309
310         mutex_lock(&c->state_lock);
311         if (c->state != BCH_FS_STARTING &&
312             c->state != BCH_FS_RO)
313                 goto out;
314
315         err = "error starting allocator thread";
316         for_each_rw_member(ca, c, i)
317                 if (bch_dev_allocator_start(ca)) {
318                         percpu_ref_put(&ca->io_ref);
319                         goto err;
320                 }
321
322         err = "error starting btree GC thread";
323         if (bch_gc_thread_start(c))
324                 goto err;
325
326         err = "error starting moving GC thread";
327         for_each_rw_member(ca, c, i)
328                 if (bch_moving_gc_start(ca)) {
329                         percpu_ref_put(&ca->io_ref);
330                         goto err;
331                 }
332
333         err = "error starting tiering thread";
334         if (bch_tiering_start(c))
335                 goto err;
336
337         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
338
339         if (c->state != BCH_FS_STARTING)
340                 percpu_ref_reinit(&c->writes);
341
342         c->state = BCH_FS_RW;
343         err = NULL;
344 out:
345         mutex_unlock(&c->state_lock);
346         return err;
347 err:
348         __bch_fs_read_only(c);
349         goto out;
350 }
351
352 /* Filesystem startup/shutdown: */
353
354 static void bch_fs_free(struct bch_fs *c)
355 {
356         bch_fs_encryption_exit(c);
357         bch_fs_btree_exit(c);
358         bch_fs_journal_exit(&c->journal);
359         bch_io_clock_exit(&c->io_clock[WRITE]);
360         bch_io_clock_exit(&c->io_clock[READ]);
361         bch_fs_compress_exit(c);
362         bch_fs_blockdev_exit(c);
363         bdi_destroy(&c->bdi);
364         lg_lock_free(&c->usage_lock);
365         free_percpu(c->usage_percpu);
366         mempool_exit(&c->btree_bounce_pool);
367         mempool_exit(&c->bio_bounce_pages);
368         bioset_exit(&c->bio_write);
369         bioset_exit(&c->bio_read_split);
370         bioset_exit(&c->bio_read);
371         bioset_exit(&c->btree_read_bio);
372         mempool_exit(&c->btree_interior_update_pool);
373         mempool_exit(&c->btree_reserve_pool);
374         mempool_exit(&c->fill_iter);
375         percpu_ref_exit(&c->writes);
376
377         if (c->copygc_wq)
378                 destroy_workqueue(c->copygc_wq);
379         if (c->wq)
380                 destroy_workqueue(c->wq);
381
382         free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
383         kfree(c);
384         module_put(THIS_MODULE);
385 }
386
387 static void bch_fs_exit(struct bch_fs *c)
388 {
389         unsigned i;
390
391         del_timer_sync(&c->foreground_write_wakeup);
392         cancel_delayed_work_sync(&c->pd_controllers_update);
393         cancel_work_sync(&c->read_only_work);
394         cancel_work_sync(&c->bio_submit_work);
395         cancel_work_sync(&c->read_retry_work);
396
397         for (i = 0; i < c->sb.nr_devices; i++)
398                 if (c->devs[i])
399                         bch_dev_free(c->devs[i]);
400
401         closure_debug_destroy(&c->cl);
402         kobject_put(&c->kobj);
403 }
404
405 static void bch_fs_offline(struct bch_fs *c)
406 {
407         struct bch_dev *ca;
408         unsigned i;
409
410         mutex_lock(&bch_fs_list_lock);
411         list_del(&c->list);
412         mutex_unlock(&bch_fs_list_lock);
413
414         for_each_member_device(ca, c, i)
415                 if (ca->kobj.state_in_sysfs &&
416                     ca->disk_sb.bdev)
417                         sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
418                                           "bcache");
419
420         if (c->kobj.state_in_sysfs)
421                 kobject_del(&c->kobj);
422
423         bch_fs_debug_exit(c);
424         bch_fs_chardev_exit(c);
425
426         bch_cache_accounting_destroy(&c->accounting);
427
428         kobject_put(&c->time_stats);
429         kobject_put(&c->opts_dir);
430         kobject_put(&c->internal);
431
432         __bch_fs_read_only(c);
433 }
434
435 /*
436  * Final phase of shutdown (logically __bch_fs_stop4) - block devices are
437  * closed, now we can finally free the filesystem:
438  */
439 void bch_fs_release(struct kobject *kobj)
440 {
441         struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
442
443         bch_notify_fs_stopped(c);
444         bch_fs_free(c);
445 }
446
447 /*
448  * All activity on the filesystem should have stopped now - close devices:
449  */
450 static void __bch_fs_stop3(struct closure *cl)
451 {
452         struct bch_fs *c = container_of(cl, struct bch_fs, cl);
453
454         bch_fs_exit(c);
455 }
456
457 /*
458  * Openers (i.e. block devices) should have exited; shut down all userspace
459  * interfaces and wait for &c->cl to hit 0
460  */
461 static void __bch_fs_stop2(struct closure *cl)
462 {
463         struct bch_fs *c = container_of(cl, struct bch_fs, caching);
464
465         bch_fs_offline(c);
466
467         closure_return(cl);
468 }
469
470 /*
471  * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
472  * we haven't waited for anything to stop yet; we're just punting to process
473  * context to shut down block devices:
474  */
475 static void __bch_fs_stop1(struct closure *cl)
476 {
477         struct bch_fs *c = container_of(cl, struct bch_fs, caching);
478
479         bch_blockdevs_stop(c);
480
481         continue_at(cl, __bch_fs_stop2, system_wq);
482 }
483
484 void bch_fs_stop_async(struct bch_fs *c)
485 {
486         mutex_lock(&c->state_lock);
487         if (c->state != BCH_FS_STOPPING) {
488                 c->state = BCH_FS_STOPPING;
489                 closure_queue(&c->caching);
490         }
491         mutex_unlock(&c->state_lock);
492 }
493
494 void bch_fs_stop(struct bch_fs *c)
495 {
496         mutex_lock(&c->state_lock);
497         BUG_ON(c->state == BCH_FS_STOPPING);
498         c->state = BCH_FS_STOPPING;
499         mutex_unlock(&c->state_lock);
500
501         bch_blockdevs_stop(c);
502
503         closure_sync(&c->caching);
504         closure_debug_destroy(&c->caching);
505
506         bch_fs_offline(c);
507
508         closure_put(&c->cl);
509         closure_sync(&c->cl);
510
511         bch_fs_exit(c);
512 }
513
514 /* Stop, detaching from backing devices: */
515 void bch_fs_detach(struct bch_fs *c)
516 {
517         if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
518                 bch_fs_stop_async(c);
519 }
520
521 #define alloc_bucket_pages(gfp, ca)                     \
522         ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
523
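/*
 * Allocate and initialize an in-memory filesystem (struct bch_fs) from an
 * on-disk superblock; returns NULL on allocation or initialization failure:
 */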
524 static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
525 {
526         struct bch_sb_field_members *mi;
527         struct bch_fs *c;
528         unsigned i, iter_size, journal_entry_bytes;
529
530         c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
531         if (!c)
532                 return NULL;
533
534         __module_get(THIS_MODULE);
535
536         c->minor                = -1;
537
538         mutex_init(&c->state_lock);
539         mutex_init(&c->sb_lock);
540         INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
541         mutex_init(&c->btree_cache_lock);
542         mutex_init(&c->bucket_lock);
543         mutex_init(&c->btree_root_lock);
544         INIT_WORK(&c->read_only_work, bch_fs_read_only_work);
545
546         init_rwsem(&c->gc_lock);
547
548 #define BCH_TIME_STAT(name, frequency_units, duration_units)            \
549         spin_lock_init(&c->name##_time.lock);
550         BCH_TIME_STATS()
551 #undef BCH_TIME_STAT
552
553         bch_fs_allocator_init(c);
554         bch_fs_tiering_init(c);
555
556         INIT_LIST_HEAD(&c->list);
557         INIT_LIST_HEAD(&c->cached_devs);
558         INIT_LIST_HEAD(&c->btree_cache);
559         INIT_LIST_HEAD(&c->btree_cache_freeable);
560         INIT_LIST_HEAD(&c->btree_cache_freed);
561
562         INIT_LIST_HEAD(&c->btree_interior_update_list);
563         mutex_init(&c->btree_reserve_cache_lock);
564         mutex_init(&c->btree_interior_update_lock);
565
566         mutex_init(&c->bio_bounce_pages_lock);
567         INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
568         spin_lock_init(&c->bio_submit_lock);
569         bio_list_init(&c->read_retry_list);
570         spin_lock_init(&c->read_retry_lock);
571         INIT_WORK(&c->read_retry_work, bch_read_retry_work);
572         mutex_init(&c->zlib_workspace_lock);
573
574         seqcount_init(&c->gc_pos_lock);
575
576         c->prio_clock[READ].hand = 1;
577         c->prio_clock[READ].min_prio = 0;
578         c->prio_clock[WRITE].hand = 1;
579         c->prio_clock[WRITE].min_prio = 0;
580
581         c->congested_read_threshold_us  = 2000;
582         c->congested_write_threshold_us = 20000;
583         c->error_limit  = 16 << IO_ERROR_SHIFT;
584         init_waitqueue_head(&c->writeback_wait);
585
586         c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
587
588         c->copy_gc_enabled = 1;
589         c->tiering_enabled = 1;
590         c->tiering_percent = 10;
591
592         c->foreground_target_percent = 20;
593
594         c->journal.write_time   = &c->journal_write_time;
595         c->journal.delay_time   = &c->journal_delay_time;
596         c->journal.blocked_time = &c->journal_blocked_time;
597         c->journal.flush_seq_time = &c->journal_flush_seq_time;
598
599         mutex_init(&c->uevent_lock);
600
601         mutex_lock(&c->sb_lock);
602
603         if (bch_sb_to_fs(c, sb)) {
604                 mutex_unlock(&c->sb_lock);
605                 goto err;
606         }
607
608         mutex_unlock(&c->sb_lock);
609
610         scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
611
612         bch_opts_apply(&c->opts, bch_sb_opts(sb));
613         bch_opts_apply(&c->opts, opts);
614
615         c->opts.nochanges       |= c->opts.noreplay;
616         c->opts.read_only       |= c->opts.nochanges;
617
618         c->block_bits           = ilog2(c->sb.block_size);
619
620         if (bch_fs_init_fault("fs_alloc"))
621                 goto err;
622
623         iter_size = (btree_blocks(c) + 1) * 2 *
624                 sizeof(struct btree_node_iter_set);
625
626         journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
627
628         if (!(c->wq = alloc_workqueue("bcache",
629                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
630             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
631                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
632             percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
633             mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
634                                       sizeof(struct btree_reserve)) ||
635             mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
636                                       sizeof(struct btree_interior_update)) ||
637             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
638             bioset_init(&c->btree_read_bio, 1, 0) ||
639             bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
640             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
641             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
642             mempool_init_page_pool(&c->bio_bounce_pages,
643                                    max_t(unsigned,
644                                          c->sb.btree_node_size,
645                                          BCH_ENCODED_EXTENT_MAX) /
646                                    PAGE_SECTORS, 0) ||
647             !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
648             lg_lock_init(&c->usage_lock) ||
649             mempool_init_page_pool(&c->btree_bounce_pool, 1,
650                                    ilog2(btree_pages(c))) ||
651             bdi_setup_and_register(&c->bdi, "bcache") ||
652             bch_fs_blockdev_init(c) ||
653             bch_io_clock_init(&c->io_clock[READ]) ||
654             bch_io_clock_init(&c->io_clock[WRITE]) ||
655             bch_fs_journal_init(&c->journal, journal_entry_bytes) ||
656             bch_fs_btree_init(c) ||
657             bch_fs_encryption_init(c) ||
658             bch_fs_compress_init(c) ||
659             bch_check_set_has_compressed_data(c, c->opts.compression))
660                 goto err;
661
662         c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
663         c->bdi.congested_fn     = bch_congested_fn;
664         c->bdi.congested_data   = c;
665
666         mi = bch_sb_get_members(c->disk_sb);
667         for (i = 0; i < c->sb.nr_devices; i++)
668                 if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
669                     bch_dev_alloc(c, i))
670                         goto err;
671
672         /*
673          * Now that all allocations have succeeded, init various refcounty
674          * things that let us shutdown:
675          */
676         closure_init(&c->cl, NULL);
677
678         c->kobj.kset = bcache_kset;
679         kobject_init(&c->kobj, &bch_fs_ktype);
680         kobject_init(&c->internal, &bch_fs_internal_ktype);
681         kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype);
682         kobject_init(&c->time_stats, &bch_fs_time_stats_ktype);
683
684         bch_cache_accounting_init(&c->accounting, &c->cl);
685
686         closure_init(&c->caching, &c->cl);
687         set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
688
689         closure_get(&c->cl);
690         continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
691         return c;
692 err:
693         bch_fs_free(c);
694         return NULL;
695 }
696
697 static const char *__bch_fs_online(struct bch_fs *c)
698 {
699         struct bch_dev *ca;
700         const char *err = NULL;
701         unsigned i;
702         int ret;
703
704         lockdep_assert_held(&bch_fs_list_lock);
705
706         if (!list_empty(&c->list))
707                 return NULL;
708
709         if (__bch_uuid_to_fs(c->sb.uuid))
710                 return "filesystem UUID already open";
711
712         ret = bch_fs_chardev_init(c);
713         if (ret)
714                 return "error creating character device";
715
716         bch_fs_debug_init(c);
717
718         if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
719             kobject_add(&c->internal, &c->kobj, "internal") ||
720             kobject_add(&c->opts_dir, &c->kobj, "options") ||
721             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
722             bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
723                 return "error creating sysfs objects";
724
725         mutex_lock(&c->state_lock);
726
727         err = "error creating sysfs objects";
728         __for_each_member_device(ca, c, i)
729                 if (bch_dev_sysfs_online(ca))
730                         goto err;
731
732         err = "can't bring up blockdev volumes";
733         if (bch_blockdev_volumes_start(c))
734                 goto err;
735
736         bch_attach_backing_devs(c);
737
738         list_add(&c->list, &bch_fs_list);
739         err = NULL;
740 err:
741         mutex_unlock(&c->state_lock);
742         return err;
743 }
744
745 static const char *bch_fs_online(struct bch_fs *c)
746 {
747         const char *err;
748
749         mutex_lock(&bch_fs_list_lock);
750         err = __bch_fs_online(c);
751         mutex_unlock(&bch_fs_list_lock);
752
753         return err;
754 }
755
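/*
 * Bring the filesystem up: for an existing filesystem, read the journal,
 * recover btree roots, run mark and sweep gc, replay the journal and fsck;
 * for a new filesystem, allocate journal buckets, btree roots and the root
 * inode. Returns an error string, or NULL on success:
 */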
756 static const char *__bch_fs_start(struct bch_fs *c)
757 {
758         const char *err = "cannot allocate memory";
759         struct bch_sb_field_members *mi;
760         struct bch_dev *ca;
761         unsigned i, id;
762         time64_t now;
763         LIST_HEAD(journal);
764         struct jset *j;
765         int ret = -EINVAL;
766
767         BUG_ON(c->state != BCH_FS_STARTING);
768
769         mutex_lock(&c->sb_lock);
770         for_each_online_member(ca, c, i)
771                 bch_sb_from_fs(c, ca);
772         mutex_unlock(&c->sb_lock);
773
774         if (BCH_SB_INITIALIZED(c->disk_sb)) {
775                 ret = bch_journal_read(c, &journal);
776                 if (ret)
777                         goto err;
778
779                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
780
781                 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
782                 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
783
784                 err = "error reading priorities";
785                 for_each_readable_member(ca, c, i) {
786                         ret = bch_prio_read(ca);
787                         if (ret) {
788                                 percpu_ref_put(&ca->io_ref);
789                                 goto err;
790                         }
791                 }
792
793                 for (id = 0; id < BTREE_ID_NR; id++) {
794                         unsigned level;
795                         struct bkey_i *k;
796
797                         err = "bad btree root";
798                         k = bch_journal_find_btree_root(c, j, id, &level);
799                         if (!k && id == BTREE_ID_EXTENTS)
800                                 goto err;
801                         if (!k) {
802                                 pr_debug("missing btree root: %d", id);
803                                 continue;
804                         }
805
806                         err = "error reading btree root";
807                         if (bch_btree_root_read(c, id, k, level))
808                                 goto err;
809                 }
810
811                 bch_verbose(c, "starting mark and sweep:");
812
813                 err = "error in recovery";
814                 if (bch_initial_gc(c, &journal))
815                         goto err;
816
817                 if (c->opts.noreplay)
818                         goto recovery_done;
819
820                 bch_verbose(c, "mark and sweep done");
821
822                 /*
823                  * bch_journal_start() can't happen sooner, or btree_gc_finish()
824                  * will give spurious errors about oldest_gen > bucket_gen -
825                  * this is a hack but oh well.
826                  */
827                 bch_journal_start(c);
828
829                 err = "error starting allocator thread";
830                 for_each_rw_member(ca, c, i)
831                         if (bch_dev_allocator_start(ca)) {
832                                 percpu_ref_put(&ca->io_ref);
833                                 goto err;
834                         }
835
836                 bch_verbose(c, "starting journal replay:");
837
838                 err = "journal replay failed";
839                 ret = bch_journal_replay(c, &journal);
840                 if (ret)
841                         goto err;
842
843                 bch_verbose(c, "journal replay done");
844
845                 if (c->opts.norecovery)
846                         goto recovery_done;
847
848                 bch_verbose(c, "starting fsck:");
849                 err = "error in fsck";
850                 ret = bch_fsck(c, !c->opts.nofsck);
851                 if (ret)
852                         goto err;
853
854                 bch_verbose(c, "fsck done");
855         } else {
856                 struct bch_inode_unpacked inode;
857                 struct bkey_inode_buf packed_inode;
858                 struct closure cl;
859
860                 closure_init_stack(&cl);
861
862                 bch_notice(c, "initializing new filesystem");
863
864                 bch_initial_gc(c, NULL);
865
866                 err = "unable to allocate journal buckets";
867                 for_each_rw_member(ca, c, i)
868                         if (bch_dev_journal_alloc(ca)) {
869                                 percpu_ref_put(&ca->io_ref);
870                                 goto err;
871                         }
872
873                 /*
874                  * journal_res_get() will crash if called before this has
875                  * set up the journal.pin FIFO and journal.cur pointer:
876                  */
877                 bch_journal_start(c);
878                 bch_journal_set_replay_done(&c->journal);
879
880                 err = "error starting allocator thread";
881                 for_each_rw_member(ca, c, i)
882                         if (bch_dev_allocator_start(ca)) {
883                                 percpu_ref_put(&ca->io_ref);
884                                 goto err;
885                         }
886
887                 err = "cannot allocate new btree root";
888                 for (id = 0; id < BTREE_ID_NR; id++)
889                         if (bch_btree_root_alloc(c, id, &cl)) {
890                                 closure_sync(&cl);
891                                 goto err;
892                         }
893
894                 /* Wait for new btree roots to be written: */
895                 closure_sync(&cl);
896
897                 bch_inode_init(c, &inode, 0, 0,
898                                S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
899                 inode.inum = BCACHE_ROOT_INO;
900
901                 bch_inode_pack(&packed_inode, &inode);
902
903                 err = "error creating root directory";
904                 if (bch_btree_insert(c, BTREE_ID_INODES,
905                                      &packed_inode.inode.k_i,
906                                      NULL, NULL, NULL, 0))
907                         goto err;
908
909                 err = "error writing first journal entry";
910                 if (bch_journal_meta(&c->journal))
911                         goto err;
912         }
913 recovery_done:
914         err = "dynamic fault";
915         if (bch_fs_init_fault("fs_start"))
916                 goto err;
917
918         if (c->opts.read_only) {
919                 bch_fs_read_only(c);
920         } else {
921                 err = bch_fs_read_write(c);
922                 if (err)
923                         goto err;
924         }
925
926         mutex_lock(&c->sb_lock);
927         mi = bch_sb_get_members(c->disk_sb);
928         now = ktime_get_seconds();
929
930         for_each_member_device(ca, c, i)
931                 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
932
933         SET_BCH_SB_INITIALIZED(c->disk_sb, true);
934         SET_BCH_SB_CLEAN(c->disk_sb, false);
935         c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
936
937         bch_write_super(c);
938         mutex_unlock(&c->sb_lock);
939
940         err = NULL;
941 out:
942         bch_journal_entries_free(&journal);
943         return err;
944 err:
945         switch (ret) {
946         case BCH_FSCK_ERRORS_NOT_FIXED:
947                 bch_err(c, "filesystem contains errors: please report this to the developers");
948                 pr_cont("mount with -o fix_errors to repair");
949                 err = "fsck error";
950                 break;
951         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
952                 bch_err(c, "filesystem contains errors: please report this to the developers");
953                 pr_cont("repair unimplemented: inform the developers so that it can be added");
954                 err = "fsck error";
955                 break;
956         case BCH_FSCK_REPAIR_IMPOSSIBLE:
957                 bch_err(c, "filesystem contains errors, but repair impossible");
958                 err = "fsck error";
959                 break;
960         case BCH_FSCK_UNKNOWN_VERSION:
961                 err = "unknown metadata version";
962                 break;
963         case -ENOMEM:
964                 err = "cannot allocate memory";
965                 break;
966         case -EIO:
967                 err = "IO error";
968                 break;
969         }
970
971         BUG_ON(!err);
972         set_bit(BCH_FS_ERROR, &c->flags);
973         goto out;
974 }
975
976 const char *bch_fs_start(struct bch_fs *c)
977 {
978         return __bch_fs_start(c) ?: bch_fs_online(c);
979 }
980
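/*
 * Check whether a device's superblock is compatible with this filesystem
 * before adding it (member info present, matching block size, bucket size
 * large enough for btree nodes):
 */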
981 static const char *bch_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
982 {
983         struct bch_sb_field_members *sb_mi;
984
985         sb_mi = bch_sb_get_members(sb);
986         if (!sb_mi)
987                 return "Invalid superblock: member info area missing";
988
989         if (le16_to_cpu(sb->block_size) != c->sb.block_size)
990                 return "mismatched block size";
991
992         if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
993             BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
994                 return "new cache bucket size is too small";
995
996         return NULL;
997 }
998
999 static const char *bch_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
1000 {
1001         struct bch_sb *newest =
1002                 le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
1003         struct bch_sb_field_members *mi = bch_sb_get_members(newest);
1004
1005         if (uuid_le_cmp(fs->uuid, sb->uuid))
1006                 return "device not a member of filesystem";
1007
1008         if (sb->dev_idx >= newest->nr_devices)
1009                 return "device has invalid dev_idx";
1010
1011         if (bch_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
1012                 return "device has been removed";
1013
1014         if (fs->block_size != sb->block_size)
1015                 return "mismatched block size";
1016
1017         return NULL;
1018 }
1019
1020 /* Device startup/shutdown: */
1021
1022 void bch_dev_release(struct kobject *kobj)
1023 {
1024         struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
1025
1026         kfree(ca);
1027 }
1028
1029 static void bch_dev_free(struct bch_dev *ca)
1030 {
1031         unsigned i;
1032
1033         cancel_work_sync(&ca->io_error_work);
1034
1035         if (ca->kobj.state_in_sysfs &&
1036             ca->disk_sb.bdev)
1037                 sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1038                                   "bcache");
1039
1040         if (ca->kobj.state_in_sysfs)
1041                 kobject_del(&ca->kobj);
1042
1043         bch_free_super(&ca->disk_sb);
1044         bch_dev_journal_exit(ca);
1045
1046         free_percpu(ca->sectors_written);
1047         bioset_exit(&ca->replica_set);
1048         free_percpu(ca->usage_percpu);
1049         free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1050         kfree(ca->prio_buckets);
1051         kfree(ca->bio_prio);
1052         vfree(ca->buckets);
1053         vfree(ca->oldest_gens);
1054         free_heap(&ca->heap);
1055         free_fifo(&ca->free_inc);
1056
1057         for (i = 0; i < RESERVE_NR; i++)
1058                 free_fifo(&ca->free[i]);
1059
1060         percpu_ref_exit(&ca->io_ref);
1061         percpu_ref_exit(&ca->ref);
1062         kobject_put(&ca->kobj);
1063 }
1064
1065 static void bch_dev_io_ref_release(struct percpu_ref *ref)
1066 {
1067         struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
1068
1069         complete(&ca->offline_complete);
1070 }
1071
1072 static void __bch_dev_offline(struct bch_dev *ca)
1073 {
1074         struct bch_fs *c = ca->fs;
1075
1076         lockdep_assert_held(&c->state_lock);
1077
1078         __bch_dev_read_only(ca->fs, ca);
1079
1080         reinit_completion(&ca->offline_complete);
1081         percpu_ref_kill(&ca->io_ref);
1082         wait_for_completion(&ca->offline_complete);
1083
1084         if (ca->kobj.state_in_sysfs) {
1085                 struct kobject *block =
1086                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
1087
1088                 sysfs_remove_link(block, "bcache");
1089                 sysfs_remove_link(&ca->kobj, "block");
1090         }
1091
1092         bch_free_super(&ca->disk_sb);
1093         bch_dev_journal_exit(ca);
1094 }
1095
1096 static void bch_dev_ref_release(struct percpu_ref *ref)
1097 {
1098         struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
1099
1100         complete(&ca->stop_complete);
1101 }
1102
1103 static void bch_dev_stop(struct bch_dev *ca)
1104 {
1105         struct bch_fs *c = ca->fs;
1106
1107         lockdep_assert_held(&c->state_lock);
1108
1109         BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
1110         rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
1111
1112         synchronize_rcu();
1113
1114         reinit_completion(&ca->stop_complete);
1115         percpu_ref_kill(&ca->ref);
1116         wait_for_completion(&ca->stop_complete);
1117 }
1118
1119 static int bch_dev_sysfs_online(struct bch_dev *ca)
1120 {
1121         struct bch_fs *c = ca->fs;
1122         int ret;
1123
1124         if (!c->kobj.state_in_sysfs)
1125                 return 0;
1126
1127         if (!ca->kobj.state_in_sysfs) {
1128                 ret = kobject_add(&ca->kobj, &ca->fs->kobj,
1129                                   "dev-%u", ca->dev_idx);
1130                 if (ret)
1131                         return ret;
1132         }
1133
1134         if (ca->disk_sb.bdev) {
1135                 struct kobject *block =
1136                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
1137
1138                 ret = sysfs_create_link(block, &ca->kobj, "bcache");
1139                 if (ret)
1140                         return ret;
1141                 ret = sysfs_create_link(&ca->kobj, block, "block");
1142                 if (ret)
1143                         return ret;
1144         }
1145
1146         return 0;
1147 }
1148
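/*
 * Allocate the in-memory state (struct bch_dev) for member slot dev_idx and
 * size its freelists and reserves; the device isn't online (has no backing
 * block device) until __bch_dev_online():
 */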
1149 static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx)
1150 {
1151         struct bch_member *member;
1152         size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1153         size_t heap_size;
1154         unsigned i;
1155         struct bch_dev *ca;
1156
1157         if (bch_fs_init_fault("dev_alloc"))
1158                 return -ENOMEM;
1159
1160         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1161         if (!ca)
1162                 return -ENOMEM;
1163
1164         kobject_init(&ca->kobj, &bch_dev_ktype);
1165         init_completion(&ca->stop_complete);
1166         init_completion(&ca->offline_complete);
1167
1168         spin_lock_init(&ca->self.lock);
1169         ca->self.nr = 1;
1170         rcu_assign_pointer(ca->self.d[0].dev, ca);
1171         ca->dev_idx = dev_idx;
1172
1173         spin_lock_init(&ca->freelist_lock);
1174         spin_lock_init(&ca->prio_buckets_lock);
1175         mutex_init(&ca->heap_lock);
1176         bch_dev_moving_gc_init(ca);
1177
1178         INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1179
1180         if (bch_fs_init_fault("dev_alloc"))
1181                 goto err;
1182
1183         member = bch_sb_get_members(c->disk_sb)->members + dev_idx;
1184
1185         ca->mi = bch_mi_to_cpu(member);
1186         ca->uuid = member->uuid;
1187         ca->bucket_bits = ilog2(ca->mi.bucket_size);
1188         scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
1189
1190         /* XXX: tune these */
1191         movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1192         reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1193         /*
1194          * free_inc must be smaller than the copygc reserve: if it were bigger,
1195          * one copygc iteration might not make enough buckets available to fill
1196          * up free_inc and allow the allocator to make forward progress
1197          */
1198         free_inc_reserve = movinggc_reserve / 2;
1199         heap_size = movinggc_reserve * 8;
1200
1201         if (percpu_ref_init(&ca->ref, bch_dev_ref_release,
1202                             0, GFP_KERNEL) ||
1203             percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release,
1204                             PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
1205             !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1206             !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1207             !init_fifo(&ca->free[RESERVE_MOVINGGC],
1208                        movinggc_reserve, GFP_KERNEL) ||
1209             !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1210             !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
1211             !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
1212             !(ca->oldest_gens   = vzalloc(sizeof(u8) *
1213                                           ca->mi.nbuckets)) ||
1214             !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1215                                           ca->mi.nbuckets)) ||
1216             !(ca->prio_buckets  = kzalloc(sizeof(u64) * prio_buckets(ca) *
1217                                           2, GFP_KERNEL)) ||
1218             !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1219             !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
1220             !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
1221             bioset_init(&ca->replica_set, 4,
1222                         offsetof(struct bch_write_bio, bio)) ||
1223             !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
1224                 goto err;
1225
1226         ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1227
1228         total_reserve = ca->free_inc.size;
1229         for (i = 0; i < RESERVE_NR; i++)
1230                 total_reserve += ca->free[i].size;
1231
1232         ca->copygc_write_point.group = &ca->self;
1233         ca->tiering_write_point.group = &ca->self;
1234
1235         ca->fs = c;
1236         rcu_assign_pointer(c->devs[ca->dev_idx], ca);
1237
1238         if (bch_dev_sysfs_online(ca))
1239                 pr_warn("error creating sysfs objects");
1240
1241         return 0;
1242 err:
1243         bch_dev_free(ca);
1244         return -ENOMEM;
1245 }
1246
1247 static int __bch_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
1248 {
1249         struct bch_dev *ca;
1250         int ret;
1251
1252         lockdep_assert_held(&c->sb_lock);
1253
1254         if (le64_to_cpu(sb->sb->seq) >
1255             le64_to_cpu(c->disk_sb->seq))
1256                 bch_sb_to_fs(c, sb->sb);
1257
1258         BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
1259                !c->devs[sb->sb->dev_idx]);
1260
1261         ca = c->devs[sb->sb->dev_idx];
1262         if (ca->disk_sb.bdev) {
1263                 bch_err(c, "already have device online in slot %u",
1264                         sb->sb->dev_idx);
1265                 return -EINVAL;
1266         }
1267
1268         ret = bch_dev_journal_init(ca, sb->sb);
1269         if (ret)
1270                 return ret;
1271
1272         /*
1273          * Increase journal write timeout if flushes to this device are
1274          * expensive:
1275          */
1276         if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
1277             journal_flushes_device(ca))
1278                 c->journal.write_delay_ms =
1279                         max(c->journal.write_delay_ms, 1000U);
1280
1281         /* Commit: */
1282         ca->disk_sb = *sb;
1283         if (sb->mode & FMODE_EXCL)
1284                 ca->disk_sb.bdev->bd_holder = ca;
1285         memset(sb, 0, sizeof(*sb));
1286
1287         if (c->sb.nr_devices == 1)
1288                 bdevname(ca->disk_sb.bdev, c->name);
1289         bdevname(ca->disk_sb.bdev, ca->name);
1290
1291         if (bch_dev_sysfs_online(ca))
1292                 pr_warn("error creating sysfs objects");
1293
1294         lg_local_lock(&c->usage_lock);
1295         if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
1296                 bch_mark_dev_metadata(ca->fs, ca);
1297         lg_local_unlock(&c->usage_lock);
1298
1299         percpu_ref_reinit(&ca->io_ref);
1300         return 0;
1301 }
1302
1303 /* Device management: */
1304
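/*
 * Check whether enough member devices are present to start, given the
 * BCH_FORCE_IF_* degraded mode flags:
 */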
1305 bool bch_fs_may_start(struct bch_fs *c, int flags)
1306 {
1307         struct bch_sb_field_members *mi;
1308         unsigned meta_missing = 0;
1309         unsigned data_missing = 0;
1310         bool degraded = false;
1311         unsigned i;
1312
1313         mutex_lock(&c->sb_lock);
1314         mi = bch_sb_get_members(c->disk_sb);
1315
1316         for (i = 0; i < c->disk_sb->nr_devices; i++)
1317                 if (!c->devs[i] &&
1318                     !bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
1319                         degraded = true;
1320                         if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
1321                                 meta_missing++;
1322                         if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
1323                                 data_missing++;
1324                 }
1325         mutex_unlock(&c->sb_lock);
1326
1327         if (degraded &&
1328             !(flags & BCH_FORCE_IF_DEGRADED))
1329                 return false;
1330
1331         if (meta_missing &&
1332             !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
1333                 return false;
1334
1335         if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
1336             !(flags & BCH_FORCE_IF_METADATA_LOST))
1337                 return false;
1338
1339         if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
1340                 return false;
1341
1342         if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
1343             !(flags & BCH_FORCE_IF_DATA_LOST))
1344                 return false;
1345
1346         return true;
1347 }
1348
1349 bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
1350                            enum bch_member_state new_state, int flags)
1351 {
1352         lockdep_assert_held(&c->state_lock);
1353
1354         if (new_state == BCH_MEMBER_STATE_RW)
1355                 return true;
1356
1357         if (ca->mi.has_data &&
1358             !(flags & BCH_FORCE_IF_DATA_DEGRADED))
1359                 return false;
1360
1361         if (ca->mi.has_data &&
1362             c->sb.data_replicas_have <= 1 &&
1363             !(flags & BCH_FORCE_IF_DATA_LOST))
1364                 return false;
1365
1366         if (ca->mi.has_metadata &&
1367             !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
1368                 return false;
1369
1370         if (ca->mi.has_metadata &&
1371             c->sb.meta_replicas_have <= 1 &&
1372             !(flags & BCH_FORCE_IF_METADATA_LOST))
1373                 return false;
1374
1375         return true;
1376 }
1377
1378 static void __bch_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
1379 {
1380         bch_moving_gc_stop(ca);
1381
1382         /*
1383          * This stops new data writes (e.g. to existing open data
1384          * buckets) and then waits for all existing writes to
1385          * complete.
1386          */
1387         bch_dev_allocator_stop(ca);
1388
1389         bch_dev_group_remove(&c->journal.devs, ca);
1390 }
1391
1392 static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
1393 {
1394         lockdep_assert_held(&c->state_lock);
1395
1396         BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
1397
1398         trace_bcache_cache_read_write(ca);
1399
1400         if (bch_dev_allocator_start(ca))
1401                 return "error starting allocator thread";
1402
1403         if (bch_moving_gc_start(ca))
1404                 return "error starting moving GC thread";
1405
1406         if (bch_tiering_start(c))
1407                 return "error starting tiering thread";
1408
1409         bch_notify_dev_read_write(ca);
1410         trace_bcache_cache_read_write_done(ca);
1411
1412         return NULL;
1413 }
1414
1415 int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1416                         enum bch_member_state new_state, int flags)
1417 {
1418         struct bch_sb_field_members *mi;
1419
1420         if (ca->mi.state == new_state)
1421                 return 0;
1422
1423         if (!bch_dev_state_allowed(c, ca, new_state, flags))
1424                 return -EINVAL;
1425
1426         if (new_state == BCH_MEMBER_STATE_RW) {
1427                 if (__bch_dev_read_write(c, ca))
1428                         return -ENOMEM;
1429         } else {
1430                 __bch_dev_read_only(c, ca);
1431         }
1432
1433         bch_notice(ca, "%s", bch_dev_state[new_state]);
1434
1435         mutex_lock(&c->sb_lock);
1436         mi = bch_sb_get_members(c->disk_sb);
1437         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
1438         bch_write_super(c);
1439         mutex_unlock(&c->sb_lock);
1440
1441         return 0;
1442 }
1443
1444 int bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1445                       enum bch_member_state new_state, int flags)
1446 {
1447         int ret;
1448
1449         mutex_lock(&c->state_lock);
1450         ret = __bch_dev_set_state(c, ca, new_state, flags);
1451         mutex_unlock(&c->state_lock);
1452
1453         return ret;
1454 }
1455
1456 #if 0
1457 int bch_dev_migrate_from(struct bch_fs *c, struct bch_dev *ca)
1458 {
1459         /* First, go RO before we try to migrate data off: */
1460         ret = bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, flags);
1461         if (ret)
1462                 return ret;
1463
1464         bch_notify_dev_removing(ca);
1465
1466         /* Migrate data, metadata off device: */
1467
1468         ret = bch_move_data_off_device(ca);
1469         if (ret && !(flags & BCH_FORCE_IF_DATA_LOST)) {
1470                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1471                         name);
1472                 return ret;
1473         }
1474
1475         if (ret)
1476                 ret = bch_flag_data_bad(ca);
1477         if (ret) {
1478                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1479                         name);
1480                 return ret;
1481         }
1482
1483         ret = bch_move_metadata_off_device(ca);
1484         if (ret)
1485                 return ret;
1486 }
1487 #endif
1488
1489 /* Device add/removal: */
1490
1491 static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1492 {
1493         struct bch_sb_field_members *mi;
1494         unsigned dev_idx = ca->dev_idx;
1495         int ret;
1496
1497         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1498                 bch_err(ca, "Cannot remove RW device");
1499                 bch_notify_dev_remove_failed(ca);
1500                 return -EINVAL;
1501         }
1502
1503         if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1504                 bch_err(ca, "Cannot remove without losing data");
1505                 bch_notify_dev_remove_failed(ca);
1506                 return -EINVAL;
1507         }
1508
1509         /*
1510          * XXX: verify that dev_idx is really not in use anymore, anywhere
1511          *
1512          * flag_data_bad() does not check btree pointers
1513          */
1514         ret = bch_flag_data_bad(ca);
1515         if (ret) {
1516                 bch_err(ca, "Remove failed");
1517                 return ret;
1518         }
1519
1520         if (ca->mi.has_data || ca->mi.has_metadata) {
1521                 bch_err(ca, "Can't remove, still has data");
1522                 return ret;
1523         }
1524
1525         /*
1526          * Ok, really doing the remove:
1527          * Drop device's prio pointer before removing it from superblock:
1528          */
1529         bch_notify_dev_removed(ca);
1530
1531         spin_lock(&c->journal.lock);
1532         c->journal.prio_buckets[dev_idx] = 0;
1533         spin_unlock(&c->journal.lock);
1534
1535         bch_journal_meta(&c->journal);
1536
1537         __bch_dev_offline(ca);
1538         bch_dev_stop(ca);
1539         bch_dev_free(ca);
1540
1541         /*
1542          * Free this device's slot in the bch_member array - all pointers to
1543          * this device must be gone:
1544          */
1545         mutex_lock(&c->sb_lock);
1546         mi = bch_sb_get_members(c->disk_sb);
1547         memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1548
1549         bch_write_super(c);
1550
1551         mutex_unlock(&c->sb_lock);
1552
1553         return 0;
1554 }
1555
1556 int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1557 {
1558         int ret;
1559
1560         mutex_lock(&c->state_lock);
1561         percpu_ref_put(&ca->ref);
1562         ret = __bch_dev_remove(c, ca, flags);
1563         mutex_unlock(&c->state_lock);
1564
1565         return ret;
1566 }
1567
1568 int bch_dev_add(struct bch_fs *c, const char *path)
1569 {
1570         struct bcache_superblock sb;
1571         const char *err;
1572         struct bch_dev *ca = NULL;
1573         struct bch_sb_field_members *mi, *dev_mi;
1574         struct bch_member saved_mi;
1575         unsigned dev_idx, nr_devices, u64s;
1576         int ret = -EINVAL;
1577
1578         err = bch_read_super(&sb, bch_opts_empty(), path);
1579         if (err)
1580                 return -EINVAL;
1581
1582         err = bch_validate_cache_super(&sb);
1583         if (err)
1584                 return -EINVAL;
1585
1586         err = bch_dev_may_add(sb.sb, c);
1587         if (err)
1588                 return -EINVAL;
1589
1590         mutex_lock(&c->state_lock);
1591         mutex_lock(&c->sb_lock);
1592
1593         /*
1594          * Preserve the old cache member information (esp. tier)
1595          * before we start bashing the disk stuff.
1596          */
1597         dev_mi = bch_sb_get_members(sb.sb);
1598         saved_mi = dev_mi->members[sb.sb->dev_idx];
1599         saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
1600
1601         if (dynamic_fault("bcache:add:no_slot"))
1602                 goto no_slot;
1603
1604         mi = bch_sb_get_members(c->disk_sb);
1605         for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1606                 if (dev_idx >= c->sb.nr_devices ||
1607                     bch_is_zero(mi->members[dev_idx].uuid.b,
1608                                  sizeof(uuid_le)))
1609                         goto have_slot;
1610 no_slot:
1611         err = "no slots available in superblock";
1612         ret = -ENOSPC;
1613         goto err_unlock;
1614
1615 have_slot:
1616         nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1617         u64s = (sizeof(struct bch_sb_field_members) +
1618                 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1619         err = "no space in superblock for member info";
1620
1621         mi = bch_fs_sb_resize_members(c, u64s);
1622         if (!mi)
1623                 goto err_unlock;
1624
1625         dev_mi = bch_sb_resize_members(&sb, u64s);
1626         if (!dev_mi)
1627                 goto err_unlock;
1628
1629         memcpy(dev_mi, mi, u64s * sizeof(u64));
1630         dev_mi->members[dev_idx] = saved_mi;
1631
1632         sb.sb->uuid             = c->disk_sb->uuid;
1633         sb.sb->dev_idx          = dev_idx;
1634         sb.sb->nr_devices       = nr_devices;
1635
1636         /* commit new member info */
1637         memcpy(mi, dev_mi, u64s * sizeof(u64));
1638         c->disk_sb->nr_devices  = nr_devices;
1639         c->sb.nr_devices        = nr_devices;
1640
1641         if (bch_dev_alloc(c, dev_idx)) {
1642                 err = "cannot allocate memory";
1643                 ret = -ENOMEM;
1644                 goto err_unlock;
1645         }
1646
1647         if (__bch_dev_online(c, &sb)) {
1648                 err = "__bch_dev_online() error";
1649                 ret = -ENOMEM;
1650                 goto err_unlock;
1651         }
1652
1653         bch_write_super(c);
1654         mutex_unlock(&c->sb_lock);
1655
1656         ca = c->devs[dev_idx];
1657         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1658                 err = "journal alloc failed";
1659                 if (bch_dev_journal_alloc(ca))
1660                         goto err;
1661
1662                 err = __bch_dev_read_write(c, ca);
1663                 if (err)
1664                         goto err;
1665         }
1666
1667         bch_notify_dev_added(ca);
1668         mutex_unlock(&c->state_lock);
1669         return 0;
1670 err_unlock:
1671         mutex_unlock(&c->sb_lock);
1672 err:
1673         mutex_unlock(&c->state_lock);
1674         bch_free_super(&sb);
1675
1676         bch_err(c, "Unable to add device: %s", err);
1677         return ret ?: -EINVAL;
1678 }
1679
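/*
 * Bring a member device that is currently offline back online, given its
 * path: read its superblock, check that it belongs to @c, then attach it.
 */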
1680 int bch_dev_online(struct bch_fs *c, const char *path)
1681 {
1682         struct bcache_superblock sb = { 0 };
1683         const char *err;
1684
1685         mutex_lock(&c->state_lock);
1686
1687         err = bch_read_super(&sb, bch_opts_empty(), path);
1688         if (err)
1689                 goto err;
1690
1691         err = bch_dev_in_fs(c->disk_sb, sb.sb);
1692         if (err)
1693                 goto err;
1694
1695         mutex_lock(&c->sb_lock);
1696         if (__bch_dev_online(c, &sb)) {
1697                 mutex_unlock(&c->sb_lock);
1698                 goto err;
1699         }
1700         mutex_unlock(&c->sb_lock);
1701
1702         mutex_unlock(&c->state_lock);
1703         return 0;
1704 err:
1705         mutex_unlock(&c->state_lock);
1706         bch_free_super(&sb);
1707         bch_err(c, "error bringing %s online: %s", path, err);
1708         return -EINVAL;
1709 }
1710
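/*
 * Take a member device offline without removing it; only allowed if the
 * filesystem can keep running with this device in the failed state.
 */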
1711 int bch_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
1712 {
1713         mutex_lock(&c->state_lock);
1714
1715         if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1716                 bch_err(ca, "Cannot offline required disk");
1717                 mutex_unlock(&c->state_lock);
1718                 return -EINVAL;
1719         }
1720
1721         __bch_dev_read_only(c, ca);
1722         __bch_dev_offline(ca);
1723
1724         mutex_unlock(&c->state_lock);
1725         return 0;
1726 }
1727
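/*
 * Migrate all data and metadata off @ca. The device must not be RW, and the
 * migration only succeeds once the device no longer holds any data or
 * metadata.
 */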
1728 int bch_dev_migrate(struct bch_fs *c, struct bch_dev *ca)
1729 {
1730         int ret;
1731
1732         mutex_lock(&c->state_lock);
1733
1734         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1735                 bch_err(ca, "Cannot migrate data off RW device");
1736                 mutex_unlock(&c->state_lock);
1737                 return -EINVAL;
1738         }
1739
1740         mutex_unlock(&c->state_lock);
1741
1742         ret = bch_move_data_off_device(ca);
1743         if (ret) {
1744                 bch_err(ca, "Error migrating data: %i", ret);
1745                 return ret;
1746         }
1747
1748         ret = bch_move_metadata_off_device(ca);
1749         if (ret) {
1750                 bch_err(ca, "Error migrating metadata: %i", ret);
1751                 return ret;
1752         }
1753
1754         if (ca->mi.has_data || ca->mi.has_metadata) {
1755                 bch_err(ca, "Migrate error: data still present");
1756                 return -EINVAL;
1757         }
1758
1759         return 0;
1760 }
1761
1762 /* Filesystem open: */
1763
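/*
 * Open a filesystem from an explicit list of device paths: read and validate
 * every superblock, treat the copy with the highest sequence number as the
 * most recent, check the other devices against it, then allocate the
 * filesystem, bring all devices online and (unless the nostart option was
 * given) start it.
 */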
1764 const char *bch_fs_open(char * const *devices, unsigned nr_devices,
1765                         struct bch_opts opts, struct bch_fs **ret)
1766 {
1767         const char *err;
1768         struct bch_fs *c = NULL;
1769         struct bcache_superblock *sb;
1770         unsigned i, best_sb = 0;
1771
1772         if (!nr_devices)
1773                 return "need at least one device";
1774
1775         if (!try_module_get(THIS_MODULE))
1776                 return "module unloading";
1777
1778         err = "cannot allocate memory";
1779         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1780         if (!sb)
1781                 goto err;
1782
1783         for (i = 0; i < nr_devices; i++) {
1784                 err = bch_read_super(&sb[i], opts, devices[i]);
1785                 if (err)
1786                         goto err;
1787
1788                 err = "attempting to register backing device";
1789                 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
1790                         goto err;
1791
1792                 err = bch_validate_cache_super(&sb[i]);
1793                 if (err)
1794                         goto err;
1795         }
1796
1797         for (i = 1; i < nr_devices; i++)
1798                 if (le64_to_cpu(sb[i].sb->seq) >
1799                     le64_to_cpu(sb[best_sb].sb->seq))
1800                         best_sb = i;
1801
1802         for (i = 0; i < nr_devices; i++) {
1803                 err = bch_dev_in_fs(sb[best_sb].sb, sb[i].sb);
1804                 if (err)
1805                         goto err;
1806         }
1807
1808         err = "cannot allocate memory";
1809         c = bch_fs_alloc(sb[best_sb].sb, opts);
1810         if (!c)
1811                 goto err;
1812
1813         err = "__bch_dev_online() error";
1814         mutex_lock(&c->sb_lock);
1815         for (i = 0; i < nr_devices; i++)
1816                 if (__bch_dev_online(c, &sb[i])) {
1817                         mutex_unlock(&c->sb_lock);
1818                         goto err;
1819                 }
1820         mutex_unlock(&c->sb_lock);
1821
1822         err = "insufficient devices";
1823         if (!bch_fs_may_start(c, 0))
1824                 goto err;
1825
1826         if (!c->opts.nostart) {
1827                 err = __bch_fs_start(c);
1828                 if (err)
1829                         goto err;
1830         }
1831
1832         err = bch_fs_online(c);
1833         if (err)
1834                 goto err;
1835
1836         if (ret)
1837                 *ret = c;
1838         else
1839                 closure_put(&c->cl);
1840
1841         err = NULL;
1842 out:
1843         kfree(sb);
1844         module_put(THIS_MODULE);
1845         if (err)
1846                 c = NULL;
1847         return err;
1848 err:
1849         if (c)
1850                 bch_fs_stop(c);
1851
1852         for (i = 0; i < nr_devices; i++)
1853                 bch_free_super(&sb[i]);
1854         goto out;
1855 }
1856
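/*
 * Register a single device: if a filesystem with a matching UUID already
 * exists the device joins it, otherwise a new filesystem is allocated. The
 * filesystem is started once enough of its member devices are present.
 */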
1857 static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
1858                                              struct bch_opts opts)
1859 {
1860         const char *err;
1861         struct bch_fs *c;
1862         bool allocated_fs = false;
1863
1864         err = bch_validate_cache_super(sb);
1865         if (err)
1866                 return err;
1867
1868         mutex_lock(&bch_fs_list_lock);
1869         c = __bch_uuid_to_fs(sb->sb->uuid);
1870         if (c) {
1871                 closure_get(&c->cl);
1872
1873                 err = bch_dev_in_fs(c->disk_sb, sb->sb);
1874                 if (err)
1875                         goto err;
1876         } else {
1877                 c = bch_fs_alloc(sb->sb, opts);
1878                 err = "cannot allocate memory";
1879                 if (!c)
1880                         goto err;
1881
1882                 allocated_fs = true;
1883         }
1884
1885         err = "__bch_dev_online() error";
1886
1887         mutex_lock(&c->sb_lock);
1888         if (__bch_dev_online(c, sb)) {
1889                 mutex_unlock(&c->sb_lock);
1890                 goto err;
1891         }
1892         mutex_unlock(&c->sb_lock);
1893
1894         if (!c->opts.nostart && bch_fs_may_start(c, 0)) {
1895                 err = __bch_fs_start(c);
1896                 if (err)
1897                         goto err;
1898         }
1899
1900         err = __bch_fs_online(c);
1901         if (err)
1902                 goto err;
1903
1904         closure_put(&c->cl);
1905         mutex_unlock(&bch_fs_list_lock);
1906
1907         return NULL;
1908 err:
1909         mutex_unlock(&bch_fs_list_lock);
1910
1911         if (allocated_fs)
1912                 bch_fs_stop(c);
1913         else if (c)
1914                 closure_put(&c->cl);
1915
1916         return err;
1917 }
1918
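/*
 * Read the superblock at @path and register it: backing devices go through
 * bch_backing_dev_register(), cache/member devices through
 * __bch_fs_open_incremental().
 */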
1919 const char *bch_fs_open_incremental(const char *path)
1920 {
1921         struct bcache_superblock sb;
1922         struct bch_opts opts = bch_opts_empty();
1923         const char *err;
1924
1925         err = bch_read_super(&sb, opts, path);
1926         if (err)
1927                 return err;
1928
1929         if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
1930                 mutex_lock(&bch_fs_list_lock);
1931                 err = bch_backing_dev_register(&sb);
1932                 mutex_unlock(&bch_fs_list_lock);
1933         } else {
1934                 err = __bch_fs_open_incremental(&sb, opts);
1935         }
1936
1937         bch_free_super(&sb);
1938
1939         return err;
1940 }
1941
1942 /* Global interfaces/init */
1943
1944 #define kobj_attribute_write(n, fn)                                     \
1945         static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
1946
1947 #define kobj_attribute_rw(n, show, store)                               \
1948         static struct kobj_attribute ksysfs_##n =                       \
1949                 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1950
1951 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1952                                const char *, size_t);
1953
1954 kobj_attribute_write(register,          register_bcache);
1955 kobj_attribute_write(register_quiet,    register_bcache);
1956
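/*
 * sysfs handler for /sys/fs/bcache/register and register_quiet: the device
 * path written to either file (e.g. "echo /dev/sdb > /sys/fs/bcache/register")
 * is registered incrementally.
 */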
1957 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1958                                const char *buffer, size_t size)
1959 {
1960         ssize_t ret = -EINVAL;
1961         const char *err = "cannot allocate memory";
1962         char *path = NULL;
1963
1964         if (!try_module_get(THIS_MODULE))
1965                 return -EBUSY;
1966
1967         if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
1968                 goto err;
1969
1970         err = bch_fs_open_incremental(strim(path));
1971         if (err)
1972                 goto err;
1973
1974         ret = size;
1975 out:
1976         kfree(path);
1977         module_put(THIS_MODULE);
1978         return ret;
1979 err:
1980         pr_err("error opening %s: %s", path, err);
1981         goto out;
1982 }
1983
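/*
 * Reboot notifier: on shutdown, halt or power off, put every registered
 * filesystem into read-only mode, first kicked off asynchronously for all of
 * them, then completed synchronously, before the block devices go away.
 */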
1984 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1985 {
1986         if (code == SYS_DOWN ||
1987             code == SYS_HALT ||
1988             code == SYS_POWER_OFF) {
1989                 struct bch_fs *c;
1990
1991                 mutex_lock(&bch_fs_list_lock);
1992
1993                 if (!list_empty(&bch_fs_list))
1994                         pr_info("Setting all devices read only:");
1995
1996                 list_for_each_entry(c, &bch_fs_list, list)
1997                         bch_fs_read_only_async(c);
1998
1999                 list_for_each_entry(c, &bch_fs_list, list)
2000                         bch_fs_read_only(c);
2001
2002                 mutex_unlock(&bch_fs_list_lock);
2003         }
2004
2005         return NOTIFY_DONE;
2006 }
2007
2008 static struct notifier_block reboot = {
2009         .notifier_call  = bcache_reboot,
2010         .priority       = INT_MAX, /* before any real devices */
2011 };
2012
2013 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
2014                            const char *buffer, size_t size)
2015 {
2016         bcache_reboot(NULL, SYS_DOWN, NULL);
2017         return size;
2018 }
2019
2020 kobj_attribute_write(reboot,            reboot_test);
2021
2022 static void bcache_exit(void)
2023 {
2024         bch_debug_exit();
2025         bch_vfs_exit();
2026         bch_blockdev_exit();
2027         bch_chardev_exit();
2028         if (bcache_kset)
2029                 kset_unregister(bcache_kset);
2030         if (bcache_io_wq)
2031                 destroy_workqueue(bcache_io_wq);
2032         if (!IS_ERR_OR_NULL(bch_sha256))
2033                 crypto_free_shash(bch_sha256);
2034         unregister_reboot_notifier(&reboot);
2035 }
2036
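/*
 * Module init: register the reboot notifier, allocate the sha256 transform
 * and the io workqueue, create the /sys/fs/bcache kset and its attribute
 * files, and initialize the chardev, blockdev, vfs and debug subsystems; on
 * any failure, bcache_exit() tears down whatever was set up.
 */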
2037 static int __init bcache_init(void)
2038 {
2039         static const struct attribute *files[] = {
2040                 &ksysfs_register.attr,
2041                 &ksysfs_register_quiet.attr,
2042                 &ksysfs_reboot.attr,
2043                 NULL
2044         };
2045
2046         register_reboot_notifier(&reboot);
2047         closure_debug_init();
2048         bkey_pack_test();
2049
2050         bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
2051         if (IS_ERR(bch_sha256))
2052                 goto err;
2053
2054         if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
2055             !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
2056             sysfs_create_files(&bcache_kset->kobj, files) ||
2057             bch_chardev_init() ||
2058             bch_blockdev_init() ||
2059             bch_vfs_init() ||
2060             bch_debug_init())
2061                 goto err;
2062
2063         return 0;
2064 err:
2065         bcache_exit();
2066         return -ENOMEM;
2067 }
2068
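/* Expose each entry of BCH_DEBUG_PARAMS() as a boolean module parameter. */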
2069 #define BCH_DEBUG_PARAM(name, description)                      \
2070         bool bch_##name;                                        \
2071         module_param_named(name, bch_##name, bool, 0644);       \
2072         MODULE_PARM_DESC(name, description);
2073 BCH_DEBUG_PARAMS()
2074 #undef BCH_DEBUG_PARAM
2075
2076 module_exit(bcache_exit);
2077 module_init(bcache_init);