2 * Authors: Kent Overstreet <kmo@daterainc.com>
3 * Gabriel de Perthuis <g2p.code@gmail.com>
4 * Jacob Malevich <jam@datera.io>
11 #include <nih/logging.h>
19 #include <sys/ioctl.h>
30 #include <sys/types.h>
33 #include <uuid/uuid.h>
35 #include <nih/command.h>
36 #include <nih/option.h>
39 #include "bcacheadm-format.h"
/*
 * Per-device settings recorded for each cache device named on the command
 * line. Entries are appended to cache_devices[] (capacity MAX_DEVS) by
 * set_cache().
 */
41 static struct cache_opts {
46 unsigned replacement_policy;
47 unsigned replication_set;
52 } cache_devices[MAX_DEVS];
/*
 * Per-device settings for each backing device named on the command line.
 * Entries are appended to backing_devices[] (capacity MAX_DEVS) by set_bdev().
 */
54 static struct backingdev_opts {
58 } backing_devices[MAX_DEVS];
/*
 * Option state: written by the option setters below and read when the
 * superblocks are built.
 */
/* Number of valid entries in backing_devices[] and cache_devices[]. */
60 static size_t nr_backing_devices = 0, nr_cache_devices = 0;
/* Target of the -l/--label option; set_bdev() duplicates it per device. */
62 static char *label = NULL;
64 /* All in units of 512 byte sectors */
65 static unsigned block_size, bucket_size, btree_node_size;
/* Set by --fs_size as hatoi(arg) >> 9. */
66 static u64 filesystem_size;
67 static unsigned tier, replacement_policy;
/*
 * user_uuid is parsed from --cset_uuid (generated in cmd_format() when left
 * null); set_uuid is generated in cmd_format() and overwritten with
 * user_uuid by format_v0().
 */
69 static uuid_le set_uuid, user_uuid;
/* Checksum and compression defaults, overridable from the command line. */
70 static unsigned meta_csum_type = BCH_CSUM_CRC32C;
71 static unsigned data_csum_type = BCH_CSUM_CRC32C;
72 static unsigned compression_type = BCH_COMPRESSION_NONE;
/* Replica counts default to 1. */
74 static unsigned replication_set, meta_replicas = 1, data_replicas = 1;
75 static unsigned on_error_action;
/* Superblock version selected with -v/--version; defaults to 1. */
77 static unsigned version = 1;
/* Backing-device settings passed to write_backingdev_sb(). */
79 static u64 data_offset = BDEV_DATA_START_DEFAULT;
80 static unsigned cache_mode = CACHE_MODE_WRITEBACK;
/*
 * Setter for -C/--cache: appends a cache_opts entry to cache_devices[],
 * capturing the values the global per-device options hold at this point on
 * the command line.
 * NOTE(review): no check of nr_cache_devices against MAX_DEVS is visible in
 * these lines — confirm the array cannot be overrun.
 */
82 static int set_cache(NihOption *option, const char *arg)
84 cache_devices[nr_cache_devices++] = (struct cache_opts) {
87 .bucket_size = bucket_size,
89 .replacement_policy = replacement_policy,
90 .replication_set = replication_set,
91 .filesystem_size = filesystem_size,
/*
 * Setter for -B/--bdev: appends a backingdev_opts entry to backing_devices[].
 * NOTE(review): no check of nr_backing_devices against MAX_DEVS is visible in
 * these lines — confirm the array cannot be overrun.
 */
96 static int set_bdev(NihOption *option, const char *arg)
98 backing_devices[nr_backing_devices++] = (struct backingdev_opts) {
/* Each device gets its own copy of the current label, or NULL if none was given. */
101 .label = label ? strdup(label) : NULL,
/*
 * Setter for --cset_uuid: parses arg into user_uuid with uuid_parse(); the
 * branch taken on a non-zero (failure) return is not shown in these lines.
 */
106 static int set_cache_set_uuid(NihOption *option, const char *arg)
108 if (uuid_parse(arg, user_uuid.b))
/* Setter for -w/--block: stores the result of hatoi_validate() in block_size. */
113 static int set_block_size(NihOption *option, const char *arg)
115 block_size = hatoi_validate(arg, "block size");
/*
 * Setter for -b/--bucket: stores the result of hatoi_validate() in the global
 * bucket_size, which set_cache() copies into later cache device entries.
 */
119 static int set_bucket_sizes(NihOption *option, const char *arg)
121 bucket_size = hatoi_validate(arg, "bucket size");
/* Setter for -n/--btree_node: stores the result of hatoi_validate() in btree_node_size. */
125 static int set_btree_node_size(NihOption *option, const char *arg)
127 btree_node_size = hatoi_validate(arg, "btree node size");
/*
 * Setter for --fs_size: stores hatoi(arg) >> 9 in filesystem_size.
 * Presumably hatoi() yields bytes, so the shift converts to 512-byte
 * sectors — confirm against hatoi().
 */
131 static int set_filesystem_size(NihOption *option, const char *arg)
133 filesystem_size = hatoi(arg) >> 9;
/*
 * Setter for -p/--cache_replacement_policy: looks arg up in
 * replacement_policies and stores the result in replacement_policy.
 */
137 static int set_replacement_policy(NihOption *option, const char *arg)
139 replacement_policy = read_string_list_or_die(arg, replacement_policies,
140 "replacement policy");
/*
 * Shared setter for --metadata_csum_type and --data_csum_type. The option
 * table passes the destination (&meta_csum_type or &data_csum_type) in
 * option->value, so one function serves both options.
 */
144 static int set_csum_type(NihOption *option, const char *arg)
146 unsigned *csum_type = option->value;
148 *csum_type = read_string_list_or_die(arg, csum_types, "checksum type");
/*
 * Setter for --compression_type: looks arg up in compression_types and
 * stores the result in compression_type.
 */
152 static int set_compression_type(NihOption *option, const char *arg)
154 compression_type = read_string_list_or_die(arg, compression_types,
/*
 * Setter for --error_action: looks arg up in error_actions and stores the
 * result in on_error_action.
 */
159 static int set_on_error_action(NihOption *option, const char *arg)
161 on_error_action = read_string_list_or_die(arg, error_actions,
/*
 * Setter for -t/--tier: stores strtoul_or_die(arg, CACHE_TIERS, "tier") in
 * tier. The second argument is presumably a limit on the accepted value —
 * confirm against strtoul_or_die().
 */
166 static int set_tier(NihOption *option, const char *arg)
168 tier = strtoul_or_die(arg, CACHE_TIERS, "tier");
/*
 * Setter for --replication_set: stores the parsed number in replication_set,
 * which set_cache() copies into later cache device entries.
 */
172 static int set_replication_set(NihOption *option, const char *arg)
174 replication_set = strtoul_or_die(arg, CACHE_REPLICATION_SET_MAX,
/* Setter for --meta_replicas: stores the parsed number in meta_replicas. */
179 static int set_meta_replicas(NihOption *option, const char *arg)
181 meta_replicas = strtoul_or_die(arg, CACHE_SET_META_REPLICAS_WANT_MAX,
/* Setter for --data_replicas: stores the parsed number in data_replicas. */
186 static int set_data_replicas(NihOption *option, const char *arg)
188 data_replicas = strtoul_or_die(arg, CACHE_SET_DATA_REPLICAS_WANT_MAX,
/*
 * Setter for --cache_mode: looks arg up in bdev_cache_mode and stores the
 * result in cache_mode, which is later passed to write_backingdev_sb().
 */
193 static int set_cache_mode(NihOption *option, const char *arg)
195 cache_mode = read_string_list_or_die(arg, bdev_cache_mode,
/*
 * Setter for -v/--version: stores strtoul_or_die(arg, 2, "version") in
 * version. The literal 2 is presumably a limit on the accepted value —
 * confirm against strtoul_or_die().
 */
200 static int set_version(NihOption *option, const char *arg)
202 version = strtoul_or_die(arg, 2, "version");
/*
 * Option table for the format command. Each entry follows the field order
 * given in the comment on the next line. Entries with a setter call it with
 * the option argument; set_csum_type additionally receives its destination
 * through the value pointer.
 *
 * Options such as bucket size, replacement policy and replication set are
 * snapshotted by set_cache() when -C is seen, so they apply to devices that
 * follow them on the command line.
 */
206 NihOption opts_format[] = {
207 // { int shortoption, char *longoption, char *help, NihOptionGroup, char *argname, void *value, NihOptionSetter}
209 { 'C', "cache", N_("Format a cache device"),
210 NULL, "dev", NULL, set_cache },
211 { 'B', "bdev", N_("Format a backing device"),
212 NULL, "dev", NULL, set_bdev },
214 { 'l', "label", N_("label"),
215 NULL, "label", &label, NULL},
216 { 0, "cset_uuid", N_("UUID for the cache set"),
217 NULL, "uuid", NULL, set_cache_set_uuid },
/* Help text previously lacked its closing parenthesis. */
219 { 'w', "block", N_("block size (hard sector size of SSD, often 2k)"),
220 NULL, "size", NULL, set_block_size },
221 { 'b', "bucket", N_("bucket size"),
222 NULL, "size", NULL, set_bucket_sizes },
223 { 'n', "btree_node", N_("Btree node size, default 256k"),
224 NULL, "size", NULL, set_btree_node_size },
225 { 0, "fs_size", N_("Size of filesystem on device" ),
226 NULL, "size", NULL, set_filesystem_size },
228 { 'p', "cache_replacement_policy", NULL,
229 NULL, "(lru|fifo|random)", NULL, set_replacement_policy },
231 { 0, "metadata_csum_type", N_("Checksum type"),
232 NULL, "(none|crc32c|crc64)", &meta_csum_type, set_csum_type },
234 { 0, "data_csum_type", N_("Checksum type"),
235 NULL, "(none|crc32c|crc64)", &data_csum_type, set_csum_type },
237 { 0, "compression_type", N_("Compression type"),
238 NULL, "(none|gzip)", NULL, set_compression_type },
240 { 0, "error_action", N_("Action to take on filesystem error"),
241 NULL, "(continue|readonly|panic)", NULL, set_on_error_action },
/* No argument name and no setter: a plain flag stored through &discard. */
243 { 0, "discard", N_("Enable discards"),
244 NULL, NULL, &discard, NULL },
246 { 't', "tier", N_("tier of subsequent devices"),
247 NULL, "#", NULL, set_tier },
249 { 0, "replication_set", N_("replication set of subsequent devices"),
250 NULL, "#", NULL, set_replication_set },
252 { 0, "meta_replicas", N_("number of metadata replicas"),
253 NULL, "#", NULL, set_meta_replicas },
255 { 0, "data_replicas", N_("number of data replicas"),
256 NULL, "#", NULL, set_data_replicas },
/* Argument name previously lacked its closing parenthesis. */
258 { 0, "cache_mode", N_("Cache mode (for backing devices)"),
259 NULL, "(writethrough|writeback|writearound)", NULL, set_cache_mode },
/*
 * NOTE(review): data_offset is a u64, but this entry takes an argument and
 * has no setter. Confirm how the option parser stores an argument through
 * `value` in that case; if it stores a string pointer (as for &label), this
 * option needs a numeric setter.
 */
261 { 'o', "data_offset", N_("data offset in sectors"),
262 NULL, "offset", &data_offset, NULL},
264 { 'v', "version", N_("superblock version"),
265 NULL, "#", NULL, set_version},
/*
 * NOTE(review): body not visible here. Going by the name, this presumably
 * rounds n down to a power of two — confirm. Used by cmd_format() to pick a
 * bucket size for small devices.
 */
270 static unsigned rounddown_pow_of_two(unsigned n)
/*
 * NOTE(review): body not visible here. Going by the name, this presumably
 * returns the integer base-2 logarithm of n — confirm, including what it
 * does for n == 0. Used by cmd_format() when choosing a default bucket size.
 */
282 static unsigned ilog2(u64 n)
/*
 * Zero the first SB_START bytes of the device open on fd, then write `bytes`
 * bytes from `sb` at byte offset SB_START.
 *
 * fd:    descriptor of the device being formatted
 * sb:    start of the superblock image to write
 * bytes: length of the superblock image in bytes
 *
 * perror() appends ": <error text>" and a newline to the message itself, so
 * the messages below carry no trailing newline of their own.
 */
294 void __do_write_sb(int fd, void *sb, size_t bytes)
296 char zeroes[SB_START] = {0};
298 /* Zero start of disk */
299 if (pwrite(fd, zeroes, SB_START, 0) != SB_START) {
300 perror("write error trying to zero start of disk");
303 /* Write superblock */
304 if (pwrite(fd, sb, bytes, SB_START) != bytes) {
305 perror("write error trying to write superblock");
/*
 * Write superblock _sb to _fd. The length is the distance from the start of
 * the superblock to __bset_bkey_last(_sb), computed with void-pointer
 * arithmetic (a GNU C extension). Arguments are parenthesized so that
 * expression arguments expand safely; _sb is evaluated more than once.
 * The trailing semicolon is part of the expansion and is kept so that
 * existing call sites expand exactly as before.
 */
313 #define do_write_sb(_fd, _sb) \
314 __do_write_sb((_fd), (_sb), ((void *) __bset_bkey_last(_sb)) - (void *) (_sb));
/*
 * Build a backing-device superblock in a local struct cache_sb and write it
 * to fd with do_write_sb().
 *
 * fd:          descriptor of the backing device
 * block_size:  stored in sb.block_size
 * mode:        cache mode, stored with SET_BDEV_CACHE_MODE()
 * data_offset: stored in sb.bdev_data_offset only when it differs from
 *              BDEV_DATA_START_DEFAULT
 * label:       copied into sb.label
 */
316 void write_backingdev_sb(int fd, unsigned block_size, unsigned mode,
317 u64 data_offset, const char *label,
323 memset(&sb, 0, sizeof(struct cache_sb));
325 sb.offset = SB_SECTOR;
326 sb.version = BCACHE_SB_VERSION_BDEV;
327 sb.magic = BCACHE_MAGIC;
/* Each backing device gets a freshly generated disk UUID. */
328 uuid_generate(sb.disk_uuid.b);
329 sb.set_uuid = set_uuid;
330 sb.block_size = block_size;
332 uuid_unparse(sb.disk_uuid.b, uuid_str);
/*
 * NOTE(review): this copies SB_LABEL_SIZE bytes from label regardless of the
 * string's actual length; confirm label is non-NULL and at least that long
 * here, or that a guard exists on a line not shown.
 */
334 memcpy(sb.label, label, SB_LABEL_SIZE);
336 SET_BDEV_CACHE_MODE(&sb, mode);
/* A non-default data offset switches the superblock to the WITH_OFFSET version. */
338 if (data_offset != BDEV_DATA_START_DEFAULT) {
339 sb.version = BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
340 sb.bdev_data_offset = data_offset;
/* The checksum is computed after the fields above have been filled in. */
343 sb.csum = csum_set(&sb, BCH_CSUM_CRC64);
348 "data_offset: %llu\n",
349 uuid_str, (unsigned) sb.version,
350 sb.block_size, data_offset);
352 do_write_sb(fd, &sb);
/*
 * Write a struct cache_sb_v0 superblock (version
 * BCACHE_SB_VERSION_CDEV_WITH_UUID) to every cache device.
 */
355 static void format_v0(void)
/* In this format the user UUID is used as the set UUID. */
357 set_uuid = user_uuid;
/*
 * This superblock holds a single bucket size, so reduce to the smallest
 * per-device value.
 * NOTE(review): the reduction starts from the global bucket_size; if that is
 * still 0 (no -b given), min() will keep it at 0 — confirm.
 */
359 for (struct cache_opts *i = cache_devices;
360 i < cache_devices + nr_cache_devices;
362 bucket_size = min(bucket_size, i->bucket_size);
/* NOTE(review): no NULL check on the calloc() result is visible in these lines. */
364 struct cache_sb_v0 *sb = calloc(1, sizeof(*sb));
/* Fields shared by every device in the set. */
366 sb->offset = SB_SECTOR;
367 sb->version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
368 sb->magic = BCACHE_MAGIC;
369 sb->block_size = block_size;
370 sb->bucket_size = bucket_size;
371 sb->set_uuid = set_uuid;
372 sb->nr_in_set = nr_cache_devices;
/*
 * NOTE(review): copies sizeof(sb->label) bytes from the global label; confirm
 * label is non-NULL and long enough here, or that a guard exists on a line
 * not shown.
 */
375 memcpy(sb->label, label, sizeof(sb->label));
/*
 * The same buffer is reused for each device: the per-device fields are
 * overwritten and the checksum recomputed before each write.
 */
377 for (struct cache_opts *i = cache_devices;
378 i < cache_devices + nr_cache_devices;
380 char uuid_str[40], set_uuid_str[40];
382 uuid_generate(sb->uuid.b);
383 sb->nbuckets = i->nbuckets;
384 sb->first_bucket = i->first_bucket;
/* The device's index within cache_devices[]. */
385 sb->nr_this_dev = i - cache_devices;
386 sb->csum = csum_set(sb, BCH_CSUM_CRC64);
388 uuid_unparse(sb->uuid.b, uuid_str);
389 uuid_unparse(sb->set_uuid.b, set_uuid_str);
398 "first_bucket: %u\n",
399 uuid_str, set_uuid_str,
400 (unsigned) sb->version,
408 do_write_sb(i->fd, sb);
/*
 * Build one BCACHE_SB_VERSION_CDEV_V3 superblock holding a cache_member
 * entry per cache device, then write it to every cache device with that
 * device's UUID in disk_uuid.
 */
412 static void format_v1(void)
/* The allocation includes room for the trailing array of cache_member entries. */
416 sb = calloc(1, sizeof(*sb) + sizeof(struct cache_member) *
419 sb->offset = SB_SECTOR;
420 sb->version = BCACHE_SB_VERSION_CDEV_V3;
421 sb->magic = BCACHE_MAGIC;
422 sb->block_size = block_size;
423 sb->set_uuid = set_uuid;
424 sb->user_uuid = user_uuid;
/*
 * NOTE(review): copies sizeof(sb->label) bytes from the global label; confirm
 * label is non-NULL and long enough here, or that a guard exists on a line
 * not shown.
 */
427 memcpy(sb->label, label, sizeof(sb->label));
430 * don't have a userspace crc32c implementation handy, just always use
/*
 * The superblock's own checksum type is fixed to CRC64; the preferred
 * metadata/data checksum types come from the command-line options.
 */
433 SET_CACHE_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64);
434 SET_CACHE_META_PREFERRED_CSUM_TYPE(sb, meta_csum_type);
435 SET_CACHE_DATA_PREFERRED_CSUM_TYPE(sb, data_csum_type);
436 SET_CACHE_COMPRESSION_TYPE(sb, compression_type);
438 SET_CACHE_BTREE_NODE_SIZE(sb, btree_node_size);
/* The WANT and HAVE replica counts are both initialised to the requested value. */
439 SET_CACHE_SET_META_REPLICAS_WANT(sb, meta_replicas);
440 SET_CACHE_SET_META_REPLICAS_HAVE(sb, meta_replicas);
441 SET_CACHE_SET_DATA_REPLICAS_WANT(sb, data_replicas);
442 SET_CACHE_SET_DATA_REPLICAS_HAVE(sb, data_replicas);
443 SET_CACHE_ERROR_ACTION(sb, on_error_action);
/* First pass: append one member entry per cache device, incrementing nr_in_set. */
445 for (struct cache_opts *i = cache_devices;
446 i < cache_devices + nr_cache_devices;
448 struct cache_member *m = sb->members + sb->nr_in_set++;
450 uuid_generate(m->uuid.b);
451 m->nbuckets = i->nbuckets;
452 m->first_bucket = i->first_bucket;
453 m->bucket_size = i->bucket_size;
/* A device needs at least 1 << 7 (128) buckets. */
455 if (m->nbuckets < 1 << 7)
456 die("Not enough buckets: %llu, need %u",
457 m->nbuckets, 1 << 7);
459 SET_CACHE_TIER(m, i->tier);
460 SET_CACHE_REPLICATION_SET(m, i->replication_set);
461 SET_CACHE_REPLACEMENT(m, i->replacement_policy);
462 SET_CACHE_DISCARD(m, discard);
465 sb->u64s = bch_journal_buckets_offset(sb);
/*
 * Second pass: the same buffer is written to each device; only disk_uuid and
 * the checksum change between writes.
 */
467 for (unsigned i = 0; i < sb->nr_in_set; i++) {
468 char uuid_str[40], set_uuid_str[40];
469 struct cache_member *m = sb->members + i;
471 sb->disk_uuid = m->uuid;
473 sb->csum = csum_set(sb,
474 CACHE_SB_CSUM_TYPE(sb));
476 uuid_unparse(sb->disk_uuid.b, uuid_str);
/* Note: set_uuid_str is filled from user_uuid here, not from set_uuid. */
477 uuid_unparse(sb->user_uuid.b, set_uuid_str);
486 "first_bucket: %u\n",
487 uuid_str, set_uuid_str,
488 (unsigned) sb->version,
496 do_write_sb(cache_devices[i].fd, sb);
500 int cmd_format(NihCommand *command, char *const *args)
502 if (!nr_cache_devices && !nr_backing_devices)
503 die("Please supply a device");
505 if (uuid_is_null(user_uuid.b))
506 uuid_generate(user_uuid.b);
508 uuid_generate(set_uuid.b);
511 for (struct cache_opts *i = cache_devices;
512 i < cache_devices + nr_cache_devices;
514 block_size = max(block_size, get_blocksize(i->dev, i->fd));
516 for (struct backingdev_opts *i = backing_devices;
517 i < backing_devices + nr_backing_devices;
519 block_size = max(block_size, get_blocksize(i->dev, i->fd));
522 for (struct cache_opts *i = cache_devices;
523 i < cache_devices + nr_cache_devices;
525 if (!i->bucket_size) {
526 u64 size = (i->filesystem_size ?:
527 getblocks(i->fd)) << 9;
529 if (size < 1 << 20) /* 1M device - 256 4k buckets*/
530 i->bucket_size = rounddown_pow_of_two(size >> 17);
532 /* Max 1M bucket at around 256G */
533 i->bucket_size = 8 << min((ilog2(size >> 20) / 2), 9U);
536 if (i->bucket_size < block_size)
537 die("Bucket size cannot be smaller than block size");
539 i->nbuckets = (i->filesystem_size ?:
540 getblocks(i->fd)) / i->bucket_size;
541 i->first_bucket = (23 / i->bucket_size) + 3;
543 if (i->nbuckets < 1 << 7)
544 die("Not enough buckets: %llu, need %u",
545 i->nbuckets, 1 << 7);
548 if (!btree_node_size) {
549 /* 256k default btree node size */
550 btree_node_size = 512;
552 for (struct cache_opts *i = cache_devices;
553 i < cache_devices + nr_cache_devices;
555 btree_node_size = min(btree_node_size, i->bucket_size);
567 for (struct backingdev_opts *i = backing_devices;
568 i < backing_devices + nr_backing_devices;
570 write_backingdev_sb(i->fd, block_size, cache_mode,
571 data_offset, i->label,