2 * Authors: Kent Overstreet <kent.overstreet@gmail.com>
3 * Gabriel de Perthuis <g2p.code@gmail.com>
4 * Jacob Malevich <jam@datera.io>
15 #include <sys/types.h>
18 #include <uuid/uuid.h>
20 #include <nih/command.h>
21 #include <nih/option.h>
23 #include "ccan/ilog/ilog.h"
24 #include "ccan/darray/darray.h"
27 #include "bcache-format.h"
/*
 * Per-cache-device fields.  set_cache() fills replacement_policy,
 * replication_set and size from the option globals of the same purpose
 * (size comes from filesystem_size); cmd_format() later overwrites size with
 * the value returned by get_size().
 */
34 unsigned replacement_policy;
35 unsigned replication_set;
36 u64 size; /* 512 byte sectors */
/*
 * One entry per backing device requested with -B/--bdev.  set_bdev() stores a
 * strdup()ed copy of the current label in .label; cmd_format() reads .dev,
 * .fd and .label when writing the backing device superblock.
 */
42 struct backingdev_opts {
/* Device lists appended to by set_cache() and set_bdev() respectively. */
48 static darray(struct cache_opts) cache_devices;
49 static darray(struct backingdev_opts) backing_devices;
/* Target of the -l/--label option; stays NULL when no label was given. */
51 static char *label = NULL;
53 /* All in units of 512 byte sectors */
54 static unsigned block_size, bucket_size, btree_node_size;
55 static u64 filesystem_size;
/* Current values copied into each cache device entry by set_cache(). */
56 static unsigned tier, replacement_policy;
/*
 * set_uuid is generated in cmd_format(); user_uuid is parsed by
 * set_cache_set_uuid() and generated in cmd_format() only if still null.
 */
58 static uuid_le set_uuid, user_uuid;
/* Preferred checksum and compression types recorded by format_v1(). */
59 static unsigned meta_csum_type = BCH_CSUM_CRC32C;
60 static unsigned data_csum_type = BCH_CSUM_CRC32C;
61 static unsigned compression_type = BCH_COMPRESSION_NONE;
63 static unsigned replication_set, meta_replicas = 1, data_replicas = 1;
64 static unsigned on_error_action;
/* Set by set_version(), which accepts values up to 2. */
66 static unsigned version = 1;
/* Backing-device-only settings passed to write_backingdev_sb(). */
68 static u64 data_offset = BDEV_DATA_START_DEFAULT;
69 static unsigned cache_mode = CACHE_MODE_WRITEBACK;
/*
 * Option setter for -C/--cache.  Appends a struct cache_opts entry to
 * cache_devices, initialised from the values that bucket_size,
 * replacement_policy, replication_set and filesystem_size hold at the time of
 * the call.
 */
71 static int set_cache(NihOption *option, const char *arg)
73 darray_append(cache_devices, (struct cache_opts) {
76 .bucket_size = bucket_size,
78 .replacement_policy = replacement_policy,
79 .replication_set = replication_set,
80 .size = filesystem_size,
/*
 * Option setter for -B/--bdev.  Appends a struct backingdev_opts entry to
 * backing_devices; its label is a strdup() copy of the global label, or NULL
 * when no label is currently set.
 */
85 static int set_bdev(NihOption *option, const char *arg)
87 darray_append(backing_devices, (struct backingdev_opts) {
90 .label = label ? strdup(label) : NULL,
/*
 * Option setter for --cset_uuid.  Parses arg into user_uuid with
 * uuid_parse(); the guarded branch runs when uuid_parse() returns non-zero.
 */
95 static int set_cache_set_uuid(NihOption *option, const char *arg)
97 if (uuid_parse(arg, user_uuid.b))
/*
 * Option setter for -w/--block.  Stores the result of
 * hatoi_validate(arg, "block size") in the global block_size.
 */
102 static int set_block_size(NihOption *option, const char *arg)
104 block_size = hatoi_validate(arg, "block size");
/*
 * Option setter for -b/--bucket.  Stores the result of
 * hatoi_validate(arg, "bucket size") in the global bucket_size, which
 * set_cache() copies into each cache device entry it creates.
 */
108 static int set_bucket_sizes(NihOption *option, const char *arg)
110 bucket_size = hatoi_validate(arg, "bucket size");
/*
 * Option setter for -n/--btree_node.  Stores the result of
 * hatoi_validate(arg, "btree node size") in the global btree_node_size.
 */
114 static int set_btree_node_size(NihOption *option, const char *arg)
116 btree_node_size = hatoi_validate(arg, "btree node size");
/*
 * Option setter for --fs_size.  Parses arg with hatoi() (not
 * hatoi_validate(), unlike the other size setters) and shifts the result
 * right by 9, i.e. divides it by 512, before storing it in filesystem_size.
 */
120 static int set_filesystem_size(NihOption *option, const char *arg)
122 filesystem_size = hatoi(arg) >> 9;
/*
 * Option setter for -p/--cache_replacement_policy.  Looks arg up in
 * replacement_policies via read_string_list_or_die() and stores the result in
 * the global replacement_policy.
 */
126 static int set_replacement_policy(NihOption *option, const char *arg)
128 replacement_policy = read_string_list_or_die(arg, replacement_policies,
129 "replacement policy");
/*
 * Option setter shared by --metadata_csum_type and --data_csum_type.  The
 * destination is taken from option->value, which opts_format points at
 * meta_csum_type or data_csum_type; the stored value is the result of looking
 * arg up in csum_types via read_string_list_or_die().
 */
133 static int set_csum_type(NihOption *option, const char *arg)
135 unsigned *csum_type = option->value;
137 *csum_type = read_string_list_or_die(arg, csum_types, "checksum type");
/*
 * Option setter for --compression_type.  Looks arg up in compression_types
 * via read_string_list_or_die() and stores the result in compression_type.
 */
141 static int set_compression_type(NihOption *option, const char *arg)
143 compression_type = read_string_list_or_die(arg, compression_types,
/*
 * Option setter for --error_action.  Looks arg up in error_actions via
 * read_string_list_or_die() and stores the result in on_error_action.
 */
148 static int set_on_error_action(NihOption *option, const char *arg)
150 on_error_action = read_string_list_or_die(arg, error_actions,
/*
 * Option setter for -t/--tier.  Stores the result of
 * strtoul_or_die(arg, CACHE_TIERS, "tier") in the global tier.
 */
155 static int set_tier(NihOption *option, const char *arg)
157 tier = strtoul_or_die(arg, CACHE_TIERS, "tier");
/*
 * Option setter for --replication_set.  Parses arg with strtoul_or_die(),
 * passing CACHE_REPLICATION_SET_MAX as its second argument, and stores the
 * result in the global replication_set.
 */
161 static int set_replication_set(NihOption *option, const char *arg)
163 replication_set = strtoul_or_die(arg, CACHE_REPLICATION_SET_MAX,
/*
 * Option setter for --meta_replicas.  Parses arg with strtoul_or_die(),
 * passing CACHE_SET_META_REPLICAS_WANT_MAX as its second argument, and stores
 * the result in meta_replicas.
 */
168 static int set_meta_replicas(NihOption *option, const char *arg)
170 meta_replicas = strtoul_or_die(arg, CACHE_SET_META_REPLICAS_WANT_MAX,
/*
 * Option setter for --data_replicas.  Parses arg with strtoul_or_die(),
 * passing CACHE_SET_DATA_REPLICAS_WANT_MAX as its second argument, and stores
 * the result in data_replicas.
 */
175 static int set_data_replicas(NihOption *option, const char *arg)
177 data_replicas = strtoul_or_die(arg, CACHE_SET_DATA_REPLICAS_WANT_MAX,
/*
 * Option setter for --cache_mode.  Looks arg up in bdev_cache_mode via
 * read_string_list_or_die() and stores the result in cache_mode, which
 * cmd_format() passes to write_backingdev_sb().
 */
182 static int set_cache_mode(NihOption *option, const char *arg)
184 cache_mode = read_string_list_or_die(arg, bdev_cache_mode,
/*
 * Option setter for -v/--version.  Stores the result of
 * strtoul_or_die(arg, 2, "version") in the global version.
 */
189 static int set_version(NihOption *option, const char *arg)
191 version = strtoul_or_die(arg, 2, "version");
/*
 * Command-line options of the format command.  Entries with a setter call it
 * with the option argument; entries without one write through the value
 * pointer directly.  Several setters only update file-scope "current value"
 * globals, which set_cache() snapshots when a cache device is added.
 */
195 NihOption opts_format[] = {
196 // { int shortoption, char *longoption, char *help, NihOptionGroup, char *argname, void *value, NihOptionSetter}
198 { 'C', "cache", N_("Format a cache device"),
199 NULL, "dev", NULL, set_cache },
200 { 'B', "bdev", N_("Format a backing device"),
201 NULL, "dev", NULL, set_bdev },
203 { 'l', "label", N_("label"),
204 NULL, "label", &label, NULL},
205 { 0, "cset_uuid", N_("UUID for the cache set"),
206 NULL, "uuid", NULL, set_cache_set_uuid },
208 { 'w', "block", N_("block size (hard sector size of SSD, often 2k)"),
209 NULL, "size", NULL, set_block_size },
210 { 'b', "bucket", N_("bucket size"),
211 NULL, "size", NULL, set_bucket_sizes },
212 { 'n', "btree_node", N_("Btree node size, default 256k"),
213 NULL, "size", NULL, set_btree_node_size },
214 { 0, "fs_size", N_("Size of filesystem on device" ),
215 NULL, "size", NULL, set_filesystem_size },
217 { 'p', "cache_replacement_policy", NULL,
218 NULL, "(lru|fifo|random)", NULL, set_replacement_policy },
/* Both checksum options share set_csum_type(); value selects the target. */
220 { 0, "metadata_csum_type", N_("Checksum type"),
221 NULL, "(none|crc32c|crc64)", &meta_csum_type, set_csum_type },
223 { 0, "data_csum_type", N_("Checksum type"),
224 NULL, "(none|crc32c|crc64)", &data_csum_type, set_csum_type },
226 { 0, "compression_type", N_("Compression type"),
227 NULL, "(none|gzip)", NULL, set_compression_type },
229 { 0, "error_action", N_("Action to take on filesystem error"),
230 NULL, "(continue|readonly|panic)", NULL, set_on_error_action },
232 { 0, "discard", N_("Enable discards"),
233 NULL, NULL, &discard, NULL },
235 { 't', "tier", N_("tier of subsequent devices"),
236 NULL, "#", NULL, set_tier },
238 { 0, "replication_set", N_("replication set of subsequent devices"),
239 NULL, "#", NULL, set_replication_set },
241 { 0, "meta_replicas", N_("number of metadata replicas"),
242 NULL, "#", NULL, set_meta_replicas },
244 { 0, "data_replicas", N_("number of data replicas"),
245 NULL, "#", NULL, set_data_replicas },
247 { 0, "cache_mode", N_("Cache mode (for backing devices)"),
248 NULL, "(writethrough|writeback|writearound)", NULL, set_cache_mode },
/*
 * NOTE(review): this entry has an argument name but no setter.  libnih appears
 * to treat value as a char ** in that case and store a copy of the argument
 * string through it, whereas data_offset is a u64 - confirm against the
 * libnih documentation and add a numeric setter if so.
 */
250 { 'o', "data_offset", N_("data offset in sectors"),
251 NULL, "offset", &data_offset, NULL},
253 { 'v', "version", N_("superblock version"),
254 NULL, "#", NULL, set_version},
/*
 * Write a superblock image to a device.
 *
 * fd:    open descriptor of the device
 * sb:    start of the superblock image
 * bytes: number of bytes of the image to write
 *
 * The first SB_SECTOR << 9 bytes of the device are overwritten with zeroes,
 * then the image is written at byte offset SB_SECTOR << 9.  A pwrite() that
 * does not transfer the full length is reported with perror().  The messages
 * carry no trailing newline because perror() appends ": <error text>\n"
 * itself.
 */
259 void __do_write_sb(int fd, void *sb, size_t bytes)
261 char zeroes[SB_SECTOR << 9] = {0};
263 /* Zero start of disk */
264 if (pwrite(fd, zeroes, SB_SECTOR << 9, 0) != SB_SECTOR << 9) {
265 perror("write error trying to zero start of disk");
268 /* Write superblock */
269 if (pwrite(fd, sb, bytes, SB_SECTOR << 9) != bytes) {
270 perror("write error trying to write superblock");
/*
 * Convenience wrapper around __do_write_sb() that derives the byte count from
 * the superblock itself: the distance from _sb to __bset_bkey_last(_sb),
 * computed with void * arithmetic (a GNU C extension).
 *
 * NOTE(review): the expansion already ends in ';', so the usual
 * `do_write_sb(fd, sb);` call produces an extra empty statement; that only
 * matters if the macro is ever used as the body of an unbraced if/else.
 */
278 #define do_write_sb(_fd, _sb) \
279 __do_write_sb(_fd, _sb, ((void *) __bset_bkey_last(_sb)) - (void *) _sb);
/*
 * Build and write the superblock of a backing device.
 *
 * fd:          open descriptor of the backing device
 * block_size:  stored in sb.block_size (this parameter shadows the file-scope
 *              block_size)
 * mode:        cache mode, stored with SET_BDEV_CACHE_MODE()
 * data_offset: when it differs from BDEV_DATA_START_DEFAULT the superblock
 *              version becomes BCACHE_SB_VERSION_BDEV_WITH_OFFSET and the
 *              offset is stored in sb.bdev_data_offset
 * label:       NUL-terminated label text copied into sb.label
 *
 * The superblock starts zeroed, gets a freshly generated disk UUID, is
 * checksummed with BCH_CSUM_CRC64 and is written with do_write_sb().
 */
281 void write_backingdev_sb(int fd, unsigned block_size, unsigned mode,
282 u64 data_offset, const char *label,
288 memset(&sb, 0, sizeof(struct cache_sb));
290 sb.offset = SB_SECTOR;
291 sb.version = BCACHE_SB_VERSION_BDEV;
292 sb.magic = BCACHE_MAGIC;
293 uuid_generate(sb.disk_uuid.b);
294 sb.set_uuid = set_uuid;
295 sb.block_size = block_size;
297 uuid_unparse(sb.disk_uuid.b, uuid_str);
/*
 * Copy at most SB_LABEL_SIZE bytes and never read past the terminating NUL:
 * callers pass a strdup()ed string that is usually shorter than the field.
 * The rest of sb.label stays zero from the memset() above.
 */
299 memcpy(sb.label, label, strnlen(label, SB_LABEL_SIZE));
301 SET_BDEV_CACHE_MODE(&sb, mode);
303 if (data_offset != BDEV_DATA_START_DEFAULT) {
304 sb.version = BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
305 sb.bdev_data_offset = data_offset;
308 sb.csum = csum_set(&sb, BCH_CSUM_CRC64);
313 "data_offset: %llu\n",
314 uuid_str, (unsigned) sb.version,
315 sb.block_size, data_offset);
317 do_write_sb(fd, &sb);
/*
 * Write a struct cache_sb_v0 superblock to every cache device.
 *
 * set_uuid is overwritten with user_uuid, and the global bucket_size is
 * lowered to the smallest per-device bucket size.  Fields common to all
 * devices are filled in once; then, for each device, a new UUID is generated,
 * nbuckets/first_bucket/nr_this_dev are set, the checksum is recomputed with
 * BCH_CSUM_CRC64 and the superblock is written with do_write_sb().
 */
320 static void format_v0(void)
322 struct cache_opts *i;
324 set_uuid = user_uuid;
326 darray_foreach(i, cache_devices)
327 bucket_size = min(bucket_size, i->bucket_size);
329 struct cache_sb_v0 *sb = calloc(1, sizeof(*sb));
331 sb->offset = SB_SECTOR;
332 sb->version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
333 sb->magic = BCACHE_MAGIC;
334 sb->block_size = block_size;
335 sb->bucket_size = bucket_size;
336 sb->set_uuid = set_uuid;
337 sb->nr_in_set = darray_size(cache_devices);
/*
 * Copy no more than the label's own length: label is a heap string that is
 * usually shorter than sb->label, and the rest of the field is already zero
 * from calloc().
 */
340 memcpy(sb->label, label, strnlen(label, sizeof(sb->label)));
342 darray_foreach(i, cache_devices) {
343 char uuid_str[40], set_uuid_str[40];
345 uuid_generate(sb->uuid.b);
346 sb->nbuckets = i->nbuckets;
347 sb->first_bucket = i->first_bucket;
/* Index of this device within cache_devices. */
348 sb->nr_this_dev = i - cache_devices.item;
349 sb->csum = csum_set(sb, BCH_CSUM_CRC64);
351 uuid_unparse(sb->uuid.b, uuid_str);
352 uuid_unparse(sb->set_uuid.b, set_uuid_str);
361 "first_bucket: %u\n",
362 uuid_str, set_uuid_str,
363 (unsigned) sb->version,
371 do_write_sb(i->fd, sb);
/*
 * Write a BCACHE_SB_VERSION_CDEV_V3 superblock to every cache device.
 *
 * The superblock is allocated with room for one struct cache_member per cache
 * device.  The first loop fills in the member table (dying if a device has
 * fewer than 1 << 7 buckets); sb->u64s is then set from
 * bch_journal_buckets_offset().  The second loop stamps the per-device
 * disk_uuid and nr_this_dev, recomputes the checksum and writes the
 * superblock with do_write_sb().
 */
375 static void format_v1(void)
378 struct cache_opts *i;
380 sb = calloc(1, sizeof(*sb) + sizeof(struct cache_member) *
381 darray_size(cache_devices));
383 sb->offset = SB_SECTOR;
384 sb->version = BCACHE_SB_VERSION_CDEV_V3;
385 sb->magic = BCACHE_MAGIC;
386 sb->block_size = block_size;
387 sb->set_uuid = set_uuid;
388 sb->user_uuid = user_uuid;
389 sb->nr_in_set = darray_size(cache_devices);
392 memcpy(sb->label, label, strnlen(label, sizeof(sb->label)));
395 * don't have a userspace crc32c implementation handy, just always use
398 SET_CACHE_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64);
399 SET_CACHE_META_PREFERRED_CSUM_TYPE(sb, meta_csum_type);
400 SET_CACHE_DATA_PREFERRED_CSUM_TYPE(sb, data_csum_type);
401 SET_CACHE_COMPRESSION_TYPE(sb, compression_type);
403 SET_CACHE_BTREE_NODE_SIZE(sb, btree_node_size);
404 SET_CACHE_SET_META_REPLICAS_WANT(sb, meta_replicas);
405 SET_CACHE_SET_META_REPLICAS_HAVE(sb, meta_replicas);
406 SET_CACHE_SET_DATA_REPLICAS_WANT(sb, data_replicas);
407 SET_CACHE_SET_DATA_REPLICAS_HAVE(sb, data_replicas);
408 SET_CACHE_ERROR_ACTION(sb, on_error_action);
410 darray_foreach(i, cache_devices) {
411 struct cache_member *m = sb->members +
412 (i - cache_devices.item);
414 uuid_generate(m->uuid.b);
415 m->nbuckets = i->nbuckets;
416 m->first_bucket = i->first_bucket;
417 m->bucket_size = i->bucket_size;
419 if (m->nbuckets < 1 << 7)
420 die("Not enough buckets: %llu, need %u",
421 m->nbuckets, 1 << 7);
423 SET_CACHE_TIER(m, i->tier);
424 SET_CACHE_REPLICATION_SET(m, i->replication_set);
425 SET_CACHE_REPLACEMENT(m, i->replacement_policy);
426 SET_CACHE_DISCARD(m, discard);
429 sb->u64s = bch_journal_buckets_offset(sb);
431 darray_foreach(i, cache_devices) {
432 char uuid_str[40], set_uuid_str[40];
433 struct cache_member *m = sb->members +
434 (i - cache_devices.item);
436 sb->disk_uuid = m->uuid;
437 sb->nr_this_dev = i - cache_devices.item;
438 sb->csum = csum_set(sb, CACHE_SB_CSUM_TYPE(sb));
440 uuid_unparse(sb->disk_uuid.b, uuid_str);
441 uuid_unparse(sb->user_uuid.b, set_uuid_str);
450 "first_bucket: %u\n",
451 uuid_str, set_uuid_str,
452 (unsigned) sb->version,
460 do_write_sb(i->fd, sb);
464 int cmd_format(NihCommand *command, char * const *args)
466 struct cache_opts *i;
467 struct backingdev_opts *ib;
469 if (!darray_size(cache_devices) &&
470 !darray_size(backing_devices))
471 die("Please supply a device");
473 if (uuid_is_null(user_uuid.b))
474 uuid_generate(user_uuid.b);
476 uuid_generate(set_uuid.b);
479 darray_foreach(i, cache_devices)
480 block_size = max(block_size,
481 get_blocksize(i->dev, i->fd));
483 darray_foreach(ib, backing_devices)
484 block_size = max(block_size,
485 get_blocksize(ib->dev, ib->fd));
488 darray_foreach(i, cache_devices) {
490 i->size = get_size(i->dev, i->fd);
492 if (!i->bucket_size) {
493 u64 bytes = i->size << 9;
495 if (bytes < 1 << 20) /* 1M device - 256 4k buckets*/
496 i->bucket_size = rounddown_pow_of_two(bytes >> 17);
498 /* Max 1M bucket at around 256G */
499 i->bucket_size = 8 << min((ilog2(bytes >> 20) / 2), 9U);
502 if (i->bucket_size < block_size)
503 die("Bucket size cannot be smaller than block size");
505 i->nbuckets = i->size / i->bucket_size;
506 i->first_bucket = (23 / i->bucket_size) + 3;
508 if (i->nbuckets < 1 << 7)
509 die("Not enough buckets: %llu, need %u",
510 i->nbuckets, 1 << 7);
513 if (!btree_node_size) {
514 /* 256k default btree node size */
515 btree_node_size = 512;
517 darray_foreach(i, cache_devices)
518 btree_node_size = min(btree_node_size, i->bucket_size);
530 darray_foreach(ib, backing_devices)
531 write_backingdev_sb(ib->fd, block_size, cache_mode,
532 data_offset, ib->label,