X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=linux%2Fblkdev.c;h=b7f58737298fa7280f1c949312486c363f85ad14;hb=38b8d01c4cf823d9804451eff5ec811c17b03408;hp=b4ff451831e87aca5db7cbb16b7a40d32dda141b;hpb=b33fc8298f7e13226b9895abc57c9bfce5e3fa2d;p=bcachefs-tools-debian

diff --git a/linux/blkdev.c b/linux/blkdev.c
index b4ff451..b7f5873 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -8,11 +8,32 @@
 #include <sys/uio.h>
 #include <unistd.h>
 
+#include <libaio.h>
+
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/completion.h>
 #include <linux/fs.h>
+#include <linux/kthread.h>
 
-int submit_bio_wait(struct bio *bio)
+#include "tools-util.h"
+
+struct fops {
+	void (*init)(void);
+	void (*cleanup)(void);
+	void (*read)(struct bio *bio, struct iovec *iov, unsigned i);
+	void (*write)(struct bio *bio, struct iovec *iov, unsigned i);
+};
+
+static struct fops *fops;
+static io_context_t aio_ctx;
+static atomic_t running_requests;
+
+void generic_make_request(struct bio *bio)
 {
 	struct iovec *iov;
 	struct bvec_iter iter;
@@ -20,8 +41,15 @@ int submit_bio_wait(struct bio *bio)
 	ssize_t ret;
 	unsigned i;
 
-	if (bio->bi_opf & REQ_PREFLUSH)
-		fdatasync(bio->bi_bdev->bd_fd);
+	if (bio->bi_opf & REQ_PREFLUSH) {
+		ret = fdatasync(bio->bi_bdev->bd_fd);
+		if (ret) {
+			fprintf(stderr, "fsync error: %m\n");
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			return;
+		}
+	}
 
 	i = 0;
 	bio_for_each_segment(bv, bio, iter)
@@ -30,44 +58,74 @@ int submit_bio_wait(struct bio *bio)
 	iov = alloca(sizeof(*iov) * i);
 
 	i = 0;
-	bio_for_each_segment(bv, bio, iter)
+	bio_for_each_segment(bv, bio, iter) {
+		void *start = page_address(bv.bv_page) + bv.bv_offset;
+		size_t len = bv.bv_len;
+
 		iov[i++] = (struct iovec) {
-			.iov_base	= page_address(bv.bv_page) + bv.bv_offset,
-			.iov_len	= bv.bv_len,
+			.iov_base	= start,
+			.iov_len	= len,
 		};
+
+#ifdef CONFIG_VALGRIND
+		/* To be pedantic, this should only happen on IO completion. */
+		if (bio_op(bio) == REQ_OP_READ)
+			VALGRIND_MAKE_MEM_DEFINED(start, len);
+#endif
+	}
 
 	switch (bio_op(bio)) {
 	case REQ_OP_READ:
-		ret = preadv(bio->bi_bdev->bd_fd, iov, i,
-			     bio->bi_iter.bi_sector << 9);
+		fops->read(bio, iov, i);
 		break;
 	case REQ_OP_WRITE:
-		ret = pwritev(bio->bi_bdev->bd_fd, iov, i,
-			      bio->bi_iter.bi_sector << 9);
+		fops->write(bio, iov, i);
+		break;
+	case REQ_OP_FLUSH:
+		ret = fsync(bio->bi_bdev->bd_fd);
+		if (ret)
+			die("fsync error: %m");
+		bio_endio(bio);
 		break;
 	default:
 		BUG();
 	}
+}
 
-	if (bio->bi_opf & REQ_FUA)
-		fdatasync(bio->bi_bdev->bd_fd);
-
-	return ret == bio->bi_iter.bi_size ? 0 : -EIO;
+static void submit_bio_wait_endio(struct bio *bio)
+{
+	complete(bio->bi_private);
 }
 
-void generic_make_request(struct bio *bio)
+int submit_bio_wait(struct bio *bio)
 {
-	bio->bi_error = submit_bio_wait(bio);
-	bio_endio(bio);
+	struct completion done;
+
+	init_completion(&done);
+	bio->bi_private	= &done;
+	bio->bi_end_io	= submit_bio_wait_endio;
+	bio->bi_opf	|= REQ_SYNC;
+	submit_bio(bio);
+	wait_for_completion(&done);
+
+	return blk_status_to_errno(bio->bi_status);
 }
 
 int blkdev_issue_discard(struct block_device *bdev,
 			 sector_t sector, sector_t nr_sects,
-			 gfp_t gfp_mask, unsigned long flags)
+			 gfp_t gfp_mask)
 {
 	return 0;
 }
 
+int blkdev_issue_zeroout(struct block_device *bdev,
+			 sector_t sector, sector_t nr_sects,
+			 gfp_t gfp_mask, unsigned flags)
+{
+	/* Not yet implemented: */
+	BUG();
+}
+
 unsigned bdev_logical_block_size(struct block_device *bdev)
 {
 	struct stat statbuf;
@@ -78,12 +136,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev)
 	BUG_ON(ret);
 
 	if (!S_ISBLK(statbuf.st_mode))
-		return statbuf.st_blksize >> 9;
+		return statbuf.st_blksize;
 
-	ret = ioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
-	BUG_ON(ret);
-
-	return blksize >> 9;
+	xioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
+	return blksize;
 }
 
 sector_t get_capacity(struct gendisk *disk)
@@ -106,50 +162,267 @@ sector_t get_capacity(struct gendisk *disk)
 	return bytes >> 9;
 }
 
-void blkdev_put(struct block_device *bdev, fmode_t mode)
+void bdev_release(struct bdev_handle *handle)
 {
-	fdatasync(bdev->bd_fd);
-	close(bdev->bd_fd);
-	free(bdev);
+	fdatasync(handle->bdev->bd_fd);
+	close(handle->bdev->bd_fd);
+	free(handle->bdev);
+	free(handle);
 }
 
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
-					void *holder)
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+				      void *holder, const struct blk_holder_ops *hop)
 {
-	struct block_device *bdev;
-	int flags = O_DIRECT;
+	int fd, flags = 0;
 
-	if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+	if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
 		flags = O_RDWR;
-	else if (mode & FMODE_READ)
+	else if (mode & BLK_OPEN_READ)
 		flags = O_RDONLY;
-	else if (mode & FMODE_WRITE)
+	else if (mode & BLK_OPEN_WRITE)
 		flags = O_WRONLY;
 
-	if (mode & FMODE_EXCL)
+	if (!(mode & BLK_OPEN_BUFFERED))
+		flags |= O_DIRECT;
+
+	if (mode & BLK_OPEN_EXCL)
 		flags |= O_EXCL;
 
-	bdev = malloc(sizeof(*bdev));
+	fd = open(path, flags);
+	if (fd < 0)
+		return ERR_PTR(-errno);
+
+	struct block_device *bdev = malloc(sizeof(*bdev));
 	memset(bdev, 0, sizeof(*bdev));
 
 	strncpy(bdev->name, path, sizeof(bdev->name));
 	bdev->name[sizeof(bdev->name) - 1] = '\0';
 
-	bdev->bd_fd	= open(path, flags);
-	bdev->bd_holder = holder;
-	bdev->bd_disk	= &bdev->__bd_disk;
+	bdev->bd_dev	= xfstat(fd).st_rdev;
+	bdev->bd_fd	= fd;
+	bdev->bd_holder	= holder;
+	bdev->bd_disk	= &bdev->__bd_disk;
+	bdev->bd_disk->bdi = &bdev->bd_disk->__bdi;
+	bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
 
-	BUG_ON(bdev->bd_fd < 0);
+	struct bdev_handle *handle = calloc(1, sizeof(*handle));
+	handle->bdev	= bdev;
+	handle->holder	= holder;
+	handle->mode	= mode;
 
-	return bdev;
+	return handle;
 }
 
-void bdput(struct block_device *bdev)
+int lookup_bdev(const char *path, dev_t *dev)
 {
-	BUG();
+	return -EINVAL;
+}
+
+static void io_fallback(void)
+{
+	fops++;
+	if (fops->init == NULL)
+		die("no fallback possible, something is very wrong");
+	fops->init();
+}
+
+static void sync_check(struct bio *bio, int ret)
+{
+	if (ret != bio->bi_iter.bi_size) {
+		die("IO error: %s\n", strerror(errno));
+	}
+
+	if (bio->bi_opf & REQ_FUA) {
+		ret = fdatasync(bio->bi_bdev->bd_fd);
+		if (ret)
+			die("fsync error: %s\n", strerror(errno));
+	}
+	bio_endio(bio);
+}
+
+static void sync_init(void) {}
+
+static void sync_cleanup(void)
+{
+	/* not necessary? */
+	sync();
+}
+
+static void sync_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+
+	ssize_t ret = preadv(bio->bi_bdev->bd_fd, iov, i,
+			     bio->bi_iter.bi_sector << 9);
+	sync_check(bio, ret);
+}
+
+static void sync_write(struct bio *bio, struct iovec *iov, unsigned i)
+{
+	ssize_t ret = pwritev2(bio->bi_bdev->bd_fd, iov, i,
+			       bio->bi_iter.bi_sector << 9,
+			       bio->bi_opf & REQ_FUA ? RWF_SYNC : 0);
+	sync_check(bio, ret);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(aio_events_completed);
+
+static int aio_completion_thread(void *arg)
+{
+	struct io_event events[8], *ev;
+	int ret;
+	bool stop = false;
+
+	while (!stop) {
+		ret = io_getevents(aio_ctx, 1, ARRAY_SIZE(events),
+				   events, NULL);
+
+		if (ret == -EINTR)
+			continue;
+		if (ret < 0)
+			die("io_getevents() error: %s", strerror(-ret));
+		if (ret)
+			wake_up(&aio_events_completed);
+
+		for (ev = events; ev < events + ret; ev++) {
+			struct bio *bio = (struct bio *) ev->data;
+
+			/* This should only happen during blkdev_cleanup() */
+			if (!bio) {
+				BUG_ON(atomic_read(&running_requests) != 0);
+				stop = true;
+				continue;
+			}
+
+			if (ev->res != bio->bi_iter.bi_size)
+				bio->bi_status = BLK_STS_IOERR;
+
+			bio_endio(bio);
+			atomic_dec(&running_requests);
+		}
+	}
+
+	return 0;
+}
+
+static struct task_struct *aio_task = NULL;
+
+static void aio_init(void)
+{
+	struct task_struct *p;
+	long err = io_setup(256, &aio_ctx);
+	if (!err) {
+		p = kthread_run(aio_completion_thread, NULL, "aio_completion");
+		BUG_ON(IS_ERR(p));
+
+		aio_task = p;
+
+	} else if (err == -ENOSYS) {
+		io_fallback();
+	} else {
+		die("io_setup() error: %s", strerror(-err));
+	}
+}
+
+static void aio_cleanup(void)
+{
+	struct task_struct *p = NULL;
+	swap(aio_task, p);
+	get_task_struct(p);
+
+	/* I mean, really?! IO_CMD_NOOP is even defined, but not implemented. */
+	int fds[2];
+	int ret = pipe(fds);
+	if (ret != 0)
+		die("pipe err: %s", strerror(errno));
+
+	/* Wake up the completion thread with spurious work. */
+	int junk = 0;
+	struct iocb iocb = {
+		.aio_lio_opcode	= IO_CMD_PWRITE,
+		.data		= NULL, /* Signal to stop */
+		.aio_fildes	= fds[1],
+		.u.c.buf	= &junk,
+		.u.c.nbytes	= 1,
+	}, *iocbp = &iocb;
+	ret = io_submit(aio_ctx, 1, &iocbp);
+	if (ret != 1)
+		die("io_submit cleanup err: %s", strerror(-ret));
+
+	ret = kthread_stop(p);
+	BUG_ON(ret);
+
+	put_task_struct(p);
+
+	close(fds[0]);
+	close(fds[1]);
+}
+
+static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode)
+{
+	ssize_t ret;
+	struct iocb iocb = {
+		.data		= bio,
+		.aio_fildes	= bio->bi_bdev->bd_fd,
+		.aio_rw_flags	= bio->bi_opf & REQ_FUA ? RWF_SYNC : 0,
+		.aio_lio_opcode	= opcode,
+		.u.c.buf	= iov,
+		.u.c.nbytes	= i,
+		.u.c.offset	= bio->bi_iter.bi_sector << 9,
+
+	}, *iocbp = &iocb;
+
+	atomic_inc(&running_requests);
+
+	wait_event(aio_events_completed,
+		   (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN);
+
+	if (ret != 1)
+		die("io_submit err: %s", strerror(-ret));
+}
+
+static void aio_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+	aio_op(bio, iov, i, IO_CMD_PREADV);
+}
+
+static void aio_write(struct bio *bio, struct iovec *iov, unsigned i)
+{
+	aio_op(bio, iov, i, IO_CMD_PWRITEV);
+}
+
+
+/* not implemented */
+static void uring_init(void) {
+	io_fallback();
+}
+
+struct fops fops_list[] = {
+	{
+		.init		= uring_init,
+	}, {
+		.init		= aio_init,
+		.cleanup	= aio_cleanup,
+		.read		= aio_read,
+		.write		= aio_write,
+	}, {
+		.init		= sync_init,
+		.cleanup	= sync_cleanup,
+		.read		= sync_read,
+		.write		= sync_write,
+	}, {
+		/* NULL */
+	}
+};
+
+__attribute__((constructor(102)))
+static void blkdev_init(void)
+{
+	fops = fops_list;
+	fops->init();
 }
 
-struct block_device *lookup_bdev(const char *path)
+__attribute__((destructor(102)))
+static void blkdev_cleanup(void)
 {
-	return ERR_PTR(-EINVAL);
+	fops->cleanup();
 }
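
A note on the completion-based submit_bio_wait(): the old version did the IO synchronously and returned the error itself; the new one submits through generic_make_request() and sleeps on a struct completion that bio_endio() eventually fires via submit_bio_wait_endio(), whether the IO finished inline in the sync backend or on the aio completion thread. The userspace shim behind linux/completion.h is not part of this patch; a rough pthread-based sketch of the two primitives used here (an assumption about the shim, not its actual implementation) would be:

    #include <pthread.h>
    #include <stdbool.h>

    /* Userspace stand-in for the kernel's struct completion: a "done"
     * flag guarded by a mutex, plus a condition variable to sleep on. */
    struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  wait;
        bool            done;
    };

    static void init_completion(struct completion *c)
    {
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->wait, NULL);
        c->done = false;
    }

    /* Called from the IO completion path (here: submit_bio_wait_endio()). */
    static void complete(struct completion *c)
    {
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_broadcast(&c->wait);
        pthread_mutex_unlock(&c->lock);
    }

    /* Called by the submitter; returns once complete() has run. */
    static void wait_for_completion(struct completion *c)
    {
        pthread_mutex_lock(&c->lock);
        while (!c->done)
            pthread_cond_wait(&c->wait, &c->lock);
        pthread_mutex_unlock(&c->lock);
    }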
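
The aio backend is a thin wrapper around plain libaio: aio_op() packs the bio's iovec into a single iocb whose data field carries the bio pointer, io_submit() queues it (aio_op() retries via wait_event() while the ring is full and io_submit() keeps returning -EAGAIN), and the dedicated completion thread reaps events with io_getevents(), compares ev->res against bio->bi_iter.bi_size, and calls bio_endio(). For reference, a self-contained round trip through the same libaio calls (a sketch only; error handling trimmed, file path and buffer size arbitrary):

    /* build with: cc aio_demo.c -laio */
    #include <fcntl.h>
    #include <libaio.h>
    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        io_context_t ctx = 0;           /* must be zero-initialized */
        struct io_event ev;
        char buf[4096];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct iocb iocb, *iocbp = &iocb;

        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);
        if (fd < 0)
            return 1;

        /* Like aio_init(): returns 0 or a negative errno; -ENOSYS is
         * the case that triggers the fallback to the sync backend. */
        if (io_setup(256, &ctx))
            return 1;

        io_prep_preadv(&iocb, fd, &iov, 1, 0);
        iocb.data = buf;        /* completion cookie; blkdev.c stores the bio here */

        /* Returns the number of iocbs queued, or a negative errno;
         * -EAGAIN is what aio_op() waits out with wait_event(). */
        if (io_submit(ctx, 1, &iocbp) != 1)
            return 1;

        /* What the aio completion thread does in a loop. */
        if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
            return 1;

        /* ev.res is the byte count (or a negative errno); blkdev.c
         * compares it against bio->bi_iter.bi_size to detect failures. */
        printf("read %ld bytes via cookie %p\n", (long) ev.res, ev.data);

        io_destroy(ctx);
        close(fd);
        return 0;
    }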
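
Backend selection is a NULL-terminated priority table: blkdev_init() points fops at the first entry (io_uring, currently just a stub whose init falls straight through) and io_fallback() advances to the next entry whenever an init cannot proceed, dying only past the sentinel. The same pattern in miniature, with hypothetical backend names:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct backend {
        const char *name;
        bool (*init)(void);     /* NULL init marks the sentinel */
    };

    static bool init_fancy(void) { return false; }  /* pretend unsupported */
    static bool init_plain(void) { return true; }

    static struct backend backends[] = {
        { "fancy", init_fancy },
        { "plain", init_plain },
        { NULL, NULL },         /* sentinel, like the empty entry in fops_list */
    };

    int main(void)
    {
        struct backend *b;

        /* Walk the table until some backend initializes successfully. */
        for (b = backends; b->init; b++)
            if (b->init())
                break;
        if (!b->init) {
            fprintf(stderr, "no backend usable\n");
            return EXIT_FAILURE;
        }
        printf("using %s\n", b->name);  /* prints "using plain" */
        return 0;
    }

One difference from this loop: blkdev.c drives the walk from inside each backend's init() via io_fallback(), so a backend can also bail out partway through its probe (as aio_init() does on -ENOSYS) and still hand off cleanly.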